# Links
- https://www.kaggle.com/c/restaurant-revenue-prediction

# Solutions:
- 13th place: https://github.com/bensolucky/TFI
- 14th place: https://github.com/rohanrao91/Kaggle_TFI (stable on both public and private leaderboards)
- 32nd place: https://github.com/ITankoyeu/Kaggle_TFI

In [3]:
import math
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.gaussian_process import GaussianProcess, GaussianProcessRegressor
from sklearn.linear_model import HuberRegressor, ARDRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import clone as sk_clone
from sklearn.metrics import make_scorer
from sklearn.preprocessing import Imputer

%run dstools/dstools/ml/transformers.py

In [5]:
def rmse(true, pred):
    """Return the root mean squared error between ``true`` and ``pred``.

    Both arguments are handed to ``mean_squared_error`` unchanged; the square
    root of its result is returned as a plain Python float.
    """
    mse = mean_squared_error(true, pred)
    return math.sqrt(mse)

def split_test(est, n_tests):
    """Estimate the hold-out RMSE of ``est`` over repeated random splits.

    Loads ``train.csv.gz`` (indexed by ``Id``) and, ``n_tests`` times, clones
    ``est``, fits the clone on a random 80% of the rows and scores it on the
    remaining 20%. The splits are not seeded, so results vary between calls.

    Returns:
        dict with the mean (``'RMSE-mean'``) and the standard deviation
        (``'RMSE-STD'``) of the per-split RMSE values.
    """
    data = pd.read_csv('train.csv.gz', index_col='Id')
    features = data.drop(['revenue'], axis=1)
    target = data.revenue

    # The scorer does not depend on the split, so it is built only once.
    rmse_scorer = make_scorer(rmse)

    def score_one_split():
        # A fresh clone per split keeps the caller's estimator unfitted.
        model = sk_clone(est)
        x_fit, x_hold, y_fit, y_hold = train_test_split(
            features, target, test_size=.2)
        model.fit(x_fit, y_fit)
        return rmse_scorer(model, x_hold, y_hold)

    split_scores = np.array([score_one_split() for _ in range(n_tests)])
    return {'RMSE-mean': split_scores.mean(), 'RMSE-STD': split_scores.std()}

In [6]:
def submit(est):
    """Fit ``est`` on the full training set and write test-set predictions.

    Reads ``train.csv.gz`` and ``test.csv.gz`` (both indexed by ``Id``), fits
    on every training row with ``revenue`` as the target, and saves the
    predictions to ``results.csv`` with the columns ``Id`` and ``Prediction``.
    """
    train = pd.read_csv('train.csv.gz', index_col='Id')
    # Predictions are made with whatever object ``fit`` returns.
    fitted = est.fit(train.drop(['revenue'], axis=1), train.revenue)

    test = pd.read_csv('test.csv.gz', index_col='Id')
    predictions = pd.DataFrame(
        {'Prediction': fitted.predict(test)},
        index=test.index,
    )
    predictions.to_csv('results.csv', index_label='Id')

In [7]:
def outliers_filter(features, target):
    """Keep only the samples whose target is below mean + 3 standard deviations.

    Only the upper tail is trimmed. Returns the ``(features, target)`` pair
    restricted to the kept rows.
    """
    upper_bound = target.mean() + 3 * target.std()
    keep = target < upper_bound
    return features[keep], target[keep]

class SamplesFilteringPipeline(BaseEstimator):
    """Estimator wrapper that filters the training samples before fitting.

    Parameters:
        pipeline: The wrapped estimator; must provide ``fit`` and ``predict``
            (and ``predict_proba`` if that method is going to be used).
        samples_filter: Callable invoked as ``samples_filter(X, y)`` during
            ``fit``; must return the ``(X, y)`` pair to train on.

    Filtering is applied only in ``fit``; prediction inputs are passed to the
    wrapped estimator unchanged.
    """

    def __init__(self, pipeline, samples_filter):
        self.pipeline = pipeline
        self.samples_filter = samples_filter

    def fit(self, X, y):
        """Fit the wrapped estimator on the filtered samples; return ``self``."""
        X_filtered, y_filtered = self.samples_filter(X, y)
        self.pipeline.fit(X_filtered, y_filtered)
        # Return the wrapper rather than the inner estimator, following the
        # scikit-learn convention that ``fit`` returns ``self``. Chained calls
        # such as ``est.fit(X, y).predict(X)`` keep working because the
        # wrapper delegates ``predict``.
        return self

    def predict(self, X):
        """Delegate to the wrapped estimator's ``predict``."""
        return self.pipeline.predict(X)

    def predict_proba(self, X):
        """Delegate to the wrapped estimator's ``predict_proba``."""
        return self.pipeline.predict_proba(X)
    
def no_outliers_pipeline(est):
    """Wrap ``est`` so that ``outliers_filter`` is applied to its training data."""
    return SamplesFilteringPipeline(pipeline=est, samples_filter=outliers_filter)

def days_to_delta(df, reference_date='2015-02-01'):
    """Replace the ``Open Date`` column with a numeric ``days_open`` column.

    Args:
        df: DataFrame with an ``Open Date`` column that ``pd.to_datetime``
            can parse.
        reference_date: Date the age is measured against; anything
            ``pd.to_datetime`` accepts. Defaults to ``'2015-02-01'``.

    Returns:
        A new DataFrame without ``Open Date`` and with ``days_open`` appended
        as the last column, holding the number of days (as a float) from each
        open date to ``reference_date``. ``df`` itself is not modified.
    """
    one_day = np.timedelta64(1, 'D')
    elapsed = pd.to_datetime(reference_date) - pd.to_datetime(df['Open Date'])
    # ``drop`` and ``assign`` both return new frames, so the input stays intact
    # without an explicit copy.
    return df.drop('Open Date', axis=1).assign(days_open=elapsed / one_day)

In [52]:
import json
import os.path
import time

def update_model_stats(stats_file, params, results):
    """Append one experiment record to the JSON list stored in ``stats_file``.

    Args:
        stats_file: Path of a JSON file holding a list of records. The file is
            created when it does not exist yet.
        params: Dict of experiment parameters.
        results: Dict of experiment results. On a key clash the value from
            ``params`` wins.

    Raises:
        TypeError, ValueError: If the updated list cannot be serialized to
            JSON. The existing file is left untouched in that case.
    """
    try:
        with open(stats_file, 'r') as f:
            stats = json.load(f)
    except FileNotFoundError:
        stats = []

    stats.append({**results, **params})

    # Serialize before opening for writing: mode 'w' truncates the file, so a
    # failure in the middle of json.dump would destroy the recorded history.
    payload = json.dumps(stats, indent=4)

    with open(stats_file, 'w') as f:
        f.write(payload)

def run_experiment(experiment, params, stats_file):
    """Run ``experiment(params)``, time it, and record the outcome.

    The mapping returned by ``experiment`` is extended with the wall-clock
    duration in seconds under ``'exec-time-sec'`` and passed, together with
    ``params``, to ``update_model_stats`` for storage in ``stats_file``.
    """
    started_at = time.time()
    outcome = experiment(params)
    elapsed = time.time() - started_at

    record = {**outcome, 'exec-time-sec': elapsed}
    update_model_stats(stats_file, params, record)

In [64]:
def run_experiment_tfi(experiment, params):
    """Run ``experiment`` with ``params`` and record it in ``experiments.json``."""
    run_experiment(experiment, params, stats_file='experiments.json')

def _build_transformer(category_encoding):
    """Return the feature-preparation pipeline for ``category_encoding``."""
    # Every variant starts by turning 'Open Date' into a numeric column.
    to_days_open = FunctionTransformer(days_to_delta, validate=False)

    if category_encoding == 'onehot':
        # DictVectorizer consumes a list of per-row dicts, not a DataFrame.
        df2dict = FunctionTransformer(
            lambda x: x.to_dict(orient='records'), validate=False)
        return make_pipeline(
            to_days_open,
            df2dict,
            DictVectorizer(sparse=False),
        )
    # NOTE: 'empyrical_bayes' (sic) is the parameter value callers pass, so
    # the spelling is kept as is.
    if category_encoding == 'empyrical_bayes':
        return make_pipeline(
            to_days_open,
            empirical_bayes_encoder_normal_distr(),
            Imputer(),
        )
    if category_encoding == 'count':
        return make_pipeline(
            to_days_open,
            count_encoder(),
            Imputer(),
        )
    raise ValueError(
        'Unknown category_encoding: {!r}'.format(category_encoding))


def _build_regressor(params):
    """Return the regressor selected by ``params['regressor_type']``."""
    reg_type = params['regressor_type']

    if reg_type == 'rfr':
        return make_pipeline(
            # ``k`` is passed by keyword: newer scikit-learn releases reject
            # it as a positional argument.
            SelectKBest(f_regression, k=params['k_best']),
            RandomForestRegressor(
                n_jobs=params['n_jobs'],
                n_estimators=params['n_estimators'],
                max_features=params['max_features'],
                max_depth=params['max_depth'],
                random_state=1))
    if reg_type == 'huber':
        return HuberRegressor(epsilon=params['epsilon'])
    if reg_type == 'ard':
        return ARDRegression()
    raise ValueError('Unknown regressor_type: {!r}'.format(reg_type))


def validate(params):
    """Build the model described by ``params`` and validate it.

    Keys read from ``params``:
        category_encoding: ``'onehot'``, ``'empyrical_bayes'`` or ``'count'``.
        regressor_type: ``'rfr'``, ``'huber'`` or ``'ard'``.
        k_best, n_jobs, n_estimators, max_features, max_depth: read only for
            ``'rfr'``.
        epsilon: read only for ``'huber'``.
        drop_outliers: when truthy, the model is wrapped with
            ``no_outliers_pipeline``.
        valid_mode: only ``'split'`` is supported.
        n_folds: number of random splits handed to ``split_test``.

    Returns:
        The dict produced by ``split_test``.

    Raises:
        ValueError: If ``category_encoding``, ``regressor_type`` or
            ``valid_mode`` has an unsupported value.
        KeyError: If a required key is missing from ``params``.
    """
    transf = _build_transformer(params['category_encoding'])
    reg = _build_regressor(params)

    est = make_pipeline(transf, reg)

    if params['drop_outliers']:
        est = no_outliers_pipeline(est)

    valid_mode = params['valid_mode']
    n_folds = params['n_folds']
    if valid_mode != 'split':
        # Fail loudly instead of returning None, which callers cannot unpack.
        raise ValueError('Unknown valid_mode: {!r}'.format(valid_mode))
    return split_test(est, n_folds)

In [69]:
# Show the recorded experiments best-first (lowest mean RMSE), one experiment
# per column, with floats printed to two decimals.
pd.set_option('display.float_format', '{:.2f}'.format)
stats = pd.read_json('experiments.json')
stats.sort_values('RMSE-mean').T

Unnamed: 0,0,5,2,1,4,3
RMSE-STD,696388.01,724216.82,760802.35,773110.81,770942.46,782948.36
RMSE-mean,2273463.89,2339395.66,2350349.72,2378698.69,2448658.61,2497804.55
category_encoding,onehot,onehot,count,empyrical_bayes,onehot,onehot
drop_outliers,True,True,True,True,True,False
epsilon,,,,,1.20,1.20
exec-time-sec,34.53,9.99,33.31,34.17,7.39,8.38
k_best,20.00,,20.00,20.00,,
max_depth,2.00,,2.00,2.00,,
max_features,0.20,,0.20,0.20,,
n_estimators,100.00,,100.00,100.00,,
