# Links
- https://www.kaggle.com/c/restaurant-revenue-prediction

# Solutions:
- 13th place: https://github.com/bensolucky/TFI
- 14th place: https://github.com/rohanrao91/Kaggle_TFI (stable on both public and private boards)
- 32nd place: https://github.com/ITankoyeu/Kaggle_TFI

In [2]:
import math
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split, LeaveOneOut
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.gaussian_process import GaussianProcess, GaussianProcessRegressor
from sklearn.linear_model import HuberRegressor, ARDRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import clone as sk_clone
from sklearn.metrics import make_scorer
from sklearn.preprocessing import Imputer

%run dstools/dstools/ml/transformers.py

In [3]:
def rmse(true, pred):
    """Return the root-mean-squared error between `true` and `pred`.

    Both arguments are passed unchanged to `mean_squared_error`; the
    square root of its result is returned.
    """
    mse = mean_squared_error(true, pred)
    return math.sqrt(mse)

In [4]:
def split_test(est, n_iter=100, test_size=.2):
    """Estimate hold-out RMSE of `est` over repeated random train/test splits.

    The training data is read from 'train.csv.gz' (indexed by 'Id'); the
    'revenue' column is the target and every other column is a feature.
    For each repetition a fresh clone of `est` is fitted on the training
    part of a random split and scored with `rmse` on the held-out part.
    The mean and standard deviation of the scores are printed; nothing is
    returned.

    Parameters
    ----------
    est : estimator
        Unfitted estimator; it is cloned for every repetition, so the
        object passed in is never fitted here.
    n_iter : int, default 100
        Number of random splits to evaluate.
    test_size : float, default .2
        Forwarded to `train_test_split` as `test_size`.
    """
    df = pd.read_csv('train.csv.gz', index_col='Id')
    features = df.drop(['revenue'], axis=1)
    target = df.revenue

    # The scorer does not depend on the split, so build it once instead
    # of on every iteration.
    scorer = make_scorer(rmse)

    scores = []
    for _ in range(n_iter):
        m = sk_clone(est)
        # No random_state is passed: each repetition draws a new split.
        xtr, xtst, ytr, ytst = train_test_split(features, target, test_size=test_size)
        m.fit(xtr, ytr)
        scores.append(scorer(m, xtst, ytst))

    scores = np.array(scores)
    print('mean: {mean}, std: {std}'.format(mean=scores.mean(), std=scores.std()))

In [5]:
def submit(est):
    """Fit `est` on the full training set and write test predictions to disk.

    Reads 'train.csv.gz' and 'test.csv.gz' (both indexed by 'Id'), fits
    `est` on every training column except 'revenue' with 'revenue' as the
    target, and writes the predictions for the test rows to 'results.csv'
    with an 'Id' index column and a 'Prediction' column.

    Note that `est` itself is fitted (it is not cloned), and predictions
    are made with whatever object `est.fit` returns.
    """
    train = pd.read_csv('train.csv.gz', index_col='Id')
    fitted = est.fit(train.drop(['revenue'], axis=1), train.revenue)

    test = pd.read_csv('test.csv.gz', index_col='Id')
    predictions = pd.DataFrame(
        {'Prediction': fitted.predict(test)},
        index=test.index,
    )
    predictions.to_csv('results.csv', index_label='Id')

In [6]:
def outliers_filter(features, target):
    """Drop samples whose target lies at or above mean + 3 standard deviations.

    Only the upper tail is cut; unusually small targets are kept.

    Returns a `(features, target)` pair restricted to the retained rows.
    """
    upper_bound = target.mean() + 3 * target.std()
    keep = target < upper_bound
    return features[keep], target[keep]

class SamplesFilteringPipeline(BaseEstimator):
    """Estimator wrapper that filters the training samples before fitting.

    The filter is applied in `fit` only; `predict` and `predict_proba`
    pass their input to the wrapped estimator unchanged.

    Parameters
    ----------
    pipeline : estimator
        The wrapped estimator; it is fitted in place by `fit`.
    samples_filter : callable
        Called as `samples_filter(X, y)`; must return the
        `(X_filtered, y_filtered)` pair to train on.
    """

    def __init__(self, pipeline, samples_filter):
        self.pipeline = pipeline
        self.samples_filter = samples_filter

    def fit(self, X, y):
        """Fit the wrapped estimator on the filtered samples and return self."""
        X_filtered, y_filtered = self.samples_filter(X, y)
        self.pipeline.fit(X_filtered, y_filtered)
        # Return the wrapper rather than the inner estimator, following the
        # scikit-learn convention that `fit` returns `self`, so chained
        # calls such as `est.fit(X, y).predict(X)` go through this class.
        return self

    def predict(self, X):
        """Return the wrapped estimator's predictions for `X`."""
        return self.pipeline.predict(X)

    def predict_proba(self, X):
        """Return the wrapped estimator's `predict_proba` output for `X`."""
        return self.pipeline.predict_proba(X)
    
def no_outliers_pipeline(est):
    """Wrap `est` so it is trained only on samples kept by `outliers_filter`."""
    return SamplesFilteringPipeline(pipeline=est, samples_filter=outliers_filter)

In [7]:
def days_to_delta(df):
    """Replace the 'Open Date' column with a numeric 'days_open' column.

    'days_open' is the number of days (as a float) from each row's
    'Open Date' to the fixed reference date 2015-02-01. The input frame
    is left unmodified; a new frame is returned with 'Open Date' removed
    and 'days_open' appended as the last column.
    """
    reference_date = pd.to_datetime('2015-02-01')
    one_day = np.timedelta64(1, 'D')
    opened = pd.to_datetime(df['Open Date'])
    # Dividing a timedelta series by one day yields plain float day counts.
    return df.drop('Open Date', axis=1).assign(
        days_open=(reference_date - opened) / one_day
    )

def _frame_to_records(frame):
    """Convert a DataFrame into a list of per-row dicts."""
    return frame.to_dict(orient='records')


# Stateless step turning a DataFrame into the list-of-dicts input that
# DictVectorizer consumes.
df2dict = FunctionTransformer(_frame_to_records, validate=False)

# Variant 1: add days_open, then vectorize the rows with a dense
# DictVectorizer.
transf = make_pipeline(
    FunctionTransformer(days_to_delta, validate=False),
    df2dict,
    DictVectorizer(sparse=False),
)

# Variant 2: add days_open, then apply empirical_bayes_encoder_normal_distr
# (not defined in this file -- presumably provided by the %run'd
# dstools transformers script; TODO confirm), then Imputer with default
# settings.
transf2 = make_pipeline(
    FunctionTransformer(days_to_delta, validate=False),
    empirical_bayes_encoder_normal_distr(),
    Imputer(),
)

# Variant 3: same as variant 2 but with count_encoder (also presumably from
# the dstools transformers script; TODO confirm).
transf3 = make_pipeline(
    FunctionTransformer(days_to_delta, validate=False),
    count_encoder(),
    Imputer(),
)

In [16]:
# Shallow random forest on the 20 best f_regression features of the
# DictVectorizer encoding (transf), trained with outlier rows removed.
est1 = no_outliers_pipeline(
    make_pipeline(
        transf,
        SelectKBest(score_func=f_regression, k=20),
        RandomForestRegressor(
            n_estimators=100,
            max_depth=2,
            max_features=0.2,
            n_jobs=4,
            random_state=1,
        ),
    )
)

In [46]:
%%time
split_test(est1)

mean: 2349991.57899, std: 779557.222073
CPU times: user 1min 40s, sys: 11 s, total: 1min 51s
Wall time: 2min 10s


In [12]:
# Shallow random forest on the 20 best f_regression features of the
# transf2 encoding, trained with outlier rows removed.
est2 = no_outliers_pipeline(
    make_pipeline(
        transf2,
        SelectKBest(score_func=f_regression, k=20),
        RandomForestRegressor(
            n_estimators=100,
            max_depth=2,
            max_features=0.2,
            n_jobs=4,
            random_state=1,
        ),
    )
)

In [13]:
%%time
split_test(est2)

mean: 2263352.0087537235, std: 709362.6026967816
CPU times: user 21.9 s, sys: 3.27 s, total: 25.2 s
Wall time: 37.2 s


In [12]:
# Shallow random forest on the 20 best f_regression features of the
# transf3 encoding, trained with outlier rows removed.
est22 = no_outliers_pipeline(
    make_pipeline(
        transf3,
        SelectKBest(score_func=f_regression, k=20),
        RandomForestRegressor(
            n_estimators=100,
            max_depth=2,
            max_features=0.2,
            n_jobs=4,
            random_state=1,
        ),
    )
)

In [13]:
%%time
split_test(est22)

mean: 2289320.651580539, std: 779319.7397884463
CPU times: user 31 s, sys: 3.74 s, total: 34.7 s
Wall time: 32.7 s


In [58]:
# Huber regression on the DictVectorizer encoding (transf), fitted on all
# training rows -- no outlier filtering wrapper here.
est3 = make_pipeline(transf, HuberRegressor(epsilon=1.2))

In [60]:
%%time
split_test(est3)

mean: 2459744.97601, std: 701037.985616
CPU times: user 27.6 s, sys: 257 ms, total: 27.8 s
Wall time: 17.2 s


In [61]:
# Same Huber regression pipeline as above, but trained with outlier rows
# removed by outliers_filter.
est4 = no_outliers_pipeline(
    make_pipeline(transf, HuberRegressor(epsilon=1.2))
)

In [62]:
%%time
split_test(est4)

mean: 2477808.29769, std: 747109.15184
CPU times: user 28.1 s, sys: 283 ms, total: 28.3 s
Wall time: 18 s


In [63]:
# ARD regression with default settings on the DictVectorizer encoding
# (transf), trained with outlier rows removed.
est5 = no_outliers_pipeline(
    make_pipeline(transf, ARDRegression())
)

In [64]:
%%time
split_test(est5)

mean: 2376231.35043, std: 711591.713199
CPU times: user 33.2 s, sys: 233 ms, total: 33.4 s
Wall time: 20.2 s
