# Links
- https://www.kaggle.com/c/restaurant-revenue-prediction

# Solutions:
- 13th place: https://github.com/bensolucky/TFI
- 14th place: https://github.com/rohanrao91/Kaggle_TFI (stable on both the public and private leaderboards)
- 32nd place: https://github.com/ITankoyeu/Kaggle_TFI

In [2]:
import math
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.gaussian_process import GaussianProcess
from sklearn.linear_model import HuberRegressor, ARDRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

In [3]:
def rmse(est, features, labels):
    """Return the root-mean-squared error of ``est`` on the given samples.

    The ``(estimator, X, y)`` signature lets this function be passed as the
    ``scoring`` callable of ``cross_val_score``.

    Parameters
    ----------
    est : fitted estimator exposing ``predict``.
    features : samples to predict on.
    labels : true target values for ``features``.

    Returns
    -------
    float
        Square root of the mean squared error between ``labels`` and the
        estimator's predictions.
    """
    predictions = est.predict(features)
    mse = mean_squared_error(labels, predictions)
    return math.sqrt(mse)

In [6]:
def ybin(y):
    """Bucket the values of ``y`` by their size relative to the maximum.

    Each value is divided by ``np.max(y)``, multiplied by 10 and converted to
    ``np.byte`` (the fractional part is dropped by the integer cast), so the
    maximum itself maps to 10.

    Parameters
    ----------
    y : array-like with an ``astype`` method (e.g. ndarray or pandas Series).

    Returns
    -------
    Same container type as ``y`` with dtype ``np.byte``.
    """
    as_float = y.astype(np.float64)
    scaled = as_float / np.max(y) * 10
    return scaled.astype(np.byte)

In [8]:
def cv_test(est):
    """Cross-validate ``est`` on the training data and print the RMSE summary.

    Reads ``train.csv.gz`` (indexed by ``Id``), uses the ``revenue`` column as
    the target and every other column as features, then runs a shuffled,
    stratified 50-fold cross-validation scored with ``rmse``.

    Parameters
    ----------
    est : estimator
        Unfitted estimator accepted by ``cross_val_score``.

    Returns
    -------
    None
        The mean and standard deviation of the per-fold RMSE are printed.
    """
    df = pd.read_csv('train.csv.gz', index_col='Id')
    features = df.drop(['revenue'], axis=1)
    target = df.revenue

    # The target is continuous, so folds are stratified on its binned version.
    # With the `model_selection` API the labels go to `split()`, not to the
    # constructor (whose first parameter is `n_splits`).
    # No random_state is set, so fold assignment differs between runs.
    splitter = StratifiedKFold(n_splits=50, shuffle=True)
    cv = list(splitter.split(features, ybin(target)))

    scores = cross_val_score(estimator=est, X=features, y=target, scoring=rmse, cv=cv)
    print('mean: {mean}, std: {std}'.format(mean=scores.mean(), std=scores.std()))

In [7]:
def submit(est):
    """Fit ``est`` on the whole training set and write test-set predictions.

    Training data comes from ``train.csv.gz`` (target column ``revenue``),
    the samples to predict from ``test.csv.gz``; both are indexed by ``Id``.
    Predictions are saved to ``results.csv`` with columns ``Id`` and
    ``Prediction``.

    Parameters
    ----------
    est : estimator
        Its ``fit`` must return an object exposing ``predict``.
    """
    train = pd.read_csv('train.csv.gz', index_col='Id')
    fitted = est.fit(train.drop(['revenue'], axis=1), train.revenue)

    test = pd.read_csv('test.csv.gz', index_col='Id')
    predictions = fitted.predict(test)

    submission = pd.DataFrame({'Prediction': predictions}, index=test.index)
    submission.to_csv('results.csv', index_label='Id')

In [4]:
def outliers_filter(features, target):
    """Drop samples whose target is unusually large.

    A sample is kept only when its target is strictly below
    ``target.mean() + 3 * target.std()``.

    Parameters
    ----------
    features : samples, indexable by a boolean mask aligned with ``target``.
    target : target values exposing ``mean()`` and ``std()``.

    Returns
    -------
    tuple
        ``(features, target)`` restricted to the kept samples.
    """
    cutoff = target.mean() + 3 * target.std()
    keep = target < cutoff
    return features[keep], target[keep]

class SamplesFilteringPipeline(BaseEstimator):
    """Estimator wrapper that filters the training samples before fitting.

    The filter is applied in ``fit`` only; ``predict`` and ``predict_proba``
    hand their input to the wrapped estimator unchanged.

    Parameters
    ----------
    pipeline : estimator
        Wrapped estimator; must provide ``fit`` and ``predict``
        (``predict_proba`` is forwarded as well when it is called).
    samples_filter : callable
        Called as ``samples_filter(X, y)`` and must return the ``(X, y)``
        pair that the wrapped estimator is trained on.
    """

    def __init__(self, pipeline, samples_filter):
        self.pipeline = pipeline
        self.samples_filter = samples_filter

    def fit(self, X, y):
        """Fit the wrapped estimator on the filtered samples; return ``self``."""
        X_filtered, y_filtered = self.samples_filter(X, y)
        self.pipeline.fit(X_filtered, y_filtered)
        # Return the wrapper itself (scikit-learn convention) rather than the
        # inner estimator, so `est.fit(X, y)` keeps the wrapper's interface.
        return self

    def predict(self, X):
        """Return the wrapped estimator's predictions for ``X``."""
        return self.pipeline.predict(X)

    def predict_proba(self, X):
        """Return the wrapped estimator's ``predict_proba`` output for ``X``."""
        return self.pipeline.predict_proba(X)
    
def no_outliers_pipeline(est):
    """Wrap ``est`` so that it is trained only on samples kept by ``outliers_filter``."""
    return SamplesFilteringPipeline(pipeline=est, samples_filter=outliers_filter)

In [5]:
def days_to_delta(df):
    """Replace the ``Open Date`` column with a numeric ``days_open`` column.

    ``days_open`` is the number of days between each row's ``Open Date`` and
    the fixed reference date 2015-02-01. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame with an ``Open Date`` column parseable by
        ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``Open Date`` and with ``days_open`` appended.
    """
    reference_date = pd.to_datetime('2015-02-01')
    opened = pd.to_datetime(df['Open Date'])
    one_day = np.timedelta64(1, 'D')

    result = df.drop('Open Date', axis=1).copy()
    # Dividing the timedeltas by one day yields a plain number of days.
    result['days_open'] = (reference_date - opened) / one_day
    return result

def _frame_to_records(frame):
    """Convert a DataFrame into a list of per-row dicts (column name -> value)."""
    return frame.to_dict(orient='records')


# A named module-level function is used instead of a lambda: a
# FunctionTransformer built around a lambda cannot be pickled, which gets in
# the way of persisting the pipeline or running it in worker processes.
df2dict = FunctionTransformer(_frame_to_records, validate=False)

# Preprocessing used by every estimator below:
#   1. turn 'Open Date' into the numeric 'days_open' column,
#   2. convert the frame to a list of row dicts,
#   3. vectorize the dicts into a dense feature matrix.
transf = make_pipeline(
    FunctionTransformer(days_to_delta, validate=False),
    df2dict,
    DictVectorizer(sparse=False),
)

In [9]:
# Random forest on the 20 features selected by f_regression, trained with
# target outliers removed. Recorded results:
# mean: 1824379.8869, std: 1459901.28338
# cv execution time: 42.1881010532 sec
est1 = no_outliers_pipeline(make_pipeline(
    transf,
    # `k` is given by keyword: recent scikit-learn releases accept it only as
    # a keyword argument, and older ones accept the keyword form as well.
    SelectKBest(f_regression, k=20),
    RandomForestRegressor(n_jobs=4, n_estimators=100, max_features=0.2, max_depth=2, random_state=1)
))

In [10]:
# Gaussian process on the shared preprocessing, trained with target outliers
# removed. Recorded results:
# mean: 1742530.13007, std: 1364629.88699
# cv execution time: 7.0613951683 sec
# NOTE(review): the legacy `GaussianProcess` class is, to my knowledge, gone
# from scikit-learn >= 0.20 — confirm the targeted version before running.
est2 = no_outliers_pipeline(
    make_pipeline(transf, GaussianProcess(theta0=0.5))
)



In [11]:
# Huber regression on the shared preprocessing, trained on all samples
# (no outlier filtering). Recorded results:
# mean: 1766325.28518, std: 1481625.49979
# cv execution time: 5.615224123 sec
est3 = make_pipeline(transf, HuberRegressor(epsilon=1.2))

In [12]:
# Same Huber regression as est3, but trained with target outliers removed.
# Recorded results:
# mean: 1892192.83136, std: 1573990.82032
# cv execution time: 5.36941003799 sec
est4 = no_outliers_pipeline(
    make_pipeline(transf, HuberRegressor(epsilon=1.2))
)

In [13]:
# ARD regression (default settings) on the shared preprocessing, trained with
# target outliers removed. Recorded results:
# mean: 1763053.40639, std: 1446500.61418
# cv execution time: 6.88036894798 sec
est5 = no_outliers_pipeline(
    make_pipeline(transf, ARDRegression())
)