# Links
- https://www.kaggle.com/c/allstate-claims-severity

# Discussions
- https://habrahabr.ru/post/318518/
- https://www.youtube.com/watch?v=p7ArDjMImiI

In [3]:
import numpy as np
import pandas as pd
from sklearn.base import RegressorMixin, BaseEstimator
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

In [2]:
%run ds_tools/dstools/ml/transformers.py

In [3]:
%run ds_tools/dstools/ml/xgboost_tools.py



In [4]:
%run ds_tools/dstools/h2o/sklearn_tools.py

In [4]:
class TargetTransfRegressor(BaseEstimator, RegressorMixin):
    """Regressor wrapper that trains ``base_est`` on a transformed target.

    ``fit`` trains the wrapped estimator on ``transf_to(y)`` and ``predict``
    maps the wrapped estimator's predictions through ``transf_from``.
    Nothing here checks that the two callables are inverses of each other;
    that is the caller's responsibility.

    Parameters
    ----------
    base_est : object exposing ``fit(X, y)`` and ``predict(X)``
        The estimator that is actually trained.
    transf_to : callable
        Applied to ``y`` before it is handed to ``base_est.fit``.
    transf_from : callable
        Applied to the output of ``base_est.predict``.
    """

    def __init__(self, base_est, transf_to, transf_from):
        self.transf_from = transf_from
        self.transf_to = transf_to
        self.base_est = base_est

    def fit(self, X, y):
        """Fit ``base_est`` in place on ``(X, transf_to(y))`` and return ``self``."""
        y_transformed = self.transf_to(y)
        self.base_est.fit(X, y_transformed)
        return self

    def predict(self, X):
        """Return ``transf_from`` applied to the predictions of ``base_est``."""
        raw_pred = self.base_est.predict(X)
        return self.transf_from(raw_pred)

In [1]:
def mape(y_true, y_pred):
    """Return the mean absolute error between ``y_true`` and ``y_pred``.

    NOTE: despite its name this is the plain mean absolute error (MAE); the
    absolute differences are not divided by the true values.

    Both inputs are converted with ``np.asarray`` first, so plain sequences
    are accepted and pandas objects are compared by position instead of being
    aligned on their index (alignment of differently indexed Series would
    produce NaN).

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, same shape as ``y_true``.

    Returns
    -------
    The average absolute difference taken along axis 0 (a scalar for 1-D
    input).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.average(np.abs(y_pred - y_true), axis=0)


def mape_evalerror_exp(preds, dtrain):
    """Evaluation callback: mean absolute error measured after ``exp``.

    Both the predictions and the labels returned by ``dtrain.get_label()``
    are exponentiated before the absolute differences are averaged, so the
    error is reported on the original scale when the model was trained on a
    log-transformed target.

    Returns
    -------
    tuple
        ``('mae', value)``.
    """
    preds_exp = np.exp(preds)
    labels_exp = np.exp(dtrain.get_label())
    abs_err = np.abs(preds_exp - labels_exp)
    return 'mae', np.average(abs_err, axis=0)


def mape_evalerror(preds, dtrain):
    """Evaluation callback: score ``preds`` against ``dtrain.get_label()``.

    The value is whatever ``mape`` returns for (labels, predictions); no
    transformation is applied to either side.

    Returns
    -------
    tuple
        ``('mape', value)``.
    """
    labels = dtrain.get_label()
    score = mape(labels, preds)
    return 'mape', score


def ybin(y):
    """Map the values of array ``y`` to small integer bins relative to its maximum.

    Each value is divided by ``np.max(y)``, multiplied by 10 and truncated to
    ``np.byte``; for non-negative input with a positive maximum the result
    lies in 0..10, with only the maximum itself landing in bin 10.
    ``y`` must provide ``astype`` (e.g. a NumPy array).
    """
    as_float = y.astype(np.float64)
    scaled = as_float / np.max(y) * 10
    return scaled.astype(np.byte)

In [7]:
def cv_test(est, n_splits=3, random_state=None):
    """Cross-validate ``est`` on ``train.csv.gz`` and print the mean/std score.

    The data is read from ``train.csv.gz`` (index column ``id``); the ``loss``
    column is the target and every other column is a feature. Scores are
    computed with ``make_scorer(mape)`` on shuffled K-fold splits.

    Parameters
    ----------
    est : estimator or tuple ``(transform, estimator)``
        When a tuple is given, ``transform.fit_transform(features, target)``
        is applied once to the whole data set and the estimator is then
        cross-validated on the transformed features. Otherwise the estimator
        receives the raw feature frame.
    n_splits : int, default 3
        Number of folds.
    random_state : int or None, default None
        Seed for the fold shuffling; ``None`` keeps the splits random from
        run to run.

    NOTE: in the tuple case the transform is fitted before the folds are
    drawn, i.e. on rows (and targets) that are later used for validation.
    For transforms that use the target this makes the score optimistic.
    """
    df = pd.read_csv('train.csv.gz', index_col='id')

    features = df.drop('loss', axis=1)
    target = df.loss.values

    # isinstance also accepts tuple subclasses such as namedtuples.
    if isinstance(est, tuple):
        transform, estimator = est
        features_t = transform.fit_transform(features, target)
    else:
        estimator = est
        features_t = features

    cv = KFold(n_splits, shuffle=True, random_state=random_state)

    scores = cross_val_score(estimator, X=features_t, y=target, scoring=make_scorer(mape), cv=cv)
    print('mean: {mean}, std: {std}'.format(mean=scores.mean(), std=scores.std()))

In [8]:
def pred_vs_true(est, path):
    """Write hold-out predictions next to the true values as a TSV file.

    ``est`` must be a ``(transform, estimator)`` pair; both are chained into
    one pipeline. ``train.csv.gz`` is split 90/10 with ``random_state=123``,
    the pipeline is fitted on the 90% part and its predictions for the
    remaining rows are written to ``path`` (tab separated, no index) in the
    columns ``pred`` and ``true``.
    """
    data = pd.read_csv('train.csv.gz', index_col='id')
    X = data.drop('loss', axis=1)
    y = data['loss'].values

    transform, estimator = est
    model = make_pipeline(transform, estimator)

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, train_size=0.9, random_state=123)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    result = pd.DataFrame({'pred': y_pred, 'true': y_test})
    result.to_csv(path, index=False, sep='\t')

In [11]:
# Keyword arguments for XGBoostRegressor (used by est1, est2 and est6, which
# train on log(loss)). The first groups are standard xgboost settings; the
# last group is presumably interpreted by the XGBoostRegressor wrapper loaded
# via %run -- TODO confirm against ds_tools.
xgb_params = dict(
    # objective and tree shape
    objective="reg:linear",
    max_depth=4,
    min_child_weight=6,
    # learning rate and row/column sampling
    eta=0.1,
    subsample=0.7,
    colsample_bytree=0.5,
    silent=1,
    # wrapper-level settings: evaluation callback (error after exp),
    # round limits, early-stopping share and target binning function
    eval_func=mape_evalerror_exp,
    num_rounds=10000,
    num_es_rounds=120,
    es_share=.1,
    ybin=ybin,
)

In [12]:
# Keyword arguments for XGBoostRegressor with a Poisson objective (used by
# est3, which trains on the untransformed loss, hence the evaluation callback
# without exp). The last group is presumably interpreted by the
# XGBoostRegressor wrapper loaded via %run -- TODO confirm against ds_tools.
xgb_params2 = dict(
    # objective and tree shape
    objective="count:poisson",
    max_depth=4,
    min_child_weight=6,
    # learning rate and row/column sampling
    eta=0.05,
    subsample=0.7,
    colsample_bytree=0.5,
    silent=1,
    # wrapper-level settings: evaluation callback, round limit,
    # early-stopping rounds/share and target binning function
    eval_func=mape_evalerror,
    num_rounds=1000,
    num_es_rounds=None,
    es_share=.1,
    ybin=ybin,
)

In [13]:
def high_cardinality_zeroing(df, min_entries=50, substitute='zeroed'):
    """Collapse rare values of every object-dtype column into one label.

    For each column selected by ``select_dtypes(include=['object'])`` the
    values occurring fewer than ``min_entries`` times are replaced by
    ``substitute``. Missing values are not counted by ``value_counts`` and
    are therefore replaced as well. Counts are taken on the frame passed in,
    so the set of kept values depends on that frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, not modified.
    min_entries : int, default 50
        Minimum number of occurrences for a value to be kept.
    substitute : default 'zeroed'
        Replacement written in place of rare values.

    Returns
    -------
    pandas.DataFrame
        A modified copy of ``df``.
    """
    dfc = df.copy()
    for col in dfc.select_dtypes(include=['object']):
        vc = dfc[col].value_counts()
        frequent = vc[vc >= min_entries].index
        # .loc instead of .ix: .ix was deprecated in pandas 0.20 and removed
        # in pandas 1.0; for a boolean row mask plus a column label the two
        # select the same cells.
        dfc.loc[~dfc[col].isin(frequent), col] = substitute
    return dfc

def _df_to_records(frame):
    """Turn a DataFrame into a list of per-row ``{column: value}`` dicts."""
    return frame.to_dict(orient='records')


# Stateless pipeline steps: rare-category collapsing, then conversion to the
# list-of-dicts input that DictVectorizer consumes.
hcz_transf = FunctionTransformer(high_cardinality_zeroing, validate=False)
df2dict = FunctionTransformer(_df_to_records, validate=False)

# Full feature transform: collapse rare categories -> records -> dense matrix.
transf = make_pipeline(hcz_transf, df2dict, DictVectorizer(sparse=False))

In [14]:
# Second feature transform: a CountEncoder with default settings.
# CountEncoder is not defined in this file; presumably it comes from the
# %run of ds_tools/dstools/ml/transformers.py -- TODO confirm.
transf2 = CountEncoder()

In [15]:
# Third feature transform: a TargetMeanEncoder with reg_threshold=0.1.
# TargetMeanEncoder is not defined in this file; presumably it comes from the
# %run of ds_tools/dstools/ml/transformers.py -- TODO confirm.
# NOTE(review): not referenced by any of the est* definitions in this file.
transf3 = TargetMeanEncoder(reg_threshold=0.1)

In [16]:
# est1: DictVectorizer-based features (`transf`) + XGBoostRegressor fitted on
# log(loss); predictions are mapped back with exp.
# Recorded result (format matches the cv_test printout):
# mean: 1155.38459245, std: 1.66143441458
# cv execution time: 2173.90924597 sec
est1 = transf, TargetTransfRegressor(XGBoostRegressor(**xgb_params), np.log, np.exp)

In [17]:
# est2: CountEncoder features (`transf2`) + XGBoostRegressor fitted on
# log(loss); predictions are mapped back with exp.
# Recorded result (format matches the cv_test printout):
# mean: 1155.41551477, std: 1.95293210141
# cv execution time: 341.660254955 sec
est2 = transf2, TargetTransfRegressor(XGBoostRegressor(**xgb_params), np.log, np.exp)

In [18]:
# est3: CountEncoder features (`transf2`) + XGBoostRegressor with the Poisson
# settings of xgb_params2, fitted on the untransformed loss.
# Recorded result (format matches the cv_test printout):
# mean: 1193.08011539, std: 3.45706005863
# cv execution time: 382.747917891 sec
est3 = transf2, XGBoostRegressor(**xgb_params2)

In [19]:
# Settings passed to H2ODecorator('gbm', ...) for est4.
h2o_gbm_params = dict(
    model_id='kaggle_allstate_gbm',
    distribution='laplace',
    ntrees=1000,
    learn_rate=0.1,
    max_depth=4,
    sample_rate=0.7,
    col_sample_rate_per_tree=0.5,
)

# est4: H2ODecorator for 'gbm' with h2o_gbm_params. It is not a
# (transform, estimator) tuple, so cv_test hands it the raw feature frame.
# Recorded result (format matches the cv_test printout):
# mean: 1169.26570704, std: 8.97162412921
# cv execution time: 885.310971975 sec
est4 = H2ODecorator('gbm', h2o_gbm_params)

In [20]:
# Settings passed to H2ODecorator('xgb', ...) for est5.
h2o_xgb_params = dict(
    model_id='kaggle_allstate_xgb',
    distribution='poisson',
    ntrees=1000,
    learn_rate=0.1,
    max_depth=4,
    sample_rate=0.7,
    col_sample_rate_per_tree=0.5,
)

# est5: H2ODecorator for 'xgb' with h2o_xgb_params. It is not a
# (transform, estimator) tuple, so cv_test hands it the raw feature frame.
# Recorded result (format matches the cv_test printout):
# mean: 3036.83769373, std: 12.8838933342
# cv execution time: 322.227102995 sec
# NOTE(review): the recorded mean is far worse than for every other
# estimator in this file -- the configuration may be wrong; investigate.
est5 = H2ODecorator('xgb', h2o_xgb_params)

In [21]:
# est6: CountEncoder features (`transf2`) + XGBoostRegressor fitted on
# log(loss); predictions are mapped back with exp.
# NOTE(review): this expression is identical to est2; the differing recorded
# score may stem from the unseeded CV shuffle or from a different transform
# having been intended here -- TODO confirm.
# Recorded result (format matches the cv_test printout):
# mean: 1154.64991991, std: 4.90194453789
# cv execution time: 373.575634003 sec
est6 = transf2, TargetTransfRegressor(XGBoostRegressor(**xgb_params), np.log, np.exp)