In [3]:
import numpy as np
import pandas as pd
from sklearn.base import RegressorMixin, BaseEstimator
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

%run ds_tools/dstools/ml/transformers.py

%run ds_tools/dstools/ml/xgboost_tools.py

%run ds_tools/dstools/h2o/sklearn_tools.py

In [4]:
class TargetTransfRegressor(BaseEstimator, RegressorMixin):
    """Regressor wrapper that trains its inner estimator on a transformed target.

    ``fit`` applies ``transf_to`` to ``y`` before delegating to ``base_est``;
    ``predict`` applies ``transf_from`` to the inner estimator's predictions.

    Parameters
    ----------
    base_est : object exposing ``fit(X, y)`` and ``predict(X)``
        The wrapped estimator. ``fit`` trains this very object; it is not
        copied here.
    transf_to : callable
        Applied to ``y`` before fitting.
    transf_from : callable
        Applied to the inner predictions before they are returned.
    """

    def __init__(self, base_est, transf_to, transf_from):
        # Constructor arguments are stored unchanged under their own names.
        self.transf_from = transf_from
        self.transf_to = transf_to
        self.base_est = base_est

    def fit(self, X, y):
        """Fit the wrapped estimator on ``X`` and the transformed ``y``; return ``self``."""
        y_transformed = self.transf_to(y)
        self.base_est.fit(X, y_transformed)
        return self

    def predict(self, X):
        """Predict with the wrapped estimator and pass the result through ``transf_from``."""
        inner_prediction = self.base_est.predict(X)
        return self.transf_from(inner_prediction)

In [5]:
def mape(y_true, y_pred):
    """Return the mean absolute difference between ``y_pred`` and ``y_true``.

    Despite the name, nothing is divided by ``y_true``: the result is a mean
    absolute error, averaged along axis 0.
    """
    abs_errors = np.abs(y_pred - y_true)
    return np.average(abs_errors, axis=0)


def mape_evalerror_exp(preds, dtrain):
    """Evaluation callback that scores on the exponentiated scale.

    Both ``preds`` and the labels from ``dtrain.get_label()`` are passed
    through ``np.exp`` before their mean absolute difference is taken.

    Returns
    -------
    tuple
        ``('mae', value)``.
    """
    predicted = np.exp(preds)
    actual = np.exp(dtrain.get_label())
    return 'mae', np.average(np.abs(predicted - actual), axis=0)


def mape_evalerror(preds, dtrain):
    """Evaluation callback that scores raw predictions against ``dtrain``'s labels.

    Returns ``('mape', value)`` where ``value`` is ``mape(labels, preds)``.
    """
    labels = dtrain.get_label()
    score = mape(labels, preds)
    return 'mape', score


def ybin(y):
    """Map ``y`` onto small integer bin labels.

    The values are cast to float64, divided by ``np.max(y)``, multiplied by 10
    and cast to ``np.byte``, so the maximum of ``y`` lands in bin 10.
    """
    fraction_of_max = y.astype(np.float64) / np.max(y)
    return (fraction_of_max * 10).astype(np.byte)

In [15]:
def cv_test(est, random_state=123):
    """Cross-validate ``est`` on ``train.csv.gz`` and print the score summary.

    Reads ``train.csv.gz`` (indexed by ``id``), uses the ``loss`` column as
    the target and every other column as features, runs 3-fold shuffled
    cross-validation scored with ``mape`` and prints the mean and standard
    deviation of the fold scores.

    Parameters
    ----------
    est : estimator or (transform, estimator) tuple
        A tuple is chained into a single pipeline so that the transform is
        re-fit on the training part of every fold.
    random_state : int or None, default 123
        Seed for the fold shuffling, so repeated runs score the same splits.
        The default matches the seed used by ``pred_vs_true``; pass ``None``
        for a different shuffle on each call.
    """
    df = pd.read_csv('train.csv.gz', index_col='id')

    features = df.drop('loss', axis=1)
    target = df.loss

    if isinstance(est, tuple):
        transform, estimator = est
        # Fitting the transform on the full data set before splitting would
        # let target-dependent encoders see the validation folds' targets.
        # Inside a pipeline it only ever sees the training rows of each fold.
        estimator = make_pipeline(transform, estimator)
    else:
        estimator = est

    cv = KFold(n_splits=3, shuffle=True, random_state=random_state)

    scores = cross_val_score(estimator, X=features, y=target, scoring=make_scorer(mape), cv=cv)
    print('mean: {mean}, std: {std}'.format(mean=scores.mean(), std=scores.std()))

In [7]:
def pred_vs_true(est, path):
    """Fit ``est`` on a 90% split of ``train.csv.gz`` and dump hold-out predictions.

    Reads ``train.csv.gz`` (indexed by ``id``), takes ``loss`` as the target,
    trains on 90% of the rows (``random_state=123``) and writes a
    tab-separated file with the columns ``pred`` and ``true`` for the
    remaining rows.

    Parameters
    ----------
    est : estimator or (transform, estimator) pair
        A tuple or list pair is chained with ``make_pipeline``. Anything else
        is used as-is, which mirrors what ``cv_test`` accepts.
    path : str
        Destination of the tab-separated output file.
    """
    df = pd.read_csv('train.csv.gz', index_col='id')
    features = df.drop('loss', axis=1)
    target = df.loss.values

    if isinstance(est, (tuple, list)):
        transform, estimator = est
        pl = make_pipeline(transform, estimator)
    else:
        # Already a complete estimator (for example a ready-made pipeline).
        pl = est

    x_train, x_test, y_train, y_test = train_test_split(features, target, train_size=0.9, random_state=123)
    y_pred = pl.fit(x_train, y_train).predict(x_test)
    pd.DataFrame({'pred': y_pred, 'true': y_test}).to_csv(path, index=False, sep='\t')

In [8]:
# Keyword settings unpacked into XGBoostRegressor for the models wrapped in
# TargetTransfRegressor (est1, est2, est6, est7).
xgb_params = dict(
    objective="reg:linear",
    eta=0.1,
    min_child_weight=6,
    subsample=0.7,
    colsample_bytree=0.5,
    # Exponentiates predictions and labels before scoring.
    eval_func=mape_evalerror_exp,
    silent=1,
    max_depth=4,
    num_rounds=10000,
    # NOTE(review): presumably the early-stopping patience -- the captured
    # training log reports "hasn't improved in 120 rounds"; confirm against
    # XGBoostRegressor.
    num_es_rounds=120,
    # NOTE(review): presumably the share of rows held out for early stopping
    # -- confirm against XGBoostRegressor.
    es_share=.1,
    ybin=ybin,
)

In [9]:
# Keyword settings unpacked into XGBoostRegressor for est3, which is fitted on
# the untransformed target.
xgb_params2 = dict(
    objective="count:poisson",
    eta=0.05,
    min_child_weight=6,
    subsample=0.7,
    colsample_bytree=0.5,
    # Scores predictions against the labels without any transformation.
    eval_func=mape_evalerror,
    silent=1,
    max_depth=4,
    num_rounds=1000,
    # NOTE(review): None presumably turns early stopping off -- confirm
    # against XGBoostRegressor.
    num_es_rounds=None,
    es_share=.1,
    ybin=ybin,
)

In [11]:
def high_cardinality_zeroing(df, min_entries=50, substitute='zeroed'):
    """Collapse infrequent values of object columns into one placeholder.

    For every object-dtype column, any value that occurs fewer than
    ``min_entries`` times is replaced by ``substitute``. Missing values are
    not counted by ``value_counts`` and are therefore replaced as well.
    Columns of other dtypes are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    min_entries : int, default 50
        Minimum number of occurrences a value needs in order to be kept.
    substitute : object, default 'zeroed'
        Replacement written in place of infrequent values.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the infrequent values replaced.
    """
    dfc = df.copy()
    for col in dfc.select_dtypes(include=['object']).columns:
        counts = dfc[col].value_counts()
        frequent = counts[counts >= min_entries].index
        # DataFrame.ix no longer exists in current pandas; .loc performs the
        # same boolean-mask / column-label assignment.
        dfc.loc[~dfc[col].isin(frequent), col] = substitute
    return dfc

def _frame_to_records(frame):
    """Convert a DataFrame into a list holding one dict per row."""
    return frame.to_dict(orient='records')


# Rare-value collapsing wrapped as a pipeline step.
hcz_transf = FunctionTransformer(high_cardinality_zeroing, validate=False)

# DataFrame -> list of row dicts, the input form DictVectorizer consumes.
df2dict = FunctionTransformer(_frame_to_records, validate=False)

# Collapse rare values, convert to row dicts, then vectorize into a dense array.
transf = make_pipeline(hcz_transf, df2dict, DictVectorizer(sparse=False))

# NOTE(review): the encoder factories below are not defined in this notebook;
# presumably they come from the %run'd ds_tools scripts -- confirm.
transf2 = count_encoder()
transf3 = target_mean_encoder(size_threshold=20)
transf4 = empirical_bayes_encoder_normal_distr()

In [16]:
# Recorded score summary for this pipeline -- mean: 1155.38459245, std: 1.66143441458
# `transf` features feeding XGBoost trained on log(loss); predictions are
# mapped back with exp.
est1 = make_pipeline(
    transf,
    TargetTransfRegressor(
        base_est=XGBoostRegressor(**xgb_params),
        transf_to=np.log,
        transf_from=np.exp,
    ),
)

In [17]:
# Recorded score summary for this pipeline -- mean: 1155.41551477, std: 1.95293210141
# `transf2` (count_encoder) features feeding XGBoost trained on log(loss);
# predictions are mapped back with exp.
est2 = make_pipeline(
    transf2,
    TargetTransfRegressor(
        base_est=XGBoostRegressor(**xgb_params),
        transf_to=np.log,
        transf_from=np.exp,
    ),
)

In [18]:
# Recorded score summary for this pipeline -- mean: 1193.08011539, std: 3.45706005863
# `transf2` (count_encoder) features feeding XGBoost configured by xgb_params2,
# fitted on the target as-is (no TargetTransfRegressor wrapper).
poisson_regressor = XGBoostRegressor(**xgb_params2)
est3 = make_pipeline(transf2, poisson_regressor)

In [19]:
# Settings handed to H2ODecorator for its 'gbm' model.
h2o_gbm_params = dict(
    model_id='kaggle_allstate_gbm',
    distribution='laplace',
    ntrees=1000,
    learn_rate=.1,
    max_depth=4,
    sample_rate=.7,
    col_sample_rate_per_tree=.5,
)

# Recorded score summary for this estimator -- mean: 1169.26570704, std: 8.97162412921
est4 = H2ODecorator('gbm', h2o_gbm_params)

In [18]:
# Recorded score summary for this pipeline -- mean: 1154.64991991, std: 4.90194453789
# `transf3` (target_mean_encoder) features feeding XGBoost trained on
# log(loss); predictions are mapped back with exp.
est6 = make_pipeline(
    transf3,
    TargetTransfRegressor(
        base_est=XGBoostRegressor(**xgb_params),
        transf_to=np.log,
        transf_from=np.exp,
    ),
)

In [19]:
%%time
# Cross-validate est6 with cv_test; %%time reports how long the cell takes.
cv_test(est6)

[0]	train-mae:3041.1	validation-mae:3035.43
Multiple eval metrics have been passed: 'validation-mae' will be used for early stopping.

Will train until validation-mae hasn't improved in 120 rounds.
[120]	train-mae:1163.03	validation-mae:1178.73
[240]	train-mae:1137.35	validation-mae:1164.95
[360]	train-mae:1123.28	validation-mae:1161.24
[480]	train-mae:1111.53	validation-mae:1159.18
[600]	train-mae:1100.89	validation-mae:1157.77
[720]	train-mae:1092.4	validation-mae:1157.22
[840]	train-mae:1084.09	validation-mae:1157.4
Stopping. Best iteration:
[825]	train-mae:1085.02	validation-mae:1156.93

[0]	train-mae:3042.26	validation-mae:3020.93
Multiple eval metrics have been passed: 'validation-mae' will be used for early stopping.

Will train until validation-mae hasn't improved in 120 rounds.
[120]	train-mae:1161.29	validation-mae:1171.96
[240]	train-mae:1135.8	validation-mae:1158.46
[360]	train-mae:1120.72	validation-mae:1152.84
[480]	train-mae:1109.44	validation-mae:1151.63
[600]	train-mae

In [13]:
# `transf4` (empirical_bayes_encoder_normal_distr) features feeding XGBoost
# trained on log(loss); predictions are mapped back with exp.
est7 = make_pipeline(
    transf4,
    TargetTransfRegressor(
        base_est=XGBoostRegressor(**xgb_params),
        transf_to=np.log,
        transf_from=np.exp,
    ),
)

In [16]:
%%time
# Cross-validate est7 with cv_test; %%time reports how long the cell takes.
cv_test(est7)

[0]	train-mae:3028.06	validation-mae:3053.95
Multiple eval metrics have been passed: 'validation-mae' will be used for early stopping.

Will train until validation-mae hasn't improved in 120 rounds.
[120]	train-mae:1161.49	validation-mae:1185.23
[240]	train-mae:1135.97	validation-mae:1172.01
[360]	train-mae:1122.02	validation-mae:1166.24
[480]	train-mae:1110.81	validation-mae:1163.61
[600]	train-mae:1101.16	validation-mae:1162.5
[720]	train-mae:1092.72	validation-mae:1161.18
[840]	train-mae:1084.41	validation-mae:1159.57
[960]	train-mae:1077.96	validation-mae:1159.35
[1080]	train-mae:1070.54	validation-mae:1158.29
Stopping. Best iteration:
[1048]	train-mae:1072.11	validation-mae:1158.01

[0]	train-mae:3037.67	validation-mae:3055.18
Multiple eval metrics have been passed: 'validation-mae' will be used for early stopping.

Will train until validation-mae hasn't improved in 120 rounds.
[120]	train-mae:1168.99	validation-mae:1174.61
[240]	train-mae:1142.27	validation-mae:1159.38
[360]	trai