# Links
- https://www.kaggle.com/c/prudential-life-insurance-assessment

In [17]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import Imputer
from sklearn.tree import DecisionTreeClassifier
from scipy.optimize import fmin_powell

In [2]:
%run ds_tools/dstools/ml/metrics.py

In [3]:
%run ds_tools/dstools/ml/ensemble.py

In [4]:
%run ds_tools/dstools/ml/xgboost_tools.py



In [6]:
def preds_to_rank(preds, min, max):
    """Convert continuous predictions into integer ranks.

    The values are rounded with ``np.round``, limited to the closed interval
    ``[min, max]`` and finally cast to ``int``.
    """
    rounded = np.round(preds)
    bounded = np.clip(rounded, min, max)
    return bounded.astype(int)

In [7]:
def qwk_score(est, features, labels):
    """Score ``est`` on ``features`` with quadratic weighted kappa.

    The estimator's raw predictions are converted to integer ranks bounded by
    the smallest and largest value found in ``labels``; the result of
    ``quadratic_weighted_kappa(labels, ranks)`` is returned.
    """
    predicted = est.predict(features)
    lowest, highest = np.min(labels), np.max(labels)
    ranks = preds_to_rank(predicted, lowest, highest)
    return quadratic_weighted_kappa(labels, ranks)

In [8]:
def score_offset(pred_base, pred_modified, labels, offset, rank_number, scorer):
    """Score the predictions after shifting one integer bucket by ``offset``.

    Entries of ``pred_base`` whose integer part equals ``rank_number`` are
    written into ``pred_modified`` as ``pred_base + offset``.  Note that
    ``pred_modified`` is updated IN PLACE, so shifts applied by earlier calls
    for other buckets stay in effect.  The modified predictions are then
    converted to ranks bounded by the min/max of ``labels`` and passed to
    ``scorer(labels, ranks)``, whose result is returned.
    """
    in_rank = pred_base.astype(int) == rank_number
    pred_modified[in_rank] = pred_base[in_rank] + offset
    ranks = preds_to_rank(pred_modified, np.min(labels), np.max(labels))
    return scorer(labels, ranks)

def apply_offsets(data, offsets):
    """Shift every prediction by the offset belonging to its integer bucket.

    A prediction falls into bucket ``j`` when its integer part
    (``astype(int)``, i.e. truncation toward zero) equals ``j``; it is then
    replaced by ``prediction + offsets[j]``.  Predictions whose bucket has no
    entry in ``offsets`` are returned unchanged.

    Parameters
    ----------
    data : array-like
        Raw predictions.  Not modified.
    offsets : sequence of float
        ``offsets[j]`` is added to the predictions of bucket ``j``.

    Returns
    -------
    numpy.ndarray
        A new array with the dtype of ``data`` (so integer input truncates
        the shifted values).
    """
    # Accept plain sequences as well as arrays / Series.
    data = np.asarray(data)
    # Buckets are derived from the ORIGINAL values only, so compute them once;
    # a value that was already shifted is never shifted a second time.
    buckets = data.astype(int)
    shifted = np.copy(data)
    for bucket, offset in enumerate(offsets):
        selected = buckets == bucket
        shifted[selected] = data[selected] + offset
    return shifted

def minimize_reminders(preds, true, scorer):
    """Fit one additive offset per integer bucket of ``preds``.

    Buckets ``0 .. n-1`` are processed in order, where ``n`` is the number of
    distinct values in ``true``.  For each bucket Powell's method searches the
    offset that maximises ``scorer(true, ranks)`` (the objective is the
    negated score times 100), while the offsets already found for earlier
    buckets stay applied.

    Parameters
    ----------
    preds : numpy.ndarray
        Raw predictions of the base model.
    true : array-like
        Ground-truth labels.
    scorer : callable
        ``scorer(labels, ranks)`` returning a score to maximise.

    Returns
    -------
    numpy.ndarray
        Array of ``n`` offsets, indexed by bucket.
    """
    offsets = np.zeros(len(set(true)))
    # Working copy that accumulates the shifts of the buckets handled so far;
    # ``score_offset`` writes into it in place.
    optimized_preds = apply_offsets(preds, offsets)
    for j in range(len(offsets)):
        def train_offset(x, rank=j):
            return -score_offset(preds, optimized_preds, true, x, rank, scorer) * 100

        best = fmin_powell(train_offset, offsets[j])
        # fmin_powell hands back an array; pull the scalar out explicitly
        # instead of relying on implicit array-to-scalar conversion.
        offsets[j] = np.asarray(best, dtype=float).ravel()[0]

        # The optimiser's last objective evaluation is not necessarily at the
        # returned optimum, so the working copy may hold a different trial
        # offset for this bucket.  Re-apply the chosen offset so that later
        # buckets are tuned against the state ``apply_offsets`` will produce.
        in_rank = preds.astype(int) == j
        optimized_preds[in_rank] = preds[in_rank] + offsets[j]
    return offsets

class RemindersMinimizingRegressor(BaseEstimator, RegressorMixin):
    """Regressor wrapper that post-corrects predictions with fitted offsets.

    ``fit`` trains ``base_estimator`` and then learns per-bucket offsets with
    ``minimize_reminders`` on the training predictions; ``predict`` adds those
    offsets to the base estimator's predictions via ``apply_offsets``.
    """

    def __init__(self, base_estimator, scorer):
        # ``offsets`` stays ``None`` until ``fit`` has been called.
        self.offsets = None
        self.scorer = scorer
        self.base_estimator = base_estimator

    def fit(self, X, y):
        """Fit the base estimator on ``(X, y)`` and learn the offsets.

        The offsets are optimised on the predictions for the same ``X`` the
        base estimator was trained on.  Returns ``self``.
        """
        model = self.base_estimator
        model.fit(X, y)
        self.offsets = minimize_reminders(model.predict(X), y, self.scorer)
        return self

    def predict(self, X):
        """Return base-estimator predictions for ``X`` shifted by the offsets."""
        return apply_offsets(self.base_estimator.predict(X), self.offsets)

In [9]:
def cv_test(est):
    """Cross-validate ``est`` on ``train.csv.gz`` and print mean/std of QWK.

    The ``Response`` column is the target and all remaining columns are the
    features.  Scoring uses ``qwk_score`` over 3 shuffled stratified folds.
    """
    df = pd.read_csv('train.csv.gz', index_col='Id')
    features = df.drop(['Response'], axis=1)
    target = df['Response'].values
    scores = cross_val_score(
        estimator=est,
        X=features,
        y=target,
        # sklearn.model_selection.StratifiedKFold takes the number of splits,
        # not the labels; the labels reach it through cross_val_score's ``y``.
        cv=StratifiedKFold(n_splits=3, shuffle=True),
        scoring=qwk_score,
        n_jobs=1,
        verbose=1)
    print('mean: {mean}, std: {std}'.format(mean=scores.mean(), std=scores.std()))

In [10]:
def submission(est, name='results'):
    """Fit ``est`` on the training set and write test predictions to CSV.

    Trains on ``train.csv.gz`` (target column ``Response``), predicts
    ``test.csv.gz``, converts the predictions to ranks bounded by the min/max
    of the training target and writes them to ``<name>.csv`` with an ``Id``
    index column and a ``Response`` column.
    """
    train = pd.read_csv('train.csv.gz', index_col='Id')
    target = train['Response'].values
    features = train.drop(['Response'], axis=1)
    fitted = est.fit(features, target)

    test_frame = pd.read_csv('test.csv.gz', index_col='Id')
    raw = fitted.predict(test_frame)
    ranks = preds_to_rank(raw, np.min(target), np.max(target))

    out = pd.Series(ranks, index=test_frame.index, name='Response')
    out.to_csv(name + '.csv', index_label='Id', header=True)

In [11]:
def pred_vs_true(est, path):
    """Dump hold-out predictions next to the true labels as a TSV file.

    ``train.csv.gz`` is split 90% / 10%; ``est`` is fitted on the larger part
    and its raw predictions for the smaller part are written to ``path`` in
    the columns ``pred`` and ``true`` (tab separated, no index).
    """
    frame = pd.read_csv('train.csv.gz', index_col='Id')
    target = frame['Response'].values
    features = frame.drop(['Response'], axis=1)

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, train_size=0.9)
    fitted = est.fit(x_train, y_train)

    comparison = pd.DataFrame({'pred': fitted.predict(x_test), 'true': y_test})
    comparison.to_csv(path, index=False, sep='\t')

In [12]:
def results_corr(pred_files):
    """Print the correlation matrix between several prediction files.

    Each file is read as a CSV indexed by ``Id``; a single remaining column is
    squeezed to a Series.  The predictions are cast to ``int32`` and their
    ``np.corrcoef`` matrix is printed, labelled with the file names.

    Parameters
    ----------
    pred_files : list of str
        Paths of the prediction CSV files.
    """
    preds = np.array(
        # ``.squeeze('columns')`` replaces the ``squeeze=True`` keyword that
        # newer pandas releases no longer accept.
        [pd.read_csv(path, index_col='Id').squeeze('columns') for path in pred_files],
        dtype=np.int32)
    corr = pd.DataFrame(np.corrcoef(preds), index=pred_files, columns=pred_files)
    print(corr)

In [13]:
def submission_mix(pred_files, name, min_rank=1, max_rank=8):
    """Blend prediction files by averaging and write the result to CSV.

    Every file is read as a CSV indexed by ``Id``.  The row-wise mean of all
    files is converted to an integer rank within ``[min_rank, max_rank]`` and
    written to ``<name>.csv`` with an ``Id`` index column and a ``Response``
    column.

    Parameters
    ----------
    pred_files : list of str
        Paths of the prediction CSV files to blend.
    name : str
        Output file name without the ``.csv`` extension.
    min_rank, max_rank : int, optional
        Bounds used when clipping the blended ranks (defaults 1 and 8).
    """
    preds = pd.DataFrame({
        # ``.squeeze('columns')`` replaces the ``squeeze=True`` keyword that
        # newer pandas releases no longer accept.
        path: pd.read_csv(path, index_col='Id').squeeze('columns')
        for path in pred_files
    })
    mix = preds_to_rank(preds.mean(axis=1), min_rank, max_rank)
    res = pd.Series(mix, index=preds.index, name='Response')
    res.to_csv(name + '.csv', index_label='Id', header=True)

In [14]:
def submission_mix_m(pred_files, name):
    """Blend prediction files by row-wise mode and write the result to CSV.

    Every file is read as a CSV indexed by ``Id``.  For each row the first
    column of ``DataFrame.mode(axis=1)`` is taken; rows where that value is
    null fall back to the row-wise median.  The result is cast to ``int`` and
    written to ``<name>.csv`` with an ``Id`` index column and a ``Response``
    column.

    Parameters
    ----------
    pred_files : list of str
        Paths of the prediction CSV files to blend.
    name : str
        Output file name without the ``.csv`` extension.
    """
    preds = pd.DataFrame({
        # ``.squeeze('columns')`` replaces the ``squeeze=True`` keyword that
        # newer pandas releases no longer accept.
        path: pd.read_csv(path, index_col='Id').squeeze('columns')
        for path in pred_files
    })
    mix = preds.mode(axis=1)[0]
    # fillna aligns on the index, so only the null rows receive the median;
    # it also avoids assigning into a column extracted from another frame.
    mix = mix.fillna(preds.median(axis=1))
    res = pd.Series(mix.astype(int), name='Response')
    res.to_csv(name + '.csv', index_label='Id', header=True)

In [18]:
def _frame_to_records(frame):
    """Return ``frame.to_dict(orient='records')`` for the given DataFrame."""
    return frame.to_dict(orient='records')


# Stateless step that turns a DataFrame into the record dicts DictVectorizer
# consumes.
df2dict = FunctionTransformer(_frame_to_records, validate=False)

# Shared preprocessing: DataFrame -> dict records -> dense matrix -> median
# imputation.
# NOTE(review): ``Imputer`` was removed from sklearn.preprocessing in newer
# scikit-learn releases (``sklearn.impute.SimpleImputer`` replaces it) --
# confirm which version this notebook targets.
transf1 = make_pipeline(
    df2dict,
    DictVectorizer(sparse=False),
    Imputer(strategy='median'),
)

In [19]:
# Keyword arguments unpacked into XGBoostRegressor(**xgb_params) by the
# pipelines in this notebook.
xgb_params = dict(
    objective="reg:linear",
    eta=0.01,
    min_child_weight=6,
    subsample=0.7,
    colsample_bytree=0.7,
    silent=1,
    max_depth=6,
    num_rounds=10000,
    num_es_rounds=120,
    es_share=0.05,
)

In [21]:
# Plain XGBoost regressor on top of the shared preprocessing.
# Recorded CV result -- mean: 0.605788631592, std: 0.00317117715894
# Recorded CV execution time: 334.374572039 sec
pl1 = make_pipeline(
    transf1,
    XGBoostRegressor(**xgb_params),
)

In [22]:
# Random forest (200 trees, all cores) on top of the shared preprocessing.
# Recorded CV result -- mean: 0.58282885229129844, std: 0.0036042550445012792
# Recorded CV execution time: 398.817314148 sec
pl3 = make_pipeline(
    transf1,
    RandomForestRegressor(
        n_estimators=200,
        n_jobs=-1,
        verbose=1,
    ),
)

In [23]:
# Extra-trees (200 trees, all cores) on top of the shared preprocessing.
# Recorded CV result -- mean: 0.58556649850197451, std: 0.001689581447944851
# Recorded CV execution time: 437.633708954 sec
pl4 = make_pipeline(
    transf1,
    ExtraTreesRegressor(
        n_estimators=200,
        n_jobs=-1,
        verbose=1,
    ),
)

In [24]:
# ModelEnsembleRegressor with a single XGBoost intermediate estimator and a
# depth-2 decision tree as the assembly estimator.
# Recorded CV result -- mean: 0.610298252514, std: 0.0051761541879
# Recorded CV execution time: 344.473729849 sec
pl10 = make_pipeline(
    transf1,
    ModelEnsembleRegressor(
        intermediate_estimators=[XGBoostRegressor(**xgb_params)],
        assembly_estimator=DecisionTreeClassifier(max_depth=2),
        ensemble_train_size=1,
    ),
)

In [25]:
# XGBoost wrapped in RemindersMinimizingRegressor, with the offsets tuned
# against quadratic weighted kappa.
# Recorded CV result -- mean: 0.654195672603, std: 0.0032825843272
# Recorded CV execution time: 386.967167854 sec
pl19 = make_pipeline(transf1, RemindersMinimizingRegressor(
    base_estimator=XGBoostRegressor(**xgb_params),
    scorer=quadratic_weighted_kappa))