# Links
- http://mlbootcamp.ru/championship/7/

# Discussions:
- [Скоро открытие ML Boot Camp III, блог mail.ru](https://habrahabr.ru/company/mailru/blog/321016/)
  - predict time/(m*n*k) instead of time
    - according to the complementary article
  - https://github.com/KarachunMikhail/mlbootcamp_matrix/blob/master/mlbootcamp.py

In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.decomposition import PCA
from sklearn.decomposition.truncated_svd import TruncatedSVD
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble.bagging import BaggingRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection.univariate_selection import SelectKBest, f_regression
from sklearn.gaussian_process import GaussianProcess
from sklearn.linear_model import Ridge
from sklearn.metrics import make_scorer
from sklearn.mixture.gmm import GMM
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing.data import PolynomialFeatures

In [2]:
%run ds_tools/dstools/ml/ensemble.py

In [3]:
%run ds_tools/dstools/ml/xgboost_tools.py



In [5]:
def mape(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred`` w.r.t. ``y_true``.

    The result is a fraction (it is not multiplied by 100) and is averaged
    along axis 0. There is no guard against zeros in ``y_true``.
    """
    relative_error = (y_pred - y_true) / y_true
    return np.average(np.abs(relative_error), axis=0)


def mape_evalerror(preds, dtrain):
    """Evaluation callback: return ``('mape', value)`` for ``preds``.

    ``dtrain`` only needs a ``get_label()`` method that yields the true
    targets; the value is ``mape(labels, preds)``.
    """
    labels = dtrain.get_label()
    score = mape(labels, preds)
    return 'mape', score


def mape_obj(preds, dtrain):
    """Custom objective callback returning the ``(grad, hess)`` pair.

    The gradient is the error relative to the label,
    ``(pred - label) / label``; the hessian is a constant 1.0 per row.
    ``dtrain`` only needs a ``get_label()`` method.
    """
    target = dtrain.get_label()
    gradient = (preds - target) / target
    hessian = np.ones(len(preds))
    return gradient, hessian


def ybin(y):
    """Map each target to a bucket index based on its base-2 logarithm.

    Bucket edges are the integers 0..8, so values below 1 fall into
    bucket 0 and values of 256 or more fall into bucket 9.
    """
    edges = np.arange(0, 9)
    log_target = np.log2(y)
    return np.digitize(log_target, bins=edges)

In [6]:
def dataset(path):
    """Load a feature table from CSV at ``path``.

    The ``memFreq`` and ``memtRFC`` columns may contain the literal string
    'None'; those entries become NaN and both columns are cast to float64.
    """
    frame = pd.read_csv(path)
    for column in ('memFreq', 'memtRFC'):
        cleaned = frame[column].replace('None', np.nan)
        frame[column] = cleaned.astype(np.float64)
    return frame

In [7]:
def cv_test(est):
    """Print mean and std of the 5-fold cross-validated MAPE of ``est``.

    Features are read from 'x_train.csv.gz' (via ``dataset``) and targets
    from 'y_train.csv'. Folds are shuffled and stratified on ``ybin(y)``.
    Nothing is returned; the summary is printed.
    """
    x = dataset('x_train.csv.gz')

    y = pd.read_csv('y_train.csv', squeeze=True)

    # StratifiedKFold comes from sklearn.model_selection, whose constructor
    # takes only splitter settings (n_splits, shuffle, random_state). The
    # stratification labels must be passed to split(); handing them to the
    # constructor (old cross_validation API) raises a TypeError.
    folds = StratifiedKFold(n_splits=5, shuffle=True)
    cv = list(folds.split(x, ybin(y)))

    scores = cross_val_score(est, x, y, scoring=make_scorer(mape), cv=cv)
    print('mean: {mean}, std: {std}'.format(mean=scores.mean(), std=scores.std()))

In [9]:
def submission(est, name='results'):
    """Fit ``est`` on the full training set and write test predictions.

    Training data comes from 'x_train.csv.gz' / 'y_train.csv', test features
    from 'x_test.csv.gz'. Predictions are written to ``name + '.csv'`` as a
    single column without header or index.
    """
    train_features = dataset('x_train.csv.gz')
    train_target = pd.read_csv('y_train.csv', squeeze=True)
    fitted = est.fit(train_features, train_target)

    test_features = dataset('x_test.csv.gz')
    predictions = pd.Series(
        fitted.predict(test_features),
        index=test_features.index,
        name='time',
    )
    predictions.to_csv(name + '.csv', header=False, index=False)

In [10]:
def pred_vs_true(est, path):
    """Dump hold-out predictions next to the true values as a TSV file.

    The training data is split 90/10; ``est`` is fitted on the 90% part and
    its predictions for the remaining rows are written to ``path`` in the
    columns 'pred' and 'true' (tab-separated, no index).
    """
    features = dataset('x_train.csv.gz')
    target = pd.read_csv('y_train.csv', squeeze=True)

    fit_x, holdout_x, fit_y, holdout_y = train_test_split(
        features, target, train_size=0.9)
    fitted = est.fit(fit_x, fit_y)
    holdout_pred = fitted.predict(holdout_x)

    report = pd.DataFrame({'pred': holdout_pred, 'true': holdout_y})
    report.to_csv(path, index=False, sep='\t')

In [11]:
def drop_transform(x):
    """Return a copy of ``x`` without the memType, os, cpuFull and cpuArch columns."""
    dropped_columns = ['memType', 'os', 'cpuFull', 'cpuArch']
    return x.drop(dropped_columns, axis=1)

# Pipeline step wrapping drop_transform; validate=False so the DataFrame is
# handed to the function without being checked/converted first.
drop_transformer = FunctionTransformer(func=drop_transform, validate=False)

In [12]:
# Baseline settings for the project XGBoostRegressor wrapper.
# 'num_es_rounds' / 'es_share' are wrapper options (presumably early
# stopping on a held-out share -- confirm in ds_tools xgboost_tools.py).
xgb_params_base = dict(
    objective="reg:linear",
    eta=0.1,
    min_child_weight=6,
    subsample=0.7,
    colsample_bytree=0.5,
    silent=1,
    max_depth=4,
    num_rounds=10000,
    num_es_rounds=120,
    es_share=.2,
    eval_func=mape_evalerror,
    ybin_func=ybin,
)

# Recorded CV result -- mean: 0.18989341816, std: 0.0160804510843
# cv execution time: 53.5373871326 sec
est1 = make_pipeline(drop_transformer, XGBoostRegressor(**xgb_params_base))

In [13]:
# Deeper trees, small learning rate, and the custom relative-error objective
# (mape_obj) passed through the wrapper's 'objective_func' option.
xgb_params2_3 = dict(
    objective="reg:linear",
    eta=0.01,
    min_child_weight=6,
    subsample=0.7,
    colsample_bytree=0.5,
    silent=1,
    max_depth=10,
    num_rounds=10000,
    eval_func=mape_evalerror,
    ybin_func=ybin,
    objective_func=mape_obj,
    num_parallel_tree=2,
)

# Recorded CV result -- mean: 0.100802273555, std: 0.00297206137388
# cv execution time: 1551.09467602 sec
est2_3 = make_pipeline(drop_transformer, XGBoostRegressor(**xgb_params2_3))

In [14]:
# Quick variant of the custom-objective model: larger eta, 1000 rounds,
# and es_share set to 0.
xgb_params2_4q = dict(
    objective="reg:linear",
    eta=0.05,
    min_child_weight=6,
    subsample=0.7,
    colsample_bytree=0.5,
    silent=1,
    max_depth=10,
    num_rounds=1000,
    eval_func=mape_evalerror,
    ybin_func=ybin,
    objective_func=mape_obj,
    num_parallel_tree=2,
    es_share=0.,
)

# Recorded CV result -- mean: 0.115525732228, std: 0.00485697975408
# cv execution time: 73.8630280495 sec
est2_4q = make_pipeline(drop_transformer, XGBoostRegressor(**xgb_params2_4q))

In [15]:
# Poisson-objective model: eta 0.01, 5000 rounds, es_share set to 0.
xgb_params4_8 = dict(
    objective="count:poisson",
    eta=0.01,
    min_child_weight=6,
    subsample=0.7,
    colsample_bytree=0.3,
    silent=1,
    max_depth=10,
    num_rounds=5000,
    eval_func=mape_evalerror,
    ybin_func=ybin,
    num_parallel_tree=2,
    es_share=0.,
)

# Recorded CV result -- mean: 0.0815279171801, std: 0.00207426442531
# cv execution time: 467.35233593 sec
est4_8 = make_pipeline(drop_transformer, XGBoostRegressor(**xgb_params4_8))

In [16]:
# Quick variant of the Poisson-objective model: eta 0.05, 1000 rounds.
xgb_params4_8q = dict(
    objective="count:poisson",
    eta=0.05,
    min_child_weight=6,
    subsample=0.7,
    colsample_bytree=0.3,
    silent=1,
    max_depth=10,
    num_rounds=1000,
    eval_func=mape_evalerror,
    ybin_func=ybin,
    num_parallel_tree=2,
    es_share=0.,
)

# Recorded CV result -- mean: 0.0895119550018, std: 0.00514490683152
# cv execution time: 82.7434458733 sec
est4_8q = make_pipeline(drop_transformer, XGBoostRegressor(**xgb_params4_8q))

In [17]:
# Quick Poisson XGBoost on 100 principal components of the imputed,
# standardized features.
# Recorded CV result -- mean: 0.188509803951, std: 0.0900312025872
# cv execution time: 101.356594801 sec
est7 = make_pipeline(
    drop_transformer, Imputer(), StandardScaler(),
    PCA(n_components=100),
    XGBoostRegressor(**xgb_params4_8q),
)

In [18]:
# Extra-trees regressor (10000 trees, all cores) on the imputed features.
# Recorded CV result -- mean: 0.113271580876, std: 0.00501539399312
# cv execution time: 974.13361311 sec
est8 = make_pipeline(
    drop_transformer, Imputer(),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=10000),
)

In [19]:
# Gaussian process on 30 principal components of the imputed,
# standardized features.
# Recorded CV result -- mean: 0.40358272167, std: 0.0333605756569
# cv execution time: 28.1571791172 sec
est11 = make_pipeline(
    drop_transformer, Imputer(), StandardScaler(),
    PCA(n_components=30),
    GaussianProcess(theta0=2.),
)



In [20]:
# First two principal components of the imputed, standardized features.
pca2c_transformer = make_pipeline(
    drop_transformer, Imputer(), StandardScaler(), PCA(n_components=2),
)

# Bag-of-words of the 'os' column reduced to 10 SVD components.
os_transformer = make_pipeline(
    FunctionTransformer(lambda frame: frame.os, validate=False),
    CountVectorizer(),
    TruncatedSVD(n_components=10),
)

# One-hot encoding of the 'cpuArch' column.
# NOTE(review): the dummy columns are derived from whatever frame is passed
# in, so train and test inputs may yield different column sets -- confirm.
arch_transformer = FunctionTransformer(
    lambda frame: pd.get_dummies(frame.cpuArch), validate=False)

# Cluster label (3-component GMM) in the 2-component PCA space, returned as
# a single column.
# NOTE(review): a fresh GMM is constructed and fitted on every call of the
# lambda, so labels produced for different inputs are not guaranteed to be
# consistent with each other -- confirm this is intended.
gmm_transformer = make_pipeline(
    drop_transformer, Imputer(), StandardScaler(), PCA(n_components=2),
    FunctionTransformer(
        lambda points: GMM(n_components=3).fit_predict(points)[:, np.newaxis]),
)

# Slow Poisson XGBoost on the base features plus the engineered blocks above.
# Recorded CV result -- mean: 0.0841074580969, std: 0.00253239384855
# cv execution time: 693.359354973 sec
est13 = make_pipeline(
    make_union(
        drop_transformer,
        gmm_transformer,
        os_transformer,
        arch_transformer,
        pca2c_transformer,
    ),
    XGBoostRegressor(**xgb_params4_8),
)

In [21]:
# Bagging of 10 quick Poisson XGBoost models; every bag uses all features
# and a full-size sample.
# Recorded CV result -- mean: 0.0951764830567, std: 0.00784125475434
# cv execution time: 876.678437948 sec
est14 = make_pipeline(
    drop_transformer, Imputer(),
    BaggingRegressor(
        base_estimator=XGBoostRegressor(**xgb_params4_8q),
        n_estimators=10,
        max_samples=1.,
        max_features=1.,
    ),
)

In [22]:
# Poisson-objective settings without an explicit es_share: eta 0.05,
# 5000 rounds.
xgb_params15 = dict(
    objective="count:poisson",
    eta=0.05,
    min_child_weight=6,
    subsample=0.7,
    colsample_bytree=0.3,
    silent=1,
    max_depth=10,
    num_rounds=5000,
    eval_func=mape_evalerror,
    ybin_func=ybin,
    num_parallel_tree=2,
)

# Pairwise interaction features, reduced to the 200 best by f_regression,
# fed into XGBoost.
# Recorded CV result -- mean: 0.126245924707, std: 0.00954889171685
# cv execution time: 158.827933073 sec
est15 = make_pipeline(
    drop_transformer, Imputer(), StandardScaler(),
    PolynomialFeatures(degree=2, interaction_only=True),
    SelectKBest(f_regression, k=200),
    XGBoostRegressor(**xgb_params15),
)

In [23]:
# One Ridge model per (os, cpuFreq, memSize_MB) group, using the project's
# PerGroupRegressor.
# Recorded CV result -- mean: 0.287046299445, std: 0.0499242435477
# cv execution time: 6.1778948307 sec
est16 = PerGroupRegressor(
    split_condition=['os', 'cpuFreq', 'memSize_MB'],
    estimator=make_pipeline(
        drop_transformer, Imputer(), StandardScaler(), Ridge(alpha=10),
    ),
    n_jobs=1,
    verbose=1,
)

In [24]:
# Linear-booster XGBoost settings. 'lambda' is a reserved word in Python,
# so the two regularisation keys are supplied through an unpacked mapping.
xgb_params17 = dict(
    booster='gblinear',
    objective="reg:linear",
    eta=0.01,
    num_rounds=100,
    es_share=.0,
    **{'lambda': .5, 'alpha': .5}
)

# One linear XGBoost model per (os, cpuFreq, memSize_MB) group.
# Recorded CV result -- mean: 0.296550976215, std: 0.0348970237629
# cv execution time: 26.448786974 sec
est17 = PerGroupRegressor(
    split_condition=['os', 'cpuFreq', 'memSize_MB'],
    estimator=make_pipeline(
        drop_transformer, Imputer(), StandardScaler(),
        XGBoostRegressor(**xgb_params17),
    ),
    n_jobs=1,
    verbose=1,
)

In [25]:
# One quick Poisson XGBoost model per (os, cpuFreq, memSize_MB) group.
# Recorded CV result -- mean: 0.120320752517, std: 0.00786868554231
# cv execution time: 866.464553833 sec
est18 = PerGroupRegressor(
    split_condition=['os', 'cpuFreq', 'memSize_MB'],
    estimator=make_pipeline(
        drop_transformer, Imputer(), StandardScaler(),
        XGBoostRegressor(**xgb_params4_8q),
    ),
    n_jobs=1,
    verbose=1,
)