# Links

- http://special.habrahabr.ru/beeline/

# Discussions

- [Как я победил в конкурсе BigData от Beeline, @nurumaik](https://habrahabr.ru/post/270367/)

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing.imputation import Imputer

In [2]:
%run ds_tools/dstools/ml/transformers.py

In [3]:
%run ds_tools/dstools/ml/xgboost_tools.py



In [4]:
def cv_test(est):
    """Cross-validate ``est`` on ``train.csv.gz`` and print mean/std of the scores.

    ``est`` is either a single estimator, or a ``(transformer, estimator)``
    tuple. The target is the ``y`` column; every other column is a feature.
    Nothing is returned; the summary is printed.
    """
    train = pd.read_csv('train.csv.gz')
    X = train.drop('y', axis=1)
    y = train.y

    if not isinstance(est, tuple):
        model, X_cv = est, X
    else:
        preprocess, model = est
        # NOTE(review): the transformer is fitted once on ALL rows (with the
        # target) before the folds are split, so a target-dependent
        # transformer sees the labels of the validation folds. Passing a
        # pipeline instead of a tuple keeps the fit inside each fold.
        X_cv = preprocess.fit_transform(X, y)

    cv_scores = cross_val_score(estimator=model, X=X_cv, y=y, cv=8, verbose=1)

    print('mean: {mean}, std: {std}'.format(mean=cv_scores.mean(), std=cv_scores.std()))

In [6]:
def submission(est):
    """Fit ``est`` on the full training set and write test predictions to ``results.csv``.

    ``est`` is either a single estimator/pipeline, or a
    ``(transformer, estimator)`` tuple that is combined with ``make_pipeline``.
    Training data comes from ``train.csv.gz`` (target column ``y``); the test
    set is ``test.csv.gz`` indexed by ``ID``. The output CSV has the ``ID``
    index and a single ``y`` column with the predictions.
    """
    train = pd.read_csv('train.csv.gz')
    X_train = train.drop('y', axis=1)
    y_train = train.y

    if isinstance(est, tuple):
        preprocess, clf = est
        fitted = make_pipeline(preprocess, clf).fit(X_train, y_train)
    else:
        fitted = est.fit(X_train, y_train)

    test = pd.read_csv('test.csv.gz', index_col='ID')

    # Keep the test index so every prediction is written next to its ID.
    predictions = pd.DataFrame({'y': fitted.predict(test)}, index=test.index)
    predictions.to_csv('results.csv', index_label='ID')

In [7]:
def hyperopt():
    """Search XGBoost hyper-parameters with hyperopt's TPE and print the best point.

    Tunes ``max_depth``, ``min_child_weight`` and ``gamma`` over 100
    evaluations. Each evaluation runs a 3-fold cross-validation of the
    ``transf2`` + ``XGBoostClassifier`` pipeline on ``train.csv.gz``
    (target column ``y``). Nothing is returned; the best point is printed.
    """
    # Imported locally under an alias: this function shares its name with the package.
    import hyperopt as hpo

    df = pd.read_csv('train.csv.gz')

    features = df.drop('y', axis=1)
    target = df.y

    def hyperparam_objective(args):
        """Return the loss (negated mean CV score) for one sampled point."""
        xgb_params_ho = {
            "objective": "multi:softprob",
            "num_class": 7,
            "eta": 0.005,
            "num_rounds": 10000,
            # hp.quniform yields floats (e.g. 7.0); tree depth must be an integer.
            "max_depth": int(args['max_depth']),
            "min_child_weight": args['min_child_weight'],
            "gamma": args['gamma'],
            "subsample": 0.7,
            "colsample_bytree": 0.7,
            "scale_pos_weight": 1,
            "silent": 2
        }

        est = make_pipeline(
            transf2,
            XGBoostClassifier(**xgb_params_ho),
        )

        scores = cross_val_score(
            estimator=est,
            X=features,
            y=target,
            cv=3,
            n_jobs=-1,
            verbose=1)

        # fmin MINIMISES the objective, while cross_val_score follows the
        # "greater is better" convention. Returning the raw mean would make
        # the search pick the worst parameters, so the score is negated.
        return -scores.mean()

    space = {
        'max_depth': hpo.hp.quniform('max_depth', 5, 20, 1),
        'min_child_weight': hpo.hp.quniform('min_child_weight', 1, 20, 1),
        'gamma': hpo.hp.quniform('gamma', 0, 10, 1)
    }

    best = hpo.fmin(hyperparam_objective, space, algo=hpo.tpe.suggest, max_evals=100)
    print(best)

In [5]:
def eval_accuracy(preds, dtrain):
    """Evaluation callback returning ``('accuracy', -accuracy)``.

    ``preds`` is indexed along axis 1 to pick the highest-scoring class per
    row; ``dtrain`` must expose ``get_label()`` with the true labels.
    The accuracy is negated, so a lower returned value means a better model.
    """
    true_labels = dtrain.get_label()
    predicted_class = np.argmax(preds, axis=1)
    return 'accuracy', -accuracy_score(true_labels, predicted_class)

In [6]:
# Feature pipeline 2: high_cardinality_zeroing(49) -> df2dict ->
# dense DictVectorizer -> median imputation.
# Used by hyperopt() and est2.
# NOTE(review): high_cardinality_zeroing and df2dict are not defined in this
# notebook; presumably they come from the %run'd ds_tools transformers.py.
transf2 = make_pipeline(
    high_cardinality_zeroing(49),
    df2dict(),
    DictVectorizer(sparse=False),
    Imputer(strategy='median'),
)

In [7]:
# Feature pipeline 3: count_encoder -> median imputation. Used by est3.
# NOTE(review): count_encoder is not defined in this notebook; presumably it
# comes from the %run'd ds_tools transformers.py.
transf3 = make_pipeline(
    count_encoder(),
    Imputer(strategy='median'),
)

In [8]:
# Feature pipeline 4: high_cardinality_zeroing(top=20) ->
# multi_class_target_share_encoder(size_threshold=1) -> median imputation.
# Used by est4.
# NOTE(review): both custom transformers are defined outside this notebook
# (presumably in the %run'd ds_tools transformers.py); the exact meaning of
# `top` and `size_threshold` should be checked there.
transf4 = make_pipeline(
    high_cardinality_zeroing(top=20),
    multi_class_target_share_encoder(size_threshold=1),
    Imputer(strategy='median'),
)

In [9]:
# Feature pipeline 5: multi_class_target_share_encoder(size_threshold=100) ->
# median imputation. Unlike transf4 there is no high_cardinality_zeroing step
# and the size threshold is 100 instead of 1. Used by est6.
transf5 = make_pipeline(
    multi_class_target_share_encoder(size_threshold=100),
    Imputer(strategy='median'),
)

In [47]:
# Feature pipeline 6: empyrical_bayes_encoder (default arguments) ->
# median imputation. Used by est7.
# NOTE(review): empyrical_bayes_encoder is defined outside this notebook
# (presumably in the %run'd ds_tools transformers.py); the spelling is the
# helper's own name.
transf6 = make_pipeline(
    empyrical_bayes_encoder(),
    Imputer(strategy='median'),
)

In [48]:
# Shared XGBoostClassifier keyword arguments for est2..est7.
# Compared with the dict built inside hyperopt(): eta is 0.001 (vs 0.005),
# max_depth/min_child_weight/gamma are fixed, silent is 0 (vs 2), and
# "verbose" / "eval_func" are added.
# NOTE(review): XGBoostClassifier is the project wrapper loaded via %run; how
# it interprets "num_rounds", "verbose" and "eval_func" is not visible here.
xgb_params2 = {
    "objective": "multi:softprob",
    "num_class": 7,
    "eta": 0.001,
    "num_rounds": 10000,
    "max_depth": 9,
    "min_child_weight": 6,
    "gamma": 0,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "scale_pos_weight": 1,
    "silent": 0,
    "verbose": 10,
    # Custom metric defined in this notebook; it returns the NEGATED accuracy.
    "eval_func": eval_accuracy,
}

In [50]:
# transf2 + XGBoost, given as a (transformer, estimator) tuple: cv_test fits
# the transformer on the whole training set before cross-validating.
# Recorded result (format matches cv_test's output):
# mean: 0.753299206752, std: 0.00461122957968
est2 = transf2, XGBoostClassifier(**xgb_params2)

In [51]:
# transf3 + XGBoost, given as a (transformer, estimator) tuple: cv_test fits
# the transformer on the whole training set before cross-validating.
# Recorded result (format matches cv_test's output):
# mean: 0.751939373068, std: 0.00569865718706
est3 = transf3, XGBoostClassifier(**xgb_params2)

In [52]:
# transf4 + XGBoost as a single pipeline, so the transformer is re-fitted
# inside each cross-validation fold.
# Recorded result (format matches cv_test's output):
# mean: 0.753899536611, std: 0.0042606233231
est4 = make_pipeline(transf4, XGBoostClassifier(**xgb_params2))

In [54]:
# transf5 + XGBoost as a single pipeline (note: est6 uses transf5; there is
# no est5 in this part of the notebook).
# Recorded result (format matches cv_test's output):
# mean: 0.7407190861094108, std: 0.006672093040968181
est6 = make_pipeline(transf5, XGBoostClassifier(**xgb_params2))

In [55]:
# transf6 + XGBoost as a single pipeline (note: est7 uses transf6).
# Recorded result (format matches cv_test's output):
# mean: 0.753339450035363, std: 0.005114320275556508
est7 = make_pipeline(transf6, XGBoostClassifier(**xgb_params2))