In [1]:
import os
from functools import partial

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK, space_eval
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Data directory. Set the RETAILHERO_DATA_DIR environment variable to run the
# notebook on another machine; the original local path is used when it is unset.
path = os.environ.get('RETAILHERO_DATA_DIR', 'E:/retailhero-uplift/data')

In [4]:
# Read the training table from the data directory and preview its first rows.
uplift = pd.read_csv('{}/train.csv'.format(path), encoding='utf-8')
uplift.head()

Unnamed: 0,client_id,first_issue_date,first_redeem_date,age,gender,treatment_flg,target
0,000012768d,2017-08-05 15:40:48,2018-01-04 19:30:07,45,U,0,1
1,000036f903,2017-04-10 13:54:23,2017-04-23 12:37:56,72,F,1,1
2,00010925a5,2018-07-24 16:21:29,2018-09-14 16:12:49,83,U,1,1
3,0001f552b0,2017-06-30 19:20:38,2018-08-28 12:59:45,33,F,1,1
4,00020e7b18,2017-11-27 11:41:45,2018-01-10 17:50:05,73,U,1,1


# 0. Используемые функции

График ROC-кривой.

In [6]:
def roc_auc_plot(y_test, preds):
    """Draw the ROC curve for binary labels against predicted scores.

    Args:
        y_test: true labels; label 1 is treated as the positive class.
        preds: scores passed to roc_curve together with y_test.

    The figure is shown with plt.show(); nothing is returned. The seaborn
    font scale and colour codes are set globally as a side effect.
    """
    sns.set(font_scale=1.5)
    sns.set_color_codes("muted")

    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, preds, pos_label=1)

    _, ax = plt.subplots(figsize=(10, 8))
    ax.plot(false_pos_rate, true_pos_rate, lw=2, label='ROC curve')
    # Diagonal reference line from (0, 0) to (1, 1).
    ax.plot([0, 1], [0, 1])
    ax.set(
        xlim=(0.0, 1.0),
        ylim=(0.0, 1.05),
        xlabel='False Positive Rate',
        ylabel='True Positive Rate',
        title='ROC curve',
    )
    ax.legend(loc='best')
    plt.show()

Mean target кодирование на кроссвалидации.

In [8]:
def mean_target_encoding(train_df, y_train, valid_df, skf):
    """Replace object-dtype columns with their mean-target encoding.

    Training rows are encoded out-of-fold: for every split produced by
    ``skf.split`` the category means are computed on the training part of the
    split and written to the rows of its validation part. Validation rows are
    encoded with the category means of the whole training set. Categories
    without a known mean receive the global target mean.

    Args:
        train_df: training features; columns with dtype ``object`` are encoded.
        y_train: target values, matched to the rows of ``train_df`` by position.
        valid_df: validation features containing the same object columns.
        skf: splitter exposing ``split(X, y)`` that yields positional
            ``(train_idx, valid_idx)`` index arrays.

    Returns:
        ``(new_train_df, new_valid_df)``: copies of the inputs in which every
        object column is dropped and a ``<col>_mean_target`` column is added.
        The input frames are left unmodified.
    """
    cat_features = train_df.columns[train_df.dtypes == 'object'].tolist()

    # Attach the target by position so that an array target or a shuffled
    # index cannot be misaligned with the feature rows.
    labelled = train_df.copy()
    labelled['y'] = np.asarray(y_train)
    glob_mean = labelled['y'].mean()

    # Out-of-fold encodings, pre-filled with the global mean as a fallback for
    # rows that no validation fold covers.
    oof = {col: np.full(len(labelled), glob_mean, dtype=float) for col in cat_features}

    for train_idx, valid_idx in skf.split(labelled, y_train):
        fold_train = labelled.iloc[train_idx]
        fold_valid = labelled.iloc[valid_idx]

        for col in cat_features:
            fold_means = fold_train.groupby(col)['y'].mean()
            encoded = fold_valid[col].map(fold_means).fillna(glob_mean)
            oof[col][valid_idx] = encoded.to_numpy()

    new_train_df = train_df.drop(columns=cat_features)
    for col in cat_features:
        new_train_df[col + '_mean_target'] = oof[col]

    # Work on a copy of the validation frame so that the caller's data stays
    # intact and the calling cell can be re-run.
    new_valid_df = valid_df.drop(columns=cat_features)
    for col in cat_features:
        full_means = labelled.groupby(col)['y'].mean()
        encoded = valid_df[col].map(full_means).fillna(glob_mean)
        new_valid_df[col + '_mean_target'] = encoded.to_numpy()

    return new_train_df, new_valid_df

Подсчёт значения функции потерь на кросс-валидации при подборе гиперпараметров.

In [13]:
def optimize(params, cv_params, x, y, model):
    """Hyperopt objective: cross-validate one XGBoost parameter set with xgb.cv.

    Args:
        params: booster parameters sampled by hyperopt. A copy is used, so the
            caller's dict is not modified.
        cv_params: keyword arguments forwarded to ``xgb.cv`` (for example
            ``num_boost_round`` or ``nfold``). An ``'objective'`` entry, if
            present, is treated as a booster parameter: it is moved into
            ``params`` unless ``params`` already defines one.
        x: feature matrix used to build the ``xgb.DMatrix``.
        y: labels for ``x``.
        model: unused; kept so that existing callers passing it keep working.

    Returns:
        dict with ``'loss'`` (the last row of the last ``test-*-mean`` column
        of the ``xgb.cv`` result) and ``'status'`` (``STATUS_OK``).
    """
    params = dict(params)
    if 'max_depth' in params:
        # max_depth must be an integer even if the sampler produced a float.
        params['max_depth'] = int(params['max_depth'])

    cv_kwargs = dict(cv_params)
    objective = cv_kwargs.pop('objective', None)
    if objective is not None:
        params.setdefault('objective', objective)

    data = xgb.DMatrix(x, label=y)
    cv_result = xgb.cv(params=params, dtrain=data, **cv_kwargs)

    test_mean_columns = [
        column for column in cv_result.columns
        if column.startswith('test-') and column.endswith('-mean')
    ]
    score = float(cv_result[test_mean_columns[-1]].iloc[-1])

    return {'loss': score, 'status': STATUS_OK}

# 1. Подбор гиперпараметров с hyperopt

Основные гиперпараметры, используемые в библиотеке XGBoost
* _objective_ - целевая функция (функция потерь), определяющая решаемую задачу
* _eta_ - размер шага
* _max_depth_ - максимальная глубина дерева
* _min_child_weight_ - минимальная сумма весов объектов (гессианов), необходимая в дочернем узле
* _subsample_ - доля подвыборки для каждой итерации
* _colsample_bytree_ - доля колонок, участвующих в итерации
* _alpha_ - сила регуляризации L1
* _lambda_ - сила регуляризации L2
* _gamma_ - штраф на сложность деревьев
* _num_boost_round_ - число итераций (фиксируем, не следует менять вместе с _eta_)
* _early_stopping_rounds_ - число итераций для остановки, если не произошло улучшение метрики (фиксируем) 

Определим параметры кроссвалидации.

In [10]:
# Keyword arguments forwarded to xgb.cv.
# The learning objective is a booster parameter (it is part of the search
# space), not an xgb.cv keyword, so it is deliberately not listed here.
cv_params = {
    'num_boost_round': 200,
    'nfold': 5,
    'shuffle': True,
    'stratified': False,
    'maximize': False,
    'early_stopping_rounds': 20,
}

Определим пространство поиска.

In [11]:
# Hyperopt search space for the XGBoost booster parameters.
space_freq = {
    'booster': hp.choice('booster', ['gbtree', 'gblinear', 'dart']),
    'objective': 'binary:logistic',
    'max_depth': hp.choice('max_depth', [5, 8, 10, 12, 15]),
    # Continuous parameters, each sampled with hp.uniform(name, low, high).
    **{
        name: hp.uniform(name, low, high)
        for name, low, high in (
            ('min_child_weight', 0, 50),
            ('subsample', 0.5, 1),
            ('colsample_bytree', 0.5, 1),
            ('alpha', 0, 1),
            ('lambda', 0, 1),
            ('eta', 0.01, 1),
        )
    },
    'tree_method': 'hist',
}

In [None]:
# Bind every argument of optimize() except `params`, which hyperopt supplies on
# each evaluation. cv_params has to be bound too: optimize() requires it and
# fmin only passes the sampled hyperparameters.
# NOTE(review): X_train and y_train must be defined by an earlier cell.
optimization_function = partial(optimize,
                                cv_params=cv_params,
                                x=X_train,
                                y=y_train,
                                model=xgb.XGBClassifier)
trials = Trials()

# One search run: at most 50 evaluations or one hour, whichever comes first.
best = fmin(fn=optimization_function,
            space=space_freq,
            trials=trials,
            algo=tpe.suggest,
            max_evals=50,
            timeout=3600)

# fmin reports hp.choice parameters as option indices; space_eval maps the
# result back to the actual parameter values.
best_params = space_eval(space_freq, best)
best_params

In [12]:
# Settings for the final training run.
train_params = dict(
    num_boost_round=300,
    maximize=False,
    verbose_eval=False,
)