In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
import xgboost as xgb
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK, space_eval
from functools import partial
from hyperopt.pyll.base import scope
from sklift.metrics import uplift_at_k
from sklift.viz import plot_uplift_preds
from sklift.models import TwoModels
from sklift.models import SoloModel
from sklift.models import ClassTransformation

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Финальная оценка результатов

In [3]:
path = 'E:/retailhero-uplift/data'

In [4]:
uplift = pd.read_csv(path + '/uplift_features.csv', index_col='client_id', parse_dates=['first_issue_date', 'first_redeem_date'])
uplift.head()

Unnamed: 0_level_0,first_issue_date,first_redeem_date,age,gender,treatment_flg,target,total_trans_count,last_month_trans_count,regular_points_received_sum_all,express_points_received_sum_all,...,purchase_sum_sum_all,store_id_sum_all,product_quantity_sum_all,regular_points_received_sum_last_month,express_points_received_sum_last_month,regular_points_spent_sum_last_month,express_points_spent_sum_last_month,purchase_sum_sum_last_month,store_id_sum_last_month,product_quantity_sum_last_month
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000012768d,2017-08-05 15:40:48,2018-01-04 19:30:07,45,U,0,1,4,2,25.7,0.0,...,2803.0,4.0,3,10.0,0.0,0.0,0.0,1222.0,2.0,1
000036f903,2017-04-10 13:54:23,2017-04-23 12:37:56,72,F,1,1,32,8,54.9,60.0,...,9805.0,32.0,5,13.7,0.0,0.0,0.0,2784.0,8.0,4
00010925a5,2018-07-24 16:21:29,2018-09-14 16:12:49,83,U,1,1,18,8,31.8,0.0,...,5883.0,17.0,2,13.9,0.0,0.0,0.0,2858.0,8.0,2
0001f552b0,2017-06-30 19:20:38,2018-08-28 12:59:45,33,F,1,1,15,7,78.9,0.0,...,6155.18,15.0,4,47.0,0.0,0.0,0.0,2211.37,6.0,2
00020e7b18,2017-11-27 11:41:45,2018-01-10 17:50:05,73,U,1,1,18,5,286.1,0.0,...,25819.61,27.0,4,58.4,0.0,-76.0,-10.0,6096.27,7.0,1


In [5]:
dates = ['first_issue_date', 'first_redeem_date']
ages = ['age']
features = ['total_trans_count', 'last_month_trans_count', 'product_quantity_sum_all', 'regular_points_spent_sum_last_month']

In [6]:
index_train, index_valid = train_test_split(uplift.index, test_size=0.3, random_state=43)

X_train = uplift.loc[index_train, :].drop(['target', 'treatment_flg'], 1)
y_train = uplift.loc[index_train, 'target']
treat_train = uplift.loc[index_train, 'treatment_flg']

X_valid = uplift.loc[index_valid, :].drop(['target', 'treatment_flg'], 1)
y_valid = uplift.loc[index_valid, 'target']
treat_valid = uplift.loc[index_valid, 'treatment_flg']

Класс для выбора колонки.

In [7]:
class ColumnSelector(BaseEstimator, TransformerMixin):

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.column]]

Класс для удаления ошибок из значений возраста.

In [8]:
class AgeTransformator(BaseEstimator, TransformerMixin):
    
    def __init__(self, key, age_min, age_max):
        self.key = key
        self.age_min = age_min
        self.age_max = age_max
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, age_min  = 16, age_max = 80):
        X.loc[X[self.key] > self.age_max, self.key] = self.age_max
        X.loc[X[self.key] < self.age_min, self.key] = self.age_min
        return X

Класс для приведения даты к типу int.

In [9]:
class DateTransformator(BaseEstimator, TransformerMixin):
    
    def __init__(self, key):
        self.key = key
        self.columns = []
        
    def fit(self, X, y=None):
        self.columns = [self.key+'_'+'int']
        return self
    
    def transform(self, X):
        X[self.key].fillna(uplift[self.key].mode()[0], inplace=True)
        X[self.key+'_'+'int'] = X[self.key].astype(np.int64)
        return X[self.columns]

Класс для тренировки.

In [74]:
class ModelTuning:
    #оптимизация
    def optimize(params,x, y, model):        
        pipeline = Pipeline([('features', feats),
                            ('classifier', model(**params))])
        
        kf = StratifiedKFold(n_splits=20, shuffle=True)
        cv_scores = cross_val_score(pipeline, x, y, 
                                    cv=kf, 
                                    scoring='roc_auc')

        return np.mean(cv_scores)

    def tune(x, y, model):
        trials = Trials()
        best_params = fmin(fn=partial(ModelTuning.optimize, x=x, y=y, model=model),
                    space=space_freq, trials=trials, algo=tpe.suggest, max_evals=15)

        return best_params
    
    # Построение модели с ранней остановкой (early stopping)
    def get_predictions(best_params, train_params, train, y, valid, model):
        
        pipeline = Pipeline([('features', feats),
                            ('classifier', model(params=best_params, 
                                                 **train_params,
                                                 early_stopping_rounds=20))])
    
        pipeline.fit(X=train, y=y)
        preds = pipeline.predict(valid)
        return preds

    #численная оценка roc-auc
    def grade(y_test, preds):
        score = roc_auc_score(y_test, preds)
        return score
    
    #функция для двух независимых моделей
    def two_models(best_params, train_params, model, train, y_train, treat_train, valid, y_valid, k, strategy):
        pipeline = Pipeline([('features', feats),
                            ('classifier', model(params=best_params, 
                                                 **train_params))])
                
        pipeline1 = Pipeline([('features', feats),
                            ('classifier', model(params=best_params, 
                                                 **train_params))])
        
        tm = TwoModels(estimator_trmnt=pipeline, estimator_ctrl=pipeline1)

        tm = tm.fit(train, y_train, treat_train)
        uplift_tm = tm.predict(valid)
        tm_score_30 = uplift_at_k(y_true=y_valid, uplift=uplift_tm, treatment=treat_valid, strategy=strategy, k=k)
    
        return tm_score_30
    
    def solo_learner(best_params, train_params, train, model, y_train, treat_train, valid, y_valid, k, strategy):
        pipeline = Pipeline([('features', feats),
                            ('classifier', model(params=best_params, 
                                                 **train_params))])
        sm = SoloModel(estimator=pipeline)
        sm.fit(X_train, y_train, treat_train)
        uplift_sm = sm.predict(X_valid)
        sm_score_30 = uplift_at_k(y_true=y_valid, uplift=uplift_sm, treatment=treat_valid, strategy=strategy, k=k)
        return sm_score_30
    
        
    def class_transformation(best_params, train_params, train, model, y_train, treat_train, valid, y_valid, k, strategy):
        pipeline = Pipeline([('features', feats),
                            ('classifier', model(params=best_params, 
                                                 **train_params))])
        ct = ClassTransformation(estimator=pipeline)
        ct = ct.fit(X_train, y_train, treat_train)
        uplift_ct = ct.predict(X_valid)

        ct_score_30 = uplift_at_k(y_true=y_valid, uplift=uplift_ct, treatment=treat_valid, strategy=strategy, k=k)
        return ct_score_30

In [12]:
final_transformers = []


for age in ages:
    process_age = Pipeline([('selector', ColumnSelector(column='age')),
                ('process_age', AgeTransformator(key='age', age_min = 16, age_max = 80))
               ])
    
    final_transformers.append((age, process_age))

for date in dates:
    prepare_date = Pipeline([
        ('selector', ColumnSelector(column=date)),
        ('transformer', DateTransformator(key=date))
        ])
    
    final_transformers.append((date, prepare_date))
    
for feature in features:
    num_selector = Pipeline([
                ('selector', ColumnSelector(column=feature))
    ])
    
    final_transformers.append((feature, num_selector))

In [13]:
feats = FeatureUnion(final_transformers)
feature_processing = Pipeline([('feats', feats)])
feature_processing.fit_transform(X_train)
feature_processing.fit_transform(X_valid)

array([[ 4.60000000e+01,  1.52252139e+18,  1.52948309e+18, ...,
         9.00000000e+00,  3.00000000e+00,  0.00000000e+00],
       [ 4.90000000e+01,  1.50074818e+18,  1.50563905e+18, ...,
         7.00000000e+00,  6.00000000e+00, -1.80000000e+01],
       [ 3.40000000e+01,  1.54504250e+18,  1.54996771e+18, ...,
         5.00000000e+00,  4.00000000e+00,  0.00000000e+00],
       ...,
       [ 4.50000000e+01,  1.50289970e+18,  1.53537638e+18, ...,
         1.00000000e+00,  2.00000000e+00,  0.00000000e+00],
       [ 3.60000000e+01,  1.50256528e+18,  1.51524309e+18, ...,
         6.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [ 5.70000000e+01,  1.50861288e+18,  1.55172719e+18, ...,
         8.00000000e+00,  6.00000000e+00, -1.21000000e+02]])

Сделаем модель попроще.

In [60]:
#пространство поиска
space_freq = {
              'objective': 'binary:logistic',
              'max_depth': scope.int(hp.quniform('max_depth', 2, 4, 1)),
              'eta': hp.uniform('eta', 0.01, 0.3),
              }
    
train_params = {
                'num_boost_round': 150,
                'maximize': False,
                'verbose_eval': False}

In [61]:
best_params = ModelTuning.tune(x=X_train,
                              y=y_train,
                              model=xgb.XGBClassifier)

100%|███████████████████████████████████████████████| 15/15 [34:40<00:00, 138.70s/trial, best loss: 0.7768645867639131]


In [62]:
best_params = space_eval(space_freq, best_params)
best_params

{'eta': 0.19452171170790064, 'max_depth': 2, 'objective': 'binary:logistic'}

In [65]:
preds = ModelTuning.get_predictions(best_params=best_params, 
                        train_params=train_params, 
                        train=X_train, 
                        y=y_train, 
                        valid=X_valid, 
                        model=xgb.XGBClassifier)

In [68]:
ModelTuning.grade(y_test = y_valid,
                        preds=preds)

0.690471031569179

Как рассчитывается метрика uplift.
Все тестовые данные отсортированы в порядке убывания прогнозируемого роста. Затем рассчитывается конверсия в 30% лучших в экспериментальной группе и конверсию в 30% лучших в контрольной группе отдельно. Uplift - это разница между двумя преобразованиями.

In [71]:
ModelTuning.two_models(best_params = best_params, 
                       train_params = train_params, 
                       model = xgb.XGBClassifier, 
                       train = X_train, 
                       y_train = y_train, 
                       treat_train = treat_train, 
                       valid=X_valid,
                       y_valid = y_valid, 
                       k=0.3, 
                       strategy='by_group')

0.06912603584648391

In [72]:
ModelTuning.solo_learner(best_params = best_params, 
                       train_params = train_params,
                       model = xgb.XGBClassifier,
                       train = X_train, 
                       y_train = y_train, 
                       treat_train = treat_train, 
                       valid=X_valid,
                       y_valid = y_valid, 
                       k=0.3, 
                       strategy='by_group')

0.04371965770784314

In [73]:
ModelTuning.class_transformation(best_params = best_params, 
                       train_params = train_params,
                       model = xgb.XGBClassifier,
                       train = X_train, 
                       y_train = y_train, 
                       treat_train = treat_train, 
                       valid=X_valid,
                       y_valid = y_valid, 
                       k=0.3, 
                       strategy='by_group')

0.07021892608649721

Roc Auc подрос, а вот uplift - нет. Возможно, нужно строить более сильные фичи и делать другой принцип разбиения на трэйн и тест.