Домашнее задание
1. скачать набор данных маркетинговых кампаний отсюда https://www.kaggle.com/davinwijaya/customer-retention
2. там поле conversion - это целевая переменная, а offer - коммуникация. Переименовать поля (conversion -> target, offer -> treatment) и привести поле treatment к бинарному виду (1 или 0, т.е было какое-то предложение или нет) - значение No Offer означает отсутствие коммуникации, а все остальные - наличие.
3. сделать разбиение набора данных не тренировочную и тестовую выборки
4. сделать feature engineering на ваше усмотрение (допускается свобода выбора методов)
5. провести uplift-моделирование 3 способами: одна модель с признаком коммуникации (S learner), модель с трансформацией таргета (трансформация классов п. 2. 1) и вариант с двумя независимыми моделями
6. в конце вывести единую таблицу сравнения метрик uplift@10%, uplift@20% этих 3 моделей
7. построить модель UpliftTreeClassifier и попытаться описать словами полученное дерево
8. (опционально) для модели S learner (модель с дополнительным признаком коммуникации) построить зависимость таргета (конверсии - поле conversion) от значения uplift: 1) сделать прогноз и получить uplift для тестовой выборки 2) отсортировать тестовую выборку по uplift по убыванию 3) разбить на децили (pandas qcut вам в помощь) 4) для каждого дециля посчитать среднюю conversion
9. (опционально) построить модель UpliftRandomForestClassifier и попытаться описать словами полученное дерево

In [384]:
%matplotlib inline

import pandas as pd
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion
from catboost import CatBoostClassifier

from sklift.metrics import uplift_at_k
from sklift.viz import plot_uplift_preds
from sklift.models import SoloModel
from sklift.models import ClassTransformation
from sklift.models import TwoModels

In [385]:
#  !pip install scikit-uplift==0.2.0

In [386]:
# Чтение данных
df = pd.read_csv('HW_data.csv', sep=',')

In [387]:
df.rename(columns={'conversion': 'target', 'offer': 'treatment'}, inplace=True)
df.head(3)

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,treatment,target
0,10,142.44,1,0,Surburban,0,Phone,Buy One Get One,0
1,6,329.08,1,1,Rural,1,Web,No Offer,0
2,7,180.65,0,1,Surburban,1,Web,Buy One Get One,0


In [388]:
df.loc[df['treatment'] == 'No Offer', 'treatment'] = 0
df.loc[~(df['treatment'] == 0), 'treatment'] = 1
df['treatment'].value_counts()

1    42694
0    21306
Name: treatment, dtype: int64

In [389]:
target_name = 'target'
cat_features = ['zip_code', 'channel'] #делаем get_dummies
base_features = ['used_discount', 'used_bogo', 'is_referral'] #ничего не делаем
number_deatures = ['recency', 'history'] #standartscaller

In [390]:
stdscal = StandardScaler()
df[number_deatures] = stdscal.fit_transform(df[number_deatures])
df

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,treatment,target
0,1.207751,-0.389003,1,0,Surburban,0,Phone,1,0
1,0.067359,0.339614,1,1,Rural,1,Web,0,0
2,0.352457,-0.239836,0,1,Surburban,1,Web,1,0
3,0.922653,1.693278,1,0,Rural,1,Web,1,0
4,-1.073034,-0.768068,1,0,Urban,0,Web,1,0
...,...,...,...,...,...,...,...,...,...
63995,1.207751,-0.533055,1,0,Urban,0,Web,1,0
63996,-0.217739,-0.793170,0,1,Urban,1,Phone,1,0
63997,0.067359,-0.827992,1,0,Urban,1,Phone,1,0
63998,-1.358132,1.213532,1,0,Surburban,1,Multichannel,1,0


In [391]:
#кодируем все категориальные признаки с помощью get_dummies
df = pd.get_dummies(df, columns = cat_features, prefix_sep = "_", drop_first = True) 
df.head()

Unnamed: 0,recency,history,used_discount,used_bogo,is_referral,treatment,target,zip_code_Surburban,zip_code_Urban,channel_Phone,channel_Web
0,1.207751,-0.389003,1,0,0,1,0,1,0,1,0
1,0.067359,0.339614,1,1,1,0,0,0,0,0,1
2,0.352457,-0.239836,0,1,1,1,0,1,0,0,1
3,0.922653,1.693278,1,0,1,1,0,0,0,0,1
4,-1.073034,-0.768068,1,0,0,1,0,0,1,0,1


In [392]:
#разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(df.drop('target', 1), 
                                                    df['target'], random_state=0)

In [393]:
indices_train = X_train.index
indices_test = X_test.index

In [394]:
treat_train = X_train.loc[indices_train, 'treatment']
treat_test = X_test.loc[indices_test, 'treatment']

#### 1.1 Solo-модель

In [395]:
sm = SoloModel(CatBoostClassifier(iterations=200, thread_count=2, random_state=42, silent=True))
sm = sm.fit(X_train, y_train, treat_train)

uplift_sm = sm.predict(X_test)

In [397]:
sm_score_10 = uplift_at_k(y_true=y_test, uplift=uplift_sm, treatment=treat_test, strategy='by_group', k=0.1)
sm_score_20 = uplift_at_k(y_true=y_test, uplift=uplift_sm, treatment=treat_test, strategy='by_group', k=0.2)
sm_score_30 = uplift_at_k(y_true=y_test, uplift=uplift_sm, treatment=treat_test, strategy='by_group', k=0.3)

print(f'uplift@10%: {sm_score_10:.4f}')
print(f'uplift@20%: {sm_score_20:.4f}')
print(f'uplift@30%: {sm_score_30:.4f}')

uplift@10%: 0.0663
uplift@20%: 0.0631
uplift@30%: 0.0566


In [398]:
models_results = {'metod': ['uplift_10', 'uplift_20', 'uplift_30',],
                  'solo_model': [sm_score_10, sm_score_20, sm_score_30]
                 }
models_results

{'metod': ['uplift_10', 'uplift_20', 'uplift_30'],
 'solo_model': [0.06630379424497071,
  0.06305412972468502,
  0.056630064250593026]}

#### 1.2 Трансформация классов

In [399]:
ct = ClassTransformation(CatBoostClassifier(iterations=20, thread_count=2, random_state=42, silent=True))
ct = ct.fit(X_train, y_train, treat_train)

uplift_ct = ct.predict(X_test)

  ct = ct.fit(X_train, y_train, treat_train)


In [400]:
ct_score_10 = uplift_at_k(y_true=y_test, uplift=uplift_ct, treatment=treat_test, strategy='by_group', k=0.1)
ct_score_20 = uplift_at_k(y_true=y_test, uplift=uplift_ct, treatment=treat_test, strategy='by_group', k=0.2)
ct_score_30 = uplift_at_k(y_true=y_test, uplift=uplift_ct, treatment=treat_test, strategy='by_group', k=0.3)

print(f'uplift@10%: {ct_score_10:.4f}')
print(f'uplift@20%: {ct_score_20:.4f}')
print(f'uplift@30%: {ct_score_30:.4f}')

uplift@10%: 0.2331
uplift@20%: 0.1940
uplift@30%: 0.1731


In [401]:
models_results['ct_score'] = [ct_score_10,ct_score_20,ct_score_30]

#### 1.3 Две модели

In [402]:
tm = TwoModels(
    estimator_trmnt=CatBoostClassifier(iterations=20, thread_count=2, random_state=42, silent=True), 
    estimator_ctrl=CatBoostClassifier(iterations=20, thread_count=2, random_state=42, silent=True), 
    method='vanilla'
)
tm = tm.fit(
    X_train, y_train, treat_train)

uplift_tm = tm.predict(X_test)

In [403]:
tm_score_10 = uplift_at_k(y_true=y_test, uplift=uplift_tm, treatment=treat_test, strategy='by_group', k=0.1)
tm_score_20 = uplift_at_k(y_true=y_test, uplift=uplift_tm, treatment=treat_test, strategy='by_group', k=0.2)
tm_score_30 = uplift_at_k(y_true=y_test, uplift=uplift_tm, treatment=treat_test, strategy='by_group', k=0.3)

print(f'uplift@10%: {tm_score_10:.4f}')
print(f'uplift@20%: {tm_score_20:.4f}')
print(f'uplift@30%: {tm_score_30:.4f}')

uplift@10%: 0.0549
uplift@20%: 0.0555
uplift@30%: 0.0585


In [404]:
models_results['tm_score'] = [tm_score_10, tm_score_20, tm_score_30]

In [405]:
#результаты работы всех моделей сведем в табличку
models_results_df = pd.DataFrame(models_results)
models_results_df

Unnamed: 0,metod,solo_model,ct_score,tm_score
0,uplift_10,0.066304,0.233108,0.05494
1,uplift_20,0.063054,0.193971,0.055451
2,uplift_30,0.05663,0.173143,0.058496
