# ДЗ 4. Uplift-моделирование

In [None]:
!pip install scikit-uplift

In [None]:
!pip install causalml

In [None]:
!pip install scikit-learn

In [None]:
!pip install catboost

In [None]:
!pip install pydotplus

In [None]:
!pip install graphviz

In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pydotplus
from graphviz import Source
from sklearn import tree

from sklift.metrics import uplift_at_k
from sklift.viz import plot_uplift_preds
from sklift.models import SoloModel, ClassTransformation, TwoModels

from IPython.display import Image
from causalml.inference.tree import UpliftTreeClassifier, UpliftRandomForestClassifier
from causalml.inference.tree import uplift_tree_string, uplift_tree_plot

from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

import warnings
warnings.simplefilter('ignore')

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Набор данных маркетинговых кампаний

Скачать набор данных маркетинговых кампаний отсюда https://www.kaggle.com/davinwijaya/customer-retention

In [56]:
df = pd.read_csv('/content/drive/My Drive/Colab-Notebooks/data.csv', ',')

In [57]:
df.head(5)

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,offer,conversion
0,10,142.44,1,0,Surburban,0,Phone,Buy One Get One,0
1,6,329.08,1,1,Rural,1,Web,No Offer,0
2,7,180.65,0,1,Surburban,1,Web,Buy One Get One,0
3,9,675.83,1,0,Rural,1,Web,Discount,0
4,2,45.34,1,0,Urban,0,Web,Buy One Get One,0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64000 entries, 0 to 63999
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   recency        64000 non-null  int64  
 1   history        64000 non-null  float64
 2   used_discount  64000 non-null  int64  
 3   used_bogo      64000 non-null  int64  
 4   zip_code       64000 non-null  object 
 5   is_referral    64000 non-null  int64  
 6   channel        64000 non-null  object 
 7   offer          64000 non-null  object 
 8   conversion     64000 non-null  int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 4.4+ MB


## Предобработка данных

Поле conversion - это целевая переменная, а offer - коммуникация. Переименовать поля (conversion -> target, offer -> treatment) и привести поле treatment к бинарному виду (1 или 0, т.е было какое-то предложение или нет) - значение No Offer означает отсутствие коммуникации, а все остальные - наличие.

In [58]:
df.rename(columns={'conversion': 'target', 'offer': 'treatment'}, inplace=True)
df.head(3)

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,treatment,target
0,10,142.44,1,0,Surburban,0,Phone,Buy One Get One,0
1,6,329.08,1,1,Rural,1,Web,No Offer,0
2,7,180.65,0,1,Surburban,1,Web,Buy One Get One,0


In [59]:
df['treatment'] = df['treatment'].apply(lambda x: 0 if x=='No Offer' else 1)
df.head(3)

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,treatment,target
0,10,142.44,1,0,Surburban,0,Phone,1,0
1,6,329.08,1,1,Rural,1,Web,0,0
2,7,180.65,0,1,Surburban,1,Web,1,0


## Feature engineering

Cделать feature engineering на ваше усмотрение (допускается свобода выбора методов)

In [60]:
df.describe()

Unnamed: 0,recency,history,used_discount,used_bogo,is_referral,treatment,target
count,64000.0,64000.0,64000.0,64000.0,64000.0,64000.0,64000.0
mean,5.763734,242.085656,0.551031,0.549719,0.50225,0.667094,0.146781
std,3.507592,256.158608,0.497393,0.497526,0.499999,0.471257,0.35389
min,1.0,29.99,0.0,0.0,0.0,0.0,0.0
25%,2.0,64.66,0.0,0.0,0.0,0.0,0.0
50%,6.0,158.11,1.0,1.0,1.0,1.0,0.0
75%,9.0,325.6575,1.0,1.0,1.0,1.0,0.0
max,12.0,3345.93,1.0,1.0,1.0,1.0,1.0


In [61]:
# Cмотрим значения
for value in ['recency','history','used_discount','used_bogo','zip_code','is_referral','channel','treatment','target']:
    print(df[value].value_counts(), '\n--------')

1     8952
10    7565
2     7537
9     6441
3     5904
4     5077
6     4605
5     4510
7     4078
11    3504
8     3495
12    2332
Name: recency, dtype: int64 
--------
29.99     7947
81.20        9
53.79        9
142.94       8
35.40        8
          ... 
701.66       1
246.45       1
798.83       1
125.19       1
104.00       1
Name: history, Length: 34833, dtype: int64 
--------
1    35266
0    28734
Name: used_discount, dtype: int64 
--------
1    35182
0    28818
Name: used_bogo, dtype: int64 
--------
Surburban    28776
Urban        25661
Rural         9563
Name: zip_code, dtype: int64 
--------
1    32144
0    31856
Name: is_referral, dtype: int64 
--------
Web             28217
Phone           28021
Multichannel     7762
Name: channel, dtype: int64 
--------
1    42694
0    21306
Name: treatment, dtype: int64 
--------
0    54606
1     9394
Name: target, dtype: int64 
--------


In [62]:
# Переводим все категориальные признаки в dummies
df = pd.concat([df.drop(['zip_code', 'channel'], axis=1), 
                          pd.get_dummies(df['zip_code'], prefix='zip_code'),
                          pd.get_dummies(df['channel'], prefix='channel')
                         ], 1)

df.head(3)

Unnamed: 0,recency,history,used_discount,used_bogo,is_referral,treatment,target,zip_code_Rural,zip_code_Surburban,zip_code_Urban,channel_Multichannel,channel_Phone,channel_Web
0,10,142.44,1,0,0,1,0,0,1,0,0,1,0
1,6,329.08,1,1,1,0,0,1,0,0,0,0,1
2,7,180.65,0,1,1,1,0,0,1,0,0,0,1


## Разбиение набора данных

Сделать разбиение набора данных на тренировочную и тестовую выборки

In [64]:
X_train, X_test = train_test_split(df.drop(['target'], axis=1), test_size=0.3, shuffle=True, random_state=42)
y_train, y_test = train_test_split(df['target'], test_size=0.3, shuffle=True, random_state=42)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((44800, 12), (44800,), (19200, 12), (19200,))

## Uplift-моделирование

Провести uplift-моделирование 3 способами: одна модель с признаком коммуникации (S learner), модель с трансформацией таргета (трансформация классов п. 2. 1) и вариант с двумя независимыми моделями

In [65]:
df.columns

Index(['recency', 'history', 'used_discount', 'used_bogo', 'is_referral',
       'treatment', 'target', 'zip_code_Rural', 'zip_code_Surburban',
       'zip_code_Urban', 'channel_Multichannel', 'channel_Phone',
       'channel_Web'],
      dtype='object')

In [55]:
models_results = pd.DataFrame(columns=['uplift@10%', 'uplift@20%'])
treat_train = X_train['treatment']
treat_test = X_test['treatment']

In [67]:
%%time 

sm = SoloModel(CatBoostClassifier(iterations=50, 
                                  thread_count=8, 
                                  random_state=42, 
                                  silent=True))

sm = sm.fit(X_train, y_train, treat_train)
uplift_sm = sm.predict(X_test)

sm_score_01 = uplift_at_k(y_true=y_test, uplift=uplift_sm, treatment=treat_test, strategy='by_group', k=0.1)
sm_score_02 = uplift_at_k(y_true=y_test, uplift=uplift_sm, treatment=treat_test, strategy='by_group', k=0.2)

models_results.loc[f'SoloModel', 'uplift@10%'] = sm_score_01
models_results.loc[f'SoloModel', 'uplift@20%'] = sm_score_02

CPU times: user 1.35 s, sys: 264 ms, total: 1.61 s
Wall time: 928 ms


In [68]:
%%time 

ct = ClassTransformation(CatBoostClassifier(iterations=50, 
                                            thread_count=8, 
                                            random_state=42, 
                                            silent=True))

ct = ct.fit(X_train, y_train, treat_train)
uplift_ct = ct.predict(X_test)

ct_score_01 = uplift_at_k(y_true=y_test, uplift=uplift_ct, treatment=treat_test, strategy='by_group', k=0.1)
ct_score_02 = uplift_at_k(y_true=y_test, uplift=uplift_ct, treatment=treat_test, strategy='by_group', k=0.2)

models_results.loc[f'ClassTransformation', 'uplift@10%'] = ct_score_01
models_results.loc[f'ClassTransformation', 'uplift@20%'] = ct_score_02

CPU times: user 1.34 s, sys: 260 ms, total: 1.6 s
Wall time: 912 ms


In [69]:
# ?TwoModels

In [70]:
%%time 

tm = TwoModels(
    estimator_trmnt=CatBoostClassifier(iterations=50, 
                                       thread_count=8, 
                                       random_state=42, 
                                       silent=True), 
    estimator_ctrl=CatBoostClassifier(iterations=50, 
                                      thread_count=8, 
                                      random_state=42, 
                                      silent=True), 
    method='vanilla'
)

tm = tm.fit(X_train, y_train, treat_train)
uplift_tm = tm.predict(X_test)

tm_score_01 = uplift_at_k(y_true=y_test, uplift=uplift_tm, treatment=treat_test, strategy='by_group', k=0.1)
tm_score_02 = uplift_at_k(y_true=y_test, uplift=uplift_tm, treatment=treat_test, strategy='by_group', k=0.2)

models_results.loc[f'TwoModels', 'uplift@10%'] = tm_score_01
models_results.loc[f'TwoModels', 'uplift@20%'] = tm_score_02

CPU times: user 1.8 s, sys: 400 ms, total: 2.2 s
Wall time: 1.3 s


## Таблица сравнения метрик

Вывести единую таблицу сравнения метрик uplift@10%, uplift@20% этих 3 моделей

In [71]:
models_results

Unnamed: 0,uplift@10%,uplift@20%
SoloModel,0.0704374,0.0766759
ClassTransformation,0.247566,0.20387
TwoModels,0.0892742,0.0822209


## Модель UpliftTreeClassifier

Построить модель UpliftTreeClassifier и попытаться описать словами полученное дерево

In [72]:
X_train_tree = X_train.copy()
features = [col for col in X_train_tree]
X_train_tree.head()                      

Unnamed: 0,recency,history,used_discount,used_bogo,is_referral,treatment,zip_code_Rural,zip_code_Surburban,zip_code_Urban,channel_Multichannel,channel_Phone,channel_Web
9656,7,434.35,1,0,1,0,0,0,1,0,0,1
63037,1,376.59,1,0,0,1,0,1,0,1,0,0
31405,3,140.34,0,1,1,1,0,0,1,0,1,0
58088,3,150.76,0,1,0,0,0,1,0,0,0,1
44344,2,67.97,1,0,1,1,0,1,0,0,1,0


In [100]:
%%time

uplift_model = UpliftTreeClassifier(max_depth=7, min_samples_leaf=200, min_samples_treatment=50,
                                    n_reg=100, evaluationFunction='KL', control_name='control')

uplift_model.fit(X_train_tree.values,
                 treatment=treat_train.map({1: 'treatment1', 0: 'control'}).values,
                 y=y_train)

graph = uplift_tree_plot(uplift_model.fitted_uplift_tree, features)
Image(graph.create_png())

CPU times: user 5.69 s, sys: 46.7 ms, total: 5.74 s
Wall time: 6.9 s


Не отображается картинка на Colab, попробовал разные библиотеки типа graphviz, pydotplus, тоже не пошло.