# UPLIFT

Обычно продвижение продуктов происходит за счет коммуникации с клиентом через различные каналы: смс, push, сообщения чат-бота в социальных сетях и многие другие.

Uplift-модель оценивает чистый эффект от коммуникации: она пытается выбрать тех клиентов, которые совершат целевое действие лишь при наличии взаимодействия. Для этого модель оценивает разницу в поведении клиента при наличии воздействия и при его отсутствии.

Для решения данной задачи воспользуемся методом трансформации классов, при котором будем прогнозировать измененный таргет:

$$
Z_{i} = \left\{
    \begin{array}{ll}
        1, & \text{if } W_i = 1 \text{ and } Y_i = 1 \\
        1, & \text{if } W_i = 0 \text{ and } Y_i = 0 \\
        0, & \text{otherwise}
    \end{array}
\right.
$$

где:
* $W_i$ - флаг взаимодействия i-го клиента
* $Y_i$ - флаг покупки i-го клиента
* $Z_i$ - новая целевая переменная

### Общие данные

* clients2.csv - информация о клиентах: id, даты открытия и первого использования карты лояльности(?), возраст, пол

* products.csv - информация о товарах: id, бренд, продавец, является ли товар продукцией собственного производства и т.д.

* train.csv - набор данных о клиентах для обучения. treatment_flg - была ли совершена коммуникация, purchased - была ли совершена покупка

* train_purch.csv - история покупок train клиентов

In [1]:
import pandas as pd
import numpy as np

from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.feature_selection import GenericUnivariateSelect
from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc

## Data preparation

In [2]:
class ProcessData:
    """Load the uplift CSV files and build per-client feature tables.

    The constructor reads the client, product, train and train-purchase
    tables, derives epoch-second versions of the card dates and replaces
    the ``treatment_flg`` / ``purchased`` columns of the train table with
    the class-transformation target
    ``target = int(treatment_flg == purchased)``.

    Attributes set by ``__init__``: ``clients_df``, ``products_df``,
    ``train_df``, ``train_purch_df``.
    Attributes set later: ``test_df`` / ``test_purch_df`` (by
    ``load_test_data``) and ``cols`` (by ``make_train_features``).
    """

    # Month whose per-client totals become the ``last_month_*`` features.
    LAST_MONTH = '2019-03'

    def __init__(self, data_path='data/x5-uplift-valid/'):
        """Read the training data from ``data_path`` and prepare it.

        Args:
            data_path: directory containing ``data/clients2.csv``,
                ``data/products.csv``, ``data/train.csv`` and
                ``train_purch/train_purch.csv``.
        """
        clients_df = pd.read_csv(f'{data_path}/data/clients2.csv')
        products_df = pd.read_csv(f'{data_path}/data/products.csv')
        train_df = pd.read_csv(f'{data_path}/data/train.csv')
        train_purch_df = pd.read_csv(f'{data_path}/train_purch/train_purch.csv')

        # Drop the duplicated id column if the file has one; do not fail
        # when it is absent.
        clients_df = clients_df.drop(columns=['client_id.1'], errors='ignore')

        clients_df['first_issue_time'] = self._to_epoch_seconds(
            clients_df.first_issue_date)
        clients_df['first_redeem_time'] = self._to_epoch_seconds(
            clients_df.first_redeem_date)

        train_purch_df = self._add_time_columns(train_purch_df)

        # Class transformation: 1 when treatment flag and purchase flag agree.
        train_df['target'] = (
            train_df['treatment_flg'] == train_df['purchased']
        ).astype(int)
        train_df = train_df.drop(columns=['treatment_flg', 'purchased'])

        self.clients_df = clients_df
        self.products_df = products_df
        self.train_df = train_df
        self.train_purch_df = train_purch_df

    def load_test_data(self, data_path='data/x5-uplift-valid/'):
        """Read ``data/test.csv`` and ``test_purch/test_purch.csv``.

        Stores the results in ``self.test_df`` and ``self.test_purch_df``.
        """
        test_df = pd.read_csv(f'{data_path}/data/test.csv')
        test_purch_df = pd.read_csv(f'{data_path}/test_purch/test_purch.csv')

        self.test_df = test_df
        self.test_purch_df = self._add_time_columns(test_purch_df)

    @staticmethod
    def _to_epoch_seconds(dates):
        """Convert a column of date strings to whole seconds since 1970-01-01."""
        elapsed = pd.to_datetime(dates) - pd.Timestamp('1970-01-01')
        return elapsed // pd.Timedelta('1s')

    @staticmethod
    def _add_time_columns(purch_df):
        """Parse ``transaction_datetime`` and add a monthly ``transaction_month``.

        The frame is modified in place and also returned.
        """
        purch_df['transaction_datetime'] = pd.to_datetime(
            purch_df.transaction_datetime)
        purch_df['transaction_month'] = (
            purch_df.transaction_datetime.dt.to_period('M'))
        return purch_df

    # The methods below derive features from the purchase (transaction) history.

    def _get_sum_features(self, cols, purch_df):
        """Return per-client sums of ``cols`` as ``total_<col>`` columns."""
        total_df = purch_df.groupby('client_id', as_index=False)[cols].sum()
        total_df.columns = ['client_id'] + [f'total_{f}' for f in cols]

        return total_df

    def _get_mean_features(self, cols, purch_df):
        """Return per-client means of ``cols`` as ``mean_<col>`` columns."""
        mean_df = purch_df.groupby('client_id', as_index=False)[cols].mean()
        mean_df.columns = ['client_id'] + [f'mean_{f}' for f in cols]

        return mean_df

    def _get_last_month_features(self, cols, purch_df):
        """Return per-client sums of ``cols`` for ``LAST_MONTH``.

        Only clients whose most recent month of activity equals
        ``LAST_MONTH`` get a row; columns are named ``last_month_<col>``.
        """
        monthly = purch_df.groupby(
            ['client_id', 'transaction_month'], as_index=False)[cols].sum()
        monthly.columns = (['client_id', 'transaction_month']
                           + [f'last_month_{f}' for f in cols])

        # groupby sorts its keys, so the last row per client is that
        # client's most recent month.
        latest = monthly.drop_duplicates(subset=['client_id'], keep='last')
        target_month = pd.Period(self.LAST_MONTH, freq='M')
        latest = latest[latest.transaction_month == target_month]

        return latest.drop(columns=['transaction_month'])

    def _get_past_redeem_features(self, cols, purch_df):
        """Return per-client sums of ``cols`` over post-redeem purchases.

        A purchase counts when its ``transaction_datetime`` is strictly
        later than the client's ``first_redeem_date``; clients without a
        redeem date contribute nothing. Columns are named
        ``past_redeem_total_<col>``.
        """
        merged = purch_df.merge(
            self.clients_df[['client_id', 'first_redeem_date']],
            how='left', on='client_id')

        # ``first_redeem_date`` is kept as raw text in ``clients_df``; parse
        # it so the comparison is datetime-vs-datetime rather than
        # datetime-vs-string (which current pandas rejects).
        redeem_date = pd.to_datetime(merged['first_redeem_date'])
        after_redeem = (
            (merged['transaction_datetime'] > redeem_date)
            & redeem_date.notna()
        )

        redeem_df = (merged[after_redeem]
                     .groupby('client_id', as_index=False)[cols].sum())
        redeem_df.columns = (['client_id']
                             + [f'past_redeem_total_{f}' for f in cols])

        return redeem_df

    def _build_features(self, base_df, purch_df):
        """Left-join every feature block and ``first_redeem_time`` onto ``base_df``.

        Uses ``self.cols`` as the list of purchase columns to aggregate;
        missing values in the result are replaced with 0.
        """
        cols = self.cols
        feature_blocks = (
            self._get_sum_features(cols, purch_df),
            self._get_mean_features(cols, purch_df),
            self._get_last_month_features(cols, purch_df),
            self._get_past_redeem_features(cols, purch_df),
        )

        result = base_df
        for block in feature_blocks:
            result = result.merge(block, how='left', on='client_id')

        result = result.merge(
            self.clients_df[['client_id', 'first_redeem_time']],
            how='left', on='client_id')

        return result.fillna(0)

    def make_train_features(self, cols=None):
        """Build the training feature table.

        Args:
            cols: purchase columns to aggregate. When empty or ``None``,
                all float columns of the train purchase table except the
                last three are used.
                NOTE(review): the ``[:-3]`` cut depends on the column order
                of train_purch.csv — confirm against the file.

        Returns:
            ``train_df`` joined with all feature blocks, NaN filled with 0.
        """
        if not cols:
            float_cols = self.train_purch_df.select_dtypes(float).columns
            cols = float_cols.to_list()[:-3]

        # Remembered so the test table is built from the same columns.
        self.cols = cols

        return self._build_features(self.train_df, self.train_purch_df)

    def make_test_features(self, features=None):
        """Build the test feature table.

        Requires ``make_train_features`` (sets ``self.cols``) and
        ``load_test_data`` to have been called first.

        Args:
            features: optional list of feature names; when given, only
                ``client_id`` plus these columns are returned.

        Raises:
            AttributeError: if the train features or the test data have
                not been prepared yet.
        """
        if not hasattr(self, 'cols'):
            raise AttributeError(
                'make_train_features() must be called before '
                'make_test_features()')
        if not hasattr(self, 'test_df') or not hasattr(self, 'test_purch_df'):
            raise AttributeError(
                'load_test_data() must be called before make_test_features()')

        all_test_df = self._build_features(self.test_df, self.test_purch_df)

        if features is None or len(features) == 0:
            return all_test_df

        return all_test_df[['client_id'] + list(features)]
        


In [16]:
# Load the CSV files from the default data path and build the per-client
# training feature table.
uplift_data = ProcessData()
all_train_df = uplift_data.make_train_features()
# Last expression of the cell: shows the first rows of the feature table.
all_train_df.head()

Unnamed: 0,client_id,target,total_regular_points_received,total_express_points_received,total_regular_points_spent,total_express_points_spent,total_purchase_sum,mean_regular_points_received,mean_express_points_received,mean_regular_points_spent,...,last_month_express_points_received,last_month_regular_points_spent,last_month_express_points_spent,last_month_purchase_sum,past_redeem_total_regular_points_received,past_redeem_total_express_points_received,past_redeem_total_regular_points_spent,past_redeem_total_express_points_spent,past_redeem_total_purchase_sum,first_redeem_time
0,ad6561e2d8,1,729.0,0.0,-576.0,0.0,78849.4,3.538835,0.0,-2.796117,...,0.0,0.0,0.0,17331.68,729.0,0.0,-576.0,0.0,78849.4,1527102000.0
1,7c1ccbf93f,1,96.2,0.0,0.0,0.0,7833.0,6.871429,0.0,0.0,...,0.0,0.0,0.0,7260.0,96.2,0.0,0.0,0.0,7833.0,1519326000.0
2,b58fadcab6,1,757.5,0.0,-2781.0,0.0,114288.48,3.054435,0.0,-11.21371,...,0.0,0.0,0.0,18070.06,757.5,0.0,-2781.0,0.0,114288.48,1537045000.0
3,e99e6fabb9,1,51.6,0.0,0.0,0.0,10895.0,1.097872,0.0,0.0,...,0.0,0.0,0.0,2410.0,51.6,0.0,0.0,0.0,10895.0,1527713000.0
4,27fb6f8520,1,416.0,60.0,-20.0,-250.0,48520.36,4.16,0.6,-0.2,...,0.0,0.0,0.0,14344.44,416.0,60.0,-20.0,-250.0,48520.36,1513332000.0


## Feature selection

Далее проводим отбор признаков. 
Для начала отсеем признаки с помощью F-теста. Оценим степень линейной зависимости между признаками и целевой переменной и оставим только те признаки, для которых значение F-статистики больше 2

In [4]:
def select_features(all_train_df, pass_score=2):
    """Keep only features whose ANOVA F-score against the target is high enough.

    The F-statistic of every feature column (everything except
    ``client_id`` and ``target``) is computed with ``f_classif``. Features
    with a score strictly greater than ``pass_score`` are kept, ordered by
    descending score.

    Args:
        all_train_df: feature table containing ``client_id``, ``target``
            and the candidate feature columns.
        pass_score: threshold the F-score has to exceed.

    Returns:
        ``all_train_df`` restricted to ``client_id``, ``target`` and the
        selected features.
    """
    id_cols = ['client_id', 'target']
    X = all_train_df.drop(columns=id_cols)
    y = all_train_df['target']

    # Call the score function directly: only the scores are needed, so a
    # k-best selector object (whose k would never be used) is unnecessary.
    f_scores, _ = f_classif(X, y)

    ranked = pd.Series(f_scores, index=X.columns).sort_values(ascending=False)
    # NaN scores (e.g. from constant columns) fail the comparison and are dropped.
    selected = ranked[ranked > pass_score].index.to_list()

    return all_train_df[id_cols + selected]

In [5]:
# Apply the F-test filter (default threshold) and show the surviving columns.
all_train_df = select_features(all_train_df)
all_train_df.columns

Index(['client_id', 'target', 'mean_express_points_spent',
       'total_express_points_spent', 'last_month_purchase_sum',
       'mean_purchase_sum', 'past_redeem_total_express_points_spent',
       'last_month_express_points_spent', 'first_redeem_time',
       'total_purchase_sum', 'last_month_regular_points_received',
       'past_redeem_total_purchase_sum'],
      dtype='object')

Теперь построим селектор на основе значений взаимной информации (MI)

In [6]:
class MutualInfoSelector:
    """Pipeline step that keeps the features with the highest mutual information.

    ``fit`` ranks the columns of a DataFrame by ``mutual_info_classif``
    against the target and remembers the top ``param`` column names in
    ``self.cols``; ``transform`` returns those columns.

    Args:
        random_state: seed passed to ``mutual_info_classif``.
        n_neighbors: ``n_neighbors`` passed to ``mutual_info_classif``.
        param: number of top-ranked features to keep.
    """

    # Marks "argument not supplied" so that legitimate values such as
    # ``None`` can still be set explicitly through ``set_params``.
    _UNSET = object()

    def __init__(self, random_state=42, n_neighbors=3, param=6):

        self.random_state = random_state
        self.n_neighbors = n_neighbors
        self.param = param

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict.

        ``deep`` is accepted for compatibility with the scikit-learn
        estimator convention and has no effect here.
        """
        return {
            'random_state': self.random_state,
            'n_neighbors': self.n_neighbors,
            'param': self.param,
        }

    def set_params(self, random_state=_UNSET, n_neighbors=_UNSET, param=_UNSET):
        """Update only the parameters that are actually passed.

        Parameters that are not supplied keep their current value, so
        tuning a single parameter (e.g. ``n_neighbors`` in a grid search)
        does not reset the others to their defaults.

        Returns:
            ``self``, to allow chaining.
        """
        supplied = {
            'random_state': random_state,
            'n_neighbors': n_neighbors,
            'param': param,
        }
        for name, value in supplied.items():
            if value is not self._UNSET:
                setattr(self, name, value)

        return self

    def fit(self, X, y):
        """Rank the columns of ``X`` by mutual information with ``y``.

        Stores the names of the ``param`` best columns in ``self.cols``.
        ``X`` must expose ``.columns`` (a pandas DataFrame).

        Returns:
            ``self``.
        """
        scores = mutual_info_classif(
            X, y,
            random_state=self.random_state,
            n_neighbors=self.n_neighbors,
        )

        ranking = pd.DataFrame({
            'feature': X.columns,
            'score': scores
        })
        ranking = ranking.sort_values(by='score', ascending=False) \
                         .reset_index(drop=True)

        self.cols = ranking.feature.to_list()[:self.param]
        return self

    def transform(self, X):
        """Return the columns of ``X`` that were selected by ``fit``."""
        return X[self.cols]



In [20]:
# Model for the transformed target: shallow gradient-boosted trees.
xgb = XGBClassifier(
    random_state=42,
    learning_rate=0.1,
    max_depth=2
)

# Mutual-information feature selector keeping the 6 best features.
selector = MutualInfoSelector(
    random_state=42,
    param=6
)

# 10-fold stratified cross-validation with a fixed shuffle seed.
skf = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42
)

# Selection runs inside the pipeline, so it is re-fitted on every CV fold.
pl = Pipeline([
    ('selector', selector),
    ('model', xgb)
])

# Only the selector's n_neighbors is searched; the other candidate grids
# are left commented out.
params = {
    'selector__n_neighbors': (3, 7, 9, 15),
#     'selector__param': (5, 6, 7),
#     'model__learning_rate': np.arange(0.1, 0.35, 0.05),
#     'model__max_depth': np.arange(2, 5)
}

# Exhaustive search over `params`, scored by ROC AUC, using all CPU cores.
grid = GridSearchCV(
    estimator=pl,
    param_grid=params,
    cv=skf, n_jobs=-1,
    scoring='roc_auc'
)

# NOTE(review): all_train_df was already filtered by select_features() on the
# full training set, so the fold scores may be optimistic — confirm.
X = all_train_df.drop(columns=['client_id', 'target'])
y = all_train_df['target']

grid.fit(X, y)

In [21]:
# Report the best mean cross-validated ROC AUC and the parameters that achieved it.
print(f'best score: {grid.best_score_}')
print(f'params: {grid.best_params_}')

best score: 0.5226444011150939
params: {'selector__n_neighbors': 15}


In [22]:
# Re-fit the mutual-information selector on the whole training set with the
# n_neighbors value reported by the grid search, then keep its top features.
selector = MutualInfoSelector(random_state=42, n_neighbors=15, param=6)
X = selector.fit(X, y).transform(X)
X.columns

Index(['first_redeem_time', 'last_month_regular_points_received',
       'past_redeem_total_purchase_sum', 'total_express_points_spent',
       'total_purchase_sum', 'last_month_regular_points_spent'],
      dtype='object')

In [23]:
# Train the final model on the selected features of the full training set,
# with the same hyperparameters as in the grid-search pipeline.
xgb = XGBClassifier(
    random_state=42,
    learning_rate=0.1,
    max_depth=2
)

xgb.fit(X, y)

In [24]:
# Read test.csv and test_purch.csv from the default data path into uplift_data.
uplift_data.load_test_data()

In [25]:
# Build the test table with exactly the features (and column order) the model was trained on.
features = X.columns.to_list()
all_test_df = uplift_data.make_test_features(features)
all_test_df.head(5)

Unnamed: 0,client_id,first_redeem_time,last_month_regular_points_received,past_redeem_total_purchase_sum,total_express_points_spent,total_purchase_sum,last_month_regular_points_spent
0,a9a604ed6e,0.0,102.0,0.0,0.0,33702.7,0.0
1,ebd7360016,1504283000.0,11.1,24998.71,0.0,24998.71,0.0
2,908cd9b8e8,1531502000.0,4.2,37147.95,0.0,37147.95,0.0
3,dceb8ce861,1534013000.0,241.3,195473.42,0.0,195473.42,0.0
4,f4f0ac6b06,1550262000.0,527.8,51698.0,0.0,147386.0,-516.0


In [26]:
# Score the test clients, keeping only the probability of class 1
# (second column of predict_proba) for each row.
X_test = all_test_df.drop(columns=['client_id'])
y_pred = [class_probs[1] for class_probs in xgb.predict_proba(X_test)]

In [27]:
# Pair every test client id with its predicted score.
df_res = pd.DataFrame({
    'client_id': all_test_df['client_id'],
    'pred': y_pred,
})
df_res.head()

Unnamed: 0,client_id,pred
0,a9a604ed6e,0.509895
1,ebd7360016,0.510204
2,908cd9b8e8,0.514425
3,dceb8ce861,0.509925
4,f4f0ac6b06,0.5033


In [28]:
# Write the predictions to predictions.csv in the working directory, without the index column.
df_res.to_csv('predictions.csv', index=False)