In [1]:
import pandas as pd
import numpy as np
from sklift.models import ClassTransformation
from sklearn.model_selection import train_test_split
from sklift.metrics import uplift_at_k
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import MinMaxScaler
from sklift.metrics import uplift_at_k
import seaborn as sns

In [42]:
# Notebook display limits: up to 300 columns and 100 rows are rendered.
pd.options.display.max_columns = 300
pd.options.display.max_rows = 100

# Every input file sits in the same folder and is indexed by client_id.
DATA_DIR = r'C:\datasets\retailhero-uplift\data'


def _read_by_client(file_name):
    """Read one CSV from DATA_DIR with its client_id column as the index."""
    return pd.read_csv(DATA_DIR + '\\' + file_name, index_col='client_id')


clients = _read_by_client('final_clients2.csv')
clients_other = _read_by_client('final_clients.csv')
uplift_train = _read_by_client('uplift_train.csv')
uplift_test = _read_by_client('uplift_test.csv')
uplift_sample_submission = _read_by_client('uplift_sample_submission.csv')

# Index of the first feature file, kept for later reference.
client_index = clients.index

In [43]:
# Columns removed from final_clients2.csv before the merge. gender, age,
# timedelta and product_id are dropped again after the merge, and the two
# date columns are used by later cells, so all six must also be present in
# final_clients.csv (presumably they are removed here to avoid _x/_y suffixes
# on the merged frame -- confirm).
unused_in_clients2 = [
    'Unnamed: 0', 'store_count', 'gender', 'first_issue_date', 'first_redeem_date',
    'timedelta', 'transaction_count', 'purchase_store_sum', 'product_id', 'age',
]

# Columns removed from the combined feature table.
unused_after_merge = [
    'timedelta', 'express_points_received', 'express_points_spent',
    'express_points_received_mean', 'purchase_trans_mean', 'points_received',
    'purchase_trans_sum', 'product_id', 'age', 'gender', 'mean_prod_trans',
    'regular_points_spent_mean', 'percent_points', 'regular_points_received',
    'std', 'points_spent',
]

clients = clients.drop(columns=unused_in_clients2)
clients_other = clients_other.drop(columns=['Unnamed: 0'])

# Index-on-index merge (client_id); pandas' default inner join keeps only
# clients present in both files.
clients = clients.merge(clients_other, left_index=True, right_index=True)
clients = clients.drop(columns=unused_after_merge)

In [44]:
# Values of purchase_store_mean at or above this threshold are treated as
# outliers and overwritten with the column median. The median is taken over
# the column as it is before the replacement, outliers included.
PURCHASE_STORE_MEAN_CAP = 290000
store_mean = clients['purchase_store_mean']
clients.loc[store_mean >= PURCHASE_STORE_MEAN_CAP, 'purchase_store_mean'] = store_mean.median()

In [45]:
# Features compressed with log(x + 1).
# NOTE(review): log(x + 1) is NaN for x < -1. express_points_spent_mean needs
# abs() below, so confirm regular_points_spent is non-negative in this data.
log_columns = [
    'purchase_store_mean', 'mean_prod_store', 'first_issue_date', 'first_redeem_date',
    'regular_points_spent', 'purchase_sum', 'regular_points_received_mean',
    'purchase_sum_mean', 'trn_sum_from_iss_mean', 'sum_prod_store', 'transaction_id',
    'store_id', 'trn_sum_from_iss', 'is_own_trademark',
]
for column in log_columns:
    clients[column] = np.log(clients[column] + 1)

# This feature is only made non-negative, not log-transformed.
clients['express_points_spent_mean'] = clients['express_points_spent_mean'].abs()

In [46]:
# Divide each listed feature by its own column maximum.
max_scaled_columns = [
    'purchase_store_mean', 'sum_prod_store', 'mean_prod_store', 'first_issue_date',
    'first_redeem_date', 'transaction_id', 'store_id', 'regular_points_spent',
    'purchase_sum', 'trn_sum_from_iss', 'regular_points_received_mean',
    'express_points_spent_mean', 'purchase_sum_mean', 'trn_sum_from_iss_mean',
    'is_own_trademark', 'is_alcohol',
]
for column in max_scaled_columns:
    clients[column] = clients[column] / clients[column].max()

In [47]:
# Put the client features in front of the columns of the train and test client
# lists, matching rows on the client_id index.
uplift_train = pd.merge(clients, uplift_train, left_index=True, right_index=True)
uplift_test = pd.merge(clients, uplift_test, left_index=True, right_index=True)

# Independent copy of the feature table, used later to build model inputs.
df_features = clients.copy()

indices_train, indices_test = uplift_train.index, uplift_test.index

# 70/30 hold-out split of the labelled client ids with a fixed seed.
indices_learn, indices_valid = train_test_split(indices_train, test_size=0.3, random_state=123)
treat_train = uplift_train.loc[indices_learn, 'treatment_flg']

In [48]:
# Create the 'classes' label column, filled with the *string* 'NaN' as a
# placeholder for rows that have not been assigned a class yet.
# NOTE(review): a string fill presumably leaves this column object-typed even
# after integer labels are written into it -- confirm the dtype before fitting.
uplift_train['classes'] = 'NaN'

In [49]:
# Encode the (treatment_flg, target) pair as one 4-way label:
#   0: treatment_flg=0, target=0      1: treatment_flg=0, target=1
#   2: treatment_flg=1, target=0      3: treatment_flg=1, target=1
# The code is computed arithmetically and cast to int, so the whole column is
# integer-typed. Writing the labels row-by-row into a column pre-filled with a
# string placeholder would leave it object-typed, and any row matching none
# of the four cases would silently keep the placeholder.
flag_columns = uplift_train[['treatment_flg', 'target']]
assert flag_columns.isin([0, 1]).all().all(), 'treatment_flg and target must both be 0/1'
uplift_train['classes'] = (2 * uplift_train['treatment_flg'] + uplift_train['target']).astype(int)

In [50]:
# Learning part: features, 4-way class label and treatment flag.
X_train = df_features.loc[indices_learn]
y_train = uplift_train.loc[indices_learn, 'classes']
treat_train = uplift_train.loc[indices_learn, 'treatment_flg']

# Hold-out part, built the same way from the validation ids.
X_val = df_features.loc[indices_valid]
y_val = uplift_train.loc[indices_valid, 'classes']
treat_val = uplift_train.loc[indices_valid, 'treatment_flg']

# Number of distinct labels present in the learning part.
n_classes_ = np.unique(y_train).size

In [51]:
# Multi-class gradient-boosted model with a per-class probability objective.
xgb_model = xgb.XGBClassifier(objective='multi:softprob')

# This parameter list is only built here; it is not passed to the model in
# this cell.
xgb_params = [dict(num_class=n_classes_)]

xgb_model.fit(X_train, y_train)

# Class probabilities for the hold-out rows, one column per class.
predict = xgb_model.predict_proba(X_val)

In [58]:
# Divisor weight used when combining class probabilities into an uplift score
# (0.4 is the alternative value that was tried).
weight = 0.21

In [59]:
# Hard class predictions for the hold-out rows.
predict_class = xgb_model.predict(X_val)

# Uplift score from the four class probabilities (columns 0..3 of `predict`):
# columns 3 and 0 count in favour, columns 1 and 2 count against.
# NOTE(review): `weight` divides columns 3 and 1, and `1 - weight` divides
# columns 0 and 2 -- confirm this pairing of divisors is the intended one.
favourable = predict[:, 3] / weight + predict[:, 0] / (1 - weight)
unfavourable = predict[:, 1] / weight + predict[:, 2] / (1 - weight)
uplift = favourable - unfavourable

# Average score over the hold-out rows (shown as the cell output).
uplift.mean()

0.09519945

In [60]:
# Share of hold-out rows whose predicted class equals the true class.
count = sum(
    1
    for actual, predicted in zip(y_val.values, predict_class)
    if actual == predicted
)
print(count / predict_class.shape[0])

0.37020929147503834


In [61]:
# Score the test clients on exactly the feature columns the model was built
# from, in the same order, rather than on the whole merged frame. Any extra
# column carried in from uplift_test.csv therefore cannot reach the model.
X_test = uplift_test[df_features.columns]
predict = xgb_model.predict_proba(X_test)

# Same probability-to-uplift formula as used on the hold-out set.
uplift_scores = (predict[:, 3]/weight + predict[:, 0]/(1 - weight)) - (predict[:, 1]/weight + predict[:, 2]/(1 - weight))

# `uplift` stays a one-column DataFrame (column 0), as before.
uplift = pd.DataFrame(uplift_scores)

# Result table: one row per test client, in the row order of uplift_test.
# The scores are assigned positionally from the array, so the pairing with
# client ids does not depend on index alignment between two frames.
uplift_result = pd.DataFrame(uplift_test.index)
uplift_result['uplift'] = uplift_scores

In [62]:
# Write only the data columns of uplift_result. Its row index is a plain
# running counter, and writing it would add an unnamed leading column to the
# file (the kind of 'Unnamed: 0' column that has to be dropped from this
# notebook's own input files).
uplift_result.to_csv(r'C:\datasets\retailhero-uplift\data\uplift_submission.csv', index=False)

In [63]:
# Display the test-set uplift scores (one-column frame, truncated by pandas).
uplift

Unnamed: 0,0
0,-0.095288
1,-0.001066
2,0.060186
3,0.132601
4,0.082224
...,...
200118,0.125415
200119,0.166394
200120,0.010515
200121,0.067093
