In [25]:
import pandas
import datetime
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns


In [4]:
# Чтение данных
df_clients = pandas.read_csv('retail_hero_data/clients.csv', index_col='client_id')
df_purchases = pandas.read_csv('retail_hero_data/purchases.csv', index_col='client_id')
df_train = pandas.read_csv('retail_hero_data/uplift_train.csv', index_col='client_id')
# df_test = pandas.read_csv('retail_hero_data/uplift_test.csv', index_col='client_id')


In [5]:
df_clients = df_clients[df_clients.index.isin(df_train.index)]
df_purchases = df_purchases[df_purchases.index.isin(df_train.index)]

In [7]:
latest_purchases = df_purchases["transaction_datetime"].groupby("client_id").agg(["max","size"])
latest_purchases.head()

Unnamed: 0_level_0,max,size
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1
000012768d,2019-03-14 15:01:47,52
000036f903,2019-03-17 10:29:37,162
00010925a5,2019-03-08 07:09:50,78
0001f552b0,2019-03-16 10:54:02,86
00020e7b18,2019-03-15 12:04:23,272


In [None]:
# Извлечение признаков

df_clients['first_issue_unixtime'] = pandas.to_datetime(df_clients['first_issue_date']).astype(int)/10**9
df_clients['first_redeem_unixtime'] = pandas.to_datetime(df_clients['first_redeem_date']).astype(int)/10**9
df_clients['latest_purchase_unixtime'] = pandas.to_datetime(latest_purchases["max"]).astype(int)/10**9
df_features = pandas.DataFrame({
    'gender_M'+"_baseline": (df_clients['gender'] == 'M').astype(int),
    'gender_F'+"_baseline": (df_clients['gender'] == 'F').astype(int),
    # 'gender_U'+"_baseline": (df_clients['gender'] == 'U').astype(int),
    'age'+"_baseline": df_clients['age'],
    'latest_purchase_time'+"_baseline": df_clients['latest_purchase_unixtime'],
    'first_issue_time'+"_baseline": df_clients['first_issue_unixtime'],
    # 'first_redeem_time'+"_baseline": df_clients['first_redeem_unixtime'],
    'issue_redeem_delay'+"_baseline": df_clients['first_redeem_unixtime'] - df_clients['first_issue_unixtime'],
    'purchase_frequency'+"_baseline": latest_purchases["size"],
})#.fillna(0)



In [9]:
df_features['first_issue_time'+"_baseline"] = df_features['first_issue_time'+"_baseline"] - df_features['first_issue_time'+"_baseline"].max() 

In [10]:
df_features['latest_purchase_time'+"_baseline"] = df_features['latest_purchase_time'+"_baseline"] - df_features['latest_purchase_time'+"_baseline"].max() 

In [11]:
df_features['first_issue_time'+"_baseline"] = -np.log1p(-df_features['first_issue_time'+"_baseline"])
df_features['latest_purchase_time'+"_baseline"] = -np.log1p(-df_features['latest_purchase_time'+"_baseline"])

In [12]:
df_features.head()

Unnamed: 0_level_0,gender_M_baseline,gender_F_baseline,age_baseline,latest_purchase_time_baseline,first_issue_time_baseline,issue_redeem_delay_baseline,purchase_frequency_baseline
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
000012768d,0,0,45,-12.835913,-17.742198,13146559.0,52
000036f903,0,1,72,-11.795032,-17.923985,1118613.0,162
00010925a5,0,0,83,-13.734495,-16.823021,4492280.0,78
0001f552b0,0,1,33,-12.290078,-17.80145,36610747.0,86
00020e7b18,0,0,73,-12.610557,-17.526723,3823700.0,272


In [13]:
for feature in df_features.columns:
    df_features[f"{feature[:-len('baseline')]}uplift"] = df_train["treatment_flg"] * df_features[feature]

df_features.head()

Unnamed: 0_level_0,gender_M_baseline,gender_F_baseline,age_baseline,latest_purchase_time_baseline,first_issue_time_baseline,issue_redeem_delay_baseline,purchase_frequency_baseline,gender_M_uplift,gender_F_uplift,age_uplift,latest_purchase_time_uplift,first_issue_time_uplift,issue_redeem_delay_uplift,purchase_frequency_uplift
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
000012768d,0,0,45,-12.835913,-17.742198,13146559.0,52,0,0,0,-0.0,-0.0,0.0,0
000036f903,0,1,72,-11.795032,-17.923985,1118613.0,162,0,1,72,-11.795032,-17.923985,1118613.0,162
00010925a5,0,0,83,-13.734495,-16.823021,4492280.0,78,0,0,83,-13.734495,-16.823021,4492280.0,78
0001f552b0,0,1,33,-12.290078,-17.80145,36610747.0,86,0,1,33,-12.290078,-17.80145,36610747.0,86
00020e7b18,0,0,73,-12.610557,-17.526723,3823700.0,272,0,0,73,-12.610557,-17.526723,3823700.0,272


In [14]:
continuous_feats = [
    "age_baseline","latest_purchase_time_baseline","first_issue_time_baseline"
    ,"issue_redeem_delay_baseline","purchase_frequency_baseline"
    ,"age_uplift","latest_purchase_time_uplift","first_issue_time_uplift"
    ,"issue_redeem_delay_uplift","purchase_frequency_uplift"
]

In [15]:
means, stds = df_features[continuous_feats].mean(), df_features[continuous_feats].std()
df_features.loc[:,continuous_feats] = (df_features - means) / stds
df_features.head()

Unnamed: 0_level_0,gender_M_baseline,gender_F_baseline,age_baseline,latest_purchase_time_baseline,first_issue_time_baseline,issue_redeem_delay_baseline,purchase_frequency_baseline,gender_M_uplift,gender_F_uplift,age_uplift,latest_purchase_time_uplift,first_issue_time_uplift,issue_redeem_delay_uplift,purchase_frequency_uplift
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
000012768d,0,0,-0.028614,-0.163604,-0.623882,0.309265,-0.602857,0,0,-0.636112,0.990095,0.997107,0.20641,-0.617349
000036f903,0,1,0.516483,0.676077,-0.832927,0.305316,0.460022,0,1,1.333235,-0.859643,-1.081276,0.206927,1.126581
00010925a5,0,0,0.738559,-0.888493,0.433117,0.306424,-0.351631,0,0,1.634107,-1.163797,-0.953613,0.208487,0.222321
0001f552b0,0,1,-0.270879,0.276722,-0.692018,0.31697,-0.274331,0,1,0.266505,-0.937278,-1.067067,0.223341,0.308441
00020e7b18,0,0,0.536672,0.018191,-0.376098,0.306204,1.522902,0,0,1.360587,-0.987537,-1.035211,0.208178,2.310731


In [16]:
df_features["intercept_baseline"] = 1
df_features["intercept_uplift"] = df_train["treatment_flg"]

### Complete Case Analysis

In [17]:
df_features = df_features.dropna()
df_train = df_train[df_train.index.isin(df_features.index)]

### Parameter Estimation

In [18]:
model = sm.Logit(df_train["target"],df_features)#.astype(float))

In [19]:
results = model.fit()
results.summary()

Optimization terminated successfully.
         Current function value: 0.574386
         Iterations 6


0,1,2,3
Dep. Variable:,target,No. Observations:,200039.0
Model:,Logit,Df Residuals:,200023.0
Method:,MLE,Df Model:,15.0
Date:,"Wed, 25 May 2022",Pseudo R-squ.:,0.1351
Time:,00:18:31,Log-Likelihood:,-114900.0
converged:,True,LL-Null:,-132850.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
gender_M_baseline,0.0431,0.020,2.130,0.033,0.003,0.083
gender_F_baseline,0.0474,0.016,3.021,0.003,0.017,0.078
age_baseline,0.0562,0.013,4.242,0.000,0.030,0.082
latest_purchase_time_baseline,0.5282,0.008,63.349,0.000,0.512,0.545
first_issue_time_baseline,0.0569,0.007,7.882,0.000,0.043,0.071
issue_redeem_delay_baseline,0.1483,0.007,20.941,0.000,0.134,0.162
purchase_frequency_baseline,0.7635,0.011,66.631,0.000,0.741,0.786
gender_M_uplift,0.0203,0.029,0.706,0.480,-0.036,0.077
gender_F_uplift,0.0695,0.022,3.109,0.002,0.026,0.113
