In [3]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [4]:
def x5_coding(x):
    """Encode a (treatment_flg, target) pair as a single integer in 0..3.

    The two binary flags are read as a two-bit number with ``treatment_flg``
    as the high bit and ``target`` as the low bit:

        treatment_flg=0, target=0 -> 0
        treatment_flg=0, target=1 -> 1
        treatment_flg=1, target=0 -> 2
        treatment_flg=1, target=1 -> 3

    Parameters
    ----------
    x : mapping-like
        Anything indexable by the keys ``'treatment_flg'`` and ``'target'``,
        e.g. a row handed over by ``DataFrame.apply(axis=1)``.

    Returns
    -------
    int
        ``2 * treatment_flg + target``.

    Raises
    ------
    ValueError
        If either flag is missing a numeric value or is not exactly 0 or 1.
    """
    def as_bit(key):
        raw = x[key]
        # Go through float() instead of str(): a row produced by
        # DataFrame.apply(axis=1) is a single-dtype Series, so integer flags
        # arrive as 1.0 / 0.0 whenever the frame also holds a float column,
        # and concatenating their string forms is not a valid base-2 literal.
        try:
            number = float(raw)
        except (TypeError, ValueError):
            raise ValueError(f"{key} must be 0 or 1, got {raw!r}") from None
        # NaN compares unequal to everything, so it is rejected here as well.
        if number not in (0.0, 1.0):
            raise ValueError(f"{key} must be 0 or 1, got {raw!r}")
        return int(number)

    return 2 * as_bit('treatment_flg') + as_bit('target')

def x5_preprocessing(df_clients, df_train):
    """Build the client feature matrix and the treatment/target/coding vectors.

    Parameters
    ----------
    df_clients : pandas.DataFrame
        Must provide the columns ``gender``, ``age``, ``first_issue_date`` and
        ``first_redeem_date``. The two date columns are parsed with
        ``pd.to_datetime``.
    df_train : pandas.DataFrame
        Must provide the binary columns ``treatment_flg`` and ``target``.

    Returns
    -------
    df_features : pandas.DataFrame
        One row per row of ``df_clients`` with the columns ``gender_M``,
        ``gender_F``, ``gender_U``, ``age``, ``first_issue_time``,
        ``first_redeem_time`` and ``issue_redeem_delay``; every missing value
        is replaced by 0.
    w : pandas.Series
        ``df_train['treatment_flg']``.
    y : pandas.Series
        ``df_train['target']``.
    c : pandas.Series
        ``2 * treatment_flg + target`` for every row of ``df_train`` (0..3).

    Raises
    ------
    ValueError
        If ``treatment_flg`` or ``target`` holds anything other than 0 or 1.

    Side effects
    ------------
    As before, the inputs are extended in place: ``df_clients`` gains
    ``first_issue_unixtime`` / ``first_redeem_unixtime`` and ``df_train``
    gains ``coding``.
    """
    def to_epoch_seconds(column):
        # Subtracting the epoch and dividing by one second keeps a missing
        # date (NaT) as NaN, so the fillna(0) below really applies to it.
        # Casting the datetimes to int would instead turn NaT into the int64
        # minimum (about -9.22e9 "seconds") and would also hard-code
        # nanosecond resolution and a 64-bit platform int.
        parsed = pd.to_datetime(df_clients[column])
        return (parsed - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)

    df_clients['first_issue_unixtime'] = to_epoch_seconds('first_issue_date')
    df_clients['first_redeem_unixtime'] = to_epoch_seconds('first_redeem_date')

    gender = df_clients['gender']
    issue_time = df_clients['first_issue_unixtime']
    redeem_time = df_clients['first_redeem_unixtime']
    df_features = pd.DataFrame({
        'gender_M': (gender == 'M').astype(int),
        'gender_F': (gender == 'F').astype(int),
        'gender_U': (gender == 'U').astype(int),
        'age': df_clients['age'],
        'first_issue_time': issue_time,
        'first_redeem_time': redeem_time,
        # NaN when either date is missing; becomes 0 through fillna.
        'issue_redeem_delay': redeem_time - issue_time,
    }).fillna(0)

    # Validate before casting so NaN or fractional flags are reported instead
    # of being silently truncated by astype.
    for flag in ('treatment_flg', 'target'):
        if not df_train[flag].isin([0, 1]).all():
            raise ValueError(f"{flag} must contain only 0 and 1")

    # Vectorised two-bit code (treatment_flg is the high bit); this replaces a
    # Python-level call per row and also works on an empty frame.
    treatment_bits = df_train['treatment_flg'].astype('int64')
    target_bits = df_train['target'].astype('int64')
    df_train['coding'] = 2 * treatment_bits + target_bits

    w = df_train['treatment_flg']
    y = df_train['target']
    c = df_train['coding']
    return df_features, w, y, c

In [5]:
# Read the client attributes and the labelled training clients; both tables
# are indexed by client_id.
df_clients, df_train = (
    pd.read_csv(csv_path, index_col='client_id')
    for csv_path in (
        'retailhero-uplift/data/clients.csv',
        'retailhero-uplift/data/uplift_train.csv',
    )
)
# Features for every client plus treatment flag (w), target (y) and the
# combined treatment/target coding (c) for the training clients.
df_features, w, y, c = x5_preprocessing(df_clients, df_train)

In [6]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 30% of the training client ids for validation; the fixed
# random_state keeps the split the same from run to run.
indices_learn, indices_valid = train_test_split(
    df_train.index,
    test_size=0.3,
    random_state=1,
)

In [8]:
# Slice features, treatment flag, target and coding by the learning ids ...
X_train, w_train, y_train, c_train = (
    part.loc[indices_learn] for part in (df_features, w, y, c)
)
# ... and by the validation ids.
X_valid, w_valid, y_valid, c_valid = (
    part.loc[indices_valid] for part in (df_features, w, y, c)
)

In [9]:
from tree import UpliftTree, RandomForestUplift

In [16]:
# Single uplift tree, fitted on the learning split. fit() receives a copy of
# the feature frame, so X_train itself is not handed to the model.
dt = UpliftTree(
    max_depth=6,
    scoring='Chi',
    min_samples_leaf=100,
    min_samples_treatment=10,
    n_rand_features=5,
)
dt.fit(X_train.copy(), y_train, w_train)

# predict() yields two values for the validation clients, unpacked here as
# class_ and prob.
class_, prob = dt.predict(X_valid)

In [17]:
from sklift.metrics import (
    uplift_at_k, uplift_auc_score, qini_auc_score, weighted_average_uplift
)
# Validation metrics for the single tree, computed in this order:
#   1. area under the Qini curve
#   2. area under the uplift curve
#   3. weighted average uplift
# NOTE(review): class_ (not prob) is what gets passed as the uplift score —
# confirm that this is the intended ranking signal.
tm_qini_auc, tm_uplift_auc, tm_wau = (
    metric(y_true=y_valid, uplift=class_, treatment=w_valid)
    for metric in (qini_auc_score, uplift_auc_score, weighted_average_uplift)
)

print("Tree: ", tm_qini_auc, tm_uplift_auc, tm_wau)

Tree:  0.0020343181789145067 0.002890334215699216 0.038157984223109315


In [None]:
# Forest of 50 uplift trees, fitted on the learning split. fit() receives a
# copy of the feature frame, so X_train itself is not handed to the model.
rf = RandomForestUplift(
    n_estimators=50,
    max_depth=7,
    scoring='Chi',
    min_samples_leaf=100,
    min_samples_treatment=10,
    n_rand_features=5,
)
rf.fit(X_train.copy(), y_train, w_train)

# Rebinds class_ and prob, which until now held the single-tree predictions.
class_, prob = rf.predict(X_valid)

In [19]:
from sklift.metrics import (
    uplift_at_k, uplift_auc_score, qini_auc_score, weighted_average_uplift
)
# Validation metrics for the random forest, computed in this order:
#   1. area under the Qini curve
#   2. area under the uplift curve
#   3. weighted average uplift
# The tm_* names are rebound here, replacing any earlier values.
# NOTE(review): class_ (not prob) is what gets passed as the uplift score —
# confirm that this is the intended ranking signal.
tm_qini_auc, tm_uplift_auc, tm_wau = (
    metric(y_true=y_valid, uplift=class_, treatment=w_valid)
    for metric in (qini_auc_score, uplift_auc_score, weighted_average_uplift)
)

print("Random Forest: ", tm_qini_auc, tm_uplift_auc, tm_wau)

Random Forest:  -2.9874850489606508e-05 -4.39421786274905e-05 0.03798943157875096
