In [1]:
import pandas as pd
import numpy as np
import math

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from econml.dml import ForestDMLCateEstimator
from sklearn.ensemble import RandomForestClassifier
from pylift.eval import UpliftEval
from econml.metalearners import TLearner, SLearner, XLearner



In [3]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Print the version of the Python interpreter running this kernel.
from platform import python_version
print(python_version())

3.7.3


# Hillstrom e-mail marketing dataset (MineThatData E-Mail Analytics challenge)

In [None]:
# Read the Hillstrom e-mail analytics CSV directly from its public URL.
data = pd.read_csv(
    "http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv"
)
# Convert the string-valued columns to pandas categoricals in a single pass.
data = data.astype(
    {name: "category" for name in ("segment", "history_segment", "zip_code", "channel")}
)
data_size = len(data)

In [None]:
def parse_data_hillstrom(data):
    """Split a Hillstrom frame into (context, treatment, outcome) arrays.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``recency``, ``history``, ``mens``, ``womens``,
        ``newbie``, ``history_segment``, ``zip_code``, ``channel``, ``segment``
        and ``visit``.

    Returns
    -------
    tuple
        ``(context, treatment, outcome)`` where ``context`` is a 2-D float array
        made of the five numeric columns followed by the one-hot encodings of
        ``history_segment`` (prefix ``hs``), ``zip_code`` (prefix ``zc``) and
        ``channel`` (prefix ``c``); ``treatment`` is ``data["segment"].values``
        and ``outcome`` is ``data["visit"].values``.
    """
    numeric_cols = ["recency", "history", "mens", "womens", "newbie"]
    one_hot_specs = [("history_segment", "hs"), ("zip_code", "zc"), ("channel", "c")]

    # dtype=float keeps the indicator columns numeric on every pandas version:
    # pandas 2.x returns bool dummies by default, which would otherwise turn the
    # concatenated matrix into an object array.
    one_hot_blocks = [
        pd.get_dummies(data[column], prefix=prefix, dtype=float)
        for column, prefix in one_hot_specs
    ]

    context = pd.concat([data[numeric_cols]] + one_hot_blocks, axis=1)

    return (context.to_numpy(dtype=float), data["segment"].values, data["visit"].values)

In [None]:
class Train:
    """Uplift experiment on the Hillstrom data: women's e-mail campaign vs. no e-mail.

    The constructor shuffles the frame, builds stratified train / validation /
    test partitions, and assembles numeric design frames (``df_tr``, ``df_va``,
    ``df_te``) whose columns are the features ``x0..xN`` followed by ``t``
    (binary treatment) and ``y`` (outcome).  ``CF`` and ``meta_learners`` fit an
    estimator on ``df_tr`` and score its predicted effects on ``df_va``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``segment`` and ``visit`` plus whatever ``data_parser`` reads.
    data_parser : callable
        ``frame -> (context, treatment, outcome)``; ``treatment`` must hold the
        segment labels (a pandas Categorical or a plain array of strings).
    train_ratio, validation_ratio, test_ratio : float
        ``train_ratio`` is the share kept for training; the remainder is divided
        between validation and test in the proportion
        ``validation_ratio : test_ratio``.
    random_state : int
        Seed used for the shuffle, the splits, the tuning grid and the forest
        estimator, so that a fresh run reproduces the same partitions.
    """

    # Learner names accepted by ``meta_learners``.
    _LEARNERS = ('T_learner', 'S_learner', 'X_learner')
    # Segment labels of the treated and control groups used for modelling.
    _TREATED_SEGMENT = "Womens E-Mail"
    _CONTROL_SEGMENT = 'No E-Mail'

    def __init__(self, data, data_parser, train_ratio=.5, validation_ratio=.25, test_ratio=.25, random_state=0):
        self.random_state = random_state
        self.train_ratio = train_ratio
        self.validation_ratio = validation_ratio
        self.test_ratio = test_ratio
        self.data_parser = data_parser

        # Seeded shuffle: without a seed the (seeded) splits below would still
        # differ from run to run.  `assign` adds the stratification key on a new
        # frame instead of writing into the shuffled slice.
        self.data = shuffle(data, random_state=random_state).assign(
            strat=lambda frame: frame["segment"].astype(str) + frame["visit"].astype(str)
        )
        self.data_w = self.data.loc[self.data['segment'].isin([self._TREATED_SEGMENT, self._CONTROL_SEGMENT])]
        # The men's-campaign subset is kept as an attribute but is not modelled here.
        self.data_m = self.data.loc[self.data['segment'].isin(["Mens E-Mail", self._CONTROL_SEGMENT])]

        self.train_data, self.validation_data, self.test_data = self._stratified_three_way(self.data)
        self.train_data_w, self.validation_data_w, self.test_data_w = self._stratified_three_way(self.data_w)

        self.c_tr, self.t_tr, self.o_tr = self._parse_split(self.train_data_w)
        self.c_va, self.t_va, self.o_va = self._parse_split(self.validation_data_w)
        self.c_te, self.t_te, self.o_te = self._parse_split(self.test_data_w)

        # One row per customer: features, then the treatment flag, then the outcome.
        self.td_tr = np.column_stack((self.c_tr, self.t_tr, self.o_tr))
        self.td_va = np.column_stack((self.c_va, self.t_va, self.o_va))
        self.td_te = np.column_stack((self.c_te, self.t_te, self.o_te))

        self.x = [f'x{i}' for i in range(self.c_tr.shape[1])]
        self.non_c = ['t', 'y']
        self.cols = self.x + self.non_c
        self.df_tr = pd.DataFrame(data=self.td_tr, columns=self.cols)
        self.df_va = pd.DataFrame(data=self.td_va, columns=self.cols)
        self.df_te = pd.DataFrame(data=self.td_te, columns=self.cols)

        self.param_tuning = {
            'max_depth': [3, 5, 7, 10],
            'n_estimators': [100, 200, 300],
            'random_state': [random_state]
        }

    def _stratified_three_way(self, frame):
        """Return (train, validation, test) parts of ``frame``, stratified on ``strat``."""
        train_part, holdout = train_test_split(
            frame, test_size=(1 - self.train_ratio),
            random_state=self.random_state, stratify=frame[['strat']])
        test_share = self.test_ratio / (self.test_ratio + self.validation_ratio)
        validation_part, test_part = train_test_split(
            holdout, test_size=test_share,
            random_state=self.random_state, stratify=holdout[['strat']])
        return train_part, validation_part, test_part

    def _parse_split(self, frame):
        """Run ``data_parser`` on ``frame`` and encode the treatment as 0/1."""
        context, treatment, outcome = self.data_parser(frame)
        # Compare against the label itself rather than relying on categorical
        # codes, whose values depend on category ordering and on the column
        # having been converted to a categorical beforehand.
        treated = (np.asarray(treatment) == self._TREATED_SEGMENT).astype(int)
        return context, treated, outcome

    def _tuned_forest(self):
        """Build a fresh grid-searched random-forest classifier."""
        # NOTE(review): the same classifier is used for outcome models; if econml
        # calls .predict on them the effects are built from hard 0/1 labels
        # rather than probabilities - confirm this is intended.
        return GridSearchCV(estimator=RandomForestClassifier(), param_grid=self.param_tuning,
                            cv=5, n_jobs=-1, verbose=1)

    def _fit_and_score(self, est, **fit_kwargs):
        """Fit ``est`` on the training frame and wrap its validation effects in UpliftEval."""
        est.fit(self.df_tr['y'].to_numpy(), self.df_tr['t'].to_numpy(),
                self.df_tr[self.x].to_numpy(), **fit_kwargs)
        pre = est.effect(self.df_va[self.x].to_numpy())
        return UpliftEval(self.df_va['t'], self.df_va['y'], pre)

    def CF(self):
        """Fit a ForestDMLCateEstimator and return an UpliftEval on the validation split."""
        est = ForestDMLCateEstimator(model_y=self._tuned_forest(),
                                     model_t=self._tuned_forest(),
                                     discrete_treatment=True,
                                     n_estimators=100,
                                     n_jobs=-1,
                                     random_state=self.random_state)
        return self._fit_and_score(est, inference='blb')

    def meta_learners(self, learner):
        """Fit the requested meta-learner and return an UpliftEval on the validation split.

        Parameters
        ----------
        learner : str
            One of ``'T_learner'``, ``'S_learner'`` or ``'X_learner'``.

        Raises
        ------
        ValueError
            If ``learner`` is not one of the supported names.
        """
        # Validate first so a typo fails with a clear message instead of an
        # UnboundLocalError further down.
        if learner not in self._LEARNERS:
            raise ValueError(f"Unknown learner {learner!r}; expected one of {self._LEARNERS}")

        if learner == 'T_learner':
            est = TLearner(self._tuned_forest())
        elif learner == 'S_learner':
            est = SLearner(self._tuned_forest())
        else:
            est = XLearner(models=self._tuned_forest(), propensity_model=self._tuned_forest())

        return self._fit_and_score(est)
        

In [None]:
# Build the Hillstrom experiment: shuffle, split and assemble the design frames.
t = Train(data, data_parser=parse_data_hillstrom)

In [None]:
# Fit the forest DML estimator on the training split, then compute and plot the
# 'aqini' curve (100 bins) from its predicted effects on the validation split.
estimated_CF = t.CF()
estimated_CF.calc(plot_type='aqini', n_bins=100)
estimated_CF.plot(plot_type='aqini')

In [None]:
# Same evaluation for the T-learner: fit on the training split, then compute and
# plot the 'aqini' curve (100 bins) on the validation split.
estimated_metalearners = t.meta_learners(learner = 'T_learner')
estimated_metalearners.calc(plot_type='aqini', n_bins=100)
estimated_metalearners.plot(plot_type='aqini')

# CS

In [None]:
# Load the CS dataset from a local semicolon-separated file.
# Note: this rebinds `data`, which held the Hillstrom frame in the cells above.
data = pd.read_csv("CS.csv",sep = ';')

def parse_data_chile(data):
    """Split a CS frame into (context, treatment, outcome) arrays.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``year``, ``sen_tenure``, ``jerarquia``, ``homophily``,
        ``indegree``, ``strengthoftie``, ``NAT``, ``CAMP`` and ``Y``.
        The frame is not modified.

    Returns
    -------
    tuple
        ``(context, treatment, outcome)``: the seven feature columns as a 2-D
        array, ``data["CAMP"].values``, and a 0/1 array that is 1 exactly where
        ``Y == 1`` (every other value, including missing ones, becomes 0).
    """
    feature_cols = ["year", "sen_tenure", "jerarquia", "homophily", "indegree", "strengthoftie", "NAT"]
    context = data[feature_cols].values
    treatment = data["CAMP"].values
    # Binarise into a new array instead of writing back into `data`: the caller
    # usually passes a train/test slice, and assigning to it both mutates the
    # caller's frame and triggers SettingWithCopyWarning.
    outcome = np.where(data["Y"] == 1, 1, 0)
    return (context, treatment, outcome)


class Train_chile:
    """Uplift experiment on the CS data with a single train / test split.

    The constructor shuffles the frame, makes one stratified train / test split,
    and assembles numeric design frames (``df_tr``, ``df_te``) whose columns are
    the features ``x0..xN`` followed by ``t`` (treatment) and ``y`` (outcome).
    ``CF`` and ``meta_learners`` fit an estimator on ``df_tr`` and score its
    predicted effects on ``df_te``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``CAMP`` and ``Y`` plus whatever ``data_parser`` reads.
    data_parser : callable
        ``frame -> (context, treatment, outcome)`` arrays.
    train_ratio : float
        Share of rows kept for training; the rest becomes the test split.
    random_state : int
        Seed used for the shuffle, the split, the tuning grid and the forest
        estimator, so that a fresh run reproduces the same partitions.
    """

    # Learner names accepted by ``meta_learners``.
    _LEARNERS = ('T_learner', 'S_learner', 'X_learner')

    def __init__(self, data, data_parser, train_ratio=.7, random_state=0):
        self.random_state = random_state
        self.train_ratio = train_ratio
        self.data_parser = data_parser

        # Seeded shuffle: without a seed the (seeded) split below would still
        # differ from run to run.  `assign` adds the stratification key on a new
        # frame instead of writing into the shuffled slice.
        # NOTE(review): strata use Y exactly as stored in `data`; if the parser
        # recodes Y, the strata may be finer than the modelled outcome.
        self.data = shuffle(data, random_state=random_state).assign(
            strat=lambda frame: frame["CAMP"].astype(str) + frame["Y"].astype(str)
        )

        self.train_data, self.test_data = train_test_split(
            self.data, test_size=(1 - train_ratio),
            random_state=random_state, stratify=self.data[['strat']])

        self.c_tr, self.t_tr, self.o_tr = self.data_parser(self.train_data)
        self.c_te, self.t_te, self.o_te = self.data_parser(self.test_data)

        # One row per observation: features, then treatment, then outcome.
        self.td_tr = np.column_stack((self.c_tr, self.t_tr, self.o_tr))
        self.td_te = np.column_stack((self.c_te, self.t_te, self.o_te))

        self.x = [f'x{i}' for i in range(self.c_tr.shape[1])]
        self.non_c = ['t', 'y']
        self.cols = self.x + self.non_c
        self.df_tr = pd.DataFrame(data=self.td_tr, columns=self.cols)
        self.df_te = pd.DataFrame(data=self.td_te, columns=self.cols)

        self.param_tuning = {
            'max_depth': [3, 5, 7, 10],
            'n_estimators': [100, 200, 300],
            'random_state': [random_state]
        }

    def _tuned_forest(self):
        """Build a fresh grid-searched random-forest classifier."""
        # NOTE(review): the same classifier is used for outcome models; if econml
        # calls .predict on them the effects are built from hard 0/1 labels
        # rather than probabilities - confirm this is intended.
        return GridSearchCV(estimator=RandomForestClassifier(), param_grid=self.param_tuning,
                            cv=5, n_jobs=-1, verbose=1)

    def _fit_and_score(self, est, **fit_kwargs):
        """Fit ``est`` on the training frame and wrap its test-split effects in UpliftEval."""
        est.fit(self.df_tr['y'].to_numpy(), self.df_tr['t'].to_numpy(),
                self.df_tr[self.x].to_numpy(), **fit_kwargs)
        pre = est.effect(self.df_te[self.x].to_numpy())
        return UpliftEval(self.df_te['t'], self.df_te['y'], pre)

    def CF(self):
        """Fit a ForestDMLCateEstimator and return an UpliftEval on the test split."""
        est = ForestDMLCateEstimator(model_y=self._tuned_forest(),
                                     model_t=self._tuned_forest(),
                                     discrete_treatment=True,
                                     n_estimators=100,
                                     n_jobs=-1,
                                     random_state=self.random_state)
        return self._fit_and_score(est, inference='blb')

    def meta_learners(self, learner):
        """Fit the requested meta-learner and return an UpliftEval on the test split.

        Parameters
        ----------
        learner : str
            One of ``'T_learner'``, ``'S_learner'`` or ``'X_learner'``.

        Raises
        ------
        ValueError
            If ``learner`` is not one of the supported names.
        """
        # Validate first so a typo fails with a clear message instead of an
        # UnboundLocalError further down.
        if learner not in self._LEARNERS:
            raise ValueError(f"Unknown learner {learner!r}; expected one of {self._LEARNERS}")

        if learner == 'T_learner':
            est = TLearner(self._tuned_forest())
        elif learner == 'S_learner':
            est = SLearner(self._tuned_forest())
        else:
            est = XLearner(models=self._tuned_forest(), propensity_model=self._tuned_forest())

        return self._fit_and_score(est)

# Build the CS experiment: shuffle, split and assemble the design frames.
t_chile = Train_chile(data, data_parser=parse_data_chile)

In [None]:
# Fit the forest DML estimator on the CS training split, then compute and plot
# the 'aqini' curve (100 bins) from its predicted effects on the test split.
estimated_CF = t_chile.CF()
estimated_CF.calc(plot_type='aqini', n_bins=100)
estimated_CF.plot(plot_type='aqini')

In [None]:
# T-learner on the CS data: fit on the training split, then compute and plot the
# 'aqini' curve (100 bins) on the test split.
estimated_metalearners = t_chile.meta_learners(learner = 'T_learner')
estimated_metalearners.calc(plot_type='aqini', n_bins=100)
estimated_metalearners.plot(plot_type='aqini')

In [None]:
# S-learner on the CS data: fit on the training split, then compute and plot the
# 'aqini' curve (100 bins) on the test split.
estimated_metalearners = t_chile.meta_learners(learner = 'S_learner')
estimated_metalearners.calc(plot_type='aqini', n_bins=100)
estimated_metalearners.plot(plot_type='aqini')

In [None]:
# X-learner on the CS data: fit on the training split, then compute and plot the
# 'aqini' curve (100 bins) on the test split.
estimated_metalearners = t_chile.meta_learners(learner = 'X_learner')
estimated_metalearners.calc(plot_type='aqini', n_bins=100)
estimated_metalearners.plot(plot_type='aqini')