In [34]:
import numpy as np
import pandas as pd
from scipy import stats as sps
from sklearn import linear_model
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt 
import warnings
from dataclasses import dataclass
import statsmodels.api as sm
tqdm.pandas()
plt.style.use('ggplot')
pd.set_option('use_inf_as_na', True)
plt.rcParams['figure.figsize'] = (15,6)
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_predict
from copy import copy

warnings.simplefilter('ignore')


In [2]:
base_data = pd.read_pickle('../data/expN34.pkl')
data = base_data.query('click < view & view < view.quantile(.99)').drop('cov_device',axis=1)
data['ctr'] = data['click'] / data['view']

In [3]:
data_ab = data.query('part == "AB"').drop(['part'],axis=1)
data_aa = data.query('part != "AB"').drop(['part'],axis=1)

In [4]:
Y = 'ctr'
T = 'ab'
X = data.filter(regex='cov').columns.tolist()

In [23]:
params = {
    'X':X,
    'T':T,
    'Y':Y
}

# OLS as stat test

In [14]:
def bootstrap_func(func,data,params,n_bootstraps=500):
    
    return [func(data=data.sample(frac=1,replace=True),**params) for _ in tqdm(range(n_bootstraps),position=0)]

In [145]:
def ols_ttest(data:pd.DataFrame,
              T:str,
              Y:str,
              X:list=[]):
    lm = sm.formula.ols(f'{Y}~{T}',data=data.astype(float)).fit()
    return lm.summary2().tables[1].loc[T,'Coef.']

In [146]:
def IPV(data,
        X:list,
        T:str,
        Y:str,
        is_raw_ps=False):
    ps = linear_model.LogisticRegression(C=1e-6,penalty='none').fit(data[X], data[T]).predict_proba(data[X])[:, 1]
    weight = (data[T]-ps) / (ps*(1-ps)) # define the weights
    if is_raw_ps:
        return ps
    return (weight * data[Y]).mean() # compute the ATE

In [147]:
def s_learner(data:pd.DataFrame,
        X:list,
        T:str,
        Y:str,
        model=LGBMRegressor()):
    
    x0 = data[X+[T]]
    y0 = data[Y]
    model.fit(x0,y0)

    return model.predict(x0.assign(**{T:1})).mean() - model.predict(x0.assign(**{T:0})).mean()


In [148]:
def t_learner(data:pd.DataFrame,
        X:list,
        T:str,
        Y:str
        ,model=LGBMRegressor()):
    X0 = data.query(f'{T}==0')[X]
    Y0 = data.query(f'{T}==0')[Y]

    X1 = data.query(f'{T}==1')[X]
    Y1 = data.query(f'{T}==1')[Y]

    m0 = copy(model)
    m1 = copy(model)

    m0.fit(X0,Y0)
    m1.fit(X1,Y1)

    return m1.predict(data[X]).mean() - m0.predict(data[X]).mean()


In [149]:
def r_learner(data:pd.DataFrame,
                X:list,
                T:str,
                Y:str,
                model=LGBMRegressor()):

    train_data = data[X]
    target = data[Y]
    threatment = data[T]
    
    debaise = copy(model)
    denoise = copy(model)


    T_res = threatment - cross_val_predict(debaise,train_data,threatment,cv=10)
    Y_res = target - cross_val_predict(denoise,train_data,target,cv=10)

    W = T_res **2
    Y_star = Y_res / T_res

    final_model = copy(model)

    final_model.fit(train_data,Y_star,sample_weight=W)

    return final_model.predict(train_data).mean()


In [150]:
def doubly_robust(data:pd.DataFrame,
                X:list,
                T:str,
                Y:str):
    ps = linear_model.LogisticRegression(C=1e6,penalty='none').fit(data[X], data[T]).predict_proba(data[X])[:, 1]
    mu0 = linear_model.LinearRegression().fit(data.query(f"{T}==0")[X], data.query(f"{T}==0")[Y]).predict(data[X])
    mu1 = linear_model.LinearRegression().fit(data.query(f"{T}==1")[X], data.query(f"{T}==1")[Y]).predict(data[X])

    control_metric = np.mean(
        (data[Y] - mu0)
        *(1-data[T])/(1-ps))

    treatment_metric = np.mean(
        (data[Y] - mu1)
        * data[T]/ps)

    return treatment_metric - control_metric

In [152]:
result = {}
for method in (ols_ttest,IPV,s_learner,t_learner,r_learner,doubly_robust):
    result[method.__name__]= bootstrap_func(method,data_ab,params=params,n_bootstraps=500)

100%|██████████| 500/500 [01:07<00:00,  7.37it/s]
100%|██████████| 500/500 [07:25<00:00,  1.12it/s]
100%|██████████| 500/500 [03:10<00:00,  2.63it/s]
100%|██████████| 500/500 [03:55<00:00,  2.13it/s]
100%|██████████| 500/500 [44:23<00:00,  5.33s/it]
100%|██████████| 500/500 [09:03<00:00,  1.09s/it]


In [153]:
zz = pd.DataFrame(result)

In [173]:
zz.apply(lambda x: x>=0.000).mean()

ols_ttest        0.974
IPV              0.850
s_learner        0.860
t_learner        0.780
r_learner        0.752
doubly_robust    0.858
dtype: float64

In [163]:
def bts_ttest(bts_data:list,threshold=0) -> float:
    bool_data = np.array(bts_data) <= threshold
    agg_data = bool_data.mean()
    result = min(agg_data,1-agg_data) * 2
    return result

In [167]:
zz.apply(bts_ttest,threshold=0)

ols_ttest        0.052
IPV              0.300
s_learner        0.280
t_learner        0.440
r_learner        0.496
doubly_robust    0.284
dtype: float64

In [169]:
zz.apply(bts_ttest,threshold=-0.001)

ols_ttest        0.004
IPV              0.060
s_learner        0.008
t_learner        0.084
r_learner        0.084
doubly_robust    0.032
dtype: float64