In [1]:
import os
import sys
import logging
import pandas as pd
import numpy as np
import yaml

# Make the parent of the current working directory importable so the project
# packages imported below (stratification, prepilot_local, analysis) resolve.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
_project_root = os.path.dirname(os.path.abspath(''))
if _project_root not in sys.path:
    sys.path.append(_project_root)
#from utils.spark import restart_spark
from stratification.params import SplitBuilderParams
from prepilot_local.params import PrepilotParams
from prepilot_local.prepilot_experiment_builder import PrepilotExperimentBuilder
from prepilot_local.prepilot_split_builder import PrepilotSplitBuilder
from analysis.abtest import ABTest
from analysis.ab_params import ABTestParams

# Configure the root logger at INFO level; the project modules log their
# progress at this level (see the INFO:stratification.* lines in the run output).
logging.basicConfig(level = logging.INFO)

# IPython autoreload extension in mode 2, so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [5]:
# Input dataset for the prepilot run (path is relative to the notebook's working directory).
df = pd.read_csv('ab_data.csv')

# A/B-test configuration, parsed with the safe YAML loader.
# A malformed file now raises yaml.YAMLError with a full traceback. Previously the
# error was only printed, which left `ab_config` undefined and made the cell that
# reads it fail later with an unrelated NameError.
with open("../analysis/configs/config.yaml", "r") as stream:
    ab_config = yaml.safe_load(stream)

In [6]:
# Synthetic region label used as `region_col` below: every row gets a pseudo-random
# city code in {"1", "2", "3", "4"} (the upper bound 5 is exclusive).
# A dedicated, seeded generator makes the assignment reproducible across
# Restart & Run All without touching NumPy's global random state.
CITY_SEED = 42
city_rng = np.random.default_rng(CITY_SEED)
df["moda_city"] = city_rng.integers(1, 5, size=df.shape[0])
df["moda_city"] = df["moda_city"].astype(str)

# The row index doubles as the customer identifier (`customer_col="id"` below).
df["id"] = df.index

In [None]:
# Explicit imports instead of `from analysis.ab_params import *`, so it is clear
# where every parameter class comes from and the notebook namespace is not
# flooded with unrelated names.
from analysis.ab_params import (
    ABTestParams,
    DataParams,
    HypothesisParams,
    MetricParams,
    ResultParams,
    SimulationParams,
    SplitterParams,
)

# Build each parameter group from the matching section of the YAML config.
metric_params = MetricParams(**ab_config['metric_params'])
data_params = DataParams(**ab_config['data_params'])
simulation_params = SimulationParams(**ab_config['simulation_params'])
hypothesis_params = HypothesisParams(**ab_config['hypothesis_params'])
result_params = ResultParams(**ab_config['result_params'])
splitter_params = SplitterParams(**ab_config['splitter_params'])
# Disabled; add BootstrapParams to the import list above if this is re-enabled.
#bootstrap_params = BootstrapParams(**ab_config['bootstrap_params'])

# Positional order is kept exactly as in the original call.
ab_params = ABTestParams(
    metric_params,
    data_params,
    simulation_params,
    hypothesis_params,
    result_params,
    splitter_params,
)

In [7]:
# Default-constructed ABTestParams (no arguments passed).
# NOTE(review): this rebinds `ab_params`; if the YAML-driven cell above has been
# run, the instance it built is replaced by this one. Confirm which of the two
# is the intended input for the prepilot run.
ab_params = ABTestParams()

In [8]:
#DATE_FROM = datetime.date(2021, 4, 1)  # CHANGE
#DATE_TO = datetime.date(2021, 4, 15)  # CHANGE
#synthetic_catalog_ids = []  # List[int]
#CAMPAIGN_ID = "CVM-0-0"  # CHANGE - for naming final output files


# Split-builder settings: column roles in `df` plus the statistical-test options.
split_builder_params = SplitBuilderParams(
    # Both group sizes are passed as None.
    map_group_names_to_sizes=dict.fromkeys(("control", "target")),
    region_col="moda_city",
    split_metric_col="height_now",
    customer_col="id",
    cols=list(),
    # Currently empty; 'offer_rk_goal' and 'offer_rk_campaign' are disabled.
    cat_cols=list(),
    pvalue=0.05,
    n_top_cat=100,
    stat_test="ttest_ind",
)

In [9]:
# Prepilot settings: metric(s) to evaluate, the injected effects to try, the grid of
# group sizes, and the bootstrap / bucketing options.
prepilot_params = PrepilotParams(
    # Not passed in this run:
    #   datestart=datestart,
    #   datepostperiod=datepostperiod,
    metrics_names=["height_now"],
    # Alternative grid kept for reference:
    #   [1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07, 1.08, 1.09, 1.1, 1.15, 1.2, 1.25]
    injects=[1.000001, 1.00001, 1.0001, 1.001],
    min_group_size=50_000,
    max_group_size=100_000,
    step=50_000,
    variance_reduction=ABTest.cupac,
    use_buckets=True,
    bootstrap_metric=np.mean,  # alternative: np.median
    iterations_number=3,
    n_buckets=10_000,
    max_beta_score=0.5,
    min_beta_score=0.02,
)

In [10]:
# Hand the dataset and the three parameter objects to the experiment builder.
prepilot = PrepilotExperimentBuilder(
    df,
    ab_params,
    prepilot_params,
    split_builder_params,
)

In [11]:
%%time
beta,alpha = prepilot.collect()

INFO:stratification.split_builder:Calculate stratas for guest table
INFO:stratification.binning:25284 outliers found
INFO:stratification.split_builder:control: Desired size = 0.25,         resulting size = 50001, diff = 0.0 %
INFO:stratification.split_builder:target: Desired size = 0.3333355555703705,         resulting size = 49998, diff = 0.0 %
INFO:stratification.split_builder:Success!
INFO:stratification.split_builder:control: Desired size = 0.25,         resulting size = 50001, diff = 0.0 %
INFO:stratification.split_builder:target: Desired size = 0.3333355555703705,         resulting size = 49998, diff = 0.0 %
INFO:stratification.split_builder:Success!
INFO:stratification.split_builder:control: Desired size = 0.25,         resulting size = 50001, diff = 0.0 %
INFO:stratification.split_builder:target: Desired size = 0.3333355555703705,         resulting size = 49998, diff = 0.0 %
INFO:stratification.split_builder:Success!
INFO:stratification.split_builder:control: Desired size = 0.5

                                 OLS Regression Results                                
Dep. Variable:            height_prev   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          5.987e+08
Date:                Mon, 23 May 2022   Prob (F-statistic):                        0.00
Time:                        18:29:08   Log-Likelihood:                     -2.2182e+05
No. Observations:               99999   AIC:                                  4.436e+05
Df Residuals:                   99998   BIC:                                  4.437e+05
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

100%|██████████| 200/200 [00:00<00:00, 21612.33it/s]


                                 OLS Regression Results                                
Dep. Variable:            height_prev   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          5.976e+08
Date:                Mon, 23 May 2022   Prob (F-statistic):                        0.00
Time:                        18:29:08   Log-Likelihood:                     -2.2190e+05
No. Observations:               99999   AIC:                                  4.438e+05
Df Residuals:                   99998   BIC:                                  4.438e+05
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

100%|██████████| 200/200 [00:00<00:00, 21356.47it/s]

                                 OLS Regression Results                                
Dep. Variable:            height_prev   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          5.984e+08
Date:                Mon, 23 May 2022   Prob (F-statistic):                        0.00
Time:                        18:29:09   Log-Likelihood:                     -2.2184e+05
No. Observations:               99999   AIC:                                  4.437e+05
Df Residuals:                   99998   BIC:                                  4.437e+05
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------


100%|██████████| 200/200 [00:00<00:00, 21007.76it/s]


                                 OLS Regression Results                                
Dep. Variable:            height_prev   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          5.987e+08
Date:                Mon, 23 May 2022   Prob (F-statistic):                        0.00
Time:                        18:29:09   Log-Likelihood:                     -2.2182e+05
No. Observations:               99999   AIC:                                  4.436e+05
Df Residuals:                   99998   BIC:                                  4.437e+05
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

100%|██████████| 200/200 [00:00<00:00, 22046.28it/s]


                                 OLS Regression Results                                
Dep. Variable:            height_prev   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          5.976e+08
Date:                Mon, 23 May 2022   Prob (F-statistic):                        0.00
Time:                        18:29:09   Log-Likelihood:                     -2.2190e+05
No. Observations:               99999   AIC:                                  4.438e+05
Df Residuals:                   99998   BIC:                                  4.438e+05
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

100%|██████████| 200/200 [00:00<00:00, 21299.53it/s]


                                 OLS Regression Results                                
Dep. Variable:            height_prev   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          5.984e+08
Date:                Mon, 23 May 2022   Prob (F-statistic):                        0.00
Time:                        18:29:09   Log-Likelihood:                     -2.2184e+05
No. Observations:               99999   AIC:                                  4.437e+05
Df Residuals:                   99998   BIC:                                  4.437e+05
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

100%|██████████| 200/200 [00:00<00:00, 21122.01it/s]


                                 OLS Regression Results                                
Dep. Variable:            height_prev   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          1.198e+09
Date:                Mon, 23 May 2022   Prob (F-statistic):                        0.00
Time:                        18:29:09   Log-Likelihood:                     -4.4357e+05
No. Observations:              200000   AIC:                                  8.871e+05
Df Residuals:                  199999   BIC:                                  8.871e+05
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

100%|██████████| 200/200 [00:00<00:00, 22048.59it/s]


                                 OLS Regression Results                                
Dep. Variable:            height_prev   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          1.198e+09
Date:                Mon, 23 May 2022   Prob (F-statistic):                        0.00
Time:                        18:29:09   Log-Likelihood:                     -4.4357e+05
No. Observations:              200000   AIC:                                  8.871e+05
Df Residuals:                  199999   BIC:                                  8.871e+05
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

100%|██████████| 200/200 [00:00<00:00, 21419.73it/s]


                                 OLS Regression Results                                
Dep. Variable:            height_prev   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          1.198e+09
Date:                Mon, 23 May 2022   Prob (F-statistic):                        0.00
Time:                        18:29:09   Log-Likelihood:                     -4.4357e+05
No. Observations:              200000   AIC:                                  8.871e+05
Df Residuals:                  199999   BIC:                                  8.871e+05
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

100%|██████████| 200/200 [00:00<00:00, 21470.71it/s]


                                 OLS Regression Results                                
Dep. Variable:            height_prev   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          1.198e+09
Date:                Mon, 23 May 2022   Prob (F-statistic):                        0.00
Time:                        18:29:10   Log-Likelihood:                     -4.4357e+05
No. Observations:              200000   AIC:                                  8.871e+05
Df Residuals:                  199999   BIC:                                  8.871e+05
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

100%|██████████| 200/200 [00:00<00:00, 19140.27it/s]


                                 OLS Regression Results                                
Dep. Variable:            height_prev   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          1.198e+09
Date:                Mon, 23 May 2022   Prob (F-statistic):                        0.00
Time:                        18:29:10   Log-Likelihood:                     -4.4357e+05
No. Observations:              200000   AIC:                                  8.871e+05
Df Residuals:                  199999   BIC:                                  8.871e+05
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

100%|██████████| 200/200 [00:00<00:00, 22104.37it/s]


                                 OLS Regression Results                                
Dep. Variable:            height_prev   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          1.198e+09
Date:                Mon, 23 May 2022   Prob (F-statistic):                        0.00
Time:                        18:29:10   Log-Likelihood:                     -4.4357e+05
No. Observations:              200000   AIC:                                  8.871e+05
Df Residuals:                  199999   BIC:                                  8.871e+05
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

100%|██████████| 200/200 [00:00<00:00, 21556.79it/s]


                                 OLS Regression Results                                
Dep. Variable:            height_prev   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          5.987e+08
Date:                Mon, 23 May 2022   Prob (F-statistic):                        0.00
Time:                        18:29:10   Log-Likelihood:                     -2.2182e+05
No. Observations:               99999   AIC:                                  4.436e+05
Df Residuals:                   99998   BIC:                                  4.437e+05
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

100%|██████████| 200/200 [00:00<00:00, 21750.74it/s]


                                 OLS Regression Results                                
Dep. Variable:            height_prev   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          5.976e+08
Date:                Mon, 23 May 2022   Prob (F-statistic):                        0.00
Time:                        18:29:10   Log-Likelihood:                     -2.2190e+05
No. Observations:               99999   AIC:                                  4.438e+05
Df Residuals:                   99998   BIC:                                  4.438e+05
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

100%|██████████| 200/200 [00:00<00:00, 22614.46it/s]

                                 OLS Regression Results                                
Dep. Variable:            height_prev   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          5.984e+08
Date:                Mon, 23 May 2022   Prob (F-statistic):                        0.00
Time:                        18:29:10   Log-Likelihood:                     -2.2184e+05
No. Observations:               99999   AIC:                                  4.437e+05
Df Residuals:                   99998   BIC:                                  4.437e+05
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------


100%|██████████| 200/200 [00:00<00:00, 21216.03it/s]


                                 OLS Regression Results                                
Dep. Variable:            height_prev   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          1.198e+09
Date:                Mon, 23 May 2022   Prob (F-statistic):                        0.00
Time:                        18:29:10   Log-Likelihood:                     -4.4357e+05
No. Observations:              200000   AIC:                                  8.871e+05
Df Residuals:                  199999   BIC:                                  8.871e+05
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

100%|██████████| 200/200 [00:00<00:00, 21833.39it/s]


                                 OLS Regression Results                                
Dep. Variable:            height_prev   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          1.198e+09
Date:                Mon, 23 May 2022   Prob (F-statistic):                        0.00
Time:                        18:29:11   Log-Likelihood:                     -4.4357e+05
No. Observations:              200000   AIC:                                  8.871e+05
Df Residuals:                  199999   BIC:                                  8.871e+05
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

100%|██████████| 200/200 [00:00<00:00, 21459.18it/s]


                                 OLS Regression Results                                
Dep. Variable:            height_prev   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          1.198e+09
Date:                Mon, 23 May 2022   Prob (F-statistic):                        0.00
Time:                        18:29:11   Log-Likelihood:                     -4.4357e+05
No. Observations:              200000   AIC:                                  8.871e+05
Df Residuals:                  199999   BIC:                                  8.871e+05
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

100%|██████████| 200/200 [00:00<00:00, 22017.34it/s]


CPU times: user 17.5 s, sys: 4.89 s, total: 22.4 s
Wall time: 11.2 s


In [12]:
# First object returned by prepilot.collect(). In the saved output it is a table
# keyed by metric and MDE (the `injects` values) with one column per split size;
# cells are reported as ">=0.5" / "<=0.02" at the max_beta_score / min_beta_score
# bounds configured in prepilot_params.
beta

Unnamed: 0_level_0,split_rate,"(50000, 50000)","(100000, 100000)"
metric,MDE,Unnamed: 2_level_1,Unnamed: 3_level_1
height_now,1.000001,>=0.5,>=0.5
height_now,1.00001,>=0.5,>=0.5
height_now,1.0001,>=0.5,0.3333333333
height_now,1.001,<=0.02,<=0.02


In [13]:
# Second object returned by prepilot.collect(). In the saved output it has one row
# per metric and one column per split size.
alpha

split_rate,"(50000, 50000)","(100000, 100000)"
metric,Unnamed: 1_level_1,Unnamed: 2_level_1
height_now,0.0,0.0
