In [1]:
import os
import sys
import logging
import pandas as pd
import numpy as np
import yaml

# os.path.abspath('') resolves to the current working directory, so this appends
# that directory's parent to sys.path.
# NOTE(review): presumably needed so the project packages imported below resolve
# when the notebook is run from a subfolder of the repository - confirm.
sys.path.append(os.path.dirname(os.path.abspath('')))
#from utils.spark import restart_spark
from stratification.params import SplitBuilderParams
from prepilot.params import PrepilotParams
from prepilot.prepilot_experiment_builder import PrepilotExperimentBuilder
from prepilot.prepilot_split_builder import PrepilotSplitBuilder
from auto_ab.abtest import ABTest
from auto_ab.params import ABTestParams

# Configure root logging at INFO level.
logging.basicConfig(level = logging.INFO)

# Load IPython's autoreload extension and switch it to mode 2.
# NOTE(review): per the IPython docs, mode 2 re-imports all modules before each
# cell execution, so edits to the project packages are picked up - confirm this
# is still wanted in the shared version of the notebook.
%load_ext autoreload
%autoreload 2

In [2]:
# Load the experiment data set; the path is relative to the working directory.
df = pd.read_csv('ab_data.csv')

# Parse the auto_ab YAML configuration into `ab_config`.
# A malformed file now raises yaml.YAMLError right here. Previously the error was
# only printed, which left `ab_config` undefined and made the failure surface
# later as an unrelated NameError in the cell that builds the parameter objects.
with open("../auto_ab/configs/auto_ab.config.yaml", "r") as stream:
    ab_config = yaml.safe_load(stream)

In [3]:
# Attach a synthetic region label and a customer id to every row of `df`.
# A dedicated, seeded generator keeps the random labels identical across
# Restart & Run All; the bare np.random.randint call drew new labels on each run.
city_rng = np.random.default_rng(42)
# integers() excludes the upper bound, so the labels are "1".."4" as before.
df["moda_city"] = city_rng.integers(1, 5, size=df.shape[0]).astype(str)
# The row index doubles as the customer identifier column.
df["id"] = df.index

In [5]:
# Explicit imports replace `from auto_ab.params import *` so every name used in
# this cell has a visible origin and nothing else leaks into the namespace.
from auto_ab.params import (
    ABTestParams,
    DataParams,
    HypothesisParams,
    ResultParams,
    SimulationParams,
    SplitterParams,
)

# Build one parameter object per section of the loaded YAML config; each
# section's keys are passed through as keyword arguments.
data_params = DataParams(**ab_config['data_params'])
simulation_params = SimulationParams(**ab_config['simulation_params'])
hypothesis_params = HypothesisParams(**ab_config['hypothesis_params'])
result_params = ResultParams(**ab_config['result_params'])
splitter_params = SplitterParams(**ab_config['splitter_params'])

# Bundle the five groups (passed positionally, in this order) into ABTestParams.
ab_params = ABTestParams(
    data_params,
    simulation_params,
    hypothesis_params,
    result_params,
    splitter_params,
)

In [6]:
# Rebind `ab_params` to an ABTestParams built with no arguments.
# NOTE(review): this replaces the instance assembled from the YAML config
# sections earlier in the notebook, so those config values are not used from
# here on - confirm this is intended and not leftover scratch code.
ab_params = ABTestParams()

In [7]:
# Replace data_params.treatment with a NumPy array built from its current value.
# NOTE(review): presumably downstream code expects an ndarray here - confirm.
ab_params.data_params.treatment = np.array(ab_params.data_params.treatment)

In [8]:
#DATE_FROM = datetime.date(2021, 4, 1)  # CHANGE
#DATE_TO = datetime.date(2021, 4, 15)  # CHANGE
#synthetic_catalog_ids = []  # List[int]
#CAMPAIGN_ID = "CVM-0-0"  # CHANGE - for naming final output files


# Stratified-split configuration handed to the prepilot builder further down.
split_builder_params = SplitBuilderParams(
    # Columns of `df` that the splitter is pointed at.
    customer_col="id",
    region_col="moda_city",
    split_metric_col="height_now",
    # Two groups, both with their size left unset (None).
    map_group_names_to_sizes=dict(control=None, target=None),
    # No extra columns; the categorical candidates 'offer_rk_goal' and
    # 'offer_rk_campaign' remain disabled.
    cols=[],
    cat_cols=[],
    n_top_cat=100,
    # Statistical test name and p-value setting passed through to the splitter.
    stat_test="ttest_ind",
    pvalue=0.05,
)

In [9]:
# Prepilot experiment configuration.
# `datestart` / `datepostperiod` are intentionally not passed here.
prepilot_params = PrepilotParams(
    metrics_names=["height_now"],
    # Inject values to evaluate. A coarser grid that was tried as an alternative:
    # [1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07, 1.08, 1.09, 1.1, 1.15, 1.2, 1.25]
    injects=[1.000001, 1.00001, 1.0001, 1.001],
    # Group-size settings: lower bound, upper bound and increment.
    min_group_size=50000,
    max_group_size=100000,
    step=50000,
    # Callables taken from ABTest for variance reduction and hypothesis testing.
    variance_reduction=ABTest.cuped,
    stat_test=ABTest.test_hypothesis_boot_confint,
    # Statistic used for bootstrapping (np.median is the noted alternative).
    bootstrap_metric=np.mean,
    use_buckets=True,
    n_buckets=10000,
    iterations_number=3,
    # Lower / upper beta-score settings.
    min_beta_score=0.02,
    max_beta_score=0.5,
)

In [10]:
# Wire the data frame and the three parameter objects into the experiment builder.
prepilot = PrepilotExperimentBuilder(
    df,
    ab_params,
    prepilot_params,
    split_builder_params,
)

In [11]:
%%time
beta,alpha = prepilot.collect()

INFO:stratification.split_builder:Calculate stratas for guest table
INFO:stratification.binning:23816 outliers found
INFO:stratification.split_builder:control: Desired size = 0.25,         resulting size = 50000, diff = 0.0 %
INFO:stratification.split_builder:target: Desired size = 0.3333333333333333,         resulting size = 50000, diff = 0.0 %
INFO:stratification.split_builder:Success!
INFO:stratification.split_builder:control: Desired size = 0.25,         resulting size = 50000, diff = 0.0 %
INFO:stratification.split_builder:target: Desired size = 0.3333333333333333,         resulting size = 50000, diff = 0.0 %
INFO:stratification.split_builder:Success!
INFO:stratification.split_builder:control: Desired size = 0.25,         resulting size = 50000, diff = 0.0 %
INFO:stratification.split_builder:target: Desired size = 0.3333333333333333,         resulting size = 50000, diff = 0.0 %
INFO:stratification.split_builder:Success!
INFO:stratification.split_builder:control: Desired size = 0.5

CPU times: user 6.98 s, sys: 432 ms, total: 7.41 s
Wall time: 10.3 s


In [12]:
# Display the first of the two objects returned by prepilot.collect().
beta

Unnamed: 0_level_0,split_rate,"(50000, 50000)","(100000, 100000)"
metric,MDE,Unnamed: 2_level_1,Unnamed: 3_level_1
height_now,1.000001,>=0.5,>=0.5
height_now,1.00001,>=0.5,>=0.5
height_now,1.0001,>=0.5,0.3333333333
height_now,1.001,<=0.02,<=0.02


In [13]:
# Display the second of the two objects returned by prepilot.collect().
alpha

split_rate,"(50000, 50000)","(100000, 100000)"
metric,Unnamed: 1_level_1,Unnamed: 2_level_1
height_now,0.0,0.0
