In [1]:
# Silence every warning for the whole notebook session.
import warnings
warnings.filterwarnings('ignore')

import os
import sys
import logging
import pandas as pd
import numpy as np
import yaml

# os.path.abspath('') is the current working directory; its parent is appended
# to sys.path so the local `auto_ab` package imported below can be found
# (assumes the notebook is run from a direct subdirectory of the project root
# -- TODO confirm).
sys.path.append(os.path.dirname(os.path.abspath('')))

from auto_ab.abtest import ABTest
from auto_ab.params import ABTestParams

# Configure the root logger at INFO level.
logging.basicConfig(level = logging.INFO)


# IPython autoreload extension in mode 2: modules are re-imported before each
# cell execution, so edits to the local package are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
# Load the experiment dataset; the path is relative to the notebook's working
# directory.
df = pd.read_csv('ab_data.csv')

# Parse the package's YAML configuration.
# `ab_config` is bound to None up front so the name always exists: a YAML
# parsing error is only printed (best-effort load), and without this default
# any later reference to `ab_config` would fail with a NameError instead of
# showing that the config could not be loaded.
ab_config = None
with open("../auto_ab/configs/auto_ab.config.yaml", "r") as stream:
    try:
        ab_config = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)

In [3]:
# Explicit imports instead of `from auto_ab.params import *`: it stays visible
# where each parameter class comes from, and nothing else from the module can
# silently shadow names already defined in the notebook.
from auto_ab.params import (
    ABTestParams,
    DataParams,
    HypothesisParams,
    ResultParams,
    SimulationParams,
    SplitterParams,
)

# Dataset-related settings (mostly column names). The continuous metric
# 'height_now' is the target for the first part of the notebook.
data_params = DataParams(
    id_col='id',
    group_col='groups',
    strata_col='country',
    target='height_now',
    target_flg='bought',
    predictors=['weight_now'],
    numerator='clicks',
    denominator='sessions',
    covariate='height_prev',
    target_prev='height_prev',
    predictors_prev=['weight_prev'],
    cluster_col='kl-divergence',
    clustering_cols=['col1', 'col2', 'col3'],
    is_grouped=True
)

# Simulation settings; not exercised by the post-analysis calls in this
# notebook, but required to build ABTestParams.
simulation_params = SimulationParams(
    n_iter=100,
    split_rates=[0.1, 0.2, 0.3, 0.4, 0.5],
    vars=[0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
    extra_params=[]
)

# Hypothesis-testing settings. n_boot_samples=200 matches the 200-step progress
# bars shown in the recorded bootstrap outputs.
hypothesis_params = HypothesisParams(
    alpha=0.05,
    beta=0.2,
    alternative='two-sided',
    split_ratios=(0.5, 0.5),
    strategy='simple_test',
    strata='country',
    strata_weights={'US': 0.8, 'UK': 0.2},
    metric_type='solid',
    metric_name='mean',
    metric=np.mean,
    n_boot_samples=200,
    n_buckets=50
)

# NOTE(review): csv_path is an absolute, environment-specific location; adjust
# it (or set to_csv=False) when running outside that environment.
result_params = ResultParams(
    to_csv=True,
    csv_path='/app/data/internal/guide/solid_mde.csv'
)

splitter_params = SplitterParams(
    split_rate=0.5,
    name='default'
)

# Bundle all parameter groups; the arguments are positional, so their order
# must stay data, simulation, hypothesis, result, splitter.
ab_params = ABTestParams(
    data_params,
    simulation_params,
    hypothesis_params,
    result_params,
    splitter_params,
)

# Примеры с постанализом

In [4]:
# Build the test object from the loaded dataset and the parameter bundle.
ab_test = ABTest(df, ab_params)

In [5]:
# Run the three hypothesis tests on the current `ab_test`, printing each result
# dictionary immediately after it is computed.
boot_result = ab_test.test_hypothesis_boot_confint()
print(f"bootstrap_test: {boot_result}")

mannwhitney_result = ab_test.test_hypothesis_mannwhitney()
print(f"mannwhitney_test: {mannwhitney_result}")

ttest_result = ab_test.test_hypothesis_ttest()
print(f"ttest_test: {ttest_result}")

  0%|          | 0/200 [00:00<?, ?it/s]

bootstrap_test: {'stat': None, 'p-value': None, 'result': 0}
mannwhitney_test: {'stat': 4992976857.0, 'p-value': 0.5890975776571467, 'result': 0}
ttest_test: {'stat': -0.4435517988023507, 'p-value': 0.6573671411499618, 'result': 0}


In [6]:
# Inject a synthetic 0.1% effect: rows whose group is 'A' keep their target
# value, every other row gets target * 1.001 (Series.where replaces values
# where the condition is False).
#
# The result is assigned back to the column explicitly. The previous chained
# `df[col].where(..., inplace=True)` mutated a temporary Series, which does not
# reliably write through to `df` and is a silent no-op under pandas
# Copy-on-Write.
#
# NOTE: re-running this cell without reloading `df` compounds the effect.
target_col = ab_test.params.data_params.target
group_col = ab_test.params.data_params.group_col
keep_original = df[group_col] == 'A'
df[target_col] = df[target_col].where(keep_original, df[target_col] * 1.001)

# Rebuild the test object on the modified data.
ab_test = ABTest(df, ab_params)

In [7]:
# Same three tests as before, now on the data with the injected effect; each
# result is printed right after its test finishes.
boot_result = ab_test.test_hypothesis_boot_confint()
print(f"bootstrap_test: {boot_result}")

mannwhitney_result = ab_test.test_hypothesis_mannwhitney()
print(f"mannwhitney_test: {mannwhitney_result}")

ttest_result = ab_test.test_hypothesis_ttest()
print(f"ttest_test: {ttest_result}")

  0%|          | 0/200 [00:00<?, ?it/s]

bootstrap_test: {'stat': None, 'p-value': None, 'result': 1}
mannwhitney_test: {'stat': 4894119324.0, 'p-value': 2.4516264484905984e-16, 'result': 1}
ttest_test: {'stat': -8.275588883468059, 'p-value': 1.2859564365137272e-16, 'result': 1}


# Понижение дисперсии

In [8]:
# Start from a fresh copy of the dataset so earlier modifications do not leak
# into this experiment.
df = pd.read_csv('ab_data.csv')

# Inject a very small synthetic effect (0.01%): rows in group 'A' keep their
# target value, every other row gets target * 1.0001.
# The column is assigned back explicitly; the previous chained
# `df[col].where(..., inplace=True)` mutated a temporary Series and is a silent
# no-op under pandas Copy-on-Write.
# Column names are read from the `ab_test` created in an earlier cell.
target_col = ab_test.params.data_params.target
group_col = ab_test.params.data_params.group_col
keep_original = df[group_col] == 'A'
df[target_col] = df[target_col].where(keep_original, df[target_col] * 1.0001)

# Tests on the data as-is.
ab_test = ABTest(df, ab_params)
print(f"bootstrap_test: {ab_test.test_hypothesis_boot_confint()}")
print(f"mannwhitney_test: {ab_test.test_hypothesis_mannwhitney()}")
print(f"ttest_test: {ab_test.test_hypothesis_ttest()}")


# Same tests on the object returned by cuped() (the variance-reduction step
# this section demonstrates), for comparison with the results above.
ab_test1 = ab_test.cuped()
print(f"bootstrap_test: {ab_test1.test_hypothesis_boot_confint()}")
print(f"mannwhitney_test: {ab_test1.test_hypothesis_mannwhitney()}")
print(f"ttest_test: {ab_test1.test_hypothesis_ttest()}")

  0%|          | 0/200 [00:00<?, ?it/s]

bootstrap_test: {'stat': None, 'p-value': None, 'result': 0}
mannwhitney_test: {'stat': 4983082845.0, 'p-value': 0.19137136312943093, 'result': 0}
ttest_test: {'stat': -1.2271064134922398, 'p-value': 0.21978404218147798, 'result': 0}


  0%|          | 0/200 [00:00<?, ?it/s]

bootstrap_test: {'stat': None, 'p-value': None, 'result': 1}
mannwhitney_test: {'stat': 4962621356.0, 'p-value': 0.0038343556323012554, 'result': 1}
ttest_test: {'stat': -3.3123079503060633, 'p-value': 0.0009254602801692166, 'result': 1}


# Постанализ для бинарной переменной

In [11]:
# Rebuild the data settings for the binary-metric part of the notebook.
# Only `target` differs from the earlier DataParams: it is now 'conversion'.
data_params = DataParams(
    # metric under test
    target='conversion',
    target_flg='bought',
    # identification / grouping columns
    id_col='id',
    group_col='groups',
    strata_col='country',
    is_grouped=True,
    # ratio-metric columns
    numerator='clicks',
    denominator='sessions',
    # predictors and pre-period columns
    predictors=['weight_now'],
    predictors_prev=['weight_prev'],
    covariate='height_prev',
    target_prev='height_prev',
    # clustering columns
    cluster_col='kl-divergence',
    clustering_cols=['col1', 'col2', 'col3'],
)

# Re-bundle with the previously defined simulation, hypothesis, result and
# splitter settings (positional order matters).
ab_params = ABTestParams(
    data_params,
    simulation_params,
    hypothesis_params,
    result_params,
    splitter_params,
)

In [12]:
# Reload the dataset from the CSV file, replacing the current `df`.
df = pd.read_csv('ab_data.csv')

In [13]:
# Build the test object on the current `df` with the current `ab_params`.
ab_test = ABTest(df, ab_params)

In [14]:
# Bootstrap confidence-interval test and the proportions z-test on the current
# `ab_test`. The second label used to read "ttest_test" although the method
# called is test_hypothesis_ztest_prop(); it now names the test actually run.
# (Recorded outputs below keep the old label until the cell is re-executed.)
print(f"bootstrap_test: {ab_test.test_hypothesis_boot_confint()}")
print(f"ztest_prop_test: {ab_test.test_hypothesis_ztest_prop()}")

  0%|          | 0/200 [00:00<?, ?it/s]

bootstrap_test: {'stat': None, 'p-value': None, 'result': 0}
ttest_test: {'stat': -53.48422677797526, 'p-value': 0.0, 'result': 1}


In [15]:
# Fresh copy of the dataset.
df = pd.read_csv('ab_data.csv')

# Define the binary metric: 0 where noise_now is negative, 1 otherwise.
df['conversion'] = np.where(df.noise_now<0,0,1)

# Inject a synthetic 1% effect: rows in group 'A' keep their target value,
# every other row gets target * 1.01.
# The column is assigned back explicitly; the previous chained
# `df[col].where(..., inplace=True)` mutated a temporary Series and is a silent
# no-op under pandas Copy-on-Write.
# Column names are read from the `ab_test` created in an earlier cell.
target_col = ab_test.params.data_params.target
group_col = ab_test.params.data_params.group_col
keep_original = df[group_col] == 'A'
df[target_col] = df[target_col].where(keep_original, df[target_col] * 1.01)

ab_test = ABTest(df, ab_params)
print(f"bootstrap_test: {ab_test.test_hypothesis_boot_confint()}")
# Label corrected from "ttest_test": the call is the proportions z-test.
print(f"ztest_prop_test: {ab_test.test_hypothesis_ztest_prop()}")

  0%|          | 0/200 [00:00<?, ?it/s]

bootstrap_test: {'stat': None, 'p-value': None, 'result': 1}
ttest_test: {'stat': -66.8854397361037, 'p-value': 0.0, 'result': 1}


# Понижение дисперсии для конверсии

In [16]:
# Fresh copy of the dataset so earlier modifications do not leak in.
df = pd.read_csv('ab_data.csv')

# Inject a synthetic 0.1% effect: rows in group 'A' keep their target value,
# every other row gets target * 1.001.
# The column is assigned back explicitly; the previous chained
# `df[col].where(..., inplace=True)` mutated a temporary Series and is a silent
# no-op under pandas Copy-on-Write.
# Column names are read from the `ab_test` created in an earlier cell.
target_col = ab_test.params.data_params.target
group_col = ab_test.params.data_params.group_col
keep_original = df[group_col] == 'A'
df[target_col] = df[target_col].where(keep_original, df[target_col] * 1.001)

# Tests on the data as-is.
ab_test = ABTest(df, ab_params)
print(f"bootstrap_test: {ab_test.test_hypothesis_boot_confint()}")
# Label corrected from "ttest_test": the call is the proportions z-test.
print(f"ztest_prop_test: {ab_test.test_hypothesis_ztest_prop()}")


# Same tests on the object returned by cuped(), for comparison with the
# results above.
ab_test1 = ab_test.cuped()
print(f"bootstrap_test: {ab_test1.test_hypothesis_boot_confint()}")
print(f"ztest_prop_test: {ab_test1.test_hypothesis_ztest_prop()}")

  0%|          | 0/200 [00:00<?, ?it/s]

bootstrap_test: {'stat': None, 'p-value': None, 'result': 0}
ttest_test: {'stat': -54.75650959232436, 'p-value': 0.0, 'result': 1}


  0%|          | 0/200 [00:00<?, ?it/s]

bootstrap_test: {'stat': None, 'p-value': None, 'result': 0}
ttest_test: {'stat': -54.75650959219529, 'p-value': 0.0, 'result': 1}
