In [7]:
# import libraries
import pandas as pd
from statsmodels.stats.power import TTestIndPower
from scipy import stats
from CH import Getch       # this function we use to read queries from clickhouse
import numpy as np
from statsmodels.stats.power import TTestIndPower  # for test

In [3]:
# Pull one row per (user_id, exp_group) from ClickHouse for experiment groups 0 and 1,
# covering 2021-12-29 .. 2022-01-04 (both dates inclusive).
# Each row carries the user's like count, view count and CTR = likes / views.
# Getch is the project's ClickHouse query helper; the result table is read from its `.df` attribute
# (presumably a pandas DataFrame — the cells below index it like one).
data = Getch('''
SELECT user_id,
       exp_group,
       countIf(action='like') as likes,
       countIf(action='view') as views,
       likes / views as CTR
FROM simulator_20211220.feed_actions
where toDate(time) >= toDate('2021-12-29') and toDate(time) <= toDate('2022-01-04') 
      and exp_group in (0, 1)
group by user_id, exp_group
''').df

In [4]:
# Split the per-user table by experiment group using a boolean row mask
exp_group_0 = data.loc[data['exp_group'].eq(0)]  # rows of experimental group 0
exp_group_1 = data.loc[data['exp_group'].eq(1)]  # rows of experimental group 1

In [5]:
# Summarise per-user CTR for each group: mean, standard deviation and group size.
# Every statistic is computed for group 0 and group 1 in the same pass and unpacked pairwise.
mean_ctr_0, mean_ctr_1 = (grp['CTR'].mean() for grp in (exp_group_0, exp_group_1))
std_ctr_0, std_ctr_1 = (grp['CTR'].std() for grp in (exp_group_0, exp_group_1))

# Group size = number of rows (one row per user in the query result)
n_0, n_1 = (grp.shape[0] for grp in (exp_group_0, exp_group_1))

print('CTR')
print('mean_group_0:', mean_ctr_0, 'mean_group_1:', mean_ctr_1)
print('std_group_0:', std_ctr_0, 'std_group_1:', std_ctr_1)
print('n_group_0:', n_0, 'n_group_1:', n_1)

CTR
mean_group_0: 0.22195492956578472 mean_group_1: 0.2364864694809337
std_group_0: 0.08787038294959099 std_group_1: 0.09014644584020069
n_group_0: 12997 n_group_1: 13002


In [8]:
# Sample size needed by a two-sample t-test to detect a hypothetical effect of d = 0.3
effect = 0.3   # standardized effect size (Cohen's d)
alpha = 0.05   # significance level
power = 0.8    # target power

analysis = TTestIndPower()
# nobs1 is left as None so that it is the quantity solved for; ratio=1.0 requests equal group sizes
result = analysis.solve_power(
    effect_size=effect,
    nobs1=None,
    alpha=alpha,
    power=power,
    ratio=1.0,
)
print('Expected number of observations in each group : ', round(result))

Expected number of observations in each group :  175


<b>Также рассчитайте, чему оказался равен размер эффекта в нашем проведённом A/B-тесте, 
и сделайте вывод, могли ли бы мы использовать меньший размер выборки при сохранении мощности на уровне 0.8.</b>

In [9]:
# Observed effect size: Cohen's d = (difference of group means) / (pooled standard deviation).
# The pooled standard deviation can be formed in two ways.
mean_diff = mean_ctr_1 - mean_ctr_0

# 1. General form (unequal sample sizes): each variance is weighted by its degrees of freedom
pooled_std = np.sqrt(((n_0 - 1) * std_ctr_0**2 + (n_1 - 1) * std_ctr_1**2) / (n_0 + n_1 - 2))
d = mean_diff / pooled_std

# 2. Equal-sample-size form: plain average of the two variances.
#    The mean difference must still be divided by this pooled SD; the SD alone is not an effect size.
d_equal = mean_diff / np.sqrt((std_ctr_0**2 + std_ctr_1**2) / 2)

# The two groups are nearly the same size, so both estimates should be almost identical
print("Cohen's d (unequal sample sizes):", d)
print("Cohen's d (equal sample sizes):", d_equal)

In [10]:
# Repeat the power analysis, now with the effect size observed in the experiment (d)
effect, alpha, power = d, 0.05, 0.8

analysis = TTestIndPower()
# Solve for the per-group sample size (nobs1=None) assuming equal group sizes (ratio=1.0)
result = analysis.solve_power(effect_size=effect, alpha=alpha, power=power, ratio=1.0, nobs1=None)
print('Expected number of observations in each group : ', round(result))

Expected number of observations in each group :  590


### Conclusion
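In our experiment, the observed effect size for CTR between groups 0 and 1 is 
Cohen's d = 0.16.

For this d, the minimum number of observations per group is <b>590</b>, which is larger than the <b>175</b> needed for d = 0.3 but far smaller than the ~13,000 users per group actually collected, so a smaller sample would have been sufficient to keep power at 0.8.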