# AB test setup calculation

This notebooks estimates the amount of data which is needed to get an appropriate MDE for cold start recs test

Metrics

* Like Rate
* First session length

Test details

* T test
* For Like Rate std is calculated as sqrt(pq / n) using the assumption of normality
* Alpha = 0.05, Beta = 0.2

In [2]:
import polars as pl
from datetime import datetime
from scipy import stats
import numpy as np

In [3]:
dtm_fmt = "%B %-d, %Y, %I:%M %p"

In [4]:
user_meme_df = pl.read_csv('user_meme_reaction_240301_240413.csv')
user_df = pl.read_csv('user_240414.csv')

In [5]:
# Only new users in April
# Not bots

user_proc_df = (
    user_df
    .with_columns(pl.col('created_at').str.to_datetime(dtm_fmt))
    .filter(pl.col('type') != 'blocked_bot')
    .filter(pl.col('created_at') >= datetime(2024, 4, 1))
    .drop('type', 'blocked_bot_at', 'inviter_id', 'last_active_at')
)

In [40]:
# Remove non-reacts
# Add reaction number
# Leave only first 30 reactions
# Leave only new users

(
    user_meme_df
    .with_columns(pl.col('sent_at').str.to_datetime(dtm_fmt))
    .with_columns(sent_date=pl.col('sent_at').dt.truncate('1d'))
    .filter(pl.col('sent_at') >= datetime(2024, 4, 1))
    .drop('reacted_at')
    .with_columns(pl.col('reaction_id').fill_null(2))
    .sort('user_id', 'sent_at')
    .with_columns(reaction_num=pl.first().cum_count().over('user_id'))
    .join(user_proc_df.select('id'), left_on='user_id', right_on='id', how='inner')
    .filter(pl.col('reaction_num') <= 30)
    .with_columns(is_like=pl.when(pl.col('reaction_id') == 1).then(1).otherwise(0))
    .group_by('sent_date')
    .agg(
        pl.len().alias('count'),
        pl.sum('is_like').alias('likes'),
    )
    .with_columns((pl.col('likes') / pl.col('count')).alias('lr'))
    .sort('sent_date')
)

sent_date,count,likes,lr
datetime[μs],u32,i32,f64
2024-04-01 00:00:00,10425,5718,0.548489
2024-04-02 00:00:00,1455,791,0.543643
2024-04-03 00:00:00,3438,1567,0.455788
2024-04-04 00:00:00,2807,1517,0.540435
2024-04-05 00:00:00,2192,1031,0.470347
2024-04-06 00:00:00,1902,944,0.49632
2024-04-07 00:00:00,1780,998,0.560674
2024-04-08 00:00:00,2278,895,0.392888
2024-04-09 00:00:00,1572,636,0.40458
2024-04-10 00:00:00,1555,732,0.47074


* Lets assume 1000 reactions from new users daily
* Averate LR = 0.5
* Alpha = 0.05
* Beta = 0.2

In [40]:
def mde(n_days, n_daily, group_percent, p, alpha, beta):
    return (stats.norm.ppf(1 - alpha / 2) + stats.norm.ppf(1 - beta)) * np.sqrt(p * (1 - p) / n_days / n_daily / group_percent)

In [48]:
n_daily = 1000
p = 0.5
alpha = 0.05
beta = 0.2

group_percent_list = [0.05, 0.1, 0.2, 0.5]
n_days_list = [1, 2, 3, 5, 7, 14]

res = []
for group_percent in group_percent_list:
    for n_days in n_days_list:
        res.append({
            'n_days': n_days,
            'group_percent': group_percent,
            'mde': mde(n_days, n_daily, group_percent, p, alpha, beta)
        })

res = pl.DataFrame(res)


In [None]:
pl.Config.set_tbl_rows(100)

In [50]:
res.sort(['group_percent', 'n_days'])

n_days,group_percent,mde
i64,f64,f64
1,0.05,0.198102
2,0.05,0.140079
3,0.05,0.114374
5,0.05,0.088594
7,0.05,0.074876
14,0.05,0.052945
1,0.1,0.140079
2,0.1,0.099051
3,0.1,0.080875
5,0.1,0.062645


## Similar calculations for the first session length

First session length is actually max(session_length, 30)

In [48]:
# Take a first session from each user
# Calculate its size
# Session is bounded by 1 hour pause in replies


(
    user_meme_df
    .with_columns(pl.col('sent_at').str.to_datetime(dtm_fmt))
    .with_columns(sent_date=pl.col('sent_at').dt.truncate('1d'))
    .filter(pl.col('sent_at') >= datetime(2024, 4, 1))
    .drop('reacted_at')
    .with_columns(pl.col('reaction_id').fill_null(2))
    .sort('user_id', 'sent_at')
    .with_columns(reaction_num=pl.first().cum_count().over('user_id'))
    .join(user_proc_df.select('id'), left_on='user_id', right_on='id', how='inner')
    .with_columns(is_like=pl.when(pl.col('reaction_id') == 1).then(1).otherwise(0))
    .with_columns(prev_sent_at=pl.col('sent_at').shift(1).over('user_id'))
    .with_columns(lag=(pl.col('sent_at') - pl.col('prev_sent_at')))
    .with_columns(is_new_session=(pl.col('lag').dt.total_hours() > 1).cast(pl.Int64))
    .with_columns(session_num=pl.col('is_new_session').cum_sum().over('user_id').fill_null(0))
    .filter(pl.col('session_num') == 0)
    .group_by('user_id')
    .len()
    .join(user_proc_df, left_on='user_id', right_on='id')
    .with_columns(create_date=pl.col('created_at').dt.truncate('1d'))
    .group_by('create_date')
    .agg(pl.col('len').mean().alias('mean'), pl.col('len').std().alias('std'), pl.len())
    .sort('create_date')
)

create_date,mean,std,len
datetime[μs],f64,f64,u32
2024-04-01 00:00:00,14.748545,19.79798,859
2024-04-02 00:00:00,9.337017,21.18522,181
2024-04-03 00:00:00,15.88189,23.247329,127
2024-04-04 00:00:00,20.935065,42.402009,154
2024-04-05 00:00:00,19.18797,36.729591,133
2024-04-06 00:00:00,12.630252,20.123713,119
2024-04-07 00:00:00,23.017241,47.188241,116
2024-04-08 00:00:00,17.831325,31.582574,83
2024-04-09 00:00:00,18.988372,42.772584,86
2024-04-10 00:00:00,24.278481,49.977671,79


* Lets assume 100 users daily
* Average std as 40
* Alpha = 0.05
* Beta = 0.2

In [45]:
def mde(n_days, n_daily, group_percent, std, alpha, beta):
    return (stats.norm.ppf(1 - alpha / 2) + stats.norm.ppf(1 - beta)) * std / np.sqrt(n_days * n_daily * group_percent)

In [46]:
n_daily = 100
std = 40
alpha = 0.05
beta = 0.2

group_percent_list = [0.05, 0.1, 0.2, 0.5]
n_days_list = [1, 2, 3, 5, 7, 14]

res = []
for group_percent in group_percent_list:
    for n_days in n_days_list:
        res.append({
            'n_days': n_days,
            'group_percent': group_percent,
            'mde': mde(n_days, n_daily, group_percent, std, alpha, beta)
        })

res = pl.DataFrame(res)


In [47]:
res

n_days,group_percent,mde
i64,f64,f64
1,0.05,50.11628
2,0.05,35.437561
3,0.05,28.934648
5,0.05,22.412682
7,0.05,18.942173
14,0.05,13.394139
1,0.1,35.437561
2,0.1,25.05814
3,0.1,20.459886
5,0.1,15.848159
