# Cold Start - AB 240513 Analysis

This notebook analyses only the session-length metric.

In [1]:
from datetime import datetime

import numpy as np
import polars as pl

In [2]:
# Read the two parquet inputs used below: 'user.pq' and 'user_language.pq'.
user_df, user_language_df = (
    pl.read_parquet(path) for path in ('user.pq', 'user_language.pq')
)

In [3]:
# Distinct user ids that have at least one row with language_code 'ru' or 'en'.
users_ru_en = (
    user_language_df
    .filter(pl.col('language_code').is_in(['ru', 'en']))
    .select(pl.col('user_id').unique())
)

In [4]:
# Experiment cohort: users created on or after 2024-05-15 whose id is present
# in users_ru_en. The group label is derived from the id itself:
# id % 100 in 0..49 -> 'test', otherwise -> 'control'.
# Note that the 'is_test' column holds the group name as a string, not a bool.
users = (
    user_df
    .filter(pl.col('created_at') >= datetime(2024, 5, 15))
    .join(users_ru_en, left_on='id', right_on='user_id', how='inner')
    .select(
        pl.col('id').alias('user_id'),
        pl.when(pl.col('id') % 100 < 50)
        .then(pl.lit('test'))
        .otherwise(pl.lit('control'))
        .alias('is_test'),
    )
)

In [5]:
# Per-user reaction log, truncated to each user's first 30 reactions.
user_meme_df = (
    pl.read_parquet('user_meme_reaction.pq')
    # Missing reaction ids are replaced with 2
    # (presumably the code for "no explicit reaction" - TODO confirm).
    .with_columns(pl.col('reaction_id').fill_null(2))
    # Order each user's rows by send time so the row number assigned below is
    # the ordinal position of the reaction in that user's history.
    .sort('user_id', 'sent_at')
    # Explicit 1-based row number within each user. Unlike a cumulative count
    # over "whatever column comes first", this does not depend on the parquet
    # column order, on nulls in that column, or on the polars version's
    # cum_count starting offset.
    .with_columns(reaction_num=pl.int_range(1, pl.len() + 1).over('user_id'))
    # Keep at most the first 30 reactions per user.
    .filter(pl.col('reaction_num') <= 30)
)

In [6]:
# Sanity check: number of users assigned to each group.
# NOTE(review): the recorded output (329 test vs 393 control) is noticeably
# uneven for an id % 100 < 50 split - worth a sample-ratio-mismatch check.
users.group_by('is_test').agg(pl.len())

is_test,len
str,u32
"""test""",329
"""control""",393


In [7]:
# Session length per user = the largest reaction_num that user reached in
# user_meme_df. Users with no rows in user_meme_df do not appear here at all.
session_lens = user_meme_df.group_by('user_id').agg(
    pl.col('reaction_num').max()
)

In [8]:
# One single-column frame of user ids per experiment group.
users_by_gr = {
    group: users.filter(pl.col('is_test') == group).select('user_id')
    for group in ('test', 'control')
}

In [9]:
# Bootstrap (1000 resamples of users, with replacement) of the mean session
# length in each group and of the test/control ratio ("uplift").
session_lens_by_gr = {'test': [], 'control': []}
session_len_uplifts = []

# The join does not depend on the resample, so it is done once per group
# rather than once per bootstrap draw. A left join keeps exactly one row per
# cohort user (session_lens has one row per user_id); users absent from
# session_lens get a null, which Series.mean() skips - i.e. exactly the users
# an inner join on the resample would drop.
# NOTE(review): users with no reactions are therefore excluded from the mean
# rather than counted as zero-length sessions - confirm this is intended.
group_lens = {
    gr: (
        users_by_gr[gr]
        .join(session_lens, on='user_id', how='left')
        .get_column('reaction_num')
    )
    for gr in ('test', 'control')
}

for i in range(1000):
    for gr_idx, gr in enumerate(('test', 'control')):
        lens = group_lens[gr]
        # Each group gets its own seed in every iteration. Reusing one seed for
        # both groups makes the two resamples draw from the same random stream,
        # which correlates them and can understate the spread of the ratio.
        resample = lens.sample(len(lens), with_replacement=True, seed=42 + 2 * i + gr_idx)
        session_lens_by_gr[gr].append(resample.mean())
    session_len_uplifts.append(session_lens_by_gr['test'][i] / session_lens_by_gr['control'][i])

In [10]:
# Summarise the bootstrap distributions: mean of the per-resample group means,
# and mean / standard deviation of the per-resample test/control ratio.
# The printed interval is 1.96 * std of the bootstrap ratios
# (a normal-approximation 95% half-width).
test_mean = np.mean(session_lens_by_gr['test'])
control_mean = np.mean(session_lens_by_gr['control'])
uplift_mean = np.mean(session_len_uplifts)
uplift_std = np.std(session_len_uplifts)
print(f'Session length. Test mean = {test_mean:.2f}. Control mean = {control_mean:.2f}. Uplift = {uplift_mean:.2f} +- {1.96 * uplift_std:.2f}')

Session length. Test mean = 11.19. Control mean = 9.12. Uplift = 1.24 +- 0.31
