<a href="https://colab.research.google.com/github/dseitova/portfolio/blob/main/ABtesting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Attach Google Drive to the Colab VM so later cells can read files stored on it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
from math import ceil

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import statsmodels.stats.api as sms
from statsmodels.stats.proportion import proportion_confint, proportions_ztest

%matplotlib inline

In [None]:
# Location of the experiment export, relative to the working directory.
DATA_PATH = 'drive/MyDrive/AB_testing_project/cookie_cats.csv'

# Load the raw data and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,userid,version,sum_gamerounds,retention_1,retention_7
0,116,gate_30,3,False,False
1,337,gate_30,38,True,False
2,377,gate_40,165,True,False
3,483,gate_40,1,False,False
4,488,gate_40,179,True,True


In [None]:
# Inspect column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90189 entries, 0 to 90188
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   userid          90189 non-null  int64 
 1   version         90189 non-null  object
 2   sum_gamerounds  90189 non-null  int64 
 3   retention_1     90189 non-null  bool  
 4   retention_7     90189 non-null  bool  
dtypes: bool(2), int64(2), object(1)
memory usage: 2.2+ MB


In [None]:
# Tally the rows per userid; any tally above 1 means that user occurs more than once.
session_counts = df['userid'].value_counts()
multi_users = session_counts.gt(1).sum()

print(f'There are {multi_users} users that appear multiple times in the dataset')

There are 0 users that appear multiple times in the dataset


In [None]:
# Label the experiment arms: 'gate_30' rows become the control group and every
# other version value is labelled treatment.
# NOTE(review): presumably 'gate_40' is the only other value — confirm, since
# any unexpected version string would silently land in the treatment arm.
df['group'] = np.where(df['version'] == 'gate_30', 'control group', 'treatment')

# Recode the retention flags as 0/1 integers so they can be summed and averaged.
# A plain dtype cast replaces the former `np.where(col == True, 1, 0)` round-trip
# and stays a no-op if this cell is re-run on already-converted columns.
for flag_col in ('retention_1', 'retention_7'):
    df[flag_col] = df[flag_col].astype(int)

df.head()

Unnamed: 0,userid,version,sum_gamerounds,retention_1,retention_7,group
0,116,gate_30,3,0,0,control group
1,337,gate_30,38,1,0,control group
2,377,gate_40,165,1,0,treatment
3,483,gate_40,1,0,0,treatment
4,488,gate_40,179,1,1,treatment


In [None]:
def std_p(x):
    """Return the standard deviation of ``x`` computed with ddof=0."""
    return np.std(x, ddof=0)


def se_p(x):
    """Return the standard error of the mean of ``x`` computed with ddof=0."""
    return stats.sem(x, ddof=0)


# Per-group day-1 retention summary: mean of the retention flag plus its spread.
# Named aggregation sets the output column names directly, so the result no
# longer needs a positional `.columns = [...]` rename. The string 'mean' is
# used instead of `np.mean` because pandas >= 2.1 emits a FutureWarning when
# NumPy callables are passed to GroupBy.agg.
day1_conversion_rates = df.groupby('group')['retention_1'].agg(
    day1_conversion_rate='mean',
    std_deviation=std_p,
    std_error=se_p,
)

day1_conversion_rates.style.format('{:.3f}')

Unnamed: 0_level_0,day1_conversion_rate,std_deviation,std_error
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
control group,0.448,0.497,0.002
treatment,0.442,0.497,0.002


In [None]:
# Two-proportion z-test on day-1 retention, control vs. treatment, using the
# library's default alternative. `proportions_ztest` and `proportion_confint`
# are imported in the notebook's import cell.
alpha = 0.05  # significance level; also drives the confidence level printed below

# Select each arm's retention flags with a single .loc lookup
# (avoids the chained `df[mask][col]` indexing).
control_results = df.loc[df['group'] == 'control group', 'retention_1']
treatment_results = df.loc[df['group'] == 'treatment', 'retention_1']

n_con = control_results.count()
n_treat = treatment_results.count()
successes = [control_results.sum(), treatment_results.sum()]
nobs = [n_con, n_treat]

z_stat, pval = proportions_ztest(successes, nobs=nobs)

# With list inputs the call returns (lower bounds, upper bounds), each holding
# one value per group in the same [control, treatment] order as `successes`.
(lower_con, lower_treat), (upper_con, upper_treat) = proportion_confint(
    successes, nobs=nobs, alpha=alpha
)

print(f'z statistic: {z_stat:.2f}')
print(f'p-value: {pval:.3f}')
print(f'ci {1 - alpha:.0%} for control group: [{lower_con:.3f}, {upper_con:.3f}]')
print(f'ci {1 - alpha:.0%} for treatment group: [{lower_treat:.3f}, {upper_treat:.3f}]')

z statistic: 1.78
p-value: 0.074
ci 95% for control group: [0.444, 0.453]
ci 95% for treatment group: [0.438, 0.447]


In [None]:
# Per-group day-7 retention summary, built the same way as the day-1 table.
# `std_p` and `se_p` are the ddof=0 helpers defined in the day-1 summary cell,
# so they are reused here rather than redefined.
# The string 'mean' is used instead of `np.mean` because pandas >= 2.1 emits a
# FutureWarning when NumPy callables are passed to GroupBy.agg; named
# aggregation sets the output column names directly.
day7_conversion_rates = df.groupby('group')['retention_7'].agg(
    day7_conversion_rate='mean',
    std_deviation=std_p,
    std_error=se_p,
)

day7_conversion_rates.style.format('{:.3f}')

Unnamed: 0_level_0,day7_conversion_rate,std_deviation,std_error
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
control group,0.19,0.392,0.002
treatment,0.182,0.386,0.002


In [None]:
# Two-proportion z-test on day-7 retention, control vs. treatment, using the
# library's default alternative. `proportions_ztest` and `proportion_confint`
# are imported in the notebook's import cell.
alpha = 0.05  # significance level; also drives the confidence level printed below

# Select each arm's retention flags with a single .loc lookup
# (avoids the chained `df[mask][col]` indexing).
control_results = df.loc[df['group'] == 'control group', 'retention_7']
treatment_results = df.loc[df['group'] == 'treatment', 'retention_7']

n_con = control_results.count()
n_treat = treatment_results.count()
successes = [control_results.sum(), treatment_results.sum()]
nobs = [n_con, n_treat]

z_stat, pval = proportions_ztest(successes, nobs=nobs)

# With list inputs the call returns (lower bounds, upper bounds), each holding
# one value per group in the same [control, treatment] order as `successes`.
(lower_con, lower_treat), (upper_con, upper_treat) = proportion_confint(
    successes, nobs=nobs, alpha=alpha
)

print(f'z statistic: {z_stat:.2f}')
print(f'p-value: {pval:.3f}')
print(f'ci {1 - alpha:.0%} for control group: [{lower_con:.3f}, {upper_con:.3f}]')
print(f'ci {1 - alpha:.0%} for treatment group: [{lower_treat:.3f}, {upper_treat:.3f}]')

z statistic: 3.16
p-value: 0.002
ci 95% for control group: [0.187, 0.194]
ci 95% for treatment group: [0.178, 0.186]
