# Settings

In [1]:
import pandas as pd
import numpy as np

In [2]:
file = 'dataset/ab_data.csv'
df = pd.read_csv(file)
df.sample(5)

Unnamed: 0,user_id,timestamp,group,landing_page,converted
36262,732930,2017-01-19 00:19:41.827044,control,old_page,0
178073,923992,2017-01-05 23:18:06.606140,treatment,new_page,0
216009,898876,2017-01-08 00:03:25.535308,treatment,new_page,0
275473,806885,2017-01-07 02:45:26.712817,treatment,new_page,1
214709,794633,2017-01-15 19:03:21.664211,treatment,new_page,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       294478 non-null  int64 
 1   timestamp     294478 non-null  object
 2   group         294478 non-null  object
 3   landing_page  294478 non-null  object
 4   converted     294478 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


In [4]:
pd.to_datetime(df['timestamp']).dt.month.unique()

array([1], dtype=int64)

In [5]:
pd.to_datetime(df['timestamp']).dt.day.value_counts(dropna=False)

8     13564
11    13553
6     13528
10    13523
23    13511
21    13475
15    13449
9     13439
22    13423
3     13394
20    13393
7     13381
14    13329
16    13327
12    13322
17    13322
19    13293
18    13285
4     13284
13    13238
5     13124
24     7538
2      5783
Name: timestamp, dtype: int64

In [6]:
df['group'].value_counts(dropna=False)

treatment    147276
control      147202
Name: group, dtype: int64

In [7]:
df['converted'].value_counts(dropna=False)

0    259241
1     35237
Name: converted, dtype: int64

In [8]:
df.groupby(['group','landing_page']).agg({'user_id':'count'})

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id
group,landing_page,Unnamed: 2_level_1
control,new_page,1928
control,old_page,145274
treatment,new_page,145311
treatment,old_page,1965


In [9]:
df['drop'] = ((df['group']=='control')&(df['landing_page']=='new_page'))|((df['group']=='treatment')&(df['landing_page']=='old_page'))

In [10]:
df['drop'].value_counts()

False    290585
True       3893
Name: drop, dtype: int64

In [11]:
df = df[df['drop']==False]

In [12]:
len(df)

290585

In [13]:
N = len(df)

# Hypothesis
- H0: the conversion rate of the control group is equal to the conversion rate of the variant group.
- H1: the conversion rate of the control group does not equal to the conversion rate of the variant group.

# Power Analysis

## Power of the Test

In [14]:
beta = 0.2 # type II error
power = 1 - beta

## Significance Level

In [15]:
alpha = 0.05 # type I error
confidence = 1 - alpha

# Sample Size

In [16]:
from statsmodels.stats.power import NormalIndPower
power_analysis = NormalIndPower()

In [17]:
sample_size = power_analysis.solve_power(effect_size = beta, alpha = alpha, power = power, alternative = 'two-sided')
print("The minimun sample size needed is:",sample_size)

The minimun sample size needed is: 392.4430232577885


In [18]:
df.groupby(['group']).agg({'user_id':'count'})

Unnamed: 0_level_0,user_id
group,Unnamed: 1_level_1
control,145274
treatment,145311


In [19]:
N_control = len(df[df['group']=='control'])
N_treatment = len(df[df['group']=='treatment'])
N_control, N_treatment

(145274, 145311)

In [20]:
conv_control = len(df[(df['group']=='control')&df['converted']==1])
conv_treatment = len(df[(df['group']=='treatment')&df['converted']==1])
conv_control, conv_treatment

(17489, 17264)

# Duration of the Test

In [21]:
days = len(pd.to_datetime(df['timestamp']).dt.day.unique())
days

23

In [22]:
avg_visitor_per_day = len(df)/days
avg_visitor_per_day

12634.130434782608

In [23]:
duration = N/avg_visitor_per_day
duration

23.0

# Running the Test

In [24]:
control_conv_rate = conv_control/N_control
treatment_conv_rate = conv_treatment/N_treatment
control_conv_rate, treatment_conv_rate

(0.1203863045004612, 0.11880724790277405)

In [25]:
from statsmodels.stats.proportion import proportions_ztest, proportion_confint

In [26]:
converted = [conv_control, conv_treatment]
nobs = [N_control, N_treatment]

In [27]:
z_stat, p_value = proportions_ztest(converted, nobs=nobs)
(lower_control, lower_treatment), (upper_control, upper_treatment) = proportion_confint(converted, nobs=nobs, alpha=alpha)

In [28]:
print(f'z statistic: {z_stat:.2f}')
print(f'p-value: {p_value:.3f}')
print(f'ci 95% for control group: [{lower_control:.3f}, {upper_control:.3f}]')
print(f'ci 95% for treatment group: [{lower_treatment:.3f}, {upper_treatment:.3f}]')

z statistic: 1.31
p-value: 0.190
ci 95% for control group: [0.119, 0.122]
ci 95% for treatment group: [0.117, 0.120]


# Analyzing the Test

In [29]:
if p_value <= alpha:
    print('p-value is less than or equal to the significance level')
    print('H0 is rejected, and H1 is accepted')
else:
    print('p-value is not less than or equal to the significance level')
    print('H0 cannot be rejected, and H1 cannot be accepted')

p-value is not less than or equal to the significance level
H0 cannot be rejected, and H1 cannot be accepted


Result:
- The new landing page does not positively affect the conversion rate compared to the old landing page. Therefore, we cannot make the recommendation to switch to the new landing page.