# Design and Set up an A/B Test

In [22]:
# Import all of the necessary libraries
import math

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.stats.api as sms
from statsmodels.stats.power import TTestIndPower
from statsmodels.stats.proportion import proportion_confint, proportions_ztest

## Conduct Power Analysis

In [10]:
# Power-analysis inputs: significance level, desired power, and the
# standardised effect size for a change in proportion from 0.13 to 0.15.
# NOTE(review): the 0.13 / 0.15 rates are hard-coded; confirm they are the
# intended baseline and target rates.
alpha = 0.05
power = 0.80
effect = sms.proportion_effectsize(0.13, 0.15)

# Independent two-sample t-test power model. Leaving nobs1 as None makes
# solve_power() solve for it; ratio=1.0 is passed for the group-size ratio.
analysis = TTestIndPower()
result = analysis.solve_power(
    effect_size=effect,
    nobs1=None,
    alpha=alpha,
    power=power,
    ratio=1.0,
)

# Report the solved sample size.
print(f'Sample Size : {result:.3f}')

Sample Size : 4720.435


## Preparing the data

In [11]:
# Load the raw records from bike_shop.csv (path is relative to the working
# directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='bike_shop.csv')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,RecordID,IP Address,LoggedInFlag,ServerID,VisitPageFlag
0,1,39.13.114.2,1,2,0
1,2,13.3.25.8,1,1,0
2,3,247.8.211.8,1,1,0
3,4,124.8.220.3,0,3,0
4,5,60.10.192.7,0,2,0


In [12]:
# View the metadata: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184588 entries, 0 to 184587
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   RecordID       184588 non-null  int64 
 1   IP Address     184588 non-null  object
 2   LoggedInFlag   184588 non-null  int64 
 3   ServerID       184588 non-null  int64 
 4   VisitPageFlag  184588 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 7.0+ MB


## Cleaning the data

In [14]:
# Give the columns analysis-friendly names: remove the space from
# 'IP Address' and relabel 'LoggedInFlag' as 'LoyaltyPage'. The result is a
# new frame; df itself is left untouched.
df_new = df.rename(
    {'IP Address': 'IPAddress', 'LoggedInFlag': 'LoyaltyPage'},
    axis='columns',
)

# Show the shape and the first rows, then the column metadata.
print(df_new.shape, df_new.head(), sep='\n')
df_new.info()

(184588, 5)
   RecordID    IPAddress  LoyaltyPage  ServerID  VisitPageFlag
0         1  39.13.114.2            1         2              0
1         2    13.3.25.8            1         1              0
2         3  247.8.211.8            1         1              0
3         4  124.8.220.3            0         3              0
4         5  60.10.192.7            0         2              0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184588 entries, 0 to 184587
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   RecordID       184588 non-null  int64 
 1   IPAddress      184588 non-null  object
 2   LoyaltyPage    184588 non-null  int64 
 3   ServerID       184588 non-null  int64 
 4   VisitPageFlag  184588 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 7.0+ MB


In [15]:
# Remove every row whose IPAddress occurs more than once. keep=False discards
# ALL copies of a repeated address instead of retaining one of them.
# NOTE(review): confirm that excluding repeated addresses entirely (rather
# than keeping the first occurrence) is the intended cleaning rule.
# Rebinding instead of inplace=True keeps this cell safe to re-run.
df_new = df_new.drop_duplicates(subset='IPAddress', keep=False)

# Drop the columns that the later cells in this notebook do not use.
df_final = df_new.drop(columns=['RecordID', 'VisitPageFlag'])

# Sanity check: after the step above each address must appear exactly once.
assert df_final['IPAddress'].is_unique, 'duplicate IPAddress rows remain'

# View the DataFrame.
print(df_final.shape)
print(df_final.head())
df_final.info()

(39608, 3)
       IPAddress  LoyaltyPage  ServerID
7     97.6.126.6            0         3
12   188.13.62.2            0         3
14   234.1.239.1            0         2
15  167.15.157.7            0         2
16  123.12.229.8            0         1
<class 'pandas.core.frame.DataFrame'>
Index: 39608 entries, 7 to 184584
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   IPAddress    39608 non-null  object
 1   LoyaltyPage  39608 non-null  int64 
 2   ServerID     39608 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.2+ MB


## Subset the DF

In [16]:
# Label every row by the server that handled it: ServerID 1 is the treatment
# group, ServerIDs 2 and 3 together form the control group. Any other ServerID
# would be left as NaN by map().
df_final = df_final.assign(
    Group=df_final['ServerID'].map({1: 'Treatment', 2: 'Control', 3: 'Control'})
)

# View the DataFrame.
print(df_final.shape)
df_final.head()

(39608, 4)


Unnamed: 0,IPAddress,LoyaltyPage,ServerID,Group
7,97.6.126.6,0,3,Control
12,188.13.62.2,0,3,Control
14,234.1.239.1,0,2,Control
15,167.15.157.7,0,2,Control
16,123.12.229.8,0,1,Treatment


In [17]:
# Count the rows assigned to each group.
df_final['Group'].value_counts()

Group
Control      26310
Treatment    13298
Name: count, dtype: int64

In [19]:
# Draw a random sample of 1566 rows from each group. The same fixed seed is
# used for both draws so the samples are reproducible.
# NOTE(review): 1566 differs from the 4720.435 printed by the power analysis
# earlier in this notebook; confirm which sample size is intended.
c_sample, t_sample = (
    df_final.loc[df_final['Group'] == label].sample(n=1566, random_state=42)
    for label in ('Control', 'Treatment')
)

# View the dfs
print(c_sample.head())
t_sample.head()

           IPAddress  LoyaltyPage  ServerID    Group
53313    25.16.126.2            1         3  Control
52290    106.13.67.3            1         3  Control
104046  169.11.137.7            0         2  Control
171756    164.9.86.8            1         2  Control
2317     112.12.25.7            0         2  Control


Unnamed: 0,IPAddress,LoyaltyPage,ServerID,Group
173762,251.0.251.9,1,1,Treatment
150588,16.1.214.6,1,1,Treatment
72805,39.3.26.5,0,1,Treatment
112098,90.14.154.1,1,1,Treatment
32507,18.5.206.8,0,1,Treatment


## Perform A/B Test

In [21]:
# Performing the A/B Test
# Stack the control sample on top of the treatment sample, then replace the
# original row labels with a fresh 0..n-1 index.
ab_test = pd.concat([c_sample, t_sample], axis=0).reset_index(drop=True)

# View the df
ab_test.head()

Unnamed: 0,IPAddress,LoyaltyPage,ServerID,Group
0,25.16.126.2,1,3,Control
1,106.13.67.3,1,3,Control
2,169.11.137.7,0,2,Control
3,164.9.86.8,1,2,Control
4,112.12.25.7,0,2,Control


In [23]:
# Calculating the conversion rates: per-group mean, standard deviation and
# standard error of the LoyaltyPage column.


def std_p(x):
    """Return the population standard deviation (ddof=0) of ``x``."""
    return np.std(x, ddof=0)


def se_p(x):
    """Return the standard error of the mean of ``x`` computed with ddof=0."""
    return st.sem(x, ddof=0)


# Named aggregation labels the output columns directly, and the string alias
# 'mean' avoids the warning pandas emits when np.mean is passed to agg().
conversion_rates = ab_test.groupby('Group')['LoyaltyPage'].agg(
    conversion_rate='mean',
    std_deviation=std_p,
    std_error=se_p,
)

# The aggregation result is already a DataFrame; keep an independent copy
# under the short name.
cr = conversion_rates.copy()

# View the df
cr

  conversion_rates = conversion_rates.agg([np.mean, std_p, se_p])


Unnamed: 0_level_0,conversion_rate,std_deviation,std_error
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Control,0.531928,0.49898,0.012609
Treatment,0.483397,0.499724,0.012628


In [24]:
# Two-proportion z-test on the LoyaltyPage column, control vs. treatment.
# Select each group's outcomes in a single .loc step (row mask + column).
control_results = ab_test.loc[ab_test['Group'] == 'Control', 'LoyaltyPage']
treatment_results = ab_test.loc[ab_test['Group'] == 'Treatment', 'LoyaltyPage']

# Number of non-null observations per group.
n_con = control_results.count()
n_treat = treatment_results.count()

# Success counts and sample sizes, always ordered (control, treatment).
successes = [control_results.sum(), treatment_results.sum()]
nobs = [n_con, n_treat]

# z statistic and p-value for the difference between the two proportions.
z_stat, pval = proportions_ztest(successes, nobs=nobs)

# proportion_confint returns the lower bounds followed by the upper bounds,
# each ordered like `successes`; alpha=0.05 matches the 95% label printed below.
(lower_con, lower_treat), (upper_con, upper_treat) = proportion_confint(
    successes,
    nobs=nobs,
    alpha=0.05,
)

print(f'Z test stat: {z_stat:.2f}')
print(f'P-value: {pval:.3f}')
print(f'Confidence Interval of 95% for control group: [{lower_con:.3f}, {upper_con:.3f}]')
print(f'Confidence Interval of 95% for treatment group: [{lower_treat:.3f}, {upper_treat:.3f}]')

Z test stat: 2.72
P-value: 0.007
Confidence Interval of 95% for control group: [0.507, 0.557]
Confidence Interval of 95% for treatment group: [0.459, 0.508]


## Summary and Findings

The change to the homepage slightly decreased the click-through rate to the login page: 48.3% in the treatment group versus 53.2% in the control group.

The $p$-value (0.007) is smaller than the significance level $\alpha = 0.05$, so we reject the null hypothesis $H_0$.