In [1]:
# Import statsmodel for statistical calculations and 
# TTestIndPower class to calculate the parameters.
import statsmodels.stats.api as sms
from statsmodels.stats.power import TTestIndPower

In [2]:
# Specify the three required parameters for the power analysis:
alpha = 0.05 
power = 0.80 
effect = sms.proportion_effectsize(0.50, 0.55) 

# Perform power analysis by using the solve_power() function:
# Specify an instance of TTestIndPower.
analysis = TTestIndPower() 

# Calculate the sample size and list the parameters.
result = analysis.solve_power(effect, power=power, nobs1=None,
                              ratio=1.0, alpha=alpha) 

# Print the output.
print('Sample Size: %.3f' % result)

Sample Size: 1565.490


In [3]:
# Import necessary libraries, packages and classes.
import pandas as pd
import math
import numpy as np
import statsmodels.stats.api as sms
import scipy.stats as st
import matplotlib as mpl
import matplotlib.pyplot as plt

In [4]:
# Read the CSV file (bike_shop.csv).
df = pd.read_csv('bike_shop.csv')

# View the DataFrame.
df.head()

Unnamed: 0,RecordID,IP Address,LoggedInFlag,ServerID,VisitPageFlag
0,1,39.13.114.2,1,2,0
1,2,13.3.25.8,1,1,0
2,3,247.8.211.8,1,1,0
3,4,124.8.220.3,0,3,0
4,5,60.10.192.7,0,2,0


In [5]:
# Check the metadata.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184588 entries, 0 to 184587
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   RecordID       184588 non-null  int64 
 1   IP Address     184588 non-null  object
 2   LoggedInFlag   184588 non-null  int64 
 3   ServerID       184588 non-null  int64 
 4   VisitPageFlag  184588 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 7.0+ MB


In [8]:
# Check the shape of the DF
df.shape

(184588, 5)

In [18]:
# Create a new dataframe and rename columns
df_new = df.rename(columns={'IP Address': 'IPAddress',
                            'LoggedInFlag': 'LoyaltyPage'}).copy()

# View the DataFrame.
df_new.head()

Unnamed: 0,RecordID,IPAddress,LoyaltyPage,ServerID,VisitPageFlag
0,1,39.13.114.2,1,2,0
1,2,13.3.25.8,1,1,0
2,3,247.8.211.8,1,1,0
3,4,124.8.220.3,0,3,0
4,5,60.10.192.7,0,2,0


In [19]:
#Check the new DataFrames metadata
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184588 entries, 0 to 184587
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   RecordID       184588 non-null  int64 
 1   IPAddress      184588 non-null  object
 2   LoyaltyPage    184588 non-null  int64 
 3   ServerID       184588 non-null  int64 
 4   VisitPageFlag  184588 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 7.0+ MB


In [20]:
# Check the shape of the new DataFrame
df_new.shape

(184588, 5)

In [21]:
# Drop duplicate values
df_new.drop_duplicates(subset ='IPAddress',
                       keep = False,
                       inplace = True)

In [29]:
# Create a new DataFrame and drop the unneeded columns 
df_final = df_new.drop(['RecordID', 'VisitPageFlag'], axis=1)

# View the DataFrame.
print(df_final.head())

#Check the new DataFrames metadata
print(df_final.info())

# Check the shape of the new DataFrame
print(df_final.shape)

       IPAddress  LoyaltyPage  ServerID
7     97.6.126.6            0         3
12   188.13.62.2            0         3
14   234.1.239.1            0         2
15  167.15.157.7            0         2
16  123.12.229.8            0         1
<class 'pandas.core.frame.DataFrame'>
Int64Index: 39608 entries, 7 to 184584
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   IPAddress    39608 non-null  object
 1   LoyaltyPage  39608 non-null  int64 
 2   ServerID     39608 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.2+ MB
None
(39608, 3)


In [34]:
#Create a new column called group using the map function
df_final['Group'] = df_final.ServerID.map({1:'Treatment', 
                                           2:'Control', 
                                           3: 'Control'})

# View the output
df_final.head()

Unnamed: 0,IPAddress,LoyaltyPage,ServerID,Group
7,97.6.126.6,0,3,Control
12,188.13.62.2,0,3,Control
14,234.1.239.1,0,2,Control
15,167.15.157.7,0,2,Control
16,123.12.229.8,0,1,Treatment


In [35]:
# View the shape
df_final.shape

(39608, 4)

In [41]:
# Determine the sample sizes
print(df_final['Group'].value_counts())

Control      26310
Treatment    13298
Name: Group, dtype: int64


In [45]:
# Obtain a simple random sample for control and treatment groups with n = 1566; 
# set random_stategenerator seed at an arbitrary value of 43.
# Obtain a simple random sample for the control group.
control_sample = df_final[df_final['Group'] == 'Control'].sample(n=1566, 
                                                       random_state=42) 

# Obtain a simple random sample for the treatment group.
treatment_sample = df_final[df_final['Group'] == 'Treatment'].sample(n=1566,
                                                           random_state=42)

In [46]:
# Join the two samples.  
ab_test = pd.concat([control_sample, treatment_sample], axis=0)  

# Reset the A/B index.
ab_test.reset_index(drop=True, inplace=True) 

# Print the sample table.
ab_test  

Unnamed: 0,IPAddress,LoyaltyPage,ServerID,Group
0,25.16.126.2,1,3,Control
1,106.13.67.3,1,3,Control
2,169.11.137.7,0,2,Control
3,164.9.86.8,1,2,Control
4,112.12.25.7,0,2,Control
...,...,...,...,...
3127,187.4.117.9,1,1,Treatment
3128,134.0.112.5,1,1,Treatment
3129,7.3.242.7,0,1,Treatment
3130,118.14.226.4,0,1,Treatment


In [54]:
# Calculate basic statistics.
# Import library.
# SEM stands for standard error mean.
from scipy.stats import sem
import numpy as np



# Group the ab_test data set by group and aggregate by converted.
conversion_rates = ab_test.groupby('Group')['LoyaltyPage']

# Standard deviation of the proportion
STD_p = lambda x: np.std(x, ddof=0)

# Standard error of the proportion
SE_p = lambda x: st.sem(x, ddof=0

# Calculate conversion rates by calculating the means of columns STD_p and SE_p.
conversion_rates = conversion_rates.agg([np.mean, np.std,sem])

# Assign names to the three columns.
conversion_rates.columns = ['conversion_rate',
                            'std_deviation',
                            'std_error']  

# Convert output into a Pandas DataFrame.
cr = pd.DataFrame(conversion_rates)

# View output.
cr

SyntaxError: invalid syntax (Temp/ipykernel_8604/989521234.py, line 19)

In [56]:
import numpy as np
from scipy.stats import sem

# Group the ab_test data set by group and aggregate by converted.
conversion_rates = ab_test.groupby('Group')['LoyaltyPage']

# Standard deviation of the proportion
STD_p = lambda x: np.std(x, ddof=0)

# Standard error of the proportion
SE_p = lambda x: sem(x, ddof=0)

# Calculate conversion rates by calculating the means of columns STD_p and SE_p.
conversion_rates = conversion_rates.agg([np.mean, np.std, SE_p])

# Assign names to the three columns.
conversion_rates.columns = ['conversion_rate',
                            'std_deviation',
                            'std_error']  

# Convert output into a Pandas DataFrame.
cr = pd.DataFrame(conversion_rates)

# View output.
cr

Unnamed: 0_level_0,conversion_rate,std_deviation,std_error
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Control,0.531928,0.499139,0.012609
Treatment,0.483397,0.499884,0.012628


In [57]:
from statsmodels.stats.proportion import proportions_ztest, proportion_confint

control_results = ab_test[ab_test['Group'] == 'Control']['LoyaltyPage']
treatment_results = ab_test[ab_test['Group'] == 'Treatment']['LoyaltyPage']

n_con = control_results.count()
n_treat = treatment_results.count()

successes = [control_results.sum(), treatment_results.sum()]

nobs = [n_con, n_treat]

z_stat, pval = proportions_ztest(successes, nobs=nobs)
(lower_con, lower_treat), (upper_con, upper_treat) = proportion_confint(successes,
                                                                        nobs=nobs,
                                                                        alpha=0.05)

print(f'Z test stat: {z_stat:.2f}')
print(f'P-value: {pval:.3f}')
print(f'Confidence Interval of 95% for control group: [{lower_con:.3f}, {upper_con:.3f}]')
print(f'Confidence Interval of 95% for treatment group: [{lower_treat:.3f}, {upper_treat:.3f}]')

Z test stat: 2.72
P-value: 0.007
Confidence Interval of 95% for control group: [0.507, 0.557]
Confidence Interval of 95% for treatment group: [0.459, 0.508]


The newq home page achieves the goal