In [1]:
from statsmodels.stats.proportion import proportions_ztest
import numpy as np
import pandas as pd

In [2]:
# To compare campaigns A and B we use a two-sample z-test of proportions.
# For each campaign the test needs the number of observations and the number of successes.
# The number of successes is the count of people who bought something.
# We can define the number of observations as the count of people who received an email.
# Alternatively, we can define it as the number of people who OPENED the email,
# but that reduces the sample size.

In [3]:
# Load the raw export of each marketing campaign into its own DataFrame.
df_a = pd.read_csv(filepath_or_buffer="campaign_a.csv")
df_b = pd.read_csv(filepath_or_buffer="campaign_b.csv")


In [4]:
# Peek at the first five rows of campaign A to check the columns that were loaded.
df_a.head(n=5)


Unnamed: 0,campaign,id,opened,clicked,purchased,dollars,unsub
0,A,1016,0,0,0,0,0
1,A,1019,0,0,0,0,0
2,A,1037,0,0,0,0,0
3,A,1043,0,0,0,0,1
4,A,1058,1,0,0,0,0


In [24]:
# Sample size per campaign: the number of rows with a non-null "id".
# Counting on the "id" column directly avoids tallying every column of the
# frame just to read a single entry from the result.
camp_a_observations = df_a["id"].count()
camp_b_observations = df_b["id"].count()

In [25]:
# Show the two sample sizes side by side (campaign A first, then campaign B).
print(camp_a_observations, camp_b_observations)

10050 10686


In [45]:
# Number of successes per campaign: rows with purchased == 1, counted through
# their non-null "id". Selecting the "id" column before counting avoids
# computing a count for every column of the filtered frame.
camp_a_success = df_a.loc[df_a["purchased"] == 1, "id"].count()
camp_b_success = df_b.loc[df_b["purchased"] == 1, "id"].count()
print(camp_a_success, camp_b_success)

65 96


In [29]:
# Significance level (alpha) used to judge the p-values below: 5%.
significance = 0.05

In [46]:
# Per-campaign inputs for the test, using every recipient as an observation:
#   success = purchase, number of observations (nobs) = sample size.
sample_success_a = camp_a_success
sample_size_a = camp_a_observations

sample_success_b = camp_b_success
sample_size_b = camp_b_observations


In [47]:
# Pack the per-campaign numbers into arrays, campaign A first and campaign B second.
successes = np.array((sample_success_a, sample_success_b))
samples = np.array((sample_size_a, sample_size_b))


In [48]:
# One-sided two-sample z-test of proportions.
#   Ha: the purchase rate of A is smaller than that of B (alternative='smaller')
#   H0: the purchase rate of A is greater than OR equal to that of B
stat, p_value = proportions_ztest(
    count=successes,
    nobs=samples,
    alternative='smaller',
)

In [49]:

# Report the statistic and p-value, then the decision at the chosen alpha.
# NOTE(review): the test above is one-sided (alternative='smaller'), so the
# rejection message would be more precise as "A was less successful than B";
# the wording is left untouched here so the recorded output stays valid.
print('z_stat: %0.3f, p_value: %0.3f' % (stat, p_value))
if not p_value > significance:
    print("p is low! THE NULL MUST GO! A is different than B")
else:
    print("p is not low.  Campaigns A and B were not different ")

z_stat: -2.063, p_value: 0.020
p is low! THE NULL MUST GO! A is different than B


In [12]:
# suppose instead we defined our # observations as the # of people who OPENED the email

In [50]:
# Re-run the test with "opened the email" as the definition of an observation.
# Both the denominator AND the numerator are restricted to openers: the
# successes passed to the test must be a subset of the observations, so
# purchases are counted only among recipients who opened the email. Computing
# them here also keeps the cell independent of the purchase counts produced by
# an earlier cell.
opened_a = df_a[df_a["opened"] > 0]
opened_b = df_b[df_b["opened"] > 0]

# nobs = number of openers (rows with a non-null "id")
camp_a_opened = opened_a["id"].count()
camp_b_opened = opened_b["id"].count()

# success = purchase among openers, numbers of observations (nobs) = opened
sample_success_a = opened_a.loc[opened_a["purchased"] == 1, "id"].count()
sample_size_a = camp_a_opened
sample_success_b = opened_b.loc[opened_b["purchased"] == 1, "id"].count()
sample_size_b = camp_b_opened

successes = np.array([sample_success_a, sample_success_b])
samples = np.array([sample_size_a, sample_size_b])

# One-sided test, Ha: rate of A < rate of B; H0 is greater than OR equal.
stat, p_value = proportions_ztest(count=successes, nobs=samples, alternative='smaller')

In [51]:
# Report the statistic and p-value of the openers-only test, then the decision
# at the chosen alpha.
# NOTE(review): failing to reject a one-sided null only means there is no
# evidence that A did worse than B, which is weaker than "not different";
# the message text is left unchanged.
print('z_stat: %0.3f, p_value: %0.3f' % (stat, p_value))
if not p_value > significance:
    print("p is low! THE NULL MUST GO! A was less successful than B")
else:
    print("p is not low.  Campaigns A and B were not different ")

z_stat: -1.688, p_value: 0.046
p is low! THE NULL MUST GO! A was less successful than B


In [None]:
# It's close (p = 0.046 vs. alpha = 0.05), but the result is still significant.