In [14]:
from statsmodels.stats.proportion import proportions_ztest
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind, ttest_ind_from_stats
import warnings
warnings.filterwarnings('ignore')

In [None]:
# When comparing the AVERAGE money spent (rather than a success rate), we use a t-test.
# A t-test compares groups A and B using three values:
# 1) the average (mean) of each group
# 2) the standard deviation of each group
# 3) the number of samples in each group

# In this case the number of samples is NOT the number of records;
# it is the number of customers who made a purchase as a result of the campaign.


In [13]:
# Load each campaign's results from its CSV file into a pandas DataFrame.
df_a, df_b = (
    pd.read_csv(csv_name) for csv_name in ("campaign_a.csv", "campaign_b.csv")
)


In [3]:
# The 'dollars' column holds strings that contain square brackets; remove the
# brackets so the column can be converted to a numeric type afterwards.
# regex=False forces a literal match: "[" and "]" are regex metacharacters and
# the default value of `regex` is not the same across pandas versions.
spec_chars = ["[", "]"]
for char in spec_chars:
    df_a['dollars'] = df_a['dollars'].str.replace(char, '', regex=False)
    df_b['dollars'] = df_b['dollars'].str.replace(char, '', regex=False)

In [4]:
# Alpha (5%): the p-value threshold used further down to decide whether the
# null hypothesis is rejected.
significance = 0.05

In [5]:
# Inspect df_b's columns, non-null counts and dtypes. In the recorded output
# 'dollars' is still dtype object, so it has to be cast before doing math on it.
df_b.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10686 entries, 0 to 10685
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   campaign   10686 non-null  object
 1   id         10686 non-null  int64 
 2   opened     10686 non-null  int64 
 3   clicked    10686 non-null  int64 
 4   purchased  10686 non-null  int64 
 5   dollars    10686 non-null  object
 6   unsub      10686 non-null  int64 
dtypes: int64(5), object(2)
memory usage: 584.5+ KB


In [6]:
# Convert the 'dollars' column of both campaign frames to float so that the
# mean and standard deviation can be computed on it.
df_a['dollars'], df_b['dollars'] = (
    frame['dollars'].astype(float) for frame in (df_a, df_b)
)

In [7]:
# Sample size per campaign: the count of non-null ids among the rows that
# record at least one purchase.
nobs1 = df_a.loc[df_a["purchased"].gt(0), "id"].count()
nobs2 = df_b.loc[df_b["purchased"].gt(0), "id"].count()
print(nobs1, nobs2)

65 96


In [8]:
# Next, calculate the average and standard deviation of dollars spent in each campaign.

In [9]:
# Mean dollars spent per campaign, restricted to rows with at least one purchase.
# The mean is taken over the numeric columns of the filtered frame and the
# 'dollars' entry is then picked out of the result.
camp_a_average = (
    df_a.loc[df_a["purchased"].gt(0)]
    .mean(numeric_only=True, skipna=True)
    .loc["dollars"]
)
camp_b_average = (
    df_b.loc[df_b["purchased"].gt(0)]
    .mean(numeric_only=True, skipna=True)
    .loc["dollars"]
)
print(camp_a_average, camp_b_average)

51.71307692307684 55.146249999999874


In [10]:
# Standard deviation of dollars spent per campaign, restricted to rows with at
# least one purchase (computed over the numeric columns, then 'dollars' selected).
camp_a_std = (
    df_a.loc[df_a["purchased"].gt(0)]
    .std(numeric_only=True, skipna=True)
    .loc["dollars"]
)
camp_b_std = (
    df_b.loc[df_b["purchased"].gt(0)]
    .std(numeric_only=True, skipna=True)
    .loc["dollars"]
)
print(camp_a_std, camp_b_std)

30.385290279749103 26.416632768808842


In [19]:
# Two-sample t-test computed from the summary statistics of each campaign.
t, p = ttest_ind_from_stats(
    mean1=camp_a_average, std1=camp_a_std, nobs1=nobs1,
    mean2=camp_b_average, std2=camp_b_std, nobs2=nobs2,
    equal_var=False,     # do not assume equal variances (Welch's t-test)
    alternative='less',  # one-sided: is the mean of A less than the mean of B?
)

In [18]:
# Report the test statistic and p-value, then the verdict: a p-value above the
# significance threshold means the null hypothesis is not rejected.
print('t_stat: %0.3f, p_value: %0.3f' % (t, p))
print(
    "p is not low. A is not significantly less than B "
    if p > significance
    else "p is low! THE NULL MUST GO!  A is less than B"
)
t_stat: -0.741, p_value: 0.460
p is not low. A is not significantly less than B 
