# Dependencies and dataset

In [1]:
# ! pip install -r requirements.txt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from scipy.stats import norm
import itertools

In [2]:
mkt_data = pd.read_csv('data/mkt_data.csv')
mkt_data.head(10)

Unnamed: 0,id,send_date,estimated_age,age_range,channel,coupon,clicked,last_step,nb_units,order_value
0,1,2019-04-12,59,46-60,SMS,2,0,received,0,0
1,2,2019-04-08,27,18-30,Email,2,0,received,0,0
2,3,2019-04-05,74,60+,Email,2,0,received,0,0
3,4,2019-04-21,32,31-45,SMS,2,0,received,0,0
4,5,2019-04-28,42,31-45,Email,2,0,received,0,0
5,6,2019-04-02,25,18-30,Email,4,0,received,0,0
6,7,2019-04-06,26,18-30,Email,2,0,received,0,0
7,8,2019-04-03,47,46-60,Email,6,0,received,0,0
8,9,2019-04-05,38,31-45,Email,6,0,received,0,0
9,10,2019-04-14,20,18-30,SMS,6,0,received,0,0


In [10]:
mkt_data['last_step'].unique()

array(['received', 'bounced', 'added to cart', 'purchased', 'saw review',
       'payment page'], dtype=object)

In [3]:
mkt_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             300000 non-null  int64 
 1   send_date      300000 non-null  object
 2   estimated_age  300000 non-null  int64 
 3   age_range      300000 non-null  object
 4   channel        300000 non-null  object
 5   coupon         300000 non-null  int64 
 6   clicked        300000 non-null  int64 
 7   last_step      300000 non-null  object
 8   nb_units       300000 non-null  int64 
 9   order_value    300000 non-null  int64 
dtypes: int64(6), object(4)
memory usage: 22.9+ MB


# A/B test among methods in each age group to select desirable method

In [4]:
def get_statistics(df: pd.DataFrame, att: str) -> tuple:
    """
    Return mean, variance, and size (degree of freemdom + 1) for any attributes of one dataframe
    Input: dataframe name, attribute name
    Output: mu, var, n
    """
    mu = df[att].mean() # get mean
    var = df[att].var() # get variance
    n = df[att].size # get size, degree of freedom = size - 1
    return mu, var, n

def check_hypo_by_agerange(email_df: pd.DataFrame, sms_df: pd.DataFrame, att: str, agerange: str):
    """
    Verify Hypothesis that there is difference between email and 
    Input: dataframe for each method and tested attributes
    Output: hypo decision
    """
    email_mu, email_var, email_size = get_statistics(email_df[email_df['age_range'] == agerange], att)
    sms_mu, sms_var, sms_size = get_statistics(sms_df[sms_df['age_range'] == agerange], att)

    # Calculate std, z_score, p_value
    standard_error = math.sqrt((email_var / email_size) + (sms_var / sms_size))
    z_score = (email_mu - sms_mu) / standard_error
    print("Z-score:", z_score)
    p_value = norm.sf(abs(z_score))  # Multiply by 2 for a two-tailed test
    print('p-value:', p_value)
    if p_value < 0.05:
        print('Reject Null Hypothesis \n')
    else:
        print('Cannot reject Null Hypothesis \n')

In [5]:
# Group by send_date and channel for rows where 'last_step' == 'purchased'
purchased_counts = mkt_data[mkt_data['last_step'] == 'purchased'].groupby(['send_date', 'channel']).size().rename('purchased_count')
total_counts = mkt_data.groupby(['age_range','send_date', 'channel']).size().rename('total_count')
combined_counts = purchased_counts.reset_index().merge(total_counts.reset_index(), on=['send_date', 'channel'], how='outer')
print(combined_counts)

combined_counts['purchased_percentage'] = combined_counts['purchased_count']/combined_counts['total_count'] * 100
combined_email_counts = combined_counts[combined_counts['channel'] == 'Email']
combined_sms_counts = combined_counts[combined_counts['channel'] == 'SMS']

      send_date channel  purchased_count age_range  total_count
0    2019-04-01   Email               34     18-30         1470
1    2019-04-01   Email               34     31-45         1853
2    2019-04-01   Email               34     46-60          799
3    2019-04-01   Email               34       60+          336
4    2019-04-01     SMS               21     18-30         1753
..          ...     ...              ...       ...          ...
243  2019-05-01   Email               34       60+          355
244  2019-05-01     SMS               29     18-30         1803
245  2019-05-01     SMS               29     31-45         2162
246  2019-05-01     SMS               29     46-60          896
247  2019-05-01     SMS               29       60+          414

[248 rows x 5 columns]


In [6]:
for agerange in combined_counts['age_range'].unique():
    print(f'Investigating email vs sms for age range: {agerange}')
    check_hypo_by_agerange(email_df=combined_email_counts, 
                           sms_df=combined_sms_counts, 
                           att='purchased_percentage', 
                           agerange=agerange)

Investigating email vs sms for age range: 18-30
Z-score: 5.309229352498702
p-value: 5.504486408713201e-08
Reject Null Hypothesis 

Investigating email vs sms for age range: 31-45
Z-score: 5.844152961564428
p-value: 2.5457614027594263e-09
Reject Null Hypothesis 

Investigating email vs sms for age range: 46-60
Z-score: 5.782828650923549
p-value: 3.672743680476232e-09
Reject Null Hypothesis 

Investigating email vs sms for age range: 60+
Z-score: 5.654877289069287
p-value: 7.79788275716462e-09
Reject Null Hypothesis 



# A/B test among age group for each method

In [7]:
def check_hypo_by_method(age1_df: pd.DataFrame, age2_df: pd.DataFrame, att: str, channel: str):
    """
    Verify Hypothesis that there is difference between email and 
    Input: dataframe for each method and tested attributes
    Output: hypo decision
    """
    mu1, var1, size1 = get_statistics(age1_df[age1_df['channel'] == channel], att)
    mu2, var2, size2 = get_statistics(age2_df[age2_df['channel'] == channel], att)

    # Calculate std, z_score, p_value
    standard_error = math.sqrt((var1 / size1) + (var2 / size2))
    z_score = (mu1 - mu2) / standard_error
    print("Z-score:", z_score)
    p_value = norm.sf(abs(z_score))  # Multiply by 2 for a two-tailed test
    print('p-value:', p_value)
    if p_value < 0.05:
        print('Reject Null Hypothesis \n')
    else:
        print('Cannot reject Null Hypothesis \n')

In [8]:
combined_counts

Unnamed: 0,send_date,channel,purchased_count,age_range,total_count,purchased_percentage
0,2019-04-01,Email,34,18-30,1470,2.312925
1,2019-04-01,Email,34,31-45,1853,1.834862
2,2019-04-01,Email,34,46-60,799,4.255319
3,2019-04-01,Email,34,60+,336,10.119048
4,2019-04-01,SMS,21,18-30,1753,1.197946
...,...,...,...,...,...,...
243,2019-05-01,Email,34,60+,355,9.577465
244,2019-05-01,SMS,29,18-30,1803,1.608430
245,2019-05-01,SMS,29,31-45,2162,1.341351
246,2019-05-01,SMS,29,46-60,896,3.236607


In [9]:
for agerange1, agerange2 in itertools.combinations(combined_counts['age_range'].unique(), 2):
    for channel in combined_counts['channel'].unique():
        print(f'Investigating {agerange1} vs {agerange2} for method: {channel}')
        df1 = combined_counts[combined_counts['age_range'] == agerange1]
        df2 = combined_counts[combined_counts['age_range'] == agerange2]
        check_hypo_by_method(age1_df=df1, 
                            age2_df=df2, 
                            att='purchased_percentage', 
                            channel=channel)

Investigating 18-30 vs 31-45 for method: Email
Z-score: 4.71973638937039
p-value: 1.180752366548354e-06
Reject Null Hypothesis 

Investigating 18-30 vs 31-45 for method: SMS
Z-score: 4.905301302898469
p-value: 4.6642015464309933e-07
Reject Null Hypothesis 

Investigating 18-30 vs 46-60 for method: Email
Z-score: -16.56808662842089
p-value: 5.926899614251251e-62
Reject Null Hypothesis 

Investigating 18-30 vs 46-60 for method: SMS
Z-score: -15.839098718564708
p-value: 8.360005780017339e-57
Reject Null Hypothesis 

Investigating 18-30 vs 60+ for method: Email
Z-score: -27.89479853858131
p-value: 1.5426447746282987e-171
Reject Null Hypothesis 

Investigating 18-30 vs 60+ for method: SMS
Z-score: -26.41584213087034
p-value: 4.506028431595518e-154
Reject Null Hypothesis 

Investigating 31-45 vs 46-60 for method: Email
Z-score: -20.051952297311033
p-value: 9.70388250718964e-90
Reject Null Hypothesis 

Investigating 31-45 vs 46-60 for method: SMS
Z-score: -19.324878981628473
p-value: 1.658804