In [147]:
import pandas as pd
import numpy as np
import scipy.stats as stats


#### Import Data

In [148]:
# Read the raw data from data.csv (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='data.csv')

In [149]:
# Preview the loaded frame; pandas elides the middle rows of a long frame in the rendered output
df

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes,no
0,0008ef63-77a7-448b-bd1e-075f42c55e39,exposed,2020-07-10,8,Generic Smartphone,6,Chrome Mobile,0,0
1,000eabc5-17ce-4137-8efe-44734d914446,exposed,2020-07-07,10,Generic Smartphone,6,Chrome Mobile,0,0
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0,1
3,00187412-2932-4542-a8ef-3633901c98d9,control,2020-07-03,15,Samsung SM-A705FN,6,Facebook,0,0
4,001a7785-d3fe-4e11-a344-c8735acacc2c,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0
...,...,...,...,...,...,...,...,...,...
8072,ffea24ec-cec1-43fb-b1d1-8f93828c2be2,exposed,2020-07-05,7,Generic Smartphone,6,Chrome Mobile,0,0
8073,ffea3210-2c3e-426f-a77d-0aa72e73b20f,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0
8074,ffeaa0f1-1d72-4ba9-afb4-314b3b00a7c7,control,2020-07-04,9,Generic Smartphone,6,Chrome Mobile,0,0
8075,ffeeed62-3f7c-4a6e-8ba7-95d303d40969,exposed,2020-07-05,15,Samsung SM-A515F,6,Samsung Internet,0,0


#### Ensure the Trustworthiness of the Data

##### Check Data Quality

In [150]:
# Summarise the columns: names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8077 entries, 0 to 8076
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   auction_id   8077 non-null   object
 1   experiment   8077 non-null   object
 2   date         8077 non-null   object
 3   hour         8077 non-null   int64 
 4   device_make  8077 non-null   object
 5   platform_os  8077 non-null   int64 
 6   browser      8077 non-null   object
 7   yes          8077 non-null   int64 
 8   no           8077 non-null   int64 
dtypes: int64(4), object(5)
memory usage: 568.0+ KB


In [151]:
# Count missing values in each column
df.isna().sum()

auction_id     0
experiment     0
date           0
hour           0
device_make    0
platform_os    0
browser        0
yes            0
no             0
dtype: int64

There are no missing values in the data.

In [152]:
# Count rows that are exact duplicates of an earlier row
df.duplicated(keep='first').sum()

0

There are no duplicated rows in the data.

In [153]:
# Count rows where both the yes and the no flag are set to 1
len(df.loc[df['yes'].eq(1) & df['no'].eq(1)])

0

The columns labeled "Yes" and "No" are correct in that there is no row where both columns have a value of 1, indicating that no user selected both options.

In [154]:
# Rows with neither flag set, as a percentage of all rows
silent_rows = df.loc[df['yes'].eq(0) & df['no'].eq(0)]
no_response = len(silent_rows) / len(df) * 100

print(f"The number of users not responding the advertisement are {no_response:.2f} percent")

The number of users not responding the advertisement are 84.61 percent


Since the objective of this experiment is to determine whether the click-through rate (CTR) for the "Yes" button differs significantly between the control and exposed groups, users who gave no response are removed for now.

In [155]:
# Remove the rows of users who clicked neither button (modifies df in place)
unanswered_idx = df.loc[df['yes'].eq(0) & df['no'].eq(0)].index
df.drop(index=unanswered_idx, inplace=True)

In [156]:
# Drop the `no` column; the analysis below only uses the `yes` flag.
# errors='ignore' keeps this cell re-runnable: once `no` has been removed, a second
# execution is a no-op instead of raising KeyError. Rebinding `df` (rather than
# inplace=True) leaves the result the same for the cells that follow.
df = df.drop(columns='no', errors='ignore')

In [157]:
# Relabel the yes flag as the conversion indicator (modifies df in place)
df.rename(columns=dict(yes='convert'), inplace=True)

##### Perform SRM Test

In [158]:
# Group sizes and each group's share of the total
group_sizes = df['experiment'].value_counts()
control_user, exposed_user = group_sizes['control'], group_sizes['exposed']
total_user = control_user + exposed_user

# Shares expressed as percentages of all rows currently in df
control_per_total = control_user / total_user * 100
exposed_per_total = exposed_user / total_user * 100

print(f'Control per total user is {control_per_total:.2f}% and exposed per total user is {exposed_per_total:.2f}%')

Control per total user is 47.14% and exposed per total user is 52.86%


In [159]:
# Size of the control group relative to the exposed group (1.0 would be a perfectly even split)
control_per_exposed = control_user / exposed_user

ratio_message = f'Control per exposed is {control_per_exposed:.2f}'
print(ratio_message)

Control per exposed is 0.89


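From these sample-ratio calculations, the split between the two groups (47.14% vs. 52.86%) is close to 50:50, and the control-to-exposed ratio (0.89) is close to 1. Note that these counts include only the users who responded, because non-responders were dropped earlier. The next step is a chi-square test to detect a sample ratio mismatch (SRM).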

Define the null and alternative hypothesis (H0 and H1)

H0 : No SRM detected

H1 : SRM detected

Calculate chi-square statistics

In [160]:
# Under an even split each group is expected to hold half of the users
expected_value = total_user / 2

# Chi-square goodness-of-fit statistic: sum over groups of (observed - expected)^2 / expected
observed_counts = (control_user, exposed_user)
chi2stat = sum(((observed - expected_value)**2) / expected_value for observed in observed_counts)

print(f'Chi-square statistics is {chi2stat:.2f}')

Chi-square statistics is 4.06


Define decision rules

We reject H0 if the chi-square statistic is greater than the chi-square critical value.

Calculate chi-square critical value

In [161]:
# Critical value of the chi-square distribution with 1 degree of freedom
# at the 1% significance level used for the SRM check
srm_alpha = 0.01
chi2crit = stats.chi2.ppf(1 - srm_alpha, df=1)

print(f'Chi-square critical is {chi2crit:.2f}')

Chi-square critical is 6.63


The chi-square statistic (4.06) is less than the chi-square critical value (6.63, at a significance level of 0.01), so H0 is not rejected and no SRM is detected.

#### Hypothesis Testing

In [162]:
# Split the rows into one frame per experiment group
control = df.loc[df['experiment'].eq('control')]
exposed = df.loc[df['experiment'].eq('exposed')]

In [163]:
# Number of users who clicked "yes" in each group (sum of the convert column)
yes_control, yes_exposed = (group['convert'].sum() for group in (control, exposed))

In [164]:
# Number of users who clicked "no" in each group: group size minus the "yes" count
no_control, no_exposed = control_user - yes_control, exposed_user - yes_exposed

In [165]:
# 2x2 contingency table: rows are the answer (no, yes), columns are the group (control, exposed)
no_row = [no_control, no_exposed]
yes_row = [yes_control, yes_exposed]
result = np.array([no_row, yes_row])

In [166]:
# Chi-square test of independence on the 2x2 answer-by-group table.
# NOTE: chi2_contingency applies Yates' continuity correction by default when the
# table has one degree of freedom, so this is the corrected form of Pearson's test.
alpha = 0.05

chi2, p, dof, expected = stats.chi2_contingency(observed=result)

print(f'p-value : {p}')

p-value : 0.5560768104229136


The p-value (0.556) is greater than alpha (0.05), so H0 is not rejected. The CTR for “Yes” is not significantly different between the control and exposed groups.