In [113]:
import pandas as pd
import numpy as np
import random
from scipy.stats import chi2_contingency
from statsmodels.stats.power import GofChisquarePower
import math

In [2]:
data1 = pd.read_csv("dataset1.csv")
data2 = pd.read_csv("dataset2.csv")

In [3]:
data1.head()

Unnamed: 0,user_id,date,hour,impressions,clicks,video_midpoint,video_complete_views,asset,conversions
0,00007e57-36c3-49aa-9917-4266a59c3bea,2022-11-07,1,1,1,1,1,Asset1,0
1,000449ee-3b62-4e56-be4e-f600a728bbc5,2022-11-08,4,1,0,0,0,Asset1,0
2,00065b29-9ef3-4c60-8605-7f796d00e3d2,2022-11-10,1,1,0,0,0,Asset1,0
3,0009d922-3cd3-4a9c-a70c-b1b2fcaa8bfa,2022-11-11,4,1,0,0,0,Asset1,0
4,000bce0a-3c18-4cf5-9e5b-d2b0fe17ec8e,2022-11-09,3,1,0,0,0,Asset1,0


In [4]:
data2.head()

Unnamed: 0,user_id,gender,age,device_type,os,nrs_grade,date_collected
0,0002d9ed-170a-4e5a-8a14-11c1e676db36,,21.0,tablet,,B,
1,00030986-2813-4f7d-b837-502be86ecbdf,M,,,,C1,2022-07-18
2,00030986-2813-4f7d-b837-502be86ecbdf,M,49.0,,,C1,2022-07-18
3,0003d012-3ebc-4967-8be4-0d486cf7eaf9,,59.0,,,B,2022-11-14
4,0003d012-3ebc-4967-8be4-0d486cf7eaf9,,,,,B,2022-10-28


In [6]:
print("Time min: ", data1['date'].min(), " , Time max: ", data1['date'].max())

Time min:  2022-11-07  , Time max:  2022-11-13


In [17]:
data1.groupby('date')['conversions'].agg([len, np.sum])

Unnamed: 0_level_0,len,sum
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-11-07,7427,351
2022-11-08,7059,369
2022-11-09,6947,343
2022-11-10,7317,406
2022-11-11,6836,322
2022-11-12,7406,334
2022-11-13,6807,324


# Measure experiment for M-Th and Fri-Sun

#### Null Hypothesis (H0): There is no difference in conversion rates between users exposed to ads on weekdays and users exposed to ads on weekends.
#### Alternative Hypothesis (H1): There is a difference in conversion rates between users exposed to ads on weekdays and users exposed to ads on weekends.

In [18]:
# Split data between weekends and M-Th

wkdaydf = data1[data1['date'] < '2022-11-11']
wkenddf = data1[data1['date'] >= '2022-11-11']

In [29]:
# Shapes

print("Weekday DF shape: ", wkdaydf.shape[0], " | Weekend DF shape: ", wkenddf.shape[0])

Weekday DF shape:  28750  | Weekend DF shape:  21049


In [26]:
# Create index field for weekday

wkdaydf['ind'] = [i for i in range(wkdaydf.shape[0])]

# Randomized list of numbers

# Create a list of numbers to choose from
nums = list(range(wkdaydf.shape[0]))

# Choose a random sample of 5 numbers without repetition
records_to_pick = random.sample(nums, wkenddf.shape[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [33]:
len(records_to_pick)

21049

In [37]:
# Filter for the randomization of records

wkdaydffinal = wkdaydf[wkdaydf['ind'].isin(records_to_pick)]

del wkdaydffinal['ind']

wkdaydffinal['Week'] = ["Weekday" for i in range(wkdaydffinal.shape[0])]

wkdaydffinal.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,user_id,date,hour,impressions,clicks,video_midpoint,video_complete_views,asset,conversions,Week
0,00007e57-36c3-49aa-9917-4266a59c3bea,2022-11-07,1,1,1,1,1,Asset1,0,Weekday
1,000449ee-3b62-4e56-be4e-f600a728bbc5,2022-11-08,4,1,0,0,0,Asset1,0,Weekday
11,00106248-d803-4542-9ae4-f870239a3e6a,2022-11-07,11,1,0,0,0,Asset1,0,Weekday
12,00106248-d803-4542-9ae4-f870239a3e6a,2022-11-08,9,1,0,0,0,Asset1,0,Weekday
13,0011a6d7-cfba-4e0f-92fc-4f3928dcf6b3,2022-11-08,8,2,0,0,0,Asset1,0,Weekday


In [38]:
# Create a field for week

wkenddf['Week'] = ['Weekend' for i in range(wkenddf.shape[0])]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [40]:
wkenddf.head()

Unnamed: 0,user_id,date,hour,impressions,clicks,video_midpoint,video_complete_views,asset,conversions,Week
3,0009d922-3cd3-4a9c-a70c-b1b2fcaa8bfa,2022-11-11,4,1,0,0,0,Asset1,0,Weekend
5,000c83bd-eb24-4f61-a2db-320d3261c601,2022-11-11,4,1,0,0,0,Asset1,0,Weekend
6,000d3542-ecd9-4886-b670-0d79de61631d,2022-11-12,5,1,0,0,0,Asset1,0,Weekend
7,000de28d-5893-4041-879e-f2dd91f5beed,2022-11-12,5,1,0,0,0,Asset1,0,Weekend
8,000f7ca7-b895-424a-9bb7-557277979a0e,2022-11-12,12,1,0,1,1,Asset1,0,Weekend


In [48]:
# Combine both data sets together

df = pd.concat([wkdaydffinal, wkenddf])

# Create fields for conversion

df["Converted"] = np.where(df['conversions'] == 1, 1, 0)
df["Not Converted"] = np.where(df['conversions'] == 0, 1, 0)

df

Unnamed: 0,user_id,date,hour,impressions,clicks,video_midpoint,video_complete_views,asset,conversions,Week,Converted,Not Converted
0,00007e57-36c3-49aa-9917-4266a59c3bea,2022-11-07,1,1,1,1,1,Asset1,0,Weekday,0,1
1,000449ee-3b62-4e56-be4e-f600a728bbc5,2022-11-08,4,1,0,0,0,Asset1,0,Weekday,0,1
11,00106248-d803-4542-9ae4-f870239a3e6a,2022-11-07,11,1,0,0,0,Asset1,0,Weekday,0,1
12,00106248-d803-4542-9ae4-f870239a3e6a,2022-11-08,9,1,0,0,0,Asset1,0,Weekday,0,1
13,0011a6d7-cfba-4e0f-92fc-4f3928dcf6b3,2022-11-08,8,2,0,0,0,Asset1,0,Weekday,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
49790,ffeead30-0c2b-410e-ba2d-54425fe6c884,2022-11-11,12,1,0,0,0,Asset2,0,Weekend,0,1
49791,fff09e24-4a7c-4c1c-81e9-add0b70648e8,2022-11-11,13,1,1,1,1,Asset2,0,Weekend,0,1
49792,fff24e72-cc60-4776-8ba0-251430c0c68d,2022-11-11,6,1,0,0,0,Asset2,0,Weekend,0,1
49793,fffaf0c3-04bc-4664-9be5-28c565624c1f,2022-11-12,2,1,0,0,0,Asset1,0,Weekend,0,1


In [45]:
# Summarize to conversion rate

df[['Week','conversions']].value_counts(normalize = True)

Week     conversions
Weekend  0              0.476721
Weekday  0              0.473918
         1              0.026082
Weekend  1              0.023279
dtype: float64

In [60]:
# Pivot table

pvt = pd.pivot_table(data = df, 
               values = ['Converted', 'Not Converted'], 
               index = 'Week', 
               aggfunc = np.sum, 
               margins = True, 
               margins_name='Grand Total')

pvt['Total'] = pvt['Converted'] + pvt['Not Converted']


In [65]:
pvt

Unnamed: 0_level_0,Converted,Not Converted,Total
Week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Weekday,1098,19951,21049
Weekend,980,20069,21049
Grand Total,2078,40020,42098


In [82]:
chi2, p, dof, expected = chi2_contingency(pvt.iloc[0:2,0:2].to_numpy())

print("Chi-square statistic:", chi2, "\n P-value: ", p, "\n Degrees of Freedom: ", dof,)

cr_wkday = pvt.iloc[0,0]/pvt.iloc[0,2]
cr_wkend = pvt.iloc[1,0]/pvt.iloc[1,2]

Chi-square statistic: 6.929638188605409 
 P-value:  0.008477883787320093 
 Degrees of Freedom:  1


In [108]:
# Effect size

def cramers_v(confusion_matrix):
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    r, k = confusion_matrix.shape
    return np.sqrt(chi2 / (n * min(k-1, r-1)))

effect_size = cramers_v(pvt.iloc[0:2,0:2].to_numpy())

print("Cramer's V Effect Size:", effect_size)

Cramer's V Effect Size: 0.012829937713758724


In [111]:
# Set parameters
alpha = 0.05  # significance level
sample_size = df.shape[0]  # sample size

# Initialize power analysis
power_analysis = GofChisquarePower()

# Calculate power
power = power_analysis.solve_power(effect_size=effect_size, nobs=sample_size, alpha=alpha)

print("Power:", power)

Power: 0.7493556640946815


In [83]:
print("Weekday conversion rate: " , (cr_wkday) * 100, '%',
      "\n",
      "Weekend conversion rate: ", (cr_wkend) * 100, '%')

Weekday conversion rate:  5.216399828970497 % 
 Weekend conversion rate:  4.655803126039242 %


In [85]:
# Lift percentage

print(((cr_wkend - cr_wkday)/cr_wkday)* 100 , '%')

-10.746812386156648 %


In [112]:
effect_size

0.012829937713758724

#### As a conclusion, we reject the null hypothesis that the conversion rate between weekdays (M - TH) and (Fri - Sun) are not different in favor of the alternate hypothesis that states the opposite.  We received a p-value of 0.8% vs the significance level of 5% along with a lift ratio of -10.75% and a 75% power effect, which is strong.