In [26]:
import pandas as pd
import numpy as np
from statsmodels.stats.power import TTestIndPower
from scipy import stats

In [2]:
# Load the cleaned dataset. The relative path uses forward slashes so it
# resolves on Windows, Linux and macOS alike; a backslash-separated path is
# only understood on Windows.
df = pd.read_parquet('data/clean/cleaned_data.parquet')

In [4]:
# Row count for each value of the 'experiment' column.
rows_per_group = df.groupby('experiment').size()
print(rows_per_group)

experiment
control    4071
exposed    4006
dtype: int64


In [None]:
#---------Means and Standard Deviations Calculation---------
# A row is flagged as a response (1) when its 'yes' and 'no' columns sum to at
# least 1, otherwise 0. The comparison is vectorised rather than applied
# row by row through a lambda.
df['responded'] = (df[['yes', 'no']].sum(axis=1) >= 1).astype('int64')
mean_responded = df.groupby('experiment')['responded'].mean()

# Complement flag: 1 exactly where 'responded' is 0.
df['not_responded'] = 1 - df['responded']
mean_notresponded = df.groupby('experiment')['not_responded'].mean()

print(f"Mean Responded:{mean_responded}")
print(f"\nMean Not Responded:{mean_notresponded}")

# Per-group sample standard deviations (pandas default, ddof=1).
std_dev_responded = df.groupby('experiment')['responded'].std()
std_dev_notresponded = df.groupby('experiment')['not_responded'].std()

print(f"\nStandard Deviation Responded:{std_dev_responded}")
print(f"\nStandard Deviation Not Responded:{std_dev_notresponded}")

Mean Responded:experiment
control    0.143945
exposed    0.164004
Name: responded, dtype: float64

Mean Not Responded:experiment
control    0.856055
exposed    0.835996
Name: not_responded, dtype: float64

Standard Deviation Responded:experiment
control    0.351077
exposed    0.370325
Name: responded, dtype: float64

Standard Deviation Not Responded:experiment
control    0.351077
exposed    0.370325
Name: not_responded, dtype: float64


In [None]:
#---------Pooled Standard Deviation Calculation---------
# Group sizes and standard deviations are read from the data instead of being
# copied by hand from earlier printed output, so they stay in sync with `df`
# and keep full floating-point precision.
group_sizes = df.groupby('experiment').size()
n1 = int(group_sizes['control'])  # control group size
n2 = int(group_sizes['exposed'])  # exposed group size
s1 = std_dev_responded['control']  # standard deviation control group
s2 = std_dev_responded['exposed']  # standard deviation exposed group

# Pooled standard deviation: square root of the two sample variances weighted
# by their degrees of freedom.
S_combined_responded = np.sqrt(
    ((n1 - 1) * s1**2 + (n2 - 1) * s2**2) / (n1 + n2 - 2)
)

print(f"\nResponded Pooled Standard: {S_combined_responded:.4f}")


Responded Standard Deviation Combined: 0.3608


In [11]:
#---------Effect Size Calculation---------
# Cohen's d: difference between the exposed and control response rates,
# expressed in units of the pooled standard deviation.
mean_difference = mean_responded['exposed'] - mean_responded['control']
effect_size_responded = mean_difference / S_combined_responded
print(f'Cohen\'s d for Responded: {effect_size_responded:.2f}')

Cohen's d for Responded: 0.06


In [25]:
#---------Statistical Power---------
# alpha > 0.05 for study purposes (can't increase sample size)
ALPHA = 0.096

analysis = TTestIndPower()
# `ratio` tells statsmodels the size of the second sample (nobs2 = nobs1 * ratio).
# Without it the exposed group would be assumed to have n1 observations
# instead of its actual n2.
power = analysis.power(
    effect_size=effect_size_responded,
    nobs1=n1,
    alpha=ALPHA,
    ratio=n2 / n1,
)
print(f'Statistical Power: {power:.2f}')

Statistical Power: 0.80


In [27]:
#---------Testing Hypothesis---------
# Split the 0/1 'responded' flag into one array per experiment group.
is_control = df['experiment'] == 'control'
is_exposed = df['experiment'] == 'exposed'
control = df.loc[is_control, 'responded'].values
exposed = df.loc[is_exposed, 'responded'].values

# Welch's t-test (equal_var=False): does not assume equal group variances.
welch_result = stats.ttest_ind(control, exposed, equal_var=False)
t_stat, p_value = welch_result.statistic, welch_result.pvalue
print(f't = {t_stat:.3f}, p-value = {p_value:.3f}')

t = -2.497, p-value = 0.013
