In [3]:
import pandas as pd
from scipy import stats
import numpy as np

#Load the dataset
df = pd.read_csv('../data/ecommerce_data.csv')

#Preview data
df.head()


Unnamed: 0,user_id,group,purchase,revenue,session_duration,device_type
0,1,control,0,0.0,209,mobile
1,2,treatment,0,0.0,890,desktop
2,3,control,0,0.0,90,mobile
3,4,control,1,51.23,176,desktop
4,5,control,0,0.0,593,mobile


# Chi-Square Test for Conversion Rate(Categorical)
The reason for this test is to find out whether the conversion rate is significantly different between control and treatment.
We use chi-square test of independence to see if group assignment is associated with conversion outcome.

In [4]:
#Create a contingency table
conversion_table = pd.crosstab(df['group'], df['purchase'])

#Run Chi-square test
chi2, p, dof, expected = stats.chi2_contingency(conversion_table)

#Show result
print("Chi-Square Statistic:", round(chi2, 3))
print("p-value:", round(p, 4))

Chi-Square Statistic: 2.692
p-value: 0.1009


# Interpretation
The p-value tells us the probability that the observed difference in conversion rates happened by chance.
If p-value < 0.05 the difference is statistically significant and if p-value > 0.05 the difference is not statistically significant.



# T-Test for Revenue (Numerical)
The reason for the test is to find out whether the average revenue per user is significantly different Tbetween control and treatment.
Independent t-test is used because revenue is continuous and the groups are independent.


In [6]:
#Group revenues
control_revenue = df[df['group'] == 'control']['revenue']
treatment_revenue = df[df['group'] == 'treatment']['revenue']

#Perfom independent t-test
t_stat, p_value = stats.ttest_ind(treatment_revenue, control_revenue)

#Show results
print("T-Statistic:", round(t_stat, 3))
print("p-value:", round(p_value, 4))

T-Statistic: 1.492
p-value: 0.1357


T-test checks whether the treatment generated signifivantly higher average revenue than control.Because the p-value > 0.05 this means the revenue difference could be due to a random variation. This helps us assess the financial effectiveness of the new page.

# Confidence Interval for Conversion Rate

In [8]:
def propotion_confint(successes, n, confidence=0.95):
    p_hat = successes / n
    z = stats.norm.ppf(1 - (1 - confidence)/2)
    margin = z * np.sqrt((p_hat * (1 - p_hat)) / n)
    return round(p_hat - margin, 4), round(p_hat + margin, 4)

#Control group
control = df[df['group'] == 'control']
control_conv = control['purchase'].sum()
control_n = control.shape[0]

#Treatment group
treatment = df[df['group'] == 'treatment']
treatment_conv = treatment['purchase'].sum()
treatment_n = treatment.shape[0]

#Compute CIs
control_ci = propotion_confint(control_conv, control_n)
treatment_ci = propotion_confint(treatment_conv, treatment_n)

print("Control group 95% CI:", control_ci)
print("Treatment group 95% CI:", treatment_ci)

Control group 95% CI: (np.float64(0.1094), np.float64(0.135))
Treatment group 95% CI: (np.float64(0.1247), np.float64(0.1518))


# Interpretation
The confidence interval tells us the likely range of the true conversion rate for each group. If the intervals do not overlap it is a strong sign that the groups are different but if the intervals overlap the difference may not be statistically meaningful.
This helps visualize the uncertainity around our estimates.