In [1]:
import pandas as pd
from scipy.stats import ttest_ind

# Read the engineered dataset from its CSV file (path is relative to the
# directory the notebook is run from).
df = pd.read_csv(filepath_or_buffer='../data/engineered_data.csv')

# Collects one human-readable result line per analysis step below; the list
# is later written to a text file and echoed to the console.
report = []

# a. Do older people prefer store purchases?
# Customers are split at age 50 and the store-purchase counts of the two
# groups are compared with an independent two-sample t-test. Rows whose Age
# is missing match neither condition and therefore fall in neither group.
# NOTE(review): the p-value is two-sided and carries no direction -- compare
# the group means before concluding that older customers *prefer* stores.
# NOTE(review): ttest_ind assumes equal variances by default; consider
# equal_var=False (Welch) if the group variances differ.
older = df.loc[df['Age'].gt(50), 'NumStorePurchases']
younger = df.loc[df['Age'].le(50), 'NumStorePurchases']
t_stat1, p_val1 = ttest_ind(older, younger, nan_policy='omit')
report.append(f"1. Older vs Younger (Store Purchases): p-value = {p_val1:.4f}")

# b. Do people with kids prefer online purchases?
# Groups: Total_Children > 0 versus Total_Children == 0, compared on the
# number of web purchases with an independent two-sample t-test (NaN values
# in the purchase counts are omitted by scipy).
# NOTE(review): as with any two-sided test, the p-value alone does not say
# which group buys more online.
with_kids = df.loc[df['Total_Children'].gt(0), 'NumWebPurchases']
without_kids = df.loc[df['Total_Children'].eq(0), 'NumWebPurchases']
t_stat2, p_val2 = ttest_ind(with_kids, without_kids, nan_policy='omit')
report.append(f"2. With vs Without Kids (Web Purchases): p-value = {p_val2:.4f}")

# c. Correlation between store and web purchases (possible cannibalization)
# Build the 2x2 correlation matrix (pandas default method) and read the
# off-diagonal entry by label rather than by position.
purchase_corr = df[['NumWebPurchases', 'NumStorePurchases']].corr()
correlation = purchase_corr.loc['NumWebPurchases', 'NumStorePurchases']
report.append(f"3. Correlation (Web vs Store Purchases): r = {correlation:.4f}")

# d. Spending difference between US and rest of the world
# Country_US is compared against 1 (US) and 0 (everyone else) -- presumably a
# one-hot indicator column; if it is absent the report records that instead
# of running the test.
if 'Country_US' not in df.columns:
    report.append("4. Country_US column not found.")
else:
    us = df.loc[df['Country_US'].eq(1), 'Total_Spending']
    non_us = df.loc[df['Country_US'].eq(0), 'Total_Spending']
    t_stat3, p_val3 = ttest_ind(us, non_us, nan_policy='omit')
    report.append(f"4. US vs Rest (Total Spending): p-value = {p_val3:.4f}")


# Save summary statistics
# Neither DataFrame.to_csv nor open() creates missing parent directories, so
# make sure the report directory exists before writing anything into it.
from pathlib import Path

reports_dir = Path('../output/reports')
reports_dir.mkdir(parents=True, exist_ok=True)

# describe() yields per-column descriptive statistics of the dataset.
summary = df.describe()
summary.to_csv(reports_dir / 'summary_statistics.csv')

# Save the hypothesis test results
# An explicit encoding keeps the file identical regardless of the platform's
# default locale; the lines are written in one batched call.
with open(reports_dir / 'hypothesis_results.txt', 'w', encoding='utf-8') as f:
    f.write("HYPOTHESIS TEST RESULTS\n")
    f.write("------------------------\n")
    f.writelines(f"{line}\n" for line in report)

# Echo the same result lines to the console, one per line.
# The guard keeps an empty report from printing a stray blank line.
if report:
    print("\n".join(report))


1. Older vs Younger (Store Purchases): p-value = 0.0000
2. With vs Without Kids (Web Purchases): p-value = 0.0009
3. Correlation (Web vs Store Purchases): r = 0.5027
4. US vs Rest (Total Spending): p-value = 0.7630
