In [1]:
import pandas as pd
from scipy import stats

# Load the cleaned data
data = pd.read_csv('../data/cleaned_data.csv')

# 1. Hypothesis 1: Risk differences across provinces
# KPI: Total Claims (as a proxy for risk)
province_risk = data.groupby('Province')['TotalClaims'].mean()

# Perform ANOVA test for differences across provinces
anova_provinces = stats.f_oneway(*(data[data['Province'] == province]['TotalClaims'] for province in data['Province'].unique()))
print(f"ANOVA Result for Risk Differences Across Provinces: p-value = {anova_provinces.pvalue}")

# 2. Hypothesis 2: Risk differences between zip codes
# KPI: Total Claims (risk)
# We'll select two zip codes for A/B testing
zip_code_A = 1234  # Replace with a relevant zip code from your dataset
zip_code_B = 5678  # Replace with a different relevant zip code

# Perform t-test for risk differences between zip codes
ttest_zip_risk = stats.ttest_ind(data[data['PostalCode'] == zip_code_A]['TotalClaims'], data[data['PostalCode'] == zip_code_B]['TotalClaims'])
print(f"T-test Result for Risk Differences Between Zip Codes: p-value = {ttest_zip_risk.pvalue}")

# 3. Hypothesis 3: Margin (Profit) differences between zip codes
# KPI: Profit = Total Premium - Total Claims
data['Profit'] = data['TotalPremium'] - data['TotalClaims']

# Perform t-test for margin (profit) differences between zip codes
ttest_zip_margin = stats.ttest_ind(data[data['PostalCode'] == zip_code_A]['Profit'], data[data['PostalCode'] == zip_code_B]['Profit'])
print(f"T-test Result for Margin Differences Between Zip Codes: p-value = {ttest_zip_margin.pvalue}")

# 4. Hypothesis 4: Risk differences between women and men
# KPI: Total Claims (risk)
# Group by gender
group_A = data[data['Gender'] == 'Male']['TotalClaims']
group_B = data[data['Gender'] == 'Female']['TotalClaims']

# Perform t-test for risk differences between genders
ttest_gender_risk = stats.ttest_ind(group_A, group_B)
print(f"T-test Result for Risk Differences Between Genders: p-value = {ttest_gender_risk.pvalue}")

# Reporting Results
if anova_provinces.pvalue < 0.05:
    print("There is a significant risk difference across provinces (reject the null hypothesis).")
else:
    print("There is no significant risk difference across provinces (fail to reject the null hypothesis).")

if ttest_zip_risk.pvalue < 0.05:
    print("There is a significant risk difference between zip codes (reject the null hypothesis).")
else:
    print("There is no significant risk difference between zip codes (fail to reject the null hypothesis).")

if ttest_zip_margin.pvalue < 0.05:
    print("There is a significant margin (profit) difference between zip codes (reject the null hypothesis).")
else:
    print("There is no significant margin (profit) difference between zip codes (fail to reject the null hypothesis).")

if ttest_gender_risk.pvalue < 0.05:
    print("There is a significant risk difference between men and women (reject the null hypothesis).")
else:
    print("There is no significant risk difference between men and women (fail to reject the null hypothesis).")


  data = pd.read_csv('../data/cleaned_data.csv')


ANOVA Result for Risk Differences Across Provinces: p-value = 1.6782057588675906e-07
T-test Result for Risk Differences Between Zip Codes: p-value = nan
T-test Result for Margin Differences Between Zip Codes: p-value = nan
T-test Result for Risk Differences Between Genders: p-value = 0.8041073961270343
There is a significant risk difference across provinces (reject the null hypothesis).
There is no significant risk difference between zip codes (fail to reject the null hypothesis).
There is no significant margin (profit) difference between zip codes (fail to reject the null hypothesis).
There is no significant risk difference between men and women (fail to reject the null hypothesis).
