In [None]:
import pandas as pd
import numpy as np
%pip install scipy
from scipy import stats

# --- Compute KPIs ---
df = pd.read_csv('MachineLearningRating_v3_cleaned.csv', sep='|', on_bad_lines='skip')
df['HasClaim'] = (df['TotalClaims'] > 0).astype(int)
df['ClaimSeverity'] = df['TotalClaims'] / df['HasClaim'].replace(0, np.nan)
df['Margin'] = df['TotalPremium'] - df['TotalClaims']

# --- Set alpha level for significance ---
alpha = 0.05

def test_frequency(group_col):
    contingency = pd.crosstab(df[group_col], df['HasClaim'])
    chi2, p, _, _ = stats.chi2_contingency(contingency)
    return p

def test_severity(group_col):
    groups = [group['ClaimSeverity'].dropna() for _, group in df[df['HasClaim'] == 1].groupby(group_col)]
    groups = [g for g in groups if len(g) > 0]  # Only keep non-empty groups
    if len(groups) < 2:
        print(f"Not enough groups with data for {group_col} (severity).")
        return np.nan
    f_stat, p = stats.f_oneway(*groups)
    return p

def test_margin(group_col):
    groups = [group['Margin'].dropna() for _, group in df.groupby(group_col)]
    groups = [g for g in groups if len(g) > 0]  # Only keep non-empty groups
    if len(groups) < 2:
        print(f"Not enough groups with data for {group_col} (margin).")
        return np.nan
    f_stat, p = stats.f_oneway(*groups)
    return p

def result(name, p):
    if p < alpha:
        print(f"Reject H₀ for {name} (p = {p:.4f}) → Statistically significant difference.")
    else:
        print(f" Fail to reject H₀ for {name} (p = {p:.4f}) → No significant difference.")

# --- Hypothesis Testing ---

# H₀: No risk difference across Provinces
result("Claim Frequency by Province", test_frequency('Province'))
result("Claim Severity by Province", test_severity('Province'))
result("Claim Frequency by Province", test_frequency('Province'))
result("Claim Severity by Province", test_severity('Province'))
# H₀: No risk difference between Zip Codes (PostalCode)
result("Claim Frequency by Zip Code", test_frequency('PostalCode'))
result("Claim Severity by Zip Code", test_severity('PostalCode'))

# H₀: No Margin difference between Zip Codes
result("Margin by Zip Code", test_margin('PostalCode'))

# H₀: No risk difference between Women and Men
result("Claim Frequency by Gender", test_frequency('Gender'))
result("Claim Severity by Gender", test_severity('Gender'))
def run_all_tests():
    result("Claim Frequency by Province", test_frequency('Province'))
    result("Claim Severity by Province", test_severity('Province'))
    result("Claim Frequency by Zip Code", test_frequency('PostalCode'))
    result("Claim Severity by Zip Code", test_severity('PostalCode'))
    result("Margin by Zip Code", test_margin('PostalCode'))
    result("Claim Frequency by Gender", test_frequency('Gender'))
    result("Claim Severity by Gender", test_severity('Gender'))

# Call it like this:
run_all_tests()


Note: you may need to restart the kernel to use updated packages.


  df = pd.read_csv('MachineLearningRating_v3_cleaned.csv', sep='|', on_bad_lines='skip')


 Fail to reject H₀ for Claim Frequency by Province (p = 1.0000) → No significant difference.
Not enough groups with data for Province (severity).
 Fail to reject H₀ for Claim Severity by Province (p = nan) → No significant difference.
 Fail to reject H₀ for Claim Frequency by Province (p = 1.0000) → No significant difference.
Not enough groups with data for Province (severity).
 Fail to reject H₀ for Claim Severity by Province (p = nan) → No significant difference.
 Fail to reject H₀ for Claim Frequency by Zip Code (p = 1.0000) → No significant difference.
Not enough groups with data for PostalCode (severity).
 Fail to reject H₀ for Claim Severity by Zip Code (p = nan) → No significant difference.
Reject H₀ for Margin by Zip Code (p = 0.0000) → Statistically significant difference.
 Fail to reject H₀ for Claim Frequency by Gender (p = 1.0000) → No significant difference.
Not enough groups with data for Gender (severity).
 Fail to reject H₀ for Claim Severity by Gender (p = nan) → No si

TypeError: result() missing 2 required positional arguments: 'name' and 'p'