In [8]:
import pandas as pd
import numpy as np
from scipy import stats

# --- Load the data ---
# The path is hard-coded for now; it is NOT resolved through DVC in this cell.
# TODO: obtain the path via DVC so the notebook is pinned to the versioned data.
file_path = '../data/raw/MachineLearningRating_v3.txt'
# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk, so mixed-type inference (and the warning pandas
# raises about it on this call) cannot occur and dtypes are stable across runs.
df = pd.read_csv(file_path, delimiter='|', low_memory=False)

# --- Best Practice: Standardize Column Names ---
# Lower-case every name and swap spaces for underscores so later lookups
# don't hit KeyErrors. regex=False: the space is a literal, not a pattern.
df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)

# Fail early, with a readable message, if a column this cell relies on is absent.
required_columns = ['province', 'gender', 'totalclaims', 'totalpremium']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Expected columns missing after standardization: {missing_columns}")

# --- Define Our Metrics ---
# 1. Claim Frequency Metric: binary 'has_claim' flag, 1 when totalclaims is positive
df['has_claim'] = (df['totalclaims'] > 0).astype(int)

# 2. Margin (Profit) Metric: premium minus claims, row by row
df['margin'] = df['totalpremium'] - df['totalclaims']

# 3. DataFrame for severity analysis (only rows with a claim).
#    .copy() gives an independent frame, so later edits to it never touch df.
claims_df = df[df['has_claim'] == 1].copy()

print("Data prepared for hypothesis testing. Shape of claims_df:", claims_df.shape)
# Last expression of the cell: rendered as a preview of the columns used downstream.
df[['province', 'gender', 'totalclaims', 'totalpremium', 'has_claim', 'margin']].head()

  df = pd.read_csv(file_path, delimiter='|')


Data prepared for hypothesis testing. Shape of claims_df: (2788, 54)


Unnamed: 0,province,gender,totalclaims,totalpremium,has_claim,margin
0,Gauteng,Not specified,0.0,21.929825,0,21.929825
1,Gauteng,Not specified,0.0,21.929825,0,21.929825
2,Gauteng,Not specified,0.0,0.0,0,0.0
3,Gauteng,Not specified,0.0,512.84807,0,512.84807
4,Gauteng,Not specified,0.0,0.0,0,0.0


In [10]:
# Collect the claim amounts of each province into its own list
# (a Series of lists, indexed by province).
province_groups = claims_df['totalclaims'].groupby(claims_df['province']).apply(list)

# Kruskal-Wallis H-test: every province's list is passed as one sample.
h_statistic, p_value = stats.kruskal(*province_groups.tolist())

print("\nKruskal-Wallis Test for Claim Severity by Province:")
print("P-value: {}".format(p_value))

# Decision at the 0.05 significance level.
print(
    "Result: Reject the null hypothesis. There is a significant difference in claim severity across provinces."
    if p_value < 0.05
    else "Result: Fail to reject the null hypothesis. There is no significant difference in claim severity across provinces."
)


Kruskal-Wallis Test for Claim Severity by Province:
P-value: 2.4146571363417815e-19
Result: Reject the null hypothesis. There is a significant difference in claim severity across provinces.


In [11]:
# Test 2.1: Claim Frequency by Zip Code (Chi-squared)
# Contingency table: one row per postal code, one column per has_claim value.
# NOTE(review): `expected` is returned but never inspected here; with many
# postal codes some expected counts may be small - confirm the chi-squared
# approximation is adequate for this table.
zip_claim_freq = pd.crosstab(index=df['postalcode'], columns=df['has_claim'])
chi2, p_val_zip_freq, dof, expected = stats.chi2_contingency(zip_claim_freq)
print("\nChi-squared Test for Claim Frequency by Zip Code P-value: {}".format(p_val_zip_freq))


# Test 2.2: Claim Severity by Zip Code (Kruskal-Wallis)
zip_severity_groups = claims_df['totalclaims'].groupby(claims_df['postalcode']).apply(list)
# Keep only postal codes with at least two claims; a single observation has no variance.
zip_severity_groups_filtered = [
    claims for claims in zip_severity_groups.tolist() if len(claims) >= 2
]
h_stat, p_val_zip_sev = stats.kruskal(*zip_severity_groups_filtered)
print("Kruskal-Wallis Test for Claim Severity by Zip Code P-value: {}".format(p_val_zip_sev))


# Test 3: Margin by Zip Code (Kruskal-Wallis)
# Uses every row of df (not just claims); same "at least two observations" filter.
zip_margin_groups = df['margin'].groupby(df['postalcode']).apply(list)
zip_margin_groups_filtered = [
    margins for margins in zip_margin_groups.tolist() if len(margins) >= 2
]
h_stat, p_val_zip_margin = stats.kruskal(*zip_margin_groups_filtered)
print("Kruskal-Wallis Test for Margin by Zip Code P-value: {}".format(p_val_zip_margin))


Chi-squared Test for Claim Frequency by Zip Code P-value: 3.152172246339057e-30
Kruskal-Wallis Test for Claim Severity by Zip Code P-value: 1.3787406038888729e-08
Kruskal-Wallis Test for Margin by Zip Code P-value: 0.0


In [12]:
# Filter for only Male and Female for a clear comparison
# (rows with any other gender label are left out of both frames).
gender_df = df[df['gender'].isin(['Male', 'Female'])]
gender_claims_df = claims_df[claims_df['gender'].isin(['Male', 'Female'])]

# Test 4.1: Claim Frequency by Gender (Chi-squared)
# Contingency table: gender (rows) x has_claim (columns).
gender_claim_freq = pd.crosstab(gender_df['gender'], gender_df['has_claim'])
chi2, p_value, dof, expected = stats.chi2_contingency(gender_claim_freq)
print("\nChi-squared Test for Claim Frequency by Gender:")
print(f"P-value: {p_value}")

if p_value < 0.05:
    print("Result: Reject the null hypothesis. There is a significant difference in claim frequency between Men and Women.")
else:
    print("Result: Fail to reject the null hypothesis.")


# Test 4.2: Claim Severity by Gender
# Question: Do claim amounts tend to differ between Men and Women?
# Test: Mann-Whitney U test (non-parametric alternative to the t-test).
# It is rank-based: it compares the two distributions, not their means.
male_claims = gender_claims_df[gender_claims_df['gender'] == 'Male']['totalclaims']
female_claims = gender_claims_df[gender_claims_df['gender'] == 'Female']['totalclaims']

# alternative='two-sided' is stated explicitly: the hypothesis is "different",
# not "greater"/"less", and older SciPy releases used a different default
# that yields a one-sided p-value.
u_statistic, p_value = stats.mannwhitneyu(male_claims, female_claims, alternative='two-sided')
print("\nMann-Whitney U Test for Claim Severity by Gender:")
print(f"P-value: {p_value}")

if p_value < 0.05:
    print("Result: Reject the null hypothesis. There is a significant difference in claim severity between Men and Women.")
else:
    print("Result: Fail to reject the null hypothesis.")


Chi-squared Test for Claim Frequency by Gender:
P-value: 0.9514644755420456
Result: Fail to reject the null hypothesis.

Mann-Whitney U Test for Claim Severity by Gender:
P-value: 0.22351273500106295
Result: Fail to reject the null hypothesis.
