Load Independencies

In [3]:
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import chi2_contingency, f_oneway, kruskal, ttest_ind
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

Load CSV Dataset

In [4]:
# Load the data
df = pd.read_csv('../data/MachineLearningRating_v3.csv')

Data preproccessing

In [5]:
# Preprocessing
df['ClaimOccurred'] = df['TotalClaims'] > 0
df['Margin'] = df['TotalPremium'] - df['TotalClaims']

# Clean and convert PostalCode to string
df['PostalCode'] = df['PostalCode'].astype(str).str.split('.').str[0].str.strip()

# Filter valid genders for H4
gender_df = df[df['Gender'].isin(['Male', 'Female'])]

# Initialize results storage
results = []


Hypothesis 1: Risk differences across provinces

In [6]:
print("Hypothesis 1: Risk differences across provinces")
# Claim Frequency
freq_table = pd.crosstab(df['Province'], df['ClaimOccurred'])
chi2, p_freq_prov, dof, _ = chi2_contingency(freq_table)
results.append(('H1: Claim Frequency (Provinces)', p_freq_prov))
print(f"Claim Frequency (Provinces): p-value = {p_freq_prov:.6f}")

# Claim Severity
severity_groups = [group['TotalClaims'].values 
                  for name, group in df[df['ClaimOccurred']].groupby('Province') 
                  if len(group) >= 3]  # Require min 3 claims
if len(severity_groups) >= 3:  # Require at least 3 groups for Kruskal-Wallis
    h_stat, p_sev_prov = kruskal(*severity_groups)
    results.append(('H1: Claim Severity (Provinces)', p_sev_prov))
    print(f"Claim Severity (Provinces): p-value = {p_sev_prov:.6f}")
else:
    print("Insufficient data for Claim Severity test across provinces")

# Margin differences
margin_groups = [group['Margin'].values 
                for name, group in df.groupby('Province') 
                if len(group) >= 3]  # Require min 3 policies
if len(margin_groups) >= 3:
    h_stat, p_margin_prov = kruskal(*margin_groups)
    results.append(('H1: Margin (Provinces)', p_margin_prov))
    print(f"Margin (Provinces): p-value = {p_margin_prov:.6f}")
else:
    print("Insufficient data for Margin test across provinces")

print("\n" + "="*80 + "\n")

Hypothesis 1: Risk differences across provinces
Claim Frequency (Provinces): p-value = 0.000000
Claim Severity (Provinces): p-value = 0.000000
Margin (Provinces): p-value = 0.000000




Hypothesis 2: Risk differences between zip codes

In [7]:
# Hypothesis 2: Risk differences between zip codes
print("Hypothesis 2: Risk differences between zip codes")
# Filter zip codes with sufficient data (at least 5 policies each)
valid_zips = df['PostalCode'].value_counts()
valid_zips = valid_zips[valid_zips >= 5].index.tolist()
zip_filtered = df[df['PostalCode'].isin(valid_zips)]

if len(valid_zips) >= 2:
    # Claim Frequency
    freq_table_zip = pd.crosstab(zip_filtered['PostalCode'], zip_filtered['ClaimOccurred'])
    if freq_table_zip.size > 0:
        chi2, p_freq_zip, dof, _ = chi2_contingency(freq_table_zip)
        results.append(('H2: Claim Frequency (Zip Codes)', p_freq_zip))
        print(f"Claim Frequency (Zip Codes): p-value = {p_freq_zip:.6f}")
    
    # Claim Severity
    severity_zip = [group['TotalClaims'].values 
                   for name, group in zip_filtered[zip_filtered['ClaimOccurred']].groupby('PostalCode')
                   if len(group) >= 3]  # Require min 3 claims
    if len(severity_zip) >= 2:
        h_stat, p_sev_zip = kruskal(*severity_zip)
        results.append(('H2: Claim Severity (Zip Codes)', p_sev_zip))
        print(f"Claim Severity (Zip Codes): p-value = {p_sev_zip:.6f}")
    else:
        print("Insufficient data for Claim Severity test across zip codes")
else:
    print("Insufficient data for Zip Code tests")

print("\n" + "="*80 + "\n")

Hypothesis 2: Risk differences between zip codes
Claim Frequency (Zip Codes): p-value = 0.000000
Claim Severity (Zip Codes): p-value = 0.000000




Hypothesis 3: Margin differences between zip codes

In [8]:
# Hypothesis 3: Margin differences between zip codes
print("Hypothesis 3: Margin differences between zip codes")
if len(valid_zips) >= 2:
    margin_zip = [group['Margin'].values 
                 for name, group in zip_filtered.groupby('PostalCode') 
                 if len(group) >= 3]  # Require min 3 policies
    if len(margin_zip) >= 2:
        h_stat, p_margin_zip = kruskal(*margin_zip)
        results.append(('H3: Margin (Zip Codes)', p_margin_zip))
        print(f"Margin (Zip Codes): p-value = {p_margin_zip:.6f}")
    else:
        print("Insufficient data for Margin test across zip codes")
else:
    print("Insufficient data for Margin test across zip codes")

print("\n" + "="*80 + "\n")

Hypothesis 3: Margin differences between zip codes
Margin (Zip Codes): p-value = nan




Hypothesis 4: Risk differences between genders

In [9]:
print("Hypothesis 4: Risk differences between genders")
# Claim Frequency
freq_table_gender = pd.crosstab(gender_df['Gender'], gender_df['ClaimOccurred'])
if freq_table_gender.size > 0:
    chi2, p_freq_gender, dof, _ = chi2_contingency(freq_table_gender)
    results.append(('H4: Claim Frequency (Gender)', p_freq_gender))
    print(f"Claim Frequency (Gender): p-value = {p_freq_gender:.6f}")

# Claim Severity
male_claims = gender_df[(gender_df['Gender'] == 'Male') & gender_df['ClaimOccurred']]['TotalClaims']
female_claims = gender_df[(gender_df['Gender'] == 'Female') & gender_df['ClaimOccurred']]['TotalClaims']

if len(male_claims) >= 3 and len(female_claims) >= 3:
    # Check variance equality for t-test
    _, p_var = stats.levene(male_claims, female_claims)
    if p_var > 0.05:  # Equal variances
        t_stat, p_sev_gender = ttest_ind(male_claims, female_claims)
    else:  # Unequal variances
        t_stat, p_sev_gender = ttest_ind(male_claims, female_claims, equal_var=False)
    results.append(('H4: Claim Severity (Gender)', p_sev_gender))
    print(f"Claim Severity (Gender): p-value = {p_sev_gender:.6f}")
else:
    print("Insufficient data for Claim Severity test by gender")

print("\n" + "="*80 + "\n")

Hypothesis 4: Risk differences between genders
Claim Frequency (Gender): p-value = 0.951464
Claim Severity (Gender): p-value = 0.676016




In [10]:
# Results interpretation
print("Hypothesis Test Results:")
print("=" * 80)
for test_name, p_value in results:
    if p_value < 0.05:
        result = "REJECT NULL HYPOTHESIS"
        evidence = "Strong evidence of significant difference"
    else:
        result = "FAIL TO REJECT NULL HYPOTHESIS"
        evidence = "Insufficient evidence of significant difference"
    
    print(f"{test_name}:")
    print(f"- p-value = {p_value:.6f}")
    print(f"- Conclusion: {result}")
    print(f"- Interpretation: {evidence}")
    print("-" * 80)

# Business Implications Summary
print("\nBusiness Implications Summary:")
print("=" * 80)
print("1. Province Differences:")
print("   - Risk varies significantly across provinces" if any("H1" in name and p < 0.05 for name, p in results) 
      else "   - No significant risk differences across provinces")
print("   - Recommendation: Consider province-based pricing or risk segmentation")

print("\n2. ZIP Code Differences:")
print("   - Significant risk and margin variations exist between ZIP codes" if any(("H2" in name or "H3" in name) and p < 0.05 for name, p in results) 
      else "   - ZIP codes show no significant differences in risk or margin")
print("   - Recommendation: Implement granular geographic pricing strategies")

print("\n3. Gender Differences:")
print("   - Significant risk differences between genders exist" if any("H4" in name and p < 0.05 for name, p in results) 
      else "   - No significant risk differences between genders")
print("   - Recommendation: Gender should not be used as primary risk factor in pricing")
print("=" * 80)

Hypothesis Test Results:
H1: Claim Frequency (Provinces):
- p-value = 0.000000
- Conclusion: REJECT NULL HYPOTHESIS
- Interpretation: Strong evidence of significant difference
--------------------------------------------------------------------------------
H1: Claim Severity (Provinces):
- p-value = 0.000000
- Conclusion: REJECT NULL HYPOTHESIS
- Interpretation: Strong evidence of significant difference
--------------------------------------------------------------------------------
H1: Margin (Provinces):
- p-value = 0.000000
- Conclusion: REJECT NULL HYPOTHESIS
- Interpretation: Strong evidence of significant difference
--------------------------------------------------------------------------------
H2: Claim Frequency (Zip Codes):
- p-value = 0.000000
- Conclusion: REJECT NULL HYPOTHESIS
- Interpretation: Strong evidence of significant difference
--------------------------------------------------------------------------------
H2: Claim Severity (Zip Codes):
- p-value = 0.000000
- Co

Key Features of the Script:
Comprehensive Hypothesis Testing:

Tests all 4 hypotheses with appropriate statistical methods

Uses Chi-square for claim frequency comparisons

Uses Kruskal-Wallis for severity/margin comparisons across groups

Uses t-tests with Levene's test for variance checking

Robust Data Handling:

Cleans and preprocesses PostalCode data

Filters out small groups (min 3-5 observations) for reliable tests

Handles both categorical (frequency) and continuous (severity) data

Detailed Reporting:

Provides p-values for each test

Clearly states statistical conclusions (reject/fail to reject null)

Includes business implications and recommendations

Formats output for easy readability

Statistical Best Practices:

Uses non-parametric tests when normality assumptions may be violated

Checks variance equality before t-tests

Handles edge cases with insufficient data gracefully

How to Interpret the Results:
p-value < 0.05: Strong evidence against null hypothesis (significant difference exists)

p-value ≥ 0.05: Insufficient evidence against null hypothesis (no significant difference found)

The script outputs both statistical conclusions and business recommendations based on the findings, providing a complete analysis for segmentation strategy development