# In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency, ttest_ind
import matplotlib.pyplot as plt

# Load data
# low_memory=False asks pandas to infer column dtypes from the whole file
# rather than chunk by chunk.
data = pd.read_csv('../data/cleaned_data.csv', low_memory=False)

# Data Preparation
# Derive a binary 'Claimed' flag: 1 when 'TotalClaims' is positive, else 0.
# The comparison is vectorized; a missing 'TotalClaims' compares as False and
# therefore maps to 0.
data['Claimed'] = (data['TotalClaims'] > 0).astype(int)

# Report the number of missing values per column
print("Missing values in columns:\n", data.isnull().sum())

# Drop rows with missing values in critical columns for testing
data_cleaned = data.dropna(subset=['PostalCode', 'TotalPremium', 'Gender'])

# Check cleaned data
print(data_cleaned.head())
# DataFrame.info() writes its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
data_cleaned.info()

# Hypothesis 1: Risk Differences Across Provinces

def test_risk_across_provinces(data):
    # Create a contingency table
    contingency_table = pd.crosstab(data['Province'], data['Claimed'])
    # Perform Chi-square test
    chi2_stat, p_val, _, _ = chi2_contingency(contingency_table)
    return chi2_stat, p_val

# Run the province-level chi-square test and report its statistic and p-value
province_outcome = test_risk_across_provinces(data_cleaned)
chi2_stat_prov, p_val_prov = province_outcome
print(f"Chi-squared statistic (Provinces): {chi2_stat_prov}, P-value: {p_val_prov}")

# Hypothesis 2: Risk Differences Between Zip Codes

def test_risk_between_zipcodes(data, zip1, zip2):
    group1 = data[data['PostalCode'] == zip1]['Claimed']
    group2 = data[data['PostalCode'] == zip2]['Claimed']
    
    # Check if both groups have data
    if group1.empty or group2.empty:
        return np.nan, np.nan
    
    # Perform T-test
    t_stat, p_val = ttest_ind(group1, group2, equal_var=False)
    return t_stat, p_val

# Example pair of postal codes used for the zip-code comparisons
zip1, zip2 = 1000, 2000

# Compare the claim flag between the two postal codes and report the result
zip_risk_outcome = test_risk_between_zipcodes(data_cleaned, zip1, zip2)
t_stat_zip, p_val_zip = zip_risk_outcome
print(f"T-statistic (Zip Codes): {t_stat_zip}, P-value: {p_val_zip}")

# Hypothesis 3: Margin (Profit) Differences Between Zip Codes

def test_margin_between_zipcodes(data, zip1, zip2):
    group1 = data[data['PostalCode'] == zip1]['TotalPremium']
    group2 = data[data['PostalCode'] == zip2]['TotalPremium']
    
    # Check if both groups have data
    if group1.empty or group2.empty:
        return np.nan, np.nan
    
    # Perform T-test
    t_stat, p_val = ttest_ind(group1, group2, equal_var=False)
    return t_stat, p_val

# Run the margin comparison for the zip1/zip2 pair and report the result
margin_outcome = test_margin_between_zipcodes(data_cleaned, zip1, zip2)
t_stat_margin_zip, p_val_margin_zip = margin_outcome
print(f"T-statistic (Margin between Zip Codes): {t_stat_margin_zip}, P-value: {p_val_margin_zip}")

# Hypothesis 4: Risk Differences Between Women and Men

def test_risk_gender(data):
    group1 = data[data['Gender'] == 'Male']['Claimed']
    group2 = data[data['Gender'] == 'Female']['Claimed']
    
    # Check if both groups have data
    if group1.empty or group2.empty:
        return np.nan, np.nan
    
    # Perform T-test
    t_stat, p_val = ttest_ind(group1, group2, equal_var=False)
    return t_stat, p_val

# Run the gender comparison on the cleaned data and report the result
gender_outcome = test_risk_gender(data_cleaned)
t_stat_gender, p_val_gender = gender_outcome
print(f"T-statistic (Gender): {t_stat_gender}, P-value: {p_val_gender}")

# Interpretation
def interpret_results(p_val, alpha=0.05):
    """Turn a p-value into a one-line hypothesis-test conclusion.

    Args:
        p_val: P-value of the test; NaN or None means the test produced no
            result (for example because one of the groups was empty).
        alpha: Significance level; the null hypothesis is rejected when
            ``p_val < alpha``.

    Returns:
        A message stating whether the null hypothesis is rejected, not
        rejected, or could not be evaluated.
    """
    # A missing p-value means no test was run; reporting "no significant
    # difference" would wrongly present that as a finding.
    if pd.isna(p_val):
        return "Test could not be performed: insufficient data to draw a conclusion."
    if p_val < alpha:
        return "Reject the null hypothesis: Significant difference detected."
    return "Fail to reject the null hypothesis: No significant difference detected."

# Print one interpretation line per hypothesis, in the order the tests were run
for label, p_value in (
    ("Interpretation for Risk Differences Across Provinces:", p_val_prov),
    ("Interpretation for Risk Differences Between Zip Codes:", p_val_zip),
    ("Interpretation for Margin Differences Between Zip Codes:", p_val_margin_zip),
    ("Interpretation for Risk Differences Between Genders:", p_val_gender),
):
    print(label, interpret_results(p_value))

# Visualization of distributions (optional)
# One figure with two side-by-side panels
fig, (claims_ax, premium_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: number of rows for each value of the 'Claimed' flag
data_cleaned['Claimed'].value_counts().plot(kind='bar', ax=claims_ax)
claims_ax.set_title('Distribution of Claims')
claims_ax.set_xlabel('Claimed')
claims_ax.set_ylabel('Frequency')

# Right panel: histogram of 'TotalPremium' over 30 bins
data_cleaned['TotalPremium'].plot(kind='hist', bins=30, edgecolor='black', ax=premium_ax)
premium_ax.set_title('Distribution of Total Premium')
premium_ax.set_xlabel('Total Premium')

fig.tight_layout()
plt.show()


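# Recorded cell output:
# FileNotFoundError: [Errno 2] No such file or directory: 'path_to_your_data.csv'
# NOTE(review): the path in this message differs from the one passed to
# pd.read_csv above, so this output appears to come from an earlier run - confirm.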