# B5W3: End-to-End Insurance Risk Analytics & Predictive Modeling
## Task 3
    - A/B Hypothesis Testing

In [1]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
from scipy import stats

sns.set_style('whitegrid')

In [2]:
sys.path.append(os.path.abspath(os.path.join('..')))

In [6]:
df  = pd.read_csv('../data/MachineLearningRating_v3.csv', low_memory=False )

In [7]:
from scripts.data_converter import DataConverter

In [8]:
input_file_path = os.path.join('B5W3_End_to_End_Insurance_Risk_Analytics_Predictive_Modeling', 'data', 'MachineLearningRating_v3.txt')
output_file_path = os.path.join('B5W3_End_to_End_Insurance_Risk_Analytics_Predictive_Modeling', 'data', 'MachineLearningRating_v3.csv')

# Create an instance (an object) of the DataConverter class
# This calls the __init__ method of the class, initializing the object
file_converter = DataConverter(input_file_path, output_file_path)

In [9]:
df = file_converter.clean_data_types(df)


--- Starting Data Type Cleaning ---
Converting date columns...
  - 'TransactionMonth' converted to datetime.
  - 'VehicleIntroDate' converted to datetime.

Converting numerical columns from object to float...


  df[col] = pd.to_datetime(df[col], errors='coerce')


  - 'CapitalOutstanding' converted to float.
  - 'ExcessSelected' converted to float.

Converting count/binary columns...
  - 'Cylinders' converted to nullable integer (Int64).
  - 'NumberOfDoors' converted to nullable integer (Int64).
  - 'mmcode' converted to nullable integer (Int64).
  - 'RegistrationYear' converted to nullable integer (Int64).
  - 'PostalCode' converted to nullable integer (Int64).
  - 'AlarmImmobiliser' converted to binary integer (Int64).
  - 'TrackingDevice' converted to binary integer (Int64).
  - 'NewVehicle' converted to binary integer (Int64).
  - 'WrittenOff' converted to binary integer (Int64).
  - 'Rebuilt' converted to binary integer (Int64).
  - 'Converted' converted to binary integer (Int64).
  - 'CrossBorder' converted to binary integer (Int64).
  - 'IsVATRegistered' converted to binary integer (Int64).

Converting object columns to category...
  - 'Citizenship' converted to category.
  - 'LegalType' converted to category.
  - 'Title' converted to cat

## A/B Hypothesis Testing

### Select and create matrics

In [20]:
# 1. Claim Occurred (Binary) for Claim Frequency
# Create a new column 'ClaimOccurred' (1 if a claim was made, 0 otherwise)
df['ClaimOccurred'] = np.where(df['TotalClaims'] > 0, 1, 0)

# 2. Claim Severity (Requires filtering for claims)
# Create a DataFrame containing only policies with claims
claims_df = df[df['ClaimOccurred'] == 1].copy()

# 3. Margin (Profit)
df['Margin'] = df['TotalPremium'] - df['TotalClaims']

print("Metrics 'ClaimOccurred' and 'Margin' have been created in the DataFrame.")
print(f"Number of policies with claims: {claims_df.shape[0]}")

Metrics 'ClaimOccurred' and 'Margin' have been created in the DataFrame.
Number of policies with claims: 2788


#### Statistical Testing & Analysis

#### Hypothesis 1: H0 There are no risk differences across provinces

A. Claim Frequency by Province (Categorical Data)

- Test: Chi-squared Test of Independence. This is used to compare proportions across multiple groups.

- Analysis: We'll check if the proportion of policies with claims is independent of the province.

In [21]:
# Create a contingency table of ClaimOccurred vs. Province
contingency_table = pd.crosstab(df['Province'], df['ClaimOccurred'])

# Conduct the Chi-squared test
chi2, p_value_freq_province, dof, expected = stats.chi2_contingency(contingency_table)

print(f"Claim Frequency by Province:")
print(f"Chi-squared statistic: {chi2:.2f}")
print(f"P-value: {p_value_freq_province:.4f}")

# Analyze the p-value
if p_value_freq_province < 0.05:
    print("Conclusion: Reject the Null Hypothesis. There is a statistically significant difference in Claim Frequency across provinces.")
else:
    print("Conclusion: Fail to reject the Null Hypothesis. Claim Frequency does not significantly differ by province.")

Claim Frequency by Province:
Chi-squared statistic: 104.19
P-value: 0.0000
Conclusion: Reject the Null Hypothesis. There is a statistically significant difference in Claim Frequency across provinces.


#### B. Claim Severity by Province (Numerical Data)
    - Test: ANOVA (Analysis of Variance). This is used to compare the means of a numerical variable across three or more groups.
    - Analysis: We'll check if the average claim amount is the same across all provinces.

In [22]:
# Create a list of claim amounts for each province
claim_severities_by_province = [group['TotalClaims'] for name, group in claims_df.groupby('Province')]

# Conduct the ANOVA test
f_stat_severity_province, p_value_severity_province = stats.f_oneway(*claim_severities_by_province)

print(f"\nClaim Severity by Province:")
print(f"F-statistic: {f_stat_severity_province:.2f}")
print(f"P-value: {p_value_severity_province:.4f}")

# Analyze the p-value
if p_value_severity_province < 0.05:
    print("Conclusion: Reject the Null Hypothesis. There is a statistically significant difference in Claim Severity across provinces.")
else:
    print("Conclusion: Fail to reject the Null Hypothesis. Claim Severity does not significantly differ by province.")


Claim Severity by Province:
F-statistic: 4.83
P-value: 0.0000
Conclusion: Reject the Null Hypothesis. There is a statistically significant difference in Claim Severity across provinces.


  claim_severities_by_province = [group['TotalClaims'] for name, group in claims_df.groupby('Province')]


#### Hypothesis 2: H0: There are no risk differences between zip codes

#### A . Claim Frequency by Zip Code (Categorical Data)
    - Test: Chi-squared Test of Independence.
    - Analysis: Compare the proportion of claims between the two chosen zip codes.



In [None]:
# Select the two zip codes for A/B testing
zip_A = df['PostalCode'][10]
zip_B = df['PostalCode'][120]

# Create a contingency table for the two zip codes
zip_subset = df[df['PostalCode'].isin([zip_A, zip_B])]
contingency_table_zip = pd.crosstab(zip_subset['PostalCode'], zip_subset['ClaimOccurred'])
# Conduct the Chi-squared test
chi2_zip_freq, p_value_zip_freq, dof_zip, expected_zip = stats.chi2_contingency(contingency_table_zip)

print(f"\nClaim Frequency between Zip Codes {zip_A} and {zip_B}:")
print(f"Chi-squared statistic: {chi2_zip_freq:.2f}")
print(f"P-value: {p_value_zip_freq:.4f}")

if p_value_zip_freq < 0.05:
    print(f"Conclusion: Reject the Null Hypothesis. There is a significant difference in Claim Frequency between {zip_A} and {zip_B}.")
else:
    print(f"Conclusion: Fail to reject the Null Hypothesis. Claim Frequency does not significantly differ between {zip_A} and {zip_B}.")

ClaimOccurred    0
PostalCode        
1459           622
1513            81

Claim Frequency between Zip Codes 1459 and 1513:
Chi-squared statistic: 0.00
P-value: 1.0000
Conclusion: Fail to reject the Null Hypothesis. Claim Frequency does not significantly differ between 1459 and 1513.


#### B. Claim Severity by Zip Code (Numerical Data)
    - Test: Independent Samples t-test.
    - Analysis: Compare the average claim amount between the two zip codes.

In [28]:
# Get claim amounts for the two zip codes, only for policies with claims
claims_zip_A = claims_df[claims_df['PostalCode'] == zip_A]['TotalClaims']
claims_zip_B = claims_df[claims_df['PostalCode'] == zip_B]['TotalClaims']

# Conduct the t-test
t_stat_zip_severity, p_value_zip_severity = stats.ttest_ind(claims_zip_A, claims_zip_B, equal_var=False, nan_policy='omit')

print(f"\nClaim Severity between Zip Codes {zip_A} and {zip_B}:")
print(f"T-statistic: {t_stat_zip_severity:.2f}")
print(f"P-value: {p_value_zip_severity:.4f}")

if p_value_zip_severity < 0.05:
    print(f"Conclusion: Reject the Null Hypothesis. There is a significant difference in Claim Severity between {zip_A} and {zip_B}.")
else:
    print(f"Conclusion: Fail to reject the Null Hypothesis. Claim Severity does not significantly differ between {zip_A} and {zip_B}.")


Claim Severity between Zip Codes 1459 and 1513:
T-statistic: nan
P-value: nan
Conclusion: Fail to reject the Null Hypothesis. Claim Severity does not significantly differ between 1459 and 1513.


  return f(*args, **kwargs)


#### Hypothesis 3: H0: There are no significant margin (profit) difference between zip codes
- Test: Independent Samples t-test.
- Analysis: Compare the average margin between the same two zip codes.

In [25]:
# Get margin data for the two zip codes
margin_zip_A = df[df['PostalCode'] == zip_A]['Margin']
margin_zip_B = df[df['PostalCode'] == zip_B]['Margin']

# Conduct the t-test
t_stat_zip_margin, p_value_zip_margin = stats.ttest_ind(margin_zip_A, margin_zip_B, equal_var=False, nan_policy='omit')

print(f"\nMargin between Zip Codes {zip_A} and {zip_B}:")
print(f"T-statistic: {t_stat_zip_margin:.2f}")
print(f"P-value: {p_value_zip_margin:.4f}")

if p_value_zip_margin < 0.05:
    print(f"Conclusion: Reject the Null Hypothesis. There is a significant difference in Margin between {zip_A} and {zip_B}.")
else:
    print(f"Conclusion: Fail to reject the Null Hypothesis. Margin does not significantly differ between {zip_A} and {zip_B}.")


Margin between Zip Codes 1459 and 1513:
T-statistic: -0.44
P-value: 0.6630
Conclusion: Fail to reject the Null Hypothesis. Margin does not significantly differ between 1459 and 1513.


#### Hypothesis 4: H0: There are not significant risk difference between Women and Men
- Note: We will only compare Male and Female categories and exclude 'Not specified' to ensure a clean A/B test.
- A. Claim Frequency between Genders (Categorical Data)
    - Test: Chi-squared Test of Independence.
    - Analysis: Compare the proportion of claims between male and female policyholders.

In [26]:
# Filter data for Male and Female only
gender_subset = df[df['Gender'].isin(['Male', 'Female'])]

# Create a contingency table
contingency_table_gender = pd.crosstab(gender_subset['Gender'], gender_subset['ClaimOccurred'])

# Conduct the Chi-squared test
chi2_gender_freq, p_value_gender_freq, dof_gender, expected_gender = stats.chi2_contingency(contingency_table_gender)

print(f"\nClaim Frequency between Men and Women:")
print(f"Chi-squared statistic: {chi2_gender_freq:.2f}")
print(f"P-value: {p_value_gender_freq:.4f}")

if p_value_gender_freq < 0.05:
    print("Conclusion: Reject the Null Hypothesis. There is a significant difference in Claim Frequency between Men and Women.")
else:
    print("Conclusion: Fail to reject the Null Hypothesis. Claim Frequency does not significantly differ between Men and Women.")


Claim Frequency between Men and Women:
Chi-squared statistic: 0.00
P-value: 0.9515
Conclusion: Fail to reject the Null Hypothesis. Claim Frequency does not significantly differ between Men and Women.


#### B. Claim Severity between Genders (Numerical Data)
    - Test: Independent Samples t-test.
    - Analysis: Compare the average claim amount between male and female policyholders.

In [27]:
# Get claim amounts for Male and Female, only for policies with claims
claims_gender_subset = claims_df[claims_df['Gender'].isin(['Male', 'Female'])]
claims_male = claims_gender_subset[claims_gender_subset['Gender'] == 'Male']['TotalClaims']
claims_female = claims_gender_subset[claims_gender_subset['Gender'] == 'Female']['TotalClaims']

# Conduct the t-test
t_stat_gender_severity, p_value_gender_severity = stats.ttest_ind(claims_male, claims_female, equal_var=False, nan_policy='omit')

print(f"\nClaim Severity between Men and Women:")
print(f"T-statistic: {t_stat_gender_severity:.2f}")
print(f"P-value: {p_value_gender_severity:.4f}")

if p_value_gender_severity < 0.05:
    print("Conclusion: Reject the Null Hypothesis. There is a significant difference in Claim Severity between Men and Women.")
else:
    print("Conclusion: Fail to reject the Null Hypothesis. Claim Severity does not significantly differ between Men and Women.")


Claim Severity between Men and Women:
T-statistic: -0.58
P-value: 0.5680
Conclusion: Fail to reject the Null Hypothesis. Claim Severity does not significantly differ between Men and Women.
