In [2]:
# Import standard libraries
import pandas as pd
import numpy as np
import sys

# Statistical tests used in the hypothesis-testing cells
from scipy.stats import chi2_contingency, f_oneway, ttest_ind

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


sys.path.append('../src') 

from utils.data_loader import load_data



In [3]:
# Read the cleaned dataset through the project's loader helper.
CLEANED_DATA_PATH = "../data/processed/cleaned_dataset.csv"
df = load_data(CLEANED_DATA_PATH, delimiter=",")

Data loaded successfully from ../data/processed/cleaned_dataset.csv


In [4]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() adds a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999472 entries, 0 to 999471
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   UnderwrittenCoverID       999472 non-null  int64  
 1   PolicyID                  999472 non-null  int64  
 2   TransactionMonth          999472 non-null  object 
 3   IsVATRegistered           999472 non-null  bool   
 4   Citizenship               999472 non-null  object 
 5   LegalType                 999472 non-null  object 
 6   Title                     999472 non-null  object 
 7   Language                  999472 non-null  object 
 8   Bank                      999472 non-null  object 
 9   AccountType               999472 non-null  object 
 10  MaritalStatus             999472 non-null  object 
 11  Gender                    999472 non-null  object 
 12  Country                   999472 non-null  object 
 13  Province                  999472 non-null  o

 Claim Frequency
Proportion of policy records (rows of the dataset) that had at least one claim, i.e. `TotalClaims > 0`.

In [5]:
# Flag every row whose TotalClaims value is strictly positive.
df['HadClaim'] = df['TotalClaims'].gt(0)

# Share of flagged rows within each province, highest first.
# NOTE(review): the mean is taken over rows of df; whether one row corresponds
# to exactly one policy is not visible in this cell - confirm.
claim_frequency = (
    df.groupby('Province')['HadClaim']
    .mean()
    .sort_values(ascending=False)
)

print(claim_frequency)


Province
Gauteng          0.003359
KwaZulu-Natal    0.002845
Limpopo          0.002698
North West       0.002436
Mpumalanga       0.002432
Western Cape     0.002094
Eastern Cape     0.001648
Free State       0.001358
Northern Cape    0.001254
Name: HadClaim, dtype: float64


 Claim Severity
Average claim amount given that a claim occurred.

In [6]:
# Mean TotalClaims per province, computed only over rows with a positive claim.
rows_with_claim = df.loc[df['TotalClaims'] > 0]
claim_severity = rows_with_claim.groupby('Province')['TotalClaims'].mean()
claim_severity

Province
Eastern Cape     27128.533277
Free State       32265.661085
Gauteng          22243.878396
KwaZulu-Natal    29609.487473
Limpopo          15171.294187
Mpumalanga       15979.553421
North West       16963.467035
Northern Cape    11186.313596
Western Cape     27559.603588
Name: TotalClaims, dtype: float64

Margin
Profit per policy = TotalPremium - TotalClaims

In [7]:
# Row-level margin: premium minus claims, stored as a new column on df.
df['Margin'] = df['TotalPremium'].sub(df['TotalClaims'])
df['Margin']

0          21.929825
1          21.929825
2           0.000000
3         512.848070
4           0.000000
             ...    
999467    347.235175
999468    347.235175
999469    347.235175
999470      2.315000
999471      2.315000
Name: Margin, Length: 999472, dtype: float64

Data Segmentation and Statistical  Testing

1.	H₀:There are no risk differences across provinces 

In [8]:
# One-way ANOVA on claim severity (TotalClaims) across provinces.
# `f_oneway` comes from the notebook's import cell.

# Severity is only meaningful for rows flagged as having a claim.
claimed_df = df[df['HadClaim']]

# One sample of claim amounts per province; there are more than two groups,
# hence ANOVA rather than a two-sample test.
groups = [group['TotalClaims'] for _, group in claimed_df.groupby('Province')]
f_stat, p = f_oneway(*groups)

print(f"ANOVA P-Value: {p}")
if np.isnan(p):
    # NaN compares False with 0.05, so without this branch an undefined test
    # would be reported as "Fail to reject H₀".
    print("ANOVA could not be computed (p-value is NaN); no conclusion drawn.")
elif p < 0.05:
    print("Reject H₀: Provinces differ significantly in claim severity.")
else:
    print("Fail to reject H₀.")


ANOVA P-Value: 1.0022718272355673e-05
Reject H₀: Provinces differ significantly in claim severity.


2.	H₀:There are no risk differences between zip codes 

In [9]:
# One-way ANOVA on claim severity (TotalClaims) across postal codes.
# `f_oneway` comes from the notebook's import cell.

# Severity is only meaningful for rows flagged as having a claim.
claimed_df = df[df['HadClaim']]

# One sample of claim amounts per postal code (many groups, hence ANOVA).
# NOTE(review): individual postal codes may contribute very few claim rows,
# which weakens the test - confirm group sizes before relying on the result.
groups = [group['TotalClaims'] for _, group in claimed_df.groupby('PostalCode')]
f_stat, p = f_oneway(*groups)

print(f"ANOVA P-Value: {p}")
if np.isnan(p):
    # NaN compares False with 0.05, so without this branch an undefined test
    # would be reported as "Fail to reject H₀".
    print("ANOVA could not be computed (p-value is NaN); no conclusion drawn.")
elif p < 0.05:
    print("Reject H₀: PostalCodes differ significantly in claim severity.")
else:
    print("Fail to reject H₀.")


ANOVA P-Value: 0.029871120348189457
Reject H₀: PostalCodes differ significantly in claim severity.


3.	H₀: There is no significant margin (profit) difference between zip codes

In [14]:
# Two-sample t-test on Margin between the two postal codes with the most rows.
# `ttest_ind` comes from the notebook's import cell.
# NOTE: only two postal codes are compared, so the result cannot by itself
# support a conclusion about margin differences across all postal codes.
top_postal_codes = df['PostalCode'].value_counts().nlargest(2).index.tolist()

# Margin samples for each of the two postal codes
group_a = df.loc[df['PostalCode'] == top_postal_codes[0], 'Margin']
group_b = df.loc[df['PostalCode'] == top_postal_codes[1], 'Margin']

# equal_var=False: do not assume the two samples share a variance (Welch's test).
t_stat, p = ttest_ind(group_a, group_b, equal_var=False)

print(f"T-test P-Value: {p}")
if np.isnan(p):
    # NaN compares False with 0.05, so without this branch an undefined test
    # would be reported as a failure to reject the null hypothesis.
    print("T-test could not be computed (p-value is NaN); no conclusion drawn.")
elif p < 0.05:
    print(f"Reject H₀: Margin differs between postal codes {top_postal_codes[0]} and {top_postal_codes[1]}.")
else:
    print(f"Fail to reject null hypothesis for postal codes {top_postal_codes[0]} and {top_postal_codes[1]}.")


T-test P-Value: 0.24623805807451107
Fail to reject null hypothesis for postal codes 2000 and 122.


In [15]:
# Compare the average margin of the two postal-code samples built earlier.
code_a, code_b = top_postal_codes[0], top_postal_codes[1]
mean_a = group_a.mean()
mean_b = group_b.mean()

print(f"Mean Margin for Postal Code {code_a}: {mean_a}")
print(f"Mean Margin for Postal Code {code_b}: {mean_b}")

# Pick the message first, then print once.
if mean_a > mean_b:
    verdict = f"Postal Code {code_a} has a higher average margin."
elif mean_b > mean_a:
    verdict = f"Postal Code {code_b} has a higher average margin."
else:
    verdict = "Both postal codes have the same average margin."
print(verdict)


Mean Margin for Postal Code 2000: -8.162335926182381
Mean Margin for Postal Code 122: -22.859806159734156
Postal Code 2000 has a higher average margin.


● Interpretation & Business Recommendation:
We fail to reject the null hypothesis for margin differences between postal codes 2000 and 122 (p = 0.25). Although postal code 2000 shows a higher average margin (-8.16) compared to postal code 122 (-22.86), this difference is not statistically significant. Therefore, no postal code–based margin adjustment is currently justified. However, the trend suggests further analysis with more data could be beneficial for future regional pricing strategies.


4.	H₀: There is no significant risk difference between Women and Men

In [12]:
# Chi-square tests of claim frequency (HadClaim) against Gender.
# `chi2_contingency` comes from the notebook's import cell.

# Contingency table over every Gender category present in the data.
contingency = pd.crosstab(df['Gender'], df['HadClaim'])

# Chi-square test across all Gender categories
chi2_stat, p_gender, dof, expected = chi2_contingency(contingency)

print(f"Chi-square Test (Gender) P-Value: {p_gender}")
if p_gender < 0.05:
    print("Reject H₀: Claim frequency differs by Gender.")
else:
    print("Fail to reject H₀: No significant gender-based risk difference.")

# Show proportions
gender_claim_freq = df.groupby('Gender')['HadClaim'].mean()
print("\nClaim Frequency by Gender:")
print(gender_claim_freq)

# The stated hypothesis is about women versus men, but the table above also
# contains every other Gender category, so its p-value does not isolate that
# comparison. Repeat the test on the Female and Male rows only.
female_male_mask = df['Gender'].isin(['Female', 'Male'])
female_male_table = pd.crosstab(
    df.loc[female_male_mask, 'Gender'],
    df.loc[female_male_mask, 'HadClaim'],
)

print()
if female_male_table.shape == (2, 2):
    # For a 2x2 table chi2_contingency applies Yates' continuity correction
    # by default.
    _, p_female_male, _, _ = chi2_contingency(female_male_table)
    print(f"Chi-square Test (Female vs Male only) P-Value: {p_female_male}")
    if p_female_male < 0.05:
        print("Reject H₀: Claim frequency differs between Female and Male.")
    else:
        print("Fail to reject H₀: No significant Female vs Male difference.")
else:
    # One of the genders or one of the claim outcomes is absent, so a 2x2
    # test cannot be formed.
    print("Female vs Male test skipped: a 2x2 table could not be formed.")


Chi-square Test (Gender) P-Value: 0.0339731936612422
Reject H₀: Claim frequency differs by Gender.

Claim Frequency by Gender:
Gender
Female           0.002073
Male             0.002195
Not specified    0.002808
Name: HadClaim, dtype: float64


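Interpretation & Business Recommendation: We reject the null hypothesis for gender (p = 0.034). Claim frequency differs slightly, with the male (0.22%) and not-specified (0.28%) groups showing higher claim rates than the female group (0.21%). Note, however, that this p-value comes from a test over all three categories, including "Not specified", which has the highest claim rate; a conclusion about women versus men specifically requires a test restricted to those two groups. Subject to that check, gender-based risk differentiation could be considered when setting premiums to improve pricing accuracy.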