In [1]:
# Import standard libraries
import pandas as pd
import numpy as np
import sys

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


sys.path.append('../src') 

from utils.data_loader import load_data



In [6]:
# Read the cleaned, comma-delimited dataset through the project's loader
# (load_data is imported from utils.data_loader in the import cell).
df = load_data(
    "../data/processed/cleaned_dataset.csv",
    delimiter=",",
)

Data loaded successfully from ../data/processed/cleaned_dataset.csv


In [7]:
# DataFrame.info() writes its summary straight to stdout and returns None, so
# it is called bare; wrapping it in print() appended a stray "None" line
# after the summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999472 entries, 0 to 999471
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   UnderwrittenCoverID       999472 non-null  int64  
 1   PolicyID                  999472 non-null  int64  
 2   TransactionMonth          999472 non-null  object 
 3   IsVATRegistered           999472 non-null  bool   
 4   Citizenship               999472 non-null  object 
 5   LegalType                 999472 non-null  object 
 6   Title                     999472 non-null  object 
 7   Language                  999472 non-null  object 
 8   Bank                      999472 non-null  object 
 9   AccountType               999472 non-null  object 
 10  MaritalStatus             999472 non-null  object 
 11  Gender                    999472 non-null  object 
 12  Country                   999472 non-null  object 
 13  Province                  999472 non-null  o

 Claim Frequency
Proportion of policies that had at least one claim.

In [8]:
# Binary indicator: True on every row whose TotalClaims is strictly positive.
df['HadClaim'] = df['TotalClaims'].gt(0)

# Example: claim frequency by Province. The mean of a boolean column is the
# share of True values; provinces are listed from highest share to lowest.
claim_frequency = (
    df.groupby('Province')['HadClaim']
    .mean()
    .sort_values(ascending=False)
)

print(claim_frequency)


Province
Gauteng          0.003359
KwaZulu-Natal    0.002845
Limpopo          0.002698
North West       0.002436
Mpumalanga       0.002432
Western Cape     0.002094
Eastern Cape     0.001648
Free State       0.001358
Northern Cape    0.001254
Name: HadClaim, dtype: float64


 Claim Severity
Average claim amount given that a claim occurred.

In [10]:
# Severity is conditional on a claim: keep only rows with a positive
# TotalClaims, then average that amount within each Province.
has_claim = df['TotalClaims'] > 0
claim_severity = df.loc[has_claim].groupby('Province')['TotalClaims'].mean()
claim_severity

Province
Eastern Cape     27128.533277
Free State       32265.661085
Gauteng          22243.878396
KwaZulu-Natal    29609.487473
Limpopo          15171.294187
Mpumalanga       15979.553421
North West       16963.467035
Northern Cape    11186.313596
Western Cape     27559.603588
Name: TotalClaims, dtype: float64

Margin
Profit per policy = TotalPremium - TotalClaims

In [11]:
# Row-wise margin: TotalPremium minus TotalClaims, stored as a new column.
df['Margin'] = df['TotalPremium'].sub(df['TotalClaims'])
df['Margin']

0          21.929825
1          21.929825
2           0.000000
3         512.848070
4           0.000000
             ...    
999467    347.235175
999468    347.235175
999469    347.235175
999470      2.315000
999471      2.315000
Name: Margin, Length: 999472, dtype: float64

Data Segmentation and Statistical Testing

1.	H₀: There are no risk differences across provinces.

In [18]:
from scipy.stats import f_oneway

# Claim severity is only defined where a claim exists, so restrict the frame
# to rows flagged by HadClaim.
claimed_df = df.loc[df['HadClaim']]

# One sample of claim amounts per Province; one-way ANOVA compares the means
# of more than two groups in a single test.
groups = [amounts for _, amounts in claimed_df.groupby('Province')['TotalClaims']]
f_stat, p = f_oneway(*groups)

print(f"ANOVA P-Value: {p}")
# Decide at the 5% significance level.
verdict = (
    "Reject H₀: Provinces differ significantly in claim severity."
    if p < 0.05
    else "Fail to reject H₀."
)
print(verdict)


ANOVA P-Value: 1.0022718272355673e-05
Reject H₀: Provinces differ significantly in claim severity.


2.	H₀: There are no risk differences between zip codes.

In [19]:
from scipy.stats import f_oneway

# Claim severity is only defined where a claim exists, so restrict the frame
# to rows flagged by HadClaim.
claimed_df = df.loc[df['HadClaim']]

# One sample of claim amounts per PostalCode; one-way ANOVA is used because
# there are more than two postal codes to compare.
groups = [amounts for _, amounts in claimed_df.groupby('PostalCode')['TotalClaims']]
f_stat, p = f_oneway(*groups)

print(f"ANOVA P-Value: {p}")
# Decide at the 5% significance level.
verdict = (
    "Reject H₀: PostalCodes differ significantly in claim severity."
    if p < 0.05
    else "Fail to reject H₀."
)
print(verdict)


ANOVA P-Value: 0.029871120348189457
Reject H₀: PostalCodes differ significantly in claim severity.


3.	H₀: There is no significant margin (profit) difference between zip codes.

In [22]:
from scipy.stats import ttest_ind

# The pair of postal codes being compared.
# NOTE(review): the stated H₀ is about zip codes in general, but only this one
# pair is tested — confirm why these two codes were chosen.
POSTAL_CODE_A = 1459
POSTAL_CODE_B = 1513

# Recomputed here so this cell does not rely on an earlier cell having run.
df['Margin'] = df['TotalPremium'] - df['TotalClaims']

# Margin sample for each postal code. Missing values are dropped because
# ttest_ind's default nan_policy ('propagate') would return a NaN p-value.
group_a = df.loc[df['PostalCode'] == POSTAL_CODE_A, 'Margin'].dropna()
group_b = df.loc[df['PostalCode'] == POSTAL_CODE_B, 'Margin'].dropna()

# Guard against undersized samples: with fewer than two values the p-value is
# NaN, `p < 0.05` evaluates to False, and the cell would report "Fail to
# reject" for a test that never actually ran.
for code, sample in ((POSTAL_CODE_A, group_a), (POSTAL_CODE_B, group_b)):
    if len(sample) < 2:
        raise ValueError(
            f"PostalCode {code} has {len(sample)} usable Margin value(s); "
            "at least 2 are required for a t-test."
        )

# equal_var=False selects Welch's t-test, which does not assume that both
# groups share the same variance.
t_stat, p = ttest_ind(group_a, group_b, equal_var=False)

print(f"T-test P-Value: {p}")
if p < 0.05:
    print("Reject H₀: Margin differs between postal codes.")
else:
    print("Fail to reject null hypothesis.")


T-test P-Value: 0.6630316429729602
Fail to reject null hypothesis.


4.	H₀: There is no significant risk difference between women and men.

In [20]:
from scipy.stats import chi2_contingency

# Cross-tabulate Gender against the HadClaim flag: one row per Gender value,
# one column per flag value, each cell holding a row count.
# NOTE(review): every value present in the Gender column enters the table; if
# it holds more than the two categories named in H₀, confirm whether the
# others should be filtered out first.
contingency = pd.crosstab(index=df['Gender'], columns=df['HadClaim'])

# Chi-square test of independence on the contingency table.
chi2, p, dof, expected = chi2_contingency(contingency)

print(f"Chi-square Test P-Value: {p}")
# Decide at the 5% significance level.
outcome = (
    "Reject the null hypothesis: Claim frequency differs by Gender."
    if p < 0.05
    else "Fail to reject the null hypothesis: No significant difference."
)
print(outcome)


Chi-square Test P-Value: 0.0339731936612422
Reject the null hypothesis: Claim frequency differs by Gender.
