Import Required Libraries

In [35]:
import pandas as pd
import numpy as np
from scipy import stats


In [36]:
# Read the cleaned dataset. header=None tells pandas that no row of the file
# holds column labels, so the frame starts out with positional integer labels.
# NOTE(review): the numeric columns print as NaN for row 0 further down, which
# suggests the file's first row is really a header being read as data - confirm.
data = pd.read_csv(
    '../data/cleaned_data.csv',
    header=None,
    low_memory=False,
)


In [37]:
# Optional: Check for any missing values
# data.isnull().sum()

In [38]:
# Descriptive labels for the 52 columns, listed in positional order.
column_names = [
    'UnderwrittenCoverID', 'PolicyID', 'TransactionMonth', 'IsVATRegistered',
    'Citizenship', 'LegalType', 'Title', 'Language',
    'Bank', 'AccountType', 'MaritalStatus', 'Gender',
    'Country', 'Province', 'PostalCode', 'MainCrestaZone',
    'SubCrestaZone', 'ItemType', 'mmcode', 'VehicleType',
    'RegistrationYear', 'make', 'Model', 'Cylinders',
    'cubiccapacity', 'kilowatts', 'bodytype', 'NumberOfDoors',
    'VehicleIntroDate', 'CustomValueEstimate', 'AlarmImmobiliser', 'TrackingDevice',
    'CapitalOutstanding', 'NewVehicle', 'WrittenOff', 'Rebuilt',
    'Converted', 'CrossBorder', 'NumberOfVehiclesInFleet', 'SumInsured',
    'TermFrequency', 'CalculatedPremiumPerTerm', 'ExcessSelected', 'CoverCategory',
    'CoverType', 'CoverGroup', 'Section', 'Product',
    'StatutoryClass', 'StatutoryRiskType', 'TotalPremium', 'TotalClaims',
]

In [39]:
# The frame was read with header=None, so its columns carry positional integer
# labels; replace them in place with the descriptive names in `column_names`.
data.columns = column_names

In [44]:
# Count the missing values in every column (shown as the cell output)
data.isna().sum()

UnderwrittenCoverID               0
PolicyID                          0
TransactionMonth                  0
IsVATRegistered                   0
Citizenship                       0
LegalType                         0
Title                             0
Language                          0
Bank                              0
AccountType                       0
MaritalStatus                     0
Gender                            0
Country                           0
Province                          0
PostalCode                        0
MainCrestaZone                    0
SubCrestaZone                     0
ItemType                          0
mmcode                            0
VehicleType                       0
RegistrationYear                  0
make                              0
Model                             0
Cylinders                         0
cubiccapacity                     0
kilowatts                         0
bodytype                          0
NumberOfDoors               

Task 3

In [45]:
# Cast the premium and claims columns to numeric dtype; errors='coerce'
# turns any value that cannot be parsed as a number into NaN.
for money_col in ('TotalPremium', 'TotalClaims'):
    data[money_col] = pd.to_numeric(data[money_col], errors='coerce')


Define Metrics and Perform Data Segmentation

Provinces

In [63]:
# Columns used by the hypothesis tests: claims measure risk, premium feeds
# the margin (margin can be calculated as TotalPremium - TotalClaims).
risk_metric, margin_metric = 'TotalClaims', 'TotalPremium'

# Two provinces to compare (change the literals to test other provinces)
group_A_province = data[data['Province'].eq('Gauteng')]
group_B_province = data[data['Province'].eq('Western Cape')]


In [64]:
# Two-sample t-test on the risk metric between the two province groups;
# equal_var=False means the group variances are not assumed to be equal.
stat_province, p_value_province = stats.ttest_ind(
    group_A_province[risk_metric],
    group_B_province[risk_metric],
    equal_var=False,
)

print(
    "Hypothesis 1: Test risk differences across provinces",
    "-----------------------------------------------------",
    sep="\n",
)
print(f"T-statistic: {stat_province}")
print(f"P-value: {p_value_province}")

# Decide at the 5% significance level
print(
    "Reject Null Hypothesis: Significant risk difference across provinces"
    if p_value_province < 0.05
    else "Fail to Reject Null Hypothesis: No significant risk difference across provinces"
)


Hypothesis 1: Test risk differences across provinces
-----------------------------------------------------
T-statistic: 1.8652143496485993
P-value: 0.06215231452280004
Fail to Reject Null Hypothesis: No significant risk difference across provinces


Zip Codes

In [65]:

# Two postal codes to compare. The codes are matched as strings.
# NOTE(review): this only selects rows if PostalCode holds strings - confirm
# the column dtype if either group comes back empty.
group_A_zip = data[data['PostalCode'].eq('1459')]
group_B_zip = data[data['PostalCode'].eq('7784')]


In [66]:
# Two-sample t-test on the risk metric between the two zip-code groups;
# equal_var=False means the group variances are not assumed to be equal.
stat_zip, p_value_zip = stats.ttest_ind(
    group_A_zip[risk_metric],
    group_B_zip[risk_metric],
    equal_var=False,
)

print(
    "Hypothesis 1: Test risk differences across zip codes",
    "-----------------------------------------------------",
    sep="\n",
)
print(f"T-statistic: {stat_zip}")
print(f"P-value: {p_value_zip}")

# Decide at the 5% significance level
print(
    "Reject Null Hypothesis: Significant risk difference between zip codes"
    if p_value_zip < 0.05
    else "Fail to Reject Null Hypothesis: No significant risk difference between zip codes"
)


Hypothesis 1: Test risk differences across zip codes
-----------------------------------------------------
T-statistic: -3.7403598910754248
P-value: 0.00018411572295022876
Reject Null Hypothesis: Significant risk difference between zip codes


Margin Difference Between Zip Codes

In [69]:
# Add a per-row margin column: the margin metric minus the risk metric
# (TotalPremium - TotalClaims)
data['Margin'] = data[margin_metric].sub(data[risk_metric])

# Preview the first rows to confirm the new column sits next to its inputs
print(data.loc[:, ['TotalPremium', 'TotalClaims', 'Margin']].head())


   TotalPremium  TotalClaims      Margin
0           NaN          NaN         NaN
1     21.929825          0.0   21.929825
2     21.929825          0.0   21.929825
3      0.000000          0.0    0.000000
4    512.848070          0.0  512.848070


In [72]:
# Margin (TotalPremium - TotalClaims) for the whole dataset. Not used by this
# cell; retained in case a later cell reads `Margin`.
Margin = data[margin_metric] - data[risk_metric]

# Derive each group's margin from the group's own premium and claims columns.
# The zip-code groups are boolean-filtered copies of `data`, so a 'Margin'
# column added to `data` after they were created does not exist on them, and
# group_A_zip['Margin'] raises KeyError when the notebook is run top to bottom.
margin_A_zip = group_A_zip[margin_metric] - group_A_zip[risk_metric]
margin_B_zip = group_B_zip[margin_metric] - group_B_zip[risk_metric]

# Two-sample t-test on margin between the two zip-code groups;
# equal_var=False means the group variances are not assumed to be equal.
stat_margin_zip, p_value_margin_zip = stats.ttest_ind(
    margin_A_zip,
    margin_B_zip,
    equal_var=False,
)

print(f"T-statistic:{stat_margin_zip}")
print(f"P-value: {p_value_margin_zip}")

# Decide at the 5% significance level
if p_value_margin_zip < 0.05:
    print("Reject Null Hypothesis: Significant margin difference between zip codes")
else:
    print("Fail to Reject Null Hypothesis: No significant margin difference between zip codes")


T-statistic:4.919455962425793
P-value: 8.76223176989912e-07
Reject Null Hypothesis: Significant margin difference between zip codes


Gender comparison

In [None]:
# Split the data by gender: group A holds 'Male' rows, group B 'Female' rows
group_A_gender = data[data['Gender'].eq('Male')]
group_B_gender = data[data['Gender'].eq('Female')]


In [None]:
# Two-sample t-test on the risk metric between the two gender groups;
# equal_var=False means the group variances are not assumed to be equal.
stat_gender, p_value_gender = stats.ttest_ind(
    group_A_gender[risk_metric],
    group_B_gender[risk_metric],
    equal_var=False,
)

print(
    "Hypothesis 1: Test risk differences across genders",
    "-----------------------------------------------------",
    sep="\n",
)
print(f"T-statistic: {stat_gender}")
print(f"P-value: {p_value_gender}")

# Decide at the 5% significance level
print(
    "Reject Null Hypothesis: Significant risk difference between Women and Men"
    if p_value_gender < 0.05
    else "Fail to Reject Null Hypothesis: No significant risk difference between Women and Men"
)


Hypothesis 1: Test risk differences across genders
-----------------------------------------------------
T-statistic: -0.296353891400699
P-value: 0.7669656471629474
Fail to Reject Null Hypothesis: No significant risk difference between Women and Men
