# B5W3: End-to-End Insurance Risk Analytics & Predictive Modeling

In [21]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np # Used for checking for division by zero
sns.set_style('whitegrid')

In [16]:
# Make the parent of the current working directory importable.
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry to sys.path a second time.
PROJECT_ROOT = os.path.abspath(os.path.join('..'))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [17]:
# Load the raw policy/claims extract into a single DataFrame.
# low_memory=False stops pandas from inferring dtypes chunk by chunk, which
# avoids mixed-type columns (at the cost of more memory while parsing).
DATA_PATH = '../data/MachineLearningRating_v3.csv'
df = pd.read_csv(DATA_PATH, low_memory=False)

In [4]:
# Preview the first five rows to sanity-check that the file loaded as expected.
df.head()

Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,ExcessSelected,CoverCategory,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims
0,145249,12827,2015-03-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
1,145249,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
2,145249,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0
3,145255,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,512.84807,0.0
4,145255,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0


In [5]:
# List all column names as a plain Python list.
df.columns.tolist()

['UnderwrittenCoverID',
 'PolicyID',
 'TransactionMonth',
 'IsVATRegistered',
 'Citizenship',
 'LegalType',
 'Title',
 'Language',
 'Bank',
 'AccountType',
 'MaritalStatus',
 'Gender',
 'Country',
 'Province',
 'PostalCode',
 'MainCrestaZone',
 'SubCrestaZone',
 'ItemType',
 'mmcode',
 'VehicleType',
 'RegistrationYear',
 'make',
 'Model',
 'Cylinders',
 'cubiccapacity',
 'kilowatts',
 'bodytype',
 'NumberOfDoors',
 'VehicleIntroDate',
 'CustomValueEstimate',
 'AlarmImmobiliser',
 'TrackingDevice',
 'CapitalOutstanding',
 'NewVehicle',
 'WrittenOff',
 'Rebuilt',
 'Converted',
 'CrossBorder',
 'NumberOfVehiclesInFleet',
 'SumInsured',
 'TermFrequency',
 'CalculatedPremiumPerTerm',
 'ExcessSelected',
 'CoverCategory',
 'CoverType',
 'CoverGroup',
 'Section',
 'Product',
 'StatutoryClass',
 'StatutoryRiskType',
 'TotalPremium',
 'TotalClaims']

In [6]:
# Show the dtype pandas inferred for each column.
df.dtypes

UnderwrittenCoverID           int64
PolicyID                      int64
TransactionMonth             object
IsVATRegistered                bool
Citizenship                  object
LegalType                    object
Title                        object
Language                     object
Bank                         object
AccountType                  object
MaritalStatus                object
Gender                       object
Country                      object
Province                     object
PostalCode                    int64
MainCrestaZone               object
SubCrestaZone                object
ItemType                     object
mmcode                      float64
VehicleType                  object
RegistrationYear              int64
make                         object
Model                        object
Cylinders                   float64
cubiccapacity               float64
kilowatts                   float64
bodytype                     object
NumberOfDoors               

In [7]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(1000098, 52)

In [8]:
# Per-column non-null counts and dtypes in one table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000098 entries, 0 to 1000097
Data columns (total 52 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   UnderwrittenCoverID       1000098 non-null  int64  
 1   PolicyID                  1000098 non-null  int64  
 2   TransactionMonth          1000098 non-null  object 
 3   IsVATRegistered           1000098 non-null  bool   
 4   Citizenship               1000098 non-null  object 
 5   LegalType                 1000098 non-null  object 
 6   Title                     1000098 non-null  object 
 7   Language                  1000098 non-null  object 
 8   Bank                      854137 non-null   object 
 9   AccountType               959866 non-null   object 
 10  MaritalStatus             991839 non-null   object 
 11  Gender                    990562 non-null   object 
 12  Country                   1000098 non-null  object 
 13  Province                  1

In [9]:
# Number of missing values in each column.
df.isnull().sum()

UnderwrittenCoverID               0
PolicyID                          0
TransactionMonth                  0
IsVATRegistered                   0
Citizenship                       0
LegalType                         0
Title                             0
Language                          0
Bank                         145961
AccountType                   40232
MaritalStatus                  8259
Gender                         9536
Country                           0
Province                          0
PostalCode                        0
MainCrestaZone                    0
SubCrestaZone                     0
ItemType                          0
mmcode                          552
VehicleType                     552
RegistrationYear                  0
make                            552
Model                           552
Cylinders                       552
cubiccapacity                   552
kilowatts                       552
bodytype                        552
NumberOfDoors               

In [10]:
# Count rows that exactly repeat an earlier row across every column.
duplicate_row_count = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicate_row_count}")


Number of duplicate rows: 0


In [11]:
# Summary statistics for the numeric columns; transposed so that each
# column of the data becomes one row of the report.
numeric_summary = df.describe().T
print(numeric_summary)

                              count          mean           std           min  \
UnderwrittenCoverID       1000098.0  1.048175e+05  6.329371e+04  1.000000e+00   
PolicyID                  1000098.0  7.956682e+03  5.290039e+03  1.400000e+01   
PostalCode                1000098.0  3.020601e+03  2.649854e+03  1.000000e+00   
mmcode                     999546.0  5.487770e+07  1.360381e+07  4.041200e+06   
RegistrationYear          1000098.0  2.010225e+03  3.261391e+00  1.987000e+03   
Cylinders                  999546.0  4.046642e+00  2.940201e-01  0.000000e+00   
cubiccapacity              999546.0  2.466743e+03  4.428006e+02  0.000000e+00   
kilowatts                  999546.0  9.720792e+01  1.939326e+01  0.000000e+00   
NumberOfDoors              999546.0  4.019250e+00  4.683144e-01  0.000000e+00   
CustomValueEstimate        220456.0  2.255311e+05  5.645157e+05  2.000000e+04   
NumberOfVehiclesInFleet         0.0           NaN           NaN           NaN   
SumInsured                10

In [12]:
# Zoom in on the monetary columns that drive the loss-ratio analysis.
financial_columns = ['SumInsured', 'CalculatedPremiumPerTerm', 'TotalPremium', 'TotalClaims']
financial_summary = df[financial_columns].describe().T
print(financial_summary)

                              count           mean           std  \
SumInsured                1000098.0  604172.732589  1.508332e+06   
CalculatedPremiumPerTerm  1000098.0     117.875691  3.997017e+02   
TotalPremium              1000098.0      61.905496  2.302845e+02   
TotalClaims               1000098.0      64.861190  2.384075e+03   

                                   min        25%          50%            75%  \
SumInsured                    0.010000  5000.0000  7500.000000  250000.000000   
CalculatedPremiumPerTerm      0.000000     3.2248     8.436900      90.000000   
TotalPremium               -782.576754     0.0000     2.178333      21.929825   
TotalClaims              -12002.412281     0.0000     0.000000       0.000000   

                                   max  
SumInsured                1.263620e+07  
CalculatedPremiumPerTerm  7.442217e+04  
TotalPremium              6.528260e+04  
TotalClaims               3.930921e+05  


### Summarize Categorical Data

In [13]:
# count / unique / top / freq for every object-dtype (string) column.
print("\nDescriptive Statistics for Categorical Columns:")
categorical_summary = df.describe(include='object').T
print(categorical_summary)


Descriptive Statistics for Categorical Columns:
                      count unique                                 top  \
TransactionMonth    1000098     23                 2015-08-01 00:00:00   
Citizenship         1000098      4                                       
LegalType           1000098      6                          Individual   
Title               1000098      5                                  Mr   
Language            1000098      1                             English   
Bank                 854137     11                 First National Bank   
AccountType          959866      3                     Current account   
MaritalStatus        991839      3                       Not specified   
Gender               990562      3                       Not specified   
Country             1000098      1                        South Africa   
Province            1000098      9                             Gauteng   
MainCrestaZone      1000098     16     Transvaal (all except Pr

In [14]:
# Inspect the levels and frequencies of a few key categorical columns.

# Ten most frequent vehicle makes.
print("\nValue counts for 'make':")
top_makes = df['make'].value_counts().head(10)
print(top_makes)

# Frequency of every country present in the data.
print("\nValue counts for 'Country':")
country_counts = df['Country'].value_counts()
print(country_counts)

# Distinct vehicle types (unique() also reports NaN if any value is missing).
print("\nUnique values in 'VehicleType':")
vehicle_types = df['VehicleType'].unique()
print(vehicle_types)


Value counts for 'make':
make
TOYOTA                                 813280
MERCEDES-BENZ                           41940
CMC                                     21624
VOLKSWAGEN                              20929
C.A.M                                   16171
GOLDEN JOURNEY                          14462
NISSAN/DATSUN                           10997
JINBEI                                  10374
IVECO                                    8430
AUDI                                     7407
Name: count, dtype: int64

Value counts for 'Country':
Country
South Africa    1000098
Name: count, dtype: int64

Unique values in 'VehicleType':
['Passenger Vehicle' 'Medium Commercial' 'Heavy Commercial'
 'Light Commercial' 'Bus' nan]


#### Calculate the Overall Loss Ratio for the Portfolio

In [18]:
# Portfolio-level loss ratio = total claims paid / total premium collected.
total_claims = df['TotalClaims'].sum()
total_premium = df['TotalPremium'].sum()

# The ratio is only meaningful for a positive premium total. Individual rows
# can carry negative premiums, so this guard covers a zero *or* negative sum.
if total_premium > 0:
    overall_loss_ratio = total_claims / total_premium
    print(f"Overall Loss Ratio for the portfolio: {overall_loss_ratio:.2%}")
else:
    # Still bind the name so that later cells do not fail with a NameError.
    overall_loss_ratio = float('nan')
    print("Cannot calculate overall Loss Ratio: Total Premium is not positive.")

Overall Loss Ratio for the portfolio: 104.77%


- A loss ratio less than 100% (1.00) means the company is making a profit from underwriting.
- A loss ratio greater than 100% (1.00) means the company is paying out more in claims than it is collecting in premiums.
- So an overall loss ratio of 104.77% means the company is paying out more in claims than it is collecting in premiums.

#### Loss Ratio by Province

In [None]:
# Aggregate claims and premiums per province.
loss_ratio_by_province = df.groupby('Province')[['TotalClaims', 'TotalPremium']].sum()

# Loss ratio = claims / premium for each province.
# np.where must be given BOTH the value to use where the condition holds and
# the fallback where it does not; calling it with only one of them raises
# ValueError. The fallback of 0 for a non-positive premium total matches the
# VehicleType and Gender breakdowns in this notebook.
loss_ratio_by_province['LossRatio'] = np.where(
    loss_ratio_by_province['TotalPremium'] > 0,
    loss_ratio_by_province['TotalClaims'] / loss_ratio_by_province['TotalPremium'],
    0,
)

# Sort the results to see the highest loss ratios first
loss_ratio_by_province = loss_ratio_by_province.sort_values('LossRatio', ascending=False)

print("\nLoss Ratio by Province (sorted by highest Loss Ratio):")
print(loss_ratio_by_province[['TotalPremium', 'TotalClaims', 'LossRatio']])


Loss Ratio by Province (sorted by highest Loss Ratio):
               TotalPremium   TotalClaims  LossRatio
Province                                            
Gauteng        2.405377e+07  2.939415e+07   1.222018
KwaZulu-Natal  1.320908e+07  1.430138e+07   1.082693
Western Cape   9.806559e+06  1.038977e+07   1.059472
North West     7.490508e+06  5.920250e+06   0.790367
Mpumalanga     2.836292e+06  2.044675e+06   0.720897
Free State     5.213632e+05  3.549223e+05   0.680758
Limpopo        1.537324e+06  1.016477e+06   0.661199
Eastern Cape   2.140104e+06  1.356427e+06   0.633813
Northern Cape  3.165581e+05  8.949051e+04   0.282699


- Gauteng (1.22), KwaZulu-Natal (1.08), and Western Cape (1.06) are unprofitable: each has a loss ratio above 1.00.
- Northern Cape is the most profitable province, with a loss ratio of 0.28.

####  Loss Ratio by VehicleType

In [23]:
# Total claims and premiums per vehicle type.
# (groupby drops rows whose VehicleType is NaN by default.)
vehicle_totals = df.groupby('VehicleType')[['TotalClaims', 'TotalPremium']].sum()

# Loss ratio = claims / premium, with 0 as the fallback wherever the
# premium total is not positive.
vehicle_premium = vehicle_totals['TotalPremium']
vehicle_claims = vehicle_totals['TotalClaims']
vehicle_totals['LossRatio'] = np.where(vehicle_premium > 0, vehicle_claims / vehicle_premium, 0)

# Order by premium volume so the largest segments come first; nothing is
# filtered out here.
loss_ratio_by_vehicle_type = vehicle_totals.sort_values('TotalPremium', ascending=False)

vehicle_report_columns = ['TotalPremium', 'TotalClaims', 'LossRatio']
print("\nLoss Ratio by VehicleType (sorted by highest Premium Volume):")
print(loss_ratio_by_vehicle_type[vehicle_report_columns])


Loss Ratio by VehicleType (sorted by highest Premium Volume):
                   TotalPremium   TotalClaims  LossRatio
VehicleType                                             
Passenger Vehicle  5.664202e+07  5.937207e+07   1.048198
Medium Commercial  3.922746e+06  4.119867e+06   1.050251
Heavy Commercial   4.609479e+05  7.504746e+05   1.628112
Light Commercial   2.604975e+05  6.045250e+04   0.232066
Bus                5.824474e+04  7.996535e+03   0.137292


- Heavy Commercial is highly unprofitable (loss ratio 1.63).
- Passenger Vehicle and Medium Commercial are slightly unprofitable (loss ratio ≈ 1.05 each).
- Light Commercial (0.23) and Bus (0.14) are highly profitable.

#### Loss Ratio by Gender

In [24]:
# Total claims and premiums per gender category.
# (groupby drops rows whose Gender is NaN by default.)
loss_ratio_by_gender = df.groupby('Gender')[['TotalClaims', 'TotalPremium']].sum()

# Loss ratio = claims / premium, with 0 as the fallback wherever the
# premium total is not positive.
gender_premium = loss_ratio_by_gender['TotalPremium']
gender_claims = loss_ratio_by_gender['TotalClaims']
loss_ratio_by_gender['LossRatio'] = np.where(gender_premium > 0, gender_claims / gender_premium, 0)

gender_report_columns = ['TotalPremium', 'TotalClaims', 'LossRatio']
print("\nLoss Ratio by Gender:")
print(loss_ratio_by_gender[gender_report_columns])


Loss Ratio by Gender:
               TotalPremium   TotalClaims  LossRatio
Gender                                              
Female         3.044806e+05  2.502461e+05   0.821879
Male           1.580143e+06  1.396704e+06   0.883910
Not specified  5.920275e+07  6.271410e+07   1.059311


- Female (0.82) and Male (0.88): Both of these categories are profitable, with premiums collected comfortably exceeding claims paid out. The Female segment appears slightly more profitable than the Male segment.
- Not specified (1.06): This category is unprofitable, operating at a loss. For every R1 of premium collected from this segment, about R1.06 is paid out in claims.