In [1]:
# Import standard libraries
import pandas as pd
import numpy as np
import sys

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


sys.path.append('../src') 

from utils.data_loader import load_data



In [2]:
df=load_data("../data/processed/cleaned_dataset.csv",delimiter=",")

Data loaded successfully from ../data/processed/cleaned_dataset.csv


1. Filter data to only rows where TotalClaims > 0

In [3]:

# Subset for only records with claims
claimed_df = df[df['TotalClaims'] > 0].copy()

In [4]:
claimed_df

Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,CoverCategory,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims,LossRatio
203,46222,4044,2014-10-01,False,,Close Corporation,Mr,English,First National Bank,Current account,...,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.000000,2294.096491,inf
284,82062,7174,2015-03-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,2040.473684,93.045600
1560,119591,8672,2015-04-01,False,,Close Corporation,Mr,English,First National Bank,Current account,...,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,1213.889211,46492.211754,38.300210
1779,50193,283,2014-10-01,False,,Close Corporation,Mr,English,Standard Bank,Current account,...,Own Damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Commercial Cover: Monthly,Commercial,IFRS Constant,645.017456,26516.859649,41.110298
1943,119582,8672,2015-04-01,False,,Close Corporation,Mr,English,First National Bank,Current account,...,Income Protector,Income Protector,Income Protector,Optional Extended Covers,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,74.561404,6140.350877,82.352941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
997886,263942,962,2015-07-01,False,ZA,Private company,Mr,English,First National Bank,Current account,...,Windscreen (2015),Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Commercial Cover: Monthly,Commercial,IFRS Constant,0.000000,7996.535088,inf
998222,937,138,2014-08-01,False,ZA,Private company,Mr,English,RMB Private Bank,Current account,...,Own Damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Commercial Cover: Monthly,Commercial,IFRS Constant,613.519737,40011.859649,65.216907
998592,3255,394,2014-05-01,False,ZA,Private company,Mr,English,RMB Private Bank,Current account,...,Own Damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Commercial Cover: Monthly,Commercial,IFRS Constant,836.004912,101812.517544,121.784592
999075,238713,20611,2015-07-01,False,ZA,Private company,Mr,English,RMB Private Bank,Current account,...,Vehicle over R100 000,Own Damage,Motor Comprehensive,Motor Comprehensive,Bridge Taxi Finance: Monthly,Commercial,IFRS Constant,650.879211,155317.578947,238.627347


 2. Drop irrelevant or ID-like columns

In [5]:
# Drop ID columns and the target itself (TotalClaims) from features
X = claimed_df.drop(columns=['TotalClaims', 'PolicyID', 'CustomerID'], errors='ignore')
y = claimed_df['TotalClaims']


3. Encode categorical variables
Use one-hot encoding for all object or category-type columns.


In [6]:

# One-hot encode categorical variables
X_encoded = pd.get_dummies(X, drop_first=True)

 4. Train-Test Split


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

Linear Regression

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Replace inf/-inf with nan, then fill nan with column mean for both train and test
X_train_clean = X_train.replace([np.inf, -np.inf], np.nan).fillna(X_train.mean())
X_test_clean = X_test.replace([np.inf, -np.inf], np.nan).fillna(X_train.mean())

# Initialize the model
lr_model = LinearRegression()

# Fit the model
lr_model.fit(X_train_clean, y_train)

# Predict on test set
y_pred = lr_model.predict(X_test_clean)

# Evaluate
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Linear Regression RMSE: {rmse:.2f}")
print(f"Linear Regression R²: {r2:.3f}")


ValueError: Input X contains infinity or a value too large for dtype('float64').

In [10]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the model
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)

# Fit the model
rf_model.fit(X_train, y_train)

# Predict on test set
y_pred_rf = rf_model.predict(X_test)

# Evaluate
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Random Forest RMSE: {rmse_rf:.2f}")
print(f"Random Forest R²: {r2_rf:.3f}")


ValueError: Input X contains infinity or a value too large for dtype('float32').

In [None]:
%pip install xgboost

import xgboost as xgb

# Initialize the XGBoost regressor
xgb_model = xgb.XGBRegressor(random_state=42, n_estimators=100, learning_rate=0.1)

# Fit the model
xgb_model.fit(X_train, y_train)

# Predict on test set
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f"XGBoost RMSE: {rmse_xgb:.2f}")
print(f"XGBoost R²: {r2_xgb:.3f}")


In [None]:
import pandas as pd

# Gather metrics into a dictionary
results = {
    "Model": ["Linear Regression", "Random Forest", "XGBoost"],
    "RMSE": [rmse_lr, rmse_rf, rmse_xgb],
    "R-squared": [r2_lr, r2_rf, r2_xgb]
}

# Create a DataFrame for easy visualization
results_df = pd.DataFrame(results)

print(results_df)
