ðŸ“˜ TASK 4 â€” Statistical Modeling & Risk-Based Premium Prediction

In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

from sklearn.metrics import (
    mean_squared_error, r2_score,
    accuracy_score, f1_score, roc_auc_score
)

import shap
import warnings
warnings.filterwarnings("ignore")


  from .autonotebook import tqdm as notebook_tqdm


## 2. Load Dataset

In [6]:
df = pd.read_csv("../data/insurance_data.csv", low_memory=False)
df.head()


Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,ExcessSelected,CoverCategory,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims
0,145249,12827.0,2015-03-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
1,145249,12827.0,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
2,145249,12827.0,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0
3,145255,12827.0,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,512.84807,0.0
4,145255,12827.0,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0


## 3. Create Required Features

In [7]:
# Claim Frequency
df["HasClaim"] = (df["TotalClaims"] > 0).astype(int)

# Margin
df["Margin"] = df["TotalPremium"] - df["TotalClaims"]

# Loss Ratio
df["LossRatio"] = np.where(
    df["TotalPremium"] > 0,
    df["TotalClaims"] / df["TotalPremium"],
    np.nan
)

# Example Feature: Vehicle Age
if "RegistrationYear" in df.columns:
    df["VehicleAge"] = pd.to_numeric(df["RegistrationYear"], errors="coerce")


## 4. Select Modeling Features

In [8]:
features = [
    "Cubiccapacity", "Kilowatts", "SumInsured",
    "VehicleType", "Make", "Model", "Gender",
    "Province", "NumberOfDoors", "VehicleAge"
]

features = [f for f in features if f in df.columns]

print("Using features:", features)


Using features: ['SumInsured', 'VehicleType', 'Model', 'Gender', 'Province', 'NumberOfDoors', 'VehicleAge']


ðŸŸ¦ PART A â€” CLASSIFICATION MODEL
Predicting Claim Probability (HasClaim)

## 5. Prepare Data for Classification

In [9]:
X = df[features].copy()
y = df["HasClaim"]

# Train/test split
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


## 6. Preprocessing Pipeline (Shared for All Models)

In [10]:
numeric_cols = X.select_dtypes(include=["int64","float64"]).columns.tolist()
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()

preprocessor = ColumnTransformer([
    ("num", Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]), numeric_cols),

    ("cat", Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ]), categorical_cols)
])


## 7. Train Classification Models
### ðŸ”¹ Logistic Regression

In [11]:
clf_lr = Pipeline([
    ("prep", preprocessor),
    ("model", LogisticRegression(class_weight="balanced"))
])

clf_lr.fit(X_train_c, y_train_c)
preds_lr = clf_lr.predict(X_test_c)
proba_lr = clf_lr.predict_proba(X_test_c)[:,1]

print("Logistic Regression AUC:", roc_auc_score(y_test_c, proba_lr))


Logistic Regression AUC: 0.6236228835732172


### ðŸ”¹ Random Forest Classifier

In [15]:
clf_rf = Pipeline([
    ("prep", preprocessor),
    ("model", RandomForestClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ))
])

clf_rf.fit(X_train_c, y_train_c)
preds_rf = clf_rf.predict(X_test_c)
proba_rf = clf_rf.predict_proba(X_test_c)[:,1]

print("Random Forest AUC:", roc_auc_score(y_test_c, proba_rf))


Random Forest AUC: 0.71080813932724


## 8. Classification Metrics

In [16]:
results_class = pd.DataFrame({
    "Model": ["Logistic Regression", "Random Forest"],
    "AUC": [
        roc_auc_score(y_test_c, proba_lr),
        roc_auc_score(y_test_c, proba_rf)
    ],
    "Accuracy": [
        accuracy_score(y_test_c, preds_lr),
        accuracy_score(y_test_c, preds_rf)
    ],
    "F1-score": [
        f1_score(y_test_c, preds_lr),
        f1_score(y_test_c, preds_rf)
    ]
})

results_class


Unnamed: 0,Model,AUC,Accuracy,F1-score
0,Logistic Regression,0.623623,0.45157,0.006898
1,Random Forest,0.710808,0.866108,0.021699


--------------------------------------------------
ðŸŸ© PART B â€” SEVERITY MODEL
Predicting TotalClaims for policies that had a claim

## 9. Prepare Data for Severity Modeling

In [17]:
claims_df = df[df["HasClaim"] == 1].copy()

X_sev = claims_df[features]
y_sev = claims_df["TotalClaims"]

X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_sev, y_sev, test_size=0.2, random_state=42
)


## 10. Train Regression Models
### ðŸ”¹ Linear Regression

In [20]:
from sklearn.metrics import mean_squared_error
import numpy as np

rmse = np.sqrt(mean_squared_error(y_test_s, preds_lr_s))
print("Linear Regression RMSE:", rmse)


Linear Regression RMSE: 36315.88901561408


### ðŸ”¹ Random Forest Regressor

In [22]:

reg_rf = Pipeline([
    ("prep", preprocessor),
    ("model", RandomForestRegressor(
        n_estimators=120,      # faster than 300, similar performance
        random_state=42,
        n_jobs=-1,             # use all CPU cores
        max_depth=None         # optional: tune if needed
    ))
])

reg_rf.fit(X_train_s, y_train_s)
preds_rf_s = reg_rf.predict(X_test_s)

# RMSE (version-safe)
rmse = np.sqrt(mean_squared_error(y_test_s, preds_rf_s))
print("Random Forest RMSE:", rmse)


Random Forest RMSE: 34793.80339792203


## 11. Regression Metrics

In [23]:

results_reg = pd.DataFrame({
    "Model": ["Linear Regression", "Random Forest"],
    "RMSE": [
        np.sqrt(mean_squared_error(y_test_s, preds_lr_s)),
        np.sqrt(mean_squared_error(y_test_s, preds_rf_s))
    ],
    "R2": [
        r2_score(y_test_s, preds_lr_s),
        r2_score(y_test_s, preds_rf_s)
    ]
})

results_reg


Unnamed: 0,Model,RMSE,R2
0,Linear Regression,36315.889016,0.022248
1,Random Forest,34793.803398,0.10249


ðŸŸ§ PART C â€” Risk-Based Premium Model
Premium = P(claim) Ã— Severity + Expense Loading + Profit Margin

## 12. Compute New Premiums

In [44]:
needed_cols = preprocessor.feature_names_in_
print(needed_cols)        # this shows the columns you should select from df



['SumInsured' 'VehicleType' 'Model' 'Gender' 'Province' 'NumberOfDoors'
 'VehicleAge']


In [49]:
raw_cols = clf_rf.named_steps["prep"].feature_names_in_

all(col in df.columns for col in raw_cols)  # should be True


True

In [51]:
def calculate_premium(df, clf_pipeline, reg_pipeline, expense_loading=0.10, profit_margin=0.15):
    """
    Calculate predicted insurance premium for a given dataframe using trained pipelines.
    
    Parameters:
        df (pd.DataFrame): New data for prediction
        clf_pipeline (Pipeline): Trained classifier pipeline (predicts claim probability)
        reg_pipeline (Pipeline): Trained regressor pipeline (predicts claim severity)
        expense_loading (float): Expense loading factor (default 0.10)
        profit_margin (float): Profit margin factor (default 0.15)
    
    Returns:
        pd.Series: Predicted premium for each row in df
    """
    # Get the raw columns expected by the classifier and regressor
    clf_raw_cols = clf_pipeline.named_steps["prep"].feature_names_in_
    reg_raw_cols = reg_pipeline.named_steps["prep"].feature_names_in_
    
    # Ensure dataframe contains all required columns
    missing_clf_cols = [c for c in clf_raw_cols if c not in df.columns]
    missing_reg_cols = [c for c in reg_raw_cols if c not in df.columns]
    
    if missing_clf_cols:
        raise ValueError(f"Missing columns for classifier: {missing_clf_cols}")
    if missing_reg_cols:
        raise ValueError(f"Missing columns for regressor: {missing_reg_cols}")
    
    # Select only needed columns
    df_clf = df[clf_raw_cols]
    df_reg = df[reg_raw_cols]
    
    # Predict probability of claim
    p_claim = clf_pipeline.predict_proba(df_clf)[:, 1]
    
    # Predict severity
    sev_pred = reg_pipeline.predict(df_reg)
    
    # Compute premium
    premium = p_claim * sev_pred + (expense_loading * sev_pred) + (profit_margin * sev_pred)
    
    return pd.Series(premium, index=df.index, name="PredictedPremium")




ðŸŸª PART D â€” Model Explainability (SHAP)

## 13. SHAP Interpretation (Random Forest)

-------------------------------------------------
ðŸŸ¦ PART E â€” Final Insights Summary

## 14. Summaries

In [54]:
print("=== Classification Results ===")
display(results_class)

print("\n=== Regression Results ===")
display(results_reg)


=== Classification Results ===


Unnamed: 0,Model,AUC,Accuracy,F1-score
0,Logistic Regression,0.623623,0.45157,0.006898
1,Random Forest,0.710808,0.866108,0.021699



=== Regression Results ===


Unnamed: 0,Model,RMSE,R2
0,Linear Regression,36315.889016,0.022248
1,Random Forest,34793.803398,0.10249
