# B5W3: End-to-End Insurance Risk Analytics & Predictive Modeling
## Task 4
    - Build and evaluate predictive models

In [7]:
import os
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, roc_auc_score, classification_report
import xgboost as xgb
import matplotlib.pyplot as plt
import shap 
from sklearn.impute import SimpleImputer

In [8]:
sys.path.append(os.path.abspath(os.path.join('..')))

In [9]:
df  = pd.read_csv('../data/MachineLearningRating_v3.csv', low_memory=False )

#### Missing values

In [5]:
df.isnull().sum()

UnderwrittenCoverID               0
PolicyID                          0
TransactionMonth                  0
IsVATRegistered                   0
Citizenship                       0
LegalType                         0
Title                             0
Language                          0
Bank                         145961
AccountType                   40232
MaritalStatus                  8259
Gender                         9536
Country                           0
Province                          0
PostalCode                        0
MainCrestaZone                    0
SubCrestaZone                     0
ItemType                          0
mmcode                          552
VehicleType                     552
RegistrationYear                  0
make                            552
Model                           552
Cylinders                       552
cubiccapacity                   552
kilowatts                       552
bodytype                        552
NumberOfDoors               

In [10]:
# --- Strategy Implementation ---

# 1. Identify and drop columns with very high missingness (> 60%)
columns_to_drop = [
    'NumberOfVehiclesInFleet',
    'CrossBorder',
    'CustomValueEstimate',
    'WrittenOff',
    'Rebuilt',
    'Converted'
]
df_cleaned = df.drop(columns=columns_to_drop)
print(f"Dropped {len(columns_to_drop)} columns with high missing data.")

# 2. Drop rows with low missingness (< 5%)
# This is a safe and clean approach for columns like Gender, MaritalStatus, and vehicle details
# We'll drop rows where any of these columns have a missing value.
rows_to_drop_na = [
    'Gender',
    'MaritalStatus',
    'CapitalOutstanding',
    'mmcode', 'VehicleType', 'make', 'Model', 'Cylinders', 'cubiccapacity',
    'kilowatts', 'bodytype', 'NumberOfDoors', 'VehicleIntroDate'
]

# Create a copy to avoid SettingWithCopyWarning
df_cleaned = df_cleaned.dropna(subset=rows_to_drop_na).copy()
print(f"Dropped rows with missing values in key low-missingness columns.")
print(f"Remaining rows after dropping: {df_cleaned.shape[0]}")

# 3. Impute columns with moderate missingness (5-30%)
# These columns are `Bank`, `AccountType`, and `NewVehicle`
# Let's check their data types to choose the right imputation strategy
# `Bank` and `AccountType` are likely categorical. `NewVehicle` is likely binary.
# We will use the `most_frequent` (mode) strategy.

imputation_features = ['Bank', 'AccountType', 'NewVehicle']
imputer = SimpleImputer(strategy='most_frequent')

# The imputer needs to be fitted on the data
df_cleaned[imputation_features] = imputer.fit_transform(df_cleaned[imputation_features])
print("Imputed missing values in 'Bank', 'AccountType', and 'NewVehicle' with the mode.")

# Final check of missing values after cleaning
print("\nFinal Missing Value Summary:")
print(df_cleaned.isnull().sum().sort_values(ascending=False))

# Update your `modeling_features` list to reflect the changes
modeling_features = [
    'Province', 'Gender', 'make', 'body_type', 'Cylinders', 'VehicleAge',
    'CapitalOutstanding', 'SumInsured', 'CalculatedPremiumPerTerm',
    'Bank', 'AccountType', 'NewVehicle', 'MaritalStatus' # Add imputed features
]
# Note: 'Model' is a high-cardinality feature. We will handle it with one-hot encoding later.
# You might need to add `cubiccapacity`, `kilowatts`, and `NumberOfDoors` if you want to use them.

Dropped 6 columns with high missing data.
Dropped rows with missing values in key low-missingness columns.
Remaining rows after dropping: 989557
Imputed missing values in 'Bank', 'AccountType', and 'NewVehicle' with the mode.

Final Missing Value Summary:
UnderwrittenCoverID         0
PolicyID                    0
TransactionMonth            0
IsVATRegistered             0
Citizenship                 0
LegalType                   0
Title                       0
Language                    0
Bank                        0
AccountType                 0
MaritalStatus               0
Gender                      0
Country                     0
Province                    0
PostalCode                  0
MainCrestaZone              0
SubCrestaZone               0
ItemType                    0
mmcode                      0
VehicleType                 0
RegistrationYear            0
make                        0
Model                       0
Cylinders                   0
cubiccapacity           

#### Feature Engineering

In [11]:
# Create a copy to perform feature engineering without modifying the original DataFrame
df_fe = df.copy()

# 1. Vehicle Age
# This is a classic risk feature, as older vehicles may have different risk profiles.
# We'll calculate it relative to the latest registration year in the data.
max_reg_year = df_fe['RegistrationYear'].max()
df_fe['VehicleAge'] = max_reg_year - df_fe['RegistrationYear']
print(f"Created 'VehicleAge' based on RegistrationYear. Max age is {df_fe['VehicleAge'].max()} years.")

# 2. Premium Rate (a proxy for risk density)
# This ratio captures the premium paid relative to the sum insured.
# A higher rate might indicate a policy the insurer already considered higher risk.
# Handle division by zero/missing SumInsured to avoid errors
df_fe['PremiumRate'] = df_fe['CalculatedPremiumPerTerm'] / df_fe['SumInsured'].replace(0, np.nan)
# Replace infinite values with NaN and then a sensible number like the median
df_fe['PremiumRate'].replace([np.inf, -np.inf], np.nan, inplace=True)
df_fe['PremiumRate'].fillna(df_fe['PremiumRate'].median(), inplace=True)
print("Created 'PremiumRate' (CalculatedPremiumPerTerm / SumInsured).")

# 3. Under/Over-Insurance Ratio
# This compares the policy's sum insured to the vehicle's estimated value,
# which could be a proxy for customer behavior or risk.
# Handle division by zero/missing CustomValueEstimate
df_fe['SumInsuredRatio'] = df_fe['SumInsured'] / df_fe['CustomValueEstimate'].replace(0, np.nan)
df_fe['SumInsuredRatio'].replace([np.inf, -np.inf], np.nan, inplace=True)
df_fe['SumInsuredRatio'].fillna(df_fe['SumInsuredRatio'].median(), inplace=True)
print("Created 'SumInsuredRatio' (SumInsured / CustomValueEstimate).")

# 4. Binning a numerical feature (e.g., VehicleAge)
# This converts a continuous variable into categorical bins, which can help models
# capture non-linear relationships without needing a complex model.
df_fe['VehicleAgeGroup'] = pd.cut(
    df_fe['VehicleAge'],
    bins=[0, 5, 10, 15, 20, np.inf],
    labels=['0-5 Years', '6-10 Years', '11-15 Years', '16-20 Years', '20+ Years'],
    right=False
)
print("Created 'VehicleAgeGroup' by binning VehicleAge.")

# 5. Interaction Feature (example)
# This can capture that a combination of factors might be a better predictor than
# each factor alone (e.g., older vehicles of a specific make).
df_fe['Make_VehicleAge'] = df_fe['make'].astype(str) + '_' + df_fe['VehicleAgeGroup'].astype(str)
print("Created 'Make_VehicleAge' interaction feature.")

print("\nFeature Engineering is complete.")

Created 'VehicleAge' based on RegistrationYear. Max age is 28 years.
Created 'PremiumRate' (CalculatedPremiumPerTerm / SumInsured).
Created 'SumInsuredRatio' (SumInsured / CustomValueEstimate).
Created 'VehicleAgeGroup' by binning VehicleAge.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_fe['PremiumRate'].replace([np.inf, -np.inf], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_fe['PremiumRate'].fillna(df_fe['PremiumRate'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the i

Created 'Make_VehicleAge' interaction feature.

Feature Engineering is complete.


#### Update Modeling Pipeline

In [12]:
# Update the list of features for your modeling pipeline
modeling_features_updated = [
    # Original Features
    'Province', 'Gender', 'make', 'Model', 'body_type', 'Cylinders', 'RegistrationYear',
    'CustomValueEstimate', 'CapitalOutstanding', 'SumInsured', 'CalculatedPremiumPerTerm',
    'NumberOfDoors', 'AlarmImmobiliser', 'TrackingDevice', 'CoverType',
    
    # Engineered Features
    'VehicleAge',           # Numerical
    'PremiumRate',          # Numerical
    'SumInsuredRatio',      # Numerical
    'VehicleAgeGroup',      # Categorical
    'Make_VehicleAge'       # Categorical (Interaction)
]

# Note: You will need to re-run the missing data handling and encoding steps on `df_fe`
# with this updated list of features.