<a href="https://colab.research.google.com/github/fjadidi2001/Insurance/blob/main/Risk_Category_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.preprocessing import LabelEncoder

In [3]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.preprocessing import LabelEncoder

# Load your dataset
data = pd.read_csv('telematics_syn.csv')



# Step 1: Inspect the Dataset


In [4]:
# Check the first few rows
print(data.head())

# Check data types and missing values
print(data.info())

# Summary statistics
print(data.describe())

   Duration  Insured.age Insured.sex  Car.age  Marital  Car.use  Credit.score  \
0       366           45        Male       -1  Married  Commute         609.0   
1       182           44      Female        3  Married  Commute         575.0   
2       184           48      Female        6  Married  Commute         847.0   
3       183           71        Male        6  Married  Private         842.0   
4       183           84        Male       10  Married  Private         856.0   

  Region  Annual.miles.drive  Years.noclaims  ...  Left.turn.intensity10  \
0  Urban             6213.71              25  ...                    1.0   
1  Urban            12427.42              20  ...                   58.0   
2  Urban            12427.42              14  ...                    0.0   
3  Urban             6213.71              43  ...                    0.0   
4  Urban             6213.71              65  ...                    2.0   

   Left.turn.intensity11  Left.turn.intensity12  Right.t

# Step 2: Handle Missing Values


In [8]:
# Drop the single row with missing values
data.dropna(inplace=True)

## Step 2.1. Fix Invalid Car.age


In [9]:
# Replace negative values with 0 (assuming data entry error)
data['Car.age'] = data['Car.age'].clip(lower=0)

# 3. Encode Categorical Variables


In [10]:
# One-hot encode categorical features
cat_cols = ['Insured.sex', 'Marital', 'Car.use', 'Region']
data = pd.get_dummies(data, columns=cat_cols, drop_first=True)

# 4. Separate Features & Target


In [11]:
X = data.drop(['NB_Claim', 'AMT_Claim'], axis=1)  # Features
y = data['NB_Claim']  # Binary classification target (Claim Yes/No)

# 5. Split Data


In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 6. Address Class Imbalance with SMOTE


In [14]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
print("Resampled class distribution:", Counter(y_resampled))

Resampled class distribution: Counter({0.0: 61263, 1.0: 61263, 2.0: 61263, 3.0: 61263})


# 7. Feature Scaling (Critical for Linear Models)


In [16]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_resampled = scaler.fit_transform(X_resampled)
X_test = scaler.transform(X_test)

In [17]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import (roc_auc_score, classification_report,
                             confusion_matrix, ConfusionMatrixDisplay)
import matplotlib.pyplot as plt

# Load and preprocess data
data = pd.read_csv('telematics_syn.csv')
data.dropna(inplace=True)
data['Car.age'] = data['Car.age'].clip(lower=0)

# Encode categorical variables
cat_cols = ['Insured.sex', 'Marital', 'Car.use', 'Region']
data = pd.get_dummies(data, columns=cat_cols, drop_first=True)

# Split data
X = data.drop(['NB_Claim', 'AMT_Claim'], axis=1)
y = data['NB_Claim'].astype(int)  # Convert to integer for XGBoost
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Apply SMOTE to training data only
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

# XGBoost Classifier with imbalance handling
model = XGBClassifier(
    scale_pos_weight=len(y_res[y_res==0])/len(y_res[y_res==1]),  # Auto-balance
    objective='binary:logistic',
    eval_metric='aucpr',  # Optimized for imbalanced data
    n_estimators=1000,
    early_stopping_rounds=50,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# Train with early stopping
model.fit(
    X_res, y_res,
    eval_set=[(X_test, y_test)],
    verbose=20
)

# Evaluate
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:,1]

print(f"\nAUC-ROC: {roc_auc_score(y_test, y_proba):.4f}")
print(classification_report(y_test, y_pred))

# Confusion Matrix
fig, ax = plt.subplots(figsize=(5,5))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax, cmap='Blues')
plt.show()

# Feature Importance
plt.figure(figsize=(12,8))
sorted_idx = model.feature_importances_.argsort()
plt.barh(X.columns[sorted_idx][-20:], model.feature_importances_[sorted_idx][-20:])
plt.xlabel("XGBoost Feature Importance")
plt.show()

Parameters: { "scale_pos_weight" } are not used.



[0]	validation_0-aucpr:0.40917
[20]	validation_0-aucpr:0.45391
[40]	validation_0-aucpr:0.50514
[60]	validation_0-aucpr:0.53104
[80]	validation_0-aucpr:0.55157
[100]	validation_0-aucpr:0.56602
[120]	validation_0-aucpr:0.58315
[140]	validation_0-aucpr:0.58947
[160]	validation_0-aucpr:0.59809
[180]	validation_0-aucpr:0.60449
[200]	validation_0-aucpr:0.61012
[220]	validation_0-aucpr:0.61413
[240]	validation_0-aucpr:0.61622
[260]	validation_0-aucpr:0.62062
[280]	validation_0-aucpr:0.62336
[300]	validation_0-aucpr:0.62524
[320]	validation_0-aucpr:0.62736
[340]	validation_0-aucpr:0.62884
[360]	validation_0-aucpr:0.62892
[380]	validation_0-aucpr:0.63021
[395]	validation_0-aucpr:0.63091


ValueError: multi_class must be in ('ovo', 'ovr')