<a href="https://colab.research.google.com/github/fjadidi2001/Insurance/blob/main/Risk_Category_TabNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# Mount Google Drive
from google.colab import drive

drive.mount('/content/drive')

# Specify file path

file_path = '/content/drive/My Drive/telematics_syn.csv'

# Import pandas (assuming you want to use it to read the CSV)
import pandas as pd

# Read the CSV file
df = pd.read_csv(file_path)
print(df.shape)  # Should print (100000, 52)
print(df.head()) # To check the first few rows

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
(100000, 52)
   Duration  Insured.age Insured.sex  Car.age  Marital  Car.use  Credit.score  \
0       366           45        Male       -1  Married  Commute         609.0   
1       182           44      Female        3  Married  Commute         575.0   
2       184           48      Female        6  Married  Commute         847.0   
3       183           71        Male        6  Married  Private         842.0   
4       183           84        Male       10  Married  Private         856.0   

  Region  Annual.miles.drive  Years.noclaims  ...  Left.turn.intensity10  \
0  Urban             6213.71              25  ...                    1.0   
1  Urban            12427.42              20  ...                   58.0   
2  Urban            12427.42              14  ...                    0.0   
3  Urban             6213.71              43  ...                  

In [10]:
!pip install pytorch-tabnet matplotlib scikit-learn



In [11]:
# Import necessary libraries
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score
from pytorch_tabnet.tab_model import TabNetClassifier
import torch

# Create a binary Risk Category label based on specified conditions
df['Risk_Category'] = np.where((df['NB_Claim'] >= 1) & (df['AMT_Claim'] > 1000), 1, 0)

# Drop NB_Claim and AMT_Claim as they are labels and should not be part of features
df = df.drop(columns=['NB_Claim', 'AMT_Claim'])

# Check the distribution of the new target variable
print(df['Risk_Category'].value_counts())

# Identify categorical columns and apply label encoding
categorical_cols = df.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Separate features and target variable
X = df.drop('Risk_Category', axis=1)
y = df['Risk_Category']

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Convert resampled data to a DataFrame
balanced_data = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.DataFrame(y_resampled, columns=['Risk_Category'])], axis=1)

# Identify categorical and numerical columns
categorical_cols = ['Insured.sex', 'Marital', 'Car.use', 'Region', 'Territory']
numerical_cols = [col for col in balanced_data.columns if col not in categorical_cols + ['Risk_Category']]

# Preprocess the data: one-hot encode categorical variables and normalize numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Apply preprocessing
X_preprocessed = preprocessor.fit_transform(balanced_data.drop('Risk_Category', axis=1))

# Split the data into training, validation, and testing sets (60% train, 20% validation, 20% test)
X_train, X_temp, y_train, y_temp = train_test_split(X_preprocessed, balanced_data['Risk_Category'], test_size=0.4, random_state=42, stratify=balanced_data['Risk_Category'])
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Initialize the TabNet model with reduced epochs and learning rate to prevent overfitting
tabnet_model = TabNetClassifier(
    n_d=32, n_a=32, n_steps=5,
    gamma=1.5, n_independent=2, n_shared=2,
    cat_idxs=[], cat_dims=[], cat_emb_dim=[],
    lambda_sparse=1e-3, optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=1e-3),
    mask_type="entmax",
    scheduler_params=dict(mode="min", patience=5, min_lr=1e-5, factor=0.9),
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    seed=42,
    verbose=10
)

# Train the model with early stopping
tabnet_model.fit(
    X_train=X_train, y_train=y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    eval_name=['train', 'valid'],
    eval_metric=['accuracy'],
    max_epochs=10,  # Reduced to prevent overfitting
    patience=5,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)

# Evaluate the model on the test set
y_pred = tabnet_model.predict(X_test)

# Calculate accuracy and F1 score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test F1 Score: {f1:.4f}")

# Feature importance
feature_importances = tabnet_model.feature_importances_
feature_names = preprocessor.get_feature_names_out()
importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
importance_df = importance_df.sort_values('importance', ascending=False).head(10)
print("Top 10 Important Features:")
print(importance_df)

# Save the model
saving_path_name = "tabnet_model"
saved_filepath = tabnet_model.save_model(saving_path_name)
print(f"Model saved to: {saved_filepath}")


Risk_Category
0    97302
1     2698
Name: count, dtype: int64




epoch 0  | loss: 0.89424 | train_accuracy: 0.62311 | valid_accuracy: 0.62599 |  0:00:28s
Stop training because you reached max_epochs = 10 with best_epoch = 9 and best_valid_accuracy = 0.75846




Test Accuracy: 0.7544
Test F1 Score: 0.7525
Top 10 Important Features:
                    feature  importance
6    num__Annual.pct.driven    0.199205
51           cat__Car.use_2    0.128139
3         num__Credit.score    0.121263
0             num__Duration    0.024281
102       cat__Territory_58    0.021519
77        cat__Territory_33    0.018302
135       cat__Territory_91    0.015445
125       cat__Territory_81    0.014457
55        cat__Territory_11    0.014075
68        cat__Territory_24    0.012182
Successfully saved model at tabnet_model.zip
Model saved to: tabnet_model.zip
