<a href="https://colab.research.google.com/github/fjadidi2001/DataScienceJourney/blob/master/Insurance_TabNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    matthews_corrcoef,  # Added for Matthews Correlation
    roc_auc_score,      # Added for AUC score
    accuracy_score      # Added for test accuracy
)

In [3]:
# Step 1: Load Data and Explore
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Specify file path
file_path = '/content/drive/My Drive/telematics_syn.csv'

# Import pandas (assuming you want to use it to read the CSV)
import pandas as pd

# Read the CSV file
data = pd.read_csv(file_path)

print(data.head())
print(data.info())

# Check for missing values
print(data.isnull().sum())

# Display basic statistics
print(data.describe())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
   Duration  Insured.age Insured.sex  Car.age  Marital  Car.use  Credit.score  \
0       366           45        Male       -1  Married  Commute         609.0   
1       182           44      Female        3  Married  Commute         575.0   
2       184           48      Female        6  Married  Commute         847.0   
3       183           71        Male        6  Married  Private         842.0   
4       183           84        Male       10  Married  Private         856.0   

  Region  Annual.miles.drive  Years.noclaims  ...  Left.turn.intensity10  \
0  Urban             6213.71              25  ...                    1.0   
1  Urban            12427.42              20  ...                   58.0   
2  Urban            12427.42              14  ...                    0.0   
3  Urban             6213.71              43  ...                    0.0   
4  U

In [8]:

# Check for missing values
print("Missing Values:\n", data.isnull().sum())

# Impute missing values (if any) - using median for numerical, mode for categorical
for col in data.columns:
    if data[col].dtype == 'object':
        data[col].fillna(data[col].mode()[0], inplace=True)
    else:
        data[col].fillna(data[col].median(), inplace=True)

# Label encoding for categorical features
categorical_cols = ['Insured.sex', 'Marital', 'Car.use', 'Region']
le = LabelEncoder()
for col in categorical_cols:
    data[col] = le.fit_transform(data[col])

# Scale and normalize numerical data
scaler = StandardScaler()
minmax_scaler = MinMaxScaler()

numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()

# Exclude the target 'AMT_Claim' from scaling
numerical_cols.remove('AMT_Claim')

# Apply both Standard and MinMax scaling
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])
data[numerical_cols] = minmax_scaler.fit_transform(data[numerical_cols])

# Display first few rows to confirm
print("Preprocessed Data:\n", data.head())


Missing Values:
 Duration                  0
Insured.age               0
Insured.sex               0
Car.age                   0
Marital                   0
Car.use                   0
Credit.score              0
Region                    0
Annual.miles.drive        0
Years.noclaims            0
Territory                 0
Annual.pct.driven         0
Total.miles.driven        0
Pct.drive.mon             0
Pct.drive.tue             0
Pct.drive.wed             0
Pct.drive.thr             0
Pct.drive.fri             0
Pct.drive.sat             0
Pct.drive.sun             0
Pct.drive.2hrs            0
Pct.drive.3hrs            0
Pct.drive.4hrs            0
Pct.drive.wkday           0
Pct.drive.wkend           0
Pct.drive.rush am         0
Pct.drive.rush pm         0
Avgdays.week              0
Accel.06miles             0
Accel.08miles             0
Accel.09miles             0
Accel.11miles             0
Accel.12miles             0
Accel.14miles             0
Brake.06miles             0
Bra

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)


Preprocessed Data:
    Duration  Insured.age  Insured.sex   Car.age  Marital   Car.use  \
0  1.000000     0.333333          1.0  0.045455      0.0  0.333333   
1  0.457227     0.321839          0.0  0.227273      0.0  0.333333   
2  0.463127     0.367816          0.0  0.363636      0.0  0.333333   
3  0.460177     0.632184          1.0  0.363636      0.0  1.000000   
4  0.460177     0.781609          1.0  0.545455      0.0  1.000000   

   Credit.score  Region  Annual.miles.drive  Years.noclaims  ...  \
0      0.391213     1.0            0.109529        0.316456  ...   
1      0.320084     1.0            0.219058        0.253165  ...   
2      0.889121     1.0            0.219058        0.177215  ...   
3      0.878661     1.0            0.109529        0.544304  ...   
4      0.907950     1.0            0.109529        0.822785  ...   

   Left.turn.intensity10  Left.turn.intensity11  Left.turn.intensity12  \
0               0.000001                0.00000               0.000000   
1 

In [9]:
# Feature engineering - Aggregating harsh driving events
data['HarshDrivingScore'] = data.filter(regex='Accel|Brake|Left.turn|Right.turn').sum(axis=1)

# Drop original harsh driving event columns
data.drop(data.filter(regex='Accel|Brake|Left.turn|Right.turn').columns, axis=1, inplace=True)

# Display to confirm feature engineering
print("Feature Engineered Data:\n", data[['HarshDrivingScore', 'AMT_Claim']].head())


Feature Engineered Data:
    HarshDrivingScore    AMT_Claim
0           0.220631  5100.171753
1           0.283649   883.554840
2           0.508857     0.000000
3           0.177134     0.000000
4           0.171370     0.000000


In [10]:
# Define target and features
X = data.drop(columns='AMT_Claim')
y = data['AMT_Claim']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of train/test splits
print(f"Train Shape: {X_train.shape}, Test Shape: {X_test.shape}")


Train Shape: (80000, 30), Test Shape: (20000, 30)
