<a href="https://colab.research.google.com/github/fjadidi2001/DataScienceJourney/blob/master/Insurance_TabNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    matthews_corrcoef,  # Added for Matthews Correlation
    roc_auc_score,      # Added for AUC score
    accuracy_score      # Added for test accuracy
)
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import precision_score, recall_score, f1_score
from xgboost import XGBRegressor
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from pytorch_tabnet.tab_model import TabNetRegressor

In [28]:
# Step 1: Load df and Explore
# Mount Google Drive so the CSV stored there is reachable from the Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# Location of the telematics CSV inside the mounted Drive
file_path = '/content/drive/My Drive/telematics_syn.csv'

# pandas is already imported in the imports cell; repeated here so this cell
# also runs when executed on its own
import pandas as pd

# Read the CSV file
df = pd.read_csv(file_path)

print(df.head())
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()

# Check for missing values
print(df.isnull().sum())

# Display basic statistics
print(df.describe())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
   Duration  Insured.age Insured.sex  Car.age  Marital  Car.use  Credit.score  \
0       366           45        Male       -1  Married  Commute         609.0   
1       182           44      Female        3  Married  Commute         575.0   
2       184           48      Female        6  Married  Commute         847.0   
3       183           71        Male        6  Married  Private         842.0   
4       183           84        Male       10  Married  Private         856.0   

  Region  Annual.miles.drive  Years.noclaims  ...  Left.turn.intensity10  \
0  Urban             6213.71              25  ...                    1.0   
1  Urban            12427.42              20  ...                   58.0   
2  Urban            12427.42              14  ...                    0.0   
3  Urban             6213.71              43  ...                    0.0   
4  U

In [34]:
# Keep only the rows whose claim amount is strictly positive
df = df.loc[df['AMT_Claim'] > 0]

# Data Preprocessing
# Columns that are one-hot encoded further down
categorical_columns = ['Insured.sex', 'Marital', 'Car.use', 'Region']
# Everything that is neither categorical nor the target goes to the numeric pipeline
numerical_columns = [
    name for name in df.columns
    if name != 'AMT_Claim' and name not in categorical_columns
]

# Feature Engineering
def aggregate_harsh_driving(df):
    """Add row-wise totals of the harsh-driving event columns.

    For each column-name prefix below, every column whose name starts with
    that prefix is summed per row and stored in a new column:

        'Accel'      -> 'total_accel'
        'Brake'      -> 'total_brake'
        'Left.turn'  -> 'total_left_turn'
        'Right.turn' -> 'total_right_turn'

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; the totals are written to a copy so
        that passing a filtered slice neither mutates the caller's data nor
        triggers pandas' SettingWithCopyWarning.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the four ``total_*`` columns added.
    """
    # Work on a copy: the original assigned into the caller's frame directly.
    df = df.copy()

    prefix_to_total = {
        'Accel': 'total_accel',
        'Brake': 'total_brake',
        'Left.turn': 'total_left_turn',
        'Right.turn': 'total_right_turn',
    }
    for prefix, total_name in prefix_to_total.items():
        # The new total_* names do not start with any prefix, so totals added
        # earlier in this loop are never picked up by a later sum.
        event_columns = [col for col in df.columns if col.startswith(prefix)]
        df[total_name] = df[event_columns].sum(axis=1)
    return df

df = aggregate_harsh_driving(df)

# Rebuild the numeric column list now that the total_* features exist.
# The list built before feature engineering does not contain them, and
# ColumnTransformer drops every column that is not assigned to a transformer
# (remainder='drop' is the default), so without this step the engineered
# features would silently be excluded from the processed matrices.
numerical_columns = [col for col in df.columns if col not in categorical_columns + ['AMT_Claim']]

# Split the data
X = df.drop('AMT_Claim', axis=1)
y = df['AMT_Claim']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing
# Numeric columns: fill gaps with the column median, then standardise
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the literal 'missing', then one-hot encode;
# categories unseen during fit are ignored at transform time
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

# Fit the preprocessor on the training split only, then apply it to both splits
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Get the number of features after preprocessing
n_features = X_train_processed.shape[1]


In [37]:
 # Keep only the rows with a strictly positive claim amount
df = df.loc[df['AMT_Claim'] > 0]  # Consider only non-zero claims
print(f"Number of samples after removing zero claims: {len(df)}")

# Step 2: Define features and target
print("\nStep 2: Defining features and target")
categorical_columns = ['Insured.sex', 'Marital', 'Car.use', 'Region']
# Every column that is neither categorical nor the target is handled as numeric
numerical_columns = [
    name for name in df.columns
    if name != 'AMT_Claim' and name not in categorical_columns
]
y = df['AMT_Claim']
X = df.drop(columns=['AMT_Claim'])

# Step 3: Split the data (80/20, fixed seed)
print("\nStep 3: Splitting the data")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
print(f"Training set shape: {X_train.shape}, Test set shape: {X_test.shape}")

# Step 4: Create preprocessing pipeline
print("\nStep 4: Creating preprocessing pipeline")
# Numeric branch: median imputation followed by standard scaling
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: constant 'missing' imputation followed by one-hot encoding
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its branch
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_columns),
    ('cat', categorical_transformer, categorical_columns),
])

# Step 5: Fit preprocessor on the training split and transform both splits
print("\nStep 5: Fitting preprocessor and transforming data")
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
print(f"Processed training set shape: {X_train_processed.shape}")


Number of samples after removing zero claims: 3864

Step 2: Defining features and target

Step 3: Splitting the data
Training set shape: (3091, 55), Test set shape: (773, 55)

Step 4: Creating preprocessing pipeline

Step 5: Fitting preprocessor and transforming data
Processed training set shape: (3091, 61)
