<a href="https://colab.research.google.com/github/fjadidi2001/DataScienceJourney/blob/master/Insurance_TabNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    matthews_corrcoef,  # Added for Matthews Correlation
    roc_auc_score,      # Added for AUC score
    accuracy_score      # Added for test accuracy
)

In [28]:
# Step 1: Load df and Explore
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Specify file path
file_path = '/content/drive/My Drive/telematics_syn.csv'

# Import pandas (assuming you want to use it to read the CSV)
import pandas as pd

# Read the CSV file
df = pd.read_csv(file_path)

print(df.head())
print(df.info())

# Check for missing values
print(df.isnull().sum())

# Display basic statistics
print(df.describe())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
   Duration  Insured.age Insured.sex  Car.age  Marital  Car.use  Credit.score  \
0       366           45        Male       -1  Married  Commute         609.0   
1       182           44      Female        3  Married  Commute         575.0   
2       184           48      Female        6  Married  Commute         847.0   
3       183           71        Male        6  Married  Private         842.0   
4       183           84        Male       10  Married  Private         856.0   

  Region  Annual.miles.drive  Years.noclaims  ...  Left.turn.intensity10  \
0  Urban             6213.71              25  ...                    1.0   
1  Urban            12427.42              20  ...                   58.0   
2  Urban            12427.42              14  ...                    0.0   
3  Urban             6213.71              43  ...                    0.0   
4  U

In [29]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from pytorch_tabnet.tab_model import TabNetRegressor


# Data Preprocessing
# 1. Identify categorical and numerical columns
categorical_columns = ['Insured.sex', 'Marital', 'Car.use', 'Region']
numerical_columns = [col for col in df.columns if col not in categorical_columns + ['AMT_Claim']]

# 2. Create preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

# 3. Feature Engineering
def aggregate_harsh_driving(df):
    df['total_accel'] = df[[col for col in df.columns if col.startswith('Accel')]].sum(axis=1)
    df['total_brake'] = df[[col for col in df.columns if col.startswith('Brake')]].sum(axis=1)
    df['total_left_turn'] = df[[col for col in df.columns if col.startswith('Left.turn')]].sum(axis=1)
    df['total_right_turn'] = df[[col for col in df.columns if col.startswith('Right.turn')]].sum(axis=1)
    return df

df = aggregate_harsh_driving(df)

# 4. Split the data
X = df.drop('AMT_Claim', axis=1)
y = df['AMT_Claim']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Define the FJ model (DNN + TabNet + XGBoost)
class FJModel:
    def __init__(self, input_dim):
        self.dnn = self._build_dnn(input_dim)
        self.tabnet = TabNetRegressor()
        self.xgb = XGBRegressor(random_state=42)

    def _build_dnn(self, input_dim):
        inputs = Input(shape=(input_dim,))
        x = Dense(64, activation='relu')(inputs)
        x = BatchNormalization()(x)
        x = Dropout(0.3)(x)
        x = Dense(32, activation='relu')(x)
        x = BatchNormalization()(x)
        x = Dropout(0.3)(x)
        outputs = Dense(1)(x)
        model = Model(inputs=inputs, outputs=outputs)
        model.compile(optimizer=Adam(), loss='mse')
        return model

    def fit(self, X, y):
        self.dnn.fit(X, y, epochs=50, batch_size=32, verbose=0)
        self.tabnet.fit(X.to_numpy(), y.to_numpy())
        self.xgb.fit(X, y)

    def predict(self, X):
        dnn_pred = self.dnn.predict(X).flatten()
        tabnet_pred = self.tabnet.predict(X.to_numpy()).flatten()
        xgb_pred = self.xgb.predict(X)
        return (dnn_pred + tabnet_pred + xgb_pred) / 3

# 6. Train and evaluate the model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', FJModel(len(X_train.columns)))
])

pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared Score: {r2}")

# 7. Feature Importance (using XGBoost as a proxy)
feature_importance = pipeline.named_steps['regressor'].xgb.feature_importances_
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()

importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
importance_df = importance_df.sort_values('importance', ascending=False)

print("Top 10 Most Important Features:")
print(importance_df.head(10))



ValueError: Input 0 of layer "functional_5" is incompatible with the layer: expected shape=(None, 55), found shape=(32, 57)