<a href="https://colab.research.google.com/github/fjadidi2001/DataScienceJourney/blob/master/Insurance_TabNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    matthews_corrcoef,  # Added for Matthews Correlation
    roc_auc_score,      # Added for AUC score
    accuracy_score      # Added for test accuracy
)

In [2]:
# Step 1: Load the dataset
# Mount Google Drive so the CSV stored there is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Location of the telematics CSV inside the mounted Drive.
file_path = '/content/drive/My Drive/telematics_syn.csv'

# Read the CSV file (pandas is already imported as `pd` in the first cell,
# so it is not imported a second time here).
data = pd.read_csv(file_path)

# Step 2: Explore the data
print(data.head())

# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
data.info()

# Check for missing values
print(data.isnull().sum())

# Display basic statistics
print(data.describe())

Mounted at /content/drive
   Duration  Insured.age Insured.sex  Car.age  Marital  Car.use  Credit.score  \
0       366           45        Male       -1  Married  Commute         609.0   
1       182           44      Female        3  Married  Commute         575.0   
2       184           48      Female        6  Married  Commute         847.0   
3       183           71        Male        6  Married  Private         842.0   
4       183           84        Male       10  Married  Private         856.0   

  Region  Annual.miles.drive  Years.noclaims  ...  Left.turn.intensity10  \
0  Urban             6213.71              25  ...                    1.0   
1  Urban            12427.42              20  ...                   58.0   
2  Urban            12427.42              14  ...                    0.0   
3  Urban             6213.71              43  ...                    0.0   
4  Urban             6213.71              65  ...                    2.0   

   Left.turn.intensity11  Left

In [None]:
# Step 1: Load Data and Explore
import pandas as pd

# Placeholder location of the dataset; point this at the real CSV before running.
DATA_PATH = '/mnt/data/your_dataset.csv'
data = pd.read_csv(DATA_PATH)

# Display the first few rows
print(data.head())

# Step 2: Preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# Columns treated as categorical even if they are stored as numbers
# (assuming 'Region' is categorical, adapt for your data).
categorical_columns = ['Region', 'Territory']
# Any other text-typed column has to be encoded too; otherwise raw strings
# reach the models, which only accept numeric input.
categorical_columns += [
    col
    for col in data.select_dtypes(include=['object', 'category']).columns
    if col not in categorical_columns
]

# One-Hot Encoding for categorical variables.
# dtype=float keeps the indicator columns numeric instead of bool, so the
# feature matrix has a single numeric dtype when converted to an array.
data_encoded = pd.get_dummies(
    data, columns=categorical_columns, drop_first=True, dtype=float
)

# Show processed (encoded) data
print(data_encoded.head())

# Step 3: Feature Engineering
# Example: Combining harsh driving events into a single averaged score
harsh_event_columns = [
    'Accel.12miles',
    'Brake.14miles',
    'Left.turn.intensity12',
    'Right.turn.intensity12',
]
data_encoded['Harsh_Driving_Score'] = data_encoded[harsh_event_columns].mean(axis=1)

# Drop the individual columns after aggregation. Reassigning (rather than
# inplace=True) keeps the step explicit about which frame it produces.
data_encoded = data_encoded.drop(columns=harsh_event_columns)

# Show processed data with new features
print(data_encoded[['Harsh_Driving_Score']].head())

# Step 4: Data Splitting (Train/Test Split)
X = data_encoded.drop(columns=['AMT_Claim'])  # Drop target
y = data_encoded['AMT_Claim']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling happens AFTER the split: the scaler is fit on the training
# rows only and then applied to the test rows, so no test-set statistics leak
# into training.
scaler = StandardScaler()

# Example features to scale (replace with relevant columns)
columns_to_scale = ['Annual.miles.drive', 'Duration', 'Credit.score']

# Work on copies so the assignments below do not write into views of X.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

# Step 5: Data Balancing (if necessary)
# If 'AMT_Claim' is highly imbalanced, you can apply SMOTE or other techniques

# Step 6: Model Building (Combining DNN and TabNet)
from sklearn.model_selection import RandomizedSearchCV
import tensorflow as tf
from pytorch_tabnet.tab_model import TabNetRegressor

# Define a simple DNN model for regression
def create_dnn_model(input_dim):
    """Build and compile a small fully connected Keras network for regression.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A compiled ``tf.keras.Sequential`` model with two ReLU hidden layers
        (128 and 64 units) and a single linear output unit, trained with the
        Adam optimizer on mean squared error and reporting MAE and MSE.
    """
    network = tf.keras.Sequential()

    # Hidden layers; the first one also declares the expected input width.
    network.add(tf.keras.layers.Dense(128, activation='relu', input_shape=(input_dim,)))
    network.add(tf.keras.layers.Dense(64, activation='relu'))

    # Single unit with no activation: the regression output.
    network.add(tf.keras.layers.Dense(1))

    network.compile(
        optimizer='adam',
        loss='mean_squared_error',
        metrics=['mae', 'mse'],
    )
    return network

# TabNet model: hyperparameter search, DNN training, and evaluation.
# Imported locally so this section carries the names it newly relies on.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import ParameterSampler, train_test_split

# TabNet only accepts NumPy arrays and requires a 2-D regression target of
# shape (n_samples, n_outputs); Keras also trains on the same float arrays.
X_train_arr = X_train.to_numpy(dtype='float32')
X_test_arr = X_test.to_numpy(dtype='float32')
y_train_arr = y_train.to_numpy(dtype='float32').reshape(-1, 1)

# Hold out part of the TRAINING data for model selection and early stopping,
# so the test set is only touched once, for the final evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_arr, y_train_arr, test_size=0.2, random_state=42
)

# Example hyperparameters to tune (adjust as needed)
param_dist = {
    'max_epochs': [50, 100],
    'patience': [5, 10],
    'learning_rate': [0.001, 0.01],
}

# RandomizedSearchCV cannot drive this search: max_epochs and patience are
# arguments of TabNetRegressor.fit(), and the learning rate is configured via
# optimizer_params, so none of them are estimator parameters that
# set_params() accepts. Sample the same grid by hand instead.
tabnet = None
best_params = None
best_val_mse = float('inf')
for params in ParameterSampler(param_dist, n_iter=5, random_state=42):
    candidate = TabNetRegressor(
        optimizer_params={'lr': params['learning_rate']},
        seed=42,
        verbose=0,
    )
    candidate.fit(
        X_fit,
        y_fit,
        eval_set=[(X_val, y_val)],
        max_epochs=params['max_epochs'],
        patience=params['patience'],
    )
    val_mse = mean_squared_error(y_val, candidate.predict(X_val))
    if val_mse < best_val_mse:
        tabnet, best_params, best_val_mse = candidate, params, val_mse

print(f"Best TabNet params: {best_params} (validation MSE: {best_val_mse})")

# Train DNN on the same fit/validation partition as TabNet so the two models
# are compared on equal footing and the test set stays unseen during training.
dnn_model = create_dnn_model(X_train_arr.shape[1])
dnn_model.fit(X_fit, y_fit, validation_data=(X_val, y_val), epochs=50, batch_size=32)

# Step 7: Model Evaluation
# Evaluate both models (TabNet and DNN) using regression metrics


def report_regression_metrics(model_name, y_true, y_pred):
    """Print MAE, RMSE and R² for one model's predictions.

    Args:
        model_name: Label printed in the "<name> Results:" header line.
        y_true: Ground-truth target values.
        y_pred: Predicted values, one per entry of ``y_true``.
    """
    print(f"{model_name} Results:")
    print(f"MAE: {mean_absolute_error(y_true, y_pred)}")
    # Take the square root explicitly: the `squared=False` keyword of
    # mean_squared_error is deprecated and absent from recent scikit-learn.
    print(f"RMSE: {mean_squared_error(y_true, y_pred) ** 0.5}")
    print(f"R²: {r2_score(y_true, y_pred)}")


# Predictions (both models return shape (n_samples, 1); flatten to 1-D)
y_pred_tabnet = tabnet.predict(X_test_arr).ravel()
y_pred_dnn = dnn_model.predict(X_test_arr).ravel()

# Calculate evaluation metrics
report_regression_metrics("TabNet", y_test, y_pred_tabnet)

report_regression_metrics("DNN", y_test, y_pred_dnn)


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# Load data
data = pd.read_csv('/path_to_data.csv')  # Modify to your file path

# Check for missing values
print("Missing Values:\n", data.isnull().sum())

# Impute missing values (if any) - using median for numerical, mode for categorical.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `data[col]` operates on an intermediate object and is not guaranteed to
# update `data` itself (it does nothing under pandas copy-on-write).
for col in data.columns:
    if not data[col].isnull().any():
        continue  # nothing to impute in this column
    if data[col].dtype == 'object':
        modes = data[col].mode()
        # mode() is empty when every value is missing; leave such a column as is
        # instead of failing on modes[0].
        if not modes.empty:
            data[col] = data[col].fillna(modes.iloc[0])
    else:
        data[col] = data[col].fillna(data[col].median())

# Label encoding for categorical features.
# Start from the declared names, keep only those actually present, then add
# every remaining text column so no string feature is left unencoded.
declared_categoricals = ['Region', 'Territory', 'Car.usage', 'Marital.status']
missing_categoricals = [col for col in declared_categoricals if col not in data.columns]
if missing_categoricals:
    print("Skipping categorical columns not found in data:", missing_categoricals)

categorical_cols = [col for col in declared_categoricals if col in data.columns]
categorical_cols += [
    col
    for col in data.select_dtypes(include=['object']).columns
    if col not in categorical_cols
]

# One encoder per column, kept so the integer codes can be mapped back later.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Scale and normalize numerical data
scaler = StandardScaler()
minmax_scaler = MinMaxScaler()

numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()

# Exclude the target 'AMT_Claim' from scaling
numerical_cols.remove('AMT_Claim')

# Apply both Standard and MinMax scaling.
# NOTE(review): min-max rescaling is unaffected by a prior shift/scale of the
# same column, so the final values should match MinMaxScaler alone; the
# standardization pass is kept only to leave `scaler` fitted as before.
# NOTE(review): both scalers are fit on the full dataset; if a train/test
# split follows, fit them on the training rows only to avoid leakage.
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])
data[numerical_cols] = minmax_scaler.fit_transform(data[numerical_cols])

# Display first few rows to confirm
print("Preprocessed Data:\n", data.head())


In [None]:
# Feature engineering - Aggregating harsh driving events
# Resolve the matching columns once; they are needed for both the sum and the drop.
harsh_event_cols = data.filter(regex='Accel|Brake|Left.turn|Right.turn').columns

if len(harsh_event_cols) > 0:
    # Add the aggregated score, then drop the original harsh driving event columns.
    data = (
        data
        .assign(HarshDrivingScore=data[harsh_event_cols].sum(axis=1))
        .drop(columns=harsh_event_cols)
    )
elif 'HarshDrivingScore' not in data.columns:
    # Neither the source columns nor a previously computed score exist.
    raise KeyError(
        "No harsh driving event columns (Accel/Brake/Left.turn/Right.turn) found in data"
    )
# Otherwise the cell has already run: the source columns are gone and the
# score is present. Recomputing would sum zero columns and silently overwrite
# the score with 0, so the existing values are left untouched.

# Display to confirm feature engineering
print("Feature Engineered Data:\n", data[['HarshDrivingScore', 'AMT_Claim']].head())
