<a href="https://colab.research.google.com/github/fjadidi2001/DataScienceJourney/blob/master/telematics_syn_V11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install pytorch_tabnet

Collecting pytorch_tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.3->pytorch_tabnet)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.3->pytorch_tabnet)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.3->pytorch_tabnet)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.3->pytorch_tabnet)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.3->pytorch_tabnet)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from tor

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import accuracy_score, f1_score
from collections import Counter
import torch

# Load the dataset from Google Drive
file_path = '/content/drive/My Drive/telematics_syn.csv'
df = pd.read_csv(file_path)

In [None]:
# Create a binary 'Risk_Category' label
df['Risk_Category'] = np.where(df['NB_Claim'] == 0, 'Low Risk', 'High Risk')

# Save the updated dataset
df.to_csv('telematics_syn_updated.csv', index=False)

# Load the updated dataset
data = pd.read_csv('telematics_syn_updated.csv')

# Separate features and target variable
X = data.drop('Risk_Category', axis=1)
y = data['Risk_Category']

# Identify columns with non-numerical data
categorical_cols = X.select_dtypes(include=['object']).columns

# Apply label encoding to categorical features
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le  # Store the encoders for later use if needed

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Convert resampled data to a DataFrame
balanced_data = pd.concat([pd.DataFrame(X_resampled, columns=X.columns),
                           pd.DataFrame(y_resampled, columns=['Risk_Category'])], axis=1)

# Check the distribution of the target variable
print(Counter(balanced_data['Risk_Category']))

# Identify categorical and numerical columns for preprocessing
categorical_cols = ['Insured.sex', 'Marital', 'Car.use', 'Region', 'Territory']
numerical_cols = [col for col in balanced_data.columns if col not in categorical_cols + ['Risk_Category']]

# Preprocess the data: one-hot encode categorical variables and normalize numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ])

# Separate features and labels
X = balanced_data.drop('Risk_Category', axis=1)
y = balanced_data['Risk_Category']

# Apply preprocessing
X_preprocessed = preprocessor.fit_transform(X)

# Encode the target variable
le = LabelEncoder()
y = le.fit_transform(y)

# Split the preprocessed data into training, validation, and testing sets
X_train_processed, X_temp_processed, y_train, y_temp = train_test_split(
    X_preprocessed, y, test_size=0.4, random_state=42, stratify=y
)
X_val_processed, X_test_processed, y_val, y_test = train_test_split(
    X_temp_processed, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)
# Check the shape of the splits to ensure they are correct
print(f'Training set shape: {X_train.shape}, {y_train.shape}')
print(f'Validation set shape: {X_val.shape}, {y_val.shape}')
print(f'Testing set shape: {X_test.shape}, {y_test.shape}')

# Prepare categorical indices and dimensions for TabNet
categorical_columns_indices = [X.columns.get_loc(col) for col in categorical_cols]
categorical_dimensions = [len(X[col].unique()) for col in categorical_cols]

# Initialize TabNet model
tabnet_model = TabNetClassifier(
    n_d=64, n_a=64, n_steps=5,
    gamma=1.5, n_independent=2, n_shared=2,
    cat_idxs=categorical_columns_indices,
    cat_dims=categorical_dimensions,
    cat_emb_dim=8,   # or any other appropriate embedding dimension
    lambda_sparse=1e-3, optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type="entmax",
    scheduler_params=dict(mode="min", patience=5, min_lr=1e-5, factor=0.9),
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    seed=42,
    verbose=10
)

# Train the model
tabnet_model.fit(
    X_train=X_train_processed, y_train=y_train,  # Use the preprocessed training data
    eval_set=[(X_train_processed, y_train), (X_val_processed, y_val)],
    eval_name=['train', 'valid'],
    eval_metric=['accuracy'],
    max_epochs=100,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)

# Make predictions on the test set
y_pred = tabnet_model.predict(X_test_processed)

# Calculate accuracy and F1 score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test F1 Score: {f1:.4f}")

# Feature importance
feature_importances = tabnet_model.feature_importances_
feature_names = balanced_data.drop('Risk_Category', axis=1).columns
importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
importance_df = importance_df.sort_values('importance', ascending=False).head(10)

print("Top 10 Important Features:")
print(importance_df)

# Save the model
saving_path_name = "tabnet_model"
tabnet_model.save_model(saving_path_name)
print(f"Model saved to: {saving_path_name}.zip")


Counter({'High Risk': 95728, 'Low Risk': 95728})
Training set shape: (114873, 138), (114873,)
Validation set shape: (38291, 138), (38291,)
Testing set shape: (38292, 138), (38292,)




IndexError: index out of range in self