<a href="https://colab.research.google.com/github/fjadidi2001/DataScienceJourney/blob/master/telematics_syn_V10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive
from google.colab import drive

drive.mount('/content/drive')

# Specify file path

file_path = '/content/drive/My Drive/telematics_syn.csv'

# Import pandas (assuming you want to use it to read the CSV)
import pandas as pd

# Read the CSV file
df = pd.read_csv(file_path)
print(df.shape)  # Should print (100000, 52)
print(df.head()) # To check the first few rows

Mounted at /content/drive
(100000, 52)
   Duration  Insured.age Insured.sex  Car.age  Marital  Car.use  Credit.score  \
0       366           45        Male       -1  Married  Commute         609.0   
1       182           44      Female        3  Married  Commute         575.0   
2       184           48      Female        6  Married  Commute         847.0   
3       183           71        Male        6  Married  Private         842.0   
4       183           84        Male       10  Married  Private         856.0   

  Region  Annual.miles.drive  Years.noclaims  ...  Left.turn.intensity10  \
0  Urban             6213.71              25  ...                    1.0   
1  Urban            12427.42              20  ...                   58.0   
2  Urban            12427.42              14  ...                    0.0   
3  Urban             6213.71              43  ...                    0.0   
4  Urban             6213.71              65  ...                    2.0   

   Left.turn.inte

In [2]:
!pip install pytorch-tabnet matplotlib scikit-learn

Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.3->pytorch-tabnet)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.3->pytorch-tabnet)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.3->pytorch-tabnet)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.3->pytorch-tabnet)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.3->pytorch-tabnet)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from tor

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score
from pytorch_tabnet.tab_model import TabNetClassifier
import torch


# Create a binary Risk Category label based on specified conditions
df['Risk_Category'] = np.where((df['NB_Claim'] > 1) & (df['AMT_Claim'] > 1000), 1, 0)

# Drop NB_Claim and AMT_Claim as they are labels and should not be part of features
df = df.drop(columns=['NB_Claim', 'AMT_Claim'])

# Check the distribution of the new target variable
print(df['Risk_Category'].value_counts())

# Identify categorical columns and apply label encoding
categorical_cols = df.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Separate features and target variable
X = df.drop('Risk_Category', axis=1)
y = df['Risk_Category']

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Convert resampled data to a DataFrame
balanced_data = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.DataFrame(y_resampled, columns=['Risk_Category'])], axis=1)

# Identify categorical and numerical columns
categorical_cols = ['Insured.sex', 'Marital', 'Car.use', 'Region', 'Territory']
numerical_cols = [col for col in balanced_data.columns if col not in categorical_cols + ['Risk_Category']]

# Preprocess the data: one-hot encode categorical variables and normalize numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ])

# Separate features and labels again after resampling
X = balanced_data.drop('Risk_Category', axis=1)
y = balanced_data['Risk_Category']

# Apply preprocessing
X_preprocessed = preprocessor.fit_transform(X)

# Split the data into training, validation, and testing sets (60% train, 20% validation, 20% test)
X_train, X_temp, y_train, y_temp = train_test_split(X_preprocessed, y, test_size=0.4, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Initialize the TabNet model
tabnet_model = TabNetClassifier(
    n_d=64, n_a=64, n_steps=5,
    gamma=1.5, n_independent=2, n_shared=2,
    cat_idxs=[], cat_dims=[], cat_emb_dim=[],
    lambda_sparse=1e-3, optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type="entmax",
    scheduler_params=dict(mode="min", patience=5, min_lr=1e-5, factor=0.9),
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    seed=42,
    verbose=10
)

# Train the model
tabnet_model.fit(
    X_train=X_train, y_train=y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    eval_name=['train', 'valid'],
    eval_metric=['accuracy'],
    max_epochs=100,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)

# Evaluate the model on the test set
y_pred = tabnet_model.predict(X_test)

# Calculate accuracy and F1 score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test F1 Score: {f1:.4f}")

# Feature importance
feature_importances = tabnet_model.feature_importances_
feature_names = X.columns
importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
importance_df = importance_df.sort_values('importance', ascending=False).head(10)
print("Top 10 Important Features:")
print(importance_df)

# Save the model
saving_path_name = "tabnet_model"
saved_filepath = tabnet_model.save_model(saving_path_name)
print(f"Model saved to: {saved_filepath}")

Risk_Category
0    99822
1      178
Name: count, dtype: int64




epoch 0  | loss: 0.43714 | train_accuracy: 0.90165 | valid_accuracy: 0.90285 |  0:01:04s
epoch 10 | loss: 0.01485 | train_accuracy: 0.99776 | valid_accuracy: 0.99669 |  0:11:59s
epoch 20 | loss: 0.00307 | train_accuracy: 0.99967 | valid_accuracy: 0.99837 |  0:22:55s

Early stopping occurred at epoch 29 with best_epoch = 19 and best_valid_accuracy = 0.99882




Test Accuracy: 0.9987
Test F1 Score: 0.9987


ValueError: All arrays must be of the same length

In [None]:
import pandas as pd
import numpy as np

# Create a binary Risk Category label
df['Risk_Category'] = np.where(df['NB_Claim'] == 0, 'Low Risk', 'High Risk')

# Save the updated dataset
df.to_csv('telematics_syn_updated.csv', index=False)

# Display the first few rows to verify the new column
print(df.head())

   Duration  Insured.age Insured.sex  Car.age  Marital  Car.use  Credit.score  \
0       366           45        Male       -1  Married  Commute         609.0   
1       182           44      Female        3  Married  Commute         575.0   
2       184           48      Female        6  Married  Commute         847.0   
3       183           71        Male        6  Married  Private         842.0   
4       183           84        Male       10  Married  Private         856.0   

  Region  Annual.miles.drive  Years.noclaims  ...  Left.turn.intensity11  \
0  Urban             6213.71              25  ...                    0.0   
1  Urban            12427.42              20  ...                   24.0   
2  Urban            12427.42              14  ...                    0.0   
3  Urban             6213.71              43  ...                    0.0   
4  Urban             6213.71              65  ...                    0.0   

   Left.turn.intensity12  Right.turn.intensity08  Right.

In [None]:
# Count the occurrences of 1's and 0's in Risk_Category
claim_yn_counts = df['Risk_Category'].value_counts()

print(claim_yn_counts)

Risk_Category
Low Risk     95728
High Risk     4272
Name: count, dtype: int64


In [3]:
!pip install imbalanced-learn



In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.preprocessing import LabelEncoder

# Load your dataset
data = pd.read_csv('telematics_syn_updated.csv')

# Separate features and target variable
X = data.drop('Risk_Category', axis=1)
y = data['Risk_Category']

# Identify columns with non-numerical data
categorical_cols = X.select_dtypes(include=['object']).columns

# Apply label encoding to categorical features
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le  # Store the encoders for later use if needed

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Convert resampled data to a DataFrame
balanced_data = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.DataFrame(y_resampled, columns=['Risk_Category'])], axis=1)

# Check the distribution of the target variable
print(Counter(balanced_data['Risk_Category']))

# Save the balanced dataset
balanced_data.to_csv('balanced_dataset.csv', index=False)

Counter({'High Risk': 95728, 'Low Risk': 95728})


In [None]:
df = pd.read_csv('balanced_dataset.csv')
df.head()

Unnamed: 0,Duration,Insured.age,Insured.sex,Car.age,Marital,Car.use,Credit.score,Region,Annual.miles.drive,Years.noclaims,...,Left.turn.intensity11,Left.turn.intensity12,Right.turn.intensity08,Right.turn.intensity09,Right.turn.intensity10,Right.turn.intensity11,Right.turn.intensity12,NB_Claim,AMT_Claim,Risk_Category
0,366,45,1,-1,0,1,609.0,1,6213.71,25,...,0.0,0.0,3.0,1.0,0.0,0.0,0.0,1,5100.171753,High Risk
1,182,44,0,3,0,1,575.0,1,12427.42,20,...,24.0,11.0,1099.0,615.0,219.0,101.0,40.0,1,883.55484,High Risk
2,184,48,0,6,0,1,847.0,1,12427.42,14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,Low Risk
3,183,71,1,6,0,3,842.0,1,6213.71,43,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,Low Risk
4,183,84,1,10,0,3,856.0,1,6213.71,65,...,0.0,0.0,325.0,111.0,18.0,4.0,2.0,0,0.0,Low Risk


In [None]:
# Count the occurrences of 1's and 0's in Risk_Category
claim_yn_counts = df['Risk_Category'].value_counts()

print(claim_yn_counts)


Risk_Category
High Risk    95728
Low Risk     95728
Name: count, dtype: int64


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the balanced dataset
balanced_data = pd.read_csv('balanced_dataset.csv')

# Identify categorical and numerical columns
categorical_cols = ['Insured.sex', 'Marital', 'Car.use', 'Region', 'Territory']
numerical_cols = [col for col in balanced_data.columns if col not in categorical_cols + ['Risk_Category']]

# Preprocess the data: one-hot encode categorical variables and normalize numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ])

# Separate features and labels
X = balanced_data.drop('Risk_Category', axis=1)
y = balanced_data['Risk_Category']

# Apply preprocessing
X_preprocessed = preprocessor.fit_transform(X)

# Split the data into training, validation, and testing sets (60% train, 20% validation, 20% test)
X_train, X_temp, y_train, y_temp = train_test_split(X_preprocessed, y, test_size=0.4, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Check the shape of the splits to ensure they are correct
print(f'Training set shape: {X_train.shape}, {y_train.shape}')
print(f'Validation set shape: {X_val.shape}, {y_val.shape}')
print(f'Testing set shape: {X_test.shape}, {y_test.shape}')

# Save the preprocessed data to CSV files if needed
train_data = pd.DataFrame(X_train, columns=preprocessor.get_feature_names_out())
train_data['Risk_Category'] = y_train.values
train_data.to_csv('train_data.csv', index=False)

val_data = pd.DataFrame(X_val, columns=preprocessor.get_feature_names_out())
val_data['Risk_Category'] = y_val.values
val_data.to_csv('val_data.csv', index=False)

test_data = pd.DataFrame(X_test, columns=preprocessor.get_feature_names_out())
test_data['Risk_Category'] = y_test.values
test_data.to_csv('test_data.csv', index=False)


Training set shape: (114873, 138), (114873,)
Validation set shape: (38291, 138), (38291,)
Testing set shape: (38292, 138), (38292,)


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from pytorch_tabnet.tab_model import TabNetClassifier
import torch

# Load the preprocessed data
train_data = pd.read_csv('train_data.csv')
val_data = pd.read_csv('val_data.csv')
test_data = pd.read_csv('test_data.csv')

# Separate features and labels
X_train = train_data.drop('Risk_Category', axis=1).values
y_train = train_data['Risk_Category'].values
X_val = val_data.drop('Risk_Category', axis=1).values
y_val = val_data['Risk_Category'].values
X_test = test_data.drop('Risk_Category', axis=1).values
y_test = test_data['Risk_Category'].values

# Identify categorical columns indices and dimensions (adjust based on your actual data)
categorical_cols = ['Insured.sex', 'Marital', 'Car.use', 'Region', 'Territory']  # Replace with your actual categorical columns
categorical_columns_indices = [train_data.columns.get_loc(col) for col in categorical_cols if col in train_data.columns]
categorical_dimensions = [len(train_data[col].unique()) for col in categorical_cols if col in train_data.columns]

# Ensure categorical features are integers after one-hot encoding
X_train[:, categorical_columns_indices] = X_train[:, categorical_columns_indices].astype(int)
X_val[:, categorical_columns_indices] = X_val[:, categorical_columns_indices].astype(int)
X_test[:, categorical_columns_indices] = X_test[:, categorical_columns_indices].astype(int)

tabnet_model = TabNetClassifier(
    n_d=64, n_a=64, n_steps=5,
    gamma=1.5, n_independent=2, n_shared=2,
    cat_idxs=categorical_columns_indices,
    cat_dims=categorical_dimensions,
    cat_emb_dim=1,  # or any other appropriate embedding dimension
    lambda_sparse=1e-3, optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type="entmax",
    scheduler_params=dict(mode="min", patience=5, min_lr=1e-5, factor=0.9),
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    seed=42,
    verbose=10
)


# Train the model
tabnet_model.fit(
    X_train=X_train, y_train=y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    eval_name=['train', 'valid'],
    eval_metric=['accuracy'],
    max_epochs=100,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)

# Make predictions on the test set
y_pred = tabnet_model.predict(X_test)

# Calculate accuracy and F1 score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test F1 Score: {f1:.4f}")

# Feature importance
feature_importances = tabnet_model.feature_importances_
feature_names = train_data.drop('Risk_Category', axis=1).columns
importance_df = pd.DataFrame({'feature': feature_names, })

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import accuracy_score, f1_score
from collections import Counter
import torch

# Load the dataset from Google Drive
file_path = '/content/drive/My Drive/telematics_syn.csv'
df = pd.read_csv(file_path)

# Create a binary 'Risk_Category' label
df['Risk_Category'] = np.where(df['NB_Claim'] == 0, 'Low Risk', 'High Risk')

# Save the updated dataset
df.to_csv('telematics_syn_updated.csv', index=False)

# Load the updated dataset
data = pd.read_csv('telematics_syn_updated.csv')

# Separate features and target variable
X = data.drop('Risk_Category', axis=1)
y = data['Risk_Category']

# Identify columns with non-numerical data
categorical_cols = X.select_dtypes(include=['object']).columns

# Apply label encoding to categorical features
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le  # Store the encoders for later use if needed

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Convert resampled data to a DataFrame
balanced_data = pd.concat([pd.DataFrame(X_resampled, columns=X.columns),
                           pd.DataFrame(y_resampled, columns=['Risk_Category'])], axis=1)

# Check the distribution of the target variable
print(Counter(balanced_data['Risk_Category']))

# Identify categorical and numerical columns for preprocessing
categorical_cols = ['Insured.sex', 'Marital', 'Car.use', 'Region', 'Territory']
numerical_cols = [col for col in balanced_data.columns if col not in categorical_cols + ['Risk_Category']]

# Preprocess the data: one-hot encode categorical variables and normalize numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ])

# Separate features and labels
X = balanced_data.drop('Risk_Category', axis=1)
y = balanced_data['Risk_Category']

# Apply preprocessing
X_preprocessed = preprocessor.fit_transform(X)

# Encode the target variable
le = LabelEncoder()
y = le.fit_transform(y)

# Split the data into training, validation, and testing sets (60% train, 20% validation, 20% test)
X_train, X_temp, y_train, y_temp = train_test_split(X_preprocessed, y, test_size=0.4, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Check the shape of the splits to ensure they are correct
print(f'Training set shape: {X_train.shape}, {y_train.shape}')
print(f'Validation set shape: {X_val.shape}, {y_val.shape}')
print(f'Testing set shape: {X_test.shape}, {y_test.shape}')

# Prepare categorical indices and dimensions for TabNet
categorical_columns_indices = [X.columns.get_loc(col) for col in categorical_cols]
categorical_dimensions = [len(X[col].unique()) for col in categorical_cols]

# Initialize TabNet model
tabnet_model = TabNetClassifier(
    n_d=64, n_a=64, n_steps=5,
    gamma=1.5, n_independent=2, n_shared=2,
    cat_idxs=categorical_columns_indices,
    cat_dims=categorical_dimensions,
    cat_emb_dim=1,  # or any other appropriate embedding dimension
    lambda_sparse=1e-3, optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type="entmax",
    scheduler_params=dict(mode="min", patience=5, min_lr=1e-5, factor=0.9),
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    seed=42,
    verbose=10
)

# Train the model
tabnet_model.fit(
    X_train=X_train, y_train=y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    eval_name=['train', 'valid'],
    eval_metric=['accuracy'],
    max_epochs=100,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)

# Make predictions on the test set
y_pred = tabnet_model.predict(X_test)

# Calculate accuracy and F1 score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test F1 Score: {f1:.4f}")

# Feature importance
feature_importances = tabnet_model.feature_importances_
feature_names = balanced_data.drop('Risk_Category', axis=1).columns
importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
importance_df = importance_df.sort_values('importance', ascending=False).head(10)

print("Top 10 Important Features:")
print(importance_df)

# Save the model
saving_path_name = "tabnet_model"
tabnet_model.save_model(saving_path_name)
print(f"Model saved to: {saving_path_name}.zip")


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, f1_score
from pytorch_tabnet.tab_model import TabNetClassifier
import torch

# Load the dataset
df = pd.read_csv('/content/drive/My Drive/telematics_syn.csv')

# Create Risk_Category based on the specified conditions
# 1: Indicates high risk (NB_Claim > 1 and AMT_Claim > 1000)
# 0: Indicates low risk (otherwise)
df['Risk_Category'] = np.where((df['NB_Claim'] > 1) & (df['AMT_Claim'] > 1000), 1, 0)

# Identify categorical and numerical columns
categorical_cols = ['Insured.sex', 'Marital', 'Car.use', 'Region', 'Territory']
numerical_cols = [col for col in df.columns if col not in categorical_cols + ['Risk_Category']]

# Preprocess the data: one-hot encode categorical variables and standardize numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ])

# Separate features and labels
X = df.drop('Risk_Category', axis=1)
y = df['Risk_Category']

# Apply preprocessing
X_preprocessed = preprocessor.fit_transform(X)

# Split the data into training, validation, and testing sets (60% train, 20% validation, 20% test)
X_train, X_temp, y_train, y_temp = train_test_split(X_preprocessed, y, test_size=0.4, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Initialize TabNet model
tabnet_model = TabNetClassifier(
    n_d=64, n_a=64, n_steps=5,
    gamma=1.5, n_independent=2, n_shared=2,
    lambda_sparse=1e-3, optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type="entmax",
    scheduler_params=dict(mode="min", patience=5, min_lr=1e-5, factor=0.9),
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    seed=42,
    verbose=10
)

# Train the model
tabnet_model.fit(
    X_train=X_train, y_train=y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    eval_name=['train', 'valid'],
    eval_metric=['accuracy'],
    max_epochs=100,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)

# Evaluate the model on the validation set
y_val_pred = tabnet_model.predict(X_val)

# Calculate accuracy and F1 score on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred, average='weighted')

print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Validation F1 Score: {val_f1:.4f}")

# Evaluate the model on the test set
y_test_pred = tabnet_model.predict(X_test)

# Calculate accuracy and F1 score on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred, average='weighted')

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")

# Feature importance
feature_importances = tabnet_model.feature_importances_
feature_names = preprocessor.get_feature_names_out()
importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
importance_df = importance_df.sort_values('importance', ascending=False).head(10)
print("Top 10 Important Features:")
print(importance_df)

# Save the model
saving_path_name = "tabnet_model"
saved_filepath = tabnet_model.save_model(saving_path_name)
print(f"Model saved to: {saved_filepath}")
