<a href="https://colab.research.google.com/github/fjadidi2001/Artificial_Intelligence_Learning/blob/master/Copy_of_Copy_of_Copy_of_telematics_syn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Load the dataset
df = pd.read_csv('telematics_syn.csv')

# 1. Handling Missing Values
# Check for missing values
missing_values = df.isnull().sum()
print("Missing values in each column:\n", missing_values)

# Impute missing values (using median for numerical and most frequent for categorical)
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = df.select_dtypes(exclude=[np.number]).columns.tolist()

# Imputation transformers
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine transformers into a single preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

# Apply the preprocessing steps
df_preprocessed = preprocessor.fit_transform(df)

# Convert the result back to a DataFrame
df_preprocessed = pd.DataFrame(df_preprocessed, columns=numerical_features + list(preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)))

# Check the shape of the preprocessed data
print("Shape of preprocessed data:", df_preprocessed.shape)

# Save the preprocessed DataFrame to a new CSV file
df_preprocessed.to_csv('telematics_syn_preprocessed.csv', index=False)

print("Preprocessing completed and saved to 'telematics_syn_preprocessed.csv'")


Missing values in each column:
 Duration                  0
Insured.age               0
Insured.sex               0
Car.age                   0
Marital                   0
Car.use                   0
Credit.score              0
Region                    0
Annual.miles.drive        0
Years.noclaims            0
Territory                 0
Annual.pct.driven         0
Total.miles.driven        0
Pct.drive.mon             0
Pct.drive.tue             0
Pct.drive.wed             0
Pct.drive.thr             0
Pct.drive.fri             0
Pct.drive.sat             0
Pct.drive.sun             0
Pct.drive.2hrs            0
Pct.drive.3hrs            0
Pct.drive.4hrs            0
Pct.drive.wkday           0
Pct.drive.wkend           0
Pct.drive.rush am         0
Pct.drive.rush pm         0
Avgdays.week              0
Accel.06miles             0
Accel.08miles             0
Accel.09miles             0
Accel.11miles             0
Accel.12miles             0
Accel.14miles             0
Brake.06miles   

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the preprocessed dataset
df_preprocessed = pd.read_csv('telematics_syn_preprocessed.csv')

# Split the dataset into train (70%), validation (15%), and test (15%) sets
train_size = 0.7
val_size = 0.15
test_size = 0.15

# Split the data into train and temp sets (train + validation + test)
train_df, temp_df = train_test_split(df_preprocessed, train_size=train_size, random_state=42)

# Split the temp set into validation and test sets
val_df, test_df = train_test_split(temp_df, test_size=test_size/(val_size + test_size), random_state=42)

# Print the sizes of each set
print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")

# Save the splits to CSV files (optional)
train_df.to_csv('telematics_train.csv', index=False)
val_df.to_csv('telematics_val.csv', index=False)
test_df.to_csv('telematics_test.csv', index=False)

print("Dataset splits saved to 'telematics_train.csv', 'telematics_val.csv', and 'telematics_test.csv'")


Training set size: 70000
Validation set size: 15000
Test set size: 15000
Dataset splits saved to 'telematics_train.csv', 'telematics_val.csv', and 'telematics_test.csv'


In [3]:
!pip install pytorch-tabnet


Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.3->pytorch-tabnet)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.3->pytorch-tabnet)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.3->pytorch-tabnet)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.3->pytorch-tabnet)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cubl

In [6]:
import pandas as pd
import numpy as np
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the preprocessed dataset splits
train_df = pd.read_csv('telematics_train.csv')
val_df = pd.read_csv('telematics_val.csv')
test_df = pd.read_csv('telematics_test.csv')

# Separate features and target
X_train = train_df.drop(columns=['NB_Claim'])  # Replace 'NB_Claim' with your target column name
y_train = train_df['NB_Claim'].astype(int)  # Convert target to integer labels

X_val = val_df.drop(columns=['NB_Claim'])
y_val = val_df['NB_Claim'].astype(int)  # Convert target to integer labels

X_test = test_df.drop(columns=['NB_Claim'])
y_test = test_df['NB_Claim'].astype(int)  # Convert target to integer labels

# Initialize the TabNet model
tabnet_model = TabNetClassifier()

# Train the TabNet model
tabnet_model.fit(
    X_train=X_train.values, y_train=y_train.values,
    eval_set=[(X_val.values, y_val.values)],
    eval_name=['val'],
    eval_metric=['accuracy'],
    max_epochs=1000, patience=50,
    batch_size=1024, virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False
)

# Evaluate the model on the test set
y_pred = tabnet_model.predict(X_test.values)
test_accuracy = accuracy_score(y_test.values, y_pred)

print(f"Test Accuracy: {test_accuracy}")
print("Confusion Matrix:")
print(confusion_matrix(y_test.values, y_pred))
print("Classification Report:")
print(classification_report(y_test.values, y_pred))

# Save the model
tabnet_model.save_model('tabnet_model')



epoch 0  | loss: 0.94992 | val_accuracy: 0.5086  |  0:00:06s
epoch 1  | loss: 0.46054 | val_accuracy: 0.03853 |  0:00:11s
epoch 2  | loss: 0.34196 | val_accuracy: 0.035   |  0:00:16s
epoch 3  | loss: 0.28742 | val_accuracy: 0.20013 |  0:00:22s
epoch 4  | loss: 0.2458  | val_accuracy: 0.15913 |  0:00:27s
epoch 5  | loss: 0.21692 | val_accuracy: 0.96667 |  0:00:32s
epoch 6  | loss: 0.17786 | val_accuracy: 0.05    |  0:00:38s
epoch 7  | loss: 0.1902  | val_accuracy: 0.04827 |  0:00:43s
epoch 8  | loss: 0.16078 | val_accuracy: 0.98247 |  0:00:48s
epoch 9  | loss: 0.16651 | val_accuracy: 0.9848  |  0:00:54s
epoch 10 | loss: 0.13779 | val_accuracy: 0.09567 |  0:00:58s
epoch 11 | loss: 0.12122 | val_accuracy: 0.9842  |  0:01:04s
epoch 12 | loss: 0.1205  | val_accuracy: 0.04673 |  0:01:10s
epoch 13 | loss: 0.11444 | val_accuracy: 0.06353 |  0:01:14s
epoch 14 | loss: 0.09627 | val_accuracy: 0.6186  |  0:01:21s
epoch 15 | loss: 0.11322 | val_accuracy: 0.05473 |  0:01:25s
epoch 16 | loss: 0.09425



Test Accuracy: 0.9828666666666667
Confusion Matrix:
[[14200   109     3     3]
 [   82   518    52     0]
 [    2     6    22     0]
 [    0     0     0     3]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     14315
           4       0.82      0.79      0.81       652
           8       0.29      0.73      0.41        30
          13       0.50      1.00      0.67         3

    accuracy                           0.98     15000
   macro avg       0.65      0.88      0.72     15000
weighted avg       0.98      0.98      0.98     15000

Successfully saved model at tabnet_model.zip


'tabnet_model.zip'

In [8]:
import pandas as pd
import numpy as np
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

# Load the preprocessed dataset splits
train_df = pd.read_csv('telematics_train.csv')
val_df = pd.read_csv('telematics_val.csv')
test_df = pd.read_csv('telematics_test.csv')

# Separate features and target
X_train = train_df.drop(columns=['NB_Claim'])  # Replace 'NB_Claim' with your target column name
y_train = train_df['NB_Claim']

X_val = val_df.drop(columns=['NB_Claim'])
y_val = val_df['NB_Claim']

X_test = test_df.drop(columns=['NB_Claim'])
y_test = test_df['NB_Claim']

# Convert target variables to integer labels (assuming it's a classification problem)
# Adjust the threshold and logic based on your problem definition
threshold = 0  # Example threshold for binary classification
y_train = (y_train > threshold).astype(int)
y_val = (y_val > threshold).astype(int)
y_test = (y_test > threshold).astype(int)

# Initialize the TabNet model
tabnet_model = TabNetClassifier()

# Train the TabNet model
history = tabnet_model.fit(
    X_train=X_train.values, y_train=y_train.values,
    eval_set=[(X_train.values, y_train.values), (X_val.values, y_val.values)],
    eval_name=['train', 'val'],
    eval_metric=['accuracy'],
    max_epochs=1000, patience=50,
    batch_size=1024, virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False
)

# Evaluate the model on the test set
y_pred = tabnet_model.predict(X_test.values)
test_accuracy = accuracy_score(y_test.values, y_pred)

print(f"Test Accuracy: {test_accuracy}")
print("Confusion Matrix:")
print(confusion_matrix(y_test.values, y_pred))
print("Classification Report:")
print(classification_report(y_test.values, y_pred))

# Save the model
tabnet_model.save_model('tabnet_model')

# Extract history for plotting
train_losses = history.history['loss']
train_accuracies = history.history['train_accuracy']
val_accuracies = history.history['val_accuracy']

# Plot accuracy over epochs
plt.figure(figsize=(12, 6))

# Plot training and validation accuracy
plt.subplot(1, 2, 1)
plt.plot(train_accuracies, label='Training Accuracy')
plt.plot(val_accuracies, label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Accuracy over Epochs')
plt.legend()

# Plot training loss over epochs
plt.subplot(1, 2, 2)
plt.plot(train_losses, label='Training Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Loss over Epochs')
plt.legend()

plt.tight_layout()
plt.show()



epoch 0  | loss: 0.4831  | train_accuracy: 0.11404 | val_accuracy: 0.10907 |  0:00:08s
epoch 1  | loss: 0.21842 | train_accuracy: 0.04853 | val_accuracy: 0.0464  |  0:00:16s
epoch 2  | loss: 0.18378 | train_accuracy: 0.0462  | val_accuracy: 0.04453 |  0:00:24s
epoch 3  | loss: 0.16759 | train_accuracy: 0.04841 | val_accuracy: 0.04733 |  0:00:32s
epoch 4  | loss: 0.16409 | train_accuracy: 0.04726 | val_accuracy: 0.04613 |  0:00:40s
epoch 5  | loss: 0.15097 | train_accuracy: 0.04386 | val_accuracy: 0.04267 |  0:00:49s
epoch 6  | loss: 0.15326 | train_accuracy: 0.0938  | val_accuracy: 0.09213 |  0:00:56s
epoch 7  | loss: 0.15468 | train_accuracy: 0.04481 | val_accuracy: 0.04307 |  0:01:05s
epoch 8  | loss: 0.15368 | train_accuracy: 0.04316 | val_accuracy: 0.0412  |  0:01:12s
epoch 9  | loss: 0.1421  | train_accuracy: 0.04313 | val_accuracy: 0.04133 |  0:01:20s
epoch 10 | loss: 0.13718 | train_accuracy: 0.04287 | val_accuracy: 0.04093 |  0:01:27s
epoch 11 | loss: 0.13756 | train_accuracy: 



Test Accuracy: 0.112
Confusion Matrix:
[[ 1016 13299]
 [   21   664]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.07      0.13     14315
           1       0.05      0.97      0.09       685

    accuracy                           0.11     15000
   macro avg       0.51      0.52      0.11     15000
weighted avg       0.94      0.11      0.13     15000

Successfully saved model at tabnet_model.zip


AttributeError: 'NoneType' object has no attribute 'history'