<a href="https://colab.research.google.com/github/fjadidi2001/DataScienceJourney/blob/master/telematics_syn_V11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
# Step 1: Load the dataset
dataset = pd.read_csv('/content/drive/My Drive/telematics_syn.csv')

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE

# Create the adjusted ClaimYN label
dataset['ClaimYN'] = ((dataset['NB_Claim'] >= 1) & (dataset['AMT_Claim'] > 1000)).astype(int)

# Preprocess the dataset
# 1. Handle missing values
dataset.fillna(method='ffill', inplace=True)  # Simple forward fill for missing values

# 2. Encode categorical variables (if any exist)
categorical_columns = dataset.select_dtypes(include=['object']).columns
for col in categorical_columns:
    le = LabelEncoder()
    dataset[col] = le.fit_transform(dataset[col])

# 3. Standardize numerical columns
numerical_columns = dataset.drop(columns=['ClaimYN', 'NB_Claim', 'AMT_Claim']).columns
scaler = StandardScaler()
dataset[numerical_columns] = scaler.fit_transform(dataset[numerical_columns])

# Handle the imbalance in the dataset using SMOTE
X = dataset.drop(columns=['ClaimYN'])
y = dataset['ClaimYN']

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Merging resampled data back into a single DataFrame
dataset_resampled = pd.concat([X_resampled, y_resampled], axis=1)

# Drop NB_Claim and AMT_Claim columns
dataset_resampled = dataset_resampled.drop(columns=['NB_Claim', 'AMT_Claim'])

# Split the dataset into train, test, and validation sets
from sklearn.model_selection import train_test_split

X = dataset_resampled.drop(columns=['ClaimYN'])
y = dataset_resampled['ClaimYN']

# Split the data (70% train, 15% test, 15% validation)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")
print(f"Validation set size: {len(X_val)}")

Training set size: 136222
Testing set size: 29191
Validation set size: 29191


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Initialize and train the Logistic Regression model
log_reg = LogisticRegression(random_state=42, max_iter=200)
log_reg.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = log_reg.predict(X_test)
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred))

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.72      0.75     14593
           1       0.74      0.80      0.77     14598

    accuracy                           0.76     29191
   macro avg       0.76      0.76      0.76     29191
weighted avg       0.76      0.76      0.76     29191



In [6]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train the Random Forest model
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = rf.predict(X_test)
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred))

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     14593
           1       1.00      0.98      0.99     14598

    accuracy                           0.99     29191
   macro avg       0.99      0.99      0.99     29191
weighted avg       0.99      0.99      0.99     29191



In [7]:
from xgboost import XGBClassifier

# Initialize and train the XGBoost model
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = xgb.predict(X_test)
print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred))

XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     14593
           1       1.00      0.98      0.99     14598

    accuracy                           0.99     29191
   macro avg       0.99      0.99      0.99     29191
weighted avg       0.99      0.99      0.99     29191



In [8]:
!pip install pytorch_tabnet

Collecting pytorch_tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.3->pytorch_tabnet)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.3->pytorch_tabnet)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.3->pytorch_tabnet)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.3->pytorch_tabnet)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.3->pytorch_tabnet)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from tor

In [9]:
from pytorch_tabnet.tab_model import TabNetClassifier
import numpy as np

# Convert data to numpy arrays for TabNet
X_train_np, y_train_np = X_train.values, y_train.values
X_test_np, y_test_np = X_test.values, y_test.values

# Initialize and train the TabNet model
tabnet = TabNetClassifier(seed=42)
tabnet.fit(X_train_np, y_train_np, eval_set=[(X_test_np, y_test_np)], patience=10)

# Evaluate the model on the test set
y_pred = np.argmax(tabnet.predict_proba(X_test_np), axis=1)
print("TabNet Classification Report:")
print(classification_report(y_test_np, y_pred))

epoch 0  | loss: 0.52451 | val_0_auc: 0.87515 |  0:00:15s
epoch 1  | loss: 0.43317 | val_0_auc: 0.89764 |  0:00:20s
epoch 2  | loss: 0.39164 | val_0_auc: 0.92828 |  0:00:28s
epoch 3  | loss: 0.34    | val_0_auc: 0.94508 |  0:00:33s
epoch 4  | loss: 0.31046 | val_0_auc: 0.95073 |  0:00:41s
epoch 5  | loss: 0.28787 | val_0_auc: 0.96103 |  0:00:47s
epoch 6  | loss: 0.27042 | val_0_auc: 0.962   |  0:00:54s
epoch 7  | loss: 0.26094 | val_0_auc: 0.96252 |  0:01:00s
epoch 8  | loss: 0.25302 | val_0_auc: 0.96613 |  0:01:06s
epoch 9  | loss: 0.24544 | val_0_auc: 0.96825 |  0:01:12s
epoch 10 | loss: 0.23994 | val_0_auc: 0.97131 |  0:01:19s
epoch 11 | loss: 0.23557 | val_0_auc: 0.96242 |  0:01:25s
epoch 12 | loss: 0.22619 | val_0_auc: 0.96915 |  0:01:32s
epoch 13 | loss: 0.22559 | val_0_auc: 0.96316 |  0:01:38s
epoch 14 | loss: 0.21921 | val_0_auc: 0.97498 |  0:01:47s
epoch 15 | loss: 0.21526 | val_0_auc: 0.9745  |  0:01:53s
epoch 16 | loss: 0.21137 | val_0_auc: 0.97374 |  0:02:00s
epoch 17 | los

In [11]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN
from keras.optimizers import Adam

# Reshape data for RNN (3D input: samples, timesteps, features)
X_train_rnn = X_train.values.reshape((X_train.shape[0], X_train.shape[1], 1))
X_val_rnn = X_val.values.reshape((X_val.shape[0], X_val.shape[1], 1))
X_test_rnn = X_test.values.reshape((X_test.shape[0], X_test.shape[1], 1))

# Initialize the RNN model
rnn_model = Sequential()
rnn_model.add(SimpleRNN(32, input_shape=(X_train_rnn.shape[1], 1), activation='relu'))
rnn_model.add(Dense(1, activation='sigmoid'))

# Compile the model
rnn_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
rnn_history = rnn_model.fit(X_train_rnn, y_train, epochs=50, batch_size=32,
                            validation_data=(X_val_rnn, y_val), verbose=1)

# Evaluate the model on the test set
test_loss, test_accuracy = rnn_model.evaluate(X_test_rnn, y_test)
print(f"RNN Test Accuracy: {test_accuracy:.4f}")

Epoch 1/50
[1m4257/4257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 10ms/step - accuracy: 0.6756 - loss: 0.6066 - val_accuracy: 0.7629 - val_loss: 0.4695
Epoch 2/50
[1m4257/4257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 9ms/step - accuracy: 0.8018 - loss: 0.4254 - val_accuracy: 0.8215 - val_loss: 0.4088
Epoch 3/50
[1m4257/4257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 8ms/step - accuracy: 0.8370 - loss: 0.3563 - val_accuracy: 0.8264 - val_loss: 0.4639
Epoch 4/50
[1m4257/4257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 8ms/step - accuracy: 0.8657 - loss: 0.3049 - val_accuracy: 0.8742 - val_loss: 0.2848
Epoch 5/50
[1m4257/4257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 9ms/step - accuracy: 0.8730 - loss: 0.2857 - val_accuracy: 0.8830 - val_loss: 0.2597
Epoch 6/50
[1m4257/4257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 9ms/step - accuracy: 0.8797 - loss: 0.2714 - val_accuracy: 0.8627 - val_loss: 0.3100
Epoch 7/5

In [12]:
from keras.layers import Conv1D, MaxPooling1D, Flatten

# Initialize the CNN model
cnn_model = Sequential()
cnn_model.add(Conv1D(32, kernel_size=3, activation='relu', input_shape=(X_train_rnn.shape[1], 1)))
cnn_model.add(MaxPooling1D(pool_size=2))
cnn_model.add(Flatten())
cnn_model.add(Dense(1, activation='sigmoid'))

# Compile the model
cnn_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
cnn_history = cnn_model.fit(X_train_rnn, y_train, epochs=50, batch_size=32,
                            validation_data=(X_val_rnn, y_val), verbose=1)

# Evaluate the model on the test set
test_loss, test_accuracy = cnn_model.evaluate(X_test_rnn, y_test)
print(f"CNN Test Accuracy: {test_accuracy:.4f}")

Epoch 1/50
[1m4257/4257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - accuracy: 0.8103 - loss: 0.4161 - val_accuracy: 0.8547 - val_loss: 0.3262
Epoch 2/50
[1m4257/4257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - accuracy: 0.8563 - loss: 0.3241 - val_accuracy: 0.8621 - val_loss: 0.3132
Epoch 3/50
[1m4257/4257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.8665 - loss: 0.3015 - val_accuracy: 0.8680 - val_loss: 0.2962
Epoch 4/50
[1m4257/4257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - accuracy: 0.8747 - loss: 0.2873 - val_accuracy: 0.8744 - val_loss: 0.2837
Epoch 5/50
[1m4257/4257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.8783 - loss: 0.2776 - val_accuracy: 0.8819 - val_loss: 0.2757
Epoch 6/50
[1m4257/4257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.8830 - loss: 0.2683 - val_accuracy: 0.8849 - val_loss: 0.2622
Epoch 7/50


In [None]:
from keras.layers import LSTM

# Initialize the LSTM model
lstm_model = Sequential()
lstm_model.add(LSTM(32, input_shape=(X_train_rnn.shape[1], 1), activation='relu'))
lstm_model.add(Dense(1, activation='sigmoid'))

# Compile the model
lstm_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
lstm_history = lstm_model.fit(X_train_rnn, y_train, epochs=50, batch_size=32,
                              validation_data=(X_val_rnn, y_val), verbose=1)

# Evaluate the model on the test set
test_loss, test_accuracy = lstm_model.evaluate(X_test_rnn, y_test)
print(f"LSTM Test Accuracy: {test_accuracy:.4f}")

In [None]:
from keras.layers import GRU

# Initialize the GRU model
gru_model = Sequential()
gru_model.add(GRU(32, input_shape=(X_train_rnn.shape[1], 1), activation='relu'))
gru_model.add(Dense(1, activation='sigmoid'))

# Compile the model
gru_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
gru_history = gru_model.fit(X_train_rnn, y_train, epochs=50, batch_size=32,
                            validation_data=(X_val_rnn, y_val), verbose=1)

# Evaluate the model on the test set
test_loss, test_accuracy = gru_model.evaluate(X_test_rnn, y_test)
print(f"GRU Test Accuracy: {test_accuracy:.4f}")

## Import Necessary Libraries for Evaluation

In [None]:
from sklearn.metrics import matthews_corrcoef, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns

# Ensure you have seaborn for nicer plots
sns.set(style="whitegrid")

In [None]:
def evaluate_model(y_true, y_pred, y_pred_proba, model_name):
    """Evaluate the model using MCC, AUC, and plot the confusion matrix."""

    # Calculate MCC
    mcc = matthews_corrcoef(y_true, y_pred)

    # Calculate AUC
    auc = roc_auc_score(y_true, y_pred_proba)

    # Print results
    print(f"{model_name} Evaluation:")
    print(f"  MCC: {mcc:.4f}")
    print(f"  AUC: {auc:.4f}")

    # Plot confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap=plt.cm.Blues)
    plt.title(f'{model_name} Confusion Matrix')
    plt.show()

def plot_training_history(history, model_name):
    """Plot training & validation accuracy and loss from the model's history."""
    # Accuracy
    plt.figure(figsize=(14, 5))
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title(f'{model_name} Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    # Loss
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title(f'{model_name} Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.show()

In [None]:
# Predict probabilities for AUC calculation
y_pred_proba_log_reg = log_reg.predict_proba(X_test)[:, 1]
y_pred_log_reg = log_reg.predict(X_test)

# Evaluate the Logistic Regression model
evaluate_model(y_test, y_pred_log_reg, y_pred_proba_log_reg, "Logistic Regression")

In [None]:
# Predict probabilities for AUC calculation
y_pred_proba_rf = rf.predict_proba(X_test)[:, 1]
y_pred_rf = rf.predict(X_test)

# Evaluate the Random Forest model
evaluate_model(y_test, y_pred_rf, y_pred_proba_rf, "Random Forest")

In [None]:
# Predict probabilities for AUC calculation
y_pred_proba_xgb = xgb.predict_proba(X_test)[:, 1]
y_pred_xgb = xgb.predict(X_test)

# Evaluate the XGBoost model
evaluate_model(y_test, y_pred_xgb, y_pred_proba_xgb, "XGBoost")

In [None]:
# Predict probabilities for AUC calculation
y_pred_proba_tabnet = tabnet.predict_proba(X_test_np)[:, 1]
y_pred_tabnet = np.argmax(tabnet.predict_proba(X_test_np), axis=1)

# Evaluate the TabNet model
evaluate_model(y_test_np, y_pred_tabnet, y_pred_proba_tabnet, "TabNet")

In [None]:
# Evaluate the RNN model
y_pred_proba_rnn = rnn_model.predict(X_test_rnn).flatten()
y_pred_rnn = (y_pred_proba_rnn > 0.5).astype(int)

evaluate_model(y_test, y_pred_rnn, y_pred_proba_rnn, "RNN")
plot_training_history(rnn_history, "RNN")

In [None]:
# Train the RNN model
rnn_history = rnn_model.fit(X_train_rnn, y_train, epochs=100, batch_size=32,
                            validation_data=(X_val_rnn, y_val), verbose=1)

# Predict on test set
y_pred_proba_rnn = rnn_model.predict(X_test_rnn).flatten()
y_pred_rnn = (y_pred_proba_rnn > 0.5).astype(int)

# Evaluate RNN model
evaluate_model(y_test, y_pred_rnn, y_pred_proba_rnn, "RNN")

# Plot training and validation history
plot_training_history(rnn_history, "RNN")

In [None]:
# Train the CNN model
cnn_history = cnn_model.fit(X_train_rnn, y_train, epochs=50, batch_size=32,
                            validation_data=(X_val_rnn, y_val), verbose=1)

# Predict on test set
y_pred_proba_cnn = cnn_model.predict(X_test_rnn).flatten()
y_pred_cnn = (y_pred_proba_cnn > 0.5).astype(int)

# Evaluate CNN model
evaluate_model(y_test, y_pred_cnn, y_pred_proba_cnn, "CNN")

# Plot training and validation history
plot_training_history(cnn_history, "CNN")

In [None]:
# Train the LSTM model
lstm_history = lstm_model.fit(X_train_rnn, y_train, epochs=50, batch_size=32,
                              validation_data=(X_val_rnn, y_val), verbose=1)

# Predict on test set
y_pred_proba_lstm = lstm_model.predict(X_test_rnn).flatten()
y_pred_lstm = (y_pred_proba_lstm > 0.5).astype(int)

# Evaluate LSTM model
evaluate_model(y_test, y_pred_lstm, y_pred_proba_lstm, "LSTM")

# Plot training and validation history
plot_training_history(lstm_history, "LSTM")

In [None]:
# Train the GRU model
gru_history = gru_model.fit(X_train_rnn, y_train, epochs=50, batch_size=32,
                            validation_data=(X_val_rnn, y_val), verbose=1)

# Predict on test set
y_pred_proba_gru = gru_model.predict(X_test_rnn).flatten()
y_pred_gru = (y_pred_proba_gru > 0.5).astype(int)

# Evaluate GRU model
evaluate_model(y_test, y_pred_gru, y_pred_proba_gru, "GRU")

# Plot training and validation history
plot_training_history(gru_history, "GRU")
