<a href="https://colab.research.google.com/github/fjadidi2001/Insurance/blob/main/Nov21Insur.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install pytorch_tabnet

Collecting pytorch_tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Downloading pytorch_tabnet-4.1.0-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytorch_tabnet
Successfully installed pytorch_tabnet-4.1.0


In [4]:
# Mount Google Drive at /content/drive (Colab only); the dataset path used
# later in the notebook points below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, matthews_corrcoef, log_loss, confusion_matrix, roc_curve, auc
)
from sklearn.base import clone

# Ignore warnings
# NOTE(review): this also hides deprecation and convergence warnings.
warnings.filterwarnings('ignore')

# Load dataset
file_path = '/content/drive/My Drive/telematics_syn.csv'
df = pd.read_csv(file_path)

# Split df into features and target.
# AMT_Claim is the other claim outcome (it is used as a target elsewhere in
# this notebook), so it is removed from the features too; leaving it in would
# leak the label into the model inputs.
X = df.drop(['NB_Claim', 'AMT_Claim'], axis=1)
y = df['NB_Claim']

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Preprocessing pipelines for numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Split the data. Stratifying keeps every claim-count class present in both
# splits, which the multi-class ROC AUC / log-loss evaluation needs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Preprocess: fit on the training split only, then apply to the test split
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Balance the training dataset using SMOTE (the test split is left untouched)
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Define models
models = {
    'Logistic Regression': OneVsRestClassifier(LogisticRegression(max_iter=1000)),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', objective='multi:softprob'),
    'TabNet': TabNetClassifier(
        verbose=0,  # Set to 1 if you want to see training progress
        optimizer_params=dict(lr=2e-2),
        scheduler_params={"step_size":10, "gamma":0.9},
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type='entmax'
    )
}

# Function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Score an already-fitted classifier on the held-out split.

    ``X_train`` and ``y_train`` are part of the signature but are not read.

    Returns a dict with the keys ``accuracy``, ``precision``, ``recall``,
    ``f1``, ``roc_auc``, ``mcc``, ``loss`` and ``conf_matrix``; precision,
    recall, F1 and ROC AUC use weighted averaging.
    """
    predictions = model.predict(X_test)

    if not hasattr(model, "predict_proba"):
        # No probability API: fall back to one-hot rows built from the hard
        # predictions, sized by the number of distinct labels in y_test.
        n_observed = len(np.unique(y_test))
        probabilities = np.eye(n_observed)[predictions]
    else:
        probabilities = model.predict_proba(X_test)

    weighted = {'average': 'weighted'}
    return {
        'accuracy': accuracy_score(y_test, predictions),
        'precision': precision_score(y_test, predictions, **weighted),
        'recall': recall_score(y_test, predictions, **weighted),
        'f1': f1_score(y_test, predictions, **weighted),
        'roc_auc': roc_auc_score(y_test, probabilities, multi_class='ovr', **weighted),
        'mcc': matthews_corrcoef(y_test, predictions),
        'loss': log_loss(y_test, probabilities),
        'conf_matrix': confusion_matrix(y_test, predictions),
    }

# Function to plot confusion matrix
def plot_confusion_matrix(conf_matrix, title):
    """Render ``conf_matrix`` as an annotated heatmap labelled with ``title``."""
    heatmap_options = {'annot': True, 'fmt': 'd', 'cmap': 'Blues'}

    plt.figure(figsize=(10, 7))
    sns.heatmap(conf_matrix, **heatmap_options)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix - {title}')
    plt.show()

# Function to plot ROC curve
def plot_roc_curve(y_test, y_prob, title):
    """Plot one-vs-rest ROC curves, one per column of ``y_prob``.

    Parameters
    ----------
    y_test : array-like of true labels.
    y_prob : 2-D array of class scores; column ``i`` is compared against
        ``y_test == i``, so labels are assumed to be 0..n_classes-1
        (NOTE(review): confirm for the data in use).
    title : text appended to the figure title.
    """
    from itertools import cycle  # local import keeps this cell self-contained

    n_classes = y_prob.shape[1]
    curves = {}
    for i in range(n_classes):
        fpr, tpr, _ = roc_curve(y_test == i, y_prob[:, i])
        curves[i] = (fpr, tpr, auc(fpr, tpr))

    plt.figure(figsize=(10, 7))
    # Cycle the palette: zipping against the bare 4-colour list silently
    # dropped every class past the fourth.
    palette = cycle(['blue', 'red', 'green', 'orange'])
    for (i, (fpr, tpr, area)), color in zip(curves.items(), palette):
        plt.plot(fpr, tpr, color=color, lw=2,
                 label=f'ROC curve of class {i} (area = {area:.2f})')

    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance line
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Receiver Operating Characteristic - {title}')
    plt.legend(loc="lower right")
    plt.show()

# Training and evaluation loop
n_epochs = 10
results = {name: {'metrics': [], 'final_model': None} for name in models.keys()}

for name, model in models.items():
    print(f"\nTraining {name} for {n_epochs} epochs...")

    for epoch in range(n_epochs):
        print(f"Epoch {epoch + 1}/{n_epochs}")

        if name == 'TabNet':
            model.fit(
                X_train_res, y_train_res,
                eval_set=[(X_test, y_test)],
                max_epochs=1,  # Train for 1 epoch at a time
                patience=n_epochs,  # Prevent early stopping
                # fit() rebuilds the network unless warm_start is set, so every
                # call after the first must continue from the current weights;
                # otherwise each "epoch" is a fresh model trained for one epoch.
                # NOTE(review): the optimizer and LR scheduler are still
                # recreated on each fit() call, so StepLR never reaches a step.
                warm_start=epoch > 0
            )
        else:
            # For non-TabNet models, we'll retrain from scratch each "epoch" to simulate epoch-based training
            model_clone = clone(model)
            model_clone.fit(X_train_res, y_train_res)
            model = model_clone

        # Evaluate the model after each epoch
        metrics = evaluate_model(model, X_train_res, X_test, y_train_res, y_test)
        results[name]['metrics'].append(metrics)

        print(f"Accuracy: {metrics['accuracy']:.4f}, F1 Score: {metrics['f1']:.4f}")

    # Store the final model
    results[name]['final_model'] = model

# Plot final results
for name, result in results.items():
    final_model = result['final_model']
    final_metrics = result['metrics'][-1]  # metrics recorded for the last epoch

    print(f"\nFinal results for {name}:")
    for metric, value in final_metrics.items():
        if metric == 'conf_matrix':
            continue  # the matrix is drawn below rather than printed
        print(f"  {metric}: {value:.4f}")

    plot_confusion_matrix(final_metrics['conf_matrix'], name)
    if hasattr(final_model, "predict_proba"):
        plot_roc_curve(y_test, final_model.predict_proba(X_test), name)

# Plot learning curves: per-epoch accuracy of every model on one figure
plt.figure(figsize=(12, 8))
for name, result in results.items():
    accuracies = [epoch_metrics['accuracy'] for epoch_metrics in result['metrics']]
    plt.plot(range(1, n_epochs + 1), accuracies, label=name)

plt.title('Learning Curves')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Combine all results into a final DataFrame for comparison.
# The pandas constructor is `pd.DataFrame`; the previous `pd.dfFrame` spelling
# raised AttributeError before the table could be built.
final_results = pd.DataFrame({
    'Model': list(results.keys()),
    'Accuracy': [results[name]['metrics'][-1]['accuracy'] for name in results],
    'Precision': [results[name]['metrics'][-1]['precision'] for name in results],
    'Recall': [results[name]['metrics'][-1]['recall'] for name in results],
    'F1 Score': [results[name]['metrics'][-1]['f1'] for name in results],
    'ROC AUC': [results[name]['metrics'][-1]['roc_auc'] for name in results],
    'MCC': [results[name]['metrics'][-1]['mcc'] for name in results],
    'Loss': [results[name]['metrics'][-1]['loss'] for name in results]
})

print("\nFinal Comparative Analysis Results:")
print(final_results)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/telematics_syn.csv'

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the dataset
# NOTE(review): nothing is loaded here; `df` is expected to exist already from
# an earlier cell.

# 1. Feature engineering
# Done before the column lists are built so the derived bands are encoded too.
df['Age_Group'] = pd.cut(df['Insured.age'], bins=[0, 25, 35, 45, 55, 65, 100], labels=['18-25', '26-35', '36-45', '46-55', '56-65', '65+'])
df['Car_Age_Group'] = pd.cut(df['Car.age'], bins=[-1, 0, 3, 5, 10, 20, 100], labels=['New', '1-3', '4-5', '6-10', '11-20', '20+'])

# 2. Separate the features from the two targets
X = df.drop(['NB_Claim', 'AMT_Claim'], axis=1)  # Features
y_nb = df['NB_Claim']  # Target for classification
y_amt = df['AMT_Claim']  # Target for regression

# 3. Column lists, taken from X (not df) so the dropped targets can never be
# selected; 'category' is included so the pd.cut bands above are picked up.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

# 4. Create preprocessing pipelines
# Numerical columns: median imputation, then standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: missing values become the literal category 'missing',
# then one-hot encoding. Dense output so the result fits a plain DataFrame.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# 5. Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# 6. Fit and transform the data
X_preprocessed = preprocessor.fit_transform(X)

# 7. Convert to DataFrame for better interpretability
# get_feature_names_out replaces get_feature_names, which scikit-learn removed.
onehot = preprocessor.named_transformers_['cat'].named_steps['onehot']
feature_names = (numeric_features.tolist() +
                 onehot.get_feature_names_out(categorical_features).tolist())
X_preprocessed_df = pd.DataFrame(X_preprocessed, columns=feature_names)

# 8. Print info about the preprocessed data
print("Preprocessed data shape:", X_preprocessed_df.shape)
print("\nFirst few rows of preprocessed data:")
print(X_preprocessed_df.head())

# 9. Handle target variables
# For NB_Claim (classification)
y_nb = (y_nb > 0).astype(int)  # Convert to binary (0 for no claim, 1 for claim)

# For AMT_Claim (regression)
# We'll keep it as is, but you might want to consider log transformation if the distribution is skewed

print("\nUnique values in NB_Claim (after binarization):", y_nb.unique())
print("AMT_Claim statistics:")
print(y_amt.describe())

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from pytorch_tabnet.tab_model import TabNetClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Load the preprocessed data
# NOTE(review): no visible cell writes train_data.csv / val_data.csv /
# test_data.csv; they are presumably produced outside this notebook — confirm.
train_data = pd.read_csv('train_data.csv')
val_data = pd.read_csv('val_data.csv')
test_data = pd.read_csv('test_data.csv')

# Combine all data for analysis
# Row order is train, then val, then test; the positional split later in this
# cell relies on that order.
all_data = pd.concat([train_data, val_data, test_data], axis=0)

# Function to plot feature distributions
def plot_feature_distributions(data, feature, target):
    """Show a box plot of ``feature`` (y axis) grouped by ``target`` (x axis)."""
    heading = f'Distribution of {feature} by {target}'
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=data, x=target, y=feature)
    plt.title(heading)
    plt.show()

# Analyze top features
top_features = ['num__NB_Claim', 'num__AMT_Claim', 'cat__Region_0']
for feature in top_features:
    plot_feature_distributions(all_data, feature, 'Risk_Category')

# Calculate correlation with target
correlation = all_data.corr()['Risk_Category'].sort_values(ascending=False)
print("Top 10 correlations with Risk_Category:")
print(correlation.head(10))

# Remove highly correlated features
threshold = 0.8
high_corr_features = correlation.index[correlation.abs() > threshold].tolist()
high_corr_features.remove('Risk_Category')  # Keep the target variable
print(f"Removing highly correlated features: {high_corr_features}")

X = all_data.drop(columns=high_corr_features + ['Risk_Category'])
y = all_data['Risk_Category']

# Split the data positionally: the first len(train) + len(val) rows form the
# train/validation pool and the remaining rows form the test set.
n_train_val = len(train_data) + len(val_data)
X_train_val, X_test = X.iloc[:n_train_val], X.iloc[n_train_val:]
y_train_val, y_test = y.iloc[:n_train_val], y.iloc[n_train_val:]

# Initialize TabNet model
# Adam optimizer at lr=2e-2 with a ReduceLROnPlateau schedule (mode="min",
# factor 0.9, patience 5, floor 1e-5); seed fixed at 42.
# NOTE(review): `torch` is not imported in this cell's import list; this relies
# on an earlier cell having already run `import torch`.
tabnet_model = TabNetClassifier(
    n_d=16, n_a=16, n_steps=3,
    gamma=1.5, n_independent=2, n_shared=2,
    lambda_sparse=1e-3,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type="entmax",
    scheduler_params=dict(mode="min", patience=5, min_lr=1e-5, factor=0.9),
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    seed=42,
    verbose=10
)

# TabNet's fit/predict accept arrays only and raise TypeError on pandas
# objects, so the frames are converted once up front.
X_train_val_arr = X_train_val.values
y_train_val_arr = y_train_val.values
X_test_arr = X_test.values

# Perform stratified k-fold cross-validation
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

cv_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_val_arr, y_train_val_arr), 1):
    print(f"Fold {fold}")

    X_train, X_val = X_train_val_arr[train_idx], X_train_val_arr[val_idx]
    y_train, y_val = y_train_val_arr[train_idx], y_train_val_arr[val_idx]

    # The same estimator object is refitted for every fold.
    tabnet_model.fit(
        X_train=X_train, y_train=y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        eval_name=['train', 'valid'],
        eval_metric=['accuracy'],
        max_epochs=100,
        patience=10,
        batch_size=1024,
        virtual_batch_size=128,
        num_workers=0,
        drop_last=False
    )

    y_pred = tabnet_model.predict(X_val)
    fold_accuracy = accuracy_score(y_val, y_pred)
    cv_scores.append(fold_accuracy)
    print(f"Fold {fold} Validation Accuracy: {fold_accuracy:.4f}")

print(f"Mean CV Accuracy: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})")

# Retrain on full training set
# NOTE(review): the only eval set here is the training data itself, so early
# stopping monitors training accuracy.
tabnet_model.fit(
    X_train=X_train_val_arr, y_train=y_train_val_arr,
    eval_set=[(X_train_val_arr, y_train_val_arr)],
    eval_name=['train'],
    eval_metric=['accuracy'],
    max_epochs=100,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)

# Make predictions on the test set
y_pred = tabnet_model.predict(X_test_arr)

# Calculate accuracy and F1 score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test F1 Score: {f1:.4f}")

# Feature importance
feature_importances = tabnet_model.feature_importances_
importance_df = pd.DataFrame({'feature': X.columns, 'importance': feature_importances})
importance_df = importance_df.sort_values('importance', ascending=False).head(10)
print("Top 10 Important Features:")
print(importance_df)

# Save the model
saving_path_name = "tabnet_model"
saved_filepath = tabnet_model.save_model(saving_path_name)
print(f"Model saved to: {saved_filepath}")

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

class LSTMModel(nn.Module):
    """LSTM sequence classifier.

    The input is run through a (possibly stacked) batch-first LSTM; the final
    hidden state of the last layer is passed through dropout and a linear
    layer that produces ``num_classes`` logits.

    Args:
        input_size: number of features per time step.
        hidden_size: width of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output logits.
        dropout_rate: dropout applied between stacked LSTM layers and before
            the linear head.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, dropout_rate=0.5):
        super().__init__()
        # nn.LSTM only applies dropout between stacked layers; with a single
        # layer a non-zero value has no effect and triggers a UserWarning.
        inter_layer_dropout = dropout_rate if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            batch_first=True, dropout=inter_layer_dropout)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for a batch-first input."""
        _, (h_n, _) = self.lstm(x)
        # h_n[-1] is the final hidden state of the top LSTM layer.
        x = self.dropout(h_n[-1])
        return self.fc(x)

# Hyperparameters
input_size = 10
hidden_size = 256  # Increased from 128
num_layers = 3  # Increased from 2
num_classes = 2
dropout_rate = 0.5
learning_rate = 0.001
batch_size = 32  # Decreased from 64
epochs = 100  # Increased from 20

# Seed PyTorch before the model weights, the placeholder tensors and the
# shuffling DataLoader are created, so reruns give the same numbers.
torch.manual_seed(42)

lstm_model = LSTMModel(input_size, hidden_size, num_layers, num_classes, dropout_rate)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=learning_rate, weight_decay=1e-5)  # Added weight decay

# Assume X_train, y_train, X_val, y_val, X_test, y_test are already defined
# For demonstration, we'll use random data. Replace with your actual data.
# Shapes are (samples, sequence length 10, input_size) for the inputs and
# (samples,) with labels in {0, 1} for the targets.
X_train_tensor = torch.randn(1000, 10, input_size)
y_train_tensor = torch.randint(0, 2, (1000,))
X_val_tensor = torch.randn(200, 10, input_size)
y_val_tensor = torch.randint(0, 2, (200,))
X_test_tensor = torch.randn(100, 10, input_size)
y_test_tensor = torch.randint(0, 2, (100,))

train_data = TensorDataset(X_train_tensor, y_train_tensor)
val_data = TensorDataset(X_val_tensor, y_val_tensor)

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Training loop with early stopping
# Per-epoch histories: accuracies are stored as percentages, losses as the
# mean of the per-batch losses.
train_acc, val_acc = [], []
train_loss, val_loss = [], []
best_val_acc = 0
patience = 10  # epochs without validation improvement tolerated before stopping
no_improve = 0

for epoch in range(epochs):
    lstm_model.train()
    running_loss = 0.0
    correct, total = 0, 0

    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = lstm_model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Predicted class = index of the largest logit in each row.
        _, predicted = torch.max(outputs.data, 1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()

    train_acc.append(100 * correct / total)
    train_loss.append(running_loss / len(train_loader))

    # Validation
    lstm_model.eval()
    val_loss_total, correct, total = 0.0, 0, 0

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            outputs = lstm_model(X_batch)
            loss = criterion(outputs, y_batch)
            val_loss_total += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

    val_accuracy = 100 * correct / total
    val_acc.append(val_accuracy)
    val_loss.append(val_loss_total / len(val_loader))

    print(f'Epoch [{epoch+1}/{epochs}], Train Loss: {train_loss[-1]:.4f}, Train Acc: {train_acc[-1]:.2f}%, Val Loss: {val_loss[-1]:.4f}, Val Acc: {val_acc[-1]:.2f}%')

    # Early stopping
    # A checkpoint is written only on a strict improvement in validation
    # accuracy; any other epoch counts towards the patience budget.
    if val_accuracy > best_val_acc:
        best_val_acc = val_accuracy
        torch.save(lstm_model.state_dict(), 'best_model.pth')
        no_improve = 0
    else:
        no_improve += 1
        if no_improve == patience:
            print("Early stopping triggered")
            break

# Load best model for testing
# NOTE(review): 'best_model.pth' exists only if some epoch had validation
# accuracy above 0; otherwise this load fails or picks up a stale file.
lstm_model.load_state_dict(torch.load('best_model.pth'))

# Test Accuracy
lstm_model.eval()
with torch.no_grad():
    lstm_test_outputs = lstm_model(X_test_tensor)
    _, lstm_test_preds = torch.max(lstm_test_outputs.data, 1)
    test_acc_lstm = accuracy_score(y_test_tensor.numpy(), lstm_test_preds.numpy())

# accuracy_score returns a fraction in [0, 1], unlike the percentages above.
print(f"Test Accuracy - LSTM: {test_acc_lstm:.2f}")

# Plot training and validation accuracy/loss
def plot_metrics(metrics, title):
    """Draw every series in ``metrics`` (label -> values) on one shared axis."""
    plt.figure(figsize=(12, 6))
    for label, series in metrics.items():
        plt.plot(series, label=label)
    plt.xlabel('Epochs')
    plt.ylabel('Value')
    plt.title(title)
    plt.legend()
    plt.show()

plot_metrics(
    {
        'train_acc': train_acc,
        'val_acc': val_acc,
        'train_loss': train_loss,
        'val_loss': val_loss,
    },
    title="LSTM Training & Validation Metrics",
)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Assume dataset_resampled is already loaded
# If not, you should load your dataset here
# dataset_resampled = pd.read_csv('your_dataset.csv')
# NOTE(review): no visible cell defines `dataset_resampled`, so this cell fails
# on a fresh kernel. If it was resampled before this split, resampled rows may
# end up in the validation/test sets — confirm.

# Prepare the data
X = dataset_resampled.drop(columns=['ClaimYN']).values
y = dataset_resampled['ClaimYN'].values

# Split the dataset
# 70% train; the remaining 30% is halved into validation and test (15% each).
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Scale the data
# The scaler is fitted on the training split only and then applied to val/test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Function to plot metrics
def plot_metrics(history, title="Model Metrics"):
    """Plot accuracy and loss curves side by side.

    ``history`` must provide the keys ``train_acc``, ``val_acc``,
    ``train_loss`` and ``val_loss``. Accuracy is drawn in the left panel and
    loss in the right panel.
    """
    panels = (('acc', 'Accuracy'), ('loss', 'Loss'))

    plt.figure(figsize=(12, 6))
    for position, (suffix, label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(history[f'train_{suffix}'], label=f'Training {label}')
        plt.plot(history[f'val_{suffix}'], label=f'Validation {label}')
        plt.title(f'{title} {label}')
        plt.xlabel('Epoch')
        plt.ylabel(label)
        plt.legend()
    plt.tight_layout()
    plt.show()

# Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
train_acc_log, val_acc_log = log_reg.score(X_train, y_train), log_reg.score(X_val, y_val)
print(f"Logistic Regression - Training Accuracy: {train_acc_log:.4f}, Validation Accuracy: {val_acc_log:.4f}")

# Random Forest
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, y_train)
train_acc_rf, val_acc_rf = random_forest.score(X_train, y_train), random_forest.score(X_val, y_val)
print(f"Random Forest - Training Accuracy: {train_acc_rf:.4f}, Validation Accuracy: {val_acc_rf:.4f}")

# Neural Network Model
class NNModel(nn.Module):
    """Feed-forward classifier: input_size -> 128 -> 64 -> 2 logits, with ReLU
    after each of the two hidden layers."""

    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 2)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return raw logits; no softmax is applied here."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Function to train and evaluate NN model
def train_nn_model(model, train_loader, val_loader, criterion, optimizer, epochs):
    """Train ``model`` for ``epochs`` passes and evaluate it after each pass.

    Returns a dict with the per-epoch lists ``train_acc``, ``val_acc``
    (percentages) and ``train_loss``, ``val_loss`` (mean per-batch loss).
    A progress line is printed after every epoch, and the model is left in
    eval mode.
    """
    history = {'train_acc': [], 'val_acc': [], 'train_loss': [], 'val_loss': []}

    for epoch in range(epochs):
        # ---- optimisation pass over the training batches ----
        model.train()
        loss_sum, hits, seen = 0.0, 0, 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            logits = model(X_batch)
            batch_loss = criterion(logits, y_batch)
            batch_loss.backward()
            optimizer.step()

            loss_sum += batch_loss.item()
            predicted = torch.max(logits.detach(), 1).indices
            seen += y_batch.size(0)
            hits += (predicted == y_batch).sum().item()

        history['train_acc'].append(100 * hits / seen)
        history['train_loss'].append(loss_sum / len(train_loader))

        # ---- evaluation pass over the validation batches ----
        model.eval()
        loss_sum, hits, seen = 0.0, 0, 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                logits = model(X_batch)
                loss_sum += criterion(logits, y_batch).item()
                predicted = torch.max(logits.detach(), 1).indices
                seen += y_batch.size(0)
                hits += (predicted == y_batch).sum().item()

        history['val_acc'].append(100 * hits / seen)
        history['val_loss'].append(loss_sum / len(val_loader))

        print(f'Epoch [{epoch+1}/{epochs}], Train Loss: {history["train_loss"][-1]:.4f}, Train Acc: {history["train_acc"][-1]:.2f}%, '
              f'Val Loss: {history["val_loss"][-1]:.4f}, Val Acc: {history["val_acc"][-1]:.2f}%')

    return history

# Prepare data for PyTorch models
# Features become float tensors and labels long (integer) tensors, the types
# CrossEntropyLoss expects for logits and class indices.
train_data = TensorDataset(torch.FloatTensor(X_train), torch.LongTensor(y_train))
val_data = TensorDataset(torch.FloatTensor(X_val), torch.LongTensor(y_val))
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_loader = DataLoader(val_data, batch_size=64)

# Train NN model
nn_model = NNModel(X_train.shape[1])
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(nn_model.parameters(), lr=0.001)
nn_history = train_nn_model(nn_model, train_loader, val_loader, criterion, optimizer, epochs=20)

plot_metrics(nn_history, title="Neural Network")

# Test Accuracy for each model
test_acc_log = accuracy_score(y_test, log_reg.predict(X_test))
test_acc_rf = accuracy_score(y_test, random_forest.predict(X_test))

# The network is scored on the whole test set in a single forward pass.
nn_model.eval()
with torch.no_grad():
    nn_test_outputs = nn_model(torch.FloatTensor(X_test))
    _, nn_test_preds = torch.max(nn_test_outputs.data, 1)
test_acc_nn = accuracy_score(y_test, nn_test_preds.numpy())

# Display test accuracies
print(f"Test Accuracy - Logistic Regression: {test_acc_log:.4f}")
print(f"Test Accuracy - Random Forest: {test_acc_rf:.4f}")
print(f"Test Accuracy - Neural Network: {test_acc_nn:.4f}")

In [6]:
# Step 3: Visualize the data (remains the same as before)
# NOTE(review): `data` is not defined by any earlier cell (the recorded run of
# this cell ended in NameError); it is only loaded in the notebook's last cell.

# Step 4: Feature engineering
# First, let's print the column names to see what we're working with
print(data.columns)

# Now, let's modify our feature engineering based on the available columns
# We'll assume similar columns exist with slightly different names

# Create a new feature: total turns (if applicable)
if 'left_turns_count' in data.columns and 'right_turns_count' in data.columns:
    data['total_turns'] = data['left_turns_count'] + data['right_turns_count']

    # Create a new feature: turn ratio
    data['turn_ratio'] = data['left_turns_count'] / (data['right_turns_count'] + 1)  # Adding 1 to avoid division by zero
else:
    print("Turn count columns not found. Skipping total_turns and turn_ratio features.")

# Create a new feature: risk score (example - you may want to adjust this based on domain knowledge and available columns)
risk_factors = ['Insured.age', 'Car.age', 'Annual.miles.drive']
# First column whose name contains 'accel' / 'brake' (case-insensitive), or
# None when no such column exists.
acceleration_column = next((col for col in data.columns if 'accel' in col.lower()), None)
braking_column = next((col for col in data.columns if 'brake' in col.lower()), None)

if acceleration_column:
    risk_factors.append(acceleration_column)
if braking_column:
    risk_factors.append(braking_column)

# Weighted sum of the risk factors. Weights are matched by position and zip
# stops at the shorter list, so unused trailing weights are ignored.
data['risk_score'] = sum(data[col] * weight for col, weight in zip(risk_factors, [-0.01, 0.02, 0.001, 0.1, 0.1]))

# Step 5: Prepare data for modeling
# Counter is used for the class-balance report below but was never imported
# anywhere in the notebook.
# NOTE(review): train_test_split, StandardScaler and SMOTE still rely on the
# imports made by an earlier cell.
from collections import Counter

# Select features and target
features = ['Duration', 'Insured.age', 'Car.age', 'Credit.score', 'Annual.miles.drive', 'risk_score']

# Add turn-related features if they exist
if 'total_turns' in data.columns:
    features.extend(['total_turns', 'turn_ratio'])
if acceleration_column:
    features.append(acceleration_column)
if braking_column:
    features.append(braking_column)

target = 'NB_Claim'

X = data[features]
y = data[target]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features (scaler is fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Check class imbalance
print("Class distribution before SMOTE:")
print(Counter(y_train))

# Apply SMOTE to handle class imbalance (training split only)
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

print("Class distribution after SMOTE:")
print(Counter(y_train_smote))

# The data is now ready for model training and evaluation

NameError: name 'data' is not defined

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.metrics import roc_auc_score, mean_squared_error
import matplotlib.pyplot as plt

# Assuming 'data' is your DataFrame
# Stage 1 classifies whether a claim amount is non-zero; stage 2 regresses the
# log amount on the non-zero rows only.
X = data.drop(columns=['AMT_Claim', 'NB_Claim'])
y = data['AMT_Claim']

# Create binary target for classification
y_binary = y.gt(0).astype(int)

# Split the data (features, amount and binary target share the same row split)
(X_train, X_test,
 y_train, y_test,
 y_binary_train, y_binary_test) = train_test_split(
    X, y, y_binary, test_size=0.2, random_state=42
)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train classification model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_scaled, y_binary_train)

# Predict probabilities of non-zero claims
y_binary_pred_proba = clf.predict_proba(X_test_scaled)[:, 1]

# Evaluate classification model
auc_score = roc_auc_score(y_binary_test, y_binary_pred_proba)
print(f"Classification AUC: {auc_score:.4f}")

# Train regression model on non-zero claims
mask_train = y_train.gt(0)
mask_test = y_test.gt(0)

reg = GradientBoostingRegressor(n_estimators=100, random_state=42)
reg.fit(X_train_scaled[mask_train], np.log1p(y_train[mask_train]))

# Predict on non-zero claims, then map back from log1p space
y_pred_log = reg.predict(X_test_scaled[mask_test])
y_pred = np.expm1(y_pred_log)

# Evaluate regression model
rmse = np.sqrt(mean_squared_error(y_test[mask_test], y_pred))
print(f"Regression RMSE on non-zero claims: {rmse:.4f}")


def _rank_and_plot_importances(estimator, heading):
    """Rank ``estimator``'s feature importances, plot the top 10, return the ranking."""
    ranking = pd.Series(estimator.feature_importances_, index=X.columns).sort_values(ascending=False)
    plt.figure(figsize=(10, 6))
    ranking.head(10).plot(kind='bar')
    plt.title(heading)
    plt.tight_layout()
    plt.show()
    return ranking


# Feature importance for classification
feature_importance_clf = _rank_and_plot_importances(clf, 'Top 10 Features for Predicting Non-Zero Claims')

# Feature importance for regression
feature_importance_reg = _rank_and_plot_importances(reg, 'Top 10 Features for Predicting Claim Amount')

NameError: name 'data' is not defined

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from pytorch_tabnet.tab_model import TabNetClassifier
import torch

# Load the data
# NOTE(review): this is a relative path, whereas an earlier cell reads the same
# file name from the mounted Drive — confirm which location is intended.
data = pd.read_csv('telematics_syn.csv')

# Select top 20 features based on the provided ranking
top_features = [
    'Total.miles.driven', 'Annual.pct.driven', 'Duration', 'Credit.score',
    'Pct.drive.2hrs', 'Avgdays.week', 'Pct.drive.thr', 'Pct.drive.rush am',
    'Pct.drive.rush pm', 'Pct.drive.wed', 'Pct.drive.mon', 'Pct.drive.tue',
    'Pct.drive.fri', 'Brake.06miles', 'Pct.drive.sat', 'Pct.drive.sun',
    'Car.age', 'Accel.06miles', 'Years.noclaims', 'Left.turn.intensity09'
]

# Prepare the data
X = data[top_features]
# Binary target: 1 when at least one claim was recorded, else 0.
y = (data['NB_Claim'] > 0).astype(int)

# Split the data (stratified so both splits keep the same claim rate)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale the features (scaler is fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train TabNet
# max_epochs and patience are fit() arguments, not constructor fields; passing
# them to the constructor raised TypeError, so they are given to fit() only.
tabnet_model = TabNetClassifier(
    n_d=8, n_a=8, n_steps=3,
    gamma=1.3, n_independent=2, n_shared=2,
    cat_idxs=[], cat_dims=[],
    cat_emb_dim=1, optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type='entmax',
    lambda_sparse=1e-3, momentum=0.3, clip_value=2.,
    device_name='auto'
)

# Targets are handed over as plain arrays; the features already are arrays
# after scaling.
# NOTE(review): the test split doubles as the early-stopping eval set, so the
# scores reported below are optimistic.
tabnet_model.fit(
    X_train_scaled, y_train.values,
    eval_set=[(X_test_scaled, y_test.values)],
    max_epochs=100,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)

# Extract features using the explain method.
# explain() returns a tuple (explanation matrix, per-step masks); only the
# matrix, one row per sample and one column per input feature, goes to the SVM.
train_features, _ = tabnet_model.explain(X_train_scaled)
test_features, _ = tabnet_model.explain(X_test_scaled)

# Train SVM on the extracted features
svm_model = SVC(kernel='rbf', class_weight='balanced', probability=True, random_state=42)
svm_model.fit(train_features, y_train)

# Make predictions
y_pred = svm_model.predict(test_features)
y_pred_proba = svm_model.predict_proba(test_features)[:, 1]

# Print the classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Print the confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"\nROC AUC Score: {roc_auc:.3f}")

# Feature importance from TabNet
feat_importances = tabnet_model.feature_importances_
feat_importance_dict = dict(zip(top_features, feat_importances))
sorted_feat_importances = sorted(feat_importance_dict.items(), key=lambda x: x[1], reverse=True)

print("\nTabNet Feature Importances:")
for feat, importance in sorted_feat_importances:
    print(f"{feat}: {importance:.6f}")