<a href="https://colab.research.google.com/github/fjadidi2001/Insurance/blob/main/ImproveNov4Paper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# Load and preprocess data (as per your code)
df = pd.read_csv('/content/drive/My Drive/Insurance/telematics_syn.csv')
df['ClaimYN'] = ((df['NB_Claim'] >= 1) & (df['AMT_Claim'] > 1000)).astype(int)
df = df.drop(['NB_Claim', 'AMT_Claim'], axis=1)
df = pd.get_dummies(df, drop_first=True)
X = df.drop('ClaimYN', axis=1)
y = df['ClaimYN']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Split into train and validation
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train models
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
nn_model = MLPClassifier(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', max_iter=1000, random_state=42)
gb_model.fit(X_train, y_train)
nn_model.fit(X_train, y_train)

# Get validation predictions and weights
gb_val_proba = gb_model.predict_proba(X_val)[:, 1]
nn_val_proba = nn_model.predict_proba(X_val)[:, 1]
gb_val_acc = accuracy_score(y_val, (gb_val_proba > 0.5).astype(int))
nn_val_acc = accuracy_score(y_val, (nn_val_proba > 0.5).astype(int))
weight_gb = gb_val_acc / (gb_val_acc + nn_val_acc)
weight_nn = nn_val_acc / (gb_val_acc + nn_val_acc)

# Weighted ensemble on test set
X_test, _, y_test, _ = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
gb_test_proba = gb_model.predict_proba(X_test)[:, 1]
nn_test_proba = nn_model.predict_proba(X_test)[:, 1]
ensemble_proba = (weight_gb * gb_test_proba + weight_nn * nn_test_proba)
ensemble_pred = (ensemble_proba > 0.5).astype(int)

# Evaluate
print("\nWeighted Ensemble Performance:")
print(f"Accuracy: {accuracy_score(y_test, ensemble_pred):.4f}")
print(f"AUC-ROC: {roc_auc_score(y_test, ensemble_proba):.4f}")


Weighted Ensemble Performance:
Accuracy: 0.9966
AUC-ROC: 0.9998


In [3]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load and initial preprocess (as above)
df = pd.read_csv('/content/drive/My Drive/Insurance/telematics_syn.csv')
df['ClaimYN'] = ((df['NB_Claim'] >= 1) & (df['AMT_Claim'] > 1000)).astype(int)
df = df.drop(['NB_Claim', 'AMT_Claim'], axis=1)
df = pd.get_dummies(df, drop_first=True)
X = df.drop('ClaimYN', axis=1)
y = df['ClaimYN']

# Anomaly handling for Car.age
X['Car.age'] = np.where(X['Car.age'] < 0, np.nan, X['Car.age'])  # Treat negative as missing
X['Car.age'] = X['Car.age'].fillna(X['Car.age'].median())  # Impute with median

# Log transformation for skewed intensity metrics
intensity_cols = [col for col in X.columns if 'intensity' in col or 'Accel' in col or 'Brake' in col]
for col in intensity_cols:
    X[col] = np.log1p(X[col])  # log(1+x) to handle zeros

# Scale and resample
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

print("Preprocessed dataset shape:", X_resampled.shape)

Preprocessed dataset shape: (194604, 52)


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class NoiseFilterLSTMCell(nn.Module):
    def __init__(self, input_size, hidden_size, noise_factor=0.1):
        super(NoiseFilterLSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.noise_factor = noise_factor
        self.i2h = nn.Linear(input_size + hidden_size, 5 * hidden_size)
        self.h2h = nn.Linear(hidden_size, 5 * hidden_size)

    def forward(self, x, hidden):
        h, c = hidden
        noise = torch.randn_like(x) * self.noise_factor
        x_noisy = x + noise
        combined = torch.cat((x_noisy, h), dim=1)
        gates = self.i2h(combined) + self.h2h(h)
        i, f, g, o, g_global = gates.chunk(5, 1)
        i, f, o, g_global = [torch.sigmoid(x) for x in [i, f, o, g_global]]
        c = f * c + i * torch.tanh(g)
        c_filtered = g_global * torch.tanh(c) + (1 - g_global) * c
        h = o * torch.tanh(c_filtered)
        return h, c_filtered

class xLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(xLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.cells = nn.ModuleList([NoiseFilterLSTMCell(input_size if i == 0 else hidden_size, hidden_size) for i in range(num_layers)])
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        batch_size = x.size(0)
        h = [torch.zeros(batch_size, self.hidden_size).to(x.device) for _ in range(self.num_layers)]
        c = [torch.zeros(batch_size, self.hidden_size).to(x.device) for _ in range(self.num_layers)]
        for t in range(x.size(1)):
            for l in range(self.num_layers):
                if l == 0:
                    h[l], c[l] = self.cells[l](x[:, t, :], (h[l], c[l]))
                else:
                    h[l], c[l] = self.cells[l](h[l-1], (h[l], c[l]))
        return self.fc(h[-1])

# Prepare data (assuming X_resampled, y_resampled from above)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
X_train_tensor = torch.FloatTensor(X_train).unsqueeze(1)  # Add time dimension
y_train_tensor = torch.LongTensor(y_train)
train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=64, shuffle=True)

# Train xLSTM
xlstm_model = xLSTM(input_size=X_train.shape[1], hidden_size=64, num_layers=2, output_size=2)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(xlstm_model.parameters(), lr=0.001)
xlstm_model.train()
for epoch in range(20):
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = xlstm_model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
print("xLSTM training completed.")

xLSTM training completed.


In [6]:
from sklearn.metrics import precision_recall_curve, f1_score, precision_score, recall_score

# Get predictions
gb_test_proba = gb_model.predict_proba(X_test)[:, 1]
nn_test_proba = nn_model.predict_proba(X_test)[:, 1]
ensemble_proba = (weight_gb * gb_test_proba + weight_nn * nn_test_proba)

# Threshold optimization
precisions, recalls, thresholds = precision_recall_curve(y_test, ensemble_proba)
f1_scores = [f1_score(y_test, (ensemble_proba >= t).astype(int)) for t in thresholds]
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]
optimal_pred = (ensemble_proba >= optimal_threshold).astype(int)

# Evaluate with optimal threshold
print(f"\nOptimal Threshold: {optimal_threshold:.4f}")
print("Ensemble Performance with Optimal Threshold:")
print(f"Accuracy: {accuracy_score(y_test, optimal_pred):.4f}")
print(f"Precision: {precision_score(y_test, optimal_pred):.4f}")
print(f"Recall: {recall_score(y_test, optimal_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, optimal_pred):.4f}")
print(f"AUC-ROC: {roc_auc_score(y_test, ensemble_proba):.4f}")


Optimal Threshold: 0.1144
Ensemble Performance with Optimal Threshold:
Accuracy: 0.8342
Precision: 0.7866
Recall: 0.9197
F1 Score: 0.8480
AUC-ROC: 0.8988


In [8]:
import time

# Batch processing function
def process_in_batches(model, X, batch_size=1000):
    predictions = []
    start_time = time.time()
    for i in range(0, len(X), batch_size):
        batch = X[i:i + batch_size]
        batch_pred = model.predict_proba(batch)[:, 1]
        predictions.append(batch_pred)
    end_time = time.time()
    return np.concatenate(predictions), end_time - start_time

# Test scalability
gb_predictions, gb_time = process_in_batches(gb_model, X_test)
nn_predictions, nn_time = process_in_batches(nn_model, X_test)
start_time = time.time()
ensemble_pred = (weight_gb * gb_predictions + weight_nn * nn_predictions)
ensemble_time = time.time() - start_time + gb_time + nn_time

print(f"Gradient Boosting Processing Time: {gb_time:.2f} seconds")
print(f"Neural Network Processing Time: {nn_time:.2f} seconds")
print(f"Ensemble Processing Time: {ensemble_time:.2f} seconds")

Gradient Boosting Processing Time: 0.13 seconds
Neural Network Processing Time: 0.06 seconds
Ensemble Processing Time: 0.20 seconds
