<a href="https://colab.research.google.com/github/fjadidi2001/Insurance/blob/main/Insurance_ICCKe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Insurance Claim Prediction - Complete ML Pipeline
# Comprehensive implementation with xLSTM, Neural Networks, and Gradient Boosting

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# ML Libraries
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                           roc_auc_score, confusion_matrix, roc_curve, precision_recall_curve,
                           average_precision_score, matthews_corrcoef)
from sklearn.calibration import calibration_curve
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from imblearn.over_sampling import SMOTE

# Deep Learning Libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [4]:
# =============================================================================
# 1. DATA LOADING AND INITIAL EXPLORATION
# =============================================================================

def load_and_explore_data(file_path):
    """Load the dataset from a CSV file and print a quick overview of it.

    The overview consists of the shape, the column names, the
    ``DataFrame.info()`` report, the first five rows and the ``describe()``
    statistics.

    Args:
        file_path: Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    print("="*60)
    print("1. DATA LOADING AND INITIAL EXPLORATION")
    print("="*60)

    # Load dataset
    df = pd.read_csv(file_path)
    print(f"Dataset shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")

    # Display basic info
    print("\nDataset Info:")
    # DataFrame.info() writes its report to stdout itself and returns None;
    # wrapping it in print() would append a stray "None" line to the output.
    df.info()

    print("\nFirst 5 rows:")
    print(df.head())

    print("\nBasic Statistics:")
    print(df.describe())

    return df


In [1]:



# =============================================================================
# 2. EXPLORATORY DATA ANALYSIS (EDA)
# =============================================================================

def perform_eda(df):
    """Derive the binary target and show an exploratory overview of the data.

    A ``ClaimYN`` column is added to ``df`` in place: 1 when a row has at least
    one claim (``NB_Claim >= 1``) and a claim amount above 1000
    (``AMT_Claim > 1000``), otherwise 0. Three figures are then displayed: the
    class distribution (bar chart and pie chart), a correlation heatmap of all
    numeric columns, and box plots of six selected features grouped by
    ``ClaimYN``.

    Args:
        df: DataFrame holding ``NB_Claim``, ``AMT_Claim`` and every column
            named in ``important_features`` below.

    Returns:
        The same DataFrame object, now carrying the ``ClaimYN`` column.
    """
    print("\n" + "="*60)
    print("2. EXPLORATORY DATA ANALYSIS (EDA)")
    print("="*60)

    # Create target variable
    df['ClaimYN'] = ((df['NB_Claim'] >= 1) & (df['AMT_Claim'] > 1000)).astype(int)

    # Class distribution.
    # value_counts() orders by frequency, not by label; reindexing pins the
    # order to (0, 1) so the hard-coded "Not Risky"/"Risky" labels always match
    # the right class, and a missing class shows up as 0 instead of a KeyError.
    print("\nOriginal Class Distribution:")
    class_dist = df['ClaimYN'].value_counts().reindex([0, 1], fill_value=0)
    print(f"Not Risky (0): {class_dist[0]} instances ({class_dist[0]/len(df)*100:.2f}%)")
    print(f"Risky (1): {class_dist[1]} instances ({class_dist[1]/len(df)*100:.2f}%)")

    class_labels = ['Not Risky', 'Risky']
    class_colors = ['skyblue', 'salmon']

    # Visualization 1: Class Distribution
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    class_dist.plot(kind='bar', color=class_colors)
    plt.title('Original Class Distribution')
    plt.xlabel('ClaimYN')
    plt.ylabel('Count')
    plt.xticks([0, 1], class_labels, rotation=0)

    plt.subplot(1, 2, 2)
    plt.pie(class_dist.values, labels=class_labels, autopct='%1.1f%%',
            colors=class_colors)
    plt.title('Class Distribution Percentage')

    plt.tight_layout()
    plt.show()

    # Visualization 2: Correlation Matrix
    plt.figure(figsize=(15, 12))
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    correlation_matrix = df[numeric_cols].corr()
    sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', center=0)
    plt.title('Feature Correlation Matrix')
    plt.show()

    # Visualization 3: Feature distributions by class
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    important_features = ['Duration', 'Insured.age', 'Car.age', 'Credit.score',
                         'Annual.miles.drive', 'Years.noclaims']

    # The 2x3 grid is filled row by row, one feature per axis.
    for ax, feature in zip(axes.ravel(), important_features):
        df.boxplot(column=feature, by='ClaimYN', ax=ax)
        ax.set_title(f'{feature} by ClaimYN')
        ax.set_xlabel('ClaimYN')

    plt.tight_layout()
    plt.show()

    return df

# =============================================================================
# 3. DATA PREPROCESSING
# =============================================================================

def preprocess_data(df):
    """Clean, encode, scale and class-balance the data for modelling.

    Steps, in order:
        1. Drop ``NB_Claim`` and ``AMT_Claim`` (the target was derived from them).
        2. Drop every row containing a missing value.
        3. Replace negative ``Car.age`` values with the column median.
        4. Apply ``log1p`` to columns whose name contains ``intensity``,
           ``Accel`` or ``Brake``.
        5. One-hot encode categorical columns (``drop_first=True``).
        6. Standardize all features with ``StandardScaler``.
        7. Oversample with SMOTE and plot the class counts before and after.

    NOTE(review): the scaler and SMOTE are fitted on the whole dataset here,
    i.e. before any train/test split performed by the caller. Held-out data
    therefore influences the scaling statistics and may contain synthetic
    samples - confirm this is acceptable for the reported metrics.

    Args:
        df: DataFrame containing ``NB_Claim``, ``AMT_Claim``, ``Car.age`` and
            the ``ClaimYN`` target column.

    Returns:
        Tuple ``(X_resampled, y_resampled, feature_names, scaler)``: the
        SMOTE-resampled scaled features and labels, the feature column labels
        (``X.columns``) and the fitted ``StandardScaler``.
    """
    print("\n" + "="*60)
    print("3. DATA PREPROCESSING")
    print("="*60)

    # Drop original claim columns
    df = df.drop(['NB_Claim', 'AMT_Claim'], axis=1)

    # Handle missing values
    print("Handling missing values...")
    df = df.dropna()

    # Handle anomalies in Car.age (negative values): blank them out, then
    # fill with the median of the remaining valid ages.
    print("Handling Car.age anomalies...")
    car_age = df['Car.age'].mask(df['Car.age'] < 0)
    df['Car.age'] = car_age.fillna(car_age.median())

    # Log transformation for skewed features
    print("Applying log transformation to intensity features...")
    intensity_cols = [col for col in df.columns if 'intensity' in col or 'Accel' in col or 'Brake' in col]
    for col in intensity_cols:
        df[col] = np.log1p(df[col])  # log(1+x) to handle zeros

    # Encode categorical variables
    print("Encoding categorical variables...")
    df = pd.get_dummies(df, drop_first=True)

    # Separate features and target
    X = df.drop('ClaimYN', axis=1)
    y = df['ClaimYN']

    # Feature scaling
    print("Scaling features...")
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Handle class imbalance with SMOTE
    print("Applying SMOTE for class balance...")
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

    print(f"After SMOTE - Class distribution:")
    unique, counts = np.unique(y_resampled, return_counts=True)
    for u, c in zip(unique, counts):
        print(f"Class {u}: {c} instances")

    def _plot_counts(position, title, values, colors):
        """Draw one labelled bar chart of the two class counts."""
        plt.subplot(1, 2, position)
        plt.bar(['Not Risky', 'Risky'], values, color=colors)
        plt.title(title)
        plt.ylabel('Count')
        for i, v in enumerate(values):
            # va='bottom' already places the label just above the bar; a fixed
            # data-unit offset would push it off the axes on small datasets.
            plt.text(i, v, str(v), ha='center', va='bottom')

    # Visualization: Before and After SMOTE
    plt.figure(figsize=(12, 5))

    # value_counts() sorts by frequency; pin the order to class 0, class 1 so
    # the counts line up with the fixed "Not Risky"/"Risky" tick labels.
    original_counts = y.value_counts().reindex([0, 1], fill_value=0).values
    _plot_counts(1, 'Before SMOTE', original_counts, ['skyblue', 'salmon'])

    # np.unique returns the labels sorted, so counts[0] belongs to class 0.
    resampled_counts = [counts[0], counts[1]]
    _plot_counts(2, 'After SMOTE', resampled_counts, ['lightblue', 'coral'])

    plt.tight_layout()
    plt.show()

    return X_resampled, y_resampled, X.columns, scaler

# =============================================================================
# 4. MODEL ARCHITECTURES WITH INNOVATIONS
# =============================================================================

# 4.1 xLSTM Implementation (Novel Architecture)
class xLSTMCell(nn.Module):
    """LSTM-style cell with Gaussian input noise and an extra "global" gate.

    Five gate pre-activations are computed per step (input, forget, candidate,
    output, global). The global gate blends ``tanh`` of the freshly updated
    cell state with the previous cell state.

    Args:
        input_size: Number of features in each input vector.
        hidden_size: Size of the hidden and cell states.
        noise_factor: Standard deviation of the Gaussian noise added to the
            input while the module is in training mode.
    """
    def __init__(self, input_size, hidden_size, noise_factor=0.1):
        super(xLSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.noise_factor = noise_factor

        # Extended gates: input, forget, cell, output, global
        self.i2h = nn.Linear(input_size + hidden_size, 5 * hidden_size)
        self.h2h = nn.Linear(hidden_size, 5 * hidden_size)

    def forward(self, x, hidden):
        """Advance the cell by one step.

        Args:
            x: Input tensor; concatenated with ``h`` along dim 1, so it is
                expected to be 2-D with ``input_size`` columns.
            hidden: Tuple ``(h, c)`` of previous hidden and cell states.

        Returns:
            Tuple ``(h_new, c_filtered)`` with the updated hidden state and the
            globally gated cell state.
        """
        h, c = hidden

        # Noise is a training-time regularizer only. Injecting it in eval mode
        # would make predictions differ between identical calls.
        if self.training:
            x = x + torch.randn_like(x) * self.noise_factor

        # Combine input and hidden state
        combined = torch.cat((x, h), dim=1)
        gates = self.i2h(combined) + self.h2h(h)

        # Split into 5 gates
        i, f, g, o, g_global = gates.chunk(5, 1)

        # Apply activations
        i = torch.sigmoid(i)  # Input gate
        f = torch.sigmoid(f)  # Forget gate
        o = torch.sigmoid(o)  # Output gate
        g_global = torch.sigmoid(g_global)  # Global gate

        # Standard LSTM cell update, then blend it with the previous cell
        # state according to the global gate.
        c_new = f * c + i * torch.tanh(g)
        c_filtered = g_global * torch.tanh(c_new) + (1 - g_global) * c

        # Update hidden state
        h_new = o * torch.tanh(c_filtered)

        return h_new, c_filtered

class xLSTM(nn.Module):
    """Stack of ``xLSTMCell`` layers followed by dropout and a linear head.

    Args:
        input_size: Feature size of each time step fed to the first layer.
        hidden_size: Hidden/cell state size of every layer.
        num_layers: Number of stacked cells.
        output_size: Size of the final linear output.
        dropout: Dropout probability applied between layers and before the head.
    """
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.2):
        super(xLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Stack of xLSTM cells: the first consumes the raw input, the others
        # consume the hidden state of the layer below.
        self.cells = nn.ModuleList([
            xLSTMCell(input_size if i == 0 else hidden_size, hidden_size)
            for i in range(num_layers)
        ])

        # Dropout and final layers
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Run the stack over every time step and classify the last state.

        Args:
            x: Tensor indexed as ``x[:, t, :]``, i.e. 3-D with time on dim 1.

        Returns:
            Output of the linear head applied to the top layer's final hidden
            state.
        """
        batch_size = x.size(0)

        # Allocate the zero states directly with the input's device and dtype
        # instead of creating them on the CPU and copying them over.
        def _zero_state():
            return torch.zeros(batch_size, self.hidden_size, device=x.device, dtype=x.dtype)

        h = [_zero_state() for _ in range(self.num_layers)]
        c = [_zero_state() for _ in range(self.num_layers)]

        # Process through time steps
        for t in range(x.size(1)):
            for l, cell in enumerate(self.cells):
                # Layer 0 reads the input; deeper layers read the (dropped-out)
                # hidden state just produced by the layer below.
                layer_input = x[:, t, :] if l == 0 else self.dropout(h[l - 1])
                h[l], c[l] = cell(layer_input, (h[l], c[l]))

        # Final prediction
        return self.fc(self.dropout(h[-1]))

# 4.2 Advanced Neural Network
class AdvancedNN(nn.Module):
    """Feed-forward classifier with batch normalization and dropout.

    The network is a plain ``nn.Sequential``: for each entry of
    ``hidden_sizes`` a ``Linear -> BatchNorm1d -> ReLU -> Dropout`` stage,
    followed by a final ``Linear`` producing ``output_size`` values. There are
    no residual (skip) connections in this architecture.

    Args:
        input_size: Number of input features.
        hidden_sizes: Width of each hidden stage, in order.
        output_size: Number of output values.
        dropout: Dropout probability used after every hidden stage.
    """
    def __init__(self, input_size, hidden_sizes=(128, 64, 32), output_size=2, dropout=0.3):
        super(AdvancedNN, self).__init__()

        # An immutable tuple default avoids sharing one list between instances.
        layers = []
        prev_size = input_size

        for hidden_size in hidden_sizes:
            layers.extend([
                nn.Linear(prev_size, hidden_size),
                nn.BatchNorm1d(hidden_size),
                nn.ReLU(),
                nn.Dropout(dropout)
            ])
            prev_size = hidden_size

        layers.append(nn.Linear(prev_size, output_size))

        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for the batch ``x``."""
        return self.network(x)

# =============================================================================
# 5. MODEL TRAINING AND EVALUATION
# =============================================================================

class ModelTrainer:
    """Split one dataset and train/evaluate several classifiers on it.

    Attributes:
        X, y: Feature matrix and labels exactly as passed to the constructor.
        feature_names: Column labels belonging to ``X``.
        models: ``{display name: fitted model}``; filled by the ``train_*`` methods.
        results: ``{display name: metrics dict}``; filled by ``evaluate_all_models``.
        X_train, X_val, X_test, y_train, y_val, y_test: Created by ``split_data``.
    """

    def __init__(self, X, y, feature_names):
        self.X = X
        self.y = y
        self.feature_names = feature_names
        self.models = {}
        self.results = {}

    @staticmethod
    def _to_feature_tensor(X):
        """Convert an array-like feature matrix to a float32 tensor."""
        return torch.tensor(np.asarray(X), dtype=torch.float32)

    @staticmethod
    def _to_label_tensor(y):
        """Convert array-like class labels to an int64 tensor, by position.

        Going through NumPy matters for pandas Series: after
        ``train_test_split`` their index is shuffled, and handing such a Series
        directly to a tensor constructor is unreliable.
        """
        return torch.tensor(np.asarray(y), dtype=torch.long)

    def split_data(self, test_size=0.2, val_size=0.1):
        """Split data into stratified train, validation and test sets.

        Args:
            test_size: Fraction of all samples held out for testing.
            val_size: Fraction of all samples used for validation.
        """
        # First split: separate test set
        X_temp, self.X_test, y_temp, self.y_test = train_test_split(
            self.X, self.y, test_size=test_size, random_state=42, stratify=self.y
        )

        # Second split: separate train and validation. val_size is expressed
        # relative to the full dataset, so rescale it to the remaining part.
        val_size_adjusted = val_size / (1 - test_size)
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            X_temp, y_temp, test_size=val_size_adjusted, random_state=42, stratify=y_temp
        )

        print(f"Train set: {self.X_train.shape[0]} samples")
        print(f"Validation set: {self.X_val.shape[0]} samples")
        print(f"Test set: {self.X_test.shape[0]} samples")

    def train_traditional_models(self):
        """Fit the scikit-learn models and store them in ``self.models``.

        Gradient Boosting and the MLP are tuned with ``RandomizedSearchCV``
        (3-fold, ROC-AUC scoring); the Random Forest is fitted with fixed
        settings. Requires ``split_data`` to have been called.
        """
        print("\n" + "="*60)
        print("5. TRAINING TRADITIONAL MODELS")
        print("="*60)

        # Gradient Boosting with hyperparameter tuning
        print("Training Gradient Boosting...")
        gb_param_grid = {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
            'min_samples_split': [2, 5, 10]
        }

        gb_model = GradientBoostingClassifier(random_state=42)
        gb_search = RandomizedSearchCV(
            gb_model, gb_param_grid, n_iter=20, cv=3,
            scoring='roc_auc', random_state=42, n_jobs=-1
        )
        gb_search.fit(self.X_train, self.y_train)
        self.models['Gradient Boosting'] = gb_search.best_estimator_

        print(f"Best GB parameters: {gb_search.best_params_}")

        # Neural Network (sklearn)
        print("Training Neural Network...")
        nn_param_grid = {
            'hidden_layer_sizes': [(100, 50), (128, 64), (100, 100)],
            'activation': ['relu', 'tanh'],
            'alpha': [0.0001, 0.001, 0.01],
            'learning_rate': ['constant', 'adaptive']
        }

        nn_model = MLPClassifier(max_iter=1000, random_state=42)
        nn_search = RandomizedSearchCV(
            nn_model, nn_param_grid, n_iter=15, cv=3,
            scoring='roc_auc', random_state=42, n_jobs=-1
        )
        nn_search.fit(self.X_train, self.y_train)
        self.models['Neural Network'] = nn_search.best_estimator_

        print(f"Best NN parameters: {nn_search.best_params_}")

        # Random Forest for comparison
        print("Training Random Forest...")
        rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        rf_model.fit(self.X_train, self.y_train)
        self.models['Random Forest'] = rf_model

    def train_deep_models(self, epochs=50):
        """Train the xLSTM and AdvancedNN models and store them in ``self.models``.

        Args:
            epochs: Number of passes over the training set for each model.

        Returns:
            Tuple ``(xlstm_history, nn_history)`` of per-epoch metric dicts as
            produced by ``_train_pytorch_model``.
        """
        print("\n" + "="*60)
        print("6. TRAINING DEEP LEARNING MODELS")
        print("="*60)

        # Prepare data for PyTorch. Conversion goes through NumPy so that
        # pandas inputs are read by position, whatever their index looks like.
        X_train_flat = self._to_feature_tensor(self.X_train)
        X_val_flat = self._to_feature_tensor(self.X_val)
        y_train_tensor = self._to_label_tensor(self.y_train)
        y_val_tensor = self._to_label_tensor(self.y_val)

        # The xLSTM expects (batch, sequence, features); each row is fed as a
        # sequence of length one.
        X_train_tensor = X_train_flat.unsqueeze(1)  # Add sequence dimension
        X_val_tensor = X_val_flat.unsqueeze(1)

        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

        train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=64)

        # Train xLSTM
        print("Training xLSTM...")
        xlstm_model = xLSTM(
            input_size=self.X_train.shape[1],
            hidden_size=64,
            num_layers=2,
            output_size=2
        )

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(xlstm_model.parameters(), lr=0.001)

        xlstm_history = self._train_pytorch_model(
            xlstm_model, train_loader, val_loader, criterion, optimizer, epochs, "xLSTM"
        )

        self.models['xLSTM'] = xlstm_model

        # Train Advanced NN
        print("Training Advanced NN...")
        # For regular NN, we don't need sequence dimension
        train_dataset_flat = TensorDataset(X_train_flat, y_train_tensor)
        val_dataset_flat = TensorDataset(X_val_flat, y_val_tensor)

        train_loader_flat = DataLoader(train_dataset_flat, batch_size=64, shuffle=True)
        val_loader_flat = DataLoader(val_dataset_flat, batch_size=64)

        advanced_nn = AdvancedNN(input_size=self.X_train.shape[1])
        optimizer_nn = optim.Adam(advanced_nn.parameters(), lr=0.001)

        nn_history = self._train_pytorch_model(
            advanced_nn, train_loader_flat, val_loader_flat, criterion, optimizer_nn, epochs, "Advanced NN"
        )

        self.models['Advanced NN'] = advanced_nn

        return xlstm_history, nn_history

    def _train_pytorch_model(self, model, train_loader, val_loader, criterion, optimizer, epochs, model_name):
        """Run a train/validate loop and return the per-epoch history.

        Args:
            model: The ``nn.Module`` to optimize (updated in place).
            train_loader, val_loader: Iterables yielding ``(inputs, labels)`` batches.
            criterion: Loss function called as ``criterion(outputs, labels)``.
            optimizer: Optimizer bound to ``model``'s parameters.
            epochs: Number of epochs to run.
            model_name: Label used in the progress messages.

        Returns:
            Dict with the lists ``train_loss``, ``val_loss`` (mean batch loss)
            and ``train_acc``, ``val_acc`` (percent), one entry per epoch.
        """
        history = {'train_loss': [], 'val_loss': [], 'train_acc': [], 'val_acc': []}

        for epoch in range(epochs):
            # Training phase
            model.train()
            train_loss = 0.0
            train_correct = 0
            train_total = 0

            for batch_x, batch_y in train_loader:
                optimizer.zero_grad()
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()

                train_loss += loss.item()
                predicted = outputs.detach().argmax(dim=1)
                train_total += batch_y.size(0)
                train_correct += (predicted == batch_y).sum().item()

            # Validation phase
            model.eval()
            val_loss = 0.0
            val_correct = 0
            val_total = 0

            with torch.no_grad():
                for batch_x, batch_y in val_loader:
                    outputs = model(batch_x)
                    loss = criterion(outputs, batch_y)

                    val_loss += loss.item()
                    predicted = outputs.argmax(dim=1)
                    val_total += batch_y.size(0)
                    val_correct += (predicted == batch_y).sum().item()

            # Record metrics
            train_acc = 100 * train_correct / train_total
            val_acc = 100 * val_correct / val_total

            history['train_loss'].append(train_loss / len(train_loader))
            history['val_loss'].append(val_loss / len(val_loader))
            history['train_acc'].append(train_acc)
            history['val_acc'].append(val_acc)

            # Report progress every tenth epoch only.
            if (epoch + 1) % 10 == 0:
                print(f'{model_name} Epoch [{epoch+1}/{epochs}] - '
                      f'Train Acc: {train_acc:.2f}% - Val Acc: {val_acc:.2f}%')

        return history

    def evaluate_all_models(self):
        """Score every model in ``self.models`` on the test set.

        For each model the accuracy, precision, recall, F1, ROC-AUC and MCC are
        printed and stored in ``self.results[model_name]`` together with the
        hard predictions and the positive-class probabilities.
        """
        print("\n" + "="*60)
        print("7. MODEL EVALUATION")
        print("="*60)

        for model_name, model in self.models.items():
            print(f"\nEvaluating {model_name}...")

            # Get predictions
            if isinstance(model, nn.Module):
                # PyTorch model
                model.eval()
                test_tensor = self._to_feature_tensor(self.X_test)
                if 'xLSTM' in model_name:
                    # Same single-step sequence layout as used for training.
                    test_tensor = test_tensor.unsqueeze(1)

                with torch.no_grad():
                    outputs = model(test_tensor)
                    # .cpu() keeps the NumPy conversion valid if the model
                    # and its outputs live on another device.
                    probabilities = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
                predictions = (probabilities > 0.5).astype(int)
            else:
                # Sklearn model
                predictions = model.predict(self.X_test)
                probabilities = model.predict_proba(self.X_test)[:, 1]

            # Calculate metrics
            accuracy = accuracy_score(self.y_test, predictions)
            precision = precision_score(self.y_test, predictions)
            recall = recall_score(self.y_test, predictions)
            f1 = f1_score(self.y_test, predictions)
            auc = roc_auc_score(self.y_test, probabilities)
            mcc = matthews_corrcoef(self.y_test, predictions)

            self.results[model_name] = {
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'auc': auc,
                'mcc': mcc,
                'predictions': predictions,
                'probabilities': probabilities
            }

            print(f"Accuracy: {accuracy:.4f}")
            print(f"Precision: {precision:.4f}")
            print(f"Recall: {recall:.4f}")
            print(f"F1 Score: {f1:.4f}")
            print(f"AUC-ROC: {auc:.4f}")
            print(f"MCC: {mcc:.4f}")

# =============================================================================
# 8. POST-PROCESSING AND VISUALIZATION
# =============================================================================

def create_comprehensive_visualizations(trainer):
    """Plot a comparison of every evaluated model.

    Four figures are shown: bar charts of the six metrics, overlaid ROC
    curves, one confusion matrix per model and, when a 'Gradient Boosting'
    model exists, its most important features (up to 15).

    Args:
        trainer: Object exposing ``results`` (``{name: metrics dict}`` with
            ``probabilities`` and ``predictions`` entries), ``y_test``,
            ``models`` and ``feature_names``.
    """
    print("\n" + "="*60)
    print("8. COMPREHENSIVE VISUALIZATIONS")
    print("="*60)

    # 1. Model Performance Comparison
    plt.figure(figsize=(15, 10))

    metrics = ['accuracy', 'precision', 'recall', 'f1', 'auc', 'mcc']
    model_names = list(trainer.results.keys())

    for i, metric in enumerate(metrics):
        plt.subplot(2, 3, i+1)
        values = [trainer.results[model][metric] for model in model_names]
        bars = plt.bar(model_names, values, color=['skyblue', 'lightcoral', 'lightgreen', 'gold', 'mediumpurple'])
        plt.title(f'{metric.upper()} Comparison')
        plt.ylabel(metric.upper())
        plt.xticks(rotation=45)

        # Add value labels on bars
        for bar, value in zip(bars, values):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                    f'{value:.3f}', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

    # 2. ROC Curves Comparison
    plt.figure(figsize=(10, 8))
    colors = ['blue', 'red', 'green', 'orange', 'purple']

    for i, (model_name, results) in enumerate(trainer.results.items()):
        fpr, tpr, _ = roc_curve(trainer.y_test, results['probabilities'])
        # Cycle through the palette so a sixth entry (e.g. the ensemble) does
        # not run past the end of the colour list.
        plt.plot(fpr, tpr, color=colors[i % len(colors)],
                 label=f'{model_name} (AUC = {results["auc"]:.3f})')

    plt.plot([0, 1], [0, 1], 'k--', alpha=0.6)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves Comparison')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

    # 3. Confusion Matrices
    n_models = len(trainer.results)
    # squeeze=False always yields a 2-D axes array, so a single model needs no
    # special-casing.
    fig, axes = plt.subplots(1, n_models, figsize=(4*n_models, 4), squeeze=False)

    for ax, (model_name, results) in zip(axes[0], trainer.results.items()):
        cm = confusion_matrix(trainer.y_test, results['predictions'])
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_title(f'{model_name}')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')

    plt.tight_layout()
    plt.show()

    # 4. Feature Importance (for tree-based models)
    if 'Gradient Boosting' in trainer.models:
        gb_model = trainer.models['Gradient Boosting']
        feature_importance = gb_model.feature_importances_

        # Show at most 15 features; with fewer features available the bar
        # count must shrink accordingly or the plot call fails.
        top_n = min(15, len(feature_importance))
        indices = np.argsort(feature_importance)[::-1][:top_n]

        plt.figure(figsize=(12, 8))
        plt.title(f'Top {top_n} Feature Importances (Gradient Boosting)')
        plt.bar(range(top_n), feature_importance[indices])
        plt.xticks(range(top_n), [trainer.feature_names[i] for i in indices], rotation=45, ha='right')
        plt.ylabel('Importance')
        plt.tight_layout()
        plt.show()

def create_ensemble_model(trainer):
    """Build an AUC-weighted soft-voting ensemble and evaluate it.

    The positive-class probabilities of every evaluated model are averaged
    with weights proportional to each model's AUC, thresholded at 0.5 and
    scored against ``trainer.y_test``. The outcome is stored under
    ``trainer.results['Ensemble']``.

    NOTE(review): the weights come from AUC values computed on the same test
    set the ensemble is scored on, so its metrics are likely optimistic.

    Args:
        trainer: Object exposing ``results`` (each entry holding
            ``probabilities`` and ``auc``) and ``y_test``.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall``, ``f1``, ``auc`` and
        ``mcc`` of the ensemble.

    Raises:
        ValueError: If there are no base-model results to combine.
    """
    print("\n" + "="*60)
    print("9. ENSEMBLE MODEL CREATION")
    print("="*60)

    # Collect the base models only. A previous run of this function leaves an
    # 'Ensemble' entry in trainer.results; feeding that back in would blend
    # the old ensemble into the new one on every re-run.
    members = [
        results for model_name, results in trainer.results.items()
        if model_name != 'Ensemble'
    ]
    if not members:
        raise ValueError("No evaluated base models available to build an ensemble.")

    probabilities = np.array([results['probabilities'] for results in members])
    weights = np.array([results['auc'] for results in members])  # Weight by AUC score
    weights = weights / weights.sum()  # Normalize weights

    # Create weighted ensemble
    ensemble_probs = np.average(probabilities, axis=0, weights=weights)
    ensemble_preds = (ensemble_probs > 0.5).astype(int)

    # Evaluate ensemble
    ensemble_metrics = {
        'accuracy': accuracy_score(trainer.y_test, ensemble_preds),
        'precision': precision_score(trainer.y_test, ensemble_preds),
        'recall': recall_score(trainer.y_test, ensemble_preds),
        'f1': f1_score(trainer.y_test, ensemble_preds),
        'auc': roc_auc_score(trainer.y_test, ensemble_probs),
        'mcc': matthews_corrcoef(trainer.y_test, ensemble_preds)
    }

    print("Ensemble Model Performance:")
    for metric, value in ensemble_metrics.items():
        print(f"{metric.upper()}: {value:.4f}")

    # Add ensemble to results
    trainer.results['Ensemble'] = {
        **ensemble_metrics,
        'predictions': ensemble_preds,
        'probabilities': ensemble_probs
    }

    return ensemble_metrics

# =============================================================================
# 10. MAIN EXECUTION PIPELINE
# =============================================================================

# def main_pipeline(file_path):
#     """Execute the complete ML pipeline"""
#     print("INSURANCE CLAIM PREDICTION - COMPLETE ML PIPELINE")
#     print("="*70)

#     # Step 1: Load and explore data
#     df = load_and_explore_data(file_path)

#     # Step 2: EDA
#     df = perform_eda(df)

#     # Step 3: Preprocessing
#     X_resampled, y_resampled, feature_names, scaler = preprocess_data(df)

#     # Step 4: Initialize trainer
#     trainer = ModelTrainer(X_resampled, y_resampled, feature_names)
#     trainer.split_data()

#     # Step 5: Train traditional models
#     trainer.train_traditional_models()

#     # Step 6: Train deep learning models
#     xlstm_history, nn_history =

In [2]:
def main_pipeline(file_path):
    """Run every pipeline stage in order, from loading to saving artifacts.

    Any exception raised by a stage is reported (message plus traceback) and
    turned into a ``(None, None)`` result instead of propagating.

    Args:
        file_path: Path of the CSV dataset to process.

    Returns:
        ``(trainer, ensemble_metrics)`` on success, ``(None, None)`` on failure.
    """
    import traceback

    print("INSURANCE CLAIM PREDICTION - COMPLETE ML PIPELINE")
    print("="*70)

    try:
        # Steps 1-2: load the raw data, then derive the target and explore.
        raw_frame = load_and_explore_data(file_path)
        labelled_frame = perform_eda(raw_frame)

        # Step 3: cleaning, encoding, scaling and resampling.
        features, labels, columns, fitted_scaler = preprocess_data(labelled_frame)

        # Step 4: set up the trainer and create the data splits.
        trainer = ModelTrainer(features, labels, columns)
        trainer.split_data()

        # Steps 5-6: fit the scikit-learn models, then the PyTorch models.
        trainer.train_traditional_models()
        _xlstm_history, _nn_history = trainer.train_deep_models(epochs=50)

        # Steps 7-8: score everything on the test set and plot comparisons.
        trainer.evaluate_all_models()
        create_comprehensive_visualizations(trainer)

        # Step 9: combine the individual models.
        ensemble_metrics = create_ensemble_model(trainer)

        # Steps 10-11: report and persist.
        print_final_summary(trainer, ensemble_metrics)
        save_results_and_models(trainer, fitted_scaler, file_path)
    except Exception as e:
        print("Error in pipeline execution: " + str(e))
        traceback.print_exc()
        return None, None

    return trainer, ensemble_metrics

def print_final_summary(trainer, ensemble_metrics):
    """Print the metric table, the best model per metric and business notes.

    Args:
        trainer: Object whose ``results`` maps model names to dicts holding at
            least ``accuracy``, ``precision``, ``recall``, ``f1``, ``auc`` and
            ``mcc``.
        ensemble_metrics: Dict with the ensemble's ``auc``, ``recall``,
            ``precision`` and ``accuracy``.
    """
    print("\n" + "="*60)
    print("10. FINAL RESULTS SUMMARY")
    print("="*60)

    metric_names = ['accuracy', 'precision', 'recall', 'f1', 'auc', 'mcc']

    # Build the table from the scalar metrics only and make it numeric. The
    # result dicts also carry prediction/probability arrays; including them
    # yields an object-dtype frame on which round() silently does nothing and
    # idxmax() is unreliable.
    scalar_rows = {
        model_name: {metric: model_results[metric] for metric in metric_names}
        for model_name, model_results in trainer.results.items()
    }
    results_df = pd.DataFrame(scalar_rows).T[metric_names].astype(float)

    print("\nComplete Performance Summary:")
    print(results_df.round(4))

    # Find best model for each metric
    print("\nBest Model by Metric:")
    for metric in metric_names:
        best_model = results_df[metric].idxmax()
        best_score = results_df[metric].max()
        print(f"{metric.upper()}: {best_model} ({best_score:.4f})")

    # Overall best model (weighted by AUC and F1)
    results_df['combined_score'] = 0.6 * results_df['auc'] + 0.4 * results_df['f1']
    best_overall = results_df['combined_score'].idxmax()
    print(f"\nBest Overall Model: {best_overall} (Combined Score: {results_df.loc[best_overall, 'combined_score']:.4f})")

    # Business insights
    print("\n" + "="*60)
    print("BUSINESS INSIGHTS & RECOMMENDATIONS")
    print("="*60)

    print("\n1. Model Performance Analysis:")
    if ensemble_metrics['auc'] > 0.85:
        print("   ✓ Excellent predictive performance achieved (AUC > 0.85)")
    elif ensemble_metrics['auc'] > 0.75:
        print("   ✓ Good predictive performance achieved (AUC > 0.75)")
    else:
        print("   ⚠ Moderate predictive performance (AUC < 0.75) - consider feature engineering")

    print(f"\n2. Risk Assessment Capability:")
    print(f"   • Ensemble model can correctly identify {ensemble_metrics['recall']:.1%} of high-risk claims")
    print(f"   • {ensemble_metrics['precision']:.1%} of flagged claims are actually high-risk")
    print(f"   • Overall accuracy: {ensemble_metrics['accuracy']:.1%}")

    print(f"\n3. Business Impact:")
    print(f"   • Potential reduction in claim processing costs")
    print(f"   • Improved risk-based pricing strategies")
    print(f"   • Enhanced customer segmentation capabilities")

def save_results_and_models(trainer, scaler, original_file_path):
    """Write models, scaler, metrics, feature names and a guide to disk.

    A new directory ``insurance_claim_results_<timestamp>`` is created in the
    current working directory. Non-PyTorch models and the scaler are pickled,
    PyTorch models are saved as ``state_dict`` files, and the scalar metrics
    and feature names are written as JSON next to a Markdown deployment guide.

    Args:
        trainer: Object exposing ``models``, ``results`` and ``feature_names``.
        scaler: Fitted preprocessing object to pickle as ``scaler.pkl``.
        original_file_path: Not used by this function; kept so existing
            callers keep working.

    Returns:
        Name of the directory that was created.
    """
    print("\n" + "="*60)
    print("11. SAVING MODELS AND RESULTS")
    print("="*60)

    import pickle
    import json
    from datetime import datetime
    import os

    # Read the clock once so the directory name and the guide agree.
    now = datetime.now()

    # Create results directory
    timestamp = now.strftime("%Y%m%d_%H%M%S")
    results_dir = f"insurance_claim_results_{timestamp}"
    os.makedirs(results_dir, exist_ok=True)

    def _artifact_path(model_name, extension):
        """Return the file path for a model, e.g. 'Random Forest' -> random_forest_model.pkl."""
        return os.path.join(results_dir, f"{model_name.replace(' ', '_').lower()}_model.{extension}")

    # Save traditional models
    for model_name, model in trainer.models.items():
        if not isinstance(model, nn.Module):  # Skip PyTorch models for pickle
            model_file = _artifact_path(model_name, "pkl")
            with open(model_file, 'wb') as f:
                pickle.dump(model, f)
            print(f"Saved {model_name} to {model_file}")

    # Save PyTorch models (weights only; the class is needed to reload them)
    for model_name, model in trainer.models.items():
        if isinstance(model, nn.Module):
            model_file = _artifact_path(model_name, "pth")
            torch.save(model.state_dict(), model_file)
            print(f"Saved {model_name} to {model_file}")

    # Save scaler
    scaler_file = os.path.join(results_dir, "scaler.pkl")
    with open(scaler_file, 'wb') as f:
        pickle.dump(scaler, f)
    print(f"Saved scaler to {scaler_file}")

    # Save results
    results_file = os.path.join(results_dir, "model_results.json")
    # Convert numpy arrays to lists for JSON serialization; the per-sample
    # predictions and probabilities are left out of the file.
    json_results = {}
    for model_name, results in trainer.results.items():
        json_results[model_name] = {
            k: v.tolist() if isinstance(v, np.ndarray) else float(v) if isinstance(v, np.floating) else v
            for k, v in results.items() if k not in ['predictions', 'probabilities']
        }

    with open(results_file, 'w') as f:
        json.dump(json_results, f, indent=2)
    print(f"Saved results to {results_file}")

    # Save feature names. list() accepts a pandas Index as well as a plain
    # list or tuple, whereas .tolist() exists only on the former.
    features_file = os.path.join(results_dir, "feature_names.json")
    with open(features_file, 'w') as f:
        json.dump(list(trainer.feature_names), f, indent=2)
    print(f"Saved feature names to {features_file}")

    # Values interpolated into the deployment guide below.
    generated_on = now.strftime("%Y-%m-%d %H:%M:%S")
    best_model_name = max(trainer.results.keys(), key=lambda x: trainer.results[x]['auc'])
    best_auc = max(trainer.results[model]['auc'] for model in trainer.results)
    ensemble_auc = trainer.results.get('Ensemble', {}).get('auc', 'N/A')

    # Create deployment guide
    deployment_guide = f"""
# Insurance Claim Prediction Model - Deployment Guide

## Generated on: {generated_on}

## Model Performance Summary
- Best Overall Model: {best_model_name}
- Best AUC Score: {best_auc:.4f}
- Ensemble AUC Score: {ensemble_auc}

## Files in this directory:
- *.pkl files: Trained scikit-learn models
- *.pth files: Trained PyTorch models
- scaler.pkl: Feature scaler for preprocessing
- model_results.json: Detailed performance metrics
- feature_names.json: List of feature names
- deployment_guide.md: This file

## Quick Start for Predictions:
```python
import pickle
import torch
import numpy as np

# Load best traditional model (example with Gradient Boosting)
with open('gradient_boosting_model.pkl', 'rb') as f:
    model = pickle.load(f)

# Load scaler
with open('scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

# Make prediction on new data
def predict_claim_risk(new_data):
    # new_data should be a pandas DataFrame with same features as training
    scaled_data = scaler.transform(new_data)
    prediction = model.predict(scaled_data)
    probability = model.predict_proba(scaled_data)[:, 1]
    return prediction, probability
```

## Model Interpretation:
- Prediction = 1: High risk of significant insurance claim
- Prediction = 0: Low risk of significant insurance claim
- Probability threshold: 0.5 (adjustable based on business needs)

## Recommended Usage:
1. Use ensemble model for highest accuracy
2. Use Gradient Boosting for interpretability
3. Use xLSTM for complex pattern detection
4. Monitor model performance regularly and retrain as needed
"""

    guide_file = os.path.join(results_dir, "deployment_guide.md")
    with open(guide_file, 'w') as f:
        f.write(deployment_guide)
    print(f"Saved deployment guide to {guide_file}")

    print(f"\nAll results saved to directory: {results_dir}")
    return results_dir

# =============================================================================
# MAIN EXECUTION
# =============================================================================

if __name__ == "__main__":
    # `os` is only needed for the existence check below; imported locally,
    # as in the original cell.
    import os

    # Configuration
    DATA_FILE_PATH = "insurance_data.csv"  # Update this path to your dataset

    print("Starting Insurance Claim Prediction Pipeline...")
    print("Please ensure your dataset is available at:", DATA_FILE_PATH)

    # Check up front that the dataset exists so a missing file produces
    # guidance for the user rather than a traceback from inside the pipeline.
    if not os.path.exists(DATA_FILE_PATH):
        print(f"Error: Dataset file '{DATA_FILE_PATH}' not found!")
        print("Please update the DATA_FILE_PATH variable with the correct path to your dataset.")
        print("\nAlternatively, you can run the pipeline with a custom path:")
        print("trainer, ensemble_metrics = main_pipeline('path/to/your/dataset.csv')")
    else:
        # Execute the complete pipeline
        trainer, ensemble_metrics = main_pipeline(DATA_FILE_PATH)

        # A None trainer is treated as the pipeline's failure signal.
        if trainer is not None:
            print("\n" + "="*70)
            print("PIPELINE EXECUTION COMPLETED SUCCESSFULLY!")
            print("="*70)
            print(f"Total models trained: {len(trainer.models)}")

            # Guard the lookup: formatting a missing or None AUC with ':.4f'
            # would raise even though training itself succeeded.
            ensemble_auc = ensemble_metrics.get('auc') if ensemble_metrics is not None else None
            if ensemble_auc is None:
                print("Best ensemble AUC: N/A")
            else:
                print(f"Best ensemble AUC: {ensemble_auc:.4f}")

            print("Check the results directory for saved models and detailed reports.")
        else:
            print("Pipeline execution failed. Please check the error messages above.")

# Alternative execution for Jupyter notebooks or custom usage.
# Kept as comments on purpose: a bare triple-quoted string at the end of a
# cell is evaluated as the cell's last expression and echoed as its output.
#
#   DATA_PATH = "your_insurance_dataset.csv"
#   trainer, ensemble_results = main_pipeline(DATA_PATH)
#
#   # Access specific model results:
#   print("Gradient Boosting AUC:", trainer.results['Gradient Boosting']['auc'])
#   print("xLSTM AUC:", trainer.results['xLSTM']['auc'])
#   print("Ensemble AUC:", ensemble_results['auc'])
#
#   # Make predictions on new data:
#   # new_predictions = trainer.models['Gradient Boosting'].predict(new_scaled_data)

Starting Insurance Claim Prediction Pipeline...
Please ensure your dataset is available at: insurance_data.csv
Error: Dataset file 'insurance_data.csv' not found!
Please update the DATA_FILE_PATH variable with the correct path to your dataset.

Alternatively, you can run the pipeline with a custom path:
trainer, ensemble_metrics = main_pipeline('path/to/your/dataset.csv')


'\n# For Jupyter notebook or custom usage:\nDATA_PATH = "your_insurance_dataset.csv"\ntrainer, ensemble_results = main_pipeline(DATA_PATH)\n\n# Access specific model results:\nprint("Gradient Boosting AUC:", trainer.results[\'Gradient Boosting\'][\'auc\'])\nprint("xLSTM AUC:", trainer.results[\'xLSTM\'][\'auc\'])\nprint("Ensemble AUC:", ensemble_results[\'auc\'])\n\n# Make predictions on new data:\n# new_predictions = trainer.models[\'Gradient Boosting\'].predict(new_scaled_data)\n'