In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from ucimlrepo import fetch_ucirepo  # Ensure this package is installed

# Load Cleveland Heart Disease dataset
# (id=45 is the UCI ML Repository ID for the Cleveland dataset)
heart_disease = fetch_ucirepo(id=45)

# Combine features and target: feature columns first, target column last
df = pd.concat(
    [heart_disease.data.features, heart_disease.data.targets],
    axis=1,
)

# Assign column names (13 feature columns followed by the raw 'num' target)
df.columns = [
    'age',
    'sex',
    'cp',
    'trestbps',
    'chol',
    'fbs',
    'restecg',
    'thalach',
    'exang',
    'oldpeak',
    'slope',
    'ca',
    'thal',
    'num',
]

# Simplify 'num' column to binary classification:
# any value above 0 becomes 1, everything else becomes 0.
df['target'] = (df['num'] > 0).astype('int64')
df = df.drop(columns='num')

# Handle missing values: turn placeholder strings into NaN, force every
# column to a numeric dtype, then fill the gaps with the column medians.
df = df.replace(['?', ''], np.nan)
df = df.apply(pd.to_numeric, errors='coerce')
df = df.fillna(df.median())

# Feature scaling: standardize every column except the label
scaler = StandardScaler()
feature_matrix = df.drop(columns='target').to_numpy()
X = scaler.fit_transform(feature_matrix)
y = df['target'].to_numpy()

# Define PyTorch Dataset
class HeartDiseaseDataset(Dataset):
    """Dataset that pairs each feature row with its label.

    Both ``features`` and ``targets`` are converted to ``float32`` tensors
    when the dataset is constructed.
    """

    @staticmethod
    def _as_float_tensor(values):
        # Single place that fixes the dtype used for features and labels.
        return torch.tensor(values, dtype=torch.float32)

    def __init__(self, features, targets):
        self.features = self._as_float_tensor(features)
        self.targets = self._as_float_tensor(targets)

    def __len__(self):
        # The number of samples is the number of stored targets.
        return len(self.targets)

    def __getitem__(self, idx):
        sample = self.features[idx]
        label = self.targets[idx]
        return sample, label

# Define Logistic Regression-inspired Neural Network
class LRInspiredNN(nn.Module):
    """One linear unit followed by a sigmoid.

    ``forward`` maps an input whose last dimension is ``input_size`` to an
    output whose last dimension is 1, with every value passed through a
    sigmoid.
    """

    def __init__(self, input_size):
        super().__init__()
        linear_unit = nn.Linear(input_size, 1)
        squash = nn.Sigmoid()
        # Kept as a Sequential named `layers` so parameter names stay
        # `layers.0.weight` / `layers.0.bias`.
        self.layers = nn.Sequential(linear_unit, squash)

    def forward(self, x):
        probabilities = self.layers(x)
        return probabilities

# Cross-validation setup
# Stratified k-fold cross-validation with early stopping on validation loss.
# Each fold trains a fresh LRInspiredNN, restores the weights from the epoch
# with the lowest validation loss, records a classification report and plots
# the loss curves. The per-fold reports are averaged at the end.
k_folds = 5
batch_size = 16
num_epochs = 50
patience = 10  # Epochs without validation-loss improvement before stopping
kf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Collect fold results and structure them for averaging
all_folds_results = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"Fold {fold + 1}/{k_folds}")

    # Reset best model state at the start of each fold
    best_model_state = None

    # Split data for this fold
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Standardize with statistics from the training split only, so the
    # validation rows do not influence the scaling they are evaluated under.
    # Standardization is unaffected by an earlier per-feature affine
    # rescaling, so this stays correct when X was already scaled globally.
    fold_scaler = StandardScaler()
    X_train = fold_scaler.fit_transform(X_train)
    X_val = fold_scaler.transform(X_val)

    # Create DataLoaders
    train_dataset = HeartDiseaseDataset(X_train, y_train)
    val_dataset = HeartDiseaseDataset(X_val, y_val)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Initialize model, loss function, and optimizer
    model = LRInspiredNN(input_size=X.shape[1])
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Early stopping variables
    best_val_loss = np.inf
    epochs_no_improve = 0
    train_losses = []
    val_losses = []

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        running_loss = 0.0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            # squeeze(-1) removes only the output dimension. A bare squeeze()
            # would also collapse a batch of size 1 to a 0-d tensor, and
            # BCELoss would then reject the input/target shape mismatch.
            outputs = model(X_batch).squeeze(-1)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            # Weight the batch mean by batch size so the epoch loss is a true
            # per-sample average even when the last batch is smaller.
            running_loss += loss.item() * X_batch.size(0)
        epoch_loss = running_loss / len(train_loader.dataset)
        train_losses.append(epoch_loss)

        # Validation phase
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                outputs = model(X_batch).squeeze(-1)
                loss = criterion(outputs, y_batch)
                val_loss += loss.item() * X_batch.size(0)
        val_loss /= len(val_loader.dataset)
        val_losses.append(val_loss)

        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {epoch_loss:.4f}, Val Loss: {val_loss:.4f}")

        # Early stopping check
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
            # state_dict() hands back the live parameter tensors, which the
            # optimizer keeps updating in place. Clone them so the snapshot
            # really holds the weights of the best epoch.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print("Early stopping triggered")
                break

    # Load the best model state for this fold, if available
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    # Evaluate model on validation set
    model.eval()
    with torch.no_grad():
        val_inputs = torch.tensor(X_val, dtype=torch.float32)
        # squeeze(-1) keeps a 1-D result even for a single validation row.
        y_pred_prob = model(val_inputs).squeeze(-1).numpy()
        y_pred = (y_pred_prob >= 0.5).astype(int)

    # Generate classification report for this fold and append to results
    report = classification_report(y_val, y_pred, output_dict=True)
    all_folds_results.append(report)

    # Plot training and validation loss for each fold
    # A fresh figure per fold keeps curves from different folds from being
    # drawn on the same axes when show() does not clear the figure.
    plt.figure()
    plt.plot(train_losses, label="Train Loss")
    plt.plot(val_losses, label="Validation Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title(f"Training and Validation Loss - Fold {fold + 1}")
    plt.legend()
    plt.show()

# Convert all_folds_results into a DataFrame
# (nested report dicts are flattened into one column per metric)
fold_results_df = pd.json_normalize(all_folds_results)

# Calculate the average metrics across all folds and display rounded results
average_report = fold_results_df.mean().round(2)
print("\nAverage Cross-Validation Results:")
print(average_report)

Fold 1/5


NameError: name 'Data' is not defined