In [1]:
# Prerequisite (run once per environment; the imports below need it): %pip install -U ucimlrepo

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from ucimlrepo import fetch_ucirepo
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'ucimlrepo'

In [None]:
# Fetch data (dataset id 891 from the UCI ML Repository client)
cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

# Data (as pandas dataframes)
X = cdc_diabetes_health_indicators.data.features
y = cdc_diabetes_health_indicators.data.targets

# One-hot encode BEFORE splitting so every split ends up with the same columns.
# Encoding does not estimate any statistic from the rows, so it cannot leak
# information between splits.
X = pd.get_dummies(X)

# Split data: 20% test, then 25% of the remaining 80% as validation
# (i.e. 60% train / 20% validation / 20% test). The same random_state on the
# same number of rows keeps the split membership reproducible.
X_train_val_raw, X_test_raw, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X_train_val_raw, y_train_val, test_size=0.25, random_state=42
)

# Preprocess data
# Imputation means and scaling statistics are estimated on the TRAINING split
# only and then applied unchanged to the other splits. Fitting them on the full
# dataset (as was done previously) lets validation/test rows influence the
# preprocessing and makes the reported metrics optimistic.
means = X_train_raw.mean()
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw.fillna(means))
X_val = scaler.transform(X_val_raw.fillna(means))
X_test = scaler.transform(X_test_raw.fillna(means))

# Names kept from the earlier version of this cell, now produced with the
# train-fitted transform, in case other cells still reference them.
X_train_val = scaler.transform(X_train_val_raw.fillna(means))
X = X.fillna(means)
X_scaled = scaler.transform(X)

# Convert data to PyTorch tensors
# Feature splits are cast to float32 as-is; label splits are read through
# `.values`, cast to float32 and viewed as a single column (N, 1).
X_train_tensor, X_val_tensor, X_test_tensor = (
    torch.tensor(feature_split.astype(np.float32))
    for feature_split in (X_train, X_val, X_test)
)
y_train_tensor, y_val_tensor, y_test_tensor = (
    torch.tensor(label_split.values.astype(np.float32)).view(-1, 1)
    for label_split in (y_train, y_val, y_test)
)

# Create DataLoaders
# A single batch_size drives all three loaders so that tuning it here cannot
# leave the test loader on a stale hard-coded value.
batch_size = 64
train_data = TensorDataset(X_train_tensor, y_train_tensor)
val_data = TensorDataset(X_val_tensor, y_val_tensor)
test_data = TensorDataset(X_test_tensor, y_test_tensor)
# Only the training data is shuffled; validation and test keep a fixed order.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [None]:
# Define model
class DiabetesPredictor(nn.Module):
    """Fully connected binary classifier that outputs a sigmoid probability.

    The network is a stack of five hidden stages, each
    ``Linear -> ReLU -> BatchNorm1d`` (the first three followed by ``Dropout``),
    and a final ``Linear(32, 1) -> Sigmoid`` head.

    Args:
        input_features: Number of input columns fed to the first linear layer.
    """

    def __init__(self, input_features):
        super().__init__()

        # (hidden width, dropout probability or None) for each hidden stage.
        stage_plan = [(512, 0.3), (256, 0.2), (128, 0.1), (64, None), (32, None)]

        stack = []
        width_in = input_features
        for width_out, drop_p in stage_plan:
            stack.append(nn.Linear(width_in, width_out))
            stack.append(nn.ReLU())
            stack.append(nn.BatchNorm1d(width_out))
            if drop_p is not None:
                stack.append(nn.Dropout(drop_p))
            width_in = width_out

        # Output head: one unit squashed by a sigmoid.
        stack.append(nn.Linear(width_in, 1))
        stack.append(nn.Sigmoid())

        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return its output."""
        return self.network(x)

# Seed PyTorch's global RNG before the model is built so that weight
# initialisation (and, when cells are run in order, dropout masks and
# DataLoader shuffling) is reproducible across Restart & Run All.
# 42 matches the random_state used for the train/val/test split.
torch.manual_seed(42)

# The input width is taken from the training matrix's column count.
input_features = X_train.shape[1]
model = DiabetesPredictor(input_features)

In [None]:
# Loss function and optimizer
# BCELoss is applied directly to the model output; Adam uses lr=0.001.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 40
train_losses, val_losses = [], []  # one averaged loss per epoch

for epoch in range(num_epochs):
    # ---- optimisation pass over the training batches ----
    model.train()
    batch_losses = []
    for features, targets in train_loader:
        optimizer.zero_grad()
        batch_loss = criterion(model(features), targets)
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.item())

    # Epoch training loss is the plain mean of the per-batch losses.
    epoch_train_loss = np.mean(batch_losses)
    train_losses.append(epoch_train_loss)

    # ---- evaluation pass over the validation batches (no gradients) ----
    model.eval()
    with torch.no_grad():
        val_batch_losses = [
            criterion(model(features), targets).item()
            for features, targets in val_loader
        ]
    epoch_val_loss = np.mean(val_batch_losses)
    val_losses.append(epoch_val_loss)

    print(f'Epoch {epoch+1}, Train Loss: {epoch_train_loss:.4f}, Validation Loss: {epoch_val_loss:.4f}')

In [None]:
# Test loop
# Records the loss of every 20th test batch (indices 0, 20, 40, ...);
# all other batches are skipped.
test_losses = []
model.eval()
with torch.no_grad():
    for batch_index, (batch_x, batch_y) in enumerate(test_loader):
        if batch_index % 20 != 0:
            continue  # not a sampled batch
        sampled_loss = criterion(model(batch_x), batch_y)
        test_losses.append(sampled_loss.item())

In [None]:
# Plotting training and validation loss vs epochs
# The x-axes are derived from the losses actually recorded rather than from
# num_epochs, so the plot still works if training was interrupted early or
# num_epochs was edited after the training cell ran (otherwise matplotlib
# raises on the x/y length mismatch).
train_epochs = range(1, len(train_losses) + 1)
val_epochs = range(1, len(val_losses) + 1)

plt.figure(figsize=(10, 5))
plt.plot(train_epochs, train_losses, label='Training Loss')
plt.plot(val_epochs, val_losses, label='Validation Loss')
plt.title('Training and Validation Loss vs Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Plotting test loss for every 20th batch
# Each recorded loss is placed at the index of the batch it came from
# (0, 20, 40, ...).
sampled_batch_numbers = [20 * position for position in range(len(test_losses))]

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(sampled_batch_numbers, test_losses, label='Test Loss per 20th Batch')
ax.set_title('Test Loss for Every 20th Batch')
ax.set_xlabel('Batch Number')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score

# Compute ROC curve and ROC area
# Put the model in eval mode explicitly instead of relying on whichever cell
# ran last: in train mode dropout is active and batch norm uses batch
# statistics, which would distort the predicted probabilities.
model.eval()
with torch.no_grad():
    # The model emits one column per sample; flatten to 1-D scores because the
    # metric functions below expect 1-D arrays for a binary problem.
    y_pred_probs = model(X_test_tensor).numpy().ravel()

# Flatten the test labels to a 1-D array as well so labels and scores have
# matching shapes.
y_test_flat = np.asarray(y_test).ravel()

fpr, tpr, _ = roc_curve(y_test_flat, y_pred_probs)
roc_auc = auc(fpr, tpr)

# Compute Precision-Recall curve and area
precision, recall, _ = precision_recall_curve(y_test_flat, y_pred_probs)
pr_auc = average_precision_score(y_test_flat, y_pred_probs)

# Plotting the ROC Curve and Precision-Recall Curve
# Two side-by-side panels on one figure: ROC on the left, PR on the right.
fig, (roc_ax, pr_ax) = plt.subplots(1, 2, figsize=(12, 5))

# ROC Curve (the dashed diagonal runs from (0, 0) to (1, 1))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic')
roc_ax.legend(loc="lower right")

# Precision-Recall Curve
pr_ax.plot(recall, precision, color='green', lw=2, label='Precision-Recall curve (area = %0.2f)' % pr_auc)
pr_ax.set_xlim([0.0, 1.0])
pr_ax.set_ylim([0.0, 1.05])
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curve')
pr_ax.legend(loc="lower left")

plt.show()

# Plotting histogram of predicted probabilities
# 50 bins over the model's predicted probabilities for the test set.
fig, hist_ax = plt.subplots(figsize=(6, 4))
hist_ax.hist(y_pred_probs, bins=50, color='blue', alpha=0.7, label='Predicted probabilities')
hist_ax.set_title('Histogram of predicted probabilities')
hist_ax.set_xlabel('Predicted probability')
hist_ax.set_ylabel('Frequency')
hist_ax.legend()
plt.show()