## Preprocessing

#### 90/10 initial split

In [13]:
randomState = 42

import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
raw_dataset = pd.read_csv("./data/processed_data.csv") #data has X and Y
X = raw_dataset.drop(columns=["DR"])
Y = pd.DataFrame(raw_dataset["DR"])

#* 90/10 split for training and final test
X_FOR_FOLDS, X_FINAL_TEST, Y_FOR_FOLDS, Y_FINAL_TEST = train_test_split(X, Y, test_size=0.1, random_state=randomState, stratify=Y)

#### Preprocessing & folds generation

In [14]:
from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    Normalizer,
    PowerTransformer,
    QuantileTransformer,
    RobustScaler,
    StandardScaler,
    minmax_scale,
)

In [15]:
def FOLDS_GENERATOR(X, Y, normalisation_method=MinMaxScaler(), n_splits=5, randomState=None, oversample=False):
    
    """
    Generates stratified folds with specified normalization.
    
    For list of scalers, see:
    https://scikit-learn.org/stable/api/sklearn.preprocessing.html
    
    For more details on scaling and normalization effects, see:
    https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#
    
    normalisation_method should be an instance of a scaler, e.g.,
    - MinMaxScaler()
    - MaxAbsScaler()
    - Quantile_Transform(output_distribution='uniform')
    
    Returns a list of tuples, each containing:
    (X_train_scaled, X_test_scaled, Y_train, Y_test), representing data for each fold
    """
    kF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=randomState)
    kFolds_list = []
    
    for fold, (train_idx, test_idx) in enumerate(kF.split(X, Y)):
        # Split the data into training and testing sets for this fold
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]
        
        # Fit the scaler on the training data and transform both train and test sets
        X_train_scaled = normalisation_method.fit_transform(X_train)
        X_test_scaled = normalisation_method.transform(X_test)
        
        if oversample:
            # Oversample the training data if needed (e.g., using SMOTE or similar techniques)
            # This is a placeholder; actual oversampling code should be implemented here
            # X_train_scaled....
            pass
        
        # Convert back to DataFrame to maintain column names
        X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns, index=X_train.index)
        X_test_scaled = pd.DataFrame(X_test_scaled, columns=X.columns, index=X_test.index)
        
        # Ensure 'gender' is still binary (0 or 1)
        if X_train_scaled['Gender'].isin([0, 1]).all():
            kFolds_list.append((X_train_scaled, X_test_scaled, Y_train, Y_test))
        else:
            print("Warning: 'gender' column contains unexpected values after scaling.") 
               
        print(f"Fold: {fold+1}, Train: {kFolds_list[fold][0].shape}, Test: {kFolds_list[fold][1].shape}")   
    return kFolds_list

In [16]:
kFolds = FOLDS_GENERATOR(X_FOR_FOLDS, Y_FOR_FOLDS, normalisation_method=MinMaxScaler(), n_splits=5, randomState=randomState)

Fold: 1, Train: (4593, 28), Test: (1149, 28)
Fold: 2, Train: (4593, 28), Test: (1149, 28)
Fold: 3, Train: (4594, 28), Test: (1148, 28)
Fold: 4, Train: (4594, 28), Test: (1148, 28)
Fold: 5, Train: (4594, 28), Test: (1148, 28)


## Training & Model definition

#### Helper functions for model training

In [17]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
from modularModels1 import BlockMaker, modularNN, BasicModel
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using", device)

def init_weights(model): #tested already
    if isinstance(model, nn.Linear):  # Apply only to linear layers
        nn.init.xavier_uniform_(model.weight)
        if model.bias is not None:
            nn.init.zeros_(model.bias)
            
def fold_to_dataloader_tensor(train_x, test_x, train_y, test_y, batchSize=64, device=device):
    train_dataset = TensorDataset(
        torch.tensor(train_x.values,dtype=torch.float32).to(device), 
        torch.tensor(train_y.values,dtype=torch.float32).to(device))
    val_dataset = TensorDataset(
        torch.tensor(test_x.values,dtype=torch.float32).to(device), 
        torch.tensor(test_y.values,dtype=torch.float32).to(device))

    train_loader = DataLoader(train_dataset, batch_size=batchSize, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=len(val_dataset), shuffle=False)
    return train_loader, val_loader 

Using cuda


#### Other Loss Functions

In [18]:
import torch
import torch.nn as nn
class FocalLoss(nn.Module):
    """
    The alpha parameter adjusts the weight for the minority class.
    The gamma parameter adjusts how much to focus on hard examples (higher values will focus more on difficult-to-classify samples).
    
    criterion = FocalLoss(alpha=0.25, gamma=2.0).to(device) 
    """
    def __init__(self, alpha=0.25, gamma=2.0):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        BCE_loss = nn.BCEWithLogitsLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-BCE_loss)  # pt is the probability for each class
        F_loss = self.alpha * (1 - pt) ** self.gamma * BCE_loss
        return F_loss.mean()
    
class DiceLoss(nn.Module):
    """
    Dice loss is a metric commonly used for imbalanced datasets, especially in segmentation tasks. It measures the overlap between the predicted and true classes. While it’s more often used in segmentation, it can be adapted for binary classification tasks.
    
    criterion = DiceLoss().to(device) 
    """
    def __init__(self, smooth=1e-6):
        super(DiceLoss, self).__init__()
        self.smooth = smooth

    def forward(self, inputs, targets):
        intersection = torch.sum(inputs * targets)
        union = torch.sum(inputs) + torch.sum(targets)
        dice = (2. * intersection + self.smooth) / (union + self.smooth)
        return 1 - dice

#! default_criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([2.0])).to(device)

#### __Model + loss + optimiser__ definition & initialisation

In [19]:
model_1 = BasicModel(28,[512,256,32],1,
                   [nn.Tanh(),nn.LeakyReLU()])
# print(model_1)
model = model_1.to(device)

criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([2.0])).to(device) #! IMPORTANT .to(device) for GPU
optimiser = optim.Adagrad(model.parameters(),lr=0.005)

model.apply(init_weights)

BasicModel(
  (block): Sequential(
    (0): Linear(in_features=28, out_features=512, bias=True)
    (1): Tanh()
    (2): Linear(in_features=512, out_features=256, bias=True)
    (3): LeakyReLU(negative_slope=0.01)
    (4): Linear(in_features=256, out_features=32, bias=True)
    (5): LeakyReLU(negative_slope=0.01)
    (6): Linear(in_features=32, out_features=1, bias=True)
  )
)

#### Training Loop

In [None]:
import numpy as np
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import recall_score, accuracy_score,f1_score, precision_score, confusion_matrix, roc_auc_score, classification_report, roc_curve

epochs = 10
batchsize = 64

#* Fold Training Loop, repeat for each fold
for fold, (train_x, test_x, train_y, test_y) in enumerate(kFolds, start=1):
    # print(f"Fold: {fold}")
    #* Convert the fold to PyTorch tensors and create DataLoader objects
    train_loader, val_loader = fold_to_dataloader_tensor(train_x, test_x, train_y, test_y, batchsize, device)

    #* Set model to training mode: essential for dropout and batch norm layers
    model.train()
    
    #* Epoch Training loop for this fold
    for epoch in range(1,epochs+1):
        running_loss = 0.0 #! for future loss tracking
        #* Mini-batch training loop through the whole fold
        for batch, (inputs, labels) in enumerate(train_loader,start=1):
            optimiser.zero_grad() #? Zero the gradients
            
            outputs = model(inputs) #? Forward pass through the model
            loss = criterion(outputs, labels) #? Calculate loss
            loss.backward() #? Backpropagation
            running_loss += loss.item()
            optimiser.step() #? Update weights
    
    #* Now we evaluate the model on the validation set        
    model.eval() #? Set model to evaluation mode
    with torch.no_grad(): #? No need to track gradients during evaluation       
        for batch, (inputs, labels) in enumerate(val_loader,start=1):#! one pass because val_loader batch size is all   
            outputs = model(inputs)  
            predictions = (torch.sigmoid(outputs) > 0.5).float().cpu()#? Convert logits to binary predictions
            labels = labels.cpu() #? Move labels to CPU for compatibility with sklearn metrics
            
    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, pos_label=1)
    recall = recall_score(labels, predictions, pos_label=1)
    f1 = f1_score(labels, predictions, pos_label=1)
    auc = roc_auc_score(labels, predictions)

    print(f"Fold: {fold}".ljust(12),
            f"AccuracyScore: {(accuracy):.4f}".ljust(22),
            f"RecallScore: {(precision):.2f}".ljust(20),
            f"f1Score: {(recall):.2f}".ljust(20),   
            f"f1Score: {(f1):.2f}".ljust(20),
                )
        
    

Fold: 1      AccuracyScore: 0.9391  RecallScore: 0.73    f1Score: 0.64        f1Score: 0.68       
Fold: 2      AccuracyScore: 0.9426  RecallScore: 0.77    f1Score: 0.62        f1Score: 0.69       
Fold: 3      AccuracyScore: 0.9521  RecallScore: 0.84    f1Score: 0.66        f1Score: 0.73       
Fold: 4      AccuracyScore: 0.9451  RecallScore: 0.73    f1Score: 0.72        f1Score: 0.73       
Fold: 5      AccuracyScore: 0.9408  RecallScore: 0.72    f1Score: 0.67        f1Score: 0.70       
