In [96]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from glob import glob
import os
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

# Define a custom dataset

In [99]:
# Custom Dataset for MIL: each item is one patient's bag of patches (wrap it in a DataLoader to iterate)
class PatientPatchDataset(Dataset):
    """Multiple-instance-learning dataset that yields one bag of patches per patient.

    Every item gathers all ``.npy`` files in ``root_dir`` whose file name starts
    with the patient id, flattens each patch to a 1-D feature vector and stacks
    the result into a single ``(num_patches, num_features)`` array.
    """

    def __init__(self, csv_file, root_dir, feature_dim=100352):
        """
        Args:
            csv_file (string): Path to the csv file with patient labels. The
                file must provide a 'patient_id' and a 'label' column.
            root_dir (string): Directory with all the patch files.
            feature_dim (int): Width of the empty ``(0, feature_dim)`` array
                returned for a patient that has no patches. The default,
                100352 (= 7 * 7 * 2048), matches the flattened patch size
                used in this notebook.
        """
        self.labels_frame = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.feature_dim = feature_dim
        self.patient_ids = self.labels_frame['patient_id'].unique()

    def __len__(self):
        """Return the number of distinct patients (one bag per patient)."""
        return len(self.patient_ids)

    def __getitem__(self, idx):
        """Load the bag of patches and the label of the ``idx``-th patient.

        Returns:
            dict: ``{'patches': ndarray of shape (num_patches, num_features),
            'label': first label listed for the patient in the csv}``.
            ``num_patches`` is 0 when no patch is found for the patient.
        """
        # `glob` is bound to the function at file level, so the escape helper
        # is imported locally.
        from glob import escape as glob_escape

        patient_id = self.patient_ids[idx]
        is_patient = self.labels_frame['patient_id'] == patient_id
        patient_label = self.labels_frame.loc[is_patient, 'label'].values[0]

        # Escape glob metacharacters so that only the trailing '*' is a wildcard.
        # NOTE(review): this is a prefix match, so an id such as 'MCC1' would
        # also pick up 'MCC10*.npy' -- confirm the ids share a fixed width.
        pattern = os.path.join(
            glob_escape(self.root_dir), f'{glob_escape(str(patient_id))}*.npy'
        )
        # glob returns matches in arbitrary order; sorting keeps the patch
        # order of a bag reproducible between runs and machines.
        patch_files = sorted(glob(pattern))

        # Collect the flattened patches of every file belonging to the patient
        all_patches = []
        for file in patch_files:
            patches_in_file = np.load(file)  # e.g. (x, 7, 7, 2048)
            if patches_in_file.shape[0] == 0:
                # reshape(0, -1) is ambiguous and raises; nothing to add anyway
                continue
            # Flatten every patch: (x, 7, 7, 2048) -> (x, 7*7*2048)
            all_patches.append(patches_in_file.reshape(patches_in_file.shape[0], -1))

        if all_patches:
            # Concatenate all patches along the first dimension
            all_patches = np.concatenate(all_patches, axis=0)
        else:
            # No patches for this patient: return an empty bag of known width
            all_patches = np.empty((0, self.feature_dim))

        sample = {'patches': all_patches, 'label': patient_label}
        return sample


# Test the dataloader

In [88]:
import torch
from torch.utils.data import DataLoader

# Path to the CSV file containing patient labels.
# The default is the original cluster location; set MIL_LABELS_CSV to point
# the notebook at another copy without editing this cell.
csv_file = os.environ.get(
    'MIL_LABELS_CSV',
    '/home/80024223/data/cytology-some-data/unique-labels.csv',
)

# Directory containing all the .npy patch files (override with MIL_PATCH_DIR)
root_dir = os.environ.get(
    'MIL_PATCH_DIR',
    '/mnt/Dept_MachineLearning/Faculty/Rasool, Ghulam/Shared Resources/HNC-Histopath-Embeddings/matched_patients',
)

# Initialize the dataset
dataset = PatientPatchDataset(csv_file=csv_file, root_dir=root_dir)

# One patient (bag) per batch, in csv order, so the printed shapes are easy to follow
dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

# Function to test the DataLoader
def test_dataloader(dataloader):
    for i, data in enumerate(dataloader, 0):
        patches, label = data['patches'], data['label']
        print(f"Batch {i+1}")
        print(f"Shape of patches: {patches.shape}")  # Expected shape: (batch_size, total_patches, 7, 7, 2048)
        print(f"Label: {label}")
        print(f"Number of patches in this batch: {patches.shape[1]}")  # total_patches
        print("=====================================")
        
        # To keep the test concise, let's only look at the first batch
        # if i == 4:  # Look at the first batch only for testing
        #     break

# Walk through the loader once and print what every batch contains
test_dataloader(dataloader=dataloader)


patient_id: MCC001 , label: 0 , num_patch files: 1
Batch 1
Shape of patches: torch.Size([1, 8, 100352])
Label: tensor([0])
Number of patches in this batch: 8
patient_id: MCC002 , label: 1 , num_patch files: 1
Batch 2
Shape of patches: torch.Size([1, 11, 100352])
Label: tensor([1])
Number of patches in this batch: 11
patient_id: MCC003 , label: 0 , num_patch files: 1
Batch 3
Shape of patches: torch.Size([1, 8, 100352])
Label: tensor([0])
Number of patches in this batch: 8
patient_id: MCC007 , label: 1 , num_patch files: 1
Batch 4
Shape of patches: torch.Size([1, 6, 100352])
Label: tensor([1])
Number of patches in this batch: 6
patient_id: MCC008 , label: 1 , num_patch files: 3
Batch 5
Shape of patches: torch.Size([1, 42, 100352])
Label: tensor([1])
Number of patches in this batch: 42
patient_id: MCC009 , label: 1 , num_patch files: 1
Batch 6
Shape of patches: torch.Size([1, 8, 100352])
Label: tensor([1])
Number of patches in this batch: 8
patient_id: MCC011 , label: 1 , num_patch files:

# MIL Model

In [89]:
class SimpleMIL(nn.Module):
    """Instance-level MIL classifier with max pooling over the patches of a bag.

    Every patch is scored independently by a two-layer perceptron ending in a
    sigmoid; the bag prediction is the highest patch score of that bag.

    Args:
        input_dim (int): Number of features of one flattened patch.
            Defaults to 100352 (= 7 * 7 * 2048).
        hidden_dim (int): Width of the hidden layer. Defaults to 500.
    """

    def __init__(self, input_dim=100352, hidden_dim=500):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return one probability per bag.

        Args:
            x (Tensor): ``[batch_size, num_patches, num_features]``. Trailing
                patch dimensions that have not been flattened yet (for
                example ``[batch_size, num_patches, 7, 7, 2048]``) are
                accepted as well, and an input with fewer than three
                dimensions is treated as a single bag.

        Returns:
            Tensor: shape ``[batch_size]``, the maximum patch score of each bag.

        Raises:
            ValueError: if ``x`` holds no patches (max over an empty bag is
                undefined).
        """
        num_features = self.fc1.in_features
        if x.numel() == 0:
            raise ValueError("SimpleMIL received a bag without patches")
        if x.dim() < 3:
            # A lone bag [num_patches, num_features] (or a single patch vector)
            x = x.reshape(1, -1, num_features)

        # The batch size must be read before the patches are flattened;
        # deriving it afterwards merges all bags of the batch into one.
        batch_size = x.size(0)

        # Score every patch independently: [batch_size * num_patches, num_features]
        x = x.reshape(-1, num_features)
        x = F.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))

        # Back to [batch_size, num_patches] so pooling stays inside each bag
        x = x.view(batch_size, -1)

        # Aggregate patch-level predictions to a bag-level prediction.
        # Max pooling is used here; mean or attention pooling are alternatives.
        x, _ = torch.max(x, dim=1)  # dim=1 aggregates across patches

        return x


# Training Function

In [102]:
def train_model_gpu(dataset, model, epochs=10, batch_size=5, learning_rate=0.001,
                    device=None, log_every=10):
    """Train ``model`` on ``dataset`` with Adam and binary cross-entropy, on GPU when possible.

    Each dataset item must be a dict with a 'patches' entry (array or tensor
    of shape ``[num_patches, num_features]``) and a 'label' entry (0 or 1).
    Bags may hold different numbers of patches: every bag is passed through
    the model on its own as a batch of one, and the loss is averaged over the
    bags of the mini-batch. Bags without patches are skipped.

    NOTE(review): ``nn.BCELoss`` clamps its log terms at -100, so a per-bag
    loss of exactly 100 means the model output is saturated at the wrong
    extreme and no gradient flows -- consider normalising the inputs or
    training on logits with ``nn.BCEWithLogitsLoss``.

    Args:
        dataset: Map-style dataset yielding the dicts described above.
        model (nn.Module): Maps ``[1, num_patches, num_features]`` to one
            probability in [0, 1].
        epochs (int): Number of passes over the dataset.
        batch_size (int): Number of bags per optimisation step.
        learning_rate (float): Adam learning rate.
        device (str or torch.device, optional): Device to train on. ``None``
            keeps the historical choice ``cuda:7`` when that GPU exists, else
            the first GPU, else the CPU.
        log_every (int): Print the running mean loss every ``log_every`` batches.
    """
    if device is None:
        if torch.cuda.is_available():
            # cuda:7 only exists on hosts with at least 8 visible GPUs
            device = "cuda:7" if torch.cuda.device_count() > 7 else "cuda:0"
        else:
            device = "cpu"
    device = torch.device(device)
    print(f"Training on {device}")

    log_every = max(1, int(log_every))

    # collate_fn=list keeps the samples as a plain list: the default collation
    # stacks the bags and fails as soon as two bags differ in patch count.
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=list)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.BCELoss()

    model.to(device)  # Move model to the appropriate device
    model.train()  # Set the model to training mode

    skipped_bags = 0
    for epoch in range(epochs):
        running_loss = 0.0
        batches_in_window = 0
        for i, samples in enumerate(dataloader):
            optimizer.zero_grad()

            predictions, targets = [], []
            for sample in samples:
                bag = torch.as_tensor(sample['patches']).float()
                if bag.shape[0] == 0:
                    # Nothing to pool over for a patient without patches
                    skipped_bags += 1
                    continue
                predictions.append(model(bag.unsqueeze(0).to(device)).view(-1))
                targets.append(float(sample['label']))
            if not predictions:
                continue

            outputs = torch.cat(predictions)
            labels = torch.tensor(targets, dtype=torch.float32, device=device)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            batches_in_window += 1
            if (i + 1) % log_every == 0 and batches_in_window:
                print(f'Epoch {epoch + 1}, Batch {i + 1}, Loss: {running_loss / batches_in_window:.4f}')
                running_loss = 0.0
                batches_in_window = 0

        if batches_in_window:
            # Report the batches left over after the last full logging window
            print(f'Epoch {epoch + 1}, Batch {i + 1}, Loss: {running_loss / batches_in_window:.4f}')

    if skipped_bags:
        print(f'Skipped {skipped_bags} empty bag(s) over {epochs} epoch(s)')
    print('Finished Training')


In [105]:
def train_model_cpu(dataset, model, epochs=10, batch_size=5, learning_rate=0.001,
                    log_every=10):
    """Train ``model`` on ``dataset`` with Adam and binary cross-entropy on the CPU.

    Each dataset item must be a dict with a 'patches' entry (array or tensor
    of shape ``[num_patches, num_features]``) and a 'label' entry (0 or 1).
    Bags may hold different numbers of patches: every bag is passed through
    the model on its own as a batch of one, and the loss is averaged over the
    bags of the mini-batch. Bags without patches are skipped.

    Args:
        dataset: Map-style dataset yielding the dicts described above.
        model (nn.Module): Maps ``[1, num_patches, num_features]`` to one
            probability in [0, 1]. It is moved to the CPU in place.
        epochs (int): Number of passes over the dataset.
        batch_size (int): Number of bags per optimisation step.
        learning_rate (float): Adam learning rate.
        log_every (int): Print the running mean loss every ``log_every`` batches.
    """
    print(f"Training on CPU")
    log_every = max(1, int(log_every))

    # collate_fn=list keeps the samples as a plain list: the default collation
    # stacks the bags and fails as soon as two bags differ in patch count.
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=list)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.BCELoss()

    # The model may still live on a GPU from an earlier run in the same
    # kernel; the inputs below stay on the CPU, so bring the model there too.
    model.to("cpu")
    model.train()  # Set the model to training mode

    skipped_bags = 0
    for epoch in range(epochs):
        running_loss = 0.0
        batches_in_window = 0
        for i, samples in enumerate(dataloader):
            optimizer.zero_grad()

            predictions, targets = [], []
            for sample in samples:
                bag = torch.as_tensor(sample['patches']).float()
                if bag.shape[0] == 0:
                    # Nothing to pool over for a patient without patches
                    skipped_bags += 1
                    continue
                predictions.append(model(bag.unsqueeze(0)).view(-1))
                targets.append(float(sample['label']))
            if not predictions:
                continue

            outputs = torch.cat(predictions)
            labels = torch.tensor(targets, dtype=torch.float32)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            batches_in_window += 1
            if (i + 1) % log_every == 0 and batches_in_window:
                print(f'Epoch {epoch + 1}, Batch {i + 1}, Loss: {running_loss / batches_in_window:.4f}')
                running_loss = 0.0
                batches_in_window = 0

        if batches_in_window:
            # Report the batches left over after the last full logging window
            print(f'Epoch {epoch + 1}, Batch {i + 1}, Loss: {running_loss / batches_in_window:.4f}')

    if skipped_bags:
        print(f'Skipped {skipped_bags} empty bag(s) over {epochs} epoch(s)')
    print('Finished Training')

In [109]:
# Label table and patch directory used for training. The defaults are the
# original shared-drive locations; set MIL_LABELS_CSV / MIL_PATCH_DIR to
# train on another copy of the data without editing this cell.
dataset = PatientPatchDataset(
    csv_file=os.environ.get(
        'MIL_LABELS_CSV',
        '/mnt/Dept_MachineLearning/Faculty/Rasool, Ghulam/Shared Resources/HNC-Histopath-Embeddings/matched_patients/unique-labels.csv',
    ),
    root_dir=os.environ.get(
        'MIL_PATCH_DIR',
        '/mnt/Dept_MachineLearning/Faculty/Rasool, Ghulam/Shared Resources/HNC-Histopath-Embeddings/matched_patients/',
    ),
)

# Seed the weight initialisation so that repeated runs are comparable
torch.manual_seed(0)
model = SimpleMIL()

# One patient (bag) per optimisation step.
# For a CPU-only run call train_model_cpu(dataset, model, batch_size=1) instead.
train_model_gpu(dataset, model, batch_size=1)

Training on cuda:7
Epoch 1, Batch 10, Loss: 80.8438
Epoch 1, Batch 20, Loss: 90.0000
Epoch 1, Batch 30, Loss: 90.0000
Epoch 1, Batch 40, Loss: 80.0000
Epoch 2, Batch 10, Loss: 80.0000
Epoch 2, Batch 20, Loss: 90.0000
Epoch 2, Batch 30, Loss: 90.0000
Epoch 2, Batch 40, Loss: 80.0000
Epoch 3, Batch 10, Loss: 80.0000
Epoch 3, Batch 20, Loss: 90.0000
Epoch 3, Batch 30, Loss: 90.0000
Epoch 3, Batch 40, Loss: 80.0000
Epoch 4, Batch 10, Loss: 80.0000
Epoch 4, Batch 20, Loss: 90.0000
Epoch 4, Batch 30, Loss: 90.0000
Epoch 4, Batch 40, Loss: 80.0000
Epoch 5, Batch 10, Loss: 80.0000
Epoch 5, Batch 20, Loss: 90.0000
Epoch 5, Batch 30, Loss: 90.0000
Epoch 5, Batch 40, Loss: 80.0000
Epoch 6, Batch 10, Loss: 80.0000
Epoch 6, Batch 20, Loss: 90.0000
Epoch 6, Batch 30, Loss: 90.0000
Epoch 6, Batch 40, Loss: 80.0000
Epoch 7, Batch 10, Loss: 80.0000
Epoch 7, Batch 20, Loss: 90.0000
Epoch 7, Batch 30, Loss: 90.0000
Epoch 7, Batch 40, Loss: 80.0000
Epoch 8, Batch 10, Loss: 80.0000
Epoch 8, Batch 20, Loss: