# Download Data

In [1]:
import kagglehub

# Fetch the NIH chest X-ray dataset via kagglehub; `nih_path` is the local
# directory that later cells build file paths from.
nih_path = kagglehub.dataset_download("nih-chest-xrays/data")

print(f"Path to dataset files: {nih_path}")

Path to dataset files: /kaggle/input/data


In [2]:
# Fetch the VinBigData original-image dataset; `vin_path` is the local
# directory returned by kagglehub.
vin_path = kagglehub.dataset_download("awsaf49/vinbigdata-original-image-dataset")

print(f"Path to dataset files: {vin_path}")

Path to dataset files: /kaggle/input/vinbigdata-original-image-dataset


In [3]:
import os

# Show the top-level entries of each downloaded dataset directory
print(f"Files in NIH dataset directory: {os.listdir(nih_path)}")
print(f"Files in VinBigData directory: {os.listdir(vin_path)}")

Files in NIH dataset directory: ['images_003', 'images_012', 'LOG_CHESTXRAY.pdf', 'README_CHESTXRAY.pdf', 'BBox_List_2017.csv', 'images_009', 'images_008', 'images_007', 'test_list.txt', 'images_010', 'ARXIV_V5_CHESTXRAY.pdf', 'images_002', 'images_011', 'Data_Entry_2017.csv', 'images_001', 'train_val_list.txt', 'images_005', 'FAQ_CHESTXRAY.pdf', 'images_004', 'images_006']
Files in VinBigData directory: ['vinbigdata']


In [None]:
# Fetch the Guangzhou pneumonia dataset and show what the download contains
guangzhou_path = kagglehub.dataset_download("paultimothymooney/chest-xray-pneumonia")

print(f"Path to dataset files: {guangzhou_path}")
print(f"Files in Guangzhou directory: {os.listdir(guangzhou_path)}")

# Exploratory Data Analysis

In [4]:
# Look through NIH csv file
import pandas as pd

# Read the NIH per-image metadata table and preview its first five rows
nih_csv_file = os.path.join(nih_path, "Data_Entry_2017.csv")
nih_df = pd.read_csv(nih_csv_file)

print(nih_df.head(5))

        Image Index          Finding Labels  Follow-up #  Patient ID  \
0  00000001_000.png            Cardiomegaly            0           1   
1  00000001_001.png  Cardiomegaly|Emphysema            1           1   
2  00000001_002.png   Cardiomegaly|Effusion            2           1   
3  00000002_000.png              No Finding            0           2   
4  00000003_000.png                  Hernia            0           3   

   Patient Age Patient Gender View Position  OriginalImage[Width  Height]  \
0           58              M            PA                 2682     2749   
1           58              M            PA                 2894     2729   
2           58              M            PA                 2500     2048   
3           81              M            PA                 2500     2048   
4           81              F            PA                 2582     2991   

   OriginalImagePixelSpacing[x     y]  Unnamed: 11  
0                        0.143  0.143          NaN 

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


In [5]:
# Locate the VinBigData annotation CSV and image folder, then load the
# annotations into a dataframe.
vin_path2 = os.path.join(vin_path, "vinbigdata")
vin_train_csv, vin_root_dir = (
    os.path.join(vin_path2, leaf) for leaf in ("train.csv", "train")
)
vin_df = pd.read_csv(vin_train_csv)


In [6]:
# The annotation table holds several rows per image; keep only the first row
# (and therefore the first label) for each image_id.
vin_df_unique = vin_df.drop_duplicates(subset="image_id", keep="first")
print(f"Unique images in VinDr: {len(vin_df_unique)}")

Unique images in VinDr: 15000


In [7]:
# Draw a fixed (seeded) sample of 4000 de-duplicated VinDr images and preview it
vin_df_sample = vin_df_unique.sample(random_state=42, n=4000)
print(vin_df_sample.head(5))

                               image_id          class_name  class_id rad_id  \
22667  730ee58d327ab8bcdf8167683c71f565          No finding        14     R3   
8987   d8284119d2a86d1f3db93bb6c32272fc          No finding        14     R6   
31151  5c9c0490f1629ab3659c7785ae22224d          No finding        14     R3   
904    7db70125d7739e6cd0c442e7b7592d4c  Pulmonary fibrosis        13     R9   
8054   77694248815e59abca6fa327b40f4343          No finding        14     R2   

       x_min  y_min   x_max   y_max  width  height  
22667    NaN    NaN     NaN     NaN   2805    3408  
8987     NaN    NaN     NaN     NaN   2642    2850  
31151    NaN    NaN     NaN     NaN   1994    2430  
904    841.0  937.0  1149.0  1435.0   2961    3107  
8054     NaN    NaN     NaN     NaN   3072    3072  


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


# Set up datasets and dataloaders

In [8]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms, datasets
from torch.utils.data import DataLoader, Dataset
from PIL import Image

# Save dataset paths for easy retrieval.
# Reuse the directory returned by kagglehub instead of hard-coding the Kaggle
# mount point, so the notebook also works when the download lands elsewhere.
data_dir = nih_path
data_entry_csv = os.path.join(data_dir, "Data_Entry_2017.csv")
train_list_txt = os.path.join(data_dir, "train_val_list.txt")
test_list_txt = os.path.join(data_dir, "test_list.txt")

# Load image CSV (one row per image: file name, finding labels, patient info)
df_metadata = pd.read_csv(data_entry_csv)

# Helper for reading the official train/test file lists
def load_txt(file_path):
    """Return the distinct lines of the text file at ``file_path`` as a set."""
    with open(file_path) as handle:
        contents = handle.read()
    return set(contents.splitlines())

# File names belonging to the official NIH train/val and test lists
train_images = load_txt(train_list_txt)
test_images = load_txt(test_list_txt)

# Split NIH dataset: keep a seeded random sample of each official split
df_train = df_metadata.loc[df_metadata["Image Index"].isin(train_images)].sample(
    n=20000, random_state=42
)
df_test = df_metadata.loc[df_metadata["Image Index"].isin(test_images)].sample(
    n=1000, random_state=42
)

In [None]:
# Image preprocessing shared by every dataset: resize to 224x224, convert to a
# tensor, then normalise with mean 0.5 / std 0.5.
# NOTE(review): a single-element mean/std is used although images are loaded as
# RGB and the backbone is ImageNet-pretrained — confirm this is intended.
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5]),
    ]
)

In [9]:
# Factory for a fresh ImageNet-pretrained AlexNet with a single-logit head
def create_binary_alexnet():
    """Build an AlexNet with ImageNet weights whose last classifier layer is
    replaced by a ``Linear(4096, 1)`` layer producing one logit per image."""
    net = models.alexnet(weights=models.AlexNet_Weights.IMAGENET1K_V1)
    binary_head = nn.Linear(4096, 1)  # Binary output
    net.classifier[6] = binary_head
    return net

In [10]:
# Dataset wrapper around the NIH metadata table
class ChestXrayDataset(Dataset):
    """Serve ``(image, label)`` pairs for rows of an NIH metadata dataframe.

    Column 0 of ``dataframe`` is read as the image file name and column 1 as
    the finding-label string. The label is 0 when that string contains
    "No Finding" and 1 otherwise.
    """

    def __init__(self, dataframe, root_dir, transform=None):
        self.transform = transform
        self.root_dir = root_dir
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def find_image_path(self, img_name):
        """Return the first existing ``images_XXX/images/<img_name>`` path
        under ``root_dir`` (XXX from 001 to 012).

        Raises:
            FileNotFoundError: if none of the twelve folders holds the file.
        """
        for shard in range(1, 13):
            candidate = os.path.join(self.root_dir, f"images_{shard:03d}/images", img_name)
            if os.path.exists(candidate):
                return candidate
        raise FileNotFoundError(f"{img_name} not found in any subfolder.")

    def __getitem__(self, idx):
        file_name = self.dataframe.iloc[idx, 0]
        image = Image.open(self.find_image_path(file_name)).convert("RGB")
        if self.transform:
            image = self.transform(image)
        findings = self.dataframe.iloc[idx, 1]
        # 0 = "No Finding", 1 = any other finding string
        label = int("No Finding" not in findings)
        return image, label

# Define datasets and dataloaders for NIH
nih_train_dataset = ChestXrayDataset(df_train, data_dir, transform)
nih_test_dataset = ChestXrayDataset(df_test, data_dir, transform)

nih_train_loader = DataLoader(nih_train_dataset, batch_size=32, shuffle=True, num_workers = 4, pin_memory = True)
nih_test_loader = DataLoader(nih_test_dataset, batch_size=32, shuffle=False, num_workers = 4, pin_memory = True)

# Load AlexNet model. Use the GPU when one is available and fall back to the
# CPU otherwise, so the cell does not fail on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
alexnet_naive = create_binary_alexnet().to(device)

# Define loss and optimizer: the model emits one raw logit per image, so the
# loss applies the sigmoid internally (BCEWithLogitsLoss).
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(alexnet_naive.parameters(), lr=0.0001, momentum=0.9)

Using device: cuda


Downloading: "https://download.pytorch.org/models/alexnet-owt-7be5be79.pth" to /root/.cache/torch/hub/checkpoints/alexnet-owt-7be5be79.pth
100%|██████████| 233M/233M [00:01<00:00, 208MB/s]  


In [11]:
# Dataset wrapper around the VinDr annotation table
class VinDrDataset(Dataset):
    """Serve ``(image, label)`` pairs for rows of a VinDr annotation dataframe.

    Each row's ``image_id`` plus ".jpg" is looked up directly inside
    ``root_dir``. The label is 0 when ``class_name`` contains "No finding"
    and 1 otherwise.
    """

    def __init__(self, dataframe, root_dir, transform=None):
        self.transform = transform
        self.root_dir = root_dir
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        record = self.dataframe.iloc[idx]
        img_name = record['image_id'] + ".jpg"
        img_path = os.path.join(self.root_dir, img_name)

        # Fail with the image name and folder rather than a bare open() error
        if not os.path.exists(img_path):
            raise FileNotFoundError(f"{img_name} not found in {self.root_dir}")

        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)

        # 0 = "No finding", 1 = any other class name
        label = int("No finding" not in record['class_name'])
        return image, label

In [12]:
# Split Vin data into train and test splits
from sklearn.model_selection import train_test_split

# Seeded 75/25 split of the 4000 sampled VinDr images (3000 train, 1000 test)
vin_train_df, vin_test_df = train_test_split(vin_df_sample, random_state=42, test_size=0.25)

In [13]:
# Wrap the VinDr train/test splits as datasets that read from the image folder
vin_train_dataset = VinDrDataset(dataframe=vin_train_df, root_dir=vin_root_dir, transform=transform)
vin_test_dataset = VinDrDataset(dataframe=vin_test_df, root_dir=vin_root_dir, transform=transform)

# Batches of 32; only the training loader is shuffled
vin_train_loader = DataLoader(
    vin_train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)
vin_test_loader = DataLoader(
    vin_test_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

In [None]:
# Create dataloaders for GZ test and train
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader

# The Guangzhou download ships ready-made train/ and test/ folders with one
# sub-folder per class, which ImageFolder reads directly.
train_dir = os.path.join(guangzhou_path, 'chest_xray/train')
test_dir = os.path.join(guangzhou_path, 'chest_xray/test')

gz_train_dataset = ImageFolder(root=train_dir, transform=transform)
gz_test_dataset = ImageFolder(root=test_dir, transform=transform)

In [None]:
from sklearn.utils import shuffle

# Flatten the ImageFolder (path, label) samples into dataframes so subsets can
# be drawn with DataFrame.sample; row i corresponds to dataset sample i.
gz_train_files = [path for path, _ in gz_train_dataset.samples]
gz_train_labels = [target for _, target in gz_train_dataset.samples]
gz_train_df = pd.DataFrame(dict(image_path=gz_train_files, label=gz_train_labels))

gz_test_files = [path for path, _ in gz_test_dataset.samples]
gz_test_labels = [target for _, target in gz_test_dataset.samples]
gz_test_df = pd.DataFrame(dict(image_path=gz_test_files, label=gz_test_labels))

In [None]:
# Create dataloaders for GZ.
# The test loader keeps a fixed order. The train loader is shuffled, like the
# other training loaders in this notebook: ImageFolder lists samples class by
# class, so unshuffled batches would each contain a single class.
gz_test_loader = DataLoader(gz_test_dataset, batch_size=32, shuffle=False, num_workers=4, pin_memory=True)
gz_train_loader = DataLoader(gz_train_dataset, batch_size=32, shuffle=True, num_workers=4, pin_memory=True)

In [75]:
# Class balance of the VinDr test split after collapsing to normal / abnormal
print(
    vin_test_df['class_name']
    .map(lambda name: "No Finding" if "No finding" in name else "Abnormal")
    .value_counts()
)


class_name
No Finding    712
Abnormal      288
Name: count, dtype: int64


In [76]:
# Class balance of the NIH test sample after collapsing to normal / abnormal
print(
    df_test['Finding Labels']
    .map(lambda findings: "No Finding" if "No Finding" in findings else "Abnormal")
    .value_counts()
)


Finding Labels
Abnormal      617
No Finding    383
Name: count, dtype: int64


# Define training and evaluation functions

In [14]:
# Define evaluate_model function to store performance metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

def evaluate_model(model, dataloader):
    """Score a single-logit binary classifier on every batch of ``dataloader``.

    The model is switched to eval mode and run without gradient tracking.
    Logits are passed through a sigmoid; a probability above 0.5 is predicted
    as class 1.

    Args:
        model: network whose output has a size-1 second dimension (one logit
            per image).
        dataloader: iterable of ``(images, labels)`` batches; labels are
            compared against the 0/1 predictions.

    Returns:
        dict with "Accuracy", "Precision", "Recall", "F1 Score" and "AUC-ROC",
        each rounded to 4 decimals. "AUC-ROC" is NaN when the evaluated labels
        contain fewer than two classes, because the score is undefined there.
    """
    model.eval()
    all_preds = []
    all_probs = []
    all_labels = []

    with torch.no_grad():
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            # (batch, 1) logits -> (batch,) probabilities
            probs = torch.sigmoid(outputs).squeeze(1)
            preds = (probs > 0.5).long()

            all_preds.extend(preds.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # roc_auc_score raises ValueError for single-class labels; report NaN
    # instead of losing the other metrics.
    if len(np.unique(all_labels)) > 1:
        auc = roc_auc_score(all_labels, all_probs)
    else:
        auc = float("nan")

    return {
        "Accuracy": round(accuracy_score(all_labels, all_preds), 4),
        "Precision": round(precision_score(all_labels, all_preds, zero_division=0), 4),
        "Recall": round(recall_score(all_labels, all_preds, zero_division=0), 4),
        "F1 Score": round(f1_score(all_labels, all_preds, zero_division=0), 4),
        "AUC-ROC": round(auc, 4),
    }

In [24]:
# Shared training loop used by every experiment in this notebook
def train_model(model, train_loader, criterion, optimizer, epochs=5):
    """Optimise ``model`` on ``train_loader`` for ``epochs`` passes.

    Labels of each batch are cast to float and given a trailing dimension
    before being passed to ``criterion`` together with the model outputs.
    The mean batch loss is printed after every epoch. Nothing is returned;
    the model is updated in place.
    """
    model.train()
    for epoch_number in range(1, epochs + 1):
        print(f"Starting Epoch {epoch_number}")  # Debugging line
        loss_total = 0.0
        for images, labels in train_loader:
            images = images.to(device)
            labels = labels.float().unsqueeze(1).to(device)

            optimizer.zero_grad()
            batch_loss = criterion(model(images), labels)
            batch_loss.backward()
            optimizer.step()

            loss_total += batch_loss.item()
        print(f"Epoch {epoch_number}, Loss: {loss_total/len(train_loader)}")
    print("Training Finished!")  # Ensure we know when training ends

# Experimentation

In [None]:
# Baseline: train the "naive" model on the NIH source data only (10 epochs)
train_model(
    alexnet_naive,
    nih_train_loader,
    criterion,
    optimizer,
    epochs=10,
)

In [80]:
# Metrics of every experiment are collected in `results`, keyed by experiment
# name; start with the naive model on the VinDr and NIH test sets.
results = {
    "Naive_vin": evaluate_model(alexnet_naive, vin_test_loader),
    "Naive_nih": evaluate_model(alexnet_naive, nih_test_loader),
}

In [86]:
# Score the naive (NIH-only) model on the Guangzhou test set as well
results.update({"Naive_gz": evaluate_model(alexnet_naive, gz_test_loader)})

In [88]:
# Fine-Tuning Experiments with different LRs and subset sizes
import copy

# Sweep: fine-tune a copy of the naive model on VinDr subsets of several sizes
# at several learning rates, then score each copy on both test sets.
learning_rates = [0.001, 0.0001, 0.0005]
subset_sizes = [3000, 1000, 500]

for subset_size in subset_sizes:
    # Seeded subset (the seed equals the subset size) of the VinDr training split
    vin_subset = vin_train_df.sample(n=subset_size, random_state=subset_size)
    vin_dataset = VinDrDataset(vin_subset, vin_root_dir, transform)
    vin_loader = DataLoader(vin_dataset, shuffle=True, batch_size=32)

    for lr in learning_rates:
        print(f"\n🔁 Fine-tuning on {subset_size} VinDr samples @ LR={lr}")

        # Every run starts from the same NIH-trained weights
        model_ft = copy.deepcopy(alexnet_naive)
        optimizer_ft = torch.optim.SGD(model_ft.parameters(), momentum=0.9, lr=lr)

        # Fine-tune for 10 epochs
        train_model(model_ft, vin_loader, criterion, optimizer_ft, epochs=10)

        # Target-domain score, then source-domain score of the same model
        results[f"VinDr{subset_size}_LR{lr}_Vin_Test"] = evaluate_model(model_ft, vin_test_loader)
        results[f"VinDr{subset_size}_LR{lr}_Nih_Test"] = evaluate_model(model_ft, nih_test_loader)


🔁 Fine-tuning on 3000 VinDr samples @ LR=0.001
Starting Epoch 1
Epoch 1, Loss: 0.4167614016444125
Starting Epoch 2
Epoch 2, Loss: 0.292131983853401
Starting Epoch 3
Epoch 3, Loss: 0.23455219795095159
Starting Epoch 4
Epoch 4, Loss: 0.22101311678899097
Starting Epoch 5
Epoch 5, Loss: 0.19061739532712926
Starting Epoch 6
Epoch 6, Loss: 0.1751107146686062
Starting Epoch 7
Epoch 7, Loss: 0.19193430118104246
Starting Epoch 8
Epoch 8, Loss: 0.16123782347967017
Starting Epoch 9
Epoch 9, Loss: 0.15092638832457522
Starting Epoch 10
Epoch 10, Loss: 0.11328724627085823
Training Finished!

🔁 Fine-tuning on 3000 VinDr samples @ LR=0.0001
Starting Epoch 1
Epoch 1, Loss: 0.47044996092928215
Starting Epoch 2
Epoch 2, Loss: 0.39408877840701567
Starting Epoch 3
Epoch 3, Loss: 0.34525497241857206
Starting Epoch 4
Epoch 4, Loss: 0.33155941313251536
Starting Epoch 5
Epoch 5, Loss: 0.3135951240646078
Starting Epoch 6
Epoch 6, Loss: 0.2915453088093311
Starting Epoch 7
Epoch 7, Loss: 0.27667907768107475
Star

In [90]:
# Experimenting with different fine-tuning parameters on gz_test
from torch.utils.data import Subset

learning_rates = [0.001, 0.0001, 0.0005]
subset_sizes = [3000, 1000, 500]

for subset_size in subset_sizes:
    # Define subset from gz train. gz_train_df was built row-for-row from
    # gz_train_dataset.samples, so the sampled row indices select the same
    # images in gz_train_dataset. This replaces `GuangzhouDataset`, which is
    # not defined anywhere in this notebook and raises NameError on a fresh
    # kernel.
    guangzhou_subset = gz_train_df.sample(n=subset_size, random_state=subset_size)
    gz_dataset = Subset(gz_train_dataset, guangzhou_subset.index.tolist())
    gz_loader = DataLoader(gz_dataset, batch_size=32, shuffle=True, num_workers=4, pin_memory=True)

    for lr in learning_rates:
        print(f"\n🔁 Fine-tuning on {subset_size} Guangzhou samples @ LR={lr}")

        # Every run starts from the same NIH-trained weights
        model_ft = copy.deepcopy(alexnet_naive)
        model_ft.to(device)

        optimizer_ft = torch.optim.SGD(model_ft.parameters(), lr=lr, momentum=0.9)

        train_model(model_ft, gz_loader, criterion, optimizer_ft, epochs=10)

        key_gz = f"GZ{subset_size}_LR{lr}_GZ_Test"
        key_nih = f"GZ{subset_size}_LR{lr}_NIH_Test"

        # Store performance metrics in results
        results[key_gz] = evaluate_model(model_ft, gz_test_loader)
        results[key_nih] = evaluate_model(model_ft, nih_test_loader)


🔁 Fine-tuning on 3000 Guangzhou samples @ LR=0.001
Starting Epoch 1
Epoch 1, Loss: 0.1842102636186842
Starting Epoch 2
Epoch 2, Loss: 0.10183154181280035
Starting Epoch 3
Epoch 3, Loss: 0.08206537653236314
Starting Epoch 4
Epoch 4, Loss: 0.05591031092224009
Starting Epoch 5
Epoch 5, Loss: 0.0435319431497656
Starting Epoch 6
Epoch 6, Loss: 0.04385568058747362
Starting Epoch 7
Epoch 7, Loss: 0.03716037949383695
Starting Epoch 8
Epoch 8, Loss: 0.024634413481219374
Starting Epoch 9
Epoch 9, Loss: 0.025898481001507767
Starting Epoch 10
Epoch 10, Loss: 0.013855892619950341
Training Finished!

🔁 Fine-tuning on 3000 Guangzhou samples @ LR=0.0001
Starting Epoch 1
Epoch 1, Loss: 0.2792285527796187
Starting Epoch 2
Epoch 2, Loss: 0.12744594515955193
Starting Epoch 3
Epoch 3, Loss: 0.10145871577031434
Starting Epoch 4
Epoch 4, Loss: 0.09187070405150348
Starting Epoch 5
Epoch 5, Loss: 0.0775602522028729
Starting Epoch 6
Epoch 6, Loss: 0.0750656328164041
Starting Epoch 7
Epoch 7, Loss: 0.0717046745

In [91]:
# One row per experiment, one column per metric
results_df = pd.DataFrame(results).transpose()
print(results_df)

                             Accuracy  Precision  Recall  F1 Score  AUC-ROC
Naive_vin                      0.7120     0.8354  0.7416    0.7857   0.7605
Naive_nih                      0.6880     0.6320  0.4439    0.5215   0.6845
Naive_gz                       0.7436     0.6076  0.8932    0.7232   0.8698
VinDr3000_LR0.001_Vin_Test     0.9180     0.9145  0.9761    0.9443   0.9727
VinDr3000_LR0.001_Nih_Test     0.6480     0.5471  0.4700    0.5056   0.6299
VinDr3000_LR0.0001_Vin_Test    0.9110     0.9495  0.9242    0.9367   0.9621
VinDr3000_LR0.0001_Nih_Test    0.6650     0.5952  0.3916    0.4724   0.6502
VinDr3000_LR0.0005_Vin_Test    0.8860     0.9776  0.8596    0.9148   0.9674
VinDr3000_LR0.0005_Nih_Test    0.6580     0.5919  0.3446    0.4356   0.6506
VinDr1000_LR0.001_Vin_Test     0.9150     0.9312  0.9508    0.9409   0.9614
VinDr1000_LR0.001_Nih_Test     0.6210     0.5065  0.4073    0.4515   0.6310
VinDr1000_LR0.0001_Vin_Test    0.8870     0.9020  0.9438    0.9224   0.9386
VinDr1000_LR

In [92]:
# Combine NIH and Vin datasets for Oracle
from sklearn.model_selection import train_test_split

# NIH: rename the two relevant columns to the VinDr names and tag the source
_nih_column_names = {'Image Index': 'image_id', 'Finding Labels': 'class_name'}

nih_train = (
    df_train[['Image Index', 'Finding Labels']]
    .rename(columns=_nih_column_names)
    .assign(source='NIH')
)
nih_test = (
    df_test[['Image Index', 'Finding Labels']]
    .rename(columns=_nih_column_names)
    .assign(source='NIH')
)

# Vin: same seeded 4000-image sample and 75/25 split as the earlier cells
df_vindr = (
    vin_df_unique.sample(n=4000, random_state = 42)[['image_id', 'class_name']]
    .assign(source='VinDr')
)
vindr_train, vindr_test = train_test_split(df_vindr, random_state=42, test_size=0.25)

# Combine datasets: stack the NIH and VinDr rows with a fresh 0..n-1 index
train_df = pd.concat([nih_train, vindr_train], ignore_index=True)
test_df = pd.concat([nih_test, vindr_test], ignore_index=True)


In [93]:
# Create new Dataset class for defining combined dataset
class OracleDataset(Dataset):
    """Serve ``(image, label)`` pairs from a combined NIH + VinDr dataframe.

    Each row needs ``image_id``, ``class_name`` and ``source`` columns. Rows
    whose source is 'NIH' are searched for in the NIH ``images_XXX/images``
    folders; all other rows are read as ``<image_id>.jpg`` from ``vindr_root``.

    Labels follow the same convention as ``ChestXrayDataset`` and
    ``VinDrDataset``: 0 when the class name says "no finding" (any casing),
    1 otherwise. The model trained on this dataset is scored on loaders that
    use that convention, so the two must agree.
    """

    def __init__(self, dataframe, nih_root, vindr_root, transform=None):
        self.dataframe = dataframe
        self.nih_root = nih_root
        self.vindr_root = vindr_root
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        img_name = row['image_id']

        if row['source'] == 'NIH':
            img_path = self.find_nih_path(img_name)
        else:
            img_path = os.path.join(self.vindr_root, img_name + ".jpg")  # or .png if needed

        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)

        # 0 = no finding, 1 = abnormal. Lower-casing covers both the NIH
        # ("No Finding") and VinDr ("No finding") spellings.
        label = 0 if "no finding" in row['class_name'].lower() else 1
        return image, label

    def find_nih_path(self, img_name):
        """Return the first existing ``images_XXX/images/<img_name>`` path
        under ``nih_root`` (XXX from 001 to 012).

        Raises:
            FileNotFoundError: if none of the twelve folders holds the file.
        """
        for folder in [f"images_{str(i).zfill(3)}/images" for i in range(1, 13)]:
            img_path = os.path.join(self.nih_root, folder, img_name)
            if os.path.exists(img_path):
                return img_path
        raise FileNotFoundError(f"{img_name} not found in NIH subfolders.")


In [94]:
# Datasets and loaders over the combined NIH + VinDr tables
oracle_train_dataset = OracleDataset(
    train_df, nih_root=data_dir, vindr_root=vin_root_dir, transform=transform
)
oracle_test_dataset = OracleDataset(
    test_df, nih_root=data_dir, vindr_root=vin_root_dir, transform=transform
)

# Only the training loader is shuffled
oracle_train_loader = DataLoader(oracle_train_dataset, shuffle=True, batch_size=32)
oracle_test_loader = DataLoader(oracle_test_dataset, shuffle=False, batch_size=32)


In [95]:
# "Oracle" model: a fresh pretrained AlexNet trained on NIH + VinDr together.
# Note that the global `optimizer` is rebound to this model's parameters here.
alexnet_oracle = create_binary_alexnet().to(device)
optimizer = torch.optim.SGD(alexnet_oracle.parameters(), momentum=0.9, lr=0.0001)
train_model(
    alexnet_oracle,
    oracle_train_loader,
    criterion,
    optimizer,
    epochs=10,
)

Starting Epoch 1
Epoch 1, Loss: 0.6259746785704383
Starting Epoch 2
Epoch 2, Loss: 0.6019877066035264
Starting Epoch 3
Epoch 3, Loss: 0.5933138241108005
Starting Epoch 4
Epoch 4, Loss: 0.5866437921189135
Starting Epoch 5
Epoch 5, Loss: 0.5793740123468911
Starting Epoch 6
Epoch 6, Loss: 0.5770268055593519
Starting Epoch 7
Epoch 7, Loss: 0.5707517981695035
Starting Epoch 8
Epoch 8, Loss: 0.5671259911733476
Starting Epoch 9
Epoch 9, Loss: 0.5654075912126082
Starting Epoch 10
Epoch 10, Loss: 0.5605852416145287
Training Finished!


In [96]:
# Score the NIH + VinDr oracle model on each test set separately
results.update(
    {
        "Oracle_vin": evaluate_model(alexnet_oracle, vin_test_loader),
        "Oracle_nih": evaluate_model(alexnet_oracle, nih_test_loader),
    }
)

In [99]:
# Combine GZ and NIH using ConcatDataset for Oracle

from torch.utils.data import ConcatDataset

# Guangzhou train images followed by the NIH training sample, served as one
# shuffled stream of batches.
oracle_dataset = ConcatDataset([gz_train_dataset, nih_train_dataset])
oracle_loader = DataLoader(
    oracle_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

In [100]:
# Fresh pretrained model and optimizer for the NIH + Guangzhou oracle run.
# The global `optimizer` is rebound to this model's parameters.
alexnet_oracle2 = create_binary_alexnet().to(device)
optimizer = torch.optim.SGD(alexnet_oracle2.parameters(), momentum=0.9, lr=0.0001)

In [101]:
# Train the NIH + Guangzhou oracle model for 10 epochs
train_model(
    alexnet_oracle2,
    oracle_loader,
    criterion,
    optimizer,
    epochs=10,
)

Starting Epoch 1
Epoch 1, Loss: 0.5770221113175305
Starting Epoch 2
Epoch 2, Loss: 0.5299298852530833
Starting Epoch 3
Epoch 3, Loss: 0.5173381685756789
Starting Epoch 4
Epoch 4, Loss: 0.5078922847503333
Starting Epoch 5
Epoch 5, Loss: 0.5029188700133774
Starting Epoch 6
Epoch 6, Loss: 0.49871510872380986
Starting Epoch 7
Epoch 7, Loss: 0.4966923375495799
Starting Epoch 8
Epoch 8, Loss: 0.492355085795906
Starting Epoch 9
Epoch 9, Loss: 0.4904581658579976
Starting Epoch 10
Epoch 10, Loss: 0.48991082631391925
Training Finished!


In [102]:
# Score the NIH + Guangzhou oracle model on each test set separately
results.update(
    {
        "oracle_model2_gz_test": evaluate_model(alexnet_oracle2, gz_test_loader),
        "oracle_model2_nih_test": evaluate_model(alexnet_oracle2, nih_test_loader),
    }
)

In [103]:
# Refresh the summary table now that the oracle rows are in `results`
results_df = pd.DataFrame(results).transpose()
print(results_df)

                             Accuracy  Precision  Recall  F1 Score  AUC-ROC
Naive_vin                      0.7120     0.8354  0.7416    0.7857   0.7605
Naive_nih                      0.6880     0.6320  0.4439    0.5215   0.6845
Naive_gz                       0.7436     0.6076  0.8932    0.7232   0.8698
VinDr3000_LR0.001_Vin_Test     0.9180     0.9145  0.9761    0.9443   0.9727
VinDr3000_LR0.001_Nih_Test     0.6480     0.5471  0.4700    0.5056   0.6299
VinDr3000_LR0.0001_Vin_Test    0.9110     0.9495  0.9242    0.9367   0.9621
VinDr3000_LR0.0001_Nih_Test    0.6650     0.5952  0.3916    0.4724   0.6502
VinDr3000_LR0.0005_Vin_Test    0.8860     0.9776  0.8596    0.9148   0.9674
VinDr3000_LR0.0005_Nih_Test    0.6580     0.5919  0.3446    0.4356   0.6506
VinDr1000_LR0.001_Vin_Test     0.9150     0.9312  0.9508    0.9409   0.9614
VinDr1000_LR0.001_Nih_Test     0.6210     0.5065  0.4073    0.4515   0.6310
VinDr1000_LR0.0001_Vin_Test    0.8870     0.9020  0.9438    0.9224   0.9386
VinDr1000_LR

In [22]:
# Rebuild the VinDr training dataset (same arguments as the earlier cell)
vin_train_dataset = VinDrDataset(
    dataframe=vin_train_df,
    root_dir=vin_root_dir,
    transform=transform,
)

In [25]:
# Experiment with up-sampling
import random
import torch
import torch.nn as nn
from torch.utils.data import Subset, ConcatDataset, DataLoader
from torchvision import models

# Function for creating upsampled datasets
def create_upsampled_dataset(dataset, total_samples, seed=None):
    """Return a ``Subset`` of ``dataset`` holding ``total_samples`` items drawn
    uniformly at random with replacement.

    Args:
        dataset: any object supporting ``len()`` and integer indexing.
        total_samples: number of indices to draw; may exceed ``len(dataset)``,
            in which case items repeat.
        seed: optional seed for a private random generator, making the draw
            reproducible. With the default ``None`` the module-level ``random``
            state is used, exactly as before.

    Returns:
        torch.utils.data.Subset over the drawn indices.
    """
    rng = random if seed is None else random.Random(seed)
    indices = rng.choices(range(len(dataset)), k=total_samples)
    return Subset(dataset, indices)

# For each target size, resample the VinDr and Guangzhou training sets to that
# size, combine each with the full NIH training sample, train a fresh model on
# each mixture, and score it on the target and NIH test sets.
upsample_sizes = [5000,10000,20000]

for size in upsample_sizes:
    print(f"\n🔁 Creating Oracle datasets with {size} upsampled samples")

    # Resample with replacement (VinDr first, then Guangzhou)
    vin_upsampled = create_upsampled_dataset(vin_train_dataset, size)
    gz_upsampled = create_upsampled_dataset(gz_train_dataset, size)

    # Target-domain samples followed by the full NIH training sample
    vin_oracle = ConcatDataset([vin_upsampled, nih_train_dataset])
    gz_oracle = ConcatDataset([gz_upsampled, nih_train_dataset])

    vin_loader = DataLoader(
        vin_oracle, batch_size=32, shuffle=True, num_workers=4, pin_memory=True
    )
    gz_loader = DataLoader(
        gz_oracle, batch_size=32, shuffle=True, num_workers=4, pin_memory=True
    )

    # --- VinDr + NIH ---
    # NOTE(review): this run uses lr=1e-3 while the Guangzhou run below uses
    # 0.0001 — confirm the difference is intentional.
    print(f"🚀 Training model on VinDr (upsampled {size}) + NIH...")
    model_vin = create_binary_alexnet().to(device)
    optimizer_vin = torch.optim.SGD(model_vin.parameters(), momentum=0.9, lr=1e-3)
    train_model(model_vin, vin_loader, criterion, optimizer_vin, epochs=10)

    results.update(
        {
            f"VinOracle_{size}_VinTest": evaluate_model(model_vin, vin_test_loader),
            f"VinOracle_{size}_NihTest": evaluate_model(model_vin, nih_test_loader),
        }
    )

    # --- Guangzhou + NIH ---
    print(f"🚀 Training model on Guangzhou (upsampled {size}) + NIH...")
    model_gz = create_binary_alexnet().to(device)
    optimizer_gz = torch.optim.SGD(model_gz.parameters(), momentum=0.9, lr=0.0001)
    train_model(model_gz, gz_loader, criterion, optimizer_gz, epochs=10)

    results.update(
        {
            f"GZOracle_{size}_GzTest": evaluate_model(model_gz, gz_test_loader),
            f"GZOracle_{size}_NihTest": evaluate_model(model_gz, nih_test_loader),
        }
    )


🔁 Creating Oracle datasets with 5000 upsampled samples
🚀 Training model on VinDr (upsampled 5000) + NIH...
Starting Epoch 1
Epoch 1, Loss: 0.6016060100949329
Starting Epoch 2
Epoch 2, Loss: 0.5550868025292521
Starting Epoch 3
Epoch 3, Loss: 0.5354513909734423
Starting Epoch 4
Epoch 4, Loss: 0.5228426403477978
Starting Epoch 5
Epoch 5, Loss: 0.5090571226518782
Starting Epoch 6
Epoch 6, Loss: 0.501101412698436
Starting Epoch 7
Epoch 7, Loss: 0.49176809740493366
Starting Epoch 8
Epoch 8, Loss: 0.4836797059306403
Starting Epoch 9
Epoch 9, Loss: 0.47546267299853323
Starting Epoch 10
Epoch 10, Loss: 0.46606362287117087
Training Finished!
🚀 Training model on Guangzhou (upsampled 5000) + NIH...
Starting Epoch 1
Epoch 1, Loss: 0.5794779430417454
Starting Epoch 2
Epoch 2, Loss: 0.5322228898401455
Starting Epoch 3
Epoch 3, Loss: 0.521275351495694
Starting Epoch 4
Epoch 4, Loss: 0.5134427816895268
Starting Epoch 5
Epoch 5, Loss: 0.5094219214852204
Starting Epoch 6
Epoch 6, Loss: 0.501435122107300

In [26]:
# Print results.
# Rebuild the table from `results` first: the up-sampling loop added new rows
# to `results`, and an earlier `results_df` would not contain them.
results_df = pd.DataFrame(results).T
print(results_df)

                         Accuracy  Precision  Recall  F1 Score  AUC-ROC
VinOracle_5000_VinTest     0.9050     0.9543  0.9101    0.9317   0.9660
VinOracle_5000_NihTest     0.6840     0.6274  0.4308    0.5108   0.6832
GZOracle_5000_GzTest       0.8686     0.9872  0.6581    0.7897   0.9558
GZOracle_5000_NihTest      0.6840     0.5908  0.5692    0.5798   0.6917
VinOracle_10000_VinTest    0.9290     0.9470  0.9537    0.9503   0.9728
VinOracle_10000_NihTest    0.6790     0.5787  0.5953    0.5869   0.6940
GZOracle_10000_GzTest      0.8333     0.9851  0.5641    0.7174   0.9585
GZOracle_10000_NihTest     0.6840     0.5840  0.6084    0.5959   0.6859
VinOracle_20000_VinTest    0.9180     0.9200  0.9691    0.9439   0.9686
VinOracle_20000_NihTest    0.6860     0.6302  0.4360    0.5154   0.6909
GZOracle_20000_GzTest      0.8381     1.0000  0.5684    0.7248   0.9622
GZOracle_20000_NihTest     0.6980     0.6528  0.4517    0.5340   0.6906
