# Starter notebook for NIH Chest Xray

In [None]:
# user-specific setting
# GCP billing project; interpolated into the (commented-out) `gsutil -u {PROJECT}` commands below.
PROJECT = 'mcsds-dlh'  # CHANGE: billing project name (since the dataset is user-to-pay)
# Destination folder of the download commands; also passed to `load_data` when building the loaders.
DATA_FOLDER = '../data/'

## Copy data from GCS (do only once)

In [None]:
# Download images from GCS. Takes a few minutes.
# Note: you may have trouble running these commands locally, with error "ServiceException: 401 Requester pays bucket access requires authentication."
# This is due to:
# 1. Billing was not setup correctly on Google Cloud.
# 2. Command shell needs admin privilege.
# Better alternative is to copy the data to another bucket and download from there, or from Kaggle.

# Images
# https://cloud.google.com/healthcare/docs/resources/public-datasets/nih-chest#gcp_data_access
#!gsutil -u {PROJECT} -m -q cp -r gs://gcs-public-data--healthcare-nih-chest-xray/png/*.png {DATA_FOLDER}

# Download additional labels
# https://pubs.rsna.org/doi/10.1148/radiol.2019191293
#!gsutil -u {PROJECT} -m -q cp -r gs://gcs-public-data--healthcare-nih-chest-xray-labels/* {DATA_FOLDER}

# Code starts here

In [None]:
# import libraries
import pandas as pd
import numpy as np
import random
import os
import torch
import torch.nn as nn

import torchvision
from torchvision import datasets, models, transforms
import torch.nn.functional as F
from PIL import Image
from sklearn.preprocessing import MultiLabelBinarizer
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

from functions import NihDataset, load_data, train_model, eval_model
%matplotlib inline

# Select the compute device: prefer the GPU when CUDA is usable, otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Device: {device}")

In [None]:
# Seed every random number generator used in this notebook with the same value.
seed = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(seed)
# Exported for any child process; it does not change hashing in the already-running kernel.
os.environ["PYTHONHASHSEED"] = str(seed)

In [None]:
# explore the dataset
# load the train/val and test split lists (one entry per line, surrounding whitespace removed)
with open('train_val_list.txt') as f:
    train_val_list = [line.strip() for line in f]
with open('test_list.txt') as f:
    test_list = [line.strip() for line in f]

# load labels
df_labels = pd.read_csv('Data_Entry_2017_v2020.csv')
print(f"Number of images: {len(df_labels)}")
# split the "|"-separated finding (disease) labels into one list per row
df_labels['targets'] = df_labels['Finding Labels'].str.split("|", expand = False)
# collect the distinct labels that occur anywhere in the table
labels = {finding for findings in df_labels['targets'] for finding in findings}

print(f"Number of labels: {len(labels)}")
print(f"Labels: {labels}")

# one-hot encode labels to columns
mlb = MultiLabelBinarizer(sparse_output=True)
# `pop` removes the temporary 'targets' column while handing it to the binarizer
onehot = pd.DataFrame.sparse.from_spmatrix(
    mlb.fit_transform(df_labels.pop('targets')),
    index=df_labels.index,
    columns=mlb.classes_)
df_labels = df_labels.join(onehot)
label_cols = list(labels)
df_labels[label_cols] = df_labels[label_cols].sparse.to_dense()  # for easy .describe()

# show converted data
df_labels[['Finding Labels', *label_cols]].head(10)

In [None]:
# split into train_val and test sets, using the two file lists loaded earlier
is_train_val = df_labels['Image Index'].isin(train_val_list)
is_test = df_labels['Image Index'].isin(test_list)
df_train_val = df_labels[is_train_val]
df_test = df_labels[is_test].reset_index()

n_train_val, n_test = len(df_train_val), len(df_test)
print(f"Number of train/val images: {n_train_val}")
print(f"Number of test images: {n_test}")

# every labelled image must belong to exactly one of the two splits
assert n_train_val + n_test == len(df_labels), "Total number of images does not equal to sum of train/val and test!"

Taking the label *Atelectasis* as the pivot, let's build a classifier for it.

Settings:
1. Consider only PA view images.
2. Binary classification.

In [None]:
# distribution of diseases
# `labels` is a set. pandas no longer accepts a set as a column indexer
# (deprecated in 1.5, TypeError from 2.0), so select the columns with a list.
# Sorting the names also keeps the bar order stable from run to run.
ax = df_labels[sorted(labels)].sum().plot(kind="bar", figsize=(10,8))
ax.set_xlabel("Finding")
ax.set_ylabel("Number of images")  # column sums of the one-hot label columns
ax.grid()

In [None]:
# Finding used as the classification target; used below as a column name of the one-hot encoded df_labels.
disease = 'Atelectasis'

In [None]:
# Label distribution
df_labels[disease].hist()
print(f"Fraction of positive class: {len(df_labels[df_labels[disease]==1])/len(df_labels):.3f}")
# Keep the summary table as the LAST expression of the cell: Jupyter renders only
# the final expression, so the table was silently discarded when it came first.
df_labels.describe(include='all')

In [None]:
from sklearn.model_selection import train_test_split
# 2 notes about train-val split:
# 1. make sure the same patient NEVER appears in both sets, to avoid data leakage
# 2. Stratify the sampling process to avoid bias, especially for imbalance class
# TODO: how to cater for these 2 objectives at the same time?
# NOTE(review): the call below only stratifies on the target column; it passes no
# patient grouping, so objective 1 is NOT enforced yet.
df_train, df_val = train_test_split(df_train_val, test_size=0.1, stratify=df_train_val[disease], random_state=seed)  # 10% val set, about half the size of test set
# Re-assign instead of `inplace=True`; the old index is still kept as an 'index' column.
df_train = df_train.reset_index()
df_val = df_val.reset_index()

assert len(df_train) + len(df_val) == len(df_train_val)

# Disabled PA-view filter. Kept as comments rather than a bare triple-quoted string,
# which Jupyter would echo as the cell's output.
#
# # Prepare train/val and test data
# def select_images(df):
#     df = df[df['View Position']=='PA'].reset_index()
#     return df
#
# df_train_pa = select_images(df_train)
# df_val_pa = select_images(df_val)
# df_test_pa = select_images(df_test)
#
# print(f"# train images: {df_train_pa.shape[0]}")
# print(f"# val images: {df_val_pa.shape[0]}")
# print(f"# test images: {df_test_pa.shape[0]}")

In [None]:
# number of rows in the training split (displayed as the cell output)
len(df_train)

In [None]:
# get statistics of training images. 
# Takes a long time to run as the image set is huge; here we sample it
#
# The snippet is kept as comments rather than a bare triple-quoted string, which
# Jupyter would echo as the cell's output. Uncomment to recompute the statistics.
# NOTE(review): sample_std below is the square root of the mean per-image variance;
# it does not include the spread of the per-image means.
#
# stack all training images together
# num_channels = 3
# num_samples = 1000
# sample_mean = np.zeros(1)
# sample_var = np.zeros(1)
# for img_pth in df_train['Image Index'].sample(num_samples):
#     img_name = os.path.join(DATA_FOLDER, img_pth)
#     _image = np.array(Image.open(img_name).convert('L'))  # shape: [H,W]
#     sample_mean += np.mean(_image)
#     sample_var += np.var(_image)
#
# sample_mean = sample_mean/num_samples
# sample_std = np.sqrt(sample_var/num_samples)
# print(f"Mean: {sample_mean}, Std: {sample_std}")

In [None]:
# or simply use the cached statistics: one value repeated for each of the 3 channels
sample_mean = np.full(3, 129.76628483)
sample_std = np.full(3, 59.70063891)

In [None]:
# display the per-channel mean array as the cell output
sample_mean

**Warning: The validation images serve as test set. Do NOT use them for model tuning.**
Use leave-out set/CV on training images for tuning instead.

Define our CNN architecture:

In [None]:
# https://pytorch.org/tutorials/beginner/finetuning_torchvision_models_tutorial.html
# available models in PyTorch: [resnet, alexnet, vgg, squeezenet, densenet, inception]

# this cell must sit above loader, as image resizing inside transform depends on `input_size`.

def set_parameter_requires_grad(model, feature_extracting):
    """Freeze all parameters of `model` when feature extraction is requested.

    Args:
        model: object exposing `parameters()`; each yielded parameter gets
            `requires_grad` set to False.
        feature_extracting: when truthy, freeze the parameters; when falsy,
            leave the model untouched.

    Returns:
        None. The model is modified in place.
    """
    if not feature_extracting:
        return
    for weight in model.parameters():
        weight.requires_grad = False

def initialize_model(model_name, num_classes, feature_extract, use_pretrained=True):
    """Build a torchvision classifier whose final layer has `num_classes` outputs.

    Args:
        model_name: one of "resnet", "alexnet", "vgg", "squeezenet",
            "densenet" or "inception".
        num_classes: number of outputs of the replaced classification layer.
        feature_extract: when truthy, all backbone parameters are frozen
            (via `set_parameter_requires_grad`) before the new head is attached,
            so only the new head keeps `requires_grad=True`.
        use_pretrained: forwarded to the torchvision constructor as `pretrained`.

    Returns:
        (model_ft, input_size): the model and the square input side length it
        expects (299 for inception, 224 for the others).

    Raises:
        ValueError: if `model_name` is not one of the supported names.
    """
    # NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
    # `weights=`; the old keyword is kept here so older installs keep working.
    if model_name == "resnet":
        # Resnet18
        model_ft = models.resnet18(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.fc.in_features
        model_ft.fc = nn.Linear(num_ftrs, num_classes)
        input_size = 224

    elif model_name == "alexnet":
        # Alexnet
        model_ft = models.alexnet(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.classifier[6].in_features
        model_ft.classifier[6] = nn.Linear(num_ftrs, num_classes)
        input_size = 224

    elif model_name == "vgg":
        # VGG11_bn
        model_ft = models.vgg11_bn(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.classifier[6].in_features
        model_ft.classifier[6] = nn.Linear(num_ftrs, num_classes)
        input_size = 224

    elif model_name == "squeezenet":
        # Squeezenet: the classifier is a 1x1 convolution rather than a linear layer
        model_ft = models.squeezenet1_0(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        model_ft.classifier[1] = nn.Conv2d(512, num_classes, kernel_size=(1,1), stride=(1,1))
        model_ft.num_classes = num_classes
        input_size = 224

    elif model_name == "densenet":
        # Densenet121
        model_ft = models.densenet121(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.classifier.in_features
        model_ft.classifier = nn.Linear(num_ftrs, num_classes)
        input_size = 224

    elif model_name == "inception":
        # Inception v3
        # Be careful, expects (299,299) sized images and has auxiliary output
        model_ft = models.inception_v3(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        # Handle the auxiliary net
        num_ftrs = model_ft.AuxLogits.fc.in_features
        model_ft.AuxLogits.fc = nn.Linear(num_ftrs, num_classes)
        # Handle the primary net
        num_ftrs = model_ft.fc.in_features
        model_ft.fc = nn.Linear(num_ftrs, num_classes)
        input_size = 299

    else:
        # Raise instead of print + exit(): exit() does not produce a catchable error
        # at the call site, and inside a notebook execution can continue and hand
        # back (None, 0) to the caller.
        raise ValueError(
            f"Invalid model name {model_name!r}; expected one of: "
            "resnet, alexnet, vgg, squeezenet, densenet, inception"
        )

    return model_ft, input_size

# Run configuration for the network
model_name = 'alexnet'    # must be a name handled by initialize_model
num_classes = 2           # size of the replaced output layer
feature_extract = True    # freeze the backbone parameters; the new head stays trainable

# Build the network for this run and move it to the selected device
model, input_size = initialize_model(
    model_name=model_name,
    num_classes=num_classes,
    feature_extract=feature_extract,
    use_pretrained=True,
)
model = model.to(device)

# Show the architecture and the image size the transforms must produce
print(model)
print(f"Input image size: {input_size}")

In [None]:
# ToTensor() rescales 8-bit pixel values from [0, 255] to [0.0, 1.0], whereas
# `sample_mean` / `sample_std` are expressed on the 0-255 scale. Bring the statistics
# onto the same scale before handing them to Normalize; otherwise every pixel is
# mapped to almost the same value.
# NOTE(review): assumes the dataset hands 8-bit images to the transform - confirm in NihDataset.
norm_mean = sample_mean / 255.0
norm_std = sample_std / 255.0

# Resize evaluation images in proportion to the model input (256 for a 224 input),
# so CenterCrop never has to pad when `input_size` is larger, e.g. 299 for inception.
resize_size = int(round(input_size * 256 / 224))

data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(input_size),
        transforms.RandomHorizontalFlip(),  # data augmentation
        transforms.ToTensor(),
        transforms.Normalize(norm_mean, norm_std)
    ]),
    'test': transforms.Compose([
        transforms.Resize((resize_size, resize_size)),
        transforms.CenterCrop(input_size),
        transforms.ToTensor(),
        transforms.Normalize(norm_mean, norm_std)
    ]),
}

In [None]:
# Define loss function and optimizer

import torch.optim as optim
# Count the classes with vectorised pandas reductions; the builtin `sum` would walk
# the Series element by element in Python.
num_neg = int((df_train[disease] == 0).sum())
num_pos = int((df_train[disease] == 1).sum())
assert num_neg + num_pos == len(df_train)
assert num_neg > 0 and num_pos > 0, "Both classes must be present to compute class weights"
print(f"# of negative/positive cases: {num_neg}:{num_pos}")

# Weight each class inversely to its frequency, scaled so the weighted sample count
# equals the real sample count.
# https://www.tensorflow.org/tutorials/structured_data/imbalanced_data#class_weights
class_weight = torch.tensor(
    [(1 / num_neg)*(len(df_train))/2.0, (1 / num_pos)*(len(df_train))/2.0],
    dtype=torch.float32,
    device=device,
)
print(f"Class weight: {class_weight}")

# Unweighted alternative:
#criterion = nn.CrossEntropyLoss()
criterion = nn.CrossEntropyLoss(weight=class_weight)
# Parameters with requires_grad=False never receive gradients, so the optimizer leaves them unchanged.
optimizer = optim.Adam(model.parameters(), lr=0.0005)

Now we have the images and labels. We can train our model.

In [None]:
import time

num_epochs = 10
batch_size = 256

# Training data is shuffled and augmented; validation uses the deterministic 'test' transform.
train_data_loader = load_data(df_train, DATA_FOLDER, disease, transform=data_transforms['train'], shuffle=True, batch_size=batch_size)
# Reuse `batch_size` instead of repeating the literal, so both loaders follow one setting.
val_data_loader = load_data(df_val, DATA_FOLDER, disease, transform=data_transforms['test'], shuffle=False, batch_size=batch_size)

print(f"Training start. Mode: {device}")
start_time = time.time()
model, t_losses, v_losses, v_best_auc, v_roc = train_model(model, train_data_loader, val_data_loader, criterion, optimizer, num_epochs=num_epochs, verbose=False)
# `:.3f` gives three decimals; the previous `:3f` only set a minimum field width of 3.
print(f"Best ROC achieved on validation set: {v_best_auc:.3f}")
print(f'Finished Training. Total time: {(time.time()-start_time)/60} minutes.')

In [None]:
# plot the training and validation loss curves returned by train_model
fig, ax = plt.subplots()
ax.plot(t_losses, 'b', label='Training loss')
ax.plot(v_losses, 'g', label='Validation Loss')
ax.legend()
plt.show()

In [None]:
# plot validation roc
fig, ax = plt.subplots()
ax.plot(v_roc, 'g', label='Validation ROC')
ax.legend()  # without a legend the label passed above is never rendered
plt.show()

In [None]:
# Evaluate on test set
# Optionally restore a saved checkpoint first:
#model.load_state_dict(torch.load('../models/vgg_1617289457_bestroc_0.742825.pth'))
#model.eval()

# sometimes GPU goes out of memory. Can clear memory and load the model from disk, lower batch_size, or just use CPU
#device = 'cpu'  # comment this out if GPU has sufficient memory

# Unshuffled loader with the 'test' transform and a batch size of 32
test_data_loader = load_data(
    df_test,
    DATA_FOLDER,
    disease,
    transform=data_transforms['test'],
    shuffle=False,
    batch_size=32,
)
# Make sure the model sits on the device currently selected before evaluating
model_on_device = model.to(device)
test_loss, test_auc, t_prob, t_pred, t_true = eval_model(model_on_device, test_data_loader, criterion)