# Starter notebook for NIH Chest Xray

## Copy data from GCS (if not done yet)

In [None]:
# User-specific setting: the GCP project that gets billed for the downloads.
# It is handed to gsutil through the `-u` flag in the copy commands of this notebook.
PROJECT = 'mcsds-dlh-free'  # CHANGE: billing project name (the dataset bucket is billed to the requester)

In [None]:
# Download images from GCS. Takes a few minutes.
# https://cloud.google.com/healthcare/docs/resources/public-datasets/nih-chest#gcp_data_access

DATA_FOLDER = '../data/'
!gsutil -u {PROJECT} -m -q cp -r gs://gcs-public-data--healthcare-nih-chest-xray/png/*.png {DATA_FOLDER}

In [None]:
# Download additional labels
# https://pubs.rsna.org/doi/10.1148/radiol.2019191293

!gsutil -u {PROJECT} -m -q cp -r gs://gcs-public-data--healthcare-nih-chest-xray-labels/* {DATA_FOLDER}

In [None]:
!ls {DATA_FOLDER}

# Code starts here

In [None]:
# import libraries
import pandas as pd
import numpy as np
import random
import os
import torch
from torch.utils.data import Dataset
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torch.nn.functional as F
from PIL import Image

# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Device: {device}")

In [None]:
# Explore the dataset: read the train and validation label tables from DATA_FOLDER.
df_train_labels, df_valid_labels = (
    pd.read_csv(os.path.join(DATA_FOLDER, csv_name))
    for csv_name in ('train.csv', 'valid.csv')
)

# Report the number of rows in each split.
print(f"Train samples: {len(df_train_labels)}")
print(f"Valid samples: {len(df_valid_labels)}")

In [None]:
# Preview the first rows to see which columns the label table provides.
df_train_labels.head()

In [None]:
# TODO: get doc on features
# Summary statistics for every column, numeric and non-numeric alike.
df_train_labels.describe(include='all')

For labels: **blank** for unmentioned, **0** for negative, **-1** for uncertain, and **1** for positive.

In [None]:
# List all column names of the label table.
df_train_labels.columns

In [None]:
# read images
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
# Inspect a single sample: load its image and print a few metadata fields.
i = 1  # row of the training label table to look at

print(df_train_labels['Path'][i])
img = mpimg.imread(df_train_labels['Path'][i])
# Notes from earlier exploration (check with img.shape, img.max(), img.min()):
# the image is 2D, its size varies between samples, and values are grayscale 0-255.
for column in ('Age', 'Sex', 'Frontal/Lateral', 'AP/PA'):
    print(df_train_labels[column][i])

In [None]:
# Peek at the 'Sex' values of the first rows (.loc slices are label-based and include both endpoints).
df_train_labels.loc[0:13,'Sex']

Note that the size of images varies.

In [None]:
# Render the sample image held in `img` with a grayscale colormap.
imgplot = plt.imshow(img, cmap='gray')

Taking the label *Pleural Effusion* as the target, let's build a classifier for it.

Settings:
1. Consider only Frontal view images.
2. U-MultiClass for Pleural Effusion - treat the uncertainty label (-1) as its own class.
3. Null(NA) values are treated as *negative (0)*.

In [None]:
# 1. Select training images
def select_images(df):
    """
    Keep only frontal-view rows and the columns needed for training.

    df: label table with 'Frontal/Lateral', 'Path' and 'Pleural Effusion' columns.

    Returns a new DataFrame with the columns 'index' (the row's index in `df`),
    'Path' and 'Pleural Effusion'. Missing values are replaced by 0, i.e. an
    unmentioned finding is treated as negative; -1 (uncertain) is left untouched.
    """
    is_frontal = df['Frontal/Lateral'] == 'Frontal'
    selected = df.loc[is_frontal, ['Path', 'Pleural Effusion']]
    return selected.fillna(0).reset_index()

# Frontal-only subset of the training labels, used for the label exploration below.
df_frontal = select_images(df_train_labels)

In [None]:
# Summarise the frontal subset. A notebook cell only renders its last expression
# automatically, so the intermediate table has to be shown explicitly with `display`
# (otherwise its result is silently discarded).
display(df_frontal.describe(include='all'))

# Histogram of the raw label values.
df_frontal['Pleural Effusion'].hist()

# Exact number of rows per label value (rendered as the cell output).
df_frontal.groupby(['Pleural Effusion']).size()

In [None]:
# Prepare train and validation data by applying the same selection to both label tables.
df_train, df_valid = (
    select_images(label_table)
    for label_table in (df_train_labels, df_valid_labels)
)

print(f"# training images: {len(df_train)}")
print(f"# validation images: {len(df_valid)}")

**Warning: The validation images serve as test set. Do NOT use them for model tuning.**
Use leave-out set/CV on training images for tuning instead.

Now we have the images and labels. We can train our model.

In [None]:
# Seed every random number generator used in this notebook so runs are repeatable.
seed = 42
os.environ["PYTHONHASHSEED"] = str(seed)
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(seed)

In [None]:
# Loader
# Maps the raw label values to class indices 0..2.
# CheXpertDataset.__getitem__ looks every target up in this table.
target_dict = {
    -1: 2,  # uncertain
    0: 0,  # negative
    1: 1  # positive
}

class CheXpertDataset(Dataset):
    """
    Image dataset backed by a label DataFrame.

    Each sample is the image referenced by a row's 'Path' column together with
    the class index obtained by mapping the chosen label column through the
    module-level `target_dict`.
    """

    def __init__(self, dataframe, root_dir, label, transform=None):
        """
        dataframe: table with a 'Path' column and the label column.
        root_dir: directory that the 'Path' values are joined onto.
        label: column name of the label of interest, e.g. 'Pleural Effusion'.
        transform: optional callable applied to the opened image.
        """
        self.dataframe = dataframe
        self.root_dir = root_dir
        self.transform = transform
        self.label = label

    def __len__(self):
        """Number of samples, i.e. rows of the DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """
        Return `(image, target)` for the row at position `idx`.

        Raises KeyError if the row's label value is not a key of `target_dict`.
        """
        # Samplers hand out positions 0..len-1, so rows are addressed by position
        # (.iloc). A label-based lookup (.loc) only works while the frame happens
        # to carry a default 0..n-1 index and breaks on any filtered frame.
        relative_path = self.dataframe['Path'].iloc[idx]
        raw_target = self.dataframe[self.label].iloc[idx]

        img_name = os.path.join(self.root_dir, relative_path)
        image = Image.open(img_name)
        target = target_dict[raw_target]  # map labels to 0,...,num_classes

        if self.transform:
            image = self.transform(image)

        return (image, target)

# For testing/debug: load one raw sample (no transform) and show it.

# Directory that the 'Path' column is relative to. The exploration cell reads
# df_train_labels['Path'] as-is, i.e. relative to the working directory, so '.'
# reproduces that behaviour.
ROOT_PATH = '.'  # CHANGE if the images live somewhere else

ds = CheXpertDataset(df_train, ROOT_PATH, 'Pleural Effusion')
img, label = ds[len(ds) - 1]  # last sample, whatever the size of the dataset
print(label)
print(img.size)
imgplot = plt.imshow(img, cmap='gray')

In [None]:
# train_loader: train data loader (type: torch.utils.data.DataLoader)
# val_loader: val data loader (type: torch.utils.data.DataLoader)
def load_data(dataframe, root_dir, label, transform=None, batch_size=32, shuffle=True, num_workers=4):
    '''
    Build a batched DataLoader over the images listed in `dataframe`.

    dataframe, root_dir, label, transform: forwarded unchanged to CheXpertDataset.
    batch_size, shuffle, num_workers: forwarded to torch.utils.data.DataLoader.

    The loader is always created with pin_memory=True.
    '''
    return torch.utils.data.DataLoader(
        CheXpertDataset(dataframe, root_dir, label, transform=transform),
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=True,
    )

# Preprocessing pipelines keyed by split name ('train' / 'val').
# Disabled options kept for reference:
#   - transforms.RandomHorizontalFlip() as extra data augmentation for 'train'
#   - transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) for both splits
data_transforms = dict(
    # Training: random resized crop to 224, then conversion to a tensor.
    train=transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.ToTensor(),
    ]),
    # Validation: resize to 256, centre crop to 224, then conversion to a tensor.
    val=transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
    ]),
)

Define our CNN architecture:

In [None]:
# Helper function
def conv_output_volume(W, F, S, P):
    """
    Output size (along one spatial dimension) of a convolution layer.

    W: input volume size.
    F: kernel/filter size.
    S: stride.
    P: amount of zero padding applied on each border.

    Returns int((W - F + 2P) / S) + 1.
    """
    padded_span = W - F + 2 * P
    full_steps = int(padded_span / S)
    return full_steps + 1

def maxpool_output_volume(W, F, S):
    """
    Output size (along one spatial dimension) of a max-pooling layer without padding.

    W: input volume size.
    F: pooling window size.
    S: stride.

    Returns ceil((W - F + 1) / S) as a Python int.
    """
    window_positions = W - F + 1
    pooled_size = np.ceil(window_positions / S)
    return int(pooled_size)

# Trace a 224-wide input through conv(5) -> pool(2, 2) -> conv(5) -> pool(2, 2)
# to obtain the spatial size that reaches the first fully connected layer.
conv_layer1_size = conv_output_volume(224, 5, 1, 0)
maxpool_layer1_size = maxpool_output_volume(conv_layer1_size, 2, 2)
conv_layer2_size = conv_output_volume(maxpool_layer1_size, 5, 1, 0)
maxpool_layer2_size = maxpool_output_volume(conv_layer2_size, 2, 2)

print(conv_layer1_size, maxpool_layer1_size, conv_layer2_size, maxpool_layer2_size)

In [None]:
# For now, just use a simple one from https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html, plus dropout

import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    """
    Small CNN: two conv + ReLU + max-pool stages followed by three fully
    connected layers (dropout after the first two), producing 3 class logits.

    The flatten size 8 * 53 * 53 is hard-coded, which corresponds to
    single-channel 224x224 inputs (224 -> 220 -> 110 -> 106 -> 53).
    """

    def __init__(self):
        super().__init__()
        # Convolutions use the defaults stride=1, padding=0.
        self.conv1 = nn.Conv2d(1, 3, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(in_channels=3, out_channels=8, kernel_size=5)
        self.fc1 = nn.Linear(in_features=8 * 53 * 53, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=3)  # 3 classes
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Map a batch of images to a (batch_size, 3) tensor of logits."""
        stage1 = self.pool(F.relu(self.conv1(x)))        # (batch_size, 3, 110, 110)
        stage2 = self.pool(F.relu(self.conv2(stage1)))   # (batch_size, 8, 53, 53)
        flat = stage2.view(-1, 8 * 53 * 53)
        hidden = self.dropout(F.relu(self.fc1(flat)))
        hidden = self.dropout(F.relu(self.fc2(hidden)))
        return self.fc3(hidden)


# Instantiate the network and move its parameters to the selected device.
net = Net().to(device)

In [None]:
# Define loss function and optimizer

import torch.optim as optim

# Cross-entropy over the 3 class logits, optimised with SGD plus momentum.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=net.parameters(), momentum=0.9, lr=0.001)

In [None]:
# Train

import time

num_epochs = 1
batch_size = 1024
log_every = 10  # print running statistics every `log_every` mini-batches

train_data_loader = load_data(df_train, ROOT_PATH, 'Pleural Effusion', transform=data_transforms['train'], shuffle=True, batch_size=batch_size)
val_data_loader = load_data(df_valid, ROOT_PATH, 'Pleural Effusion', transform=data_transforms['val'], shuffle=False, batch_size=batch_size)

print(f"Training start. Mode: {device}")
start_time = time.time()
# Batches processed since start_time, counted across epochs. The per-epoch
# counter `i` restarts at 0 every epoch, so dividing the elapsed time by it
# would overstate the time per batch from the second epoch on.
batches_done = 0

net.train()
for epoch in range(num_epochs):  # loop over the dataset multiple times
    print(f"Epoch {epoch}")

    running_loss = 0.0
    for i, (inputs, labels) in enumerate(train_data_loader):
        inputs = inputs.to(device)
        # CrossEntropyLoss needs integer (long) class indices as targets
        labels = labels.to(device, dtype=torch.long)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        batches_done += 1

        # print statistics
        running_loss += loss.item()
        if i % log_every == log_every - 1:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / log_every))
            print(f"Average time per batch: {(time.time()-start_time)/batches_done} secs")
            running_loss = 0.0

print(f'Finished Training. Total time: {time.time()-start_time} secs.')

In [None]:
# Save the trained parameters (the model's state_dict) to disk so they can be reloaded later.
MODEL_PATH = './simple_net.pth'
torch.save(net.state_dict(), MODEL_PATH)

In [None]:
# Test on validation set

# Load the saved model if necessary.
# map_location remaps the stored tensors onto the current device, so a checkpoint
# written on a GPU machine can also be restored on a CPU-only one.
net = Net().to(device)
net.load_state_dict(torch.load(MODEL_PATH, map_location=device))

In [None]:
def eval_model(model, dataloader):
    """
    Run `model` in evaluation mode over every batch of `dataloader`.

    :param model: torch module mapping a batch of inputs to per-class scores
        of shape (batch_size, num_classes).
    :param dataloader: iterable yielding (data, target) batches.
    :return:
        Y_prob: softmax class probabilities. 2D numpy float array of shape
            (num_samples, num_classes).
        Y_pred: predicted class index per sample (argmax of the model outputs).
            1D numpy array of ints.
        Y_test: truth labels as yielded by the dataloader. 1D numpy array.
    :raises ValueError: if the dataloader yields no batches (nothing to concatenate).
    """
    model.eval()

    # Feed the data on the device the model's parameters live on
    # (CPU for a model without parameters).
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    Y_prob = []
    Y_pred = []
    Y_test = []
    # Inference only: without no_grad the outputs would carry autograd history,
    # which wastes memory and makes them unconvertible to numpy.
    with torch.no_grad():
        for data, target in dataloader:
            outputs = model(data.to(model_device))
            probs = torch.softmax(outputs, dim=1)
            _, predicted = torch.max(outputs, 1)
            # numpy cannot read tensors that live on the GPU, so move them to host memory first
            Y_prob.append(probs.cpu().numpy())
            Y_pred.append(predicted.cpu().numpy())
            Y_test.append(torch.as_tensor(target).cpu().numpy())

    Y_prob = np.concatenate(Y_prob, axis=0)
    Y_pred = np.concatenate(Y_pred, axis=0)
    Y_test = np.concatenate(Y_test, axis=0)

    return Y_prob, Y_pred, Y_test


from sklearn.metrics import accuracy_score, roc_auc_score
# Evaluate both splits with the deterministic 'val' preprocessing and a fixed order.
# Scoring the training set through the random training crops would make the
# "Train" numbers change from run to run and not comparable with the validation ones.
eval_batch_size = 64  # lower batch size if out of memory
train_eval_loader = load_data(df_train, ROOT_PATH, 'Pleural Effusion', transform=data_transforms['val'], shuffle=False, batch_size=eval_batch_size)
val_data_loader = load_data(df_valid, ROOT_PATH, 'Pleural Effusion', transform=data_transforms['val'], shuffle=False, batch_size=eval_batch_size)

# Training set: accuracy and one-vs-rest ROC AUC over the three classes.
y_prob, y_pred, y_true = eval_model(net, train_eval_loader)
acc = accuracy_score(y_true, y_pred)
roc = roc_auc_score(y_true, y_prob, multi_class='ovr')
print("Train Accuracy: " + str(acc))
print("Train ROC: " + str(roc))

# Validation set: accuracy only.
y_prob, y_pred, y_true = eval_model(net, val_data_loader)
acc = accuracy_score(y_true, y_pred)
# The ROC AUC is left disabled for this split, as in the original cell.
# NOTE(review): presumably because not all three classes occur in the validation
# labels, which the multi-class AUC requires - confirm before re-enabling:
#roc = roc_auc_score(y_true, y_prob, multi_class='ovr')

print("Validation Accuracy: " + str(acc))
#print(("Validation ROC: " + str(roc)))

In [None]:
# Inspect GPU memory usage; to free memory held by a stale process, kill its PID (listed by nvidia-smi)
!nvidia-smi  # show the PID
#!kill 8210
#!nvidia-smi  # check