In [109]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.optim as optim

import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

from sklearn.metrics import roc_auc_score, roc_curve
import numpy as np

from PIL import Image
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### Common CONSTANT

In [25]:
N_CLASSES = 14
CLASS_NAMES = [ 'Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltration', 'Mass', 'Nodule', 'Pneumonia',
                'Pneumothorax', 'Consolidation', 'Edema', 'Emphysema', 'Fibrosis', 'Pleural_Thickening', 'Hernia']
WIDTH = 224
HEIGHT = 224
CHANNELS = 3
BATCHSIZE = 16 #64*2
IMAGENET_RGB_MEAN = [0.485, 0.456, 0.406]
IMAGENET_RGB_SD = [0.229, 0.224, 0.225]
DATA_DIR = '/home/dattran/data/xray/'
PERCENTAGE = 0.01 # percentage of data use for quick run
MODEL_NAME = 'model.pth.tar'

In [3]:
# check the length of NIH training validation and testing set
print('train val size')
!cat train_val_list.csv | wc -l
print('test size')
!cat test_list.csv | wc -l

train val size
76193
test size
19049


### Utitlity for traininng and testing

In [106]:
class XrayDataset(Dataset):
    '''
    Get image for train, validate and test base on NIH split
    '''
    
    def __init__(self, 
                 image_list_file='test_list.csv', 
                 transform=None, 
                 percentage=0.01):
        data = pd.read_csv(image_list_file, sep=' ', header=None)
        self.images = data.iloc[:, 0].as_matrix()
        self.labels = data.iloc[:, 1:].as_matrix()
        self.transform = transform
        self.percentage = percentage
    
    def __getitem__(self, index):
        image_file = DATA_DIR + self.images[index]
        image = Image.open(image_file).convert('RGB')
        label = self.labels[index]
        if self.transform:
            image = self.transform(image)
        return image, torch.FloatTensor(label)
    
    def __len__(self):
        return int(self.images.shape[0] * self.percentage)

In [96]:
class DenseNet121(nn.Module):
    
    def __init__(self, out_size):
        super(DenseNet121, self).__init__()
        self.densenet121 = torchvision.models.densenet121(pretrained=True)
        num_ftrs = self.densenet121.classifier.in_features
        self.densenet121.classifier = nn.Sequential(
            nn.Linear(num_ftrs, out_size),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        x = self.densenet121(x)
        return x


In [None]:
def agumented_dataloader(image_list_file='train_val_list.csv', percentage=PERCENTAGE):
    # TODO: Implement kFold for train test split
    normalize = transforms.Normalize(IMAGENET_RGB_MEAN, IMAGENET_RGB_SD)
    transform = transforms.Compose([
        transforms.Resize(264),
        transforms.RandomHorizontalFlip(),
        transforms.RandomResizedCrop(size=WIDTH),
        transforms.ColorJitter(0.15, 0.15),
        transforms.RandomRotation(15),
        transforms.ToTensor(),
        normalize
    ])
    dataset = XrayDataset(image_list_file, transform, percentage)
    return DataLoader(dataset=dataset, batch_size=BATCHSIZE,
                      shuffle=True, num_workers=2, pin_memory=False)

In [82]:
def non_agumented_dataloader(image_list_file='test_list.csv', percentage=PERCENTAGE):
    normallize = transforms.Normalize(IMAGENET_RGB_MEAN, IMAGENET_RGB_SD)
    transform = transforms.Compose([
        transforms.Resize(WIDTH),
        transforms.ToTensor(),
        normallize
    ])
    dataset = XrayDataset(image_list_file, transform, percentage)
    return DataLoader(dataset=dataset, batch_size=4*BATCHSIZE,
                      shuffle=False, num_workers=4, pin_memory=False)

In [12]:
def train_epoch(model, dataloader, optimizer, criterion):
    model.train()
    for data, target in dataloader:
        data = Variable(torch.FloatTensor(data).cuda())
        target = Variable(torch.FloatTensor(target).cuda())
        
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

In [13]:
def valid_epoch(model, dataloader, criterion):
    model.eval()
    losses = []
    for data, target in dataloader:
        data = Variable(torch.FloatTensor(data).cuda(), volatile=True)
        target = Variable(torch.FloatTensor(target).cuda(), volatile=True)
         
        output = model(data)
        loss = criterion(output, target)
        loss_val.append(loss.data[0])
    return np.mean(losses)
        

In [112]:
def test(model, dataloader):
    targets = torch.FloatTensor()
    targets = targets.cuda()
    preds = torch.FloatTensor()
    preds = preds.cuda()
    
    for data, target in dataloader:
        target = target.cuda()
        data = Variable(data.cuda(), volatile=True)
        pred = model(data)
        targets = torch.cat((targets, target), 0)
        preds = torch.cat((preds, pred.data), 0)
    aurocs = compute_aucs(targets, preds)
    aurocs_avg = np.array(aurocs).mean()
    print('The average AUROC is {0:.3f}'.format(aurocs_avg))
    for i in range(N_CLASSES):
        print('The AUROC of {} is {}'.format(CLASS_NAMES[i], aurocs[i]))

In [41]:
def compute_aucs(targets, preds):
    aurocs = []
    targets = targets.cpu().numpy()
    preds = preds.cpu().numpy()
    for i in range(N_CLASSES):
        aurocs.append(roc_auc_score(targets[:, i], preds[:, i]))
    return aurocs

In [54]:
net = DenseNet121(N_CLASSES).cuda()
net = torch.nn.DataParallel(net, device_ids=[0]).cuda()
optimizer = optim.Adam(net.parameters(), lr=0.001, betas=(0.9, 0.999))
criterion = nn.BCELoss()
scheduler = ReduceLROnPlateau(optimizer, factor=0.1, patience=5, mode='min')

### Training

### Testing
Should testing on strong label images

In [114]:
test_loader = non_agumented_dataloader(percentage=1)
checkpoint = torch.load(MODEL_NAME)
net.load_state_dict(checkpoint['state_dict'])

In [115]:
test(net, test_loader)

The average AUROC is 0.859
The AUROC of Atelectasis is 0.8481752104354701
The AUROC of Cardiomegaly is 0.9298533506432424
The AUROC of Effusion is 0.9106131955376795
The AUROC of Infiltration is 0.7204998544257422
The AUROC of Mass is 0.8894179363074527
The AUROC of Nodule is 0.8017568727103292
The AUROC of Pneumonia is 0.799804804435368
The AUROC of Pneumothorax is 0.9099843188256744
The AUROC of Consolidation is 0.826353023656907
The AUROC of Edema is 0.9291931549027395
The AUROC of Emphysema is 0.9372362707221347
The AUROC of Fibrosis is 0.8353439849688982
The AUROC of Pleural_Thickening is 0.8302341680090146
The AUROC of Hernia is 0.8540255466083977
