In [1]:
from __future__ import print_function, division
import sklearn
import sklearn.metrics as sklm
# pytorch imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable
import torchvision
from torchvision import datasets, models, transforms
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

# image imports
from skimage import io, transform
from PIL import Image

# general imports
import os
import time
from shutil import copyfile
from shutil import rmtree

# data science imports
import pandas as pd
import numpy as np
import csv

use_gpu = torch.cuda.is_available()
gpu_count = torch.cuda.device_count()
print("Available GPU count:" + str(gpu_count))


  from .collection import imread_collection_wrapper


Available GPU count:1


In [2]:
class CXRDataset(Dataset):

    def __init__(
            self,
            path_to_images,
            fold,
            transform=None,
            sample=0,
            finding="any",
            starter_images=False):

        self.transform = transform
        self.path_to_images = path_to_images
        self.df = pd.read_csv("nih_labels.csv")
        self.df = self.df[self.df['fold'] == fold]

            
        # can limit to sample, useful for testing
        # if fold == "train" or fold =="val": sample=500
        if(sample > 0 and sample < len(self.df)):
            self.df = self.df.sample(sample)

        if not finding == "any":  # can filter for positive findings of the kind described; useful for evaluation
            if finding in self.df.columns:
                if len(self.df[self.df[finding] == 1]) > 0:
                    self.df = self.df[self.df[finding] == 1]
                else:
                    print("No positive cases exist for "+LABEL+", returning all unfiltered cases")
            else:
                print("cannot filter on finding " + finding +
                      " as not in data - please check spelling")

        self.df = self.df.set_index("Image Index")
        self.PRED_LABEL = [
            'Atelectasis',
            'Cardiomegaly',
            'Effusion',
            'Infiltration',
            'Mass',
            'Nodule',
            'Pneumonia',
            'Pneumothorax',
            'Consolidation',
            'Edema',
            'Emphysema',
            'Fibrosis',
            'Pleural_Thickening',
            'Hernia']
        RESULT_PATH = "results/"

        self.df["Patient Gender"]=self.df["Patient Gender"].replace(to_replace ="F",value =0)
        self.df["Patient Gender"]=self.df["Patient Gender"].replace(to_replace ="M",value =1)
        self.df.loc[self.df["Patient Age"].str.contains(pat = 'M'),"Patient Age"]='0'
        self.df.loc[self.df["Patient Age"].str.contains(pat = 'D'),"Patient Age"]='0'
        self.df["Patient Age"]=self.df["Patient Age"].str.strip('Y')
        self.df["Patient Age"] = pd.to_numeric(self.df["Patient Age"])
        self.df["Patient Age"]=(self.df["Patient Age"]-self.df["Patient Age"].min())/(self.df["Patient Age"].max()-self.df["Patient Age"].min())
        self.df["View Position"]=self.df["View Position"].replace(to_replace ="PA",value =0)
        self.df["View Position"]=self.df["View Position"].replace(to_replace ="AP",value =1)
        
        
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):

        image = Image.open(
            os.path.join(
                self.path_to_images,
                self.df.index[idx]))
        image = image.convert('RGB')

        label = np.zeros(len(self.PRED_LABEL), dtype=int)
        for i in range(0, len(self.PRED_LABEL)):
             # can leave zero if zero, else make one
            if(self.df[self.PRED_LABEL[i].strip()].iloc[idx].astype('int') > 0):
                label[i] = self.df[self.PRED_LABEL[i].strip()
                                   ].iloc[idx].astype('int')

        if self.transform:
            image = self.transform(image)

        tabular = self.df.loc[self.df.index == self.df.index[idx],["Patient Age", "Patient Gender","View Position"]].values[0].tolist()
        tabular = torch.FloatTensor(tabular)

        return (image,tabular ,label,self.df.index[idx])

In [51]:
PATH_TO_IMAGES = "data/"
WEIGHT_DECAY = 1e-4
LEARNING_RATE = 0.01

In [52]:
# transformed_datasets = {}
# mean = [0.485, 0.456, 0.406]
# std = [0.229, 0.224, 0.225]
# data_transforms = {
#     'train': transforms.Compose([
#         transforms.RandomHorizontalFlip(),
#         transforms.Scale(224),
#         # because scale doesn't always give 224 x 224, this ensures 224 x
#         # 224
#         transforms.CenterCrop(224),
#         transforms.ToTensor(),
#         transforms.Normalize(mean, std)
#     ]),
#     'val': transforms.Compose([
#         transforms.Scale(224),
#         transforms.CenterCrop(224),
#         transforms.ToTensor(),
#         transforms.Normalize(mean, std)
#     ]),
# }
# transformed_datasets['val'] = CXRDataset(
#     path_to_images=PATH_TO_IMAGES,
#     fold='val',
#     transform=data_transforms['val'])

In [53]:
# for i in transformed_datasets['val']:
#     print(i)
#     break

In [54]:
class CombineModel(nn.Module):
    """Model modified.
    The architecture of our model is the same as standard DenseNet121
    except the classifier layer which has an additional sigmoid function.
    """
    def __init__(self, out_size):
        super(CombineModel, self).__init__()
        self.resnet18 = torchvision.models.resnet18(pretrained=True)
        num_ftrs = self.resnet18.fc.in_features
        self.resnet18.fc = nn.Sequential(
            nn.Linear(num_ftrs, out_size),
        )
        
        self.ln4 = nn.Linear(3, 10)
        self.ln5 = nn.Linear(10, 10)
        self.ln6 = nn.Linear(10, 14)
        self.ln7 = nn.Linear(28, 14)
        
        self.relu = nn.ReLU()
        self.sigmoid =nn.Sigmoid()

    def forward(self, img,tab):
        
        img = self.resnet18(img)
        
        
        tab = self.ln4(tab)
        tab = self.relu(tab)
        tab = self.ln5(tab)
        tab = self.relu(tab)
        tab = self.ln6(tab)
        tab = self.relu(tab)
#         print(img.shape)
#         print(tab.shape)
        x = torch.cat((img, tab), dim=1)
        x = self.ln7(x)
        x = self.sigmoid(x)

#         print(x.shape)
        return x

In [3]:
def train_cnn(PATH_TO_IMAGES, LR, WEIGHT_DECAY):
    """
    Train torchvision model to NIH data given high level hyperparameters.

    Args:
        PATH_TO_IMAGES: path to NIH images
        LR: learning rate
        WEIGHT_DECAY: weight decay parameter for SGD

    Returns:
        preds: torchvision model predictions on test fold with ground truth for comparison
        aucs: AUCs for each train,test tuple

    """
    NUM_EPOCHS = 100
    BATCH_SIZE = 16

    try:
        rmtree('results/')
    except BaseException:
        pass  # directory doesn't yet exist, no need to clear it
    os.makedirs("results/")

    # use imagenet mean,std for normalization
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    N_LABELS = 14  # we are predicting 14 labels

    # load labels
    df = pd.read_csv("nih_labels.csv", index_col=0)

    # define torchvision transforms
    data_transforms = {
        'train': transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.Scale(224),
            # because scale doesn't always give 224 x 224, this ensures 224 x
            # 224
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean, std)
        ]),
        'val': transforms.Compose([
            transforms.Scale(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean, std)
        ]),
    }

    # create train/val dataloaders
    transformed_datasets = {}
    transformed_datasets['train'] = CXRDataset(
        path_to_images=PATH_TO_IMAGES,
        fold='train',
        transform=data_transforms['train'])
    transformed_datasets['val'] = CXRDataset(
        path_to_images=PATH_TO_IMAGES,
        fold='val',
        transform=data_transforms['val'])

    dataloaders = {}
    dataloaders['train'] = torch.utils.data.DataLoader(
        transformed_datasets['train'],
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=8)
    dataloaders['val'] = torch.utils.data.DataLoader(
        transformed_datasets['val'],
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=8)

    model = CombineModel(N_LABELS).cuda()


    # define criterion, optimizer for training
    criterion = nn.BCELoss()
    optimizer = optim.SGD(
        filter(
            lambda p: p.requires_grad,
            model.parameters()),
        lr=LR,
        momentum=0.9,
        weight_decay=WEIGHT_DECAY)
    dataset_sizes = {x: len(transformed_datasets[x]) for x in ['train', 'val']}

    # train model
    model, best_epoch = train_model(model, criterion, optimizer, LR, num_epochs=NUM_EPOCHS,
                                    dataloaders=dataloaders, dataset_sizes=dataset_sizes, weight_decay=WEIGHT_DECAY)



In [56]:
def checkpoint(model, best_loss, epoch, LR):
    """
    Saves checkpoint of torchvision model during training.

    Args:
        model: torchvision model to be saved
        best_loss: best val loss achieved so far in training
        epoch: current epoch of training
        LR: current learning rate in training
    Returns:
        None
    """

    print('saving')
    state = {
        'model': model,
        'best_loss': best_loss,
        'epoch': epoch,
        'rng_state': torch.get_rng_state(),
        'LR': LR
    }

    torch.save(state, 'results/checkpoint')

In [57]:
def train_model(
        model,
        criterion,
        optimizer,
        LR,
        num_epochs,
        dataloaders,
        dataset_sizes,
        weight_decay):
    """
    Fine tunes torchvision model to NIH CXR data.

    Args:
        model: torchvision model to be finetuned (densenet-121 in this case)
        criterion: loss criterion (binary cross entropy loss, BCELoss)
        optimizer: optimizer to use in training (SGD)
        LR: learning rate
        num_epochs: continue training up to this many epochs
        dataloaders: pytorch train and val dataloaders
        dataset_sizes: length of train and val datasets
        weight_decay: weight decay parameter we use in SGD with momentum
    Returns:
        model: trained torchvision model
        best_epoch: epoch on which best model val loss was obtained

    """
    since = time.time()

    start_epoch = 1
    best_loss = 999999
    best_epoch = -1
    last_train_loss = -1

    # iterate over epochs
    for epoch in range(start_epoch, num_epochs + 1):
        print('Epoch {}/{}'.format(epoch, num_epochs))
        print('-' * 10)

        # set model to train or eval mode based on whether we are in train or
        # val; necessary to get correct predictions given batchnorm
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train(True)
            else:
                model.train(False)

            running_loss = 0.0

            i = 0
            total_done = 0
            # iterate over all data in train/val dataloader:
            for data in dataloaders[phase]:
                i += 1
                inputs,tab,labels, _ = data
                batch_size = inputs.shape[0]
                inputs = Variable(inputs.cuda())
                tab = Variable(tab.cuda())
                labels = Variable(labels.cuda()).float()
                outputs = model(inputs,tab)

                # calculate gradient and update parameters in train phase
                optimizer.zero_grad()
                loss = criterion(outputs, labels)
                if phase == 'train':
                    loss.backward()
                    optimizer.step()

                running_loss += loss.item() * batch_size

            epoch_loss = running_loss / dataset_sizes[phase]

            if phase == 'train':
                last_train_loss = epoch_loss

            print(phase + ' epoch {}:loss {:.4f} with data size {}'.format(
                epoch, epoch_loss, dataset_sizes[phase]))

            # decay learning rate if no val loss improvement in this epoch

            if phase == 'val' and epoch_loss > best_loss:
                print("decay loss from " + str(LR) + " to " +
                      str(LR / 10) + " as not seeing improvement in val loss")
                LR = LR / 10
                # create new optimizer with lower learning rate
                optimizer = optim.SGD(
                    filter(
                        lambda p: p.requires_grad,
                        model.parameters()),
                    lr=LR,
                    momentum=0.9,
                    weight_decay=weight_decay)
                print("created new optimizer with LR " + str(LR))

            # checkpoint model if has best val loss yet
            if phase == 'val' and epoch_loss < best_loss:
                best_loss = epoch_loss
                best_epoch = epoch
                checkpoint(model, best_loss, epoch, LR)

            # log training and validation loss over each epoch
            if phase == 'val':
                with open("results/log_train", 'a') as logfile:
                    logwriter = csv.writer(logfile, delimiter=',')
                    if(epoch == 1):
                        logwriter.writerow(["epoch", "train_loss", "val_loss"])
                    logwriter.writerow([epoch, last_train_loss, epoch_loss])

        total_done += batch_size
        if(total_done % (100 * batch_size) == 0):
            print("completed " + str(total_done) + " so far in epoch")

        # break if no val loss improvement in 3 epochs
        if ((epoch - best_epoch) >= 3):
            print("no improvement in 3 epochs, break")
            break

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    # load best model weights to return
    checkpoint_best = torch.load('results/checkpoint')
    model = checkpoint_best['model']

    return model, best_epoch

In [37]:
train_cnn(PATH_TO_IMAGES, LEARNING_RATE, WEIGHT_DECAY)

Epoch 1/100
----------
train epoch 1:loss 0.1660 with data size 78468
val epoch 1:loss 0.1564 with data size 11219
saving
Epoch 2/100
----------
train epoch 2:loss 0.1543 with data size 78468
val epoch 2:loss 0.1529 with data size 11219
saving
Epoch 3/100
----------
train epoch 3:loss 0.1500 with data size 78468
val epoch 3:loss 0.1505 with data size 11219
saving
Epoch 4/100
----------
train epoch 4:loss 0.1469 with data size 78468
val epoch 4:loss 0.1501 with data size 11219
saving
Epoch 5/100
----------
train epoch 5:loss 0.1441 with data size 78468
val epoch 5:loss 0.1500 with data size 11219
saving
Epoch 6/100
----------
train epoch 6:loss 0.1416 with data size 78468
val epoch 6:loss 0.1514 with data size 11219
decay loss from 0.01 to 0.001 as not seeing improvement in val loss
created new optimizer with LR 0.001
Epoch 7/100
----------
train epoch 7:loss 0.1314 with data size 78468
val epoch 7:loss 0.1495 with data size 11219
saving
Epoch 8/100
----------
train epoch 8:loss 0.1276 

In [58]:
torch.cuda.empty_cache()

In [59]:
checkpoint_best = torch.load('results/checkpoint')
model = checkpoint_best['model']

In [60]:
BATCH_SIZE = 16
model.train(False)

# create dataloader
dataset = CXRDataset(
    path_to_images=PATH_TO_IMAGES,
    fold="test",
    transform=data_transforms['val'])
dataloader = torch.utils.data.DataLoader(
    dataset, BATCH_SIZE, shuffle=False, num_workers=8)
size = len(dataset)

# create empty dfs
prob_df = pd.DataFrame(columns=["Image Index"])
true_df = pd.DataFrame(columns=["Image Index"])
pred_df = pd.DataFrame(columns=["Image Index"])

In [61]:
for i, data in enumerate(dataloader):

    inputs,tab ,labels, _ = data
    inputs,tab ,labels = Variable(inputs.cuda()),Variable(tab.cuda()), Variable(labels.cuda())

    true_labels = labels.cpu().data.numpy()
    batch_size = true_labels.shape

    outputs = model(inputs,tab)
    probs = outputs.cpu().data.numpy()
    predicted = probs > 0.1
    
    # get predictions and true values for each item in batch
    for j in range(0, batch_size[0]):
        thisrow = {}
        truerow = {}
        predrow = {}
        thisrow["Image Index"] = dataset.df.index[BATCH_SIZE * i + j]
        truerow["Image Index"] = dataset.df.index[BATCH_SIZE * i + j]
        predrow["Image Index"] = dataset.df.index[BATCH_SIZE * i + j]

        # iterate over each entry in prediction vector; each corresponds to
        # individual label
        for k in range(len(dataset.PRED_LABEL)):
            thisrow["prob_" + dataset.PRED_LABEL[k]] = probs[j, k]
            truerow[dataset.PRED_LABEL[k]] = true_labels[j, k]
            predrow[dataset.PRED_LABEL[k]] = predicted[j, k]
        prob_df = prob_df.append(thisrow, ignore_index=True)
        true_df = true_df.append(truerow, ignore_index=True)
        pred_df = pred_df.append(predrow, ignore_index=True)

    if(i % 10 == 0):
        print(str(i * BATCH_SIZE))

auc_df = pd.DataFrame(columns=["label", "auc"])
acc_df = pd.DataFrame(columns=["label", "acc"])

0
160
320
480
640
800
960
1120
1280
1440
1600
1760
1920
2080
2240
2400
2560
2720
2880
3040
3200
3360
3520
3680
3840
4000
4160
4320
4480
4640
4800
4960
5120
5280
5440
5600
5760
5920
6080
6240
6400
6560
6720
6880
7040
7200
7360
7520
7680
7840
8000
8160
8320
8480
8640
8800
8960
9120
9280
9440
9600
9760
9920
10080
10240
10400
10560
10720
10880
11040
11200
11360
11520
11680
11840
12000
12160
12320
12480
12640
12800
12960
13120
13280
13440
13600
13760
13920
14080
14240
14400
14560
14720
14880
15040
15200
15360
15520
15680
15840
16000
16160
16320
16480
16640
16800
16960
17120
17280
17440
17600
17760
17920
18080
18240
18400
18560
18720
18880
19040
19200
19360
19520
19680
19840
20000
20160
20320
20480
20640
20800
20960
21120
21280
21440
21600
21760
21920
22080
22240
22400


In [62]:
for column in true_df:

    if column not in ['Atelectasis','Cardiomegaly','Effusion',
        'Infiltration','Mass','Nodule','Pneumonia',
        'Pneumothorax','Consolidation','Edema',
        'Emphysema','Fibrosis','Pleural_Thickening','Hernia']:
        continue
    actual = true_df[column]
    prob = prob_df["prob_" + column]
    thisrow = {}
    thisrow['label'] = column
    thisrow['auc'] = np.nan
    thisrow['auc'] = sklm.roc_auc_score(
        actual.to_numpy().astype(int), prob.to_numpy())
    auc_df = auc_df.append(thisrow, ignore_index=True)

In [63]:
auc_df

Unnamed: 0,label,auc
0,Atelectasis,0.801996
1,Cardiomegaly,0.897987
2,Consolidation,0.796294
3,Edema,0.884148
4,Effusion,0.879006
5,Emphysema,0.902379
6,Fibrosis,0.805017
7,Hernia,0.822729
8,Infiltration,0.703561
9,Mass,0.810268


In [64]:
for column in true_df:

    if column not in ['Atelectasis','Cardiomegaly','Effusion',
        'Infiltration','Mass','Nodule','Pneumonia',
        'Pneumothorax','Consolidation','Edema',
        'Emphysema','Fibrosis','Pleural_Thickening','Hernia']:
        continue
    actual = true_df[column]
    pred = pred_df[column]
    thisrow = {}
    thisrow['label'] = column
    thisrow['acc'] = np.nan
    thisrow['acc'] = sklm.accuracy_score(
        actual.to_numpy().astype(int), pred.to_numpy())
    acc_df = acc_df.append(thisrow, ignore_index=True)

In [65]:
acc_df

Unnamed: 0,label,acc
0,Atelectasis,0.722507
1,Cardiomegaly,0.927874
2,Consolidation,0.857754
3,Edema,0.938885
4,Effusion,0.759194
5,Emphysema,0.964739
6,Fibrosis,0.970668
7,Hernia,0.998128
8,Infiltration,0.455401
9,Mass,0.886774
