In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from PIL import Image
import cv2
import numpy as np
import pandas as pd
import os,gc
import sys
import shutil
import math
import random
import heapq 
import time
import copy
import itertools  
from sklearn.metrics import confusion_matrix,roc_curve,accuracy_score,auc,roc_auc_score 
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torchvision
#torch.cuda.set_device(1)
#print (torch.cuda.current_device())

In [2]:
#preparing the trainset and  testset
img_path = '/data/fjsdata/NIH-CXR/images/images/' 
trData = pd.read_csv("/data/fjsdata/NIH-CXR/fjs_train.csv" , sep=',') #trainset
trN, trI, trY = [],[],[]
for _, row in trData.iterrows():#[:1001]
    name = row['image_index']
    target = np.fromstring(row['target_vector'].strip('[').strip(']'), dtype=int, sep=' ') #turn string to numpy.ndarray
    try:
        trN.append(name)#'image_index'
        trY.append(target)#'target_vector'
        img = cv2.resize(cv2.imread(os.path.join(img_path, name)).astype(np.float32), (256, 256))#(256,256,3)
        trI.append(img)
    except:
        print(name+":"+str(os.path.join(img_path, name)))
    sys.stdout.write('\r{} / {} '.format(len(trN),trData.shape[0]))
    sys.stdout.flush()
print('The length of trainset is %d'%len(trN))
trI = np.array(trI)
trY = np.array(trY)

teData = pd.read_csv("/data/fjsdata/NIH-CXR/fjs_test.csv" , sep=',') #testset
teN, teI, teY = [],[],[]
for _, row in teData.iterrows():#[:1001]
    name = row['image_index']
    target = np.fromstring(row['target_vector'].strip('[').strip(']'), dtype=int, sep=' ') #turn string to numpy.ndarray
    try:
        teN.append(name)#'image_index'
        teY.append(target)#'target_vector'
        img = cv2.resize(cv2.imread(os.path.join(img_path, name)).astype(np.float32), (256, 256))#(256,256,3)
        teI.append(img)
    except:
        print(name+":"+str(os.path.join(img_path, name)))
    sys.stdout.write('\r{} / {} '.format(len(teN),teData.shape[0]))
    sys.stdout.flush()
print('The length of testset is %d'%len(teN))
teI = np.array(teI)
teY = np.array(teY)

86524 / 86524 The length of trainset is 86524
25596 / 25596 The length of testset is 25596


In [23]:
#model training
N_CLASSES = 15 #class numbers
model = DenseNet121(num_classes=N_CLASSES, is_pre_trained=True).cuda()#initialize model
model = torch.nn.DataParallel(model, device_ids=[0, 1, 2, 3, 4, 5, 6, 7]).cuda()# make model available multi GPU cores training
#torch.backends.cudnn.benchmark = True  # improve train speed slightly
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-5)
criterion = torch.nn.BCELoss()
#train model
best_net, best_loss = None, float('inf')
batchSize = 100
for epoch in range(10):#iteration
    losses = []
    num_batches = len(trY) // batchSize + 1
    for i in range(num_batches):
        optimizer.zero_grad()#grad vanish
        min_idx = i * batchSize
        max_idx = np.min([len(trY), (i+1)*batchSize])
        I_batch = torch.from_numpy(trI[min_idx:max_idx]).type(torch.FloatTensor).cuda()
        y_batch = torch.from_numpy(trY[min_idx:max_idx]).type(torch.FloatTensor).cuda()
        #forword
        y_outputs = model(I_batch.permute(0, 3, 1, 2))#permute the dims of matrix
        #loss
        loss = criterion(y_outputs, y_batch)
        loss.backward()
        #update parameters
        optimizer.step()
        sys.stdout.write('\r {} / {} : loss = {}'.format(i+1, num_batches, float('%0.6f'%loss.item())))
        sys.stdout.flush()     
        losses.append(loss.item())
    print("Eopch: %5d mean_loss = %.6f" % (epoch + 1, np.mean(losses)))
    if np.mean(losses) < best_loss:
        best_loss = np.mean(losses)
        best_net = copy.deepcopy(model)
print("best_loss = %.6f" % (best_loss))
model = model.cpu()#release gpu memory
torch.cuda.empty_cache()

 866 / 866 : loss = 0.363833Eopch:     1 mean_loss = 0.189883
 866 / 866 : loss = 0.373289Eopch:     2 mean_loss = 0.184085
 866 / 866 : loss = 0.374589Eopch:     3 mean_loss = 0.181662
 866 / 866 : loss = 0.370496Eopch:     4 mean_loss = 0.180667
 866 / 866 : loss = 0.358197Eopch:     5 mean_loss = 0.177757
 866 / 866 : loss = 0.356606Eopch:     6 mean_loss = 0.176455
 866 / 866 : loss = 0.352629Eopch:     7 mean_loss = 0.175392
 866 / 866 : loss = 0.348436Eopch:     8 mean_loss = 0.174355
 866 / 866 : loss = 0.350465Eopch:     9 mean_loss = 0.173616
 866 / 866 : loss = 0.354101Eopch:    10 mean_loss = 0.172488
best_loss = 0.172488


In [54]:
model = model.cpu()#release gpu memory
I_batch = I_batch.cpu()
y_batch = y_batch.cpu()
torch.cuda.empty_cache()

In [25]:
#performance of testset
# initialize the ground truth and output tensor
gt = torch.FloatTensor().cuda()
pred = torch.FloatTensor().cuda()
num_batches = len(teY) // batchSize  +1
for i in range(num_batches):
    min_idx = i * batchSize
    max_idx = np.min([len(teY), (i+1)*batchSize])
    I_batch = torch.from_numpy(teI[min_idx:max_idx]).type(torch.FloatTensor).cuda()
    y_batch = torch.from_numpy(teY[min_idx:max_idx]).type(torch.FloatTensor).cuda()
    gt = torch.cat((gt, y_batch), 0)
    y_outputs = best_net(I_batch.permute(0, 3, 1, 2))#forword
    pred = torch.cat((pred, y_outputs.data), 0)
    sys.stdout.write('\r {} / {} '.format(i, num_batches))
    sys.stdout.flush()
    
CLASS_NAMES = ['No Finding', 'Atelectasis', 'Cardiomegaly', 'Effusion','Infiltration', 'Mass', 'Nodule', \
       'Pneumonia', 'Pneumothorax', 'Consolidation', 'Edema', 'Emphysema', 'Fibrosis', 'Pleural_Thickening', 'Hernia'] 
def compute_AUCs(gt, pred):
    AUROCs = []
    gt_np = gt.cpu().numpy()
    pred_np = pred.cpu().numpy()
    for i in range(N_CLASSES):
        AUROCs.append(roc_auc_score(gt_np[:, i], pred_np[:, i]))
    return AUROCs

AUROCs = compute_AUCs(gt, pred)
AUROC_avg = np.array(AUROCs).mean()
print('The average AUROC is {AUROC_avg:.4f}'.format(AUROC_avg=AUROC_avg))
for i in range(N_CLASSES):
    print('The AUROC of {} is {:.4f}'.format(CLASS_NAMES[i], AUROCs[i]))

 255 / 256 The average AUROC is 0.6535
The AUROC of No Finding is 0.6965
The AUROC of Atelectasis is 0.6779
The AUROC of Cardiomegaly is 0.6761
The AUROC of Effusion is 0.7775
The AUROC of Infiltration is 0.6087
The AUROC of Mass is 0.6185
The AUROC of Nodule is 0.5929
The AUROC of Pneumonia is 0.6224
The AUROC of Pneumothorax is 0.6310
The AUROC of Consolidation is 0.6906
The AUROC of Edema is 0.7417
The AUROC of Emphysema is 0.6053
The AUROC of Fibrosis is 0.6148
The AUROC of Pleural_Thickening is 0.6495
The AUROC of Hernia is 0.5993


In [47]:
trY_ab = trY[:,1:]
teY_ab = teY[:,1:]#get rid of 'no finding' label, turn to 14 class
N_CLASSES = 14 #class numbers
model = DenseNet121(num_classes=N_CLASSES, is_pre_trained=True).cuda()#initialize model
model = torch.nn.DataParallel(model, device_ids=[0, 1, 2, 3, 4, 5, 6, 7]).cuda()# make model available multi GPU cores training
#torch.backends.cudnn.benchmark = True  # improve train speed slightly
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-5)
criterion = torch.nn.BCELoss()
#train model
best_net, best_loss = None, float('inf')
batchSize = 100
for epoch in range(10):#iteration
    losses = []
    num_batches = len(trY_ab) // batchSize + 1
    for i in range(num_batches):
        optimizer.zero_grad()#grad vanish
        min_idx = i * batchSize
        max_idx = np.min([len(trY_ab), (i+1)*batchSize])
        I_batch = torch.from_numpy(trI[min_idx:max_idx]).type(torch.FloatTensor).cuda()
        y_batch = torch.from_numpy(trY_ab[min_idx:max_idx]).type(torch.FloatTensor).cuda()
        #forword
        y_outputs = model(I_batch.permute(0, 3, 1, 2))#permute the dims of matrix
        #loss
        loss = criterion(y_outputs, y_batch)
        loss.backward()
        #update parameters
        optimizer.step()
        sys.stdout.write('\r {} / {} : loss = {}'.format(i+1, num_batches, float('%0.6f'%loss.item())))
        sys.stdout.flush()     
        losses.append(loss.item())
    print("Eopch: %5d mean_loss = %.6f" % (epoch + 1, np.mean(losses)))
    if np.mean(losses) < best_loss:
        best_loss = np.mean(losses)
        best_net = copy.deepcopy(model)
print("best_loss = %.6f" % (best_loss))
model = model.cpu()#release gpu memory
torch.cuda.empty_cache()

 866 / 866 : loss = 0.377945Eopch:     1 mean_loss = 0.157334
 866 / 866 : loss = 0.355685Eopch:     2 mean_loss = 0.152673
 866 / 866 : loss = 0.359542Eopch:     3 mean_loss = 0.150143
 866 / 866 : loss = 0.355822Eopch:     4 mean_loss = 0.148482
 866 / 866 : loss = 0.346721Eopch:     5 mean_loss = 0.147016
 866 / 866 : loss = 0.361655Eopch:     6 mean_loss = 0.145787
 866 / 866 : loss = 0.346072Eopch:     7 mean_loss = 0.144734
 866 / 866 : loss = 0.346776Eopch:     8 mean_loss = 0.143769
 866 / 866 : loss = 0.346223Eopch:     9 mean_loss = 0.142909
 866 / 866 : loss = 0.341287Eopch:    10 mean_loss = 0.143084
best_loss = 0.142909


In [49]:
#performance of testset
# initialize the ground truth and output tensor
gt = torch.FloatTensor().cuda()
pred = torch.FloatTensor().cuda()
num_batches = len(teY_ab) // batchSize  +1
for i in range(num_batches):
    min_idx = i * batchSize
    max_idx = np.min([len(teY_ab), (i+1)*batchSize])
    I_batch = torch.from_numpy(teI[min_idx:max_idx]).type(torch.FloatTensor).cuda()
    y_batch = torch.from_numpy(teY_ab[min_idx:max_idx]).type(torch.FloatTensor).cuda()
    gt = torch.cat((gt, y_batch), 0)
    y_outputs = best_net(I_batch.permute(0, 3, 1, 2))#forword
    pred = torch.cat((pred, y_outputs.data), 0)
    sys.stdout.write('\r {} / {} '.format(i, num_batches))
    sys.stdout.flush()
    
CLASS_NAMES = ['Atelectasis', 'Cardiomegaly', 'Effusion','Infiltration', 'Mass', 'Nodule', 'Pneumonia', \
               'Pneumothorax', 'Consolidation', 'Edema', 'Emphysema', 'Fibrosis', 'Pleural_Thickening', 'Hernia'] 
def compute_AUCs(gt, pred):
    AUROCs = []
    gt_np = gt.cpu().numpy()
    pred_np = pred.cpu().numpy()
    for i in range(N_CLASSES):
        AUROCs.append(roc_auc_score(gt_np[:, i], pred_np[:, i]))
    return AUROCs

AUROCs = compute_AUCs(gt, pred)
AUROC_avg = np.array(AUROCs).mean()
print('The average AUROC is {AUROC_avg:.4f}'.format(AUROC_avg=AUROC_avg))
for i in range(N_CLASSES):
    print('The AUROC of {} is {:.4f}'.format(CLASS_NAMES[i], AUROCs[i]))

 255 / 256 The average AUROC is 0.6485
The AUROC of Atelectasis is 0.6646
The AUROC of Cardiomegaly is 0.6910
The AUROC of Effusion is 0.7758
The AUROC of Infiltration is 0.6205
The AUROC of Mass is 0.6252
The AUROC of Nodule is 0.5981
The AUROC of Pneumonia is 0.6271
The AUROC of Pneumothorax is 0.6132
The AUROC of Consolidation is 0.6978
The AUROC of Edema is 0.7578
The AUROC of Emphysema is 0.5906
The AUROC of Fibrosis is 0.6222
The AUROC of Pleural_Thickening is 0.6262
The AUROC of Hernia is 0.5689


In [51]:
#model training
N_CLASSES = 15 #class numbers
model = DenseNet121(num_classes=N_CLASSES, is_pre_trained=True).cuda()#initialize model
model = torch.nn.DataParallel(model, device_ids=[0, 1, 2, 3, 4, 5, 6, 7]).cuda()# make model available multi GPU cores training
#torch.backends.cudnn.benchmark = True  # improve train speed slightly
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-5)
criterion = torch.nn.BCELoss()
#train model
best_net, best_loss = None, float('inf')
batchSize = 100
for epoch in range(100):#iteration
    losses = []
    num_batches = len(trY) // batchSize + 1
    for i in range(num_batches):
        optimizer.zero_grad()#grad vanish
        min_idx = i * batchSize
        max_idx = np.min([len(trY), (i+1)*batchSize])
        I_batch = torch.from_numpy(trI[min_idx:max_idx]).type(torch.FloatTensor).cuda()
        y_batch = torch.from_numpy(trY[min_idx:max_idx]).type(torch.FloatTensor).cuda()
        #forword
        y_outputs = model(I_batch.permute(0, 3, 1, 2))#permute the dims of matrix
        #loss
        loss = criterion(y_outputs, y_batch)
        loss.backward()
        #update parameters
        optimizer.step()
        sys.stdout.write('\r {} / {} : loss = {}'.format(i+1, num_batches, float('%0.6f'%loss.item())))
        sys.stdout.flush()     
        losses.append(loss.item())
    print("Eopch: %5d mean_loss = %.6f" % (epoch + 1, np.mean(losses)))
    if np.mean(losses) < best_loss:
        best_loss = np.mean(losses)
        best_net = copy.deepcopy(model)
print("best_loss = %.6f" % (best_loss))
model = model.cpu()#release gpu memory
torch.cuda.empty_cache()

 866 / 866 : loss = 0.361723Eopch:     1 mean_loss = 0.190226
 866 / 866 : loss = 0.361804Eopch:     2 mean_loss = 0.183519
 866 / 866 : loss = 0.348654Eopch:     3 mean_loss = 0.180285
 866 / 866 : loss = 0.355258Eopch:     4 mean_loss = 0.178179
 866 / 866 : loss = 0.356647Eopch:     5 mean_loss = 0.176347
 866 / 866 : loss = 0.385051Eopch:     6 mean_loss = 0.175353
 866 / 866 : loss = 0.357464Eopch:     7 mean_loss = 0.174459
 866 / 866 : loss = 0.354676Eopch:     8 mean_loss = 0.173964
 866 / 866 : loss = 0.353243Eopch:     9 mean_loss = 0.172115
 866 / 866 : loss = 0.360465Eopch:    10 mean_loss = 0.171444
 866 / 866 : loss = 0.360942Eopch:    11 mean_loss = 0.170444
 866 / 866 : loss = 0.351964Eopch:    12 mean_loss = 0.172223
 866 / 866 : loss = 0.346954Eopch:    13 mean_loss = 0.169429
 866 / 866 : loss = 0.345182Eopch:    14 mean_loss = 0.168490
 866 / 866 : loss = 0.342252Eopch:    15 mean_loss = 0.168124
 866 / 866 : loss = 0.331435Eopch:    16 mean_loss = 0.167197
 866 / 8

In [53]:
#performance of testset
# initialize the ground truth and output tensor
gt = torch.FloatTensor().cuda()
pred = torch.FloatTensor().cuda()
num_batches = len(teY) // batchSize  +1
for i in range(num_batches):
    min_idx = i * batchSize
    max_idx = np.min([len(teY), (i+1)*batchSize])
    I_batch = torch.from_numpy(teI[min_idx:max_idx]).type(torch.FloatTensor).cuda()
    y_batch = torch.from_numpy(teY[min_idx:max_idx]).type(torch.FloatTensor).cuda()
    gt = torch.cat((gt, y_batch), 0)
    y_outputs = best_net(I_batch.permute(0, 3, 1, 2))#forword
    pred = torch.cat((pred, y_outputs.data), 0)
    sys.stdout.write('\r {} / {} '.format(i, num_batches))
    sys.stdout.flush()
    
CLASS_NAMES = ['No Finding', 'Atelectasis', 'Cardiomegaly', 'Effusion','Infiltration', 'Mass', 'Nodule', \
       'Pneumonia', 'Pneumothorax', 'Consolidation', 'Edema', 'Emphysema', 'Fibrosis', 'Pleural_Thickening', 'Hernia'] 
def compute_AUCs(gt, pred):
    AUROCs = []
    gt_np = gt.cpu().numpy()
    pred_np = pred.cpu().numpy()
    for i in range(N_CLASSES):
        AUROCs.append(roc_auc_score(gt_np[:, i], pred_np[:, i]))
    return AUROCs

AUROCs = compute_AUCs(gt, pred)
AUROC_avg = np.array(AUROCs).mean()
print('The average AUROC is {AUROC_avg:.4f}'.format(AUROC_avg=AUROC_avg))
for i in range(N_CLASSES):
    print('The AUROC of {} is {:.4f}'.format(CLASS_NAMES[i], AUROCs[i]))

 255 / 256 The average AUROC is 0.6555
The AUROC of No Finding is 0.6081
The AUROC of Atelectasis is 0.6636
The AUROC of Cardiomegaly is 0.7062
The AUROC of Effusion is 0.7520
The AUROC of Infiltration is 0.5791
The AUROC of Mass is 0.6987
The AUROC of Nodule is 0.6253
The AUROC of Pneumonia is 0.6063
The AUROC of Pneumothorax is 0.7065
The AUROC of Consolidation is 0.6629
The AUROC of Edema is 0.7419
The AUROC of Emphysema is 0.6478
The AUROC of Fibrosis is 0.6017
The AUROC of Pleural_Thickening is 0.6496
The AUROC of Hernia is 0.5835
