## Libraries

In [1]:
# Import the data science stack
import numpy as np
import pandas as pd
import pickle

# File system access
import os
from shutil import copy2

# Image importing
import cv2

# Tracking progress
from tqdm import tqdm

# Random seed
np.random.seed(2021)

In [2]:
#PyTorch
import torch
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, WeightedRandomSampler
import torch.nn.functional as F
device = torch.device("cuda")

In [3]:
# folder path
parent_dir = '/home/ubuntu/data/'

## Load Data - keep Kaggle order

In [12]:
# Read in train, validation, and test
train_labels = pd.read_csv(parent_dir + 'train_COVIDx_CT-2A.txt', sep=' ',
                         names=['image_id','class','xmin', 'ymin', 'xmax', 'ymax'])
val_labels = pd.read_csv(parent_dir + 'val_COVIDx_CT-2A.txt', sep=' ',
                         names=['image_id','class','xmin', 'ymin', 'xmax', 'ymax'])
test_labels = pd.read_csv(parent_dir + 'test_COVIDx_CT-2A.txt', sep=' ',
                         names=['image_id','class','xmin', 'ymin', 'xmax', 'ymax'])

print(len(train_labels))
train_labels.head()

143778


Unnamed: 0,image_id,class,xmin,ymin,xmax,ymax
0,NCP_96_1328_0032.png,2,9,94,512,405
1,NCP_96_1328_0035.png,2,10,106,512,405
2,NCP_96_1328_0036.png,2,10,105,512,406
3,NCP_96_1328_0037.png,2,11,104,512,406
4,NCP_96_1328_0038.png,2,11,103,512,406


In [14]:
# create folders
datasets = ['mini', 'train', 'hyper', 'val', 'test']
labels = ['Normal', 'Pneumonia', 'Covid']

# create main folder
output_dir = parent_dir+'2A_images_reorg2/'
os.mkdir(output_dir)

# create subfolders
for d in datasets:
    os.mkdir(output_dir+d)
    for l in labels:
        os.mkdir(output_dir+d+'/'+l)

FileExistsError: [Errno 17] File exists: '/home/ubuntu/data/2A_images_reorg2/'

In [10]:
# move images to folders
def move_files(img_folder, img_names, labels, output_dir):
    for i, img in enumerate(tqdm(img_names, position=0, leave=True)):
        if labels[i] == 0:
            copy2(img_folder+'/'+img, output_dir+'Normal')
        elif labels[i] == 1:
            copy2(img_folder+'/'+img, output_dir+'Pneumonia')
        elif labels[i] == 2:
            copy2(img_folder+'/'+img, output_dir+'Covid')   

In [15]:
img_folder="/home/ubuntu/data/2A_images/"

# train
move_files(img_folder, train_labels['image_id'], train_labels['class'], output_dir+'train/')

# hyper
move_files(img_folder, train_labels['image_id'][136445:], train_labels['class'][136445:], output_dir+'hyper/')

# val
move_files(img_folder, val_labels['image id'], val_labels['class'], output_dir+'val/')

# test
move_files(img_folder, test_labels['image_id'], test_labels['class'], output_dir+'test/')

100%|██████████| 25658/25658 [00:09<00:00, 2846.33it/s]


## Load Data - randomize images

Read in the labels

In [7]:
# Read in train, validation, and test
train_labels = pd.read_csv(parent_dir + 'train_COVIDx_CT-2A.txt', sep=' ',
                         names=['image_id','class','xmin', 'ymin', 'xmax', 'ymax'])
val_labels = pd.read_csv(parent_dir + 'val_COVIDx_CT-2A.txt', sep=' ',
                         names=['image_id','class','xmin', 'ymin', 'xmax', 'ymax'])
test_labels = pd.read_csv(parent_dir + 'test_COVIDx_CT-2A.txt', sep=' ',
                         names=['image_id','class','xmin', 'ymin', 'xmax', 'ymax'])

all_labels = pd.concat([train_labels, val_labels, test_labels])
all_labels = all_labels[['image_id', 'class']]
all_labels.head()

Unnamed: 0,image_id,class
0,NCP_96_1328_0032.png,2
1,NCP_96_1328_0035.png,2
2,NCP_96_1328_0036.png,2
3,NCP_96_1328_0037.png,2
4,NCP_96_1328_0038.png,2


Read in images - takes about 17 minutes

In [10]:
# Function to load images
def load_images_from_folder(folder, dim):
    '''
    folder = file path to image folder
    dim = tuple (width, height), output array of images
    '''
    img_name = []
    img_array = []
    
    # use tqdm to track progress
    # opens each image iteratively from folder
    for filename in tqdm(os.listdir(folder), position=0, leave=True):
        
        # save image name
        img_name.append(filename)
        
        # reads image 
        img = cv2.imread(os.path.join(folder,filename))
        
        if img is not None:
            # convert to gray scale
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            # resize image
            resized = cv2.resize(gray, dim, interpolation = cv2.INTER_AREA)
            #resized = cv2.resize(img, dim, interpolation = cv2.INTER_AREA

            # append image
            img_array.append(resized)
    
    return img_name, img_array
img_folder="/home/ubuntu/data/2A_images/"

In [6]:
# Run function
dim = (256,256)
img_name, img_array = load_images_from_folder(img_folder, dim)

100%|██████████| 194922/194922 [15:05<00:00, 215.33it/s]


In [14]:
# Convert to numpy array
img_name = np.array(img_name)
img_array = np.array(img_array)
img_array.shape

(194922, 256, 256)

Get labels

In [15]:
# Sort labels by image name order - 27 minutes
sorted_labels = []
for i in tqdm(img_name, position=0, leave=True):
    a = all_labels['class'][all_labels['image_id'] == i]
    sorted_labels.append(int(a))

100%|██████████| 194922/194922 [28:20<00:00, 114.61it/s]


In [20]:
# Convert to numpy array
sorted_labels = np.array(sorted_labels)

Randomize all data

In [21]:
# Generate random indices
all_idx = np.random.choice(img_array.shape[0], img_array.shape[0], replace=False)

# Randomize data
img_name = img_name[all_idx]
img_array = img_array[all_idx]
sorted_labels = sorted_labels[all_idx]

Split dataset to train, val, test

In [22]:
# Training set to debug
mini_img = img_name[0:10000]
mini_X = img_array[0:10000]
mini_y = sorted_labels[0:10000]

# 70% train
train_img = img_name[0:136445]
train_X = img_array[0:136445]
train_y = sorted_labels[0:136445]

# 10% hyper parameter tuning
hyper_img = img_name[136445:155938]
hyper_X = img_array[136445:155938]
hyper_y = sorted_labels[136445:155938]

# 10% valdiation
val_img = img_name[155938:175430]
val_X = img_array[155938:175430]
val_y = sorted_labels[155938:175430]

# 10% test
test_img = img_name[175430:]
test_X = img_array[175430:]
test_y = sorted_labels[175430:]

Check distribution of classes in mini, train, hyper, and valid

In [23]:
print(np.unique(mini_y, return_counts=True)[1]/(np.unique(mini_y, return_counts=True)[1].sum()))
print(np.unique(train_y, return_counts=True)[1]/(np.unique(train_y, return_counts=True)[1].sum()))
print(np.unique(hyper_y, return_counts=True)[1]/(np.unique(hyper_y, return_counts=True)[1].sum()))
print(np.unique(val_y, return_counts=True)[1]/(np.unique(val_y, return_counts=True)[1].sum()))
print(np.unique(test_y, return_counts=True)[1]/(np.unique(test_y, return_counts=True)[1].sum()))

[0.3065 0.2    0.4935]
[0.30869581 0.20586317 0.48544102]
[0.30646899 0.20715129 0.48637973]
[0.30427868 0.2085984  0.48712292]
[0.31079417 0.2102401  0.47896573]


Reorganize images into relevenat folders.  These images can be pulled in using Pytorch's ImageFolder later.

In [34]:
# create folders
datasets = ['mini', 'train', 'hyper', 'val', 'test']
labels = ['Normal', 'Pneumonia', 'Covid']

# create main folder
output_dir = parent_dir+'2A_images_reorg/'
os.mkdir(output_dir)

# create subfolders
for d in datasets:
    os.mkdir(output_dir+d)
    for l in labels:
        os.mkdir(output_dir+d+'/'+l)

In [46]:
# move images to folders
def move_files(img_folder, img_names, labels, output_dir):
    for i, img in enumerate(tqdm(img_names, position=0, leave=True)):
        if labels[i] == 0:
            copy2(img_folder+'/'+img, output_dir+'Normal')
        elif labels[i] == 1:
            copy2(img_folder+'/'+img, output_dir+'Pneumonia')
        elif labels[i] == 2:
            copy2(img_folder+'/'+img, output_dir+'Covid')   

In [48]:
# mini
#move_files(img_folder, mini_img, mini_y, output_dir+'mini/')

# train
move_files(img_folder, train_img, train_y, output_dir+'train/')

# hyper
move_files(img_folder, hyper_img, hyper_y, output_dir+'hyper/')

# val
move_files(img_folder, val_img, val_y, output_dir+'val/')

# test
move_files(img_folder, test_img, test_y, output_dir+'test/')

100%|██████████| 136445/136445 [03:24<00:00, 666.92it/s]
100%|██████████| 19493/19493 [00:40<00:00, 485.70it/s]
100%|██████████| 19492/19492 [00:39<00:00, 497.13it/s]
100%|██████████| 19492/19492 [00:39<00:00, 496.07it/s]


In [50]:
# save image names to file
os.mkdir(output_dir+'img_name/')
np.save(output_dir+'img_name/'+'mini_img_name', mini_img)
np.save(output_dir+'img_name/'+'train_img_name', train_img)
np.save(output_dir+'img_name/'+'hyper_img_name', hyper_img)
np.save(output_dir+'img_name/'+'val_img_name', val_img)
np.save(output_dir+'img_name/'+'test_img_name', test_img)

Or Save numpy to files, which can be loaded, converted to tenors, and to dataloader.

In [14]:
np.save('mini_X', mini_X)
np.save('mini_y', mini_y)
np.save('train_X', train_X)
np.save('train_y', train_y)
np.save('hyper_X', hyper_X)
np.save('hyper_y', hyper_y)
np.save('val_X', val_X)
np.save('val_y', val_y)
np.save('test_X', test_X)
np.save('test_y', test_y)

## Tensors

Load numpy files - 2 minutes

In [5]:
#mini_X = np.load('mini_X.npy')
#mini_y = np.load('mini_y.npy')
train_X = np.load('train_X.npy')
train_y = np.load('train_y.npy')
#hyper_X = np.load('hyper_X.npy')
#hyper_y = np.load('hyper_y.npy')
#val_X = np.load('val_X.npy')
#val_y = np.load('val_y.npy')
#test_X = np.load('test_X.npy')
#test_y = np.load('test_y.npy')

In [7]:
train_X.shape

(136445, 256, 256)

Convert to tensors - one at a time otherwise kernel will die b/c of OOM

In [15]:
# for mini
mini_seq_CNN = torch.from_numpy(mini_X).float()
mini_y_CNN = torch.tensor(mini_y).type(torch.LongTensor)

In [4]:
# for train set
train_seq_CNN = torch.from_numpy(train_X).float()
train_y_CNN = torch.tensor(train_y).type(torch.LongTensor)

In [4]:
# for hyperparameter tuning set
hyper_seq_CNN = torch.from_numpy(hyper_X).float()
hyper_y_CNN = torch.tensor(hyper_y).type(torch.LongTensor)

In [5]:
# for validation set
val_seq_CNN = torch.from_numpy(val_X).float()
val_y_CNN = torch.tensor(val_y).type(torch.LongTensor)

In [None]:
# for test set
test_seq_CNN = torch.from_numpy(test_X).float()
test_y_CNN = torch.tensor(test_y).type(torch.LongTensor)

Optionally save tensors

In [5]:
torch.save(mini_seq_CNN, 'mini_seq_CNN.pt')
torch.save(mini_y_CNN, 'mini_y_CNN.pt')

In [None]:
torch.save(train_seq_CNN, 'train_seq_CN.pt')
torch.save(train_y_CNN, 'train_y_CNN.pt')

In [8]:
torch.save(hyper_seq_CNN, 'hyper_seq_CNN.pt')
torch.save(hyper_y_CNN, 'hyper_y_CNN.pt')

In [None]:
torch.save(val_seq_CNN, 'val_seq_CNN.pt')
torch.save(val_y_CNN,'val_y_CNN.pt')

In [12]:
torch.save(test_seq_CNN, 'test_seq_CNN.pt')
torch.save(test_y_CNN,'test_y_CNN.pt')

Convert to TensorDataSet

In [5]:
mini_data_CNN = TensorDataset(mini_seq_CNN, mini_y_CNN)

In [6]:
train_data_CNN = TensorDataset(train_seq_CNN, train_y_CNN)

In [7]:
hyper_data_CNN = TensorDataset(hyper_seq_CNN, hyper_y_CNN)

In [7]:
val_data_CNN = TensorDataset(val_seq_CNN, val_y_CNN)

In [10]:
test_data_CNN = TensorDataset(test_seq_CNN, test_y_CNN)

NameError: name 'test_seq_CNN' is not defined

## Outputs

The outputs (images or numpy files) should be in the parent directory for you in model training and test.