## Libraries

In [1]:
# Import the data science stack
import numpy as np
import pandas as pd
#import tensorflow as tf
import pickle

from sklearn.utils import shuffle

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# File system access
import os

# Image importing
import csv
import cv2

# Tracking progress
from tqdm import tqdm

# Random seed
np.random.seed(2021)

In [2]:
#PyTorch
import torch
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, WeightedRandomSampler
import torch.nn.functional as F
device = torch.device("cuda")

## Load Data

Read in the labels

In [5]:
# Read in train, validation, and test
train_labels = pd.read_csv('/home/ubuntu/data/train_COVIDx_CT-2A.txt', sep=' ',
                         names=['image_id','class','xmin', 'ymin', 'xmax', 'ymax'])
val_labels = pd.read_csv('/home/ubuntu/data/val_COVIDx_CT-2A.txt', sep=' ',
                         names=['image_id','class','xmin', 'ymin', 'xmax', 'ymax'])
test_labels = pd.read_csv('/home/ubuntu/data/test_COVIDx_CT-2A.txt', sep=' ',
                         names=['image_id','class','xmin', 'ymin', 'xmax', 'ymax'])

all_labels = pd.concat([train_labels, val_labels, test_labels])
all_labels = all_labels[['image_id', 'class']]
all_labels.head()

Unnamed: 0,image_id,class
0,NCP_96_1328_0032.png,2
1,NCP_96_1328_0035.png,2
2,NCP_96_1328_0036.png,2
3,NCP_96_1328_0037.png,2
4,NCP_96_1328_0038.png,2


Read in images - takes about 17 minutes

In [6]:
# Function to load images
def load_images_from_folder(folder, dim):
    '''
    folder = file path to image folder
    dim = tuple (width, height), output array of images
    '''
    img_name = []
    img_array = []
    
    # use tqdm to track progress
    # opens each image iteratively from folder
    for filename in tqdm(os.listdir(folder), position=0, leave=True):
        
        # save image name
        img_name.append(filename)
        
        # reads image 
        img = cv2.imread(os.path.join(folder,filename))
        
        if img is not None:
            # convert to gray scale
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            # resize image
            resized = cv2.resize(gray, dim, interpolation = cv2.INTER_AREA)

            # append image
            img_array.append(resized)
    
    return img_name, img_array
folder="/home/ubuntu/data/2A_images"

In [7]:
# Run function
dim = (256,256)
img_name, img_array = load_images_from_folder(folder, dim)

100%|██████████| 194922/194922 [16:26<00:00, 197.52it/s]


In [8]:
# Convert to numpy array
img_array = np.array(img_array)
img_array.shape

(194922, 256, 256)

Get labels

In [9]:
# Sort labels by image name order - 27 minutes
sorted_labels = []
for i in tqdm(img_name, position=0, leave=True):
    a = all_labels['class'][all_labels['image_id'] == i]
    sorted_labels.append(int(a))

100%|██████████| 194922/194922 [27:06<00:00, 119.87it/s]


In [10]:
# Convert to numpy array
sorted_labels = np.array(sorted_labels)

Randomize all data

In [11]:
# Generate random indices
all_idx = np.random.choice(img_array.shape[0], img_array.shape[0], replace=False)

# Randomize data
img_array = img_array[all_idx]
sorted_labels = sorted_labels[all_idx]

Split dataset to train, val, test

In [12]:
# Training set to debug
mini_X = img_array[0:10000]
mini_y = sorted_labels[0:10000]

# 70% train
train_X = img_array[0:136445]
train_y = sorted_labels[0:136445]

# 10% hyper parameter tuning
hyper_X = img_array[136445:155938]
hyper_y = sorted_labels[136445:155938]

# 10% valdiation
val_X = img_array[155938:175430]
val_y = sorted_labels[155938:175430]

# 10% test
test_X = img_array[194922:]
test_y = sorted_labels[0:136445]

Check distribution of classes in mini, train, hyper, and valid

In [1]:
print(np.unique(mini_y, return_counts=True)[1]/(np.unique(mini_y, return_counts=True)[1].sum()))
print(np.unique(train_y, return_counts=True)[1]/(np.unique(train_y, return_counts=True)[1].sum()))
print(np.unique(hyper_y, return_counts=True)[1]/(np.unique(hyper_y, return_counts=True)[1].sum()))

NameError: name 'np' is not defined

Save numpy to files

In [16]:
np.save('mini_X', mini_X)
np.save('mini_y', mini_y)
np.save('train_X', train_X)
np.save('train_y', train_y)
np.save('hyper_X', hyper_X)
np.save('hyper_y', hyper_y)
np.save('val_X', val_X)
np.save('val_y', val_y)
np.save('test_X', test_X)
np.save('test_y', test_y)

## Tensors

Load numpy files and convert to tensor

In [12]:
mini_X = np.load('mini_X.npy')
mini_y = np.load('mini_y.npy')
train_X = np.load('train_X.npy')
train_y = np.load('train_y.npy')
hyper_X = np.load('hyper_X.npy')
hyper_y = np.load('hyper_y.npy')
val_X = np.load('val_X.npy')
val_y = np.load('val_y.npy')
test_X = np.load('test_X.npy')
test_y = np.load('test_y.npy')

In [None]:
# for mini
mini_seq_CNN = torch.from_numpy(mini_X).float()
mini_y_CNN = torch.tensor(mini_y).type(torch.LongTensor)

# for train set
train_seq_CNN = torch.from_numpy(train_X).float()
train_y_CNN = torch.tensor(train_y).type(torch.LongTensor)

# for hyperparameter tuning set
hyper_seq_CNN = torch.from_numpy(hyper_X).float()
hyper_y_CNN = torch.tensor(hyper_y).type(torch.LongTensor)

# for validation set
val_seq_CNN = torch.from_numpy(val_X).float()
val_y_CNN = torch.tensor(val_y).type(torch.LongTensor)

# for test set
test_seq_CNN = torch.from_numpy(test_X).float()
test_y_CNN = torch.tensor(test_y).type(torch.LongTensor)