In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import glob
import matplotlib.pyplot as plt
import numpy as np
import cv2 as cv
from tqdm import tqdm_notebook as tqdm
import tensorflow.keras as keras
import shutil
from pandas import read_excel
import random
from sklearn import model_selection
import json

In [2]:
SPLIT_INDEX = 3
cutoff = 800000

In [3]:
all_patients = glob.glob('/hddraid5/data/colin/covid-data/COVID Research Images/**/[0-9]*/', recursive=True)
patient_dates = {}
for patient in all_patients:
    patient = patient[:-1]
    patient_id = os.path.basename(patient)
    date = os.path.basename(os.path.dirname(patient))
    patient_dates[patient_id] = date
base_path = '/hddraid5/data/colin/covid-data/'
label_files = glob.glob(os.path.join(base_path, '*.xlsx'))
orders = []
test_results = []
for label_file in label_files:
    table = read_excel(label_file)
    table_orders = list(table['Order #'])
    table_test_results = list(table['Covid Test result'])
    orders = orders + table_orders
    test_results = test_results + table_test_results

In [4]:
# lets compile a DB
positive_images = {}
negative_images = {}
for order, test_result in zip(orders, test_results):
    try:
        label = 'positive' in test_result.lower()
        np.int(order)
    except (TypeError, AttributeError):
        continue
    all_image_paths = glob.glob(os.path.join(base_path, 'COVID Research Images','**', str(order), '*.jpg'), recursive=True)
    image_paths = [image_path for image_path in all_image_paths if (os.path.getsize(image_path) < cutoff and os.path.getsize(image_path) > 100)]
    if label:
        positive_images[str(order)] = image_paths
    else:
        negative_images[str(order)] = image_paths

In [5]:
positive_orders = list(positive_images.keys())
negative_orders = list(negative_images.keys())
positive_orders = [order for order in positive_orders if order in patient_dates]
negative_orders = [order for order in negative_orders if order in patient_dates]

In [6]:
def get_fold(data, random_state=0, split_index=0, folds=6):
    folder = model_selection.KFold(n_splits=folds, shuffle=True, random_state=random_state)
    splits = folder.split(X=negative_orders)
    for i, split in enumerate(splits):
        if i == split_index:
            break
    data = np.array(data)
    return data[split[0]], data[split[1]]


In [7]:
def load_orders(orders, image_paths, label=0):
    all_images = []
    all_labels = []
    all_orders = []
    all_files = []
    for order in tqdm(orders):
        images = []
        labels = []
        orders = []
        files = []
        for image_path in image_paths[order]:
            image = cv.imread(image_path)
            image = cv.resize(image, (224, 224))
            images.append(image)
            labels.append(label)
            orders.append(order)
            files.append(os.path.basename(image_path))
        all_images += images
        all_labels += labels
        all_orders += orders
        all_files  += files
    return all_images, all_labels, all_orders, all_files

In [8]:
train_positive_orders, val_positive_orders = get_fold(positive_orders, split_index=SPLIT_INDEX)
train_negative_orders, val_negative_orders = get_fold(negative_orders, split_index=SPLIT_INDEX)

In [9]:
train_pos_images, train_pos_labels, train_pos_orders, train_pos_files = load_orders(train_positive_orders, positive_images, 1)
train_neg_images, train_neg_labels, train_neg_orders, train_neg_files = load_orders(train_negative_orders, negative_images, 0)
train_images = train_pos_images + train_neg_images
train_labels = train_pos_labels + train_neg_labels
train_orders = train_pos_orders + train_neg_orders
train_files = train_pos_files + train_neg_files


val_pos_images, val_pos_labels, val_pos_orders, val_pos_files = load_orders(val_positive_orders, positive_images, 1)
val_neg_images, val_neg_labels, val_neg_orders, val_neg_files = load_orders(val_negative_orders, negative_images, 0)
val_images = val_pos_images + val_neg_images
val_labels = val_pos_labels + val_neg_labels
val_orders = val_pos_orders + val_neg_orders
val_files = val_pos_files + val_neg_files

train_images = (np.array(train_images) / 255).astype(np.float32)
val_images = (np.array(val_images) / 255).astype(np.float32)
train_labels = np.array(train_labels)
val_labels = np.array(val_labels)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=64.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=64.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))




In [10]:
def get_model(input_shape=(224,224,3), model_name='mobilenet_v2'):
    if model_name == 'mobilenet_v2':
        base_model = keras.applications.mobilenet_v2.MobileNetV2(include_top=False, weights='imagenet', input_shape=(224, 224, 3))
    elif model_name == 'densenet':
        base_model = keras.applications.densenet.DenseNet121(include_top=False, weights='imagenet', input_shape=(224, 224, 3))
    elif model_name == 'xception':
        base_model = keras.applications.xception.Xception(include_top=False, weights='imagenet', input_shape=(224, 224, 3))
    inputs = keras.Input(shape=input_shape)
    base_model.trainable = False
    x = base_model(inputs, training=False) # IMPORTANT
    x = keras.layers.GlobalAveragePooling2D()(x)
    x = keras.layers.Dense(256, activation='relu')(x) # just train this and following layer
    outputs = keras.layers.Dense(2, activation='softmax')(x)
    model = keras.Model(inputs, outputs)
    return model

In [11]:
image_proc = keras.preprocessing.image.ImageDataGenerator(rotation_range=45, horizontal_flip=True, vertical_flip=True, shear_range=5)

In [12]:
means = np.mean(train_images.reshape(-1, 3), axis=0)
stds = np.std(train_images.reshape(-1, 3), axis=0)

In [13]:
train_x = (train_images - means) / stds
train_y = keras.utils.to_categorical(train_labels)
val_x = (val_images - means) / stds
val_y = keras.utils.to_categorical(val_labels)

In [14]:
train_chance = np.sum(train_labels)/len(train_labels)
val_chance = np.sum(val_labels)/len(val_labels)

In [15]:
print("Train chance ", train_chance)
print("Val chance ", val_chance)

Train chance  0.5685178716963558
Val chance  0.588963963963964


In [16]:
model = get_model(model_name='densenet')
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')
callbacks = keras.callbacks.ModelCheckpoint(f'densenet_covid_fold_{SPLIT_INDEX}.hdf5')
batch_size = 16
model.fit(image_proc.flow(train_x, train_y, batch_size=batch_size),
                    steps_per_epoch=len(train_x) / batch_size, epochs=25, validation_data=(val_x, val_y), shuffle=False, callbacks=[callbacks])

  ...
    to  
  ['...']
Train for 1073.625 steps, validate on 2664 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7fa22813b0d0>

In [18]:
saved_model = keras.models.load_model(f'densenet_covid_fold_{SPLIT_INDEX}.hdf5')
labels = saved_model.predict(val_x)

In [19]:
results = {
}

for order, label, gt, file in zip(val_orders, labels, val_labels, val_files):
    if order in results:
        results[order]['labels'].append(float(label[1]))
        results[order]['files'].append(file)
    else:
        if gt == 1:
            test = True
        else:
            test = False
        results[order] = {
            'test_result': test,
            'labels': [float(label[1])],
            'files': [file]
        }

In [20]:
with open(f'val_results_v1_fold_{SPLIT_INDEX}_rev.json', 'w') as fp:
    json.dump(results, fp)

In [21]:
SPLIT_INDEX

3