In [1]:
# Import TensorFlow and print its version so the environment used for this
# run is recorded next to the results.

import tensorflow
print(tensorflow.__version__)

2.3.0


In [2]:
# Import Keras (and its backend as K) and record the installed version.
# NOTE(review): K is not referenced in the cells visible here — confirm it is
# used further down before keeping the import.
import keras
from keras import backend as K

print('Keras version:',keras.__version__)

Keras version: 2.4.3


In [3]:
import pandas as pd
import numpy as np

In [4]:
# Mount Google Drive at /gdrive (Colab only) so the dataset index and image
# folders referenced by the following cells can be read and written.

from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [5]:
# Load the previously constructed dataset index, which holds an equal ratio
# of the classes.
allclasses_path = '/gdrive/MyDrive/allclasses'
allclasses = pd.read_csv(allclasses_path)

In [6]:
# Preview the first rows to confirm the 'FILE NAME' and 'Class' columns
# used below were loaded.
allclasses.head()

Unnamed: 0.1,Unnamed: 0,FILE NAME,FORMAT,SIZE,URL,Class
0,0,Viral Pneumonia (375).png,PNG,1024*1024,https://www.kaggle.com/paultimothymooney/chest...,Viral Pneumonia
1,1,COVID-19 (24).png,PNG,1024*1024,https://academic.oup.com/cid/advance-article/d...,COVID-19
2,2,COVID-19 (17).png,PNG,1024*1024,https://onlinelibrary.wiley.com/doi/full/10.11...,COVID-19
3,3,NORMAL (53).png,PNG,1024*1024,https://www.kaggle.com/paultimothymooney/chest...,Normal
4,4,COVID-19 (115).png,PNG,1024*1024,https://www.sciencedirect.com/science/article/...,COVID-19


In [7]:
# Features are the image file names; targets are the class labels.
X, y = allclasses['FILE NAME'], allclasses['Class']

In [8]:
# Hold out 10% of the images as the test set. The validation set is carved
# out of the remaining training split in a later cell.

from sklearn.model_selection import train_test_split

# A fixed random_state makes the split (and every CSV saved from it)
# reproducible across kernel restarts; stratify keeps the class ratios
# the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                stratify=y,
                                                test_size=0.10,
                                                random_state=42)

In [9]:
# Report the number of samples in each part of the train/test split.
for _label, _part in (('X_train: ', X_train), ('X_test: ', X_test),
                      ('y_train: ', y_train), ('y_test: ', y_test)):
    print(_label, _part.shape)


X_train:  (359,)
X_test:  (40,)
y_train:  (359,)
y_test:  (40,)


In [10]:
# Show the per-class counts of the train and test labels, separated by a
# blank line, to confirm stratification kept the classes balanced.
print(y_train.value_counts(), y_test.value_counts(), sep='\n\n')

Viral Pneumonia    120
COVID-19           120
Normal             119
Name: Class, dtype: int64

Normal             14
Viral Pneumonia    13
COVID-19           13
Name: Class, dtype: int64


In [11]:
# Count distinct file names per split; each count should equal the split size.
for _part in (X_train, X_test):
    print(len(_part.unique()))

359
40


In [12]:
# Verify that no image file name appears in both the train and test sets.

X_train_unique = set(X_train.values)
X_test_unique = set(X_test.values)

# File names shared by both splits; an empty list means no leakage.
images_both_groups = list(X_train_unique & X_test_unique)
len(images_both_groups) == 0

True

In [13]:
# Carve 15% of the remaining training data off as the validation set.
# NOTE: this rebinds X_train / y_train, so re-running this cell on its own
# shrinks the training split again — re-run the test-split cell first.

from sklearn.model_selection import train_test_split

# A fixed random_state keeps the train/validation membership reproducible.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                stratify=y_train,
                                                test_size=0.15,
                                                random_state=42)

In [14]:
# Report the number of samples in each part of the train/validation split.
for _label, _part in (('X_train: ', X_train), ('X_val: ', X_val),
                      ('y_train: ', y_train), ('y_val: ', y_val)):
    print(_label, _part.shape)


X_train:  (305,)
X_val:  (54,)
y_train:  (305,)
y_val:  (54,)


In [15]:
# Show the per-class counts of the train and validation labels, separated by
# a blank line.
print(y_train.value_counts(), y_val.value_counts(), sep='\n\n')

COVID-19           102
Viral Pneumonia    102
Normal             101
Name: Class, dtype: int64

Viral Pneumonia    18
Normal             18
COVID-19           18
Name: Class, dtype: int64


In [16]:
# Re-join file names and labels column-wise into one dataframe per split,
# the layout consumed by flow_from_dataframe below.
training = pd.concat((X_train, y_train), axis='columns')
validation = pd.concat((X_val, y_val), axis='columns')

In [17]:
# Preview both dataframes, separated by a blank line.
print(training.head(), validation.head(), sep='\n\n')

                     FILE NAME            Class
275  Viral Pneumonia (769).png  Viral Pneumonia
51   Viral Pneumonia (173).png  Viral Pneumonia
398           NORMAL (278).png           Normal
378           NORMAL (538).png           Normal
315  Viral Pneumonia (690).png  Viral Pneumonia

                      FILE NAME            Class
68           COVID-19 (117).png         COVID-19
13   Viral Pneumonia (1072).png  Viral Pneumonia
250           NORMAL (1272).png           Normal
212           NORMAL (1050).png           Normal
199  Viral Pneumonia (1010).png  Viral Pneumonia


In [18]:
# Build the training generator from the class-balanced dataframe. Every image
# is normalized on its own to zero mean and unit standard deviation.

import os
from keras.preprocessing.image import ImageDataGenerator

# Recall that the images were moved to a folder named XRays, which holds one
# subfolder per class; the sorted subfolder names are the class names.
training_folder_name = '/gdrive/My Drive/XRays'
classes = sorted(os.listdir(training_folder_name))
print(classes)

# Source images are 1024 x 1024; ResNet50 expects 224 x 224 inputs.
image_size = (224,224)
batch_size = 32

print("Getting Data...")
train_datagen = ImageDataGenerator(
    samplewise_center=True,             # set each sample's mean to 0
    samplewise_std_normalization=True,  # divide each sample by its own std
)

print("Preparing training dataset...")

# Flow from the dataframe: file names are resolved relative to `directory`.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=training,
    directory='/gdrive/My Drive/XRaysCombo',
    x_col="FILE NAME",         # features (image file names)
    y_col='Class',             # labels
    target_size=image_size,    # output height/width, matches ResNet50
    class_mode='categorical',  # one-hot labels for the 3 classes
    batch_size=batch_size,     # images per batch
    shuffle=True,              # shuffle rows between epochs
)

['COVID-19', 'Normal', 'Viral Pneumonia']
Getting Data...
Preparing training dataset...
Found 305 validated image filenames belonging to 3 classes.


In [19]:
# Create the validation generator with the SAME preprocessing as training.
#
# The training generator normalizes every image on its own (samplewise
# centering and std normalization), which needs no fitted statistics and
# therefore cannot leak information between splits. The previous version
# used featurewise statistics fitted on batches that had already been
# samplewise-normalized (mean ~0, std ~1), so raw validation pixels passed
# through almost unscaled and did not match the training input distribution.
# NOTE(review): `train_sample` and `counter` are no longer created here —
# confirm no later cell relies on them.

print(classes)
print("Getting Data...")

test_datagen = ImageDataGenerator(
        samplewise_center=True,             # set each sample's mean to 0
        samplewise_std_normalization=True)  # divide each sample by its own std

print("Preparing validation dataset...")

# Flow from the dataframe: file names are resolved relative to `directory`.
validation_generator = test_datagen.flow_from_dataframe(
        dataframe=validation,
        directory='/gdrive/My Drive/XRaysCombo',
        x_col="FILE NAME", # features
        y_col= 'Class', # labels
        class_mode='categorical', # 3 classes
        batch_size= batch_size, # images per batch
        shuffle=False, # keep row order so predictions line up with labels
        target_size=(224,224) # width and height of output image this one matches RESNET50
)


['COVID-19', 'Normal', 'Viral Pneumonia']
Getting Data...
Preparing validation dataset...
Found 54 validated image filenames belonging to 3 classes.


In [20]:
# Reset the training generator so that later iteration starts again from
# its first batch.
train_generator.reset()

In [21]:
# Save the newly created splits for later use.
# DataFrame/Series.to_csv returns None when a path is given, so the return
# values are not assigned to variables.
training.to_csv('/gdrive/My Drive/training_covid', header=True)
validation.to_csv('/gdrive/My Drive/validation_covid', header=True)
X_test.to_csv('/gdrive/My Drive/X_test_covid', header=True)
# Write the held-out test labels; the full `allclasses` table was previously
# saved under this name by mistake.
y_test.to_csv('/gdrive/My Drive/y_test_covid', header=True)