In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from PIL import Image
from sklearn.model_selection import train_test_split
import shutil

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Print the installed TensorFlow version so the run environment is recorded with the output.
print(tf.__version__)

2.3.0


In [2]:
# Number of training epochs; passed as `epochs` to every model.fit call in this notebook.
NUM_EPOCHS = 10

## Load the data

In [3]:
# Each split is read from its own directory; the class sub-folders inside each
# directory supply the labels. Images are resized to 200x200 and batched in 4s.

# Training split: shuffled (default), with a fixed seed for reproducibility.
training_dir = 'chest_xray/chest_xray/train'
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
  training_dir,
  seed=123,
  image_size=(200,200),
  batch_size=4)

# Validation split.
validation_dir = 'chest_xray/chest_xray/val'
val_ds = tf.keras.preprocessing.image_dataset_from_directory(
  validation_dir,
  seed=123,
  image_size=(200,200),
  batch_size=4)

# Test split: shuffling is disabled so that model.predict(test_ds) returns its
# rows in a stable, reproducible order. With the default shuffle=True the order
# changes on every pass, which makes it impossible to line predictions up with
# labels collected separately. (seed is irrelevant without shuffling.)
testing_dir = 'chest_xray/chest_xray/test'
test_ds = tf.keras.preprocessing.image_dataset_from_directory(
  testing_dir,
  shuffle=False,
  image_size=(200,200),
  batch_size=4)

Found 5216 files belonging to 2 classes.
Found 16 files belonging to 2 classes.
Found 624 files belonging to 2 classes.


In [4]:
# Sanity check: peek at a single training batch and report the tensor shapes
# of the images and of their labels.
for image_batch, labels_batch in train_ds.take(1):
    print(image_batch.shape)
    print(labels_batch.shape)

(4, 200, 200, 3)
(4,)


## Standardize the data

In [5]:
# Instantiate the normalization layer (multiplies its inputs by 1/255).
# NOTE(review): this standalone layer is not referenced anywhere else in the
# visible notebook; every model below embeds its own Rescaling(1./255) layer.
normalization_layer = tf.keras.layers.experimental.preprocessing.Rescaling(1./255)

## Configure dataset for performance

In [6]:
# Let tf.data pick the prefetch buffer size at runtime.
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Wrap the training and validation datasets so that elements are cached after
# the first pass and upcoming batches are prefetched.
train_tensors, val_tensors = (
    split.cache().prefetch(buffer_size=AUTOTUNE)
    for split in (train_ds, val_ds)
)

## Train a model

In [None]:
# Binary task: a single sigmoid unit outputs the probability of class index 1.
num_classes = 1

# Small CNN: in-model pixel rescaling, two conv/pool stages, then a dense head.
model = tf.keras.Sequential([
  layers.experimental.preprocessing.Rescaling(1./255),
  layers.Conv2D(32, 3, activation='relu'),
  layers.MaxPooling2D(),
  layers.Conv2D(32, 3, activation='relu'),
  layers.MaxPooling2D(),
  layers.Flatten(),
  layers.Dense(128, activation='relu'),
  layers.Dense(num_classes, activation='sigmoid')
])

model.compile(
  optimizer='adam',
  # The last layer already applies a sigmoid, so the loss receives
  # probabilities rather than logits.
  loss=tf.losses.BinaryCrossentropy(from_logits=False),
  metrics=['accuracy'])

# Fit on the cached/prefetched pipelines built in the "Configure dataset for
# performance" cell. Previously the raw train_ds/val_ds were passed here, so
# that configuration was never used and every epoch re-read the images.
# Trade-off: the cache keeps the decoded training images in memory and replays
# the batch order of the first epoch.
# NOTE: the validation split holds only 16 files (see the loader output above),
# so the validation curves are expected to be noisy.
history = model.fit(
  train_tensors,
  validation_data=val_tensors,
  epochs=NUM_EPOCHS
)

# Learning curves: accuracy on the left, loss on the right, train vs. val.
fig, ax = plt.subplots(1, 2, figsize=(10, 3))
ax = ax.ravel()

for i, met in enumerate(['accuracy', 'loss']):
    ax[i].plot(history.history[met])
    ax[i].plot(history.history['val_' + met])
    ax[i].set_title('Model {}'.format(met))
    ax[i].set_xlabel('epochs')
    ax[i].set_ylabel(met)
    ax[i].legend(['train', 'val'])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

In [None]:
# TEST

from sklearn.metrics import accuracy_score, confusion_matrix

# Collect the true labels and the predictions in the SAME pass over test_ds.
# test_ds may be shuffled (and reshuffled on every iteration), so labels that
# are gathered separately -- e.g. by listing the class folders on disk -- do
# not line up with the row order of model.predict(test_ds), and every metric
# computed from such a pairing is meaningless.
# image_dataset_from_directory numbers classes in alphanumeric order of the
# class folder names, so NORMAL -> 0 and PNEUMONIA -> 1 here.
test_labels = []
preds = []
for image_batch, labels_batch in test_ds:
    # Raw pixel batches go in: the model's own Rescaling layer normalizes them.
    preds.append(model.predict_on_batch(image_batch))
    test_labels.append(labels_batch.numpy())

test_labels = np.concatenate(test_labels)
preds = np.concatenate(preds).ravel()

# Threshold the sigmoid outputs at 0.5 (same decision as np.round on [0, 1]).
pred_labels = (preds > 0.5).astype(int)

acc = accuracy_score(test_labels, pred_labels)*100
# labels=[0, 1] keeps the matrix 2x2 even if one class is never seen/predicted,
# so the four-way unpacking below cannot fail.
cm = confusion_matrix(test_labels, pred_labels, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

print('CONFUSION MATRIX ------------------')
print(cm)

print('\nTEST METRICS ----------------------')
# Precision/recall are expressed in percent; guard against empty denominators
# (e.g. the model never predicts the positive class).
precision = tp/(tp+fp)*100 if (tp+fp) else 0.0
recall = tp/(tp+fn)*100 if (tp+fn) else 0.0
# F1 is computed from the percentage values, so it is on a 0-100 scale too.
f1 = 2*precision*recall/(precision+recall) if (precision+recall) else 0.0
print('Accuracy: {}%'.format(acc))
print('Precision: {}%'.format(precision))
print('Recall: {}%'.format(recall))
print('F1-score: {}'.format(f1))

print('\nTRAIN METRIC ----------------------')
# Training accuracy of the final epoch, in percent.
print('Train acc: {}'.format(np.round((history.history['accuracy'][-1])*100, 2)))


### To Do
* [ ] Change batch size
* [ ] Implement drop out
* [ ] Explore other model architectures
    * [ ] Convolutional NN

## Change batch size

###### The batch size should be large enough that each batch is likely to contain examples of both labels.
###### Remember that there are 3 times as many 'PNEUMONIA' images as there are 'NORMAL' images.

In [None]:
# Larger batches than the first experiment.
BATCH_SIZE = 64

# Training split: shuffled (default), with a fixed seed for reproducibility.
training_dir = 'chest_xray/chest_xray/train'
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
  training_dir,
  seed=123,
  image_size=(200,200),
  batch_size=BATCH_SIZE)

# Validation split.
validation_dir = 'chest_xray/chest_xray/val'
val_ds = tf.keras.preprocessing.image_dataset_from_directory(
  validation_dir,
  seed=123,
  image_size=(200,200),
  batch_size=BATCH_SIZE)

# Test split: shuffling is disabled so that model.predict(test_ds) returns its
# rows in a stable, reproducible order. With the default shuffle=True the order
# changes on every pass. (seed is irrelevant without shuffling.)
testing_dir = 'chest_xray/chest_xray/test'
test_ds = tf.keras.preprocessing.image_dataset_from_directory(
  testing_dir,
  shuffle=False,
  image_size=(200,200),
  batch_size=BATCH_SIZE)

# Instantiate the normalization layer (multiplies its inputs by 1/255).
# NOTE(review): kept for parity with the earlier cell; the model below uses its
# own embedded Rescaling layer.
normalization_layer = tf.keras.layers.experimental.preprocessing.Rescaling(1./255)

# Cache elements after the first pass and prefetch upcoming batches.
AUTOTUNE = tf.data.experimental.AUTOTUNE

train_tensors = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_tensors = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

# Binary task: a single sigmoid unit outputs the probability of class index 1.
num_classes = 1

# Same architecture as the baseline; only the batch size differs.
model = tf.keras.Sequential([
  layers.experimental.preprocessing.Rescaling(1./255),
  layers.Conv2D(32, 3, activation='relu'),
  layers.MaxPooling2D(),
  layers.Conv2D(32, 3, activation='relu'),
  layers.MaxPooling2D(),
  layers.Flatten(),
  layers.Dense(128, activation='relu'),
  layers.Dense(num_classes, activation='sigmoid')
])

model.compile(
  optimizer='adam',
  # The last layer already applies a sigmoid, so the loss receives
  # probabilities rather than logits.
  loss=tf.losses.BinaryCrossentropy(from_logits=False),
  metrics=['accuracy'])

# Fit on the cached/prefetched pipelines defined above. Previously the raw
# train_ds/val_ds were passed here, leaving train_tensors/val_tensors unused.
# Trade-off: the cache keeps the decoded training images in memory and replays
# the batch order of the first epoch.
history = model.fit(
  train_tensors,
  validation_data=val_tensors,
  epochs=NUM_EPOCHS
)

# Learning curves: accuracy on the left, loss on the right, train vs. val.
fig, ax = plt.subplots(1, 2, figsize=(10, 3))
ax = ax.ravel()

for i, met in enumerate(['accuracy', 'loss']):
    ax[i].plot(history.history[met])
    ax[i].plot(history.history['val_' + met])
    ax[i].set_title('Model {}'.format(met))
    ax[i].set_xlabel('epochs')
    ax[i].set_ylabel(met)
    ax[i].legend(['train', 'val'])

In [None]:
# TEST

from sklearn.metrics import accuracy_score, confusion_matrix

# Collect the true labels and the predictions in the SAME pass over test_ds.
# test_ds may be shuffled (and reshuffled on every iteration), so labels that
# are gathered separately -- e.g. by listing the class folders on disk -- do
# not line up with the row order of model.predict(test_ds), and every metric
# computed from such a pairing is meaningless.
# image_dataset_from_directory numbers classes in alphanumeric order of the
# class folder names, so NORMAL -> 0 and PNEUMONIA -> 1 here.
test_labels = []
preds = []
for image_batch, labels_batch in test_ds:
    # Raw pixel batches go in: the model's own Rescaling layer normalizes them.
    preds.append(model.predict_on_batch(image_batch))
    test_labels.append(labels_batch.numpy())

test_labels = np.concatenate(test_labels)
preds = np.concatenate(preds).ravel()

# Threshold the sigmoid outputs at 0.5 (same decision as np.round on [0, 1]).
pred_labels = (preds > 0.5).astype(int)

acc = accuracy_score(test_labels, pred_labels)*100
# labels=[0, 1] keeps the matrix 2x2 even if one class is never seen/predicted,
# so the four-way unpacking below cannot fail.
cm = confusion_matrix(test_labels, pred_labels, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

print('CONFUSION MATRIX ------------------')
print(cm)

print('\nTEST METRICS ----------------------')
# Precision/recall are expressed in percent; guard against empty denominators
# (e.g. the model never predicts the positive class).
precision = tp/(tp+fp)*100 if (tp+fp) else 0.0
recall = tp/(tp+fn)*100 if (tp+fn) else 0.0
# F1 is computed from the percentage values, so it is on a 0-100 scale too.
f1 = 2*precision*recall/(precision+recall) if (precision+recall) else 0.0
print('Accuracy: {}%'.format(acc))
print('Precision: {}%'.format(precision))
print('Recall: {}%'.format(recall))
print('F1-score: {}'.format(f1))

print('\nTRAIN METRIC ----------------------')
# Training accuracy of the final epoch, in percent.
print('Train acc: {}'.format(np.round((history.history['accuracy'][-1])*100, 2)))


## Use dropout

In [None]:
BATCH_SIZE = 64

# Training split: shuffled (default), with a fixed seed for reproducibility.
training_dir = 'chest_xray/chest_xray/train'
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
  training_dir,
  seed=123,
  image_size=(200,200),
  batch_size=BATCH_SIZE)

# Validation split.
validation_dir = 'chest_xray/chest_xray/val'
val_ds = tf.keras.preprocessing.image_dataset_from_directory(
  validation_dir,
  seed=123,
  image_size=(200,200),
  batch_size=BATCH_SIZE)

# Instantiate the normalization layer (multiplies its inputs by 1/255).
# NOTE(review): kept for parity with the earlier cells; the model below uses
# its own embedded Rescaling layer.
normalization_layer = tf.keras.layers.experimental.preprocessing.Rescaling(1./255)

# Cache elements after the first pass and prefetch upcoming batches.
AUTOTUNE = tf.data.experimental.AUTOTUNE

train_tensors = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_tensors = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

# Baseline CNN with dropout inserted on the rescaled input (rate 0.2) and after
# each pooling stage (rate 0.5).
model = tf.keras.Sequential([
  layers.experimental.preprocessing.Rescaling(1./255),
  layers.Dropout(rate=0.2),
  layers.Conv2D(32, 3, activation='relu'),
  layers.MaxPooling2D(),
  layers.Dropout(rate=0.5),
  layers.Conv2D(32, 3, activation='relu'),
  layers.MaxPooling2D(),
  layers.Dropout(rate=0.5),
  layers.Flatten(),
  layers.Dense(128, activation='relu'),
  layers.Dense(1, activation='sigmoid')
])

model.compile(
  optimizer='adam',
  # The last layer already applies a sigmoid, so the loss receives
  # probabilities rather than logits.
  loss=tf.losses.BinaryCrossentropy(from_logits=False),
  metrics=['accuracy'])

# Fit on the cached/prefetched pipelines defined above. Previously the raw
# train_ds/val_ds were passed here, leaving train_tensors/val_tensors unused.
# Trade-off: the cache keeps the decoded training images in memory and replays
# the batch order of the first epoch.
history = model.fit(
  train_tensors,
  validation_data=val_tensors,
  epochs=NUM_EPOCHS
)

# Learning curves: accuracy on the left, loss on the right, train vs. val.
fig, ax = plt.subplots(1, 2, figsize=(10, 3))
ax = ax.ravel()

for i, met in enumerate(['accuracy', 'loss']):
    ax[i].plot(history.history[met])
    ax[i].plot(history.history['val_' + met])
    ax[i].set_title('Model {}'.format(met))
    ax[i].set_xlabel('epochs')
    ax[i].set_ylabel(met)
    ax[i].legend(['train', 'val'])

In [None]:
# TEST

from sklearn.metrics import accuracy_score, confusion_matrix

# Collect the true labels and the predictions in the SAME pass over test_ds.
# test_ds may be shuffled (and reshuffled on every iteration), so labels that
# are gathered separately -- e.g. by listing the class folders on disk -- do
# not line up with the row order of model.predict(test_ds), and every metric
# computed from such a pairing is meaningless.
# image_dataset_from_directory numbers classes in alphanumeric order of the
# class folder names, so NORMAL -> 0 and PNEUMONIA -> 1 here.
test_labels = []
preds = []
for image_batch, labels_batch in test_ds:
    # Raw pixel batches go in: the model's own Rescaling layer normalizes them.
    preds.append(model.predict_on_batch(image_batch))
    test_labels.append(labels_batch.numpy())

test_labels = np.concatenate(test_labels)
preds = np.concatenate(preds).ravel()

# Threshold the sigmoid outputs at 0.5 (same decision as np.round on [0, 1]).
pred_labels = (preds > 0.5).astype(int)

acc = accuracy_score(test_labels, pred_labels)*100
# labels=[0, 1] keeps the matrix 2x2 even if one class is never seen/predicted,
# so the four-way unpacking below cannot fail.
cm = confusion_matrix(test_labels, pred_labels, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

print('CONFUSION MATRIX ------------------')
print(cm)

print('\nTEST METRICS ----------------------')
# Precision/recall are expressed in percent; guard against empty denominators
# (e.g. the model never predicts the positive class).
precision = tp/(tp+fp)*100 if (tp+fp) else 0.0
recall = tp/(tp+fn)*100 if (tp+fn) else 0.0
# F1 is computed from the percentage values, so it is on a 0-100 scale too.
f1 = 2*precision*recall/(precision+recall) if (precision+recall) else 0.0
print('Accuracy: {}%'.format(acc))
print('Precision: {}%'.format(precision))
print('Recall: {}%'.format(recall))
print('F1-score: {}'.format(f1))

print('\nTRAIN METRIC ----------------------')
# Training accuracy of the final epoch, in percent.
print('Train acc: {}'.format(np.round((history.history['accuracy'][-1])*100, 2)))


## Stack Conv2D Layers (No Dropout)

In [None]:
BATCH_SIZE = 64

# Training split: shuffled (default), with a fixed seed for reproducibility.
training_dir = 'chest_xray/chest_xray/train'
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
  training_dir,
  seed=123,
  image_size=(200,200),
  batch_size=BATCH_SIZE)

# Validation split.
validation_dir = 'chest_xray/chest_xray/val'
val_ds = tf.keras.preprocessing.image_dataset_from_directory(
  validation_dir,
  seed=123,
  image_size=(200,200),
  batch_size=BATCH_SIZE)

# Instantiate the normalization layer (multiplies its inputs by 1/255).
# NOTE(review): kept for parity with the earlier cells; the model below uses
# its own embedded Rescaling layer.
normalization_layer = tf.keras.layers.experimental.preprocessing.Rescaling(1./255)

# Cache elements after the first pass and prefetch upcoming batches.
AUTOTUNE = tf.data.experimental.AUTOTUNE

train_tensors = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_tensors = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

# Two stages of (Conv2D, Conv2D, MaxPooling2D) followed by the dense head;
# no dropout in this variant.
model = tf.keras.Sequential([
  layers.experimental.preprocessing.Rescaling(1./255),
  
  layers.Conv2D(32, 3, activation='relu'),
  layers.Conv2D(32, 3, activation='relu'),
  layers.MaxPooling2D(),
  
  layers.Conv2D(32, 3, activation='relu'),
  layers.Conv2D(32, 3, activation='relu'),
  layers.MaxPooling2D(),
  
  layers.Flatten(),
  layers.Dense(128, activation='relu'),
  
  layers.Dense(1, activation='sigmoid')
])

model.compile(
  optimizer='adam',
  # The last layer already applies a sigmoid, so the loss receives
  # probabilities rather than logits.
  loss=tf.losses.BinaryCrossentropy(from_logits=False),
  metrics=['accuracy'])

# Fit on the cached/prefetched pipelines defined above. Previously the raw
# train_ds/val_ds were passed here, leaving train_tensors/val_tensors unused.
# Trade-off: the cache keeps the decoded training images in memory and replays
# the batch order of the first epoch.
history = model.fit(
  train_tensors,
  validation_data=val_tensors,
  epochs=NUM_EPOCHS
)

# Learning curves: accuracy on the left, loss on the right, train vs. val.
fig, ax = plt.subplots(1, 2, figsize=(10, 3))
ax = ax.ravel()

for i, met in enumerate(['accuracy', 'loss']):
    ax[i].plot(history.history[met])
    ax[i].plot(history.history['val_' + met])
    ax[i].set_title('Model {}'.format(met))
    ax[i].set_xlabel('epochs')
    ax[i].set_ylabel(met)
    ax[i].legend(['train', 'val'])

In [None]:
# TEST

from sklearn.metrics import accuracy_score, confusion_matrix

# Collect the true labels and the predictions in the SAME pass over test_ds.
# test_ds may be shuffled (and reshuffled on every iteration), so labels that
# are gathered separately -- e.g. by listing the class folders on disk -- do
# not line up with the row order of model.predict(test_ds), and every metric
# computed from such a pairing is meaningless.
# image_dataset_from_directory numbers classes in alphanumeric order of the
# class folder names, so NORMAL -> 0 and PNEUMONIA -> 1 here.
test_labels = []
preds = []
for image_batch, labels_batch in test_ds:
    # Raw pixel batches go in: the model's own Rescaling layer normalizes them.
    preds.append(model.predict_on_batch(image_batch))
    test_labels.append(labels_batch.numpy())

test_labels = np.concatenate(test_labels)
preds = np.concatenate(preds).ravel()

# Threshold the sigmoid outputs at 0.5 (same decision as np.round on [0, 1]).
pred_labels = (preds > 0.5).astype(int)

acc = accuracy_score(test_labels, pred_labels)*100
# labels=[0, 1] keeps the matrix 2x2 even if one class is never seen/predicted,
# so the four-way unpacking below cannot fail.
cm = confusion_matrix(test_labels, pred_labels, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

print('CONFUSION MATRIX ------------------')
print(cm)

print('\nTEST METRICS ----------------------')
# Precision/recall are expressed in percent; guard against empty denominators
# (e.g. the model never predicts the positive class).
precision = tp/(tp+fp)*100 if (tp+fp) else 0.0
recall = tp/(tp+fn)*100 if (tp+fn) else 0.0
# F1 is computed from the percentage values, so it is on a 0-100 scale too.
f1 = 2*precision*recall/(precision+recall) if (precision+recall) else 0.0
print('Accuracy: {}%'.format(acc))
print('Precision: {}%'.format(precision))
print('Recall: {}%'.format(recall))
print('F1-score: {}'.format(f1))

print('\nTRAIN METRIC ----------------------')
# Training accuracy of the final epoch, in percent.
print('Train acc: {}'.format(np.round((history.history['accuracy'][-1])*100, 2)))


## Test the model

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Collect the true labels and the predictions in the SAME pass over test_ds.
# test_ds may be shuffled (and reshuffled on every iteration), so labels that
# are gathered separately -- e.g. by listing the class folders on disk -- do
# not line up with the row order of model.predict(test_ds), and every metric
# computed from such a pairing is meaningless.
# image_dataset_from_directory numbers classes in alphanumeric order of the
# class folder names, so NORMAL -> 0 and PNEUMONIA -> 1 here.
test_labels = []
preds = []
for image_batch, labels_batch in test_ds:
    # Raw pixel batches go in: the model's own Rescaling layer normalizes them.
    preds.append(model.predict_on_batch(image_batch))
    test_labels.append(labels_batch.numpy())

test_labels = np.concatenate(test_labels)
preds = np.concatenate(preds).ravel()

# Threshold the sigmoid outputs at 0.5 (same decision as np.round on [0, 1]).
pred_labels = (preds > 0.5).astype(int)

acc = accuracy_score(test_labels, pred_labels)*100
# labels=[0, 1] keeps the matrix 2x2 even if one class is never seen/predicted,
# so the four-way unpacking below cannot fail.
cm = confusion_matrix(test_labels, pred_labels, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

print('CONFUSION MATRIX ------------------')
print(cm)

print('\nTEST METRICS ----------------------')
# Precision/recall are expressed in percent; guard against empty denominators
# (e.g. the model never predicts the positive class).
precision = tp/(tp+fp)*100 if (tp+fp) else 0.0
recall = tp/(tp+fn)*100 if (tp+fn) else 0.0
# F1 is computed from the percentage values, so it is on a 0-100 scale too.
f1 = 2*precision*recall/(precision+recall) if (precision+recall) else 0.0
print('Accuracy: {}%'.format(acc))
print('Precision: {}%'.format(precision))
print('Recall: {}%'.format(recall))
print('F1-score: {}'.format(f1))

print('\nTRAIN METRIC ----------------------')
# Training accuracy of the final epoch, in percent.
print('Train acc: {}'.format(np.round((history.history['accuracy'][-1])*100, 2)))
