In [1]:
import os
import cv2
import glob
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import joblib

from pathlib import Path
from random import randint

from scipy import ndimage

import tensorflow as tf
from tensorflow.keras.optimizers import Adam

from tensorflow import keras  
from tensorflow.keras import layers
from tensorflow.keras import metrics
from tensorflow.keras import optimizers
from keras.models import Sequential
from tensorflow.keras.models import save_model

from keras.layers import Dense, Conv2D , MaxPool2D , Flatten , Dropout, MaxPooling2D, Activation, Input
from keras.preprocessing.image import ImageDataGenerator
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from tensorflow.keras.applications.vgg16 import VGG16

from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

In [2]:
# Set the path for all three datasets.
#data_dir = Path("/Users/evachow/Documents/Documents - Eva’s Mac mini/USD/ADS599/Capstone/chest_xray")
#train_dir = data_dir/"train"
#val_dir = data_dir/"val"
#test_dir = data_dir/"test"

In [3]:
# Set the path for all three datasets.
data_dir = Path("D:/USD-ADS_graduate/ADS-599/chest_xray/")
train_dir = data_dir/"train"
val_dir = data_dir/"val"
test_dir = data_dir/"test"

In [4]:
# Define a function to load the data into dataframes.
# Assign labels corresponding to the folder names.

def load_data(data_path):
    # Below defines the directories for our "NORMAL" and "PNEUMONIA" images.
    normal_dir = data_path/"NORMAL"
    pneum_dir = data_path/"PNEUMONIA"
    
    # The images are in .jpeg format, so we will use glob() to retrieve
    # file/pathnames that match the jpeg format.    normal_img = normal_dir.glob('*.jpeg')
    normal_img = normal_dir.glob('*.jpeg')
    pneum_img = pneum_dir.glob('*.jpeg')
    
    # Generate lists of our image data and image label.
    img_data = []
    img_label = []
    
    for img in normal_img:
        img_data.append(img)
        img_label.append('NORMAL')
        
    for img in pneum_img:
        img_data.append(img)
        img_label.append('PNEUMONIA')
    df = pd.DataFrame(img_data,columns=['images'])
    df['label'] = img_label
    df = df.reset_index(drop=True)
    
    return df

In [5]:
# Define the training dataframe and sample randomly to check labeling.
df_train = load_data(train_dir)
print(df_train.sample(10))

ValueError: a must be greater than 0 unless no samples are taken

In [None]:
# Now define the validation and test dataframes.
# Check the shape of our dataframes to ensure no image loss and correct shape.
df_val = load_data(val_dir)
df_test = load_data(test_dir)
print('The training data has a shape of:',df_train.shape)
print('The validate data has a shape of:',df_val.shape)
print('The test data has a shape of:',df_test.shape)

In [None]:
# This new resizing function (defined as resizing2) will 
# resize an image to 224 x 224 pixels.
def resizing2(img):
    res_img = cv2.resize(img, (224,224))
    return res_img

In [None]:
# This function will change the color space of an image using the cv2 package.
def gray_scale(img):
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    return gray_img

In [None]:
# Define an image processing pipeline that resizes an image using resizing2
# and applies the gray-scale function defined previously.
def prepare(path, pipeline) : 
    img = cv2.imread(str(path))
    
    for transform in pipeline : 
        img = transform(img)
        
    return img

pipeline = [resizing2, gray_scale]

In [None]:
# Apply the image processing pipeline to the training dataset.
df_train['images'] = df_train['images'].apply(prepare,pipeline = pipeline)

In [None]:
# Reduce the pixel values by dividing the values of the array by 255
# and redefine our labels as binary values.
for i in range(len(df_train)):
    df_train['images'][i] = df_train['images'][i].astype(np.float32)/255
    
df_train['label'] = df_train['label'].astype(str)
df_train['label'] = df_train['label'].map( {'NORMAL':0 , 'PNEUMONIA':1} )

In [None]:
# Define our predictor variable (images) and our target (label).
x_train = np.array([df_train['images'][i] for i in range(len(df_train))])
y_train = np.array(df_train['label'])

In [None]:
# Reshape the images from an array of 50176 to a dataframe.
x_train_2dim = x_train.reshape((x_train.shape[0], 50176))
le = LabelEncoder()
y_train_2dim = le.fit_transform(df_train['label'])

In [None]:
x_train_4dim = x_train.reshape(x_train.shape[0], x_train.shape[1], x_train.shape[2], 1)
y_train_4dim = tf.keras.utils.to_categorical(y_train, 3)

In [None]:
# Apply the image processing pipeline to the validation dataset.
df_val['images'] = df_val['images'].apply(prepare,pipeline=pipeline)

In [None]:
# Reduce the pixel values by dividing the values of the array by 255
# and redefine our labels as binary values.
for i in range(len(df_val)):
    df_val['images'][i] = df_val['images'][i].astype(np.float32)/255
    
df_val['label'] = df_val['label'].astype(str)
df_val['label'] = df_val['label'].map( {'NORMAL':0 , 'PNEUMONIA':1} )

In [None]:
# Define our predictor variable (images) and our target (label).
x_val = np.array([df_val['images'][i] for i in range(len(df_val))])
y_val = np.array(df_val['label'])

In [None]:
# Reshape the images from an array of 50176 to a dataframe.
x_val_2dim = x_val.reshape((x_val.shape[0], 50176))
y_val_2dim = le.fit_transform(df_val['label'])

In [None]:
x_val_4dim = x_val.reshape(x_val.shape[0], x_val.shape[1], x_val.shape[2], 1)
y_val_4dim = tf.keras.utils.to_categorical(y_val, 3)

In [None]:
# Apply the image processing pipeline to the test dataset.
df_test['images'] = df_test['images'].apply(prepare,pipeline=pipeline)

In [None]:
# Reduce the pixel values by dividing the values of the array by 255
# and redefine our labels as binary values.
for i in range(len(df_test)):
    df_test['images'][i] = df_test['images'][i].astype(np.float32)/255
    
df_test['label'] = df_test['label'].astype(str)
df_test['label'] = df_test['label'].map( {'NORMAL':0 , 'PNEUMONIA':1} )

In [None]:
# Define our predictor variable (images) and our target (label).
x_test = np.array([df_test['images'][i] for i in range(len(df_test))])
y_test = np.array(df_test['label'])

In [None]:
# Reshape the images from an array of 50176 to a dataframe.
x_test_2dim = x_test.reshape((x_test.shape[0], 50176))
y_test_2dim = le.fit_transform(df_test['label'])

In [None]:
x_test_4dim = x_test.reshape(x_test.shape[0], x_test.shape[1], x_test.shape[2], 1)
y_test_4dim = tf.keras.utils.to_categorical(y_test, 3)

In [None]:
# Use model.save('path/model.h5') to save a keras model.
# model.save('saved_model/CNN_model.h5')

In [None]:
# Load back our saved model.
CNN_model = tf.keras.models.load_model('saved_model/CNN_model.h5')

In [None]:
# Get loss and accuracy on our test data.
cnn_loss, cnn_acc = CNN_model.evaluate(x_test_4dim, y_test_4dim, verbose=2)
print('CNN model, accuracy: {:5.2f}%'.format(100 * cnn_acc))

In [None]:
# The following restores the session while loading.
# First, load the saved model.
CNN_model = tf.keras.models.load_model('saved_model/CNN_model.h5')

# Redefine functions to call the session.
saver = tf.train.Saver()
sess = keras.backend.get_sessions()
saver.restore(sess, 'saved_model/CNN_session.ckpt')

In [None]:
# Get loss and accuracy on our test data.
cnn_loss, cnn_acc = CNN_model.evaluate(x_test_4dim, y_test_4dim, verbose=2)
print('CNN model, accuracy: {:5.2f}%'.format(100 * cnn_acc))