### Importing modules

In [1]:
import pandas as pd
import numpy as np

from skimage import io
from skimage.transform import rescale, resize
from skimage.color import rgb2gray, gray2rgb

from sklearn.preprocessing import normalize, StandardScaler

import tensorflow as tf
from tensorflow import keras

  from ._conv import register_converters as _register_converters


In [2]:
# Read a single sample image from the NORMAL training folder.
# The path is relative to the notebook's working directory.
path = '../Capstone_files/train/NORMAL/IM-0115-0001.jpeg'
image1 = io.imread(path)

### Defined functions

In [3]:
#Define a function to load all images from a folder, convert them to 3-channel RGB and resize them (normalization is not applied here yet)

def img_process(folder, size):
    """Load every image matching a path pattern as a square, 3-channel array.

    Parameters
    ----------
    folder : str
        Path or glob pattern passed to ``io.imread_collection``
        (for example ``'../train/NORMAL/*.jpeg'``).
    size : int
        Edge length in pixels; every image is resized to ``(size, size)``.

    Returns
    -------
    list
        One resized image array per file, in collection order.

    Notes
    -----
    No standardization is applied here. TODO: normalize after this function
    so the mean/std of the training set can be reused on the other sets.
    """
    imageset = io.imread_collection(folder)
    processed_images = []
    for image in imageset:
        # Single-channel (2-D) images are expanded to three identical
        # channels. Images that already carry a channel axis are left as
        # they are. The explicit check replaces a bare ``except`` that
        # silently swallowed every error raised by gray2rgb.
        if image.ndim == 2:
            image = gray2rgb(image)
        processed_images.append(resize(image, (size, size)))
    return processed_images

In [4]:
#Function to generate category labels for each image set

def generate_labels(set1, set2):
    """Build binary class labels for two image sets placed end to end.

    Returns a list with one ``0`` per item of ``set1`` followed by one ``1``
    per item of ``set2``. Only the lengths of the inputs are used.
    """
    first_class = [0] * len(set1)
    second_class = [1] * len(set2)
    return first_class + second_class

In [5]:
#Function to convert a list of values to a tensorflow dataset

def generate_tf_data(data):
    """Wrap a list or array of values in a ``tf.data.Dataset``.

    Every item of ``data`` is converted with ``tf.convert_to_tensor`` and the
    resulting list is handed to ``tf.data.Dataset.from_tensors``.
    """
    converted = []
    for item in data:
        converted.append(tf.convert_to_tensor(item))
    # NOTE(review): from_tensors is documented as building a dataset with a
    # single element from everything it is given; if one element per image or
    # label is wanted, from_tensor_slices is the usual choice - confirm intent.
    return tf.data.Dataset.from_tensors(converted)

### Cleaning and processing image sets

In [6]:
#Process the normal training set: load every JPEG matching the pattern and resize it to 224x224
folder = '../Capstone_files/train/NORMAL/*.jpeg'
training_n = img_process(folder, 224)

  warn("The default mode, 'constant', will be changed to 'reflect' in "


In [7]:
#Process the pneumonia training set: load every JPEG matching the pattern and resize it to 224x224
folder = '../Capstone_files/train/PNEUMONIA/*.jpeg'
training_p = img_process(folder, 224)

  warn("The default mode, 'constant', will be changed to 'reflect' in "


In [8]:
#Concatenate the images into one set (normal first, then pneumonia), confirm size of subsets and calculate baseline accuracy for this set
train_images = training_n + training_p
train_labels = generate_labels(training_n, training_p)
#Also create sets with undersampling on the pneumonia set to address imbalance.
#The pneumonia subset is cut to the size of the normal set, derived from the data
#instead of a hard-coded count, so it stays correct if the folder contents change.
n_normal = len(training_n)
training_p_usample = training_p[:n_normal]
train_images_usample = training_n + training_p_usample
train_labels_usample = generate_labels(training_n, training_p_usample)
print(len(training_n))
print(len(training_p))
print(len(training_p_usample))
print(len(train_images))
#Baseline = share of pneumonia images, i.e. the accuracy of always predicting class 1
baseline_train = (len(training_p)/len(train_images))
print('training baseline: ' + str(baseline_train))

1341
3875
1341
5216
training baseline: 0.7429064417177914


In [9]:
#Repeat the process for the test images, starting with the normal class (resized to 224x224)
folder = '../Capstone_files/test/NORMAL/*.jpeg'
test_n = img_process(folder, 224)

  warn("The default mode, 'constant', will be changed to 'reflect' in "


In [10]:
#Process the pneumonia test set (resized to 224x224)
folder = '../Capstone_files/test/PNEUMONIA/*.jpeg'
test_p = img_process(folder, 224)

  warn("The default mode, 'constant', will be changed to 'reflect' in "


In [11]:
#Combine both test classes (normal first, then pneumonia) and build the matching 0/1 labels
test_images = test_n + test_p
test_labels = generate_labels(test_n, test_p)
print(len(test_n))
print(len(test_p))
print(len(test_images))
#Baseline = share of pneumonia images, i.e. the accuracy of always predicting class 1
baseline_test = (len(test_p)/len(test_images))
print('test baseline: ' + str(baseline_test))

234
390
624
test baseline: 0.625


In [12]:
#And finally the validation set, starting with the normal class (resized to 224x224)
folder = '../Capstone_files/val/NORMAL/*.jpeg'
val_n = img_process(folder, 224)

  warn("The default mode, 'constant', will be changed to 'reflect' in "


In [13]:
#Process the pneumonia validation set (resized to 224x224)
folder = '../Capstone_files/val/PNEUMONIA/*.jpeg'
val_p = img_process(folder, 224)

  warn("The default mode, 'constant', will be changed to 'reflect' in "


In [14]:
#Combine both validation classes (normal first, then pneumonia) and build the matching 0/1 labels
val_images = val_n + val_p
val_labels = generate_labels(val_n, val_p)
print(len(val_n))
print(len(val_p))
print(len(val_images))
#Baseline = share of pneumonia images, i.e. the accuracy of always predicting class 1
baseline_val = (len(val_p)/len(val_images))
print('validation baseline: ' + str(baseline_val))

8
8
16
validation baseline: 0.5


### Generating data input for TensorFlow

#Generate training datasets: wrap the full (not undersampled) training images and labels in tf.data.Dataset objects
#NOTE(review): X_train / y_train are not referenced again in the cells shown here; the model below is fit on NumPy arrays instead
X_train = generate_tf_data(train_images)
y_train = generate_tf_data(train_labels)

In [16]:
#Generate undersampled arrays
#Images are stacked and reshaped to (N, 224, 224, 3); labels are flattened to a 1-D vector of length N
X_train_array = np.array(train_images_usample).reshape(-1, 224, 224, 3)
y_train_array = np.array(train_labels_usample).reshape(-1,)

In [25]:
#Convert the 0/1 training labels to the categorical (one column per class) form used with the categorical cross-entropy loss
y_train_array_k = keras.utils.to_categorical(y_train_array)

In [26]:
#Build the test arrays the same way as the training arrays: images reshaped to (N, 224, 224, 3),
#labels flattened to 1-D and then converted to categorical form
X_test = np.array(test_images).reshape(-1, 224, 224, 3)
y_test = np.array(test_labels).reshape(-1,)
y_test_k = keras.utils.to_categorical(y_test)

### Test a simple CNN using Keras

In [18]:
#Model and training settings used by the cells below
input_shape = (224, 224, 3)  # matches the shape the image arrays are reshaped to above
num_classes = 2  # width of the final softmax layer (labels are 0 and 1)
batch_size = 64  # passed to model.fit
epochs = 5  # passed to model.fit

In [None]:
#NOTE(review): this cell has no execution count, and `model` is reassigned by the Sequential
#definition in the next cell, so this ResNet50 instance is not the model compiled and trained below.
model = keras.applications.ResNet50(include_top=True, weights='imagenet')

In [19]:
# Small CNN: two convolution + max-pooling stages, then a dense layer and a
# softmax output with one unit per class.
model = keras.Sequential([
    keras.layers.Conv2D(
        32,
        kernel_size=(5, 5),
        strides=(1, 1),
        activation='relu',
        input_shape=input_shape,
        data_format='channels_last',
    ),
    keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)),
    keras.layers.Conv2D(64, (5, 5), activation='relu'),
    keras.layers.MaxPooling2D(pool_size=(2, 2)),
    keras.layers.Flatten(),
    keras.layers.Dense(1000, activation='relu'),
    keras.layers.Dense(num_classes, activation='softmax'),
])

In [20]:
#Compile with categorical cross-entropy (labels are supplied in categorical form),
#the Adam optimizer with its default settings, and accuracy as the reported metric
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adam(),
              metrics=['accuracy'])

In [21]:
class AccuracyHistory(keras.callbacks.Callback):
    """Keras callback that records the training accuracy after every epoch.

    After training, ``self.acc`` holds one entry per completed epoch, taken
    from the ``logs`` dict Keras passes to ``on_epoch_end``. An entry is
    ``None`` when no accuracy value was reported for that epoch.
    """

    def on_train_begin(self, logs=None):
        # Start from an empty list each time training begins.
        self.acc = []

    def on_epoch_end(self, batch, logs=None):
        # The first argument is supplied positionally by Keras; its name is
        # kept as ``batch`` so the method signature stays unchanged.
        # ``None`` replaces the mutable ``{}`` default.
        logs = logs or {}
        # The accuracy metric is reported as 'acc' by some Keras releases
        # and as 'accuracy' by others; accept either key.
        accuracy = logs.get('acc')
        if accuracy is None:
            accuracy = logs.get('accuracy')
        self.acc.append(accuracy)

history = AccuracyHistory()

In [27]:
#Train on the undersampled training arrays and record per-epoch accuracy through the `history` callback.
#NOTE(review): validation_data is the *test* set here; the val_images / val_labels built earlier are not
#used in the cells shown. Confirm this is intended, since the test set then no longer gives an unseen-data estimate.
model.fit(X_train_array, y_train_array_k,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(X_test, y_test_k),
          callbacks=[history])

Train on 2682 samples, validate on 624 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x2a7749c9940>