Data Preprocessing and Modeling

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import itertools

import keras
from keras import layers
from keras.models import Sequential
from keras.layers import Dense

from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras import regularizers
from keras.wrappers.scikit_learn import KerasRegressor
from keras.layers import Convolution2D
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dense
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import regularizers

from keras.applications.vgg16 import preprocess_input
from keras.applications.vgg16 import VGG16
from keras.optimizers import Adam, Adadelta, Adagrad, SGD, RMSprop

import os

In [2]:
import numpy as np
from tensorflow.random import set_seed

# Seed NumPy's and TensorFlow's global random generators so repeated runs
# of the notebook start from the same random state.
np.random.seed(123)  # NumPy
set_seed(321)        # TensorFlow

***
## Preprocessing

### Create directory paths for image data

In [3]:
# Root folder of the chest X-ray dataset, relative to this notebook.
# Change this single value if the dataset is stored somewhere else.
DATA_ROOT = '../chest_xray'

# One directory per split. The trailing slash is kept so the resulting
# strings are exactly the ones used previously.
data_train_dir = DATA_ROOT + '/train/'
data_test_dir = DATA_ROOT + '/test/'
data_val_dir = DATA_ROOT + '/val/'

In [4]:
# Per-class folders inside each split. They are built from the split
# directories (data_train_dir, data_test_dir, data_val_dir) instead of
# repeating the full path, so the dataset location is written down once.
train_dir_pneumonia = os.path.join(data_train_dir, 'PNEUMONIA')
train_dir_normal = os.path.join(data_train_dir, 'NORMAL')
test_dir_pneumonia = os.path.join(data_test_dir, 'PNEUMONIA')
test_dir_normal = os.path.join(data_test_dir, 'NORMAL')
val_dir_pneumonia = os.path.join(data_val_dir, 'PNEUMONIA')
val_dir_normal = os.path.join(data_val_dir, 'NORMAL')

In [5]:
# Report how many files sit in each split/class folder.
folders_by_label = (
    ('Train Pneumonia: ', train_dir_pneumonia),
    ('Train Normal: ', train_dir_normal),
    ('Test Pneumonia: ', test_dir_pneumonia),
    ('Test Normal: ', test_dir_normal),
    ('Val Pneumonia: ', val_dir_pneumonia),
    ('Val Normal: ', val_dir_normal),
)
for label, folder in folders_by_label:
    print(label, len(os.listdir(folder)))

Train Pneumonia:  3875
Train Normal:  1341
Test Pneumonia:  390
Test Normal:  234
Val Pneumonia:  8
Val Normal:  8


### Create data generators
Create data generators for the training, testing, and validation data. Every image is resized to 224 x 224 pixels via `target_size`. `class_mode` is `'binary'` because this is a two-class problem (NORMAL vs. PNEUMONIA). I set the `batch_size` according to the image counts above so that a single batch from each generator contains every image in that split.

In [6]:
# Scale pixel values by 1/255, then stream the training folder.
# batch_size matches the number of training images, so a single batch
# holds the whole split.
train_datagen = ImageDataGenerator(rescale=1./255)
train_generator = train_datagen.flow_from_directory(
    data_train_dir,
    target_size=(224, 224),
    batch_size=5216,
    class_mode='binary',
    seed=123,
)

Found 5216 images belonging to 2 classes.


In [7]:
# Scale pixel values by 1/255, then stream the test folder.
# batch_size matches the number of test images, so a single batch holds
# the whole split.
test_datagen = ImageDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_directory(
    data_test_dir,
    target_size=(224, 224),
    batch_size=624,
    class_mode='binary',
    seed=123,
)

Found 624 images belonging to 2 classes.


In [8]:
# Scale pixel values by 1/255, then stream the validation folder.
# The validation split holds 16 images (8 per class), so batch_size is 16;
# the previous value of 624 was the test-set size carried over by copy/paste.
# The batch returned is still the full validation split.
val_datagen = ImageDataGenerator(rescale=1./255)
val_generator = val_datagen.flow_from_directory(
    data_val_dir,
    target_size=(224, 224),
    batch_size=16,
    class_mode='binary',
    seed=123,
)

Found 16 images belonging to 2 classes.


### Review Class Indices

In [12]:
# Display the class-name -> integer-label mapping used for the training split.
train_generator.class_indices

{'NORMAL': 0, 'PNEUMONIA': 1}

In [13]:
# Display the class-name -> integer-label mapping used for the test split.
test_generator.class_indices

{'NORMAL': 0, 'PNEUMONIA': 1}

In [14]:
# Display the class-name -> integer-label mapping used for the validation split.
val_generator.class_indices

{'NORMAL': 0, 'PNEUMONIA': 1}

### Create datasets

In [9]:
# Pull one batch from the training generator and unpack it into images and labels.
# NOTE(review): re-running this cell draws another batch from the same generator.
train_images, train_labels = next(train_generator)

In [10]:
# Pull one batch from the test generator and unpack it into images and labels.
test_images, test_labels = next(test_generator)

In [11]:
# Pull one batch from the validation generator and unpack it into images and labels.
val_images, val_labels = next(val_generator)

### Review image shapes 

In [16]:
# Print the array shape of every image/label set that was just loaded.
shape_report = (
    ('Train Images Shape:', train_images),
    ('Train Labels Shape:', train_labels),
    ('Test Images Shape:', test_images),
    ('Test Labels Shape:', test_labels),
    ('Validation Images Shape:', val_images),
    ('Validation Labels Shape:', val_labels),
)
for caption, array in shape_report:
    print(caption, np.shape(array))

Train Images Shape: (5216, 224, 224, 3)
Train Labels Shape: (5216,)
Test Images Shape: (624, 224, 224, 3)
Test Labels Shape: (624,)
Validation Images Shape: (16, 224, 224, 3)
Validation Labels Shape: (16,)


***
## Modeling

### Baseline model: 3 convolution layers, 3 max pooling layers, and 1 fully connected layer

In [19]:
from tensorflow.keras import models, layers, optimizers, regularizers
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Baseline CNN: three Conv2D + MaxPooling2D stages, then a small dense head
# ending in a single sigmoid unit for the binary NORMAL/PNEUMONIA label.
baseline_model = models.Sequential([
    # Stage 1 - input is a 224 x 224 image with 3 channels
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),
    MaxPooling2D(pool_size=(2, 2)),

    # Stage 2 - pool size and stride are both given explicitly as 2
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=2, strides=2),

    # Stage 3
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    # Classifier head
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [20]:
# Configure training: Adam optimizer, binary cross-entropy loss, and
# accuracy tracked under the metric name 'acc'.
baseline_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [21]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
baseline_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 222, 222, 32)      896       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 111, 111, 32)      0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 109, 109, 32)      9248      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 54, 54, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 52, 52, 64)        18496     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 26, 26, 64)        0         
_________________________________________________________________
flatten (Flatten)            (None, 43264)            

In [None]:
# Fit the baseline model on the in-memory training arrays for 25 epochs
# in mini-batches of 50, evaluating on the validation arrays after each
# epoch. The returned object is kept in base_history.
base_history = baseline_model.fit(
    x=train_images,
    y=train_labels,
    batch_size=50,
    epochs=25,
    validation_data=(val_images, val_labels),
)

Epoch 1/25