Modeling and testing grounds

In [12]:
import glob, pylab, pandas as pd
import pydicom, numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D
from keras.utils import to_categorical
from keras.models import load_model

from numpy.random import seed
from tensorflow import set_random_seed

from skimage.transform import resize

import datetime
from tqdm import tqdm_notebook
import pickle

In [3]:
seed(42)
set_random_seed(42)

# Modeling and Evaluation Functions

Writing multiple functions to facilitate creating different datasets, modeling over different datasets and/or different hyperparameters, and evaluating the different models using graphs




In [4]:
class PnemoniaTargetMapIsWeird(AssertionError):
    """Raise this when the target is not mapped to 2 or 3 outputs"""
    pass

In [5]:
def create_pic_df(data, df_detailed, image_size = (128,128)):
    
    print('---Enter DataFrame Construction Function---')
    
    ## Data Prep
    # Need to define df_detailed somewhere
    data.drop_duplicates(inplace=True)
    merged_df = df_detailed.merge(data, on='patientId')
    
    print('---Create Pixel Array Labels---')
    
    # Create the column labels for the pixel arrays
    pixel_labels = []
    for i in range(image_size[0] * image_size[1]):
        pixel_labels.append("pixel"+str(i))
    
    print('---Create Total Pixel Array---')
    
    # Create the total pixel array
    huge_pixel_array = []
    for o in tqdm_notebook(range(merged_df.shape[0])):
        # Get the image data
        patientId = merged_df.iloc[o]['patientId']
        dcm_file = '../data/stage_1_train_images/%s.dcm' % patientId
        dcm_data = pydicom.read_file(dcm_file)
        im = dcm_data.pixel_array

        # Convert the image into a 1d array of pixels
        curr_pixel_array = []
        for i in resize(im, image_size): 
            curr_pixel_array.extend(i)
        huge_pixel_array.append(curr_pixel_array)

    print('---Create Dataframe---')
        
    # Creating the DataFrame
    temp = pd.DataFrame(data = huge_pixel_array, columns = pixel_labels)
    temp['class'] = merged_df['class']
    return temp

In [6]:
def model(data, model_name = "default_model_name", image_size = (128,128), y_map = {
    "Lung Opacity":1,
    "No Lung Opacity / Not Normal":0,
    "Normal":2
}, EPOCHS = 20, train_size = 0.75, metrics = ['accuracy'] ):
    """
    Args:
        data (Pandas DataFrame): cols=['class', all of the pixel data]
        
        model_name (str): The name with which to save the model
        
        image_size (touple: (int,int)): The width and height in pixels to scale the images. Max 1024x1024. This might be more finickey that I would've thought.
        
        y_map (dict): A dictionary mapping the target values to integers (0-?)
        
    Returns:
        Keras Model fit on the data.
        Saves the model to ../data/models/model_name datetime.h5
    """
    print('---Enter Function---')
    

    # Create target
    y = data['class']
    y = y.map(lambda x: y_map[x])
    
    # Train Test Split
    X_train, X_test, y_train, y_test = train_test_split(data.drop(columns='class'), y, 
                                                        stratify=y, random_state=42, train_size = train_size)
    
    # Transform target into arrays of integers
    y_train_c = to_categorical(y_train)
    y_test_c = to_categorical(y_test)
    
    # Reshape the X_train and X_test 
    X_train_re = X_train.values.reshape(X_train.shape[0], image_size[0], image_size[1], 1)
    X_test_re = X_test.values.reshape(X_test.shape[0], image_size[0], image_size[1], 1)
    
    print('---Modeling---')
    
    ## Modeling
    # Initialize Model
    model_convolutional = Sequential()

    # First Conv / Pool 
    model_convolutional.add(Conv2D(filters = 6,
                                   kernel_size = 3,
                                   activation = 'relu',
                                   input_shape = (image_size[0], image_size[1], 1)
                                  ))
    model_convolutional.add(MaxPooling2D(pool_size=(2,2)))

    # Second Conv/ Pool
    model_convolutional.add(Conv2D(filters=16, kernel_size=3, 
                                   activation='relu'))
    model_convolutional.add(MaxPooling2D(pool_size=(2,2)))

    # Flatten
    model_convolutional.add(Dropout(0.8))
    model_convolutional.add(Flatten())

    # Densely Connected Layers
    model_convolutional.add(Dense(512, activation='relu'))
    model_convolutional.add(Dropout(0.5))
    model_convolutional.add(Dense(128, activation='relu'))
    model_convolutional.add(Dropout(0.5))
    
    # Output Layer
    if len(set(y_map.values())) == 3:
        model_convolutional.add(Dense(3, activation='softmax'))
    elif len(set(y_map.values())) == 2:
        model_convolutional.add(Dense(2, activation='softmax'))
    else:
        raise PnemoniaTargetMapIsWeird("target is not mapped to 2 or 3 outputs")


    model_convolutional.compile(loss = 'categorical_crossentropy', 
                            optimizer = 'adam', 
                            metrics = metrics)
    
    # Train Model
    trained_convolutional = model_convolutional.fit(X_train_re,
                        y_train_c,
                        batch_size = 32,
                        epochs = EPOCHS,
                        verbose = 1,
                        validation_data = (X_test_re, y_test_c)
                       )

    # Save Model 
    try:
        file_name = '../data/models/' + model_name + ' ' + str(datetime.datetime.now()) + '.h5'
        
        model_convolutional.save(file_name)
        print('Model saved as: ' + file_name)
        
    except Exception as e:
        print('Model did not save.')
        print(e)
        
        
    return trained_convolutional, model_convolutional, file_name



In [7]:
# Function that displays pertinant information regarding the quality of the model

def evaluate_model (model, model_name = 'default', size=5):
    
    """
    Args:
        model: A trained model with 
        
    Output:
        Returns nothing. Prints out - plots comparing test set with training set on 
        each of the metrics (minimum: loss).
    
    """
    
    # checking out the different classes in the testing case and predictions
    
    # confusion matrix?
    
    keys=[]
    for i in model.history.keys():
        if i[0:3] != 'val':
            keys.append(i)
    
    fig, ax = plt.subplots(len(keys), 1, figsize = (size, size*len(keys))) 
    fig.suptitle('Model: ' + model_name)
    
    index = 0
    for i in keys:
        # Plot of the Loss for the train and testingsets
        ax[index].plot(model.history[i], label=i.capitalize())
        ax[index].plot(model.history['val_'+i], label='Val '+i.capitalize())
        ax[index].legend()
        ax[index].set_title(i.capitalize()[15:-2], )
        index+=1


## Some initial modeling

In [41]:
pic_df['class'].value_counts()

No Lung Opacity / Not Normal    11083
Lung Opacity                     8383
Normal                           8071
Name: class, dtype: int64

In [8]:
# Lung Opacity == Pneumonia
y_map_3 = { # Normal, Pneumonia, or not normal but not pneumonia
    "Lung Opacity":1,
    "No Lung Opacity / Not Normal":0,
    "Normal":2
}

y_map_2 = { # Either Pneumonia or Not Pneumonia
    "Lung Opacity":1,
    "No Lung Opacity / Not Normal":0,
    "Normal":0
}

adult_data = pd.read_csv('../data/adult_patient_data.csv')
df_detailed = pd.read_csv('../data/stage_1_detailed_class_info.csv')


In [9]:
models = {}

In [10]:
# Current Parameters
CURR_MAP = y_map_3
model_name = 'adult_data_2_options'
NUM_IMAGES = 6000 #pic_df.shape[0]
EPOCHS = 20
IMAGE_SIZE = (128,128)
TRAIN_SIZE = 0.8
METRICS = ['accuracy']



In [11]:
%%time

pic_df = create_pic_df(adult_data, df_detailed=df_detailed, image_size=IMAGE_SIZE)

---Enter DataFrame Construction Function---
---Create Pixel Array Labels---
---Create Total Pixel Array---


HBox(children=(IntProgress(value=0, max=27537), HTML(value='')))

  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "



---Create Dataframe---
CPU times: user 8min 33s, sys: 15 s, total: 8min 48s
Wall time: 8min 50s


In [13]:
%%time



test_model_history, test_model_real, most_recent_model_name = model(pic_df.head(NUM_IMAGES), model_name = model_name, image_size=IMAGE_SIZE, y_map=CURR_MAP, EPOCHS=EPOCHS, train_size = TRAIN_SIZE, metrics = METRICS)



---Enter Function---




---Modeling---
Train on 4800 samples, validate on 1200 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Model saved as: ../data/models/adult_data_2_options 2018-10-16 22:46:23.821897.h5
CPU times: user 36min 44s, sys: 11min 34s, total: 48min 19s
Wall time: 8min 41s


In [14]:
models[most_recent_model_name] = {
    'NUM_IMAGES':NUM_IMAGES,
    'history':test_model_history,
    'EPOCHS':EPOCHS,
    'IMAGE_SIZE':IMAGE_SIZE,
    'Y_MAP':CURR_MAP,
    'TRAIN_SIZE':TRAIN_SIZE
}

# Save the list of models and parameters 
model_pickle_file_name = 'models.pk'
file_object = open(model_pickle_file_name,'wb')
pickle.dump(models, file_object)
file_object.close()


NameError: name 'pickle' is not defined

In [None]:
evaluate_model(test_model_history, most_recent_model_name)


In [60]:
%%time

X = pic_df.drop(columns='class')
pred_x = X.values.reshape(X.shape[0], IMAGE_SIZE[0], IMAGE_SIZE[1], 1)
preds = test_model_real.predict(pred_x)

CPU times: user 4min 21s, sys: 11.8 s, total: 4min 33s
Wall time: 51.3 s


In [None]:
preds[0]

In [None]:
y_real = pic_df['class']
y_real = to_categorical(y_real.map(lambda x: CURR_MAP[x]))
y_real.head(1)

In [55]:
display(models)

{'../data/models/adult_data_3options 2018-10-16 20:49:23.542213.h5': {'NUM_IMAGES': 6000,
  'history': <keras.callbacks.History at 0x7f06f81bf7f0>,
  'EPOCHS': 20,
  'IMAGE_SIZE': (128, 128),
  'Y_MAP': {'Lung Opacity': 0,
   'No Lung Opacity / Not Normal': 1,
   'Normal': 1}},
 '../data/models/adult_data_2_options 2018-10-16 21:26:28.869268.h5': {'NUM_IMAGES': 6000,
  'history': <keras.callbacks.History at 0x7f06fbe75390>,
  'EPOCHS': 20,
  'IMAGE_SIZE': (128, 128),
  'Y_MAP': {'Lung Opacity': 1, 'No Lung Opacity / Not Normal': 0, 'Normal': 0},
  'TRAIN_SIZE': 0.8},
 '../data/models/adult_data_2_options 2018-10-16 21:37:38.370253.h5': {'NUM_IMAGES': 6000,
  'history': <keras.callbacks.History at 0x7f0676b494a8>,
  'EPOCHS': 20,
  'IMAGE_SIZE': (128, 128),
  'Y_MAP': {'Lung Opacity': 1, 'No Lung Opacity / Not Normal': 0, 'Normal': 2},
  'TRAIN_SIZE': 0.8}}

In [None]:
# Loading in the list of models/parameters

model_pickle_file_name = 'models.pk'
file_object = open(model_pickle_file_name,'r')

models = pickle.load(file_object)
file_object.close()

display(models)

# What am I trying to do now?
- see if I can save the layers / meta information about each model so if I change it I know what model does well
- start changing models around potentially using 
    - Stochastic gradient descent
    - [R-CNN (or a different iteration)](https://heartbeat.fritz.ai/the-5-computer-vision-techniques-that-will-change-how-you-see-the-world-1ee19334354b)
    - [SpatialDropout2D](https://keras.io/layers/core/#spatialdropout2d)
- increase font on plot super title, make sure that the super title even works
