# All code from Raj and Dr. Gutman
## minor FG changes

In [1]:
# A multi-class image classifier, based on convolutional neural network using Keras and Tensorflow. 
# 18 classes
# Largely copied from: https://gist.github.com/seixaslipe
# Based on: https://medium.com/alex-attia-blog/the-simpsons-character-recognition-using-keras-d8e1796eae36
# Data downloaded from Kaggle 
# Will emulate the image classification functionlities for Neuro Pathology images/slides (WSI-Whole Slide images)
# Will implement/include data manipulating functionalities based on Girder (https://girder.readthedocs.io/en/latest/)
# Has 6 convolutions, filtering:64, 128, 256 with flattening to 1024
# Keras.ImageDataGenerator for Training/Validation data augmentation
# Environment: Keras, TensorFlow, Python-2, GPU-enabled

from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.preprocessing import image
from keras.models import load_model
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from keras import backend as K
from keras.callbacks import Callback
import datetime, time, os, sys
import numpy as np
import h5py
import matplotlib as plt
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
import pandas as pd

import nvidia_smi as nvs

Using TensorFlow backend.


# Metadata json: GPU

In [2]:
# modelinfo: json to store system metadata:
modelInfo = {}
# GPU/CPU:
modelInfo['Device']  = {}

# initialize GPU to get detailed info:
nvs.nvmlInit()
# Driver version:
driverVersion = nvs.nvmlSystemGetDriverVersion()
# Number of devices:
deviceCount = nvs.nvmlDeviceGetCount()
# Device Names:
deviceNames = []
for i in range(deviceCount):
    handle = nvs.nvmlDeviceGetHandleByIndex(i)
    dvn = nvs.nvmlDeviceGetName(handle) # store the device name
    deviceNames.append(dvn)
    # e.g. will print:
    #  Device 0 : Tesla K40c
nvs.nvmlShutdown()
# Save GPU metadata to modelInfo
modelInfo['Device']['driverVersion']  = driverVersion
modelInfo['Device']['deviceNames']  = deviceNames

# User Input:

In [3]:
# Image dimension:
img_width, img_height = 64, 64
# Epochs
epochs = 5
# Batch size:
batch_size = 64

# Save model metadata to modelInfo:
modelInfo['batch_size'] = batch_size
modelInfo['epochs'] = epochs
modelInfo['img_width'] = 64
modelInfo['img_height'] = 64
 

# Training and Testing Images Locations
training_dir = '/data/train'
validation_dir = '/data/validation'
testing_dir = '/data/test'

# Results Location:
results_dir ="/output/results/"

# Basic Image Statistics:

In [4]:
# Count training images:
ntraining = 0
for root, dirs, files in os.walk(training_dir):
    ntraining += len(files)

# Count validation images:
nvalidation = 0
for root, dirs, files in os.walk(validation_dir):
    nvalidation += len(files)

# Data Augmentation:

In [5]:
# get data format:
if K.image_data_format() == 'channels_first':
    input_shape = (3, img_width, img_height)
else:
    input_shape = (img_width, img_height, 3)


# Training Image Augmentation:
# -Scale
# -Shear
# -Zoom
# -Height and Width Shift
# -Fill: Nearest
# -Horizontal Flip
train_datagen = ImageDataGenerator(
    rescale=1. / 255.0,
    shear_range=0.2,
    zoom_range=0.2,
    width_shift_range = 0.2,
    height_shift_range = 0.2,
    fill_mode = 'nearest',
    horizontal_flip=True)

# Validation Image Augmentation:
# -Scale
valid_datagen = ImageDataGenerator(rescale=1. / 255.0)

# Training Image Generator:
train_generator = train_datagen.flow_from_directory(
    training_dir,
    target_size=(img_width, img_height),
    batch_size=batch_size,
    shuffle=False,
    class_mode='categorical')

# Validation Image Generator:
validation_generator = valid_datagen.flow_from_directory(
    validation_dir, 
    target_size=(img_width, img_height),
    batch_size=batch_size,
    shuffle=False,
    class_mode='categorical')

# Number of Classes/Labels:
nLabels = len(validation_generator.class_indices)

Found 19548 images belonging to 20 classes.
Found 990 images belonging to 20 classes.


# Model

In [6]:
# Model
# - 6 Convolusional Layers
# - RELU Activation
# 32 -> 64 -> 256 -> 1024
model = Sequential()
model.add(Conv2D(32, (3, 3), padding='same', input_shape=input_shape))
model.add(Activation('relu'))
model.add(Conv2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.2))

model.add(Conv2D(64, (3, 3), padding='same'))
model.add(Activation('relu'))
model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.2))

model.add(Conv2D(256, (3, 3), padding='same')) 
model.add(Activation('relu'))
model.add(Conv2D(256, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.2))

model.add(Flatten())
model.add(Dense(1024))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(nLabels, activation = 'softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])


# Captures GPU usage
#subprocess.Popen("timeout 120 nvidia-smi --query-gpu=utilization.gpu,utilization.memory --format=csv -l 1 | sed s/%//g > /app/results/GPU-stats.log",shell=True)

# TimeHistory: Callback class to get timings

In [7]:
# Timehistory callback to get epoch run times
class TimeHistory(Callback):
    def on_train_begin(self, logs={}):
        self.times = []

    def on_epoch_begin(self, batch, logs={}):
        self.epoch_time_start = time.time()

    def on_epoch_end(self, batch, logs={}):
        self.times.append(time.time() - self.epoch_time_start)

time_callback = TimeHistory()

# Model Run

In [8]:
# Model fitting and training run
simpsonsModel = model.fit_generator(
    train_generator,
    steps_per_epoch= ntraining // batch_size,
    epochs= epochs,
    validation_data= validation_generator,
    validation_steps= nvalidation // batch_size,

    callbacks= [time_callback]
)    

print "Training Finished"

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training Finished


# Save Run metadata to modelInfo

In [13]:
# Get timestamp:
now = datetime.datetime.now()
filetime = str(now.year)+str(now.month)+str(now.day)+'_'+str(now.hour)+str(now.minute)

# Time per Epoch:
modelInfo['epochTimeInfo'] = time_callback.times

# Save timestamped model to modelfilename
modelfilename=results_dir+'Simpsonsmodel_'+filetime+'.h5'
model.save(modelfilename)

# Save Run Results to modelInfo:

In [14]:
# Training and Validation accuracy and loss per epoch
modelInfo['historyData'] =  pd.DataFrame(simpsonsModel.history).to_dict(orient='records')

###target_names maps the character names (or labels) to the index(integer) used in the output files
modelInfo['target_names']  = validation_generator.class_indices

modelInfo['labelname_to_index']  = validation_generator.class_indices
modelInfo['index_to_labelname']  = {(v,k) for k,v in validation_generator.class_indices.iteritems() }

# Run Model on Validation Images

In [15]:
# predict_Validation: narray
# row= image
# column= probability of falling within label matching column_index
predict_Validation = model.predict_generator(validation_generator, nvalidation // batch_size+1)

In [24]:
predict_Validation

array([[3.92075442e-02, 6.94692042e-03, 2.14388352e-02, ...,
        2.37906817e-03, 2.50159390e-02, 3.72594059e-01],
       [7.27632940e-02, 2.18505535e-04, 2.18703803e-02, ...,
        2.67088437e-03, 1.57639314e-03, 6.87655644e-04],
       [1.26411794e-06, 1.02595887e-08, 7.18261208e-06, ...,
        1.69607226e-08, 1.48471431e-08, 1.11701626e-08],
       ...,
       [1.37559051e-04, 1.60111284e-07, 3.81654464e-02, ...,
        5.58424857e-04, 2.77534491e-06, 1.29913233e-05],
       [8.51543188e-01, 5.06423996e-04, 9.97477211e-03, ...,
        2.97373073e-04, 2.91992619e-04, 1.29800371e-03],
       [7.49406099e-05, 1.12790666e-07, 2.00242255e-04, ...,
        3.97569165e-05, 3.94369563e-04, 8.32551531e-03]], dtype=float32)

In [18]:
# Label:Index Dictionary
label_index_dict = validation_generator.class_indices

In [29]:
# Best Prediction for all labels: I don't know why we are calculating this (FG)
best_prediction_per_label= [ max( predict_Validation[:,j] ) for j in range( predict_Validation.shape[1] ) ]


In [30]:
best_prediction_per_label

[0.99998975,
 1.0,
 0.99815995,
 0.99991953,
 1.0,
 0.9999999,
 0.9999682,
 0.99998486,
 1.0,
 1.0,
 0.9988827,
 0.9996495,
 1.0,
 0.98577416,
 1.0,
 0.9997886,
 1.0,
 0.97686076,
 0.9999999,
 0.9999298]

# Predicted label for each image:

In [45]:
type( validation_generator )

keras_preprocessing.image.DirectoryIterator

In [34]:
predicted_labels= []
# Find highest probability in prediction list for each image
for i in predict_Validation:
    i= list(i)
    max_value = max(i) 
    predicted_labels.append( i.index(max_value) )

In [43]:
cnf_matrix = confusion_matrix(validation_generator.classes, predicted_labels)
cls_rpt = classification_report(validation_generator.classes, predicted_labels, target_names= validation_generator.class_indices) 

In [44]:
print(cls_rpt)

                          precision    recall  f1-score   support

charles_montgomery_burns       0.03      0.02      0.02        48
            ned_flanders       0.10      0.10      0.10        50
           homer_simpson       0.06      0.06      0.06        50
           lenny_leonard       0.07      0.06      0.07        48
  abraham_grampa_simpson       0.06      0.06      0.06        50
            mayor_quimby       0.10      0.06      0.08        49
            chief_wiggum       0.05      0.04      0.04        50
          edna_krabappel       0.05      0.10      0.07        50
  apu_nahasapeemapetilon       0.03      0.04      0.03        50
       principal_skinner       0.08      0.08      0.08        50
           marge_simpson       0.12      0.08      0.10        50
             moe_szyslak       0.09      0.08      0.09        50
            nelson_muntz       0.06      0.06      0.06        50
        krusty_the_clown       0.02      0.02      0.02        50
         

In [None]:
print(cls_rpt)

In [None]:
print(cls_rpt)  ### This is a 20 by 20 matrix

## This looks cool, but we need to turn it into a table I guess?

In [None]:
## Turning into classification report into classification object
avgresults = cls_rpt.strip().split('\n')[-1].split()

In [None]:
overallResults={'label' : 'avg/total', 'precision': avgresults[3], 'recall':avgresults[4],'f1-score':avgresults[5], 'support':avgresults[6]}


In [None]:
precision, recall, fscore, support  =  precision_recall_fscore_support(validation_generator.classes, TEST_pred)

In [None]:
modelInfo['classificationObject'] =  characterResultsArray =  {
    'label': validation_generator.class_indices.keys(),
    'precision': precision,
    'recall':recall,
    'fscore': fscore, 'support':support,
    'overallResults':{'label' : 'avg/total', 
                      'precision': avgresults[3], 
                      'recall':avgresults[4],
                      'f1-score':avgresults[5],
                      'support':avgresults[6]}}

In [None]:
modelInfo['classificationObject']

In [None]:
modelInfo['confusion_matrix'] = confusion_matrix(validation_generator.classes, TEST_pred)

In [None]:
modelInfo['confusion_matrix']



In [None]:
###   filename --- CLASS


### LAST BUT NOT LEAST --- 



# MAKE IT A PARAMETER OUTPUT MODELPREDICTIOJ FOR TRAIN AND TEST OR JUST TEST  

# for image in glob.glob('/data/train/*/'):

#     I WANT
    
#     ['filename': "somename", 'actualImageLabel': asIndex, 'modelPrection': X ]
    
   


In [None]:
import glob
import os

dgWant = []

model=load_model(modelfilename)

for fld in os.listdir('/data/test/'): 
    trueLabel = fld
    for img in os.listdir('/data/test/%s/' %trueLabel): 
        imgPath = "/data/test/%s/%s" % (fld, img)
        x = image.load_img(imgPath, target_size=(64,64))
        x = image.img_to_array(x)
        x = x.reshape((1,) + x.shape)
        x = x/255.
        pr=model.predict(x)
        curr = {'filename': img, 'actualImageLabel': fld, 'modelprediction':pr} 
        dgWant.append(curr)

In [None]:
d = dgWant[300]['modelprediction']
maxIndex = np.argmax(d)

dict(modelInfo['index_to_labelname'])[maxIndex]

In [None]:
dict(modelInfo['index_to_labelname'])

In [None]:
dgWant[3]