In [1]:
# A multi classes image classifier, based on convolutional neural network using Keras and Tensorflow. 
# A multi-label classifier (having one fully-connected layer at the end), with multi-classification (18 classes, in this instance).
# Largely copied from the code https://gist.github.com/seixaslipe
# This is based on these posts: https://medium.com/alex-attia-blog/the-simpsons-character-recognition-using-keras-d8e1796eae36
# Data downloaded from Kaggle 
# Will emulate the image classification functionlities for Neuro Pathology images/slides (WSI-Whole Slide images)
# Will implement/include data manipulating functionalities based on Girder (https://girder.readthedocs.io/en/latest/)
# Has 6 convulsions, filtering start with 64, 128, 256 with flattening to 1024
# Used Keras.ImageDataGenerator for Training/Validation data augmentation and the augmented images are flown from respective directory
# Environment: A docker container having Keras, TensorFlow, Python-2 with GPU based execution

from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from keras import backend as K
from keras.callbacks import Callback
import datetime, time, os, sys
import numpy as np
import h5py
import matplotlib as plt
plt.use('Agg')
import matplotlib.pyplot as pyplot
pyplot.figure
import pickle 
#from pickle import load
from sklearn.metrics import classification_report, confusion_matrix
import subprocess
import pandas as pd

import nvidia_smi as nvs
import io
import pickle
import json


try:
    to_unicode = unicode
except NameError:
    to_unicode = str



  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:

modelInfo = {}
modelInfo['Device']  = {} ## Initialize an object to store info on the model and time info

nvs.nvmlInit()

driverVersion = nvs.nvmlSystemGetDriverVersion()
print("Driver Version: {}".format(driverVersion))
modelInfo['Device']['driverVersion']  = driverVersion

# e.g. will print:
#   Driver Version: 352.00
deviceCount = nvs.nvmlDeviceGetCount()
deviceNames = []
for i in range(deviceCount):
    handle = nvs.nvmlDeviceGetHandleByIndex(i)
    dvn = nvs.nvmlDeviceGetName(handle) # store the device name
    print("Device {}: {}".format(i,  dvn))
    deviceNames.append(dvn)
    # e.g. will print:
    #  Device 0 : Tesla K40c
nvs.nvmlShutdown()

modelInfo['Device']['deviceNames']  = deviceNames


### These parameters can be tuned and may affect classification results or accuracy
img_width, img_height = 64, 64
epochs = 1
batch_size = 128


modelInfo['batch_size'] = batch_size
modelInfo['epochs'] = epochs
modelInfo['img_width'] = 64
modelInfo['img_height'] = 64
 

### Define input dirs and output for results which contain the models as well as stats on the run
train_data_dir = '/data/train' 
validation_data_dir = '/data/test' 

resultsDir ="/app/results/"
if not os.path.isdir(resultsDir):
    os.makedirs(resultsDir)

nb_train_samples = 0

for root, dirs, files in os.walk(train_data_dir):
    nb_train_samples += len(files)

nb_validation_samples = 0
for root, dirs, files in os.walk(validation_data_dir):
    nb_validation_samples += len(files)


# Model definition
if K.image_data_format() == 'channels_first':
    input_shape = (3, img_width, img_height)
else:
    input_shape = (img_width, img_height, 3)


# Data augmentation for training
train_datagen = ImageDataGenerator(
    rescale=1. / 255.0,
    shear_range=0.2,
    zoom_range=0.2,
    width_shift_range = 0.2,
    height_shift_range = 0.2,
    fill_mode = 'nearest',
    horizontal_flip=True)

# Only rescaling for validation
valid_datagen = ImageDataGenerator(rescale=1. / 255.0)

# Flows the data directly from the directory structure, resizing where needed
train_generator = train_datagen.flow_from_directory(
    train_data_dir,
    target_size=(img_width, img_height),
    batch_size=batch_size,
    class_mode='categorical')

validation_generator = valid_datagen.flow_from_directory(
    validation_data_dir,
    target_size=(img_width, img_height),
    batch_size=batch_size,
    class_mode='categorical')

NumLabels = len(validation_generator.class_indices)

'''
6-conv layers - added on 06/21, Raj
'''
model = Sequential()
model.add(Conv2D(32, (3, 3), padding='same', input_shape=input_shape))
model.add(Activation('relu'))
model.add(Conv2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.2))

model.add(Conv2D(64, (3, 3), padding='same'))
model.add(Activation('relu'))
model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.2))

model.add(Conv2D(256, (3, 3), padding='same')) 
model.add(Activation('relu'))
model.add(Conv2D(256, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.2))

model.add(Flatten())
model.add(Dense(1024))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(NumLabels, activation = 'softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])


# Captures GPU usage
#subprocess.Popen("timeout 120 nvidia-smi --query-gpu=utilization.gpu,utilization.memory --format=csv -l 1 | sed s/%//g > /app/results/GPU-stats.log",shell=True)



Driver Version: 384.130
Device 0: GeForce GTX 1050
Found 19548 images belonging to 20 classes.
Found 990 images belonging to 20 classes.


In [3]:


# Timehistory callback to get epoch run times
class TimeHistory(Callback):
    def on_train_begin(self, logs={}):
        self.times = []

    def on_epoch_begin(self, batch, logs={}):
        self.epoch_time_start = time.time()

    def on_epoch_end(self, batch, logs={}):
        self.times.append(time.time() - self.epoch_time_start)

time_callback = TimeHistory()


# Model fitting and training run
simpsonsModel = model.fit_generator(
    train_generator,
    steps_per_epoch=nb_train_samples // batch_size,
    epochs=epochs,
    validation_data=validation_generator,
    validation_steps=nb_validation_samples // batch_size,

    callbacks=[time_callback])    


print "Finished running the basic model... trying to save results now.."


Epoch 1/1
Finished running the basic model... trying to save results now..


In [29]:
# To write the each epoch run time into a json file
now = datetime.datetime.now()
filetime = str(now.year)+str(now.month)+str(now.day)+'_'+str(now.hour)+str(now.minute)

modelInfo['epochTimeInfo'] = time_callback.times


## Write out the h5/model
modelfilename=resultsDir+'Simpsonsmodel_'+filetime+'.h5'
model.save(modelfilename)

## This outputs the training and validation accuracy and loss functions for each epoch
## This will be graphed as well using plotly ... you can use this data to look for overfitting
## and/or when you can stop training your model because it stops improving
modelInfo['historyData'] =  pd.DataFrame(simpsonsModel.history).to_dict(orient='records')

###target_names maps the character names (or labels) to the index(integer) used in the output files
modelInfo['target_names']  = validation_generator.class_indices

modelInfo['labelname_to_index']  = validation_generator.class_indices
modelInfo['index_to_labelname']  = {(v,k) for k,v in validation_generator.class_indices.iteritems() }


In [17]:
target_names = validation_generator.class_indices

## Prediction for TRAIN data set
Y_pred = model.predict_generator(validation_generator, nb_validation_samples // batch_size+1)

##Prediction for TEST data set
y_pred = np.argmax(Y_pred, axis=1)

cnf_matrix = confusion_matrix(validation_generator.classes, y_pred)
cls_rpt = classification_report(validation_generator.classes, y_pred, target_names=target_names) 


  'precision', 'predicted', average, warn_for)


In [33]:
print(cls_rpt)  ### This is a 20 by 20 matrix

## This looks cool, but we need to turn it into a table I guess?

                          precision    recall  f1-score   support

charles_montgomery_burns       0.00      0.00      0.00        48
            ned_flanders       0.00      0.00      0.00        50
           homer_simpson       0.06      0.06      0.06        50
           lenny_leonard       0.00      0.00      0.00        48
  abraham_grampa_simpson       0.08      0.28      0.13        50
            mayor_quimby       0.00      0.00      0.00        49
            chief_wiggum       0.00      0.00      0.00        50
          edna_krabappel       0.03      0.10      0.04        50
  apu_nahasapeemapetilon       0.00      0.00      0.00        50
       principal_skinner       0.06      0.20      0.09        50
           marge_simpson       0.00      0.00      0.00        50
             moe_szyslak       0.00      0.00      0.00        50
            nelson_muntz       0.06      0.10      0.07        50
        krusty_the_clown       0.00      0.00      0.00        50
         

In [None]:
### The classification report needs to .. not be a report.. we need classification data..

precision recall f1-score  support

classificationObject =  { 'characterResultsArray': [ 'label': 'homer','precision': #, recall, #, f1-score: # ],
                          'overallResults':  [ 'label' : 'avg/total', 'precision': 0.04, recall: 0.05]                         
                                                ] }


In [37]:
cnf_matrix.shape

modelInfo['confusion_matrix'] = cnf_matrix



In [38]:
modelInfo['confusion_matrix']

array([[ 0,  0,  3,  1, 11,  1,  0, 11,  0,  5,  0,  0,  4,  0,  0,  0,
         7,  0,  3,  2],
       [ 1,  0,  1,  3,  6,  0,  0, 10,  0, 16,  0,  0,  3,  0,  0,  0,
         6,  0,  4,  0],
       [ 2,  0,  3,  1,  6,  1,  0,  8,  0, 10,  0,  0,  3,  0,  0,  2,
         9,  0,  2,  3],
       [ 1,  0,  3,  0,  6,  1,  0, 13,  0, 11,  0,  0,  3,  0,  0,  1,
         8,  0,  1,  0],
       [ 3,  0,  3,  1, 14,  0,  0,  3,  0,  6,  0,  0,  7,  0,  0,  1,
         8,  0,  1,  3],
       [ 0,  0,  2,  0,  6,  0,  0, 11,  0,  5,  0,  0,  5,  0,  2,  0,
        12,  0,  5,  1],
       [ 1,  0,  2,  1, 10,  1,  0,  6,  0, 11,  0,  0,  3,  0,  0,  0,
         8,  0,  6,  1],
       [ 1,  0,  2,  0, 10,  0,  0,  5,  0, 12,  0,  0,  3,  0,  0,  0,
        11,  0,  5,  1],
       [ 0,  0,  3,  2,  8,  0,  0,  7,  0,  9,  0,  0,  5,  0,  0,  1,
        10,  0,  4,  1],
       [ 2,  0,  4,  0,  5,  0,  0,  8,  0, 10,  0,  0,  5,  0,  1,  0,
         7,  0,  6,  2],
       [ 1,  0,  4,  3, 10,  0

In [41]:
###   filename --- CLASS


### LAST BUT NOT LEAST --- 



# MAKE IT A PARAMETER OUTPUT MODELPREDICTIOJ FOR TRAIN AND TEST OR JUST TEST  

for image in glob(train/*):
    I WANT
    
    ['filename': "somename", 'actualImageLabel': asIndex, 'modelPrection': X ]
    
   


(990, 20)

In [None]:


#pandas.DataFrame(simpsonsModel.history).to_json("/data/trainingdata/simpsons_history_0629.json")


# saving Confusion Matrix and Classification Report to a text file for human vision



# Serialize confusion matrix and prediction/probabilities matrix stores in json file

modelInfo['confusionMatrix']   =  pd.DataFrame(cnf_matrix).to_dict(orient='records')
modelInfo['prediction_report'] =  pd.DataFrame(y_pred).to_dict(orient='records')

#modelInfo['classification_report'] = 


# df=pd.DataFrame(cnf_matrix)
# df.to_json(rptjson, orient='records', lines=True)

# sysoptfile = 'classificationSystemEnvironment_'+filetime+'.txt'
# import subprocess
# sysopt = (subprocess.check_output("lscpu", shell=True).strip()).decode()
# with open(sysoptfile,"a+") as f:
#     for line in sysopt:
#         f.write(line)








# saving Confusion Matrix and Classification Report to a file
target_names = validation_generator.class_indices
optfile = resultsDir+ 'SimpsonsModeoutput_'+filetime+'.txt'
file = open(optfile, "a+")
Y_pred = model.predict_generator(validation_generator, nb_validation_samples // batch_size+1)
y_pred = np.argmax(Y_pred, axis=1)
ptropt= 'Confusion Matrix' 
print >> file, ptropt
cnf_matrix = confusion_matrix(validation_generator.classes, y_pred)
print >>file, cnf_matrix


ptropt = 'Classification Report'
print >> file, ptropt
cls_rpt = classification_report(validation_generator.classes, y_pred, target_names=target_names) 
print >> file, cls_rpt
file.close()                                         



resultSummaryFile = resultsDir + filetime + ".GutmansTextFile.json"


#Confusion Matrix is shown on a Plot
pyplot.figure(figsize=(8,8))
cnf_matrix =confusion_matrix(validation_generator.classes, y_pred)
#classes = list(chardict.values())
classes = list(target_names)
pyplot.imshow(cnf_matrix, interpolation='nearest')
pyplot.colorbar()
tick_marks = np.arange(len(classes))  
_ = pyplot.xticks(tick_marks, classes, rotation=90)
_ = pyplot.yticks(tick_marks, classes)
plotopt= resultsDir + 'SimpsonsModelImage_'+filetime+'.png'
pyplot.savefig(plotopt)


#To plot GPU usage
# gpu = pd.read_csv("/app/results/GPU-stats.log")   # make sure that 120 seconds have expired before running this cell
# gpuplt=gpu.plot()
# gpuplt=pyplot.show()
# gpuplt='/app/results/SimsonsGPUImage_'+filetime+'.png'
# pyplot.savefig(gpuplt) 



# saving Confusion Matrix and Classification Report to a file
target_names = validation_generator.class_indices
optfile = 'SimsonsModeloutput_'+filetime+'.txt'
file = open(optfile, "a+")
Y_pred = model.predict_generator(validation_generator, nb_validation_samples // batch_size+1)
y_pred = np.argmax(Y_pred, axis=1)
ptropt= 'Confusion Matrix' 
print >> file, ptropt
cnf_matrix = confusion_matrix(validation_generator.classes, y_pred)
print >>file, cnf_matrix

# Serialize confusion matrix and stores in json file
cmfile= resultsDir + 'SimpsonsModelConfusionMatrix_'+filetime+'.json' 
with io.open(cmfile, 'w', encoding='utf8') as outfile:
    str_ = json.dumps(cnf_matrix.tolist(),
                      indent=4, sort_keys=True,
                      separators=(',', ':'), ensure_ascii=False)
    outfile.write(to_unicode(str_))



modelInfo['confusionMatrixV1'] =  cnf_matrix.tolist()

ptropt = 'Classification Report'
print >> file, ptropt
cls_rpt = classification_report(validation_generator.classes, y_pred, target_names=target_names) 
print >> file, cls_rpt
rptjson= resultsDir + 'SimpsonsModelClassificationReport_'+filetime+'.json' 
with io.open(rptjson, 'w', encoding='utf8') as outfile:
    str_ = json.dumps(cnf_matrix.tolist(),
                      indent=4, sort_keys=True,
                      separators=(',', ':'), ensure_ascii=False)
    outfile.write(to_unicode(str_))
file.close()                                         

# sysoptfile = resultsDir + 'SimpsonsSystemEnvironment_'+filetime+'.txt'
# import subprocess
# sysopt = (subprocess.check_output("lscpu", shell=True).strip()).decode()
# with open(sysoptfile,"a+") as f:
#     for line in sysopt:
#         f.write(line)

# from tensorflow.python.client import device_lib
# LOCAL_DEVICES = device_lib.list_local_devices()

# modelInfo['LOCAL_DEVICES'] = LOCAL_DEVICES



with open(resultSummaryFile ,"w")  as fp:
    json.dump(modelInfo,fp,indent=2)