# Keras image classification development
#### This is a binary image classification test that distinguishes corn with leaf blight from healthy corn.
#### The model is trained on drone images from Cornell University.
#### ---------------------------------------------------------------------------------------------------------------------------

# To Do for model development:

## Broad/long-term goals

- [ ] Figure out classifications beyond binary
- [ ] Implement 'meta-tuner' to optimize hyperparams automatically
- [ ] Try to build our model atop a pre-made model

## Immediate/short-term goals

- [ ] Tweak hyperparameters, see how to add decay, momentum, etc. (optimizer-dependent)
- [ ] Add/reshape model layers to enhance training

In [2]:
#  basic modules 
import random, datetime
import numpy as np
import tensorflow as tf

#  image and display utilities
import cv2
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline

#  our custom modules
import imagereader as ir

#  file/data utilities
import os, shutil
import _pickle as cPickle
from sklearn.model_selection import train_test_split
import gc  #  Garbage collector for cleaning redundant data from memory

#  suppressing deprecation warnings: only TensorFlow messages of severity ERROR are logged
tf.logging.set_verbosity(tf.logging.ERROR)

### NOTE: The cell below contains all the preprocessing for the sample data set; if you have no use for the sample data then you can skip it

In [None]:
#  build the sample image sets from the specified folders; adapted from Kaggle code, with optimizations for our usage
def list_all(dirname):
    """Return the path of every entry found in the given sub-folder of the sample data set."""
    folder = './Input/sample_dataset/' + dirname  #  path to folder containing sample data; change path here if necessary
    return [folder + entry for entry in os.listdir(folder)]

#  sample training sets, one per class directory
train_healthy = list_all('train_corn/healthy/')
train_spot = list_all('train_corn/spot/')
train_rust = list_all('train_corn/rust/')
train_blight = list_all('train_corn/blight/')

#  sample test sets, one per class directory
test_healthy = list_all('test_corn/healthy/')
test_spot = list_all('test_corn/spot/')
test_rust = list_all('test_corn/rust/')
test_blight = list_all('test_corn/blight/')

#  only the blight and healthy classes are used for our particular (binary) model training;
#  each combined list is shuffled in place
train_imgs = train_blight + train_healthy
random.shuffle(train_imgs)

test_imgs = test_blight + test_healthy
random.shuffle(test_imgs)

def read_and_process_samples(list_of_images, image_location, nrows=256, ncolumns=256):
    """Read, resize and label the sample images.

    Parameters
    ----------
    list_of_images : iterable of str
        Paths of the image files to load.
    image_location : iterable of str
        Paths that receive the label False; every other path is labelled True.
        The callers in this notebook pass the list of healthy images, so
        True marks a diseased image (the sample data labelling is inverted
        compared to our own).
    nrows, ncolumns : int
        Height and width, in pixels, of the resized images.

    Returns
    -------
    (x, y) : tuple of lists
        x holds the resized images, y the matching bool labels.

    Raises
    ------
    ValueError
        If an image file cannot be read.
    """
    unflagged = set(image_location)  #  built once: O(1) membership test per image
    x, y = [], []
    for image in list_of_images:
        pixels = cv2.imread(image, cv2.IMREAD_COLOR)
        if pixels is None:  #  cv2.imread reports failure by returning None instead of raising
            raise ValueError('Could not read image file: %s' % image)
        #  cv2.resize expects dsize as (width, height), i.e. (columns, rows)
        x.append(cv2.resize(pixels, (ncolumns, nrows), interpolation=cv2.INTER_CUBIC))
        y.append(image not in unflagged)
    return x, y

#  read, resize and label the sample training images (the healthy list decides the labels)
x, y = read_and_process_samples(list_of_images=train_imgs, image_location=train_healthy)

#  same treatment for the sample test images
x_test, y_test = read_and_process_samples(list_of_images=test_imgs, image_location=test_healthy)

#  hold out 20% of the sample training data as a validation set
x_sample, x_val1, y_sample, y_val1 = train_test_split(x, y, test_size=0.2)
gc.collect()

In [3]:
#  setting up keras utilities and model layers
import keras
from keras import layers
from keras import models
from keras import optimizers
from keras import metrics  #  adding the built-in metrics to better gauge model veracity
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.image import img_to_array, load_img

#  TensorFlow session settings: expose one GPU and 16 CPU devices and let GPU memory grow on demand.
#  allow_soft_placement lets ops fall back to the CPU when the device pinned below is unavailable.
config = tf.ConfigProto(device_count={'GPU': 1, 'CPU': 16}, allow_soft_placement=True)
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
#  register the session with Keras; otherwise Keras creates its own session and the settings above are ignored
keras.backend.set_session(sess)

# TODO: add/reshape model layers
with tf.device("/gpu:0"):
    model = models.Sequential()  #  model is created
    #  four convolution + max-pooling stages; the first layer fixes the image size taken by the model
    model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(256, 256, 3)))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(128, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(128, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    #  classifier head
    model.add(layers.Flatten())
    model.add(layers.Dropout(0.5))  #  dropout for regularization
    model.add(layers.Dense(256, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))  #  sigmoid at the end because we have just two classes

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 254, 254, 32)      896       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 127, 127, 32)      0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 125, 125, 64)      18496     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 62, 62, 64)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 60, 60, 128)       73856     
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 30, 30, 128)       0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 28, 28, 128)       147584    
__________

# STOP! Configure model settings below before proceeding

In [None]:
#  !!CRITICAL FOR FUNCTIONALITY!! - Establish settings for model input and training

#  ---- training data selection ----
#  which kind of set(s) to train over: 'ours', 'sample', 'mixed', or 'all'
set_content_source = 'all'
#  which one of the non-sample image sets is to be read ('handheld', 'drone', or 'boom');
#  irrelevant if using only sample data or all
set_content = 'drone'

#  ---- data manipulation/reshaping ----
#  level of augmentation of the training data; accuracy/speed tradeoff for True/False, respectively
augment_traindata = True
#  optionally, even out how many healthy and unhealthy data points there are in our set
#  (only affects our data, not the sample YET)
#  PROGNOTE: negative performance impact seems to be attributable to reduced sample size
#            and pseudo-random distribution, not mismatch
balance_data = False

#  ---- hyperparameters ----
#  32 seems best suited for our data sizes, investigate further to confirm
batch_size = 32
nr_epochs = 4
learn_rate = 0.0001

In [None]:
#  All reading and formatting of the model input data is done in this cell; split freely if finer debugging is needed


#  control for reading our chosen set
if set_content_source in ('ours', 'mixed'):
    #  a valid data set of the specified name needs both an image folder and a label file
    if os.path.exists('./Input/images_' + set_content) and os.path.exists('./Input/labels_{}.csv'.format(set_content)):
        pictures, blightvals = ir.get_features_and_labels(set_content, balance=balance_data)
        x_train, x_val, y_train, y_val = train_test_split(pictures, blightvals, test_size=0.2)  #  splitting the image data into train and validation sets
        print('Images input from the %s set'%set_content)
    else:
        print('Invalid/Incomplete set specified')
elif set_content_source == 'sample':
    print('Training over sample data, skipping image file input')
elif set_content_source == 'all':
    dataset_files, complete_datasets = os.listdir('./Input'), []
    for entry in dataset_files:  #  gets the names of all the complete datasets currently in 'Input' (apart from sample)
        #  partition keeps the whole name after the first underscore and never raises on names
        #  without one (an entry called just 'images' used to raise IndexError)
        prefix, _, dataset_name = entry.partition('_')
        if prefix == 'images' and dataset_name and os.path.exists('./Input/labels_%s.csv' % dataset_name):
            complete_datasets.append(dataset_name)
    #  NOTE(review): balance_data is not forwarded here, unlike in the single-set branch above - confirm this is intended
    discrete_data = [ir.get_features_and_labels(dataset) for dataset in complete_datasets]  #  (pictures, labels) pair for every complete set
    random.shuffle(discrete_data)  #  mix the order of the sets; parity is preserved because each set stays a (pictures, labels) pair
    #  flatten the per-set pairs into one picture list and one label list, in the same order
    pictures = [pic for single_set in discrete_data for pic in single_set[0]]
    blightvals = [bbool for single_set in discrete_data for bbool in single_set[1]]
    x_train, x_val, y_train, y_val = train_test_split(pictures, blightvals, test_size=0.2)  #  splitting the image data into train and validation sets
    print('Found and input %d complete sets: '%len(complete_datasets), complete_datasets)
else:
    print("No set specified")

#  control for which images/data are selected to be trained over
def merge_and_shuffle(set1, set2):
    """Concatenate two sets and return the result in random order (used when the 'mixed' set is chosen).

    The inputs themselves are left untouched; only the combined copy is shuffled.
    """
    combined = set1 + set2
    random.shuffle(combined)
    return combined

if set_content_source in ("ours", "all"):  #  training and evaluating over our data
    pass  #  x_train/x_val/y_train/y_val were already produced while reading the sets
elif set_content_source == "sample":  #  training and evaluating over the sample data
    x_train, x_val = x_sample, x_val1
    y_train, y_val = y_sample, y_val1
    feat, lab = x_test, y_test
elif set_content_source == "mixed":
    #  mixing the two data sets (has significantly less favorable performance due to visual disparity in images)
    #  Images and labels are shuffled together as (image, label) pairs: shuffling the two lists
    #  separately would assign every image an unrelated label.
    train_pairs = merge_and_shuffle(list(zip(x_train, y_train)), list(zip(x_sample, y_sample)))
    val_pairs = merge_and_shuffle(list(zip(x_val, y_val)), list(zip(x_val1, y_val1)))
    x_train, y_train = [pair[0] for pair in train_pairs], [pair[1] for pair in train_pairs]
    x_val, y_val = [pair[0] for pair in val_pairs], [pair[1] for pair in val_pairs]
    feat, lab = pictures + x_test, blightvals + y_test
else:
    print("No data loaded")

#  convert to arrays (train generator flow doesn't take lists)
x_train, y_train, x_val, y_val = np.array(x_train), np.array(y_train), np.array(x_val), np.array(y_val)

#  data sizes, for diagnostics and for setting model step size (below)
#  nval must be the size of the validation set: it determines the number of validation steps
ntrain, nval = len(x_train), len(x_val)
print(' \nFull input set size: %d'%(ntrain + nval),"\nTraining set size: %d"%ntrain,"\nValidation set size: %d"%nval)

In [None]:
#  All image generation and augmentation prep is done in this cell


#  build the image generators that feed the model during training
if augment_traindata == True:
    #  augmented training images; tweak augment features at will
    #  (width_shift_range=0.2 and height_shift_range=0.2 are available but currently disabled)
    train_datagen = ImageDataGenerator(
        rescale=1./255,  #  rescale = scaling brightness, not size
        rotation_range=40,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode="reflect",
    )
elif augment_traindata == False:
    #  only rescales, does not modify images fed
    train_datagen = ImageDataGenerator(rescale=1./255)
else:
    print("No augmentation setting specified")

#  the validation data is only ever rescaled, for a true measure of model accuracy
val_datagen = ImageDataGenerator(rescale=1./255)

#  batch iterators over the in-memory arrays
train_generator = train_datagen.flow(x_train, y_train, batch_size=batch_size)
val_generator = val_datagen.flow(x_val, y_val, batch_size=batch_size)

#  binary cross-entropy loss, Nadam optimizer at the configured learning rate, accuracy as the tracked metric
nadam = optimizers.Nadam(lr=learn_rate)
model.compile(loss='binary_crossentropy', optimizer=nadam, metrics=['acc'])
gc.collect()

# Model Training Follows:

In [None]:
#  The actual training; adjust hyperparameters in the initial settings cell to tweak model performance
history = model.fit_generator(train_generator,
                              steps_per_epoch=ntrain // batch_size,
                              epochs=nr_epochs,
                              validation_data=val_generator,
                              validation_steps=nval // batch_size)


#  Saving the generated model locally.
#  Existing files are replaced by Keras itself (overwrite=True), so no manual delete/re-create
#  step is needed; the former os.mknod call is Unix-only and can require elevated privileges.
model.save_weights('model_weights.hd5', overwrite=True)
model.save('model_complete.hd5', overwrite=True)



#  Plot how training and validation metrics developed over the epochs
#  Optional extra metric, e.g. 'mean_absolute_error'; it must have been passed to model.compile(...)
#  (located at the end of the reading stage) before training
extrametric = None

#  accuracy is always tracked
acc, val_acc = history.history['acc'], history.history['val_acc']
if extrametric is not None:
    extra1 = history.history[extrametric]
    extra2 = history.history['val_' + extrametric]
#  losses are always tracked as well
loss, val_loss = history.history['loss'], history.history['val_loss']
epochs = range(1, len(acc) + 1)  #  x axis: epoch numbers starting at 1

#  first figure: accuracies, plus the optional extra metric
plt.plot(epochs, acc, 'b', label='Training accuracy')
plt.plot(epochs, val_acc, 'r', label='Validation accuracy')
if extrametric is not None:
    plt.plot(epochs, extra1, 'y', label='Training ' + extrametric)
    plt.plot(epochs, extra2, 'g', label='Validation ' + extrametric)
plt.title('Training and Validation accuracy')
plt.legend()

#  second figure: losses
plt.figure()
plt.plot(epochs, loss, 'b', label='Training loss')
plt.plot(epochs, val_loss, 'r', label='Validation loss')
plt.title('Training and Validation loss')
plt.legend()
plt.show()

In [None]:
#  choose the labelled data set the model is evaluated over (!NOTE!: must be a labelled data set)
#  when training on the sample data, feat/lab were already set to the sample test set
evaluate_on_own_data = set_content_source != 'sample'
if evaluate_on_own_data:
    #  presumably start/end select a range of the 'boom' set - confirm against imagereader
    feat, lab = ir.get_features_and_labels('boom', balance=False, start=4000, end=8000)

#  personal check for whether the correct data has been loaded
print(len(feat), len(lab))

In [None]:
#  predicts over the specified labelled data, then identifies, labels, and exports false predictions
def document_mistakes(features, labels, export=True):
    """Predict over a labelled set, report the false predictions and optionally export them.

    Parameters
    ----------
    features : sequence of images
        Converted with np.array(...) and passed to the module-level `model`.
    labels : sequence
        Expected label per image (1/True = sick, 0/False = healthy), aligned with `features`.
    export : bool
        When true, the folder ./Fakes is emptied and re-created, and every falsely
        predicted image is saved there, named after its index and predicted class.

    Returns
    -------
    None; the result is printed.
    """
    if len(labels) == 0:  #  nothing to evaluate; also avoids a division by zero in the report below
        print("No labelled data to evaluate")
        return

    print("Predicting...", end="\r")  #  predicting over chosen set, create list of predictions
    #  NOTE(review): the images go to the model without the 1/255 rescale that the training
    #  generators apply - confirm the features are already on the scale the model was trained on
    results = model.predict(np.array(features))
    print("Done Predicting", end="\r")
    predictions = [int(round(val[0])) for val in results]  #  sigmoid output rounded to 0 or 1

    #  indices of every sample whose prediction disagrees with its label
    #  (the whole set is checked; the last sample used to be skipped)
    labelz = {1: "Sick", 0: "Healthy"}
    fps = [i for i, (predicted, expected) in enumerate(zip(predictions, labels)) if predicted != expected]

    #  display amount of results that are false predictions
    print("False Predictions in %d%% of the data (%d fakes)" % (100*len(fps)//len(labels), len(fps)))

    #  export false evaluations to fakes folder
    if export:
        print('Exporting labelled fakes...', end='\r')
        if os.path.exists("./Fakes"):  #  empty out old fakes
            shutil.rmtree("./Fakes")
        os.mkdir("Fakes")

        for val in fps:
            stamp = datetime.datetime.now().strftime("%X")  #  export time, taken per image
            #  NOTE(review): the ':' characters in the file name are not valid on Windows file systems
            plt.imsave("./Fakes/image{}-flagged:{}@{}.png".format(val, labelz[predictions[val]], stamp), features[val])
        print('Finished Exporting fakes    ')

#  the evaluation lives in a function so it can be recycled in future model frameworks;
#  here it runs over the evaluation set chosen above and exports the false predictions
document_mistakes(features=feat, labels=lab, export=True)

In [None]:
#  option to preserve the current model, along with weights file, for future predictions if results are favorable
preserve = True

if preserve:
    os.makedirs('./Assorted Models', exist_ok=True)  #  saving fails if the target folder is missing
    #  shared file name stem describing the run: epochs, set, (un)balanced, (un)augmented, learning rate, batch size
    model_tag = ('./Assorted Models/%dep-%s(%sbal, %saug), lr=%1.2e, bs=%d' %
        (nr_epochs, set_content, 'un'*int(not balance_data), 'un'*int(not augment_traindata), learn_rate, batch_size))
    model.save(model_tag + '_model.h5')
    model.save_weights(model_tag + '_weights.h5')

In [None]:
#  Test and display model prediction over generated images based on the training set; 
#  less useful over non-sample data, as it is hard to tell from cursory visual inspection whether or not the prediction is accurate

#  Configure augmentation settings for the generated evaluation images
#  rescale is kept at 1./255, the same factor the training and validation generators use,
#  so the model sees inputs on the scale it was trained on; the remaining values are free to tweak
test_datagen = ImageDataGenerator(rescale=1./255,
                                  rotation_range=0,
                                  width_shift_range=0.0,
                                  height_shift_range=0.0,
                                  shear_range=0.0,
                                  zoom_range=0.0,
                                  horizontal_flip=True,
                                  vertical_flip=False,
                                  fill_mode="constant",
                                  cval=12)

def show_predictions(columns, total_images):
    """Display model predictions over images generated from a random sample of the training set.

    Works much like ir.show_crops(), only with the number of images explicitly specified.

    Parameters
    ----------
    columns : int
        Passed on to ir.adaptive_graph together with the images and their labels.
    total_images : int
        How many images to show; clamped to the size of the training set.

    Returns
    -------
    None; the images are displayed through ir.adaptive_graph.
    """
    if total_images <= 0:  #  nothing requested
        return
    pool = np.asarray(x_train)
    count = min(total_images, len(pool))  #  random.sample raises when asked for more than is available
    if count == 0:
        print('No training images available to predict over')
        return

    #  sample indices rather than the images themselves: random.sample rejects numpy arrays,
    #  and x_train is an array once the training data has been prepared
    chosen = random.sample(range(len(pool)), count)

    text_labels, gen_imgs = [], []
    #  NOTE: the random sample improves variety, taking only a subset saves memory
    for batch in test_datagen.flow(pool[chosen], batch_size=1):  #  generates images based on training set
        gen_imgs.append(np.squeeze(batch, axis=0))
        pred = model.predict(batch)  #  single sigmoid output for the single image in the batch
        text_labels.append('blighted' if pred[0][0] > 0.5 else 'Healthy')
        if len(gen_imgs) == count:  #  the flow iterator does not stop on its own
            break
    ir.adaptive_graph(gen_imgs, text_labels, columns)  #  neatly formats and displays the produced images and labels
            
#  show 43 generated images and their predicted labels, 9 per row
show_predictions(columns=9, total_images=43)