# Fine tuning convolutional neural networks
Use this notebook to fine-tune pre-trained networks from Keras found here https://keras.io/applications/

## Instructions
1. Import packages in cell 1.
2. Comment with enough detail to understand what the utility functions do in cell 2.
3. Pre-process data, completing code with TO DO statements above them in cell 3
4. Build, compile, and train model, completing code with TO DO statements in cell 4 (4a and 4b)
5. Predict how well model did, completing code with TO DO statements in cell 5
6. Use this notebook as a template to fine tune a different pre-trained model architecture (found at https://keras.io/applications/), making adjustments for that model as necessary
7. Compare performance for at least 3 model architectures and document which is the best to use

### 1. Importing packages

In [16]:
import glob
import os
import math
import random
from random import shuffle
import time
import numpy as np
import cv2 as cv
import matplotlib.pyplot as plt
from keras.utils import np_utils

# Imports for deep learning specifically
from keras.applications.inception_v3 import InceptionV3#--[don't need if running Xception]
from keras.applications.xception import Xception
from keras.preprocessing import image
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D,  Dropout
from keras.callbacks import EarlyStopping
from keras import backend as K
from keras.preprocessing.image import ImageDataGenerator

### 2. Defining utility functions
Do NOT change the functions in this cell, ONLY comment as needed

In [17]:
def get_image_files(root_dir, img_types):
    #os.walk creates 3-tuple with (dirpath, dirnames, filenames)
    
    # Get all the root directories, subdirectories, and files
    full_paths = [x for x in os.walk(root_dir)] 
    imgs_temp = [os.path.join(ds,f) for ds,_,fs in full_paths for f in fs if f]   
    
    # Filter out so only have directories with .jpg, .tiff, .tif, .png, .jpeg
    imgs = [j for j in imgs_temp if any (k in j for k in img_types)]
    return imgs

def get_dimensions(files):
    # Set starting points for min and max dimensions
    min_height, min_width = 10000, 10000
    max_height, max_width = 0, 0
    
    for f in files:
        # Read in images
        img = cv.imread(f) # Read in images
        h,w = img.shape[:2] # get height and width
        
        # Update min and max values, if necessary
        if h < min_height:
            min_height = h 
        if h > max_height:
            max_height = h
        if w < min_width:
            min_width = w
        if w > max_width:
            max_width = w
            
    return min_height, min_width, max_height, max_width

def make_labels(files):
    # Assume input is a list of complete file paths.
    # Count the number of unique directory names that are immediate parent of the files.
    # Order the directory names alphabetically from a-z, and associate labels accordingly.
    set_temp = {x.split('/')[-2] for x in files} #doing as set to get only unique values
    list_temp = list(set_temp) #Change to list so can interate over it
    list_new = sorted(list_temp) #Alphabetizing
    label_dict = {list_new[x]:x for x in range(len(list_new))} #create dictionary with category:index
    
    return label_dict


def make_train_val_test(files, labels):
    train=[]
    valid = []
    test =[]
    train_labels_name = []
    valid_labels_name = []
    test_labels_name = []
    train_prop = 0.6 #proportion of data set that will be training
    val_prop = 0.2 #proprotion of dataset that is validation
    for key in labels: #going through each key
        temp = [f for f in files if key in f] #getting all files in a specific category (ie key)
        lower_prop = math.ceil(train_prop*len(temp))
        train.extend(temp[:lower_prop]) #training data set
        valid.extend(temp[lower_prop:lower_prop+math.ceil(val_prop*len(temp))]) # validation data set
        test.extend(temp[lower_prop+math.ceil(val_prop*len(temp)):])
    train_labels_name = [x.split('/')[-2] for x in train]
    valid_labels_name = [x.split('/')[-2] for x in valid]
    test_labels_name =  [x.split('/')[-2] for x in test]
    return train, valid, test, train_labels_name, valid_labels_name, test_labels_name


def get_batches(files, label_map, batch_size, resize_size, num_color_channels, augment=False, predict=False, do_shuffle = True):
    if do_shuffle:
        shuffle(files)
    count = 0
    num_files = len(files)
    num_classes = len(label_map)
    
    batch_out = np.zeros((batch_size, resize_size[0], resize_size[1], num_color_channels), dtype=np.uint8)
    labels_out = np.zeros((batch_size,num_classes)) #one-hot labeling, which is why have num_classes num of col.   

    while True: # while True is to ensure when yielding that start here and not previous lines

        f = files[count]
        img = cv.imread(f)       

        # Resize
        # First resize while keeping aspect ratio
        rows,cols = img.shape[:2] # Define in input num_color_channels in case want black and white
        rc_ratio = rows/cols
        if resize_size[0] > int(resize_size[1]*rc_ratio):# if resize rows > rows with given aspect ratio
            img = cv.resize(img, (resize_size[1], int(resize_size[1]*rc_ratio)))#NB: resize dim arg are col,row
        else:
            img = cv.resize(img, (int(resize_size[0]/rc_ratio), resize_size[0]))
            
        # Second, pad to final size
        rows,cols = img.shape[:2] #find new num rows and col of resized image
        res = np.zeros((resize_size[0], resize_size[1], num_color_channels), dtype=np.uint8)#array of zeros
        res[(resize_size[0]-rows)//2:(resize_size[0]-rows)//2+rows,
            (resize_size[1]-cols)//2:(resize_size[1]-cols)//2+cols,:] = img # fill in image in middle of zeros
                
        # Augmentation 
        if augment:            
            rows,cols = res.shape[:2]
            # calculates affine rotation with random angle rotation, keeping same center and scale
            M = cv.getRotationMatrix2D((cols/2,rows/2),np.random.uniform(0.0,360.0,1),1) 
            # applies affine rotation
            res = cv.warpAffine(res,M,(cols,rows))

        # Change to gray scale if input argument num_color_channels = 1
        if num_color_channels == 1: 
            res = cv.cvtColor(res, cv.COLOR_BGR2GRAY)# convert from bgr to gray
            res = res[...,None] # add extra dimension with blank values to very end, needed for keras
            
        batch_out[count%batch_size,...] = res # put image in position in batch, never to exceed size of batch
        
        for k in label_map.keys():
            if k in f: #if a category name is found in the path to the file of the image
                labels_out[count%batch_size,:] = np_utils.to_categorical(label_map[k],num_classes) #one hot labeling
                break   
                
        count += 1
        if count == num_files:# if gone through all files, restart the counter
            count = 0
        if count%batch_size == 0: #if gone through enough files to make a full batch
            if predict: # i.e., there is no label for this batch of images, so in prediction mode
                yield batch_out.astype(np.float)/255.
            else: # training
                yield batch_out.astype(np.float)/255., labels_out
            
            
            
def convert_to_class(prediction,label_map):
    predict_max = np.argmax(prediction,axis=1)#provides index of max value out of prediction classes
    predict_label = []
    for i in range(len(predict_max)):
        for k,v in label_map.items():
                if predict_max[i] == v:
                    predict_label.append(k)
    return predict_label    

def prop_correct(predict_label,actual_label):
    correct_class = []
    for i in range(len(predict_label)):
        if predict_label[i]==actual_label[i]:
            correct_class.append(1)
        else:
            correct_class.append(0)
    num_correct = sum(correct_class)
    proportion_correct = num_correct/len(predict_label)
    return proportion_correct

### 3. Pre-processing

In [18]:
# Get full paths to all classification data
# Data is assumed to reside under the directory "root_dir", and data for each class is assumed to reside in a separate subfolder

# TO DO: define in the variable root_dir the directory path to where the folders with the images are located
root_dir = '/home/guest_3/Desktop/Image_classification_SPCS/Scripps_plankton_camera_system_images/Labeled_ciliates_and_other'


# TO DO: add in any additional image types in path above that are not already listed in the img_types variable below
img_types=['.jpg', '.tiff', '.tif', '.png', '.jpeg']

files = get_image_files(root_dir, img_types)
print('number of files is ',len(files))
print('example file names are ', files[0:4])

# Get the dimension range of the data for informational purposes
minh,minw,maxh,maxw = get_dimensions(files)
print('Over all images - minimum height: {}, minimum width: {}, maximum height: {}, maximum width:{}'.format(minh,minw,maxh,maxw))

# Assign numerical labels to categories - the number of categories is equal to the number of subfolders
label_map = make_labels(files)
print(label_map)

# TO DO: Using the appropriate utility function from cell 2, divide data into training, validation, and testing data
# Variable names should be as follows:
train_files ='training data'
val_files ='validation data'
test_files ='testing data'
train_labels_name = 'training labels'
val_labels_name = 'validation labels'
test_labels_name = 'testing data labels'




# Print length of each data set and labels array
print('length of trainig data is ',len(train_files))
print('length of trainig data is ',len(val_files))
print('length of trainig data is ',len(test_files))

print('train labels length is ',len(train_labels_name))
print('validation labels length is', len(val_labels_name))
print('test labels length is', len(test_labels_name))    
      

number of files is  0
example file names are  []
Over all images - minimum height: 10000, minimum width: 10000, maximum height: 0, maximum width:0
{}
length of trainig data is  13
length of trainig data is  15
length of trainig data is  12
train labels length is  15
validation labels length is 17
test labels length is 19


## 4. Fine tuning
The code below was modified from https://keras.io/applications/#fine-tune-inceptionv3-on-a-new-set-of-classes and must be adapted for use with xception instead of InceptionV3

#### 4a. Creating base pre-trained model

In [19]:
# Create the base pre-trained model
# TO DO: Think about if you should include the top layers (the layers used for classification in the ORIGINAL model). 
# If you should, type in include_top = True, otherwise, use include_top = False

base_model = InceptionV3(weights='imagenet', include_top=False)

# Add a global spatial average pooling layer
x = base_model.output
x = GlobalAveragePooling2D()(x)

# Add a fully-connected layer
x = Dense(4, activation='relu')(x)

# Add logistic layer -- let's say we have x classes--determined by len(label_map)
predictions = Dense(len(label_map), activation='softmax')(x)

# Below is the model we will train
model = Model(inputs=base_model.input, outputs=predictions)

# Train only the top layers (which were randomly initialized)
# i.e. freeze all convolutional InceptionV3 layers
for layer in base_model.layers:
    layer.trainable = False

# Compile the model (should be done AFTER setting layers to non-trainable)
model.compile(optimizer='adam', loss = 'categorical_crossentropy',metrics= ['accuracy'])


#### 4b. Training existing model

In [20]:
## TO DO: try the following batch sizes, one at a time: 16, 32, 64, recording accuracy for all
BS = 32

# Initializing other parameters
EPOCHS = 1000
im_wid = 150
im_height = 150 

# Construct the training image generator for data augmentation
data_gen = ImageDataGenerator(featurewise_center = False, samplewise_center = False,
                             featurewise_std_normalization = False, samplewise_std_normalization=False,
                             rotation_range = 360, width_shift_range = 0.2, height_shift_range = 0.2, 
                             zoom_range = 0.5, fill_mode = 'constant',cval=0,horizontal_flip = True,
                             vertical_flip = True, rescale = None)

# Get array of training and validaiton images 
train_gen = get_batches(train_files, label_map, batch_size = len(train_files),resize_size=[im_height,im_wid],
                       num_color_channels=3)
val_gen = get_batches(val_files,label_map,batch_size = len(val_files),resize_size=[im_height,im_wid],
                     num_color_channels = 3)


train_data, train_labels_oh = next(train_gen) #one-hot encoded data
val_data, val_labels_oh = next(val_gen)

# Train the network
ES = EarlyStopping(monitor='val_loss',patience=20,verbose=0)# callback to stop if validation loss has not improved in 20 iterations
model.fit_generator(data_gen.flow(train_data,train_labels_oh, batch_size = BS),
                   steps_per_epoch=len(train_files)//BS,epochs = EPOCHS,
                   validation_data=data_gen.flow(val_data,val_labels_oh,batch_size=BS),
                   validation_steps = len(val_files)//BS,
                   callbacks=[ES])




TypeError: 'str' object does not support item assignment

## 5. Prediction

In [None]:
# Predict using test data
predict_gen = get_batches(test_files,label_map,batch_size=1,resize_size=[im_height,im_wid],
                          num_color_channels=3, predict = True,do_shuffle=False)
prediction = model.predict_generator(predict_gen,steps = len(test_files))

# TO DO: Use the appropriate utility function from cell 2 to convert predictions (saved in the variable prediction
    # to a classification category
    # Save the output in the variable predict_class

    

# TO DO: Determine the proportion of classifications that were classified correctly using the appropriate utility
    # function from cell 2. Save that proportion in the variable proportion_correct
    
    
# Printing proportion correct    
print(proportion_correct)

# TO DO: record the variable settings (e.g., CNN architecture, batch size, epochs, optimizers, and proportion correct) 
    # in some document (e.g., excel spreadsheet, git, etc.)
    
    
    

### Loading model and compiling first layers

In [None]:
#base_model = Xception(include_top=False, weights='imagenet', input_tensor=None, input_shape=(880,920,3), pooling=None)
base_modelX = Xception(include_top=False, weights='imagenet')

# Adding global spatial average pooling layer
xX = base_modelX.output
xX = GlobalAveragePooling2D()(xX)

# Adding in fully-connected layer
xX = Dense(4, activation='relu')(xX)

# Logistic layer for number of classes
## [Add in flexible number of classes]
predictions = Dense(len(label_map),activation='softmax')(xX)

# Model that will be trained
modelX = Model(inputs = base_modelX.input, outputs = predictions)

# Training only top layers
for layer in base_modelX.layers:
    layer.trainable = False
    
# Compile model
## [Choose different optimizers]
modelX.compile(optimizer='adam',loss = 'categorical_crossentropy',metrics=['accuracy'])


### Training the network

In [None]:
## [Vary batch size, maybe epochs]
EPOCHS = 1000
BS = 32

im_height = 150
im_wid = 150

# Creating training image generator for data augmentation
data_gen = ImageDataGenerator(featurewise_center = False, samplewise_center = False, 
                             featurewise_std_normalization = False, samplewise_std_normalization = False,
                             rotation_range = 360, width_shift_range = 0.2, height_shift_range = 0.2, 
                             zoom_range = 0.5, fill_mode = 'constant',cval = 0, horizontal_flip = True,
                             vertical_flip = True, rescale = None)

# Getting training and validation generators
train_gen = get_batches(train_files, label_map, batch_size = len(train_files),
                       resize_size = [im_height,im_wid],num_color_channels=3)
val_gen = get_batches(val_files, label_map,batch_size = len(val_files),
                     resize_size = [im_height,im_wid],num_color_channels=3)
train_data,train_labels_oh = next(batch_gen)
val_data,val_labels_oh = next(val_gen)

# Training network