## CNN Cross-Validation

This notebook contains the final model pipeline, as well as some cross-validation code at the bottom which was used to perform hyperparameter tuning. Written by Vincent Lao.

In [1]:
# preprocessing
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
import cv2

from skimage import io
from skimage.transform import rescale, resize, downscale_local_mean #scaling options
import skimage
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# modeling 
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Input, Dropout, BatchNormalization
from keras.utils import to_categorical
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.utils import to_categorical
import tensorflow as tf

def prep_pixels(train, test):
    train_norm = train.astype('float32') / 255.0
    test_norm = test.astype('float32') / 255.0
    return train_norm, test_norm
 
# plot diagnostic learning curves
def summarize_diagnostics(history, model, testX, testY, classes):

    # plot loss
    plt.subplot(211)
    plt.title('Cross Entropy Loss')
    plt.plot(history.history['loss'], color='blue', label='train')
    plt.plot(history.history['val_loss'], color='orange', label='test')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    # plot accuracy
    plt.subplot(212)
    plt.title('Classification Accuracy')
    plt.plot(history.history['accuracy'], color='blue', label='train')
    plt.plot(history.history['val_accuracy'], color='orange', label='test')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.tight_layout()

    # save plot to file
    plt.savefig('diagnostics/vgg16_diagnostic_plot_vl.png')
    plt.close()

    
# run the test harness for evaluating a model
def run_test_harness(folder_name, epochs=100, batch_size=64, verbose=2, test_size=0.1, random_state=100, n=-1, shape = (100, 100, 3), diagnostics = True, lr=0.05):
    # load dataset

    if folder_name == 'all':
        trainX, trainY, testX, testY, shape, classes = load_all_server_dataset(test_size=test_size,                                                                         random_state=random_state, n=n, shape = shape)
    elif folder_name == 'binary':
        trainX, trainY, testX, testY, shape, classes = load_binary_dataset(test_size=test_size,                                                                         random_state=random_state, n=n, shape = shape)
    else:
        trainX, trainY, testX, testY, shape, classes = load_server_dataset(folder_name, test_size=test_size,                                                                random_state=random_state, n=n, shape = shape)
    print('Dataset Loaded!')

    # define model
    print('Defining Model...')
    model = define_model(classes, shape, lr=lr)

    # fit model
    print('Fitting Model...')
    history = model.fit(trainX, trainY, epochs=epochs, batch_size=batch_size, validation_data=(testX, testY), verbose=verbose)
    print('Model fitted! Epochs=%d, Batch Size=%d' % (epochs, batch_size))
    
    # evaluate model
    _, acc = model.evaluate(testX, testY, verbose=verbose)
    print('Model Evalution:')
    print('> %.3f' % (acc * 100.0))
    # learning curves
    if diagnostics:
        summarize_diagnostics(history, model, testX, testY, classes)

    print('----FINISHED----')
    
    return model

Using TensorFlow backend.


In [9]:
# loading in data for kfold cv
trainX, trainY, testX, testY, shape, classes = load_all_server_dataset()


Classes:  ['unknown' 'kitchen' 'bathroom' 'bedroom' 'house_view' 'living_room'
 'road_view' 'yard' 'porch' 'dining_room' 'porch_yard' 'garage']
Number of observations: 1336


In [2]:
def load_server_dataset(folder_name, test_size=0.1, random_state=100, n=-1, shape = (100, 100, 3)):
    '''
    Given a folder name on Materiall's shared server:
    (1) read in all the files, 
    (2) convert them to pixels, 
    (3) separate them into train and testing data 
        (default test_size and random_state for reproducibility)
        TO_DO: need a better way to split so num_classes is the same in test/train

    Return: X_train, X_test, y_train, y_test, shape of each image
    '''

    # read in filelist and rename columns
    df = pd.read_csv(f'{folder_name}/filelist', sep=" ", header=None)
    df.columns = ['link', 'filepath', 'class', 'probability']

    # filter
    classes = ['living_room', 'house_view', 'kitchen', 'yard', 'garage']
    df = df[df['probability'] > 0.95]
    df = df[df['class'].isin(classes)]

    # change dataset size if specified
    if n > 0:
        df = df.sample(n)
    
    print("Number of observations: " + str(df.shape[0]))

    # add pixels column, resize images (not proportional)
    df['pixels'] = df['link'].apply(lambda x: resize(io.imread(x), shape)) #might take a while

    #split and return training and test sets
    X = np.array([x for x in df['pixels']])
    y = df['class']
    X_train, X_test, y_train, y_test = train_test_split(X, y, \
                                                        test_size=test_size, \
                                                        random_state=random_state)

    # integer encode target labels so that keras can one hot
    label_encoder = LabelEncoder()
    vec_train = label_encoder.fit_transform(y_train)
    vec_test = label_encoder.fit_transform(y_test)

    # one hot encode target values
    y_train = to_categorical(vec_train)
    y_test = to_categorical(vec_test)

    #trainX, trainY, testX, testY 
    #(make sure ordering is correct to match harness function)
    return X_train, y_train, X_test, y_test, shape, classes

In [3]:
def load_all_server_dataset(test_size=0.1, random_state=100, n=-1, shape = (100, 100, 3)):
    '''
    Loads in data from all three datasets instead of for a specific folder.
    '''
    all_df = pd.DataFrame()
    folders = ['ny_dataset', 'fremont_dataset', 'sa_dataset']

    for folder_name in folders: 
        df = pd.read_csv(f'{folder_name}/filelist', sep=" ", header=None)
        df.columns = ['link', 'filepath', 'class', 'probability']

        # filter
        # classes = ['living_room', 'house_view', 'kitchen', 'yard', 'garage']
        df = df[df['probability'] > 0.90]
        # df = df[df['class'].isin(classes)]

        all_df = all_df.append(df)

    # change the label of all the classes with less than 20 images into `unknown`
    all_classes = all_df['class'].value_counts()
    all_df.loc[all_df['class'].apply(lambda x: x in all_classes.index[all_classes < 20]).values, 'class'] = 'unknown'
    classes = all_df['class'].unique()

    print('Classes: ', classes)

    # change dataset size if specified
    if n > 0:
        all_df = all_df.sample(n)

    print("Number of observations: " + str(all_df.shape[0]))

    # add pixels column, resize images (not proportional)
    all_df['pixels'] = all_df['link'].apply(lambda x: resize(io.imread(x), shape)) #might take a while

    #split and return training and test sets
    X = np.array([x for x in all_df['pixels']])
    y = all_df['class']
    X_train, X_test, y_train, y_test = train_test_split(X, y, \
                                                        test_size=test_size, \
                                                        random_state=random_state)

    # integer encode target labels so that keras can one hot
    label_encoder = LabelEncoder()
    vec_train = label_encoder.fit_transform(y_train)
    vec_test = label_encoder.fit_transform(y_test)

    # one hot encode target values
    y_train = to_categorical(vec_train)
    y_test = to_categorical(vec_test)

    #trainX, trainY, testX, testY 
    #(make sure ordering is correct to match harness function)
    return X_train, y_train, X_test, y_test, shape, classes


In [4]:
def load_binary_dataset(test_size=0.1, random_state=100, n=-1, shape = (100, 100, 3)):

    df = pd.read_csv('combined_binary_dataset.csv')
    df = df[df['probability'] > 0.90]
    classes = ['inside', 'outside']

    # change dataset size if specified
    if n > 0:
        df = df.sample(n)

    print("Number of observations: " + str(df.shape[0]))

    # add pixels column, resize images (not proportional)
    df['pixels'] = df['link'].apply(lambda x: resize(io.imread(x), shape)) #might take a while

    #split and return training and test sets
    X = np.array([x for x in df['pixels']])
    y = df['class']
    X_train, X_test, y_train, y_test = train_test_split(X, y, \
                                                        test_size=test_size, \
                                                        random_state=random_state)

    # integer encode target labels so that keras can one hot
    label_encoder = LabelEncoder()
    vec_train = label_encoder.fit_transform(y_train)
    vec_test = label_encoder.fit_transform(y_test)

    # one hot encode target values
    y_train = to_categorical(vec_train)
    y_test = to_categorical(vec_test)

    #trainX, trainY, testX, testY 
    #(make sure ordering is correct to match harness function)
    return X_train, y_train, X_test, y_test, shape, classes


In [5]:
# define cnn model
def define_model(classes, shape=(32,32,3), lr=0.01, momentum = 0.9, verbose=1):

    # grab the pre-trained VGG16 model, removing the top layers and changing the input shape
    vgg_model = VGG16(weights="imagenet", include_top=False, input_shape=shape)
        
    # freeze pre-trained layers
    for layer in vgg_model.layers:
        layer.trainable = False

    # add new classifier layers
    flat1 = Flatten()(vgg_model.layers[-1].output)
    class1 = Dense(512, activation='relu')(flat1)
    # drop1 = Dropout(0.7)(class1)
    class2 = Dense(512, activation='relu')(class1)
    output = Dense(len(classes), activation='softmax')(class2)

    # define new model with top layers
    model = Model(inputs=vgg_model.inputs, outputs=output)
    # opt = tf.keras.optimizers.RMSprop(learning_rate=0.1)
    opt = SGD(lr=0.01, momentum= 0.9)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    
    # for output while training
    if verbose > 0:
        model.summary()

    return model

In [42]:
# testing the kfold stuff on the data
n_splits = 5
# lr = [0.001, 0.01, 0.1, 1] # 0.01 best: MSE = [0.02341774, 0.00772396, 0.00837915, 0.02089699]
# lr = [0.005, 0.01, 0.05] # 0.05 best: MSE = [0.02314396, 0.02623941, 0.01162545]
# batch_size = [32, 64, 100] # 64 best: MSE = [0.02740691, 0.00986043, 0.09347975]
batch_size = [50, 60, 70]

# perform k-fold cv
mses = np.full((n_splits,len(lr)),np.nan)
accs = np.full((n_splits,len(lr)), np.nan)
kf = KFold(n_splits = n_splits, random_state = 2020, shuffle = True)
fold = 0
for train_i, val_i in kf.split(trainX):
    
    # get training and validation values
    trainX_fold = trainX[train_i]
    valX_fold = trainX[val_i]
    trainY_fold = trainY[train_i]
    valY_fold = trainY[val_i]
    
    print('Fold ' + str(fold) + ' of ' + str(n_splits) + '...')
    for i in range(len(batch_size)): # loop through hyperparameter values

        # define model
        print('Defining Model...')
        model = define_model(classes, shape, lr=0.05, verbose=0)

        # fit model
        print('Fitting Model...')
        history = model.fit(trainX_fold, trainY_fold, epochs=10, batch_size=batch_size[i], validation_data=(valX_fold, valY_fold), verbose=0)
        print('Model fitted! Epochs=%d, Batch Size=%d' % (10, batch_size[i]))
        
        # get predictions
        y_pred = model.predict(valX_fold) 
        
        # save MSE for this fold and hyperparameter value
        mses[fold, i] = mean_squared_error(y_pred, valY_fold)

        _, acc = model.evaluate(testX, testY, verbose=0)
        accs[fold, i] = acc 

    fold += 1

average_mses = np.mean(mses, axis=0)
average_accs = np.mean(accs, axis=0)
average_mses, average_accs


Defining Model...
Fitting Model...
Model fitted! Epochs=10, Batch Size=50
Defining Model...
Fitting Model...
Model fitted! Epochs=10, Batch Size=60
Defining Model...
Fitting Model...
Model fitted! Epochs=10, Batch Size=70
Defining Model...
Fitting Model...
Model fitted! Epochs=10, Batch Size=50
Defining Model...
Fitting Model...
Model fitted! Epochs=10, Batch Size=60
Defining Model...
Fitting Model...
Model fitted! Epochs=10, Batch Size=70
Defining Model...
Fitting Model...
Model fitted! Epochs=10, Batch Size=50
Defining Model...
Fitting Model...
Model fitted! Epochs=10, Batch Size=60
Defining Model...
Fitting Model...
Model fitted! Epochs=10, Batch Size=70
Defining Model...
Fitting Model...
Model fitted! Epochs=10, Batch Size=50
Defining Model...
Fitting Model...
Model fitted! Epochs=10, Batch Size=60
Defining Model...
Fitting Model...
Model fitted! Epochs=10, Batch Size=70
Defining Model...
Fitting Model...
Model fitted! Epochs=10, Batch Size=50
Defining Model...
Fitting Model...
Mod

(array([0.04767436, 0.00631983, 0.07143497]),
 array([0.83469387, 0.96938777, 0.71224489]))