# **Libraries**

In [None]:
# Daniel Jang, 20096632, 17DDHJ
# CMPE452 Project Group 6 

In [None]:
import tensorflow as tf
import numpy as np
import random as random
import pandas as pd
import cv2
import os
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw, ImageEnhance

from tensorflow.keras import utils
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Dropout
from tensorflow.keras import Model
from tensorflow.keras import datasets, layers, models
from tensorflow.keras.utils import to_categorical

import seaborn as sns
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

# **Data Preprocessing**

In [None]:
# Locations of the competition inputs; every path hangs off the same base directory.
data_root = "/kaggle/input/state-farm-distracted-driver-detection"
sample_path = f"{data_root}/sample_submission.csv"
imgs_list_path = f"{data_root}/driver_imgs_list.csv"
train_path = f"{data_root}/imgs/train"
test_path = f"{data_root}/imgs/test"

# Image index table; the cells below read its 'subject', 'classname' and 'img' columns.
driver_imgs_list = pd.read_csv(filepath_or_buffer=imgs_list_path)

In [None]:
# global variables
# Target size (in pixels) that every image is resized to before it is fed to the model.
img_width, img_height = (64, 64)
# Size tuple handed to PIL's Image.resize, which expects (width, height).
input_image = (img_width, img_height)
# The array produced from a PIL image is laid out as (height, width, channels),
# so the model input must list height first. With a square size the value is
# unchanged, but this keeps the shape right if width and height ever differ.
model_input_shape = (img_height, img_width, 3)

In [None]:
# sort class names and images 
def pair_sort(className, values):
    """Sort two parallel sequences in place by ascending ``values``.

    ``className[i]`` and ``values[i]`` are treated as one pair; after the call
    both sequences are reordered so that ``values`` is ascending and every
    name still sits next to its value. Pairs with equal values keep their
    original relative order (stable sort).

    Args:
        className: Mutable sequence (list or NumPy array) of labels.
        values: Mutable sequence of comparable sort keys, same length as
            ``className``.

    Returns:
        tuple: ``(className, values)`` -- the same objects that were passed
        in, now sorted.

    Raises:
        ValueError: If the two sequences do not have the same length.
    """
    if len(className) != len(values):
        raise ValueError(
            "className and values must have the same length "
            f"(got {len(className)} and {len(values)})"
        )

    # Stable index sort: one O(n log n) pass instead of a hand-written bubble sort.
    order = sorted(range(len(values)), key=lambda idx: values[idx])

    # Materialise both reordered sequences before writing anything back, so
    # the in-place assignment cannot read already-overwritten elements.
    sorted_names = [className[idx] for idx in order]
    sorted_values = [values[idx] for idx in order]

    # Slice assignment keeps the caller's objects (and their types) intact.
    className[:] = sorted_names
    values[:] = sorted_values

    return className, values

In [None]:
from matplotlib.pyplot import figure

# Distinct class labels and, aligned with them, the number of images in each class.
class_names = np.unique(driver_imgs_list['classname'])
images_per_class = driver_imgs_list['classname'].value_counts()
class_image_list = [int(images_per_class[label]) for label in class_names]

# Reorder both sequences together by ascending image count.
class_names, class_image_list = pair_sort(class_names, class_image_list)

# Bar chart of the number of files associated with each class type.
figure(num=None, figsize=(15, 5), dpi=80, facecolor='w', edgecolor='k')
plt.suptitle('Number of images per Class')
plt.bar(class_names, class_image_list, color=(0.2, 0.4, 0.6, 0.6))
plt.show()

In [None]:
from matplotlib.pyplot import figure

# Same tally as for the classes, this time grouped by the 'subject' column.
sub_names = np.unique(driver_imgs_list['subject'])
images_per_subject = driver_imgs_list['subject'].value_counts()
sub_image_list = [int(images_per_subject[subject]) for subject in sub_names]

# Reorder both sequences together by ascending image count.
sub_names, sub_image_list = pair_sort(sub_names, sub_image_list)

figure(num=None, figsize=(15, 10), dpi=80, facecolor='w', edgecolor='k')

# One bar position per subject.
y_pos = np.arange(len(sub_names))

# NOTE: the horizontal bar chart is currently disabled, so the figure created
# above stays empty. Uncomment the lines below to draw it.
# plt.barh(y_pos, sub_image_list,color=(0.2, 0.4, 0.6, 0.6))

# plt.yticks(y_pos,sub_names )
# plt.suptitle('Number of images per subject')

# plt.show()

In [None]:
# load image from path
def load_image(path):
    """Load one training image as a resized three-channel pixel array.

    Args:
        path: Location of the image relative to ``train_path``
            (the callers in this notebook pass ``"<classname>/<filename>"``).

    Returns:
        numpy.ndarray: Pixel data of the image after conversion to RGB and
        resizing to ``input_image``.
    """
    read_path = train_path + "/" + path

    # The context manager closes the underlying file as soon as the pixels
    # have been read, instead of leaving the handle to the garbage collector.
    with Image.open(read_path) as image:
        # Force three channels so grayscale / palette / RGBA files still
        # produce arrays that stack together and match the model input.
        resized = image.convert("RGB").resize(input_image)

    return np.asarray(resized)

In [None]:
# function to display images
def show_images(image_ids, class_names):
    """Draw the given images side by side in a single row.

    Args:
        image_ids: Sequence of image paths accepted by ``load_image``.
        class_names: Sequence of titles; ``class_names[i]`` is shown above
            the image loaded from ``image_ids[i]``.

    Returns:
        None. The figure is created through pyplot.
    """
    num_of_images = len(image_ids)
    if num_of_images == 0:
        # plt.subplots rejects a grid with zero columns; nothing to draw anyway.
        return

    pixels = [load_image(path) for path in image_ids]

    # squeeze=False always yields a 2-D array of axes, so a single image
    # (where pyplot would otherwise return one bare Axes) is handled by the
    # same indexing code as several images.
    fig, axes = plt.subplots(
        1,
        num_of_images,
        figsize=(5 * num_of_images, 5 * num_of_images),
        squeeze=False,
    )

    for i, image_pixels in enumerate(pixels):
        ax = axes[0][i]
        ax.imshow(image_pixels)
        ax.axis("off")
        ax.set_title(class_names[i])

In [None]:
# Collect "<classname>/<filename>" for the first listed image of every class.
sub_names_imgs = []
for current_class in class_names:
    is_current_class = driver_imgs_list['classname'] == current_class
    first_filename = driver_imgs_list[is_current_class]['img'].values[0]
    sub_names_imgs.append(current_class + "/" + first_filename)

# Show the examples in two rows: the first five classes, then the rest.
show_images(sub_names_imgs[:5], class_names[:5])
show_images(sub_names_imgs[5:], class_names[5:])

In [None]:
# init empty lists to store training and validation sets
x_train = []
y_train = []

x_val = []
y_val = []

# fraction of each class's images that goes into the training set;
# the remainder of that class becomes validation data
split_rate = 0.8

# fixed seed so the train/validation split is identical on every run
split_seed = 42

# Split every class separately so both sets keep the same class proportions.
# NOTE(review): images are shuffled per image without looking at the 'subject'
# column, so the same subject can end up in both sets -- validation accuracy
# may therefore be optimistic.
for current_class in class_names:
    select_df = driver_imgs_list[driver_imgs_list['classname'] == current_class]
    image_list = shuffle(select_df['img'].values, random_state=split_seed)

    # first `split_rate` share -> training, remainder -> validation
    train_amount = int(len(image_list) * split_rate)
    train_list = image_list[:train_amount]
    val_list = image_list[train_amount:]

    # class names look like 'c<digit>'; the integer part is the class index
    class_index = int(current_class.replace('c', ''))

    # load images into respective sets
    for filename in train_list:
        x_train.append(load_image(current_class + "/" + filename))
        y_train.append(class_index)

    for filename in val_list:
        x_val.append(load_image(current_class + "/" + filename))
        y_val.append(class_index)

# variables to feed into model: stacked image arrays and one-hot labels
x_train = np.asarray(x_train)
y_train = tf.keras.utils.to_categorical(y_train, num_classes=10)
x_val = np.asarray(x_val)
y_val = tf.keras.utils.to_categorical(y_val, num_classes=10)

print("Train x Shape: ", x_train.shape)
print("Validation x Shape: ", x_val.shape)
print("Train y Shape: ", y_train.shape)
print("Validation y Shape: ", y_val.shape)

# **Model**

In [None]:
# General VGG16 structure, kept for reference only (everything in this cell is commented out)

# model = models.Sequential()

# model.add(layers.Conv2D(64, activation='relu'))
# model.add(layers.Conv2D(64, activation='relu'))
# model.add(layers.MaxPooling2D())
          
# model.add(layers.Conv2D(128, activation='relu'))
# model.add(layers.Conv2D(128, activation='relu'))
# model.add(layers.MaxPooling2D())

# model.add(layers.Conv2D(256, activation='relu'))
# model.add(layers.Conv2D(256, activation='relu'))
# model.add(layers.Conv2D(256, activation='relu'))
# model.add(layers.MaxPooling2D())

# model.add(layers.Conv2D(512, activation='relu'))
# model.add(layers.Conv2D(512, activation='relu'))
# model.add(layers.Conv2D(512, activation='relu'))
# model.add(layers.MaxPooling2D())

# model.add(layers.Conv2D(512, activation='relu'))
# model.add(layers.Conv2D(512, activation='relu'))
# model.add(layers.Conv2D(512, activation='relu'))
# model.add(layers.MaxPooling2D())
          
# model.add(Dense(4096, activation='relu'))
# model.add(Dense(1000, activation='relu'))
# model.add(Dense(10, activation='softmax'))

In [None]:
# modified VGG
# fewer convolutional layers than the VGG16 outline, with dropout layers added
# in an attempt to reduce overfitting

model = models.Sequential()

# Stage 1 -- the input shape comes from the shared `model_input_shape` global
# so the network always matches the size the images are resized to.
model.add(Conv2D(filters=64, kernel_size=2, padding='same', activation='relu', input_shape=model_input_shape, kernel_initializer='glorot_normal'))
model.add(MaxPooling2D(pool_size=2))
# model.add(Dropout(0.5))

# Stage 2
model.add(Conv2D(filters=128, kernel_size=2, padding='same', activation='relu', kernel_initializer='glorot_normal'))
model.add(MaxPooling2D(pool_size=2))
# model.add(Dropout(0.5))

# Stage 3
model.add(Conv2D(filters=256, kernel_size=2, padding='same', activation='relu', kernel_initializer='glorot_normal'))
model.add(MaxPooling2D(pool_size=2))
# model.add(Dropout(0.5))

# Stage 4 -- the only convolutional stage that is followed by dropout
model.add(Conv2D(filters=512, kernel_size=2, padding='same', activation='relu', kernel_initializer='glorot_normal'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.5))

# Classifier head: flatten, one hidden dense layer with dropout, then a
# 10-way softmax (one output per class).
model.add(Flatten())
model.add(Dense(500, activation='relu', kernel_initializer='glorot_normal'))
model.add(Dropout(0.5))
model.add(Dense(10, activation='softmax', kernel_initializer='glorot_normal'))

In [None]:
# print a summary of the layers of the model built in the previous cell
model.summary()

In [None]:
# Alternative configurations, left disabled:
# model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])
# model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

# Active configuration: Adam optimizer with categorical cross-entropy on the
# one-hot labels, tracking accuracy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Number of training epochs (30 is the other setting, left disabled).
# num_epochs = 30
num_epochs = 20

# Train on the training set and evaluate on the validation set after every
# epoch; the returned history object is used by the plots further down.
# (batch_size = 2 is the other setting, left disabled.)
model_history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=20,
    epochs=num_epochs,
    verbose=1,
)

In [None]:
# print graph of training accuracy and validation accuracy
train_accuracy = model_history.history['accuracy']
val_accuracy = model_history.history['val_accuracy']

# 1-based epoch numbers taken from the recorded history, so the axis stays
# correct when the number of epochs changes and tick k really marks epoch k.
epochs_run = np.arange(1, len(train_accuracy) + 1)

fig, ax = plt.subplots(1, 1, figsize=(8, 8))
ax.plot(epochs_run, train_accuracy, color='b', label="Training accuracy")
ax.plot(epochs_run, val_accuracy, color='r', label="Validation accuracy")
ax.set_xticks(epochs_run)
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')

legend = ax.legend(loc='best', shadow=True)
plt.tight_layout()
plt.show()

# **Analysis**

In [None]:
# evaluator function
def evaluator(test_y, pred_y, average='micro'):
    """Print the confusion matrix and summary classification scores.

    Args:
        test_y: True class labels.
        pred_y: Predicted class labels, aligned with ``test_y``.
        average: Averaging mode forwarded to ``recall_score``,
            ``precision_score`` and ``f1_score``. The default ``'micro'``
            keeps the previous behaviour. Note that for single-label
            multiclass data micro-averaged recall, precision and F1 all equal
            the accuracy; pass ``'macro'`` or ``'weighted'`` to get scores
            that reflect per-class performance.

    Returns:
        None. All results are printed.
    """
    print(confusion_matrix(test_y, pred_y))
    print('accuracy score: ', accuracy_score(test_y, pred_y))
    print('recall score: ', recall_score(test_y, pred_y, average=average))
    print('precision score: ', precision_score(test_y, pred_y, average=average))
    print('f1 score: ', f1_score(test_y, pred_y, average=average))

In [None]:
# Predict class probabilities for the validation images.
y_pred = model.predict(x_val)

# Turn the one-hot targets and the predicted probabilities into class indices,
# then print the evaluation report.
true_labels = np.argmax(y_val, axis=1)
predicted_labels = np.argmax(y_pred, axis=1)
evaluator(true_labels, predicted_labels)

In [None]:
# comparison of accuracy and loss for training and validation
fig, ax = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: accuracy per epoch; right panel: loss per epoch. Each panel
# shows the training curve and the validation curve, so the panels are titled
# after the metric and the curves are told apart by the legend.
for panel, metric, title in zip(ax, ('accuracy', 'loss'), ('Accuracy', 'Loss')):
    panel.set_title(title)
    panel.plot(model_history.history[metric], label='Training')
    panel.plot(model_history.history['val_' + metric], label='Validation')
    panel.set_xlabel('Epoch')
    panel.legend(loc='best')