In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
import time
from datetime import datetime

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, AveragePooling2D, GlobalAveragePooling2D, Dense
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

tf.config.run_functions_eagerly(True)

# CPU vs GPU Performance

When I first started training, I was using my CPU (Intel Core-i9 11900KF) and the projected training time for one model was about 1 day (using 50 epochs).  I knew this was not going to allow me many models or flexiblity in changes so I enabled my GPU. I ended up looking at a lot of tutorials but ended up following the instructions from Bex T. [7] I was able to get the drivers from NVIDIA and enable TensorFlow to access my GPU (NVIDIA 3080 TI).  This allowed me to train the same model in  2.5 hours.

In [2]:
print('Num GPUs Available: ', len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


# Batching For System Memory

The second hardware issue that I ran into was self inflicted.  Before I augmented my dataset, I was trying to load all of the numpy arrays the images created into memory, I was not batching anything.  This would max my system RAM out (64GB) and prevent the rest of the notebook from progressing.  Agarwal [1] had a great example on avoiding this issue.  First, rather than import the images and convert them to arrays immediately, he created a Pandas dataframe to hold the ages and the image file locations. Once he was ready to process the arrays, he would convert the images. He also utialized TensorFlows dataset process and how to batch that information. After I call my Pipeline function, I also batched the datasets to help with system memory.

Once I got this system in place, I felt comfortable expanding my image dataset from 35,000 to 350,000.

In [3]:
tf.random.set_seed(99)
np.random.seed(99)

In [4]:
file_df = pd.read_csv('faces_files.csv')

In [5]:
train, test = train_test_split(file_df, test_size=0.15)

In [6]:
train_files = list(train['File'])
train_labels = list(train['Group'])

test_files = list(test['File'])
test_labels = list(test['Group'])

In [7]:
X_train_tensor = tf.constant(train_files)
y_train_tensor = tf.constant(train_labels)

X_test_tensor = tf.constant(test_files)
y_test_tensor = tf.constant(test_labels)

In [8]:
num_groups = 6

def pipeline(file, label):
    image = tf.io.read_file(file)
    image_gray = tf.io.decode_jpeg(image, channels=1) 
    image_resized = tf.image.resize(image_gray, [200,200])
    label = tf.one_hot(label, num_groups)
    return image_resized, label

Below is where I batched the images into groups of 64

In [9]:
train_dataset = tf.data.Dataset.from_tensor_slices((X_train_tensor, y_train_tensor))
train_dataset = train_dataset.map(pipeline)
train_dataset = train_dataset.batch(64)

test_dataset = tf.data.Dataset.from_tensor_slices((X_test_tensor, y_test_tensor))
test_dataset = test_dataset.map(pipeline)
test_dataset = test_dataset.batch(64)



In [None]:
agarwal = Sequential()

agarwal.add(Conv2D(filters=32, kernel_size=3, activation='relu', input_shape=(200, 200, 1)))    # 3rd dim = 1 for grayscale images.
agarwal.add(AveragePooling2D(pool_size=(2,2)))

agarwal.add(Conv2D(filters=64, kernel_size=3, activation='relu'))
agarwal.add(AveragePooling2D(pool_size=(2,2)))

agarwal.add(Conv2D(filters=128, kernel_size=3, activation='relu'))
agarwal.add(AveragePooling2D(pool_size=(2,2)))

agarwal.add(Conv2D(filters=256, kernel_size=3, activation='relu'))
agarwal.add(AveragePooling2D(pool_size=(2,2)))

agarwal.add(GlobalAveragePooling2D())

agarwal.add(Dense(132, activation='relu'))

agarwal.add(Dense(6, activation='softmax'))

agarwal.summary()

model_name='prerakAdam1e4'
epoch_size = 20

#Model from Agarwal [1]

In [10]:
alex_net = keras.models.Sequential([
    keras.layers.Conv2D(filters=96, kernel_size=(11,11), strides=(4,4), activation='relu', input_shape=(200, 200, 1)),
    keras.layers.BatchNormalization(),
    keras.layers.MaxPool2D(pool_size=(3,3), strides=(2,2)),
    keras.layers.Conv2D(filters=256, kernel_size=(5,5), strides=(1,1), activation='relu', padding="same"),
    keras.layers.BatchNormalization(),
    keras.layers.MaxPool2D(pool_size=(3,3), strides=(2,2)),
    keras.layers.Conv2D(filters=384, kernel_size=(3,3), strides=(1,1), activation='relu', padding="same"),
    keras.layers.BatchNormalization(),
    keras.layers.Conv2D(filters=384, kernel_size=(3,3), strides=(1,1), activation='relu', padding="same"),
    keras.layers.BatchNormalization(),
    keras.layers.Conv2D(filters=256, kernel_size=(3,3), strides=(1,1), activation='relu', padding="same"),
    keras.layers.BatchNormalization(),
    keras.layers.MaxPool2D(pool_size=(3,3), strides=(2,2)),
    keras.layers.Flatten(),
    keras.layers.Dense(4096, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(4096, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(6, activation='softmax')
])
#Using the AlexNet CNN model [4].
model_name='nonAugAlexNetSGD1e4'
epoch_size = 20

In [11]:
base_log_dir = os.path.join(os.curdir, "logs\\fit\\")

def get_log_dir():
    file_id = time.strftime('run_%Y_%m_%d-%H_%M')
    return os.path.join(base_log_dir, file_id)

log_dir = get_log_dir()
tensorboard  = tf.keras.callbacks.TensorBoard(log_dir)
#Adapted from Alake [2]

In [None]:
agarwal.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4), metrics=['accuracy'])

The hyperparameters that I used were mostly Adam, Adamax and SGD optimizers, learning rate between 1e-3 and 1e-5, landing on 1e-4 being the overall best metric.

In [12]:
alex_net.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.SGD(learning_rate=1e-4), metrics=['accuracy'])

In [13]:
checkpoint = ModelCheckpoint(filepath=r"C:\Users\cstod\GradFiles\Captstone\.ipynb_checkpoints",
                             monitor='val_accuracy',
                             save_best_only=True,
                             verbose=1
                            )

In [None]:
agarwal_history = agarwal.fit(train_dataset,
                                  batch_size=512,
                                  validation_data=test_dataset,
                                  epochs=20,
                                  callbacks=[tensorboard, checkpoint]
                                 )

In [14]:
alex_net_history = alex_net.fit(train_dataset,
                                batch_size=64,
                                validation_data=test_dataset,
                                epochs=epoch_size,
                                callbacks=[tensorboard, checkpoint]
                               )

Epoch 1/20
Epoch 1: val_accuracy improved from -inf to 0.22541, saving model to C:\Users\cstod\GradFiles\Captstone\.ipynb_checkpoints
INFO:tensorflow:Assets written to: C:\Users\cstod\GradFiles\Captstone\.ipynb_checkpoints\assets
Epoch 2/20
Epoch 2: val_accuracy improved from 0.22541 to 0.23897, saving model to C:\Users\cstod\GradFiles\Captstone\.ipynb_checkpoints
INFO:tensorflow:Assets written to: C:\Users\cstod\GradFiles\Captstone\.ipynb_checkpoints\assets
Epoch 3/20
Epoch 3: val_accuracy improved from 0.23897 to 0.24557, saving model to C:\Users\cstod\GradFiles\Captstone\.ipynb_checkpoints
INFO:tensorflow:Assets written to: C:\Users\cstod\GradFiles\Captstone\.ipynb_checkpoints\assets
Epoch 4/20
Epoch 4: val_accuracy improved from 0.24557 to 0.28157, saving model to C:\Users\cstod\GradFiles\Captstone\.ipynb_checkpoints
INFO:tensorflow:Assets written to: C:\Users\cstod\GradFiles\Captstone\.ipynb_checkpoints\assets
Epoch 5/20
Epoch 5: val_accuracy improved from 0.28157 to 0.28553, savi

In [None]:
agarwal.evaluate(test_dataset)

In [15]:
alex_net.load_weights('.ipynb_checkpoints')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x13275f26640>

In [16]:
alex_net.evaluate(test_dataset)



[1.5485175848007202, 0.36223143339157104]

In [17]:
alex_net.save(f'{model_name}_{epoch_size}e')

INFO:tensorflow:Assets written to: nonAugAlexNetSGD1e4_20e\assets
