<b>Working with large data sets in Keras</b>

Imran A. Zualkernan

March 2, 2021

The purpose of this notebook is to demonstrate how generators can be used to create Keras models from large amounts of data.  

Before using this notebook please download data on images of bird species from <i> https://www.kaggle.com/gpiosenka/100-bird-species/code </i> into the same directory containing this notebook. 

<b> v1.1 </b>
<hr>
<i> copyright Imran Zualkernan </i>

In [None]:
# Useful links 
# https://www.hostinger.com/tutorials/ssh/basic-ssh-commands


In [None]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

In [None]:
import tensorflow as tf
print(tf.config.list_physical_devices('GPU'))
print(tf.test.is_built_with_cuda())

In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from IPython.display import Image, display
import random
import math
import keras
from keras.preprocessing.text import Tokenizer
from keras.models import Model, Sequential
from keras.utils import plot_model 
from keras.layers import Input, Dense, Dropout, Flatten, Activation,Concatenate
from keras.layers import Conv2D, MaxPooling2D, AveragePooling2D
from keras.optimizers import Adam
from keras import backend, models
#import tensorflow_addons as tfa
import tensorflow as tf
print(tf.__version__)

# need to add these for the GPU
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)

2.2.0


In [2]:
# import the image generator
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [3]:
#Setting the parameters for training

# batch size and image width to use
batch_size=128
width=100

# all the data directories
train_dir='train/';
test_dir='test/'
valid_dir='valid/';

# the number of epochs
num_epochs=10

# creating an image generator that will feed the data from
# each of the directories

# we use scaling transformation in this generator
generator=ImageDataGenerator(rescale=1./255)

# we specify the size of the input and batch size
# size of the input is necessary because the image
# needs to be rescaled for the neural network

train_data=generator.flow_from_directory(train_dir, target_size=(width,width),batch_size=batch_size)
valid_data=generator.flow_from_directory(valid_dir, target_size=(width,width),batch_size=batch_size)
test_data=generator.flow_from_directory(test_dir, target_size=(width,width),batch_size=batch_size)

# the number of steps per epoch is samples/batch size
# we need to use these numbers later

train_steps_per_epoch=math.ceil(train_data.samples/batch_size)
valid_steps_per_epoch=math.ceil(valid_data.samples/batch_size)
test_steps_per_epoch=math.ceil(test_data.samples/batch_size)

Found 35215 images belonging to 250 classes.
Found 1250 images belonging to 250 classes.
Found 1250 images belonging to 250 classes.


In [4]:
# the actual model should go here 
model = Sequential()
model.add(Conv2D(32, (3, 3), input_shape=(width, width, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Dropout(0.5))

model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(256, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Flatten())
model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(250, activation='softmax'))

In [5]:
# Compile the model
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

In [6]:
# see if the model is good. 
print(model)

<tensorflow.python.keras.engine.sequential.Sequential object at 0x7ff7eac3b7b8>


In [7]:
print(valid_steps_per_epoch)
num_epochs = 20
history=model.fit(train_data,
                  steps_per_epoch =train_steps_per_epoch, 
                  validation_data=valid_data,
                  epochs=num_epochs,
                  validation_steps=valid_steps_per_epoch)

10
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Compile the model
from keras import metrics

model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy',
                        metrics.AUC(name='my_auc'),
                        F1_Score])

In [None]:
# https://keras.io/api/callbacks/
# We can use a variety of pre-defined callbacks.
# Experiment with ReduceLROnPlateuau()

import tensorflow_addons as tfa

from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger

# We can also do a modelcheck point 
# https://machinelearningmastery.com/check-point-deep-learning-models-keras/
  
# checkpoint to save the model with best validation accuracy
checkpoint = ModelCheckpoint(filepath='model.{epoch:02d}-{val_loss:.2f}.h5', 
                             monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

# We can also stop the model early
#https://machinelearningmastery.com/how-to-stop-training-deep-neural-networks-at-the-right-time-using-early-stopping/
# val_loss
early_stopping_callback = EarlyStopping(monitor='accuracy', mode='min', verbose=1, patience=200)


# initialize TimeStopping callback
# https://www.tensorflow.org/addons/tutorials/time_stopping
# note that it will still run a minimum of 1 epoch
time_stopping_callback = tfa.callbacks.TimeStopping(seconds=600, verbose=1)

# We can also use CVSLogger to log information in a CSV
csvlogger = CSVLogger("logfile.csv",separator=',',append=False)


# ** IMPORTANT ** - please make sure that csvlogger is the last call back
# in the list.

my_callbacks = [time_stopping_callback,early_stopping_callback,checkpoint,csvlogger]

                                  

In [None]:
# Fitting the model with call-backs

num_epochs = 1

history=model.fit(train_data,
                  steps_per_epoch =train_steps_per_epoch, 
                  validation_data=valid_data,
                  epochs=num_epochs,
                  validation_steps=valid_steps_per_epoch,
                  callbacks=my_callbacks)


In [None]:
# Compile the model
from keras import metrics
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy',
                        metrics.AUC(name='auc'),
                        metrics.Precision(name='precision'),
                        metrics.Recall(name='recall')])

# Fitting the model with more metrics

num_epochs = 1

history=model.fit(train_data,
                  steps_per_epoch =train_steps_per_epoch, 
                  validation_data=valid_data,
                  epochs=num_epochs,
                  validation_steps=valid_steps_per_epoch,
                  callbacks=my_callbacks)

In [None]:
# Defining custom metrics to record while running
from keras import backend as K

def F1_Score(y_true, y_pred): #taken from old keras source code
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

def my_metric_fn(y_true, y_pred):
    squared_difference = tf.square(y_true - y_pred)
    return tf.reduce_mean(squared_difference, axis=-1)  # Note the `axis=-1`

In [None]:
# Compile the model
from keras import metrics
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy',
                        metrics.AUC(name='auc'),
                        metrics.Precision(name='precision'),
                        metrics.Recall(name='recall'),
                        F1_Score])

# Fitting the model with more metrics

num_epochs = 1

history=model.fit(train_data,
                  steps_per_epoch =train_steps_per_epoch, 
                  validation_data=valid_data,
                  epochs=num_epochs,
                  validation_steps=valid_steps_per_epoch,
                  callbacks=my_callbacks)

In [None]:
# Defining custom call backs

# https://www.tensorflow.org/guide/keras/custom_callback
# https://keras.io/guides/writing_your_own_callbacks/

from keras.callbacks import Callback
import time

class TimingCallback(keras.callbacks.Callback):
    def __init__(self):
        super(TimingCallback, self).__init__()
    def on_batch_begin(self, epoch, logs=None):
        self.starttime=time.time()
    def on_batch_end(self, epoch, logs=None):
        logs['epoch_time'] = (time.time()-self.starttime)
        print('\nepoch_time(sec)=',logs['epoch_time'],'\n')
        
# create an instance of the timingcallback
timing_call = TimingCallback() 

# We can also use other metrics
# https://keras.io/api/metrics/
class PrintBatchCallback(keras.callbacks.Callback):  
    def on_train_batch_end(self, batch, logs=None):
        print("For batch {}, loss is {:7.2f}.".format(batch, logs["loss"]))
        print("For batch {}, accuracy is {:7.2f}.".format(batch, logs["accuracy"]))
        print("For batch {}, AUC is {:7.2f}.".format(batch, logs["auc"]))

print_batch_call = PrintBatchCallback()

# add to the callback list
my_callbacks = [time_stopping_callback,early_stopping_callback,checkpoint,print_batch_call, timing_call, CSVLogger('new.csv', separator=',')]


In [None]:
# Compile the model
from keras import metrics
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy',
                        metrics.AUC(name='auc'),
                        metrics.Precision(name='precision'),
                        metrics.Recall(name='recall'),
                        F1_Score])

# Fitting the model with more metrics

num_epochs = 1

history=model.fit(train_data,
                  steps_per_epoch =train_steps_per_epoch, 
                  validation_data=valid_data,
                  epochs=num_epochs,
                  validation_steps=valid_steps_per_epoch,
                  callbacks=my_callbacks)

In [None]:
# https://neptune.ai/blog/keras-metrics

# How to save batch level data in a file 

import os
from keras.callbacks import Callback
import numpy as np


class SaveBatchLevelDataCallback(keras.callbacks.Callback):
    def __init__(self, validation_data, save_dir):
        super().__init__()
        self.validation_data = validation_data
        os.makedirs(save_dir, exist_ok=True)
        self.save_dir = save_dir
        self.f = None

    def on_epoch_begin(self, epoch, logs=None):
        # create a file
        self.f= open(os.path.join(self.save_dir, f'epoch_{epoch}.csv'),'w+')
        line = "batch,loss,accuracy,auc\n"
        self.f.write(line)
    
    def on_epoch_end(self, batch, logs=None):
        self.f.close()
        
    def on_train_batch_end(self, batch, logs=None):
        line = "{},{:7.2f},{:7.2f},{:7.2f}\n".format(batch, logs["loss"], logs["accuracy"],logs["auc"])
        self.f.write(line)
        
    
batch_write_cbk = SaveBatchLevelDataCallback(validation_data=valid_data,save_dir='batch_data')

# add to the callback list
my_callbacks = [time_stopping_callback,early_stopping_callback,checkpoint,batch_write_cbk, CSVLogger('new.csv', separator=',')]


In [None]:
# # Compile the model
from keras import metrics
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy',
                        metrics.AUC(name='auc'),
                        metrics.Precision(name='precision'),
                        metrics.Recall(name='recall'),
                        F1_Score])

# Fitting the model with more metrics

num_epochs = 10

history=model.fit(train_data,
                  steps_per_epoch =train_steps_per_epoch, 
                  validation_data=valid_data,
                  epochs=num_epochs,
                  validation_steps=valid_steps_per_epoch,
                  callbacks=my_callbacks)

In [None]:
# print history 
print(history.history)

In [None]:
#plot accuracy vs epoch
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validate'], loc='upper left')
plt.show()

# Plot loss values vs epoch
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validate'], loc='upper left')
plt.show()

# Plot loss values vs epoch
plt.plot(history.history['F1_Score'])
plt.plot(history.history['val_F1_Score'])
plt.title('Model F1-Score')
plt.ylabel('F1_Score')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validate'], loc='upper left')
plt.show()

# Plot accuracy vs. prevision
plt.plot(history.history['precision'],label='precision')
plt.plot(history.history['val_precision'],label='val_precision')
plt.plot(history.history['recall'],label='recall')
plt.plot(history.history['val_recall'],label='val_precision')
plt.title('Model Precision and Recall')
plt.ylabel('Precision and Recall')
plt.xlabel('Epoch')
plt.legend()
plt.show()

# Plot accuracy vs. prevision
plt.plot(history.history['precision'],history.history['recall'],'o', color='black',label='precision vs. recall')
plt.plot(history.history['recall'],history.history['val_recall'],'o', color='red',label='val_precision vs. val_recall')
plt.title('Model Precision and Recall')
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.legend()
plt.show()

# Evaluate against test data.
scores = model.evaluate(test_data, verbose=1)

print('Test loss:', scores[0])
print('Test accuracy:', scores[1])
print('Test AUC:', scores[1])
print('Test precision:', scores[1])
print('Test recall:', scores[1])
print('Test F1-Score:', scores[1])


In [None]:
# For evaluation first, we will create the actual and predicted labels
# We can then use these to generate all the reports we need.

# make predictions on the testing images, finding the index of the
# label with the corresponding largest predicted probability

predicted = model.predict(x=test_data, steps=test_steps_per_epoch)

# create predited IDs
predicted = np.argmax(predicted, axis=1)

# create test labels from the generator
actual = []
for i in range(0,int(test_steps_per_epoch)):
    actual.extend(np.array(test_data[i][1]))

# create actual IDs
actual = np.asarray(actual).argmax(axis=1)

# make sure predicted and actual are the same size and shape
print(predicted.shape)
print(actual.shape)

In [None]:
from sklearn.metrics import classification_report

print("[INFO] evaluating network...")
print(classification_report(actual, predicted))

In [None]:
# Now we can determine the confusion matrix
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(actual,predicted)

def print_cm(cm, frm, to,abs_or_relative=0):
    import seaborn as sns
    import matplotlib.pylab as plt

    cm = cm[frm:to+1,frm:to+1]
    # create labels
    x_axis_labels = np.arange(frm,to+1)
    y_axis_labels = np.arange(frm,to+1)
    
    plt.xticks(rotation=45)
    plt.yticks(rotation=-45)
    
    if(abs_or_relative==0):
        sns.heatmap(cm, annot=True,xticklabels=x_axis_labels, yticklabels=y_axis_labels)
    else:
        sns.heatmap(cm/np.sum(cm), annot=True, 
           fmt='.2%', cmap='Blues',
           xticklabels=x_axis_labels, yticklabels=y_axis_labels)

print_cm(cm,1 ,20,0)

In [None]:
# we already have actual and predicted 

# also see https://www.dlology.com/blog/simple-guide-on-how-to-generate-roc-plot-for-keras-classifier/
# for micro-average ROC curves as well

import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

fpr = dict()
tpr = dict()
roc_auc = dict()

#extract the actual labels from the test data
Y_test = []
for i in range(0,int(test_steps_per_epoch)):
    Y_test.extend(np.array(test_data[i][1]))
Y_test = np.array(Y_test)
n_classes = Y_test.shape[1]  # one hot encoded

# create actual output from the model using test_data
y_score=model.predict(x=test_data, steps=test_steps_per_epoch)

print(Y_test.shape)
print(y_score.shape)

In [None]:
print(n_classes)
# compare each class's probabilities one by one
# each acts like a single column
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(Y_test[:,i], y_score[:,i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Print the AUC scores
from IPython.display import display
import pandas as pd
auc_array = np.array(list(roc_auc.items()))
df = pd.DataFrame(auc_array[:,1])
df.columns = ['AUC']
display(df)

In [None]:
# plot the ROC for the ith class cls
import matplotlib.pyplot as plt
import os

def plot_roc(cls,roc_dir):  
    plt.plot(fpr[cls], tpr[cls], lw=2,label='ROC curve of class {0} (area = {1:0.3f})'
    ''.format(cls, roc_auc[cls]))
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC')
    plt.legend(loc="lower right")
    plt.tight_layout()
    plt.savefig(os.path.join(roc_dir, f'ROC_{cls}.png'))
    plt.show()


# make sure directory exists
def make_directory(roc_dir):
    try:
        os.mkdir(roc_dir)
    except OSError:
        print ("Creation of the directory %s failed" % roc_dir)
    else:
        print ("Successfully created the directory %s " % roc_dir)
        
# print the roc curve for 0

make_directory('rocs')

for i in range(n_classes):
    plot_roc(i,'rocs')

In [None]:
# Using tensorflow extension
# Load the TensorBoard notebook extension
%load_ext tensorboard
import datetime

In [None]:
# Define tensorboard callback

log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Using remote tensorboard
#https://blog.yyliu.net/remote-tensorboard/

In [None]:
# Compile the model
from keras import metrics
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy',
                        metrics.AUC(name='auc'),
                        metrics.Precision(name='precision'),
                        metrics.Recall(name='recall')])

# Fitting the model with more metrics

num_epochs = 10

history=model.fit(train_data,
                  steps_per_epoch =train_steps_per_epoch, 
                  validation_data=valid_data,
                  epochs=num_epochs,
                  validation_steps=valid_steps_per_epoch,
                  callbacks=[tensorboard_callback])

In [None]:
#%tensorboard --logdir logs/fit