# Assignment 7 : LSTM vs GRU Performance
create LSTM and GRU models and compare their performance in terms of training time and test accuracy using the MNIST dataset

Fiona Nicdao
COMP 487


In [5]:
import numpy as np
import tensorflow as tf
import pandas as pd
from tensorflow import  keras
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import time

## Processing the MINST Dataset
* normalized the data
* split the data into 70% training set and 30% testing set

In [6]:
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Change the data to be split into 70% training set and 30% testing set
x = np.concatenate((x_train, x_test))
y = np.concatenate((y_train, y_test))
train_size = 0.7
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=train_size,
                                                    random_state=42)

dev_size = 0.8 * x_train.shape[0]
dev_size = int(dev_size)

#shuffle the x_train (good practice)
#seed for reproducibility
indices = np.arange(x_train.shape[0])
np.random.shuffle(indices)
x_train = x_train[indices]
y_train = y_train[indices]

# plot the image
# plt.imshow(x_train[0], cmap='gray')
# plt.show()

#dividing the training dataset into 80/20 : training set/ validation set
x_val = x_train[dev_size:] #validation sets
y_val = y_train[dev_size:]

x_train = x_train[:dev_size] #training sets
y_train = y_train[:dev_size]

#preparing training data
x_train = (x_train/255.0)
x_val = (x_val/255.0)
x_test = (x_test/255.0)

#make the classes one-hot encodings
y_train = tf.keras.utils.to_categorical(y_train)
y_val = tf.keras.utils.to_categorical(y_val)
y_test = tf.keras.utils.to_categorical(y_test)

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
print(x_val.shape)
print(y_val.shape)

(39200, 28, 28)
(39200, 10)
(21000, 28, 28)
(21000, 10)
(9800, 28, 28)
(9800, 10)


## Settings
* sequence length = 28
* 1 hidden layer with 100 units
* SGD optimizer with 0.0001 learning rate
* Batch size = 32
* number of training epochs = 50
* show and compare the training time and testing accuracy for both models

In [7]:
# make a dataframe to compare the accuracy and loss for all the different architecture choices
df = pd.DataFrame(columns=[ 'Loss', 'Accuracy','Time'])
print(df)

Empty DataFrame
Columns: [Loss, Accuracy, Time]
Index: []


## LSMT

In [33]:
class LSTM(tf.keras.Model):
    def __init__(self, step_size, input_dim, rnn_units, n_layers, n_classes):
        # step_size = sequence length
        # rnn_units = hidden units
        super().__init__()
        self.rnn = tf.keras.Sequential()
        self.input_shape=(step_size, input_dim)
        if n_layers >= 2:
          for _ in range(n_layers-1):
            self.rnn.add(tf.keras.layers.LSTM(units=rnn_units,
                                    return_sequences=True, input_shape=self.input_shape)
            )
            self.rnn.add(tf.keras.layers.LSTM(units=rnn_units,
                                  return_sequences=False))
        else:
          self.rnn.add(tf.keras.layers.LSTM(units=rnn_units,
                                  return_sequences=False, input_shape=self.input_shape ))
        self.rnn.add(tf.keras.layers.Dense(units=n_classes, activation='softmax'))

    def call(self, x):
        for i in self.rnn.layers:
          x = i(x)
        return x

    def build_model(self, input_shape):
        x = tf.keras.layers.Input(shape=input_shape)
        return tf.keras.Model(inputs=[x], outputs=self.call(x))

    def summary(self):
        return self.rnn.summary()

In [34]:
lstm = LSTM(28,28,100,1,10)
lstm.summary()

  super().__init__(**kwargs)


### Runing the LSTM Model

In [35]:
optim = tf.keras.optimizers.Adam(learning_rate=0.0001)
lstm.compile(optimizer=optim, loss='categorical_crossentropy', metrics=['accuracy'])
start_time = time.time()
lstm.fit(x_train, y_train, epochs=50, batch_size=32, validation_data=(x_val, y_val))
end_time = time.time()-start_time

Epoch 1/50
[1m1225/1225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 6ms/step - accuracy: 0.4337 - loss: 1.6614 - val_accuracy: 0.8268 - val_loss: 0.5746
Epoch 2/50
[1m1225/1225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - accuracy: 0.8536 - loss: 0.4865 - val_accuracy: 0.8998 - val_loss: 0.3439
Epoch 3/50
[1m1225/1225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.9080 - loss: 0.3128 - val_accuracy: 0.9237 - val_loss: 0.2571
Epoch 4/50
[1m1225/1225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9280 - loss: 0.2424 - val_accuracy: 0.9307 - val_loss: 0.2334
Epoch 5/50
[1m1225/1225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.9396 - loss: 0.2038 - val_accuracy: 0.9340 - val_loss: 0.2196
Epoch 6/50
[1m1225/1225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.9488 - loss: 0.1720 - val_accuracy: 0.9433 - val_loss: 0.1852
Epoch 7/50
[1m

In [36]:
results = lstm.evaluate(x_test, y_test)

[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9784 - loss: 0.0737


In [37]:
df.loc[0] = [results[0], results[1], end_time]
print(df)

       Loss  Accuracy        Time
0  0.072440  0.980190  431.507013
1  2.076673  0.308905  407.775287


## GRU

In [38]:
class GRU(tf.keras.Model):
    def __init__(self, step_size, input_dim, rnn_units, n_layers, n_classes):
        super().__init__()
        self.rnn = tf.keras.Sequential()
        self.input_shape=(step_size, input_dim)
        if n_layers >= 2:
          for _ in range(n_layers-1):
              self.rnn.add(tf.keras.layers.GRU(units=rnn_units,
                                      return_sequences=True, input_shape=self.input_shape)
              )
              self.rnn.add(tf.keras.layers.GRU(units=rnn_units,
                                    return_sequences=False))
        else :
          self.rnn.add(tf.keras.layers.GRU(units=rnn_units,
                                  return_sequences=False, input_shape=self.input_shape ))
        self.rnn.add(tf.keras.layers.Dense(units=n_classes, activation='softmax'))

    def call(self, x):

        for i in self.rnn.layers:
          x = i(x)

        return x

    def build_model(self, input_shape):
        x = tf.keras.layers.Input(shape=input_shape)
        return tf.keras.Model(inputs=[x], outputs=self.call(x))

    def summary(self):
        return self.rnn.summary()


In [39]:
gru = GRU(28, 28, 100, 1, 10)
gru.summary()

  super().__init__(**kwargs)


In [42]:
optim = tf.keras.optimizers.Adam(learning_rate=0.0001)
gru.compile(optimizer=optim, loss='categorical_crossentropy', metrics=['accuracy'])
start_time = time.time()
gru.fit(x_train, y_train, epochs=50, batch_size=32, validation_data=(x_val, y_val))
end_time = time.time()-start_time

Epoch 1/50
[1m1225/1225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 6ms/step - accuracy: 0.7669 - loss: 0.7071 - val_accuracy: 0.8345 - val_loss: 0.5347
Epoch 2/50
[1m1225/1225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.8505 - loss: 0.4835 - val_accuracy: 0.8847 - val_loss: 0.4046
Epoch 3/50
[1m1225/1225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.8897 - loss: 0.3665 - val_accuracy: 0.9036 - val_loss: 0.3287
Epoch 4/50
[1m1225/1225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.9126 - loss: 0.2954 - val_accuracy: 0.9181 - val_loss: 0.2791
Epoch 5/50
[1m1225/1225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.9234 - loss: 0.2560 - val_accuracy: 0.9277 - val_loss: 0.2447
Epoch 6/50
[1m1225/1225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.9346 - loss: 0.2201 - val_accuracy: 0.9343 - val_loss: 0.2198
Epoch 7/50
[1

In [43]:
results = lstm.evaluate(x_test, y_test)

[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9784 - loss: 0.0737


In [45]:
df.loc[1] = [results[0], results[1], end_time]
print(df)

      Loss  Accuracy        Time
0  0.07244   0.98019  431.507013
1  0.07244   0.98019  430.053378


# Results : low accuracy for both LSTM and GRU and similary accuracy, loss and time. Using SGD

In [32]:
print(df)

       Loss  Accuracy        Time
0  2.035379  0.295810  425.732425
1  2.076673  0.308905  407.775287


# using Adam

In [46]:
print(df)

      Loss  Accuracy        Time
0  0.07244   0.98019  431.507013
1  0.07244   0.98019  430.053378
