Preparing train_data

In [None]:
#Data prep
import tensorflow_datasets as tfds
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy

# Training input pipeline, written as a single chained tf.data expression.
train_ds = (
    tfds.load('mnist', split='train')
    # unpack every feature dict into an (image, label) pair
    .map(lambda sample: (sample['image'], sample['label']))
    # flatten each image tensor into one 1-D vector
    .map(lambda img, lbl: (tf.reshape(img, (-1,)), lbl))
    # cast to float32 and rescale the pixel values via x / 128 - 1
    .map(lambda img, lbl: ((tf.cast(img, tf.float32) / 128) - 1, lbl))
    # one-hot encode the label over 10 classes
    .map(lambda img, lbl: (img, tf.one_hot(lbl, depth=10)))
    # shuffle with a buffer of 1024 elements, then group into batches of 64
    .shuffle(1024)
    .batch(64)
    # keep up to 4 batches prefetched so that data preparation can overlap with training
    .prefetch(4)
)

Preparing test_data

In [None]:
# Test input pipeline: identical preprocessing to the training data, applied to the 'test' split.
test_ds = (
    tfds.load('mnist', split='test')
    # unpack every feature dict into an (image, label) pair
    .map(lambda sample: (sample['image'], sample['label']))
    # flatten each image tensor into one 1-D vector
    .map(lambda img, lbl: (tf.reshape(img, (-1,)), lbl))
    # cast to float32 and rescale the pixel values via x / 128 - 1
    .map(lambda img, lbl: ((tf.cast(img, tf.float32) / 128) - 1, lbl))
    # one-hot encode the label over 10 classes
    .map(lambda img, lbl: (img, tf.one_hot(lbl, depth=10)))
    # shuffle with a buffer of 1024 elements, then group into batches of 64
    .shuffle(1024)
    .batch(64)
    # keep up to 4 batches prefetched so that data preparation can overlap with evaluation
    .prefetch(4)
)

Building Model

In [None]:
# Model Creation via Subclassing from tf.keras.Model
class MLP_Model(tf.keras.Model):
  """Multi-layer perceptron: sigmoid hidden layers followed by a softmax output layer."""

  def __init__(self, layer_sizes, output_size=10):
    """Create the hidden layers and the output layer.

    Args:
      layer_sizes: iterable with the number of units of each hidden layer,
        e.g. [256, 256] for two hidden layers of 256 units.
      output_size: number of units of the softmax output layer.
    """
    super().__init__()
    # one sigmoid Dense layer per entry, created in the order given
    self.mlp_layers = [
        tf.keras.layers.Dense(units=n_units, activation='sigmoid')
        for n_units in layer_sizes
    ]
    self.output_layer = tf.keras.layers.Dense(units=output_size, activation='softmax')

  def call(self, x):
    """Feed x through all hidden layers in order and return the output layer's result."""
    hidden = x
    for dense in self.mlp_layers:
      hidden = dense(hidden)
    return self.output_layer(hidden)

Accuracy

In [None]:
def accuracy_calc(y_true, y_pred):
  """Return the fraction of rows whose argmax along axis 1 matches in y_true and y_pred.

  The result is a scalar float32 tensor.
  """
  true_idx = tf.argmax(y_true, axis=1)
  pred_idx = tf.argmax(y_pred, axis=1)
  # 1.0 where the indices agree, 0.0 otherwise
  hits = tf.cast(tf.equal(true_idx, pred_idx), 'float32')
  return tf.reduce_mean(hits)

Visualization

In [None]:
def visualization(train_losses, train_accuracies, test_losses, test_accuracies):
  """Plot training/test loss and accuracy curves in one figure and show it.

  Each argument is a sequence of values; every sequence is drawn against its
  own index. Losses are drawn as solid lines and accuracies as dotted lines,
  with training values in blue and test values in red.

  Args:
    train_losses: loss values measured on the training data.
    train_accuracies: accuracy values measured on the training data.
    test_losses: loss values measured on the test data.
    test_accuracies: accuracy values measured on the test data.
  """
  plt.figure()
  line1, = plt.plot(train_losses, "b-")
  line2, = plt.plot(test_losses, "r-")
  line3, = plt.plot(train_accuracies, "b:")
  line4, = plt.plot(test_accuracies, "r:")
  plt.xlabel("Training steps")
  plt.ylabel("Loss/Accuracy")
  plt.legend((line1, line2, line3, line4), ("training loss", "test loss", "train accuracy", "test accuracy"))
  # plt.show must be *called*; the bare attribute reference was a no-op
  plt.show()

Training, Visualization of basic Model

In [None]:
# Training of the basic model: two hidden layers of 256 units, SGD with learning rate 0.01
EPOCHS = 10

model = MLP_Model(layer_sizes=[256, 256])
cce = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.legacy.SGD(learning_rate=0.01)

# per-epoch averages, one entry per epoch
total_train_losses = []
total_train_accuracy = []
total_test_losses = []
total_test_accuracy = []

for epoch in range(EPOCHS):
  # per-batch values of the current epoch
  train_losses = []
  train_accuracy = []
  test_losses = []
  test_accuracy = []

  for x, target in train_ds:
    # only the forward pass and the loss have to be recorded on the tape
    with tf.GradientTape() as tape:
      pred = model(x)
      loss = cce(target, pred)
    # The gradient is computed after leaving the tape context so that the
    # gradient computation itself is not recorded. It is taken w.r.t. the
    # trainable variables only, i.e. exactly the ones the optimizer updates.
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_losses.append(loss.numpy())
    # store plain numpy values (like the loss) instead of tensors
    train_accuracy_val = accuracy_calc(target, pred).numpy()
    train_accuracy.append(train_accuracy_val)

  # evaluation on the test data: forward pass only, no weight update
  for x, target in test_ds:
    pred = model(x)
    loss = cce(target, pred)
    test_losses.append(loss.numpy())
    test_accuracy_val = accuracy_calc(target, pred).numpy()
    test_accuracy.append(test_accuracy_val)

  # average the per-batch values of this epoch
  total_train_losses.append(numpy.mean(train_losses))
  total_train_accuracy.append(numpy.mean(train_accuracy))
  total_test_losses.append(numpy.mean(test_losses))
  total_test_accuracy.append(numpy.mean(test_accuracy))

# visualization
visualization(total_train_losses, total_train_accuracy, total_test_losses, total_test_accuracy)

Modifying different Models

Varying learning_rate (from 0.01 to 0.1)

In [None]:
# varying learning rate: same model as before, SGD learning rate raised to 0.1
EPOCHS = 10

model = MLP_Model(layer_sizes=[256, 256])
cce = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.legacy.SGD(learning_rate=0.1)

# per-epoch averages, one entry per epoch
total_train_losses = []
total_train_accuracy = []
total_test_losses = []
total_test_accuracy = []

for epoch in range(EPOCHS):
  # per-batch values of the current epoch
  train_losses = []
  train_accuracy = []
  test_losses = []
  test_accuracy = []

  for x, target in train_ds:
    # only the forward pass and the loss have to be recorded on the tape
    with tf.GradientTape() as tape:
      pred = model(x)
      loss = cce(target, pred)
    # The gradient is computed after leaving the tape context so that the
    # gradient computation itself is not recorded. It is taken w.r.t. the
    # trainable variables only, i.e. exactly the ones the optimizer updates.
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_losses.append(loss.numpy())
    # store plain numpy values (like the loss) instead of tensors
    train_accuracy_val = accuracy_calc(target, pred).numpy()
    train_accuracy.append(train_accuracy_val)

  # evaluation on the test data: forward pass only, no weight update
  for x, target in test_ds:
    pred = model(x)
    loss = cce(target, pred)
    test_losses.append(loss.numpy())
    test_accuracy_val = accuracy_calc(target, pred).numpy()
    test_accuracy.append(test_accuracy_val)

  # average the per-batch values of this epoch
  total_train_losses.append(numpy.mean(train_losses))
  total_train_accuracy.append(numpy.mean(train_accuracy))
  total_test_losses.append(numpy.mean(test_losses))
  total_test_accuracy.append(numpy.mean(test_accuracy))

# visualization
visualization(total_train_losses, total_train_accuracy, total_test_losses, total_test_accuracy)

A slightly higher learning rate seems to improve our model (lower starting loss, higher overall accuracy). Based on prior knowledge, this only holds up to a certain value: if the learning rate is too high, each update changes the weights so strongly that previously learned information is not preserved as much as it should be.

Varying amount of layers

In [None]:
# varying layers: four hidden layers of 256 units instead of two
import numpy  # redundant re-import (numpy is already imported in the first cell); kept so this cell runs as before
EPOCHS = 10

model = MLP_Model(layer_sizes=[256, 256, 256, 256])
cce = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.legacy.SGD(learning_rate=0.01)

# per-epoch averages, one entry per epoch
total_train_losses = []
total_train_accuracy = []
total_test_losses = []
total_test_accuracy = []

for epoch in range(EPOCHS):
  # per-batch values of the current epoch
  train_losses = []
  train_accuracy = []
  test_losses = []
  test_accuracy = []

  for x, target in train_ds:
    # only the forward pass and the loss have to be recorded on the tape
    with tf.GradientTape() as tape:
      pred = model(x)
      loss = cce(target, pred)
    # The gradient is computed after leaving the tape context so that the
    # gradient computation itself is not recorded. It is taken w.r.t. the
    # trainable variables only, i.e. exactly the ones the optimizer updates.
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_losses.append(loss.numpy())
    # store plain numpy values (like the loss) instead of tensors
    train_accuracy_val = accuracy_calc(target, pred).numpy()
    train_accuracy.append(train_accuracy_val)

  # evaluation on the test data: forward pass only, no weight update
  for x, target in test_ds:
    pred = model(x)
    loss = cce(target, pred)
    test_losses.append(loss.numpy())
    test_accuracy_val = accuracy_calc(target, pred).numpy()
    test_accuracy.append(test_accuracy_val)

  # average the per-batch values of this epoch
  total_train_losses.append(numpy.mean(train_losses))
  total_train_accuracy.append(numpy.mean(train_accuracy))
  total_test_losses.append(numpy.mean(test_losses))
  total_test_accuracy.append(numpy.mean(test_accuracy))

# visualization
visualization(total_train_losses, total_train_accuracy, total_test_losses, total_test_accuracy)

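Expanding the model with two more hidden layers (four instead of two) does not make it more accurate. It rather seems to be the other way around: in this configuration the model is no longer usable. A likely reason is that the gradients shrink on their way back through the additional sigmoid layers, so the model learns very slowly with this learning rate.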

Varying Model size

In [None]:
# varying model size: two hidden layers with only 10 units each
EPOCHS = 10

model = MLP_Model(layer_sizes=[10, 10])
cce = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.legacy.SGD(learning_rate=0.01)

# per-epoch averages, one entry per epoch
total_train_losses = []
total_train_accuracy = []
total_test_losses = []
total_test_accuracy = []

for epoch in range(EPOCHS):
  # per-batch values of the current epoch
  train_losses = []
  train_accuracy = []
  test_losses = []
  test_accuracy = []

  for x, target in train_ds:
    # only the forward pass and the loss have to be recorded on the tape
    with tf.GradientTape() as tape:
      pred = model(x)
      loss = cce(target, pred)
    # The gradient is computed after leaving the tape context so that the
    # gradient computation itself is not recorded. It is taken w.r.t. the
    # trainable variables only, i.e. exactly the ones the optimizer updates.
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_losses.append(loss.numpy())
    # store plain numpy values (like the loss) instead of tensors
    train_accuracy_val = accuracy_calc(target, pred).numpy()
    train_accuracy.append(train_accuracy_val)

  # evaluation on the test data: forward pass only, no weight update
  for x, target in test_ds:
    pred = model(x)
    loss = cce(target, pred)
    test_losses.append(loss.numpy())
    test_accuracy_val = accuracy_calc(target, pred).numpy()
    test_accuracy.append(test_accuracy_val)

  # average the per-batch values of this epoch
  total_train_losses.append(numpy.mean(train_losses))
  total_train_accuracy.append(numpy.mean(train_accuracy))
  total_test_losses.append(numpy.mean(test_losses))
  total_test_accuracy.append(numpy.mean(test_accuracy))

# visualization
visualization(total_train_losses, total_train_accuracy, total_test_losses, total_test_accuracy)

Cutting down the layer size drastically shows that the model still improves over time, but it needs far more epochs than a model with more perceptrons per layer.

Varying batch size

Note: we need to prepare the whole dataset again for this, since the batch size is set during the data preparation step.

In [None]:
# Re-prepare train and test data to change the batch size from 64 to 10.
def _prepare_mnist(split, batch_size):
  """Load one MNIST split and return it as a shuffled, batched, prefetched dataset.

  Args:
    split: name of the tfds split to load, e.g. 'train' or 'test'.
    batch_size: number of examples per batch.

  Returns:
    A dataset yielding (flattened float32 image, one-hot label) batches.
  """
  ds = tfds.load('mnist', split=split)
  # extracting image and labels
  ds = ds.map(lambda feature_dict: (feature_dict['image'], feature_dict['label']))
  # flatten each image into one vector
  ds = ds.map(lambda image, label: (tf.reshape(image, (-1,)), label))
  # cast to float32 and rescale the pixel values via x / 128 - 1
  ds = ds.map(lambda image, label: ((tf.cast(image, tf.float32) / 128) - 1, label))
  # one-hot encode the label over 10 classes
  ds = ds.map(lambda image, label: (image, tf.one_hot(label, depth=10)))
  # shuffle with a buffer of 1024 elements, then group into batches of `batch_size`
  ds = ds.shuffle(1024).batch(batch_size)
  # keep up to 4 batches prefetched so that data preparation can overlap with the model's work
  return ds.prefetch(4)

train_ds = _prepare_mnist('train', batch_size=10)
test_ds = _prepare_mnist('test', batch_size=10)

In [None]:
# Training of the basic model on the re-prepared data (batch size 10)
EPOCHS = 10

model = MLP_Model(layer_sizes=[256, 256])
cce = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.legacy.SGD(learning_rate=0.01)

# per-epoch averages, one entry per epoch
total_train_losses = []
total_train_accuracy = []
total_test_losses = []
total_test_accuracy = []

for epoch in range(EPOCHS):
  # per-batch values of the current epoch
  train_losses = []
  train_accuracy = []
  test_losses = []
  test_accuracy = []

  for x, target in train_ds:
    # only the forward pass and the loss have to be recorded on the tape
    with tf.GradientTape() as tape:
      pred = model(x)
      loss = cce(target, pred)
    # The gradient is computed after leaving the tape context so that the
    # gradient computation itself is not recorded. It is taken w.r.t. the
    # trainable variables only, i.e. exactly the ones the optimizer updates.
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_losses.append(loss.numpy())
    # store plain numpy values (like the loss) instead of tensors
    train_accuracy_val = accuracy_calc(target, pred).numpy()
    train_accuracy.append(train_accuracy_val)

  # evaluation on the test data: forward pass only, no weight update
  for x, target in test_ds:
    pred = model(x)
    loss = cce(target, pred)
    test_losses.append(loss.numpy())
    test_accuracy_val = accuracy_calc(target, pred).numpy()
    test_accuracy.append(test_accuracy_val)

  # average the per-batch values of this epoch
  total_train_losses.append(numpy.mean(train_losses))
  total_train_accuracy.append(numpy.mean(train_accuracy))
  total_test_losses.append(numpy.mean(test_losses))
  total_test_accuracy.append(numpy.mean(test_accuracy))

# visualization
visualization(total_train_losses, total_train_accuracy, total_test_losses, total_test_accuracy)

A smaller batch size results in a somewhat higher accuracy and a lower loss. The price we pay for this better result is a longer computing time than with bigger batches. This may not be too obvious with this dataset, but it will become clearer with huge amounts of data.