<a href="https://colab.research.google.com/github/grniemeyer/IANNwTF_hw2/blob/main/2ndtrial_hw2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow_datasets as tfds
from tensorflow.keras.layers import Dense
from sklearn.model_selection import cross_val_score, StratifiedKFold
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np




In [19]:
(train_ds, test_ds), ds_info = tfds.load ('mnist', split =['train', 'test'], as_supervised = True, with_info = True)
#tfds.show_examples(train_ds, ds_info)

def prepare_data(ds):
    """
    Preprocess a TensorFlow dataset for training.
    """
    # Reshape images to a flat vector
    ds = ds.map(lambda image, label: (tf.reshape(image, (-1, )),label))
    # Normalize image data to a range between -1 and 1
    ds = ds.map(lambda image, label: ((tf.cast(image, tf.float32)/128.)-1., label))
    # Convert labels to one-hot encoding with a depth of 10
    ds = ds.map(lambda image, label: (image, tf.one_hot(label, depth=10)))
    # Shuffle the dataset with a buffer size of 1024, batch into groups of 4
    ds = ds.shuffle(1024).batch(32)
    # Pre-load the batch
    ds = ds.prefetch(4)
    return ds

train_dataset = train_ds.apply(prepare_data)
test_dataset = test_ds.apply(prepare_data)

In [20]:
class MLP_Model(tf.keras.Model):
    """
    Multi-Layer Perceptron for classification.
    """
    def __init__(self, layer_sizes, output_size=10):
      super(MLP_Model, self).__init__() # Initialize the parent class
      self.mlp_layers = []
      # Layer sizes [, , , ] -- list of integers
      # Create hidden Dense layers with sigmoid activation
      for layer_size in layer_sizes:
        new_layer = tf.keras.layers.Dense(layer_size, 'sigmoid')
        self.mlp_layers.append(new_layer)
      self.output_layer = tf.keras.layers.Dense(output_size, 'softmax')

    def call(self, x):
        """
          Forward pass through the MLP model.
        """
        for layer in self.mlp_layers:
          x = layer(x) # Pass the input through each hidden layer
        y = self.output_layer(x)
        return y

In [21]:
def train(n_epochs, model, train_dataset, test_dataset, loss_function, optimizer):
  """
  Train and test a MLP.
  """
  # Lists to store training and testing metrics across epochs
  train_losses = []
  train_accuracies = []
  test_accuracies = []
  test_losses = []

  for epoch in range(n_epochs):
        # Lists to aggregate losses and accuracies for each batch in the training dataset
        epoch_loss_agg = []
        epoch_accuracy_agg = []
        for x, target in train_dataset:
            with tf.GradientTape() as tape:
                pred = model(x)
                loss = loss_function(target, pred)
                epoch_loss_agg.append(loss)

            # Compute gradients and update model weights
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))

            accuracy_tr = tf.keras.metrics.Accuracy()
            accuracy_tr.update_state(tf.argmax(target, axis=1), tf.argmax(pred, axis=1))
            epoch_accuracy_agg.append(accuracy_tr.result().numpy())

        train_accuracies.append(tf.reduce_mean(epoch_accuracy_agg))
        train_losses.append(tf.reduce_mean(epoch_loss_agg))

         # Lists to aggregate losses and accuracies for each batch in the testing dataset
        test_accuracy_aggregator = []
        test_loss_aggregator = []
        for x, target in test_dataset:
            pred = model(x)
            test_loss = loss_function(target, pred)

            accuracy_te = tf.keras.metrics.Accuracy()
            accuracy_te.update_state(tf.argmax(target, axis=1), tf.argmax(pred, axis=1))
            test_accuracy_aggregator.append(accuracy_te.result().numpy())

            test_loss_aggregator.append(test_loss)

        test_accuracies.append(tf.reduce_mean(test_accuracy_aggregator))
        test_losses.append(tf.reduce_mean(test_loss_aggregator))

        print(f'Epoch: {epoch}, Test Accuracy: {test_accuracies[-1]}, Test Loss: {test_losses[-1]}')
  return train_losses, train_accuracies, test_losses, test_accuracies

In [23]:
train_dataset = train_dataset.take(1000)
test_dataset = test_dataset.take(1000)

model = MLP_Model(layer_sizes = [256, 256])
cce = tf.keras.losses.CategoricalCrossentropy() # Define the Categorical Crossentropy loss function
optimizer = tf.keras.optimizers.SGD(0.001) # Define the Stochastic Gradient Descent optimizer with a learning rate of 0.001
training = train(5, model, train_dataset, test_dataset, cce, optimizer)

Epoch: 0, Test Accuracy: 0.21106229722499847, Test Loss: 2.260310173034668
Epoch: 1, Test Accuracy: 0.3029153347015381, Test Loss: 2.2266860008239746
Epoch: 2, Test Accuracy: 0.444888174533844, Test Loss: 2.1909801959991455
Epoch: 3, Test Accuracy: 0.5446286201477051, Test Loss: 2.152503728866577
Epoch: 4, Test Accuracy: 0.5709864497184753, Test Loss: 2.1096558570861816


In [None]:
def visualization(train_losses, train_accuracies, test_losses, test_accuracies):
    """
    Visualizes accuracy and loss for training and test data using
    the mean of each epoch.
    Loss is displayed in a regular line, accuracy in a dotted line.
    Training data is displayed in blue, test data in red.
    """
    # Create a new figure for plotting
    plt.figure()
    # Plot training and test losses with blue and red lines
    line1, = plt.plot(train_losses, "b-")
    line2, = plt.plot(test_losses, "r-")
    # Plot training and test accuracies with blue and red dotted lines
    line3, = plt.plot(train_accuracies, "b:")
    line4, = plt.plot(test_accuracies, "r:")
    # Set the labels for the x and y-axes
    plt.xlabel("Training steps")
    plt.ylabel("Loss / Accuracy")
    # Create a legend for the plot
    plt.legend((line1, line2, line3, line4), ("training loss", "test loss", "train accuracy", "test accuracy"))
    # Display the plot
    plt.show()

train_losses, train_accuracies, test_losses, test_accuracies = training
visualization(train_losses, test_losses, train_accuracies, test_accuracies)

In [None]:
# Perform a grid search
layer_sizes_options = [[64, 32], [128, 64, 32], [256, 128, 64]]

Adam = tf.keras.optimizers.legacy.Adam(0.001)
SGD = tf.keras.optimizers.legacy.SGD(0.001)
RMSprop = tf.keras.optimizers.legacy.SGD(0.001)

optimizer_options = [Adam, SGD, RMSprop]
n_epochs_options = [6, 10, 14]


best_params = None
best_accuracy = 0.0

for layer_sizes in layer_sizes_options:
    for optimizer in optimizer_options:
        for n_epochs in n_epochs_options:
            model = MLP_Model(layer_sizes=layer_sizes)

            optimizer_instance = optimizer

            train_losses, train_accuracies, test_losses, test_accuracies = train(n_epochs=n_epochs, model=model, train_dataset=train_dataset, test_dataset=test_dataset, loss_function = cce, optimizer=optimizer_instance)

            mean_accuracy = np.mean(test_accuracies)

In [None]:
### 4 deviations and their interpretations

adjustable parameters:
* learning rate,
* batch size,
* number and size of layers of model
* optimizer (in SGD’s case the momentum hyperparameter)



- lr: 0.001, batch size:4, shape: [256, 256], 5 epochs, SGD
-- Epoch: 0, Test Accuracy: 0.11124999821186066, Test Loss: 2.2863993644714355
-- Epoch: 1, Test Accuracy: 0.2567499876022339, Test Loss: 2.2502074241638184
-- Epoch: 2, Test Accuracy: 0.46674999594688416, Test Loss: 2.2150402069091797
-- Epoch: 3, Test Accuracy: 0.476500004529953, Test Loss: 2.177403211593628
-- Epoch: 4, Test Accuracy: 0.3177500069141388, Test Loss: 2.141585350036621

- evth. equal but lr: 0.01
-- Epoch: 0, Test Accuracy: 0.49924999475479126, Test Loss: 1.9054900407791138
-- Epoch: 1, Test Accuracy: 0.7622500061988831, Test Loss: 1.143931269645691
-- Epoch: 2, Test Accuracy: 0.8075000047683716, Test Loss: 0.7798288464546204
-- Epoch: 3, Test Accuracy: 0.8364999890327454, Test Loss: 0.6070896983146667
-- Epoch: 4, Test Accuracy: 0.8510000109672546, Test Loss: 0.5335938930511475

->  with higher lr, NN learned fast espeacially in the beginning, and achieved much higher accuacies in the same amout of epochs. That is because it takes bigger steps in each iteration. However this can also lead to unwanted/unpredictable chnages in the weights as visible in the next example with an even higher lr
- evth. equal again except lr: 0.5
-- Epoch: 0, Test Accuracy: 0.7537500262260437, Test Loss: 0.7244381308555603
-- Epoch: 1, Test Accuracy: 0.8299999833106995, Test Loss: 0.5498851537704468
-- Epoch: 2, Test Accuracy: 0.8142499923706055, Test Loss: 0.491791695356369
-- Epoch: 3, Test Accuracy: 0.875, Test Loss: 0.4159727096557617
-- Epoch: 4, Test Accuracy: 0.7404999732971191, Test Loss: 0.7665658593177795
->  here we see that the model learns extremly quickly however the accuracy goes down again after 2 epochs already and it can happen that it just misses the minimum of the loss function... or it can be stuck easily in local minima. Besides it is not really a tractable learning since the weights (can) get updated so much in every iteration. Plus everything else is impossible to fine-tune since the lr's impact overshadows all other parameters

- lr: 0.001, batch size:4, shape: [256, 256], 5 epochs, SGD
-- Epoch: 0, Test Accuracy: 0.11124999821186066, Test Loss: 2.2863993644714355
-- Epoch: 1, Test Accuracy: 0.2567499876022339, Test Loss: 2.2502074241638184
-- Epoch: 2, Test Accuracy: 0.46674999594688416, Test Loss: 2.2150402069091797
-- Epoch: 3, Test Accuracy: 0.476500004529953, Test Loss: 2.177403211593628
-- Epoch: 4, Test Accuracy: 0.3177500069141388, Test Loss: 2.141585350036621
-vs. evth. equal but batchsize: 8
-- Epoch: 0, Test Accuracy: 0.18975000083446503, Test Loss: 2.2664546966552734
-- Epoch: 1, Test Accuracy: 0.29624998569488525, Test Loss: 2.2294161319732666
-- Epoch: 2, Test Accuracy: 0.36412501335144043, Test Loss: 2.1946041584014893
-- Epoch: 3, Test Accuracy: 0.5042499899864197, Test Loss: 2.1515777111053467
-- Epoch: 4, Test Accuracy: 0.5975000262260437, Test Loss: 2.106874465942383
->  we can see that the model reaches better and more straightforward(?) results (even though we might probably need more epochs to be more precise in our claim, let's see how that looks like with an even bigger batchsize:32
-- Epoch: 0, Test Accuracy: 0.21106229722499847, Test Loss: 2.260310173034668
-- Epoch: 1, Test Accuracy: 0.3029153347015381, Test Loss: 2.2266860008239746
-- Epoch: 2, Test Accuracy: 0.444888174533844, Test Loss: 2.1909801959991455
-- Epoch: 3, Test Accuracy: 0.5446286201477051, Test Loss: 2.152503728866577
-- Epoch: 4, Test Accuracy: 0.5709864497184753, Test Loss: 2.1096558570861816
->  okay that does not look like as if the outcome it so much better with a bigger batch size maybe only a bit quicker in hte beginning, maybe in this case a higher learning rate would also fit better.
