In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import datetime
import tensorflow_datasets as tfds

In [None]:
# Load MNIST as (image, label) pairs; the result is indexed in the order of
# `split`, i.e. mnist[0] is the training split and mnist[1] the test split.
mnist = tfds.load(
    'mnist',
    split=['train', 'test'],
    as_supervised=True,
)

def preprocess_data(mnist, batch_size, sequence_length):
  """Turn a dataset of single MNIST digits into batches of digit sequences.

  Consecutive (shuffled) digits are grouped into sequences of
  `sequence_length` images. The regression target at step t of a sequence is
  the alternating cumulative sum of the digit labels seen so far:
  d0, d0 - d1, d0 - d1 + d2, ...

  Args:
    mnist: tf.data.Dataset yielding (image, label) pairs.
    batch_size: number of sequences per batch.
    sequence_length: number of digits per sequence.

  Returns:
    A tf.data.Dataset yielding (images, targets), both float32, where images
    has shape (batch, sequence_length) + image_shape and targets has shape
    (batch, sequence_length). The last batch may hold fewer than
    `batch_size` sequences.
  """
  # convert images from uint8 to float32; labels become float32 as well so
  # they can be negated and used directly as regression targets
  mnist = mnist.map(lambda img, target: (tf.cast(img, tf.float32), tf.cast(target, tf.float32)))
  # sloppy input normalization, bringing image values from [0, 255] to roughly [-1, 1]
  mnist = mnist.map(lambda img, target: ((img/128.)-1., target))

  # cache this progress in memory, as there is no need to redo it; it is deterministic after all
  mnist = mnist.cache()
  # shuffle single digits *before* grouping so every epoch sees new sequences
  mnist = mnist.shuffle(1000)

  # group digits into sequences; an incomplete trailing sequence is dropped
  # so the time axis always has exactly `sequence_length` entries
  mnist = mnist.batch(sequence_length, drop_remainder=True)

  # +1 for even positions in the sequence, -1 for odd positions
  signs = tf.where(tf.math.floormod(tf.range(sequence_length), 2) == 0, 1., -1.)

  # targets are computed inside the pipeline, per sequence, so they always
  # belong to the images they are paired with
  mnist = mnist.map(lambda imgs, targets: (imgs, tf.math.cumsum(targets * signs)))

  # batch sequences, prefetch
  mnist = mnist.batch(batch_size)
  mnist = mnist.prefetch(20)

  # return preprocessed dataset
  return mnist

In [None]:
class LSTMCell(tf.keras.layers.AbstractRNNCell):
    """A single LSTM step built from four dense gate layers.

    The cell is driven manually: `call` receives the input of one time step
    together with the previous `(cell_state, hidden_state)` pair and returns
    the updated pair in the same order.
    """

    def __init__(self, num_units, **kwargs):
        """Create the gate layers.

        Args:
            num_units: width of both the hidden state and the cell state.
            **kwargs: forwarded to the Keras base layer (e.g. `dtype`).
        """
        super().__init__(**kwargs)

        self.num_units = num_units
        # Sizes (not tensors) of the two recurrent states.
        self.hidden_state = num_units
        self.cell_state = num_units
        self.states = [self.hidden_state, self.cell_state]

        def gate(activation):
            # Every gate is a dense layer over [input, previous hidden state].
            return tf.keras.layers.Dense(
                num_units,
                kernel_initializer=tf.keras.initializers.Orthogonal(gain=1.0, seed=None),
                activation=activation)

        # forget gate: how much of the old cell state is kept
        self.forget_gate = gate(tf.nn.sigmoid)
        # input gate: how much of the candidate values is written
        self.input_gate = gate(tf.nn.sigmoid)
        # candidate values for the new cell state
        self.candidate_gate = gate(tf.nn.tanh)
        # output gate: how much of the cell state is exposed as hidden state
        self.output_gate = gate(tf.nn.sigmoid)

        # layer normalization for trainability
        # NOTE(review): these two layers are created but `call` does not apply them.
        self.layer_norm_h = tf.keras.layers.LayerNormalization()
        self.layer_norm_c = tf.keras.layers.LayerNormalization()

    @property
    def state_size(self):
        """Shapes (without batch axis) of the states, ordered (cell_state, hidden_state)."""
        return [tf.TensorShape([self.num_units]), tf.TensorShape([self.num_units])]

    @property
    def output_size(self):
        """Shape (without batch axis) of the hidden state."""
        return tf.TensorShape([self.num_units])

    def get_initial_state(self, inputs=None, batch_size=None, dtype=None):
        """Return all-zero `(cell_state, hidden_state)` of shape (batch, num_units).

        Args:
            inputs: optional tensor whose first axis is the batch axis; only
                used to infer the batch size when `batch_size` is None.
            batch_size: number of rows of the state tensors.
            dtype: dtype of the state tensors; defaults to float32.

        Raises:
            ValueError: if neither `inputs` nor `batch_size` is given.
        """
        if batch_size is None:
            if inputs is None:
                raise ValueError("get_initial_state needs either `inputs` or `batch_size`.")
            batch_size = tf.shape(inputs)[0]
        if dtype is None:
            dtype = tf.float32
        state_shape = (batch_size, self.num_units)
        return (tf.zeros(state_shape, dtype=dtype),
                tf.zeros(state_shape, dtype=dtype))

    def call(self, input, states):
        """Advance the cell by one time step.

        Args:
            input: tensor of shape (batch, features) for the current step.
            states: pair `(cell_state, hidden_state)` from the previous step,
                each of shape (batch, num_units).

        Returns:
            The updated `(cell_state, hidden_state)` pair.
        """
        # Work on locals only: storing step tensors on `self` would clobber
        # the size attributes set in __init__.
        cell_state, hidden_state = states
        drive = tf.concat((input, hidden_state), axis=1)

        retained = self.forget_gate(drive) * cell_state
        written = self.input_gate(drive) * self.candidate_gate(drive)
        cell_state = retained + written

        hidden_state = self.output_gate(drive) * tf.nn.tanh(cell_state)

        return cell_state, hidden_state

    def get_config(self):
        """Return the constructor arguments so the cell can be re-created from its config."""
        config = super().get_config()
        config["num_units"] = self.num_units
        return config

In [None]:
class LSTM_Layer(tf.keras.layers.Layer):
  """Unrolls a recurrent cell over the time axis of a (batch, time, features) tensor."""

  def __init__(self, cell):
    """Wrap `cell`.

    Args:
      cell: layer called as `cell(step_input, (cell_state, hidden_state))`
        that returns the updated `(cell_state, hidden_state)` pair and
        exposes its state width as `num_units`.
    """
    super().__init__()
    self.lstm_cell = cell

  def call(self, input):
    """Run the cell over every time step and return the final hidden state.

    Args:
      input: tensor of shape (batch, time, features); the time axis must be
        statically known because the loop is unrolled in Python.

    Returns:
      The hidden state after the last time step.

    Raises:
      ValueError: if the length of the time axis is not statically known.
    """
    sequence_length = input.shape[1]
    if sequence_length is None:
      raise ValueError("LSTM_Layer needs a statically known time axis (input.shape[1]).")

    # Use the dynamic batch size: the static one may be unknown (None).
    batch_size = tf.shape(input)[0]
    states = self.zero_states(batch_size, sequence_length)

    # Only the most recent states are needed, so earlier ones are not kept.
    for i in range(sequence_length):
      states = self.lstm_cell(input[:, i, :], states)

    last_cell_state, last_hidden_state = states

    return last_hidden_state

  def zero_states(self, batch_size, sequence_length=None):
    """Return all-zero `(cell_state, hidden_state)` of shape (batch_size, num_units).

    Args:
      batch_size: number of rows of the state tensors.
      sequence_length: unused; kept so existing two-argument calls keep working.
        The state width is the cell's `num_units`, not the sequence length.
    """
    state_shape = (batch_size, self.lstm_cell.num_units)
    return (tf.zeros(state_shape, dtype=tf.dtypes.float32),
            tf.zeros(state_shape, dtype=tf.dtypes.float32))

In [None]:
class LSTMModel(tf.keras.Model):
    """Per-frame CNN feature extractor followed by an LSTM and a linear regression head.

    Expects image sequences of shape (batch, time, height, width, channels).
    Because `LSTM_Layer` returns only the final hidden state, the model emits
    one prediction per sequence, with shape (batch, 1).
    """

    def __init__(self):
        super().__init__()

        def conv(filters):
            # 3x3 'same' convolution with ReLU and L2 weight decay.
            return tf.keras.layers.Conv2D(filters=filters, kernel_size=3, padding='same', activation='relu',
                                          kernel_regularizer=tf.keras.regularizers.L2(0.001))

        # Block 1. The Conv2D layers are applied to the 5-D tensor directly
        # (Keras documents their input as "4+D": leading axes are batch axes);
        # pooling goes through TimeDistributed so it runs once per frame.
        self.convlayer1 = conv(32)
        self.convlayer2 = conv(32)
        self.pooling = tf.keras.layers.MaxPooling2D(pool_size=2, strides=2)
        self.time1 = tf.keras.layers.TimeDistributed(self.pooling)

        # Block 2.
        self.convlayer3 = conv(64)
        self.convlayer4 = conv(64)
        self.pooling2 = tf.keras.layers.MaxPooling2D(pool_size=2, strides=2)
        self.time2 = tf.keras.layers.TimeDistributed(self.pooling2)

        # Collapse the spatial axes of every frame: (batch, time, h, w, c) -> (batch, time, c).
        # This has to be the 2-D variant: inside TimeDistributed each frame is (batch, h, w, c).
        self.global_pool = tf.keras.layers.GlobalAvgPool2D()
        self.time3 = tf.keras.layers.TimeDistributed(self.global_pool)

        self.lstm_layer = LSTM_Layer(LSTMCell(num_units=28, dtype=tf.float32))

        # Linear head: targets built from signed digit sums can be negative,
        # which a ReLU output could never reach.
        self.output_layer = tf.keras.layers.Dense(1)

        # Mean loss plus mean absolute error (a regression metric matching the task).
        self.metrics_list = [tf.keras.metrics.Mean(name="loss"),
                             tf.keras.metrics.MeanAbsoluteError(name="mae")]

    @property
    def metrics(self):
        """Metrics updated by `train_step` / `test_step`."""
        return self.metrics_list

    def reset_metrics(self):
        """Reset the accumulated state of every metric."""
        for metric in self.metrics:
            metric.reset_state()

    def call(self, x, training=False):
        """Map an image sequence to one prediction per sequence.

        Args:
            x: tensor of shape (batch, time, height, width, channels).
            training: accepted for Keras compatibility; no layer used here
                behaves differently during training.

        Returns:
            Tensor of shape (batch, 1).
        """
        x = self.convlayer1(x)
        x = self.convlayer2(x)
        x = self.time1(x)        # pool every frame exactly once

        x = self.convlayer3(x)
        x = self.convlayer4(x)
        x = self.time2(x)

        x = self.time3(x)        # (batch, time, channels): what LSTM_Layer slices per step
        x = self.lstm_layer(x)   # final hidden state

        return self.output_layer(x)

    @staticmethod
    def _match_targets(label, output):
        """Pick the labels that are comparable to `output`.

        The model predicts one value per sequence (shape (batch, 1)). If the
        labels carry one value per time step (shape (batch, time)), only the
        final step corresponds to that prediction; comparing the full label
        tensor would silently broadcast. Labels of any other layout are
        returned unchanged.
        """
        if (label.shape.rank == 2 and output.shape.rank == 2
                and label.shape[-1] != output.shape[-1]):
            return label[:, -1:]
        return label

    def train_step(self, data):
        """
        Standard train_step method, assuming we use model.compile(optimizer, loss, ...)

        Args:
            data: pair (sequence, label).

        Returns:
            Dict mapping each metric name to its current result.
        """
        sequence, label = data
        with tf.GradientTape() as tape:
            output = self(sequence, training=True)
            label = self._match_targets(label, output)
            loss = self.compiled_loss(label, output, regularization_losses=self.losses)
        gradients = tape.gradient(loss, self.trainable_variables)

        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

        self.metrics[0].update_state(loss)
        self.metrics[1].update_state(label, output)

        return {m.name: m.result() for m in self.metrics}

    def test_step(self, data):
        """
        Standard test_step method, assuming we use model.compile(optimizer, loss, ...)

        Args:
            data: pair (sequence, label).

        Returns:
            Dict mapping each metric name to its current result.
        """
        sequence, label = data
        output = self(sequence, training=False)
        label = self._match_targets(label, output)
        loss = self.compiled_loss(label, output, regularization_losses=self.losses)

        self.metrics[0].update_state(loss)
        self.metrics[1].update_state(label, output)

        return {m.name: m.result() for m in self.metrics}

In [None]:
# TASK 4 - Training the networks


def training(epochs=11, batch_size=32, sequence_length=20):
  """Build, compile and train an LSTMModel on the MNIST sequence task.

  Args:
    epochs: number of passes over the training data.
    batch_size: number of sequences per batch.
    sequence_length: number of digits per sequence.

  Returns:
    Tuple (model, train_ds, test_ds) so that later cells can keep working
    with the trained model and the preprocessed datasets.
  """
  # instantiate the model
  ourmodel = LSTMModel()

  optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
  loss = tf.keras.losses.MeanSquaredError()

  # compile the model (here, adding a loss function and an optimizer)
  ourmodel.compile(optimizer=optimizer, loss=loss)

  # create datasets: mnist[0] is the train split, mnist[1] the test split
  train_ds = preprocess_data(mnist[0], batch_size=batch_size, sequence_length=sequence_length)
  test_ds = preprocess_data(mnist[1], batch_size=batch_size, sequence_length=sequence_length)

  # internal training loop function
  def training_loop(model, train_ds, test_ds, epochs, save_path=False):
    """Alternate one validation pass and one training pass per epoch.

    If `save_path` is truthy the model weights are written there afterwards.
    """
    for epoch in range(epochs):
      print(f"Epoch {epoch}:")

      # Validation:
      # (we do the validation first so that we get the accuracy and loss before training the network)
      metrics = {}  # stays empty (instead of undefined) if the dataset yields nothing
      for data in test_ds:
        metrics = model.test_step(data)

      print([f"test_{key}: {value.numpy()}" for (key, value) in metrics.items()])

      # reset all metrics
      model.reset_metrics()

      # Training:
      metrics = {}
      for data in train_ds:
        metrics = model.train_step(data)

      # print the metrics
      print([f"train_{key}: {value.numpy()}" for (key, value) in metrics.items()])

      # reset all metrics
      model.reset_metrics()
      print("\n")

    # save weights
    if save_path:
      model.save_weights(save_path)

  training_loop(ourmodel, train_ds, test_ds, epochs)
  print('end')
  return ourmodel, train_ds, test_ds


# Keep the results in the notebook namespace: the cells below use
# `model`, `train_ds` and `val_ds`.
model, train_ds, val_ds = training()


Epoch 0:
Loop 1
Loop 2
shape (32, 28, 28, 32)
shape (32, 28, 28, 32)
shape (32, 14, 14, 32)


ValueError: ignored

In [None]:
# Name under which this experiment's TensorBoard logs are grouped.
EXPERIMENT_NAME = "lstm_noise"

# Timestamp (YYYYMMDD-HHMMSS) so that every run writes to its own log directory.
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# TensorBoard callback logging to ./logs/<experiment>/<timestamp>.
logging_callback = tf.keras.callbacks.TensorBoard(
    log_dir=f"./logs/{EXPERIMENT_NAME}/{current_time}",
)

In [None]:
# Continue training from epoch 25 up to epoch 50, logging to TensorBoard.
# Expects `model`, `train_ds` and `val_ds` to exist in the notebook namespace.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    initial_epoch=25,
    epochs=50,
    callbacks=[logging_callback],
)

In [None]:
# Persist the whole model (including optimizer state, loss function, metrics etc.)
# under "saved_model". On Colab, prefer a path on Google Drive so the files
# survive the session.
model.save(filepath="saved_model")

In [None]:
# Restore the saved model so training can resume where it had to stop.
# The custom classes are passed explicitly so Keras can resolve them by name.
loaded_model = tf.keras.models.load_model(
    "saved_model",
    custom_objects={
        "LSTMCell": LSTMCell,
        "LSTMModel": LSTMModel,
    },
)