<a href="https://colab.research.google.com/github/dupeljan/collab/blob/main/Copy_of_writing_a_training_loop_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Try to freeze weights in runtime

## Setup

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

Setup dataset

In [2]:
# Prepare the training dataset.
batch_size = 64
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
x_train = np.reshape(x_train, (-1, 784))
x_test = np.reshape(x_test, (-1, 784))

# Reserve 10,000 samples for validation.
x_val = x_train[-10000:]
y_val = y_train[-10000:]
x_train = x_train[:-10000]
y_train = y_train[:-10000]

# Prepare the training dataset.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)

# Prepare the validation dataset.
val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
val_dataset = val_dataset.batch(batch_size)

# Eager mode

In [None]:
inputs = keras.Input(shape=(784,), name="digits")
x1 = layers.Dense(64, activation="relu")(inputs)
x2 = layers.Dense(64, activation="relu")(x1)
outputs = layers.Dense(10, name="predictions")(x2)
model = keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# Instantiate an optimizer.
optimizer = keras.optimizers.SGD(learning_rate=1e-3)
# Instantiate a loss function.
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)



In [None]:
def train(model, epochs = 2):
  for epoch in range(epochs):
      print("\nStart of epoch %d" % (epoch,))

      # Iterate over the batches of the dataset.
      for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):

          # Open a GradientTape to record the operations run
          # during the forward pass, which enables auto-differentiation.
          with tf.GradientTape() as tape:

              # Run the forward pass of the layer.
              # The operations that the layer applies
              # to its inputs are going to be recorded
              # on the GradientTape.
              logits = model(x_batch_train, training=True)  # Logits for this minibatch

              # Compute the loss value for this minibatch.
              loss_value = loss_fn(y_batch_train, logits)

          # Use the gradient tape to automatically retrieve
          # the gradients of the trainable variables with respect to the loss.
          grads = tape.gradient(loss_value, model.trainable_weights)

          # Run one step of gradient descent by updating
          # the value of the variables to minimize the loss.
          optimizer.apply_gradients(zip(grads, model.trainable_weights))

          # Log every 200 batches.
          if step % 200 == 0:
              print(
                  "Training loss (for one batch) at step %d: %.4f"
                  % (step, float(loss_value))
              )
              print("Seen so far: %s samples" % ((step + 1) * 64))
train(model)


Start of epoch 0
Training loss (for one batch) at step 0: 84.5313
Seen so far: 64 samples
Training loss (for one batch) at step 200: 2.1334
Seen so far: 12864 samples
Training loss (for one batch) at step 400: 0.7563
Seen so far: 25664 samples
Training loss (for one batch) at step 600: 0.6857
Seen so far: 38464 samples

Start of epoch 1
Training loss (for one batch) at step 0: 0.6501
Seen so far: 64 samples
Training loss (for one batch) at step 200: 0.5389
Seen so far: 12864 samples
Training loss (for one batch) at step 400: 0.5519
Seen so far: 25664 samples
Training loss (for one batch) at step 600: 0.5096
Seen so far: 38464 samples


Check trainable weights

In [None]:
[ x.name for x in model.trainable_weights]

['dense_2/kernel:0',
 'dense_2/bias:0',
 'dense_3/kernel:0',
 'dense_3/bias:0',
 'predictions/kernel:0',
 'predictions/bias:0']

Freeze layers

In [None]:
def copy(weights):
    res = []
    for w in weights:
      res.append(tf.identity(w))
    return res 


weights = dict()
# Freeze all layers besides the last one
target_layer_name = 'predictions'
for layer in model.layers:
  if layer.weights:
    if layer.name != target_layer_name:
      layer.trainable = False
    weights[layer.name] = copy(layer.weights)

# Check trainable weights list
[x.name for x in model.trainable_weights]

['predictions/kernel:0', 'predictions/bias:0']

Train only last layer of model and check it


In [None]:
train(model)

def equal(w1, w2):
    return [ bool(tf.math.reduce_all(tf.math.equal(w, w_saved))) 
              for w, w_saved in zip(w1, w2)]

    
for layer in model.layers:
  if not layer.weights:
    continue
  if layer.name != target_layer_name:
    assert all(equal(weights[layer.name], layer.weights))
  else:
    assert not all(equal(weights[layer.name], layer.weights))
  print("{0} sqr diff: {1}".format(
        layer.name,
        tf.math.reduce_sum(tf.math.pow(
                        layer.weights[0] -  weights[layer.name][0], 2))
    ))



Start of epoch 0
Training loss (for one batch) at step 0: 0.3266
Seen so far: 64 samples
Training loss (for one batch) at step 200: 0.3251
Seen so far: 12864 samples
Training loss (for one batch) at step 400: 0.6280
Seen so far: 25664 samples
Training loss (for one batch) at step 600: 0.2726
Seen so far: 38464 samples

Start of epoch 1
Training loss (for one batch) at step 0: 0.2719
Seen so far: 64 samples
Training loss (for one batch) at step 200: 0.7506
Seen so far: 12864 samples
Training loss (for one batch) at step 400: 0.3221
Seen so far: 25664 samples
Training loss (for one batch) at step 600: 0.4017
Seen so far: 38464 samples
dense_2 sqr diff: 0.0
dense_3 sqr diff: 0.0
predictions sqr diff: 0.02466883882880211


Freezing in eager mode is working!

You can freeze layer, but you can't set weight trainable attribute. If layer have trainable == false this it just not lised at `model.trainable_weights`

In [None]:
for layer in model.layers:
  if layer.name != target_layer_name:
    for weight in layer.weights:
      print('w', weight.name, weight.trainable)
    print('l', layer.name, layer.trainable)

l digits False
w dense/kernel:0 True
w dense/bias:0 True
l dense False
w dense_1/kernel:0 True
w dense_1/bias:0 True
l dense_1 False


# Non eager mode with GradTape

Setup train loop

In [None]:
@tf.function
def train():
  epochs = 3
  for epoch in range(epochs):
      print("\nStart of epoch %d" % (epoch,))

      # Iterate over the batches of the dataset.
      for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):

          # Open a GradientTape to record the operations run
          # during the forward pass, which enables auto-differentiation.
          with tf.GradientTape() as tape:

              # Run the forward pass of the layer.
              # The operations that the layer applies
              # to its inputs are going to be recorded
              # on the GradientTape.
              logits = model(x_batch_train, training=True)  # Logits for this minibatch

              # Compute the loss value for this minibatch.
              loss_value = loss_fn(y_batch_train, logits)

          # Use the gradient tape to automatically retrieve
          # the gradients of the trainable variables with respect to the loss.
          grads = tape.gradient(loss_value, model.trainable_weights)

          # Run one step of gradient descent by updating
          # the value of the variables to minimize the loss.
          optimizer.apply_gradients(zip(grads, model.trainable_weights))



Create model

In [None]:
inputs = keras.Input(shape=(784,), name="digits")
x1 = layers.Dense(64, activation="relu")(inputs)
x2 = layers.Dense(64, activation="relu")(x1)
outputs = layers.Dense(10, name="predictions")(x2)
model = keras.Model(inputs=inputs, outputs=outputs)

Setup trainable params

In [None]:
def copy(weights):
    res = []
    for w in weights:
      res.append(tf.identity(w))
    return res 


weights = dict()
def save_weight():
  global weights
  for layer in model.layers:
    if layer.weights:
      weights[layer.name] = copy(layer.weights)

save_weight()

# Freeze all layers besides the last one
target_layer_name = 'predictions'
for layer in model.layers:
  if layer.weights:
    if layer.name != target_layer_name:
      layer.trainable = False

# Check trainable weights list
[x.name for x in model.trainable_weights]

['predictions/kernel:0', 'predictions/bias:0']

Try to train only last layer

In [None]:
train()


Start of epoch 0

Start of epoch 1

Start of epoch 2


Check freezing

In [None]:

def equal(w1, w2):
    return [ bool(tf.math.reduce_all(tf.math.equal(w, w_saved))) 
              for w, w_saved in zip(w1, w2)]

def check_is_trainable():    
  for layer in model.layers:
    if not layer.weights:
      continue
    if layer.name != target_layer_name:
      assert all(equal(weights[layer.name], layer.weights))
    else:
      assert not all(equal(weights[layer.name], layer.weights))
    print("{0} sqr diff: {1}".format(
          layer.name,
          tf.math.reduce_sum(tf.math.pow(
                          layer.weights[0] -  weights[layer.name][0], 2))
      ))

check_is_trainable()

dense_4 sqr diff: 0.0
dense_5 sqr diff: 0.0
predictions sqr diff: 13.99891471862793


That's ok

Try to unfreeze all layers

In [None]:
for layer in model.layers:
  layer.trainable = True

In [None]:
save_weight()
train()
check_is_trainable()

dense_4 sqr diff: 0.0
dense_5 sqr diff: 0.0
predictions sqr diff: 0.32455798983573914


No success

# Keras API training mode

Create model with custom train_step to be sure that trainable vars is got from self.trainable_variables

In [22]:
class CustomModel(keras.Model):
    def train_step(self, data):
        # Unpack the data. Its structure depends on your model and
        # on what you pass to `fit()`.
        x, y = data

        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)  # Forward pass
            # Compute the loss value
            # (the loss function is configured in `compile()`)
            loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses)

        # Compute gradients
        trainable_vars = self.trainable_variables
        print("Trainable vars", trainable_vars)
        gradients = tape.gradient(loss, trainable_vars)
        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        # Update metrics (includes the metric that tracks the loss)
        self.compiled_metrics.update_state(y, y_pred)
        # Return a dict mapping metric names to current value
        return {m.name: m.result() for m in self.metrics}

In [37]:
inputs = keras.Input(shape=(784,), name="inputs")
x1 = layers.Dense(64, activation="relu", name="dense1")(inputs)
x2 = layers.Dense(64, activation="relu", name="dense2")(x1)
outputs = layers.Dense(10, name="predictions")(x2)
model = CustomModel(inputs=inputs, outputs=outputs)
[x.trainable for x in model.layers]

[True, True, True, True]

In [39]:
[x.name for x in model.trainable_weights]

['dense1/kernel:0',
 'dense1/bias:0',
 'dense2/kernel:0',
 'dense2/bias:0',
 'predictions/kernel:0',
 'predictions/bias:0']

Make callable which can freeze learning








In [56]:
class FreezingCallback(tf.keras.callbacks.Callback):
  @staticmethod 
  def copy(weights):
    res = []
    for w in weights:
      res.append(tf.identity(w))
    return res 

  @staticmethod
  def have_weights(layer):
    return hasattr(layer, 'weights') and layer.weights

  @staticmethod
  def equal(w1, w2):
    return [(w.name, bool(tf.math.reduce_all(tf.math.equal(w, w_saved)))) 
              for w, w_saved in zip(w1, w2)]
    
  def __init__(self, weights_to_freeze, epoch_to_freeze):
    super().__init__()
    self.weights_to_freeze = weights_to_freeze
    self.epoch_to_freeze = epoch_to_freeze
    self.weights = dict()

  def on_epoch_begin(self, epoch, logs=None):
    # Keep weights before epoch
    for layer in self.model.layers:
      if self.have_weights(layer):
        self.weights[layer.name] = self.copy(layer.weights)
        assert all([x[1] for x in 
                   self.equal(layer.weights, self.weights[layer.name])])
        
    # Freeze if it's time to freeze
    if epoch == self.epoch_to_freeze:
      #gets a reference to the list containing the trainable variables
      name = tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES
      trainable_collection = tf.compat.v1.get_collection_ref(name)
      variables_to_remove = list()
      print("trainable_collection:", trainable_collection)
      for vari in trainable_collection:
          #uses the attribute 'name' of the variable
        if vari.name in self.weights_to_freeze:
            variables_to_remove.append(vari)
      for rem in variables_to_remove:
        trainable_collection.remove(rem)
      print(f"Freeze weights: {variables_to_remove}")
  
  def on_epoch_end(self, epoch, logs=None):
    print(logs)
    for layer in self.model.layers:
      if self.have_weights(layer):
        equal = self.equal(layer.weights, self.weights[layer.name])
        print(f"Layer {layer.name} changed? "
              f"{[(x[0], not x[1]) for x in equal]}")
        print("sqr diff {}".format(
            tf.math.reduce_sum(tf.math.pow(
                        layer.weights[0] - self.weights[layer.name][0], 2))))
        # Keep new weights
        self.weights[layer.name] = self.copy(layer.weights)


freezing_callback = FreezingCallback(weights_to_freeze=[
                                        'dense1/kernel:0',
                                        'dense1/bias:0',
                                        'dense2/kernel:0',
                                        'dense2/bias:0', ],
                                      epoch_to_freeze=4)

In [13]:
# You can change trainable status only before compileing
for layer in model.layers:
  layer.trainable = True
[x.trainable for x in model.layers]

[True, True, True, True]

In [8]:
compile_kwargs = {
    'optimizer': keras.optimizers.SGD(learning_rate=1e-3), 
    'loss': keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    'metrics': [keras.metrics.SparseCategoricalAccuracy()],
}

model.compile(
    **compile_kwargs
)


In [60]:
# Setup trainable param
#for layer in model.layers:
#  layer.trainable = False
#model.trainable = False

model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=10,
    callbacks=[freezing_callback]
)


Train on 50000 samples


OperatorNotAllowedInGraphError: ignored

# TF1

In [3]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [4]:
inputs = keras.Input(shape=(784,), name="inputs")
x1 = layers.Dense(64, activation="relu", name="dense1")(inputs)
x2 = layers.Dense(64, activation="relu", name="dense2")(x1)
outputs = layers.Dense(10, name="predictions")(x2)
model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [7]:
class TF1FreezingCallback(tf.keras.callbacks.Callback):
  @staticmethod 
  def copy(weights):
    res = []
    for w in weights:
      res.append(tf.identity(w))
    return res 

  @staticmethod
  def have_weights(layer):
    return hasattr(layer, 'weights') and layer.weights

 # @staticmethod
 # def equal(w1, w2):
#    return [(w.name, bool(tf.math.reduce_all(tf.math.equal(w, w_saved)))) 
 #             for w, w_saved in zip(w1, w2)]
    
  def __init__(self, weights_to_freeze, epoch_to_freeze):
    super().__init__()
    self.weights_to_freeze = weights_to_freeze
    self.epoch_to_freeze = epoch_to_freeze
    self.weights = dict()

  def on_epoch_begin(self, epoch, logs=None):
    # Keep weights before epoch
    for layer in self.model.layers:
      if self.have_weights(layer):
        self.weights[layer.name] = self.copy(layer.weights)
                
    # Freeze if it's time to freeze
    if epoch == self.epoch_to_freeze:
      #gets a reference to the list containing the trainable variables
      name = tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES
      trainable_collection = tf.compat.v1.get_collection_ref(name)
      variables_to_remove = list()
      print("trainable_collection:", trainable_collection)
      for vari in trainable_collection:
          #uses the attribute 'name' of the variable
        if True:#vari.name in self.weights_to_freeze:
            variables_to_remove.append(vari)
      for rem in variables_to_remove:
        trainable_collection.remove(rem)
      print(f"Freeze weights: {variables_to_remove}")
  
  def on_epoch_end(self, epoch, logs=None):
    print(logs)
    for layer in self.model.layers:
      if self.have_weights(layer):
        #equal = self.equal(layer.weights, self.weights[layer.name])
        #print(f"Layer {layer.name} changed? "
        #      f"{[(x[0], not x[1]) for x in equal]}")
        print("sqr diff {}".format(
            tf.math.reduce_sum(tf.math.pow(
                        layer.weights[0] - self.weights[layer.name][0], 2)
                            )))
        # Keep new weights
        self.weights[layer.name] = self.copy(layer.weights)


freezing_callback = TF1FreezingCallback(weights_to_freeze=[
                                        'dense1/kernel:0',
                                        'dense1/bias:0',
                                        'dense2/kernel:0',
                                        'dense2/bias:0', ],
                                      epoch_to_freeze=4)

In [None]:
c_name = tf.compat.v1.get_default_graph().collections
graph = tf.compat.v1.get_default_graph()
for name in c_name:
  print(f'{name} - {graph.get_collection_ref(name)}')

variables - []
local_variables - []
trainable_variables - []
('__variable_store',) - []
('__varscope',) - [<tensorflow.python.ops.variable_scope._VariableScopeStore object at 0x7f1d3de32ee8>]


In [10]:
tf.compat.v1.get_default_graph()._collections['trainable_variables']

[]

In [9]:
#s = tf.Session()
#with s.as_default():
model.fit(
      x_train,
      y_train,
      batch_size=32,
      epochs=10,
      callbacks=[freezing_callback]
  )


Train on 50000 samples
Epoch 1/10
sqr diff Tensor("Sum:0", shape=(), dtype=float32)
sqr diff Tensor("Sum_1:0", shape=(), dtype=float32)
sqr diff Tensor("Sum_2:0", shape=(), dtype=float32)
Epoch 2/10
sqr diff Tensor("Sum_3:0", shape=(), dtype=float32)
sqr diff Tensor("Sum_4:0", shape=(), dtype=float32)
sqr diff Tensor("Sum_5:0", shape=(), dtype=float32)
Epoch 3/10
sqr diff Tensor("Sum_6:0", shape=(), dtype=float32)
sqr diff Tensor("Sum_7:0", shape=(), dtype=float32)
sqr diff Tensor("Sum_8:0", shape=(), dtype=float32)
Epoch 4/10
sqr diff Tensor("Sum_9:0", shape=(), dtype=float32)
sqr diff Tensor("Sum_10:0", shape=(), dtype=float32)
sqr diff Tensor("Sum_11:0", shape=(), dtype=float32)
trainable_collection: [<tf.Variable 'dense1/kernel:0' shape=(784, 64) dtype=float32>, <tf.Variable 'dense1/bias:0' shape=(64,) dtype=float32>, <tf.Variable 'dense2/kernel:0' shape=(64, 64) dtype=float32>, <tf.Variable 'dense2/bias:0' shape=(64,) dtype=float32>, <tf.Variable 'predictions/kernel:0' shape=(64, 

<tensorflow.python.keras.callbacks.History at 0x7f5837a66390>