# Chapter 12 - Custom Models and Training with TensorFlow

## Setup

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
except Exception:
    pass

# TensorFlow ≥2.4 is required in this notebook
# Earlier 2.x versions will mostly work the same, but with a few bugs
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.4"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "deep"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

## Tensors and operations

In [2]:
tf.constant([[1., 2., 3.], [4., 5., 6.]]) #matrix


<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[1., 2., 3.],
       [4., 5., 6.]], dtype=float32)>

In [3]:
t = tf.constant([[1., 2., 3.], [4., 5., 6.]]) 
t.shape

TensorShape([2, 3])

In [4]:
t.dtype

tf.float32

In [5]:
t[:, 1:]

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[2., 3.],
       [5., 6.]], dtype=float32)>

In [6]:
t[..., 1, tf.newaxis]

<tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[2.],
       [5.]], dtype=float32)>

In [7]:
t+10

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[11., 12., 13.],
       [14., 15., 16.]], dtype=float32)>

In [8]:
tf.square(t)

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[ 1.,  4.,  9.],
       [16., 25., 36.]], dtype=float32)>

In [9]:
t @ tf.transpose(t)

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[14., 32.],
       [32., 77.]], dtype=float32)>

## Tensors and NumPy

In [10]:
a = np.array([2., 4., 5.])
tf.constant(a)

<tf.Tensor: shape=(3,), dtype=float64, numpy=array([2., 4., 5.])>

In [11]:
t.numpy() # or np.array(t)

array([[1., 2., 3.],
       [4., 5., 6.]], dtype=float32)

In [12]:
tf.square(a)

<tf.Tensor: shape=(3,), dtype=float64, numpy=array([ 4., 16., 25.])>

In [13]:
np.square(t)

array([[ 1.,  4.,  9.],
       [16., 25., 36.]], dtype=float32)

## Type Conversions

In [14]:
tf.constant(2.) + tf.constant(40)

InvalidArgumentError: cannot compute AddV2 as input #1(zero-based) was expected to be a float tensor but is a int32 tensor [Op:AddV2]

In [15]:
tf.constant(2.) + tf.constant(40., dtype=tf.float64)

InvalidArgumentError: cannot compute AddV2 as input #1(zero-based) was expected to be a float tensor but is a double tensor [Op:AddV2]

In [16]:
t2 = tf.constant(40., dtype=tf.float64)
tf.constant(2.0) + tf.cast(t2, tf.float32)

<tf.Tensor: shape=(), dtype=float32, numpy=42.0>

## Variables

In [17]:
v = tf.Variable([[1., 2., 3.], [4., 5., 6.]])
v

<tf.Variable 'Variable:0' shape=(2, 3) dtype=float32, numpy=
array([[1., 2., 3.],
       [4., 5., 6.]], dtype=float32)>

In [18]:
v.assign(2 * v)                                     # => [[2., 4., 6.], [8., 10., 12.]]
v[0, 1].assign(42)                                  # => [[2., 42., 6.], [8., 10., 12.]]
v[:, 2].assign([0., 1.])                            # => [[2., 42., 0.], [8., 10., 1.]]
v.scatter_nd_update(indices=[[0,0],[1,2]], updates=[100., 300.]) # => [[100., 42., 0.], [8., 10., 200.]]

<tf.Variable 'UnreadVariable' shape=(2, 3) dtype=float32, numpy=
array([[100.,  42.,   0.],
       [  8.,  10., 300.]], dtype=float32)>

## Custom Loss Functions

In [19]:
def huber_fn(y_true, y_pred):
  error = y_true - y_pred
  is_small_error = tf.abs(error) < 1
  squared_loss = tf.square(error) / 2
  linear_loss = tf.abs(error) - 0.5
  return tf.where(is_small_error, squared_loss, linear_loss)

In [20]:
model.compile(loss=huber_fn, optimizer="nadam")
model.fit(X_train, y_train)

NameError: name 'model' is not defined

In [21]:
model = keras.models.load_model("my_model_with_a_custom_loss.h5",
                                custom_objects={"huber_fn":huber_fn})
												

OSError: No file or directory found at my_model_with_a_custom_loss.h5

In [22]:
def create_huber(threshold=1.0):
  def huber_fn(y_true, y_pred):
    error = y_true - y_pred
    is_small_error = tf.abs(error) < threshold
    squared_loss = tf.square(error) / 2
    linear_loss = threshold * tf.abs(error) - threshold**2 / 2
    return tf.where(is_small_error, squared_loss, linear_loss)
  return huber_fn

model.compile(loss=create_huber(2.0), optimizer="nadam")

NameError: name 'model' is not defined

In [23]:
model = keras.models.load_model("my_model_with_a_custom_loss_threshold_2.h5",
                                custom_objects={"huber_fn":create_huber(2.0)})

OSError: No file or directory found at my_model_with_a_custom_loss_threshold_2.h5

In [24]:
class HuberLoss(keras.losses.Loss):
  def __init__(self, threshold=1.0, **kwargs):
    self.threshold = threshold
    super().__init__(**kwargs)
  def call(self, y_true, y_pred):
    error = y_true - y_pred
    is_small_error = tf.abs(error) < self.threshold
    squared_loss = tf.square(error) / 2
    linear_loss = self.threshold * tf.asb(error) - self.threshold**2 / 2
    return tf.where(is_small_error, squared_loss, linear_loss)
  def get_config(self):
    base_config = super().get_config()
    return {**base_config, "threshold":self.threshold}

In [25]:
model.compile(loss=HuberLoss(2.), optimizer="nadam")

NameError: name 'model' is not defined

In [26]:
model = keras.models.load_model("my_model_with_a_custom_loss_class.h5",
                                custom_objects={"HuberLoss": HuberLoss})

OSError: No file or directory found at my_model_with_a_custom_loss_class.h5

## Custom Activation Functions, Initializers, Regularizers, and Constraints

### Custom Activation Function

In [27]:
def my_softplus(z): 
  return tf.math.log(tf.exp(z) + 1.0)

### Custom Initializer

In [28]:
def my_glorot_initializer(shape, dtype=tf.float32):
  stddev = tf.sqrt(2. /(shape[0] + shape[1]))
  return tf.random.noraml(shape, stddev=stddev, dtype=dtype)

### Custom Regularizer

In [29]:
def my_l1_regularizer(weights):
  return tf.reduce_sum(tf.abs(0.01 * weights))

### Custom Constraint

In [30]:
def my_positive_weights(weights):
  return tf.where(weights < 0., tf.zeros_like(weights), weights)

In [31]:
layer = keras.layers.Dense(30, activation=my_softplus, 
                          kernel_initializer = my_glorot_initializer,
                          kernel_regularizer = my_l1_regularizer,
                          kernel_constraint = my_positive_weights)

In [32]:
class MyL1Regularization(keras.regularizers.Regularizer):
  def __init__(self, factor):
    self.factor = factor
  def __call__(self, weights):
    return tf.reduce_sum(tf.abs(self.factor * weights))
  def get_config(self):
    return {"factor": self.factor}
    

## Custom Metrics

In [33]:
model.compile(loss="mse",
              optimizer="nadam",
              metrics=[create_huber(2.0)]
              )

NameError: name 'model' is not defined

In [34]:
precision = keras.metrics.Precision()
precision([0,1,1,1,0,1,0,1], [1,1,0,1,0,1,0,1])

<tf.Tensor: shape=(), dtype=float32, numpy=0.8>

In [35]:
precision([0,1,0,0,1,0,1,1], [1,0,1,1,0,0,0,0])

<tf.Tensor: shape=(), dtype=float32, numpy=0.5>

In [36]:
precision.result()

<tf.Tensor: shape=(), dtype=float32, numpy=0.5>

In [37]:
precision.variables

[<tf.Variable 'true_positives:0' shape=(1,) dtype=float32, numpy=array([4.], dtype=float32)>,
 <tf.Variable 'false_positives:0' shape=(1,) dtype=float32, numpy=array([4.], dtype=float32)>]

In [38]:
precision.reset_states()

In [39]:
class HuberMetric(keras.metrics.Metric):
  def __init__(self, threshold=1.0, **kwargs):
    super().__init__(**kwargs) # handles base args (e.g., dtype)
    self.threshold = threshold
    self.huber_fn = create_huber(threshold)
    self.total = self.add_weight("total", initializer="zeros")
    self.count = self.add_weight("count", initializer="zeros")
  def update_state(self, y_true, y_pred, sample_weight=None):
    metric = self.huber_fn(y_true, y_pred)
    self.total.assign_add(tf.reduce_sum(metric))
    self.count.assign_add(tf.cast(tf.size(y_true), tf.float32))
  def result(self):
    return self.total / self.count
  def get_config(self):
    base_config = super().get_config()
    return {**base_config, "threshold":self.threshold}


## Custom Layer

In [40]:
exponential_layer = keras.layers.Lambda(lambda x : tf.exp(x))

In [41]:
class MyDense(keras.layers.Layer):
  def __init__(self, units, activation=None, **kwargs):
    super().__init__(**kwargs)
    self.units = units 
    self.activation = keras.activations.activation.get(activation)
  
  def build(self, batch_input_shape):
    self.kernel = self.add_weight( name="kernel",
                                   shape=[batch_input_shape[-1],
                                    self.units],
                                    initializer="glorot_normal")
    self.bias = self.add_weight(name="bias",
                                shape=[self.units],
                                initializer="zeros")
    super().build(batch_input_shape) # must be at the end
  
  def call(self, X):
    return self.activation(X @ self.kernel + self.bias)
  
  def compute_output_shape(self, batch_input_shape):
    return tf.TensorShape(batch_input_shape.as_list()[:-1] + [self.units])
  
  def get_config(self):
    base_config = super().get_config()
    return {**base_config,
            "units" : self.units,
            "activation" : keras.activations.serialize(self.activation)}

  


In [42]:
class MyMultiLayer(keras.layers.Layer):
  def call(self, X):
    X1, X2 = X
    return [X1 + X2, X1 * X2, X1 / X2]

  def compute_output_shape(self, batch_input_shape):
    b1, b2 = batch_input_shape
    return [b1, b1, b1] # should probabaly handle broadcasting rules

In [43]:
class MyGaussianNoise(keras.layers.Layer):
  def __init__(self, stddev, **kwargs):
    super().__init__(**kwargs)
    self.stddev = stddev
  
  def call(self, X, training=None):
    if training:
      noise = tf.random.normal(tf.shape(X), stddev=self.stddev)
      return X + noise
    else:
      return X
  
  def compute_output_shape(self, batch_input_shape):
    return batch_input_shape

## Custom Models

In [44]:
class ResidualBlock(keras.layers.Layer):
  def __init__(self, n_layers, n_neurons, **kwargs):
    super().__init__(**kwargs)
    self.hidden = [keras.layers.Dense(n_neurons,
                                     activation="elu",
                                      kernel_initializer="he_normal",
                                      ) for _ in range(n_layers)]
  def call(self, inputs):
    Z = inputs
    for layer in self.hidden:
      Z = layer(Z)
      return inputs + Z

In [45]:
class ResidualRegressor(keras.Model):
  def __init__(self, output_dim, **kwargs):
    super().__init__(**kwargs)
    self.hidden1 = keras.layers.Dense(30, activation="elu", kernel_initializer="he_normal")
    self.block1 = ResidualBlock(2, 30)
    self.block2 = ResidualBlock(3, 30)
    self.out = keras.layers.Dense(output_dim)
  
  def call(self, inputs):
    Z = self.hidden1(inputs)
    for _ in range(1 + 3):
      Z = self.block1(Z)
    Z = self.block2(Z)
    return self.out(Z)

## Losses and Metrics Based on Model Internals

In [46]:
class ReconstructingRegressor(keras.Model):
  def __init__(self, output_dim, **kwargs):
    super().__init__(**kwargs)
    self.hidden = [keras.layers.Dense(30, activation="selu",
                                      kernel_initializer="lecun_normal") for __ in range(5)]
    self.out = keras.layers.Dense(output_dim)
  
  def build(self, batch_input_shape):
    n_inputs = batch_input_shape[-1]
    self.reconstruct = keras.layers.Dense(n_inputs)
    super().build(batch_input_shape)

  def call(self, inputs):
    z = inputs
    for layer in self.hidden:
      z = layer(z)
    reconstruction = self.reconstruct(z)
    recon_loss = tf.reduce_mean(tf.square(reconstruction - inputs))
    self.add_loss(0.05 * recon_loss)
    return self.out(z)

## Computing Gradients Using Autodiff

In [47]:
def f(w1, w2):
	return 3 * w1 ** 2 + 2 * w1 * w2

In [48]:
w1, w2 = tf.Variable(5.), tf.Variable(3.)
with tf.GradientTape() as tape:
  z = f(w1, w2)

gradients = tape.gradient(z, [w1, w2])

In [49]:
gradients

[<tf.Tensor: shape=(), dtype=float32, numpy=36.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=10.0>]

In [50]:
with tf.GradientTape(persistent=True) as tape:
  z = f(w1, w2)

dz_dw1 = tape.gradient(z, w1)
dz_dw2 = tape.gradient(z, w2)
del tape


In [51]:
c1, c2 = tf.Variable(5.), tf.Variable(3.)
with tf.GradientTape() as tape:
  z = f(c1, c2)

gradients = tape.gradient(z, [c1, c2]) # returns [None, None]

In [52]:
with tf.GradientTape() as tape:
  tape.watch(c1)
  tape.watch(c2)
  z = f(c1, c2)

gradients = tape.gradient(z, [c1, c2]) # returns [tensor 36., tensor 10.]

In [53]:
def f(w1, w2):
  return 3 * w1 ** 2 + tf.stop_gradient(2 * w1 * w2) 

with tf.GradientTape() as tape:
  z = f(w1, w2) # same result as without stop_gradient()

gradients = tape.gradient(z, [w1, w2]) # => returns [tensor 30., None]

In [54]:
@tf.custom_gradient
def my_better_softplus(z):
  exp = tf.exp(z)
  def my_softplus_gradients(grad):
    return grad / (1 + 1 / exp)
  return tf.math.log(exp + 1 ), my_softplus_gradients

## Custom Training Loops

In [55]:
l2_reg = keras.regularizers.l2(0.05)
model = keras.models.Sequential([
  keras.layers.Dense(30, activation="elu",
                    kernel_initializer="he_normal",
                    kernel_regularizer=l2_reg),
  keras.layers.Dense(1, kernel_regularizer=l2_reg)
])

In [56]:
def random_batch(X, y, batch_size=32):
  idx = np.random.randint(len(X), size=batch_size)
  return X[idx], y[idx]


In [57]:
def print_status_bar(iteration, total, loss, metrics=None):
  metrics = " - ".join(["{}: {:.4f} ".format(m.name, m.result()) 
                      for m in [loss] + (metrics or [])])
  end = "" if iteration < total else "\n"
  print("\r{}/{} ".format(iteration, total)+ metrics, end=end)

In [58]:
X_train = []
n_epochs= 5
batch_size = 32
n_steps = len(X_train) // batch_size
optimizer = keras.optimizers.Nadam(lr=0.01)
loss_fn = keras.losses.mean_squared_error
mean_loss = keras.metrics.Mean()
metrics = [keras.metrics.MeanAbsoluteError()]

  super(Nadam, self).__init__(name, **kwargs)


In [59]:
X_train_scaled, y_train =[], []
for epoch in range(1, n_epochs+1):
  print("Epcoch {}/{} ".format(epoch, n_epochs))
  for step in range(1, n_steps + 1):
    X_batch, y_batch = random_batch(X_train_scaled, y_train)
    with tf.GradientTape() as tape:
      y_pred = model(X_batch, training=True)
      main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
      loss = tf.add_n([main_loss] + model.losses)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trianable_variables))
    mean_loss(loss)
    for metric in metrics:
      metric(y_batch, y_pred)
    print_status_bar(step * batch_size, len(y_train), mean_loss, metrics)
  print_status_bar(len(y_train), len(y_train), mean_loss, metrics)
  for metric in [mean_loss] + metrics:
    metric.reset_states()


Epcoch 1/5 
0/0 mean: 0.0000  - mean_absolute_error: 0.0000 
Epcoch 2/5 
0/0 mean: 0.0000  - mean_absolute_error: 0.0000 
Epcoch 3/5 
0/0 mean: 0.0000  - mean_absolute_error: 0.0000 
Epcoch 4/5 
0/0 mean: 0.0000  - mean_absolute_error: 0.0000 
Epcoch 5/5 
0/0 mean: 0.0000  - mean_absolute_error: 0.0000 


In [60]:
for variable in model.variables:
	if variable.constraint is not None:
        
variable.assign(variable.constraint(variable))

IndentationError: expected an indented block (Temp/ipykernel_18012/2948967561.py, line 4)

## TensorFlow Functions and Graphs

In [61]:
def cube(x):
  return x ** 3

In [62]:
cube(2)

8

In [63]:
cube(tf.constant(2.0))

<tf.Tensor: shape=(), dtype=float32, numpy=8.0>

In [64]:
tf_cube = tf.function(cube)
tf_cube

<tensorflow.python.eager.def_function.Function at 0x22ceae0d2e0>

In [65]:
tf_cube(2)

<tf.Tensor: shape=(), dtype=int32, numpy=8>

In [66]:
tf_cube(tf.constant(2.0))

<tf.Tensor: shape=(), dtype=float32, numpy=8.0>

In [67]:
@tf.function
def tf_cube(x):
  return x ** 3

In [68]:
tf_cube.python_function(2)

8

## Question 12.


In [69]:
class LayerNormalization(keras.layers.Layer):
  def __init__(self, **kwargs):
    super().__init__(**kwargs)
    self.epsilon = 0.001

  def build(self, batch_input_shape):
    self.alpha = self.add_weight(name="alpha",
                                shape=batch_input_shape[-1:],
                                initializer="ones")
    self.beta = self.add_weight(name="beta",
                                shape=batch_input_shape[-1:],
                                initializer="zeros")
    super().build(batch_input_shape)
  
  def call(self, x):
    mean, variance = tf.nn.moments(x, axes=-1, keepdims=True)
    return self.alpha * (x - mean)/(tf.sqrt(variance + self.epsilon)) + self.beta
  

In [70]:
X = x_train.astype(np.float32)

custom_layer_norm = LayerNormalization()
keras_layer_norm = keras.layers.LayerNormalization()

tf.reduce_mean(keras.losses.mean_absolute_error(keras_layers_norm(X), custom_layer_norm(X)))

NameError: name 'x_train' is not defined

## Question 13.

In [71]:
(X_train_full, y_train_full),(X_test, y_full) = tf.keras.datasets.fashion_mnist.load_data()
X_train_full = X_train_full.astype(np.float32) / 255.
X_valid, X_train = X_train_full[:5000], X_train_full[5000:]
y_valid, y_train = y_train_full[:5000], y_train_full[5000:]
X_test = X_test.astype(np.float32) / 255.

In [72]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

In [73]:
model = keras.models.Sequential([
  keras.layers.Flatten(input_shape=[28,28]),
  keras.layers.Dense(100, activation="relu"),
  keras.layers.Dense(10, activation="softmax")
])

In [74]:
n_epochs = 5
batch_size = 32
n_steps = len(X_train) // batch_size
optimizer = keras.optimizers.Nadam(learning_rate=0.01)
loss_fn = keras.losses.sparse_categorical_crossentropy
mean_loss = keras.metrics.Mean()
metrics = [keras.metrics.SparseCategoricalAccuracy()]

In [75]:
from tqdm.notebook import trange
from collections import OrderedDict

with trange(1, n_epochs + 1, desc="All epochsl") as epochs:
  for epoch in epochs:
    with trange(1, n_steps + 1, desc="Epoch{}/{}".format(epoch, n_epochs)) as steps:
      for step in steps:
        X_batch, y_batch = random_batch(X_train, y_train)
        with tf.GradientTape(persistent=True) as tape:
          y_pred = model(X_batch)
          main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
          loss = tf.add_n([main_loss] + model.losses)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        for variable in model.variables:
          if variable.constraint is not None:
            variable.assign(variable.constraint(variable))
        status = OrderedDict()
        mean_loss(loss)
        status["loss"] = mean_loss.result().numpy()
        for metric in metrics:
          metric(y_batch, y_pred)
          status[metric.name] = metric.result().numpy()
        steps.set_postfix(status) 
      y_pred = model(X_valid)
      status["val_loss"] = np.mean(loss_fn(y_valid, y_pred))
      status["val_loss"] = np.mean(keras.metrics.sparse_categorical_accuracy(tf.constant(y_valid, dtype=np.float32), y_pred))
      steps.set_postfix(status)
    for metric in [mean_loss] + metrics:
      metric.reset_states()



All epochsl:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch1/5:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch2/5:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch3/5:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch4/5:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch5/5:   0%|          | 0/1718 [00:00<?, ?it/s]

In [76]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

In [77]:
lower_layers = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(100, activation="relu"),
])
upper_layers = keras.models.Sequential([
    keras.layers.Dense(10, activation="softmax"),
])
model = keras.models.Sequential([
    lower_layers, upper_layers
])

In [78]:
lower_optimizer = keras.optimizers.SGD(learning_rate=1e-4)
upper_optimizer = keras.optimizers.Nadam(learning_rate=1e-3)

In [79]:
n_epochs = 5
batch_size = 32
n_steps = len(X_train) // batch_size
loss_fn = keras.losses.sparse_categorical_crossentropy
mean_loss = keras.metrics.Mean()
metrics = [keras.metrics.SparseCategoricalAccuracy()]

In [80]:
with trange(1, n_epochs + 1, desc="All epochs") as epochs:
    for epoch in epochs:
        with trange(1, n_steps + 1, desc="Epoch {}/{}".format(epoch, n_epochs)) as steps:
            for step in steps:
                X_batch, y_batch = random_batch(X_train, y_train)
                with tf.GradientTape(persistent=True) as tape:
                    y_pred = model(X_batch)
                    main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
                    loss = tf.add_n([main_loss] + model.losses)
                for layers, optimizer in ((lower_layers, lower_optimizer),
                                          (upper_layers, upper_optimizer)):
                    gradients = tape.gradient(loss, layers.trainable_variables)
                    optimizer.apply_gradients(zip(gradients, layers.trainable_variables))
                del tape
                for variable in model.variables:
                    if variable.constraint is not None:
                        variable.assign(variable.constraint(variable))                    
                status = OrderedDict()
                mean_loss(loss)
                status["loss"] = mean_loss.result().numpy()
                for metric in metrics:
                    metric(y_batch, y_pred)
                    status[metric.name] = metric.result().numpy()
                steps.set_postfix(status)
            y_pred = model(X_valid)
            status["val_loss"] = np.mean(loss_fn(y_valid, y_pred))
            status["val_accuracy"] = np.mean(keras.metrics.sparse_categorical_accuracy(
                tf.constant(y_valid, dtype=np.float32), y_pred))
            steps.set_postfix(status)
        for metric in [mean_loss] + metrics:
            metric.reset_states()

All epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/5:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 2/5:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 3/5:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 4/5:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 5/5:   0%|          | 0/1718 [00:00<?, ?it/s]