# Introduction to Keras for Researchers
https://keras.io/getting_started/intro_to_keras_for_researchers/

In this guide, you will learn about:

- Tensors, variables, and gradients in TensorFlow
- Creating layers by subclassing the Layer class
- Writing low-level training loops
- Tracking losses created by layers via the add_loss() method
- Tracking metrics in a low-level training loop
- Speeding up execution with a compiled tf.function
- Executing layers in training or inference mode
- The Keras Functional API

You will also see the Keras API in action in two end-to-end research examples: a Variational Autoencoder and a Hypernetwork.




In [1]:
import tensorflow as tf
import keras
import numpy as np

# Tensors

In [2]:
# Build a constant integer tensor from nested Python lists (3 rows, 2 columns).
rows = [[5, 2], [1, 3], [4, 8]]
x = tf.constant(rows)
print(x)

tf.Tensor(
[[5 2]
 [1 3]
 [4 8]], shape=(3, 2), dtype=int32)


In [3]:
# Convert the tensor to a NumPy array; as the last expression of the cell,
# the array is what the notebook displays.
x.numpy()

array([[5, 2],
       [1, 3],
       [4, 8]], dtype=int32)

In [4]:
# Report the tensor's element type and its dimensions, one per line.
for label, value in (("dtype:", x.dtype), ("shape:", x.shape)):
    print(label, value)

dtype: <dtype: 'int32'>
shape: (3, 2)


A common way to create constant tensors is via tf.ones and tf.zeros (just like np.ones and np.zeros):

In [5]:
# Constant tensors filled with ones, then zeros, both of shape (2, 1).
for make_tensor in (tf.ones, tf.zeros):
    print(make_tensor(shape=(2, 1)))

tf.Tensor(
[[1.]
 [1.]], shape=(2, 1), dtype=float32)
tf.Tensor(
[[0.]
 [0.]], shape=(2, 1), dtype=float32)


You can also create random constant tensors:

In [6]:
# Sample a (2, 2) tensor from a standard normal distribution.
x1 = tf.random.normal(shape=(2, 2), mean=0.0, stddev=1.0)
print(x1.numpy())

# Sample a (2, 2) tensor of int32 values drawn uniformly between
# `minval` and `maxval`.
uniform_kwargs = {"minval": 0, "maxval": 10, "dtype": "int32"}
x2 = tf.random.uniform(shape=(2, 2), **uniform_kwargs)
print(x2.numpy())

[[-0.500706   1.3266544]
 [ 1.1777245  1.7845808]]
[[3 4]
 [6 4]]


## Variables

Variables are special tensors used to store mutable state (such as the weights of a neural network). You create a Variable using some initial value. <br>

In [8]:
# Create a Variable from a random initial value.  The initial tensor is
# printed both before and after the Variable is built, followed by the
# Variable itself, so the three can be compared.
initial_value2 = tf.random.normal(shape=(2, 2))
print(initial_value2)
a = tf.Variable(initial_value2)
print(initial_value2, a, sep="\n")

tf.Tensor(
[[ 0.37197933 -0.23103228]
 [ 0.21129033 -0.8975576 ]], shape=(2, 2), dtype=float32)
tf.Tensor(
[[ 0.37197933 -0.23103228]
 [ 0.21129033 -0.8975576 ]], shape=(2, 2), dtype=float32)
<tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[ 0.37197933, -0.23103228],
       [ 0.21129033, -0.8975576 ]], dtype=float32)>


In [9]:
# Draw a fresh random value and overwrite the variable's contents with it.
new_value = tf.random.normal(shape=(2, 2))
print(new_value.numpy())
a.assign(new_value)
print(a)
print(a.numpy())

# NOTE(review): the check below uses `!=` right after `a` was assigned
# `new_value`, so it fails on the first element and the handler prints the
# message.  This looks like a deliberate demo of the failure path -- confirm.
try:
    for i, j in [(row, col) for row in range(2) for col in range(2)]:
        assert a[i, j] != new_value[i, j], "si no son distintos, son iguales"
except AssertionError as msg:
    print(msg)

[[ 1.4388729  -0.67365   ]
 [-2.3744066  -0.12767933]]
<tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[ 1.4388729 , -0.67365   ],
       [-2.3744066 , -0.12767933]], dtype=float32)>
[[ 1.4388729  -0.67365   ]
 [-2.3744066  -0.12767933]]
si no son distintos, son iguales


In [10]:
# Draw an increment, show the variable and the increment, then add the
# increment to the variable in place and show the result.
added_value = tf.random.normal(shape=(2, 2))
print(f"added_value is type :{type(added_value)}")
for shown in (a, added_value):
    print(shown.numpy())
a.assign_add(added_value)
print(a.numpy())

# Every element of `a` must equal the previous value plus the increment.
for i, j in [(row, col) for row in range(2) for col in range(2)]:
    assert a[i, j] == new_value[i, j] + added_value[i, j]


added_value is type :<class 'tensorflow.python.framework.ops.EagerTensor'>
[[ 1.4388729  -0.67365   ]
 [-2.3744066  -0.12767933]]
[[-0.30981076 -1.0489062 ]
 [-0.57449245 -2.2033632 ]]
[[ 1.1290622 -1.7225562]
 [-2.948899  -2.3310425]]


### Doing math in TensorFlow

In [11]:
# Two random (2, 2) operands.
a = tf.random.normal(shape=(2, 2))
b = tf.random.normal(shape=(2, 2))

# Chain a few element-wise operations; only the sum `c` is printed.
c = tf.add(a, b)
d = tf.square(c)
e = tf.exp(d)
print(c.numpy())

[[ 0.835143   -0.50794905]
 [-1.5375143   1.81733   ]]


# Gradients
Here's another big difference with NumPy: you can automatically retrieve the gradient of any differentiable expression.
Just open a GradientTape, start "watching" a tensor via tape.watch(), and compose a differentiable expression using this tensor as input:

In [12]:
# Wrap `a` in a Variable before differentiating with respect to it.
a = tf.Variable(a)

# Record only the forward computation inside the tape context.
with tf.GradientTape() as tape:
    c = tf.square(a) + tf.square(b)

# Gradient of `c` with respect to `a`, requested once recording has ended.
dc_da = tape.gradient(c, a)
print(dc_da)

tf.Tensor(
[[ 3.1941676  -2.4711428 ]
 [-0.87906164 -1.4395616 ]], shape=(2, 2), dtype=float32)


In [13]:
# Build two variables from small NumPy arrays whose values make the
# gradients below easy to verify by hand.
a_initial = np.array([[1.,2.],[3.,4.]])
b_initial = np.array([[10.,20.],[30.,40.]])
a_tensor = tf.convert_to_tensor(a_initial)
b_tensor = tf.convert_to_tensor(b_initial)
a = tf.Variable(a_tensor)
b = tf.Variable(b_tensor)

print(a)
print(b)

with tf.GradientTape() as tape:
    tape.watch(a)  # Start recording the history of operations applied to `a`
    c = (tf.square(a) + tf.square(b))  # c = a^2 + b^2, so dc/da = 2 * a
    c1 = tf.sqrt(tf.square(a) + tf.square(b))  # Computed but not differentiated in this cell
    # What's the gradient of `c` with respect to `a`?
    dc_da = tape.gradient(c, a)
    print(dc_da)

with tf.GradientTape() as tape:
    tape.watch(b)  # Start recording the history of operations applied to `b`
    c = (tf.square(a) + 2*(b))  # c = a^2 + 2 * b, so dc/db = 2 everywhere
    c1 = tf.sqrt(tf.square(a) + tf.square(b))  # Computed but not differentiated in this cell
    # What's the gradient of `c` with respect to `b`?
    dc_db = tape.gradient(c, b)
    print(dc_db)
    

<tf.Variable 'Variable:0' shape=(2, 2) dtype=float64, numpy=
array([[1., 2.],
       [3., 4.]])>
<tf.Variable 'Variable:0' shape=(2, 2) dtype=float64, numpy=
array([[10., 20.],
       [30., 40.]])>
tf.Tensor(
[[2. 4.]
 [6. 8.]], shape=(2, 2), dtype=float64)
tf.Tensor(
[[2. 2.]
 [2. 2.]], shape=(2, 2), dtype=float64)


By default, variables are watched automatically, so you don't need to manually watch them:



In [14]:
# Rebind `a` to a Variable built from its current value; no explicit
# `tape.watch` call is made in this cell.
a = tf.Variable(a)

# Record the forward computation only.
with tf.GradientTape() as tape:
    c = tf.square(a) + tf.square(b)

# Differentiate `c` with respect to `a` after the tape context has closed.
dc_da = tape.gradient(c, a)
print(dc_da)

tf.Tensor(
[[2. 4.]
 [6. 8.]], shape=(2, 2), dtype=float64)


Note that you can compute higher-order derivatives by nesting tapes:


In [15]:
# Second-order derivative via two nested tapes.
with tf.GradientTape() as outer_tape:
    with tf.GradientTape() as tape:
        c = (tf.square(a) + tf.square(b))
        # First derivative dc/da, taken from the inner tape while still
        # inside the outer tape's context.
        dc_da = tape.gradient(c, a)
    # Differentiate the first derivative with respect to `a` again.
    d2c_da2 = outer_tape.gradient(dc_da, a)
    print(d2c_da2)

tf.Tensor(
[[2. 2.]
 [2. 2.]], shape=(2, 2), dtype=float64)


# Keras layers

The Layer class is the fundamental abstraction in Keras. A Layer encapsulates a state (weights) and some computation (defined in the call method).<br>
A simple layer looks like this. The **self.add_weight()** method gives you a shortcut for creating weights:

In [16]:
class Linear(keras.layers.Layer):
    """Affine layer computing ``inputs @ w + b``.

    Both weights are created eagerly in the constructor, so the input width
    has to be supplied up front.

    Args:
        units: Number of output features (columns of ``w``, length of ``b``).
        input_dim: Number of input features (rows of ``w``).
    """

    def __init__(self, units=32, input_dim=32):
        super().__init__()
        kernel_shape = (input_dim, units)
        bias_shape = (units,)
        # Kernel starts from random normal values, bias from zeros.
        self.w = self.add_weight(
            shape=kernel_shape, initializer="random_normal", trainable=True
        )
        self.b = self.add_weight(
            shape=bias_shape, initializer="zeros", trainable=True
        )

    def call(self, inputs):
        """Return the matrix product of ``inputs`` and ``w``, plus ``b``."""
        projected = tf.matmul(inputs, self.w)
        return projected + self.b

You would use a Layer instance much like a Python function:


In [17]:
# Build a layer with 2 input features and 4 output units.
linear_layer = Linear(units=4, input_dim=2)

# A layer instance is callable like a function: apply it to a batch of ones.
batch = tf.ones((2, 2))
y = linear_layer(batch)
assert y.shape == (2, 4)

# Inspect the layer state: kernel, bias, the full weight list, the first
# entry of that list, and the number of tracked weights.
print(f"los pesos w :{linear_layer.w}")
print(f"los bias b :{linear_layer.b.numpy()}")
print(f" la lista on los w y b: {linear_layer.weights}")
print(f" ahora solo pesos : {linear_layer.weights[0].numpy()}")
print(f" len (linear_layers.weights : {len(linear_layer.weights)}")

# Finally, show the layer output.
print(y)

los pesos w :<tf.Variable 'Variable:0' shape=(2, 4) dtype=float32, numpy=
array([[-0.04134304,  0.05916696,  0.05587523, -0.10565665],
       [-0.04480125,  0.04697331,  0.02400556, -0.07866778]],
      dtype=float32)>
los bias b :[0. 0. 0. 0.]
 la lista on los w y b: [<tf.Variable 'Variable:0' shape=(2, 4) dtype=float32, numpy=
array([[-0.04134304,  0.05916696,  0.05587523, -0.10565665],
       [-0.04480125,  0.04697331,  0.02400556, -0.07866778]],
      dtype=float32)>, <tf.Variable 'Variable:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>]
 ahora solo pesos : [[-0.04134304  0.05916696  0.05587523 -0.10565665]
 [-0.04480125  0.04697331  0.02400556 -0.07866778]]
 len (linear_layers.weights : 2
tf.Tensor(
[[-0.08614428  0.10614027  0.07988079 -0.18432443]
 [-0.08614428  0.10614027  0.07988079 -0.18432443]], shape=(2, 4), dtype=float32)


You have many built-in layers available, from Dense to Conv2D to LSTM to fancier ones like Conv3DTranspose or ConvLSTM2D. Be smart about reusing built-in functionality.

### Layer weight creation in build(input_shape)
It's often a good idea to defer weight creation to the build() method, so that you don't need to specify the input dim/shape at layer construction time:

In [18]:
class Linear(keras.layers.Layer):
    """Affine layer computing ``inputs @ w + b`` with deferred weight creation.

    Only the output width is given at construction time; the weights are
    created in ``build`` from the last dimension of the input shape.

    Args:
        units: Number of output features.
    """

    def __init__(self, units=32):
        super().__init__()
        self.units = units

    def build(self, input_shape):
        """Create ``w`` and ``b``, both initialised from a random normal."""
        in_features = input_shape[-1]
        self.w = self.add_weight(
            shape=(in_features, self.units),
            initializer="random_normal",
            trainable=True,
        )
        self.b = self.add_weight(
            shape=(self.units,),
            initializer="random_normal",
            trainable=True,
        )

    def call(self, inputs):
        """Return the matrix product of ``inputs`` and ``w``, plus ``b``."""
        projected = tf.matmul(inputs, self.w)
        return projected + self.b


# Create the layer; only the number of output units is specified.
linear_layer = Linear(4)

# Calling the layer also runs `build(input_shape)`, which creates the weights.
sample = tf.ones((2, 2))
y = linear_layer(sample)

# Layer gradients

In [19]:
# Load the MNIST training split, flatten each image to 784 float32 values
# divided by 255, and pair the images with their labels.
(x_train, y_train), _ = keras.datasets.mnist.load_data()
features = x_train.reshape(60000, 784).astype("float32") / 255
dataset = tf.data.Dataset.from_tensor_slices((features, y_train))

# Shuffle with a 1024-element buffer, then group into batches of 64.
dataset = dataset.shuffle(buffer_size=1024)
dataset = dataset.batch(64)

# Model: the Linear layer defined above, with 10 output units.
linear_layer = Linear(10)

# Loss: sparse categorical cross-entropy on integer targets and raw logits.
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Optimizer: plain SGD.
optimizer = keras.optimizers.SGD(learning_rate=1e-3)

# One pass over the batches of the dataset.
for step, (x, y) in enumerate(dataset):
    # Record the forward pass and the loss so they can be differentiated.
    with tf.GradientTape() as tape:
        logits = linear_layer(x)
        loss = loss_fn(y, logits)

    # Gradients of the loss with respect to the layer's trainable weights.
    weights = linear_layer.trainable_weights
    gradients = tape.gradient(loss, weights)

    # Apply one optimizer update to those weights.
    optimizer.apply_gradients(zip(gradients, weights))

    # Log the loss every 100 steps.
    if not step % 100:
        print("Step:", step, "Loss:", float(loss))

2023-08-12 09:13:09.975611: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 188160000 exceeds 10% of free system memory.
2023-08-12 09:13:10.851251: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 188160000 exceeds 10% of free system memory.


Step: 0 Loss: 2.3749279975891113
Step: 100 Loss: 2.320178270339966
Step: 200 Loss: 2.167597770690918
Step: 300 Loss: 2.0588619709014893
Step: 400 Loss: 1.9784367084503174
Step: 500 Loss: 1.9501299858093262
Step: 600 Loss: 1.8006864786148071
Step: 700 Loss: 1.8074889183044434
Step: 800 Loss: 1.7239673137664795
Step: 900 Loss: 1.5802383422851562


In [20]:
# Show the concrete Python class of the batched dataset object.
print(type(dataset))

<class 'tensorflow.python.data.ops.batch_op._BatchDataset'>


In [21]:
# Print every batch of the dataset, then the number of batches seen.
# `count` stays 0 if the dataset yields nothing.
count = 0
for count, element in enumerate(dataset, start=1):
    print(element)
print(count)

(<tf.Tensor: shape=(64, 784), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>, <tf.Tensor: shape=(64,), dtype=uint8, numpy=
array([9, 8, 7, 0, 9, 9, 5, 1, 8, 6, 7, 7, 7, 7, 5, 1, 8, 0, 6, 4, 1, 7,
       3, 3, 0, 5, 4, 6, 6, 6, 7, 6, 0, 4, 4, 3, 6, 2, 2, 6, 1, 7, 3, 5,
       1, 3, 7, 0, 3, 1, 5, 1, 7, 9, 3, 8, 6, 4, 0, 4, 2, 5, 3, 1],
      dtype=uint8)>)
(<tf.Tensor: shape=(64, 784), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>, <tf.Tensor: shape=(64,), dtype=uint8, numpy=
array([1, 4, 3, 0, 1, 6, 5, 9, 7, 5, 1, 9, 9, 2, 0, 3, 

### Trainable and non-trainable weights

In [29]:
class ComputeSum(keras.layers.Layer):
    """Layer that accumulates the sum of its inputs along axis 0.

    Args:
        input_dim: Length of the running-total vector.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Running total: zero-initialised and created with trainable=False.
        self.total = self.add_weight(
            shape=(input_dim,), initializer="zeros", trainable=False
        )

    def call(self, inputs):
        """Add the axis-0 sum of ``inputs`` to the total and return the total."""
        batch_sum = tf.reduce_sum(inputs, axis=0)
        self.total.assign_add(batch_sum)
        return self.total


# Feed the same (2, 2) tensor of ones three times; the total grows by
# [2. 2.] on every call.
my_sum = ComputeSum(2)
x = tf.ones((2, 2))
print(x)

y = my_sum(x)
print(y.numpy())  # [2. 2.]
print(type(y))

y = my_sum(x)
print(y.numpy())  # [4. 4.]

y = my_sum(x)
print(y.numpy())  # [6. 6.]

# The running total is the layer's only weight, and it is non-trainable.
assert my_sum.weights == [my_sum.total]
assert my_sum.non_trainable_weights == [my_sum.total]
assert my_sum.trainable_weights == []


tf.Tensor(
[[1. 1.]
 [1. 1.]], shape=(2, 2), dtype=float32)
[2. 2.]
<class 'tensorflow.python.ops.resource_variable_ops.ResourceVariable'>
[4. 4.]
[6. 6.]


# Layers that own layers

In [37]:
# Let's reuse the Linear class
# with a `build` method that we defined above.


class MLP(keras.layers.Layer):
    """Three stacked Linear layers (6, 5 and 2 units) with ReLU in between."""

    def __init__(self):
        super().__init__()
        # Sub-layers stored as attributes; the final layer has 2 units.
        self.linear_1 = Linear(6)
        self.linear_2 = Linear(5)
        self.linear_3 = Linear(2)

    def call(self, inputs):
        """Apply the three layers in order; ReLU follows the first two only."""
        hidden = tf.nn.relu(self.linear_1(inputs))
        hidden = tf.nn.relu(self.linear_2(hidden))
        return self.linear_3(hidden)


mlp = MLP()

# The weights do not exist until the `mlp` object is called for the first time.
sample = tf.ones(shape=(3, 6))
y = mlp(sample)

# Sub-layer weights are tracked recursively: 3 layers x (w, b) = 6 weights.
assert len(mlp.weights) == 6

# Output, a separator line, then the full weight list.
print(y, '###'*10, mlp.weights, sep="\n")

tf.Tensor(
[[-0.00039482  0.0097865 ]
 [-0.00039482  0.0097865 ]
 [-0.00039482  0.0097865 ]], shape=(3, 2), dtype=float32)
##############################
[<tf.Variable 'mlp_5/linear_18/Variable:0' shape=(6, 6) dtype=float32, numpy=
array([[-0.03285932, -0.0495584 ,  0.0432191 ,  0.00834198, -0.02634824,
        -0.05585678],
       [ 0.07838593,  0.02674093,  0.04078542, -0.03921499,  0.05484057,
        -0.03196099],
       [-0.07155383,  0.04820278,  0.04064392,  0.12064433,  0.01601767,
         0.12765495],
       [-0.00152788,  0.0629672 ,  0.02415638, -0.01292972, -0.09311235,
         0.01047353],
       [-0.01790203,  0.0170336 ,  0.01180858, -0.01910472,  0.03926925,
        -0.09911089],
       [ 0.00639297,  0.06985662,  0.0167389 , -0.08347644, -0.04789467,
         0.01613125]], dtype=float32)>, <tf.Variable 'mlp_5/linear_18/Variable:0' shape=(6,) dtype=float32, numpy=
array([-0.03976129,  0.00555886, -0.02352729,  0.03478118,  0.07411997,
        0.00294945], dtype=float3

# The Functional API for model-building
To build deep learning models, you don't have to use object-oriented programming all the time. All layers we've seen so far can also be composed functionally, like this (we call it the "Functional API"):

