### Load libraries

In [1]:
# Setup
import tensorflow as tf
import keras

In [2]:
print(tf.__version__)
print(keras.__version__)

2.13.0
2.13.1


### Tensors

In [3]:
# Constant
x = tf.constant([[5, 2], [1, 3]])
print(x)
print(x.dtype)
print(x.shape)

tf.Tensor(
[[5 2]
 [1 3]], shape=(2, 2), dtype=int32)
<dtype: 'int32'>
(2, 2)


In [4]:
print(tf.zeros( shape=(2, 3) ))
print(tf.ones( shape=(2, 3) ))

tf.Tensor(
[[0. 0. 0.]
 [0. 0. 0.]], shape=(2, 3), dtype=float32)
tf.Tensor(
[[1. 1. 1.]
 [1. 1. 1.]], shape=(2, 3), dtype=float32)


In [5]:
# Random tensors
x = tf.random.normal(shape=(2, 2), mean=0.0, stddev=1.0)

y = tf.random.uniform(shape=(2, 2), minval=0, maxval=10, dtype="int32")
print(x)
print(y)

tf.Tensor(
[[ 0.40533835 -0.38241544]
 [ 0.2789592  -0.22413269]], shape=(2, 2), dtype=float32)
tf.Tensor(
[[0 7]
 [4 9]], shape=(2, 2), dtype=int32)


### Variables

In [6]:
init_val = tf.random.normal(shape=(2, 2))
a = tf.Variable(init_val)
print(a)

<tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[ 1.2160999 , -0.46850505],
       [ 0.7336236 , -2.635396  ]], dtype=float32)>


In [7]:
new_value = tf.random.normal(shape=(2, 2))
a.assign(new_value)

print(a)

added_value = tf.ones(shape=(2, 2))
a.assign_add(added_value)
print(a)

<tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[-1.4831139 , -0.91129625],
       [ 0.7267284 ,  0.1639861 ]], dtype=float32)>
<tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[-0.48311388,  0.08870375],
       [ 1.7267284 ,  1.1639861 ]], dtype=float32)>


### Math

In [8]:
A = tf.random.normal(shape=(2, 2))
B = tf.random.normal(shape=(2, 2))
C = A + B
print(C)

C = tf.square(A)
print(C)

C = tf.exp(A)
print(C)

tf.Tensor(
[[ 0.78119344  1.535048  ]
 [ 1.4116659  -0.01090778]], shape=(2, 2), dtype=float32)
tf.Tensor(
[[0.13829057 0.3618697 ]
 [5.230093   0.01428382]], shape=(2, 2), dtype=float32)
tf.Tensor(
[[1.4504508  1.8249563 ]
 [9.844763   0.88735074]], shape=(2, 2), dtype=float32)


### Gradients

In [9]:
a = tf.random.normal(shape=(2, 2))
b = tf.random.normal(shape=(2, 2))

with tf.GradientTape() as tape:
    # tape.watch(a) # recording the operations applied to a
    c = tf.sqrt(tf.square(a) + tf.square(b))
    dc_da = tape.gradient(c, a)
    print(dc_da)


None


In [10]:
# Higher-order derivatives
a = tf.random.normal(shape=(2, 2))
b = tf.random.normal(shape=(2, 2))

with tf.GradientTape() as outer_tape:
    with tf.GradientTape() as tape:
        tape.watch(a)
        c = tf.sqrt((tf.square(a) + tf.square(b)))
        print(f"c: {c}")
        dc_da = tape.gradient(c, a)
        print(f"dc_da : {dc_da}")
    d2c_da2 = outer_tape.gradient(dc_da, a)
    print(d2c_da2)

c: [[1.1067553 1.258142 ]
 [2.0347679 1.459056 ]]
dc_da : [[-0.29737923 -0.80857784]
 [-0.9786953  -0.84093577]]
None


### Keras Layer

In [11]:
class Linear(keras.layers.Layer):
    """y = w @ x + b"""

    def __init__(self, units=32):
        super().__init__()
        self.units = units

    def build(self, input_shape):
        units = self.units
        self.w = self.add_weight(
            shape=(input_shape[-1], units), initializer="random_normal", trainable=True
        )
        self.b = self.add_weight(shape=(units,), initializer="zeros", trainable=True)

    def call(self, inputs):
        return tf.matmul(inputs, self.w) + self.b

In [12]:
# Instantiate linear layer
linear_layer = Linear(units=4)

# The layer can be treated as a function
x = tf.random.normal(shape=(2, 2))
y = linear_layer(x)

print(f"x : {x}")
print(f"y : {y}")

x : [[ 1.1149738  -0.0498697 ]
 [-0.30876592  1.2861652 ]]
y : [[-0.03949983 -0.01233765 -0.10583743 -0.01148053]
 [ 0.07072432 -0.09807748  0.02978218 -0.065705  ]]


### Layer Gradient

In [13]:
# Prepare dataset
data_path = "/Users/mghifary/Work/Code/AI/data/mnist.npz"
(x_train, y_train), _ = keras.datasets.mnist.load_data(data_path)

In [14]:
dataset = tf.data.Dataset.from_tensor_slices(
    (x_train.reshape(60000, 784).astype("float32")/255., y_train)
)
dataset = dataset.shuffle(buffer_size=1024).batch(64)

In [15]:
# Initiate a linear layer with 10 units
linear_layer = Linear(10)

# Instantiate a logistic loss function that expects integer targets.
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Instantiate an optimizer
optimizer = keras.optimizers.legacy.SGD(learning_rate=1e-3) # faster on Apple M1/M2 chip
# optimizer = keras.optimizers.SGD(learning_rate=1e-3)

In [16]:
import time as timer

In [17]:
# Iterate over the batches of the dataset
for step, (x, y) in enumerate(dataset):
    start_t = timer.time()
    with tf.GradientTape() as tape:
        # Forward pass.
        logits = linear_layer(x)

        # Loss value for this batch
        loss = loss_fn(y, logits)

    # Get gradients of the loss wrt the weights..
    gradients = tape.gradient(loss, linear_layer.trainable_weights)

    # Update the weights of our linear layer.
    optimizer.apply_gradients(
        zip(gradients, linear_layer.trainable_weights)
    )
    elapsed_t = timer.time() - start_t

    # Logging
    if step % 100 == 0:
        print(f"Step: {step}, Loss: {float(loss)}, Elapsed time: {elapsed_t}")

Step: 0, Loss: 2.382845878601074, Elapsed time: 0.01374506950378418
Step: 100, Loss: 2.2317450046539307, Elapsed time: 0.0023469924926757812
Step: 200, Loss: 2.181346893310547, Elapsed time: 0.0017001628875732422
Step: 300, Loss: 2.0270278453826904, Elapsed time: 0.0017709732055664062
Step: 400, Loss: 1.9355878829956055, Elapsed time: 0.0019321441650390625
Step: 500, Loss: 1.957491397857666, Elapsed time: 0.0019001960754394531
Step: 600, Loss: 1.759414792060852, Elapsed time: 0.0016651153564453125
Step: 700, Loss: 1.7332817316055298, Elapsed time: 0.0017499923706054688
Step: 800, Loss: 1.754225492477417, Elapsed time: 0.0019469261169433594
Step: 900, Loss: 1.5429472923278809, Elapsed time: 0.0017049312591552734


### Layers that own layers

In [20]:
class MLP(keras.layers.Layer):
    """Simple stack of linear layers"""

    def __init__(self):
        super().__init__()
        self.linear_1 = Linear(32)
        self.linear_2 = Linear(32)
        self.linear_3 = Linear(10)

    def call(self, inputs):
        x = self.linear_1(inputs)
        x = tf.nn.relu(x)
        x = self.linear_2(x)
        x = tf.nn.relu(x)
        return self.linear_3(x)
    
model = MLP()

y = model(tf.ones(shape=(3, 64)))
print(f"y : {y}")


assert len(model.weights) == 6

y : [[ 0.0024752  -0.00264416 -0.01154168 -0.00092896 -0.01484032 -0.0029017
   0.00040229  0.01672307 -0.0073487   0.00290369]
 [ 0.0024752  -0.00264416 -0.01154168 -0.00092896 -0.01484032 -0.0029017
   0.00040229  0.01672307 -0.0073487   0.00290369]
 [ 0.0024752  -0.00264416 -0.01154168 -0.00092896 -0.01484032 -0.0029017
   0.00040229  0.01672307 -0.0073487   0.00290369]]


In [21]:
mlp = keras.Sequential(
    [
        keras.layers.Dense(32, activation=tf.nn.relu),
        keras.layers.Dense(32, activation=tf.nn.relu),
        keras.layers.Dense(10),
    ]
)

### Tracking losses created by layers

Layers can create losses during the forward pass via the `add_loss()` method.

In [26]:
class ActivityRegularization(keras.layers.Layer):
    """
    Layer that creates an activity sparsity regularization loss.
    """

    def __init__(self, rate=1e-2):
        super().__init__()
        self.rate = rate

    def call(self, inputs):
        self.add_loss(self.rate * tf.reduce_sum(inputs))
        return inputs

In [28]:
class SparseMLP(keras.layers.Layer):
    def __init__(self):
        super().__init__()
        self.linear_1 = Linear(32)
        self.regularization = ActivityRegularization(1e-2)
        self.linear_3 = Linear(10)
    
    def call(self, inputs):
        x = self.linear_1(inputs)
        x = tf.nn.relu(x)
        x = self.regularization(x)
        return self.linear_3(x)
    
mlp = SparseMLP()
y = mlp(tf.ones((10, 10)))
print(f"Loss: {mlp.losses}")
print(f"y : {y}")   

Loss: [<tf.Tensor: shape=(), dtype=float32, numpy=0.1750945>]
y : [[-0.04428886 -0.02245777  0.06045137  0.02478068 -0.04105251 -0.04844959
  -0.01363078 -0.0024152  -0.01546797  0.05749356]
 [-0.04428886 -0.02245777  0.06045137  0.02478068 -0.04105251 -0.04844959
  -0.01363078 -0.0024152  -0.01546797  0.05749356]
 [-0.04428886 -0.02245777  0.06045137  0.02478068 -0.04105251 -0.04844959
  -0.01363078 -0.0024152  -0.01546797  0.05749356]
 [-0.04428886 -0.02245777  0.06045137  0.02478068 -0.04105251 -0.04844959
  -0.01363078 -0.0024152  -0.01546797  0.05749356]
 [-0.04428886 -0.02245777  0.06045137  0.02478068 -0.04105251 -0.04844959
  -0.01363078 -0.0024152  -0.01546797  0.05749356]
 [-0.04428886 -0.02245777  0.06045137  0.02478068 -0.04105251 -0.04844959
  -0.01363078 -0.0024152  -0.01546797  0.05749356]
 [-0.04428886 -0.02245777  0.06045137  0.02478068 -0.04105251 -0.04844959
  -0.01363078 -0.0024152  -0.01546797  0.05749356]
 [-0.04428886 -0.02245777  0.06045137  0.02478068 -0.041052

In [30]:
(x_train, y_train), _ = keras.datasets.mnist.load_data(data_path)
dataset = tf.data.Dataset.from_tensor_slices(
    (x_train.reshape(60000, 784).astype("float32")/255., y_train)
)
dataset = dataset.shuffle(buffer_size=1024).batch(64)

mlp = SparseMLP()

# loss and optimizer
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = keras.optimizers.legacy.SGD(learning_rate=1e-3)

for step, (x, y) in enumerate(dataset):
    with tf.GradientTape() as tape:
        logits = mlp(x)
        loss = loss_fn(y, logits)
        loss += sum(mlp.losses)

        gradients = tape.gradient(loss, mlp.trainable_weights)

    optimizer.apply_gradients(zip(gradients, mlp.trainable_weights))

    if step % 100 == 0:
        print(f"Step: {step}, Loss: {float(loss)}")

Step: 0, Loss: 5.947953224182129
Step: 100, Loss: 2.599604606628418
Step: 200, Loss: 2.421022891998291
Step: 300, Loss: 2.3658080101013184
Step: 400, Loss: 2.349546194076538
Step: 500, Loss: 2.334782600402832
Step: 600, Loss: 2.314484119415283
Step: 700, Loss: 2.3390402793884277
Step: 800, Loss: 2.319648504257202
Step: 900, Loss: 2.315389394760132


### Keeping track of training metrics

In [33]:
# Instantiate a metric object
accuracy = keras.metrics.SparseCategoricalAccuracy()

# Prepare the model, loss, and optimizer.
model = keras.Sequential(
    [
        keras.layers.Dense(32, activation=tf.nn.relu),
        keras.layers.Dense(32, activation=tf.nn.relu),
        keras.layers.Dense(10),
    ]
)

loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = keras.optimizers.legacy.Adam(learning_rate=1e-3)

for epoch in range(2):
    # Iterate over the batches of a dataset.
    for step, (x, y) in enumerate(dataset):
        with tf.GradientTape() as tape:
            logits = model(x)
            loss_value = loss_fn(y, logits)

            # update the state of the accuracy metric
            accuracy.update_state(y, logits)

            # update the weights of the model to minimize the loss value
            gradients = tape.gradient(loss_value, model.trainable_weights)
            optimizer.apply_gradients(zip(gradients, model.trainable_weights))

        if step % 200 == 0:
            print(f"Epoch: {epoch}, Step: {step}, Loss: {float(loss_value)}")
            print(f"Total running accuracy so far: {accuracy.result()}")

    # Reset the metric's state at the end of an epoch
    accuracy.reset_states()


Epoch: 0, Step: 0, Loss: 2.317469835281372
Total running accuracy so far: 0.078125
Epoch: 0, Step: 200, Loss: 0.3991779386997223
Total running accuracy so far: 0.7653917670249939
Epoch: 0, Step: 400, Loss: 0.3472175598144531
Total running accuracy so far: 0.8283198475837708
Epoch: 0, Step: 600, Loss: 0.213937908411026
Total running accuracy so far: 0.8563851714134216
Epoch: 0, Step: 800, Loss: 0.17006748914718628
Total running accuracy so far: 0.8722495436668396
Epoch: 1, Step: 0, Loss: 0.3476253151893616
Total running accuracy so far: 0.875
Epoch: 1, Step: 200, Loss: 0.13529257476329803
Total running accuracy so far: 0.9399875402450562
Epoch: 1, Step: 400, Loss: 0.22766488790512085
Total running accuracy so far: 0.9413185715675354
Epoch: 1, Step: 600, Loss: 0.14776422083377838
Total running accuracy so far: 0.941841721534729
Epoch: 1, Step: 800, Loss: 0.2048586905002594
Total running accuracy so far: 0.9422401785850525


### Compiled functions

Running eagerly is great for debugging, but we'll get better performance by compiling the computation into static graphs. Static graphs are a researcher's best friends. We can compile any function by wrapping it in a `tf.function()` decorator.

In [34]:
# Prepare our model, loss, and optimizer.
model = keras.Sequential(
    [
        keras.layers.Dense(32, activation=tf.nn.relu),
        keras.layers.Dense(32, activation=tf.nn.relu),
        keras.layers.Dense(10),
    ]
)

loss_fun = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = keras.optimizers.legacy.Adam(learning_rate=1e-3)

@tf.function # make it faster!
def train_on_batch(x, y):
    with tf.GradientTape() as tape:
        logits = model(x)
        loss = loss_fn(y, logits)
        gradients = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(gradients, model.trainable_weights))
    return loss

# Prepare a dataset
(x_train, y_train), _ = keras.datasets.mnist.load_data(data_path)
dataset = tf.data.Dataset.from_tensor_slices(
    (x_train.reshape(60000, 784).astype("float32")/255., y_train)
)

dataset = dataset.shuffle(buffer_size=1024).batch(64)

for step, (x, y) in enumerate(dataset):
    loss = train_on_batch(x, y)
    if step % 100 == 0:
        print(f"Step: {step}, Loss: {float(loss)}")

Step: 0, Loss: 2.3639283180236816
Step: 100, Loss: 0.4514067471027374
Step: 200, Loss: 0.3441607654094696
Step: 300, Loss: 0.6879332065582275
Step: 400, Loss: 0.42147642374038696
Step: 500, Loss: 0.2743704319000244
Step: 600, Loss: 0.23235085606575012
Step: 700, Loss: 0.14530634880065918
Step: 800, Loss: 0.24023517966270447
Step: 900, Loss: 0.24263697862625122
