## Model Investigation with Tensorboard

Graph, Loss, Accuracy & Weights visualization using Tensorboard and TensorFlow v2. This example is using the MNIST database of handwritten digits (http://yann.lecun.com/exdb/mnist/).

In [1]:
import tensorflow as tf
import numpy as np
from tensorboard.plugins.hparams import api as hp


In [2]:
# Path to save logs into.
version = "v8"
logs_path = f"tensorboard/{version}"


# MNIST dataset parameters.
num_classes = 10  # total classes (0-9 digits).
num_features = 784  # data features (img shape: 28*28).

# Training parameters.
learning_rate = 0.01
training_steps = 3000
batch_size = 256
display_step = 100

# Network parameters.
n_hidden_1 = 128  # 1st layer number of neurons.
n_hidden_2 = 128  # 2nd layer number of neurons.
optimizer = "rmsprop"

# Create a Summary Writer to log the metrics to Tensorboad.
summary_writer = tf.summary.create_file_writer(logs_path)
with summary_writer.as_default():
    hp_n_hidden_1 = hp.HParam("n_hidden_1")
    hp_n_hidden_2 = hp.HParam("n_hidden_2")
    hp_learning_rate = hp.HParam("learning_rate")
    hp_optimizer = hp.HParam("optimizer")
    hp.hparams_config(
        hparams=[hp_n_hidden_1, hp_n_hidden_2, hp_learning_rate, hp_optimizer],
        metrics=[hp.Metric("test-accuracy", display_name="Accuracy")],
    )

    hp.hparams(
        {
            hp_n_hidden_1: n_hidden_1,
            hp_n_hidden_2: n_hidden_2,
            hp_learning_rate: learning_rate,
            hp_optimizer: optimizer,
        },
        trial_id=version,
    )  # record the values used in this trial


2022-01-14 13:35:30.544132: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-14 13:35:30.550289: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2022-01-14 13:35:30.550306: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-01-14 13:35:30.550846: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN

In [3]:
# Prepare MNIST data.
from tensorflow.keras.datasets import mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data("dataset/mnist.npz")
# Convert to float32.
x_train, x_test = np.array(x_train, np.float32), np.array(x_test, np.float32)
# Flatten images to 1-D vector of 784 features (28*28).
x_train, x_test = x_train.reshape([-1, num_features]), x_test.reshape(
    [-1, num_features]
)
# Normalize images value from [0, 255] to [0, 1].
x_train, x_test = x_train / 255.0, x_test / 255.0


In [4]:
# Use tf.data API to shuffle and batch data.
train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_data = train_data.repeat().shuffle(5000).batch(batch_size).prefetch(1)


test_data = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_data = test_data.batch(batch_size).prefetch(1)


In [5]:
# Store layers weight & bias

# A random value generator to initialize weights.
random_normal = tf.initializers.RandomNormal()

weights = {
    "h1_weights": tf.Variable(
        random_normal([num_features, n_hidden_1]), name="h1_weights"
    ),
    "h2_weights": tf.Variable(
        random_normal([n_hidden_1, n_hidden_2]), name="h2_weights"
    ),
    "logits_weights": tf.Variable(
        random_normal([n_hidden_2, num_classes]), name="logits_weights"
    ),
}
biases = {
    "h1_bias": tf.Variable(tf.zeros([n_hidden_1]), name="h1_bias"),
    "h2_bias": tf.Variable(tf.zeros([n_hidden_2]), name="h2_bias"),
    "logits_bias": tf.Variable(tf.zeros([num_classes]), name="logits_bias"),
}


In [6]:
# Construct model and encapsulating all ops into scopes, making
# Tensorboard's Graph visualization more convenient.


@tf.function
def neural_net(x):
    with tf.name_scope("Model"):
        with tf.name_scope("HiddenLayer1"):
            # Hidden fully connected layer with 128 neurons.
            layer_1 = tf.add(tf.matmul(x, weights["h1_weights"]), biases["h1_bias"])
            # Apply sigmoid to layer_1 output for non-linearity.
            layer_1 = tf.nn.sigmoid(layer_1)
        with tf.name_scope("HiddenLayer2"):
            # Hidden fully connected layer with 256 neurons.
            layer_2 = tf.add(
                tf.matmul(layer_1, weights["h2_weights"]), biases["h2_bias"]
            )
            # Apply sigmoid to layer_2 output for non-linearity.
            layer_2 = tf.nn.sigmoid(layer_2)
        with tf.name_scope("LogitsLayer"):
            # Output fully connected layer with a neuron for each class.
            out_layer = (
                tf.matmul(layer_2, weights["logits_weights"]) + biases["logits_bias"]
            )
            # Apply softmax to normalize the logits to a probability distribution.
            out_layer = tf.nn.softmax(out_layer)
    return out_layer


In [7]:
# Cross-Entropy loss function.
def cross_entropy(y_pred, y_true):
    with tf.name_scope("CrossEntropyLoss"):
        # Encode label to a one hot vector.
        y_true = tf.one_hot(y_true, depth=num_classes)
        # Clip prediction values to avoid log(0) error.
        y_pred = tf.clip_by_value(y_pred, 1e-9, 1.0)
        # Compute cross-entropy.
        return tf.reduce_mean(-tf.reduce_sum(y_true * tf.math.log(y_pred)))


# Accuracy metric.
def accuracy(y_pred, y_true):
    with tf.name_scope("Accuracy"):
        # Predicted class is the index of highest score in prediction vector (i.e. argmax).
        correct_prediction = tf.equal(tf.argmax(y_pred, 1), tf.cast(y_true, tf.int64))
        return tf.reduce_mean(tf.cast(correct_prediction, tf.float32), axis=-1)


# Stochastic gradient descent optimizer.
with tf.name_scope("Optimizer"):
    if optimizer == "sgd":
        optimizer = tf.optimizers.SGD(learning_rate)
    elif optimizer == "adam":
        optimizer = tf.optimizers.Adam(learning_rate)
    elif optimizer == "rmsprop":
        optimizer = tf.optimizers.RMSprop(learning_rate)
    elif optimizer == "adagrad":
        optimizer = tf.optimizers.Adagrad(learning_rate)
    elif optimizer == "adadelta":
        optimizer = tf.optimizers.Adadelta(learning_rate)
    else:
        raise


In [8]:
# Optimization process.
def run_optimization(x, y):
    # Wrap computation inside a GradientTape for automatic differentiation.
    with tf.GradientTape() as g:
        pred = neural_net(x)
        loss = cross_entropy(pred, y)

    # Variables to update, i.e. trainable variables.
    trainable_variables = list(weights.values()) + list(biases.values())

    # Compute gradients.
    gradients = g.gradient(loss, trainable_variables)

    # Update weights/biases following gradients.
    optimizer.apply_gradients(zip(gradients, trainable_variables))


In [9]:
# Visualize weights & biases as histogram in Tensorboard.
def summarize_weights(step):
    for w in weights:
        tf.summary.histogram(w.replace("_", "/"), weights[w], step=step)
    for b in biases:
        tf.summary.histogram(b.replace("_", "/"), biases[b], step=step)


In [10]:
# Run training for the given number of steps.
for step, (batch_x, batch_y) in enumerate(train_data.take(training_steps), 1):

    # Start to trace the computation graph. The computation graph remains
    # the same at each step, so we just need to export it once.
    if step == 1:
        tf.summary.trace_on(graph=True, profiler=True)

    # Run the optimization (computation graph).
    run_optimization(batch_x, batch_y)

    # Export the computation graph to tensorboard after the first
    # computation step was performed.
    if step == 1:
        with summary_writer.as_default():
            tf.summary.trace_export(name="trace", step=0, profiler_outdir=logs_path)

    if step % display_step == 0:
        pred = neural_net(batch_x)
        loss = cross_entropy(pred, batch_y)
        acc = accuracy(pred, batch_y)
        print("step: %i, loss: %f, accuracy: %f" % (step, loss, acc))

        # Write loss/acc metrics & weights to Tensorboard every few steps,
        # to avoid storing too much data.
        with summary_writer.as_default():
            tf.summary.scalar("loss", loss, step=step)
            tf.summary.scalar("accuracy", acc, step=step)
            summarize_weights(step)

        # evaluate
        list_test_acc = []
        list_test_loss = []
        for test_batch_x, test_batch_y in test_data:
            test_pred = neural_net(test_batch_x)
            test_loss = cross_entropy(test_pred, test_batch_y)
            test_acc = accuracy(test_pred, test_batch_y)
            list_test_acc.append(test_acc)
            list_test_loss.append(test_loss)

        avg_test_loss = np.mean(list_test_loss)
        avg_test_acc = np.mean(list_test_acc)
        print(
            "step: %i, test_loss: %f, test_accuracy: %f"
            % (step, avg_test_loss, avg_test_acc)
        )

        with summary_writer.as_default():
            tf.summary.scalar("test-loss", avg_test_loss, step=step)
            tf.summary.scalar("test-accuracy", avg_test_acc, step=step)


Instructions for updating:
use `tf.profiler.experimental.start` instead.
Instructions for updating:
use `tf.profiler.experimental.stop` instead.


2022-01-14 13:35:31.133238: I tensorflow/core/profiler/lib/profiler_session.cc:110] Profiler session initializing.
2022-01-14 13:35:31.133289: I tensorflow/core/profiler/lib/profiler_session.cc:125] Profiler session started.
2022-01-14 13:35:31.133327: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1630] Profiler found 1 GPUs
2022-01-14 13:35:31.133620: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcupti.so.11.2'; dlerror: libcupti.so.11.2: cannot open shared object file: No such file or directory
2022-01-14 13:35:31.319218: I tensorflow/core/profiler/lib/profiler_session.cc:67] Profiler session collecting data.
2022-01-14 13:35:31.319334: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1764] CUPTI activity buffer flushed


Instructions for updating:
`tf.python.eager.profiler` has deprecated, use `tf.profiler` instead.
Instructions for updating:
`tf.python.eager.profiler` has deprecated, use `tf.profiler` instead.


2022-01-14 13:35:31.338589: I tensorflow/core/profiler/internal/gpu/cupti_collector.cc:526]  GpuTracer has collected 0 callback api events and 0 activity events. 
2022-01-14 13:35:31.339691: I tensorflow/core/profiler/lib/profiler_session.cc:143] Profiler session tear down.


step: 100, loss: 51.271515, accuracy: 0.925781
step: 100, test_loss: 74.396393, test_accuracy: 0.909375
step: 200, loss: 25.229937, accuracy: 0.972656
step: 200, test_loss: 45.147236, test_accuracy: 0.947363
step: 300, loss: 9.531292, accuracy: 0.996094
step: 300, test_loss: 38.262829, test_accuracy: 0.954297
step: 400, loss: 18.181026, accuracy: 0.984375
step: 400, test_loss: 34.447243, test_accuracy: 0.956348
step: 500, loss: 9.909430, accuracy: 0.992188
step: 500, test_loss: 29.955433, test_accuracy: 0.965137
step: 600, loss: 6.605366, accuracy: 0.996094
step: 600, test_loss: 26.608892, test_accuracy: 0.968750
step: 700, loss: 5.060273, accuracy: 0.996094
step: 700, test_loss: 25.277161, test_accuracy: 0.970215
step: 800, loss: 5.513615, accuracy: 0.996094
step: 800, test_loss: 23.089401, test_accuracy: 0.973145
step: 900, loss: 5.496675, accuracy: 0.996094
step: 900, test_loss: 20.757366, test_accuracy: 0.975879
step: 1000, loss: 1.679301, accuracy: 1.000000
step: 1000, test_loss: 

### Run Tensorboard

To run tensorboard, run the following command in your terminal:

```
tensorboard --logdir=/tmp/tensorflow_logs
```

And then connect your web browser to: [http://localhost:6006](http://localhost:6006)
