In [1]:
import tensorflow as tf
import time
import numpy as np

In [2]:
# Reset Keras' global state so the benchmark starts from a clean slate.
tf.keras.backend.clear_session()

# ------------ 1. Build some random data ------------
N = 1_000   # how many timed calls each variant gets
d = 1024    # side length of the square operand matrices

rng = np.random.default_rng(42)  # fixed seed -> identical operands on every run

# The generator is consumed left to right, so A is drawn from the seeded
# stream before B, exactly as two consecutive assignments would do.
A, B = (
    tf.constant(rng.standard_normal((d, d), dtype=np.float32))
    for _ in range(2)
)

# ------------ 2. Define three versions of the same op ------------

def matmul_eager(a, b):
    """Baseline variant: multiply ``a`` by ``b`` with a plain ``tf.matmul`` call.

    No ``tf.function`` wrapper is applied, so the op runs eagerly.
    """
    product = tf.matmul(a, b)
    return product

@tf.function  # traced into a TensorFlow graph (no XLA compilation requested)
def matmul_graph(a, b):
    """Graph variant: the same ``tf.matmul(a, b)`` product, wrapped in ``tf.function``."""
    product = tf.matmul(a, b)
    return product

@tf.function(jit_compile=True)  # graph function with XLA compilation requested
def matmul_xla(a, b):
    """XLA variant: the same ``tf.matmul(a, b)`` product, with ``jit_compile=True``.

    NOTE(review): the body is a single matmul, so there is presumably little
    for XLA to fuse here -- keep that in mind when reading the timings.
    """
    product = tf.matmul(a, b)
    return product

# ------------ 3. Warm-up (always do this!) ------------
# Call every variant once, untimed, before the measurements start, so that
# first-call work (presumably tracing / compilation for the decorated
# variants) is not charged to the benchmark.
for variant in (matmul_eager, matmul_graph, matmul_xla):
    variant(A, B)

# ------------ 4. Timing helper ------------
def time_it(fn, name, n_iters=None, args=None):
    """Time repeated calls of ``fn`` and print the mean wall-clock cost per call.

    Only the calls themselves (plus one final host read-back, see below) sit
    inside the timed window.

    Args:
        fn: Callable to benchmark; invoked as ``fn(*args)``.
        name: Label printed in front of the measurement.
        n_iters: Number of timed calls. ``None`` (default) uses the
            notebook-level ``N``.
        args: Tuple of positional arguments passed to ``fn``. ``None``
            (default) uses the notebook-level operands ``(A, B)``.

    Returns:
        None. The measurement is reported with ``print``.

    Raises:
        ValueError: If ``n_iters`` is smaller than 1.
    """
    if n_iters is None:
        n_iters = N
    if args is None:
        args = (A, B)
    if n_iters < 1:
        raise ValueError(f"n_iters must be >= 1, got {n_iters}")

    result = None
    start = time.perf_counter()
    for _ in range(n_iters):
        result = fn(*args)
    # Read the last result back to the host before stopping the clock.  On an
    # accelerator the kernels may still be queued when the Python call
    # returns; without this read-back the timer could stop early and
    # under-report.  Objects without a ``numpy`` method are left alone.
    if hasattr(result, "numpy"):
        result.numpy()
    elapsed = time.perf_counter() - start

    # The earlier version called tf.keras.backend.clear_session() before the
    # timer was stopped, so the cost of resetting Keras state was counted as
    # part of the op.  The loop keeps only the latest result alive, and the
    # cell already clears the session once at its top, so the call is dropped.
    print(f"{name:<18}: {elapsed*1e3/n_iters:7.2f} ms / call")

# ------------ 5. Run the benchmark ------------
# Time every variant with the same helper, in a fixed order:
# eager first, then the graph function, then the XLA-compiled one.
for bench_fn, label in (
    (matmul_eager, "Eager"),
    (matmul_graph, "Graph (@tf.function)"),
    (matmul_xla, "Graph + XLA"),
):
    time_it(bench_fn, label)


Eager             :    5.64 ms / call
Graph (@tf.function):    6.31 ms / call
Graph + XLA       :    6.43 ms / call
