# PyTorch benchmark, profiler and JIT compiler

In this notebook we will see a simple example of how to use the PyTorch benchmark and profiler, and discuss the impact of Just-in-Time compilation on the performance of custom functions.
We will use the mean squared error loss for its simplicity.

In [1]:
%load_ext autoreload
%autoreload 2

import os
os.environ['OMP_NUM_THREADS'] = '12' # for Numpy and BLAS
  
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from torch.utils import data
from torch.profiler import profile, record_function, ProfilerActivity
import torch.utils.benchmark as benchmark

from tqdm.notebook import trange, tqdm

In [2]:
# Build two random arrays of identical shape to feed the MSE function
nsamples, nfeat = 20000, 500

# the generator is consumed left to right, so `x` is still drawn before `y`
x, y = (np.random.random((nsamples, nfeat)) for _ in range(2))

In [3]:
def mse(input, target):
    '''
    Mean squared error between `input` and `target`: the element-wise
    difference is squared and then averaged over all elements.
    '''
    # WARN (original author's note): keeping the difference in a named
    # temporary makes this function 10-20% slower in Numpy
    residual = input - target
    return (residual ** 2).mean()

In [None]:
# Time the MSE function on the NumPy arrays
# (-n1: one call per run, -r100: repeat for 100 runs)
%timeit -n1 -r100 mse(x, y)

## PyTorch benchmark (CPU)

In [None]:
# Copyless Tensors: wrap both NumPy arrays with torch.from_numpy
tx, ty = map(torch.from_numpy, (x, y))

# last expression of the cell, so the notebook displays the tensor dtype
tx.dtype

In [None]:
# Use 12 CPU threads in PyTorch, the same number given to OMP_NUM_THREADS above
torch.set_num_threads(12)

# Same timing settings as for the NumPy arrays, now on the CPU tensors
%timeit -n1 -r100 mse(tx, ty)

## GPU benchmarks

In [None]:
# Move both tensors to the GPU
device = 'cuda'
cx = tx.to(device)
cy = ty.to(device)

# %timeit -n1 -r100 mse(cx, cy) ## !!! Wrong way of timing GPU functions: it does not call torch.cuda.synchronize()
## See https://pytorch.org/tutorials/recipes/recipes/benchmark.html#benchmarking-with-torch-utils-benchmark-timer

In [None]:
# Time the eager GPU call with the PyTorch benchmark Timer instead of %timeit
t0 = benchmark.Timer(
    stmt='mse(x, y)',
    setup='from __main__ import mse',
    globals={'x': cx, 'y': cy})  # inside the statement, x/y name the GPU tensors

# run the statement 100 times and print the resulting measurement
print(t0.timeit(100))

In [None]:
# Profile a single GPU call of `mse`, requesting CUDA activity only
with profile(activities=[ProfilerActivity.CUDA]) as prof:
    with record_function("mse"):  # label this region "mse" in the profiler output
        mse(cx, cy)

# Summary table limited to 10 rows, sorted by total CUDA time
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
# Also write the trace to a JSON file in Chrome trace format
prof.export_chrome_trace("mse.trace.json")

In [None]:
# One row per profiler key, one column per attribute of the averaged event
df = pd.DataFrame({e.key:e.__dict__ for e in prof.key_averages()}).T
# Keep the call count and the total CPU/CUDA times, largest values first
df[['count', 'cpu_time_total', 'cuda_time_total']].sort_values(['cpu_time_total', 'cuda_time_total'], ascending=False)

# PyTorch Just-in-Time compiler

https://pytorch.org/docs/stable/jit.html

#### Scripting vs. Tracing

- `torch.jit.script` compiles the function or module into TorchScript, using a \[large\] subset of Python
- `torch.jit.trace` uses the example input to compute a fixed graph, and therefore cannot handle control flow, e.g. if statements and similar conditions

In [None]:
# Compile the plain Python `mse` function with torch.jit.script
jit_mse_scripted = torch.jit.script(mse)
print(jit_mse_scripted.code)   # the TorchScript code of the compiled function
print(jit_mse_scripted.graph)  # the corresponding graph representation

In [None]:
# Trace `mse` using the GPU tensors as example inputs
jit_mse_traced = torch.jit.trace(mse, example_inputs=(cx, cy))
# jit.trace only supports Tensors but is ideal for benchmarking
print(jit_mse_traced.graph)

In [None]:
@torch.jit.script
def jit_mse(input, target, reduce: bool = True):
    """
    Squared error between `input` and `target`, compiled with TorchScript.

    With `reduce=True` (the default) the mean over all elements is returned;
    otherwise the element-wise squared error is returned unreduced.
    """
    # `reduce` is annotated explicitly: types have to be annotated or are
    # assumed to be Tensors
    squared = (input - target) ** 2
    if reduce:
        return squared.mean()
    else:
        return squared

print(jit_mse.code)

In [None]:
# One Timer per compiled variant, all fed the same GPU tensors
# t0: traced version of `mse`
t0 = benchmark.Timer(
    stmt='jit_mse_traced(x, y)',
    setup='from __main__ import jit_mse_traced',
    globals={'x': cx, 'y': cy})
# t1: `mse` compiled with torch.jit.script
t1 = benchmark.Timer(
    stmt='jit_mse_scripted(x, y)',
    setup='from __main__ import jit_mse_scripted',
    globals={'x': cx, 'y': cy})
# t2: the function decorated with @torch.jit.script
t2 = benchmark.Timer(
    stmt='jit_mse(x, y)',
    setup='from __main__ import jit_mse',
    globals={'x': cx, 'y': cy})

# 100 runs of each statement
print(t0.timeit(100))
print(t1.timeit(100))
print(t2.timeit(100))
# torch.jit.script requires a warmup run before benchmarking

In [None]:
# Profile one GPU call of the scripted function (CUDA activity, with record_shapes enabled)
with profile(activities=[ProfilerActivity.CUDA], record_shapes=True) as prof:
    with record_function("jit_mse"):  # label this region "jit_mse" in the profiler output
        jit_mse(cx, cy)

# Summary table limited to 10 rows, sorted by total CUDA time, plus a Chrome trace file
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
prof.export_chrome_trace("jit_mse.trace.json")

# Same tabular view as before: one row per profiler key, largest totals first
df = pd.DataFrame({e.key:e.__dict__ for e in prof.key_averages()}).T
df[['count', 'cpu_time_total', 'cuda_time_total']].sort_values(['cpu_time_total', 'cuda_time_total'], ascending=False)

## Comparing with torch.nn.functional.mse_loss

In [None]:
## F.mse_loss(cx, cy)
# Sanity check: the scripted function must agree with the built-in loss.
# The absolute value matters: with a signed difference the check would also
# pass whenever jit_mse is smaller than F.mse_loss, by any amount.
assert torch.abs(jit_mse(cx, cy) - F.mse_loss(cx, cy)) < 1e-12

# Time the built-in loss on the same GPU tensors
t2 = benchmark.Timer(
    stmt='mse_loss(x, y)',
    setup='from torch.nn.functional import mse_loss',
    globals={'x': cx, 'y': cy})

print(t2.timeit(100))

In [None]:
# Profile one GPU call of the built-in loss (CUDA activity, with record_shapes enabled)
with profile(activities=[ProfilerActivity.CUDA], record_shapes=True) as prof:
    with record_function("mse_loss"):  # label this region "mse_loss" in the profiler output
        F.mse_loss(cx, cy)

# Summary table limited to 10 rows, sorted by total CUDA time, plus a Chrome trace file
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
prof.export_chrome_trace("mse_loss.trace.json")

# Same tabular view as before: one row per profiler key, largest totals first
df = pd.DataFrame({e.key:e.__dict__ for e in prof.key_averages()}).T
df[['count', 'cpu_time_total', 'cuda_time_total']].sort_values(['cpu_time_total', 'cuda_time_total'], ascending=False)

Our simple JIT'ed function is actually slightly faster than the PyTorch mse_loss with its custom CUDA kernel; however, the same is not true in the backward pass, as we can see below.

In [None]:
# Copy of the GPU input for which gradients are tracked
var = cx.clone()
var.requires_grad = True

# warm up
jit_mse(var, cy).backward()
torch.cuda.synchronize()  # wait for the warm-up GPU work to finish

# t0: forward + backward through the scripted function
t0 = benchmark.Timer(
    stmt='jit_mse(x, y).backward()',
    setup='from __main__ import jit_mse',
    globals={'x': var, 'y': cy})

# t1: forward + backward through the built-in loss
t1 = benchmark.Timer(
    stmt='mse_loss(x, y).backward()',
    setup='from torch.nn.functional import mse_loss',
    globals={'x': var, 'y': cy})

# 1000 runs of each statement
print(t0.timeit(1000))
print(t1.timeit(1000))

## Often JIT'ed autograd functions are faster than autograd of JIT'ed functions

In [None]:
@torch.jit.script
def mse_fw(input, target):
    """Forward pass: mean over all elements of the squared difference."""
    return ((input - target) ** 2).mean()

@torch.jit.script
def mse_bw(input, target, grad_output):
    """
    Backward pass of the mean squared error.

    For loss = mean((input - target)**2) with N = input.numel():
        d loss / d input  =  2 * (input - target) / N
        d loss / d target = -2 * (input - target) / N
    Both are scaled by the incoming gradient `grad_output`.

    Returns the pair (grad_input, grad_target).
    """
    grad_input = (input - target) * (grad_output * 2 / input.numel())
    # `target` enters the difference with a minus sign, so its gradient is the
    # negation of the input gradient; returning the same tensor twice gave the
    # wrong sign for `target`.
    return grad_input, -grad_input


class MSE(torch.autograd.Function):
    """
    Mean squared error as a custom autograd Function.

    Subclassing torch.autograd.Function lets us supply both the forward and
    the backward computation ourselves; here each of them is delegated to a
    TorchScript-compiled helper (`mse_fw` / `mse_bw`).
    """

    @staticmethod
    def forward(ctx, input, target):
        """
        Compute the loss with `mse_fw`.

        `ctx` is the context object shared with `backward`; both tensors are
        stored on it through `ctx.save_for_backward` because the backward
        pass needs them again.
        """
        loss = mse_fw(input, target)
        ctx.save_for_backward(input, target)
        return loss

    @staticmethod
    def backward(ctx, grad_output):
        """
        Receive the gradient of the loss with respect to the forward output
        and return whatever `mse_bw` computes from it and the saved tensors:
        one gradient per forward argument (`input`, `target`).
        """
        saved_input, saved_target = ctx.saved_tensors
        grads = mse_bw(saved_input, saved_target, grad_output)
        return grads


# To apply our Function, we use Function.apply method.
jit_mse = MSE.apply  # NOTE: rebinds `jit_mse`, replacing the scripted function defined earlier

# some basic testing: forward value against F.mse_loss
print(torch.allclose(F.mse_loss(cx, cy), jit_mse(cx, cy)))

# reference gradient from F.mse_loss, starting from an empty .grad
var.grad = None
F.mse_loss(var, cy).backward()
fg = var.grad.clone()

# gradient from the custom Function, again starting from an empty .grad
var.grad = None
jit_mse(var, cy).backward()
torch.cuda.synchronize()

# only the gradient with respect to the input (`var`) is compared here
print(torch.allclose(fg, var.grad))

In [None]:
# t0: forward + backward through the custom autograd Function (`jit_mse` is now MSE.apply)
t0 = benchmark.Timer(
    stmt='jit_mse(x, y).backward()',
    setup='from __main__ import jit_mse',
    globals={'x': var, 'y': cy})

# t1: forward + backward through the built-in loss, for comparison
t1 = benchmark.Timer(
    stmt='mse_loss(x, y).backward()',
    setup='from torch.nn.functional import mse_loss',
    globals={'x': var, 'y': cy})

# 100 runs of each statement
print(t0.timeit(100))
print(t1.timeit(100))

**TorchScript functions/modules can be easily saved, opened with PyTorch C++ API (LibTorch) and further optimized for inference with NVIDIA TensorRT.**

If JIT is not enough for your use case, it is also possible to write your own kernels.
1. [PyTorch C++ Extension](https://pytorch.org/tutorials/advanced/cpp_extension.html)
2. [OpenAI Triton](https://triton-lang.org/getting-started/tutorials/02-fused-softmax.html)