# PyTorch benchmark, profiler and JIT compiler

In this notebook we will see a simple example on how to use the PyTorch benchmark and profiler, and discuss the impact of Just-in-Time compilation on performance of custom functions.
We will use the mean squared error loss for its simplicity.

In [1]:
%load_ext autoreload
%autoreload 2

import os
os.environ['OMP_NUM_THREADS'] = '12' # for Numpy and BLAS
  
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from torch.utils import data
from torch.profiler import profile, record_function, ProfilerActivity
import torch.utils.benchmark as benchmark

from tqdm.notebook import trange, tqdm

In [2]:
# Sample random scalar data
nsamples = 20000
nfeat = 500

x = np.random.random([nsamples, nfeat])
y = np.random.random([nsamples, nfeat])

In [3]:
def mse(input, target):
    '''
    measures the mean squared error (squared L2 norm) between
    each element in the input `x` and target :`y`
    '''
    diff = (input - target)  # WARN: this temporary variable makes this function 10-20% slower in Numpy
    return (diff**2).mean()

In [4]:
# Time the MSE function
%timeit -n1 -r100 mse(x, y)

75.7 ms ± 320 µs per loop (mean ± std. dev. of 100 runs, 1 loop each)


## PyTorch benchmark (CPU)

In [5]:
# Copyless Tensors
tx = torch.from_numpy(x)
ty = torch.from_numpy(y)

tx.dtype

torch.float64

In [6]:
torch.set_num_threads(12)

%timeit -n1 -r100 mse(tx, ty)

15.8 ms ± 736 µs per loop (mean ± std. dev. of 100 runs, 1 loop each)


## GPU benchmarks

In [7]:
device = 'cuda'
cx = tx.to(device)
cy = ty.to(device)

# %timeit -n1 -r100 mse(cx, cy) ## !!! Wrong way of time GPU functions, without torch.cuda.synchronize()
## https://pytorch.org/tutorials/recipes/recipes/benchmark.html#benchmarking-with-torch-utils-benchmark-timer

In [8]:
t0 = benchmark.Timer(
    stmt='mse(x, y)',
    setup='from __main__ import mse',
    globals={'x': cx, 'y': cy})

print(t0.timeit(100))

<torch.utils.benchmark.utils.common.Measurement object at 0x2aabee21ab50>
mse(x, y)
setup: from __main__ import mse
  980.93 us
  1 measurement, 100 runs , 1 thread


In [9]:
with profile(activities=[ProfilerActivity.CUDA], record_shapes=True, with_stack=True) as prof:
    with record_function("mse"):
        mse(cx, cy)

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
prof.export_chrome_trace("mse.trace.json")

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     475.000us        48.62%     475.000us     475.000us             1  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     350.000us        35.82%     350.000us     350.000us             1  
void at::

In [10]:
df = pd.DataFrame({e.key:e.__dict__ for e in prof.key_averages()}).T
df[['count', 'cpu_time_total', 'cuda_time_total']].sort_values(['cpu_time_total', 'cuda_time_total'], ascending=False)

Unnamed: 0,count,cpu_time_total,cuda_time_total
cudaLaunchKernel,3,1571030.0,0
cudaDeviceSynchronize,1,667.0,0
cudaMemsetAsync,1,12.0,0
"void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor<double>, at::detail::Array<char*, 3> >(int, at::native::AddFunctor<double>, at::detail::Array<char*, 3>)",1,0.0,475
"void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl<double, double>(at::TensorIteratorBase&, double)::{lambda(double)#1}, at::detail::Array<char*, 2> >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl<double, double>(at::TensorIteratorBase&, double)::{lambda(double)#1}, at::detail::Array<char*, 2>)",1,0.0,350
"void at::native::reduce_kernel<512, 1, at::native::ReduceOp<double, at::native::MeanOps<double, double>, unsigned int, double, 4> >(at::native::ReduceOp<double, at::native::MeanOps<double, double>, unsigned int, double, 4>)",1,0.0,151
Memset (Device),1,0.0,1


# PyTorch Just-in-Time compiler

https://pytorch.org/docs/stable/jit.html

#### Scripting vs. Tracing

- `torch.jit.script` compiles the function or module into TorchScript, using a \[large\] subset of Python
- `torch.jit.trace` uses the example input to compute a fixed graph, and therefore cannot handle control flow, e.g. if statements and similar conditions

In [11]:
jit_mse_scripted = torch.jit.script(mse)
print(jit_mse_scripted.code)
print(jit_mse_scripted.graph)

def mse(input: Tensor,
    target: Tensor) -> Tensor:
  diff = torch.sub(input, target)
  return torch.mean(torch.pow(diff, 2))

graph(%input.1 : Tensor,
      %target.1 : Tensor):
  %10 : NoneType = prim::Constant()
  %5 : int = prim::Constant[value=1]()
  %8 : int = prim::Constant[value=2]() # /tmp/ipykernel_23909/3968975852.py:7:18
  %diff.1 : Tensor = aten::sub(%input.1, %target.1, %5) # /tmp/ipykernel_23909/3968975852.py:6:12
  %9 : Tensor = aten::pow(%diff.1, %8) # /tmp/ipykernel_23909/3968975852.py:7:12
  %11 : Tensor = aten::mean(%9, %10) # /tmp/ipykernel_23909/3968975852.py:7:12
  return (%11)



In [12]:
jit_mse_traced = torch.jit.trace(mse, example_inputs=(cx, cy))
# jit.trace only support Tensors but is ideal for benchmarking
print(jit_mse_traced.graph)

graph(%input : Double(20000, 500, strides=[500, 1], requires_grad=0, device=cuda:0),
      %target : Double(20000, 500, strides=[500, 1], requires_grad=0, device=cuda:0)):
  %2 : int = prim::Constant[value=1]() # /tmp/ipykernel_23909/3968975852.py:6:0
  %diff : Double(20000, 500, strides=[500, 1], requires_grad=0, device=cuda:0) = aten::sub(%input, %target, %2) # /tmp/ipykernel_23909/3968975852.py:6:0
  %4 : int = prim::Constant[value=2]() # /tmp/ipykernel_23909/3968975852.py:7:0
  %5 : Double(20000, 500, strides=[500, 1], requires_grad=0, device=cuda:0) = aten::pow(%diff, %4) # /tmp/ipykernel_23909/3968975852.py:7:0
  %6 : NoneType = prim::Constant()
  %7 : Double(requires_grad=0, device=cuda:0) = aten::mean(%5, %6) # /tmp/ipykernel_23909/3968975852.py:7:0
  return (%7)



In [13]:
@torch.jit.script
def jit_mse(
        input,
        target,
        reduce : bool = True  # types have to be annotated or are assumed Tensors
    ):
    err = (input - target)**2
    return err.mean() if reduce else err

print(jit_mse.code)

def jit_mse(input: Tensor,
    target: Tensor,
    reduce: bool=True) -> Tensor:
  err = torch.pow(torch.sub(input, target), 2)
  if reduce:
    _0 = torch.mean(err)
  else:
    _0 = err
  return _0



In [14]:
t0 = benchmark.Timer(
    stmt='jit_mse_traced(x, y)',
    setup='from __main__ import jit_mse_traced',
    globals={'x': cx, 'y': cy})
t1 = benchmark.Timer(
    stmt='jit_mse_scripted(x, y)',
    setup='from __main__ import jit_mse_scripted',
    globals={'x': cx, 'y': cy})
t2 = benchmark.Timer(
    stmt='jit_mse(x, y)',
    setup='from __main__ import jit_mse',
    globals={'x': cx, 'y': cy})

print(t0.timeit(100))
print(t1.timeit(100))
print(t2.timeit(100))
# torch.jit.script requires a warmup run before benchmarking

<torch.utils.benchmark.utils.common.Measurement object at 0x2aabf4c9d280>
jit_mse_traced(x, y)
setup: from __main__ import jit_mse_traced
  594.18 us
  1 measurement, 100 runs , 1 thread
<torch.utils.benchmark.utils.common.Measurement object at 0x2aabf4c9d580>
jit_mse_scripted(x, y)
setup: from __main__ import jit_mse_scripted
  2.06 ms
  1 measurement, 100 runs , 1 thread
<torch.utils.benchmark.utils.common.Measurement object at 0x2aabf4c9d640>
jit_mse(x, y)
setup: from __main__ import jit_mse
  2.06 ms
  1 measurement, 100 runs , 1 thread


In [15]:
with profile(activities=[ProfilerActivity.CUDA], record_shapes=True, with_stack=True) as prof:
    with record_function("jit_mse"):
        jit_mse(cx, cy)

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
prof.export_chrome_trace("jit_mse.trace.json")

df = pd.DataFrame({e.key:e.__dict__ for e in prof.key_averages()}).T
df[['count', 'cpu_time_total', 'cuda_time_total']].sort_values(['cpu_time_total', 'cuda_time_total'], ascending=False)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                          fused_sub_pow         0.00%       0.000us         0.00%       0.000us       0.000us     435.000us        74.11%     435.000us     435.000us             1  
void at::native::reduce_kernel<512, 1, at::native::R...         0.00%       0.000us         0.00%       0.000us       0.000us     151.000us        25.72%     151.000us     151.000us             1  
         

Unnamed: 0,count,cpu_time_total,cuda_time_total
cudaDeviceSynchronize,1,387,0
cudaMemsetAsync,1,14,0
cudaLaunchKernel,1,14,0
fused_sub_pow,1,0,435
"void at::native::reduce_kernel<512, 1, at::native::ReduceOp<double, at::native::MeanOps<double, double>, unsigned int, double, 4> >(at::native::ReduceOp<double, at::native::MeanOps<double, double>, unsigned int, double, 4>)",1,0,151
Memset (Device),1,0,1


## Comparing with torch.nn.functional.mse_loss

In [16]:
## F.mse_loss(cx, cy)
assert jit_mse(cx, cy) - F.mse_loss(cx, cy) < 1e-12

t2 = benchmark.Timer(
    stmt='mse_loss(x, y)',
    setup='from torch.nn.functional import mse_loss',
    globals={'x': cx, 'y': cy})

print(t2.timeit(100))

<torch.utils.benchmark.utils.common.Measurement object at 0x2aabf443c3d0>
mse_loss(x, y)
setup: from torch.nn.functional import mse_loss
  631.63 us
  1 measurement, 100 runs , 1 thread


In [17]:
with profile(activities=[ProfilerActivity.CUDA], record_shapes=True, with_stack=True) as prof:
    with record_function("mse_loss"):
        F.mse_loss(cx, cy)

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
prof.export_chrome_trace("mse_loss.trace.json")

df = pd.DataFrame({e.key:e.__dict__ for e in prof.key_averages()}).T
df[['count', 'cpu_time_total', 'cuda_time_total']].sort_values(['cpu_time_total', 'cuda_time_total'], ascending=False)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     474.000us        75.60%     474.000us     474.000us             1  
void at::native::reduce_kernel<512, 1, at::native::R...         0.00%       0.000us         0.00%       0.000us       0.000us     152.000us        24.24%     152.000us     152.000us             1  
         

Unnamed: 0,count,cpu_time_total,cuda_time_total
cudaLaunchKernel,2,1145080.0,0
cudaDeviceSynchronize,1,437.0,0
cudaMemsetAsync,1,14.0,0
"void at::native::vectorized_elementwise_kernel<4, at::native::mse_kernel_cuda(at::TensorIterator&)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(double, double)#1}, at::detail::Array<char*, 3> >(int, at::native::mse_kernel_cuda(at::TensorIterator&)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(double, double)#1}, at::detail::Array<char*, 3>)",1,0.0,474
"void at::native::reduce_kernel<512, 1, at::native::ReduceOp<double, at::native::MeanOps<double, double>, unsigned int, double, 4> >(at::native::ReduceOp<double, at::native::MeanOps<double, double>, unsigned int, double, 4>)",1,0.0,152
Memset (Device),1,0.0,1


Our simple JIT'ed function is actually slightly faster than the PyTorch mse_loss with it's custom CUDA kernel, however, the same is not true in backward pass as we can see below.

In [18]:
var = cx.clone()
var.requires_grad = True

# warm up
jit_mse(var, cy).backward()
torch.cuda.synchronize()

t0 = benchmark.Timer(
    stmt='jit_mse(x, y).backward()',
    setup='from __main__ import jit_mse',
    globals={'x': var, 'y': cy})

t1 = benchmark.Timer(
    stmt='mse_loss(x, y).backward()',
    setup='from torch.nn.functional import mse_loss',
    globals={'x': var, 'y': cy})

print(t0.timeit(1000))
print(t1.timeit(1000))

<torch.utils.benchmark.utils.common.Measurement object at 0x2aabf4c9d100>
jit_mse(x, y).backward()
setup: from __main__ import jit_mse
  2.03 ms
  1 measurement, 1000 runs , 1 thread
<torch.utils.benchmark.utils.common.Measurement object at 0x2aaab7a33250>
mse_loss(x, y).backward()
setup: from torch.nn.functional import mse_loss
  1.75 ms
  1 measurement, 1000 runs , 1 thread


## Often JIT'ed autograd functions are faster than autograd of JIT'ed functions

In [19]:
@torch.jit.script
def mse_fw(input, target):
    err = (input - target)**2
    return err.mean()

@torch.jit.script
def mse_bw(input, target, grad_output):
    grad = (input - target) * (grad_output * 2 / input.numel())
    return grad, grad


class MSE(torch.autograd.Function):
    """
    We can implement our own custom autograd Functions by subclassing
    torch.autograd.Function and implementing the forward and backward passes
    which operate on Tensors.
    """

    @staticmethod
    def forward(ctx, input, target):
        """
        In the forward pass we receive a Tensor containing the input and return
        a Tensor containing the output. ctx is a context object that can be used
        to stash information for backward computation. You can cache arbitrary
        objects for use in the backward pass using the ctx.save_for_backward method.
        """
        ctx.save_for_backward(input, target)
        return mse_fw(input, target)

    @staticmethod
    def backward(ctx, grad_output):
        """
        In the backward pass we receive a Tensor containing the gradient of the loss
        with respect to the output, and we need to compute the gradient of the loss
        with respect to the input.
        """
        input, target = ctx.saved_tensors
        return mse_bw(input, target, grad_output)


# To apply our Function, we use Function.apply method.
jit_mse = MSE.apply

# some basic testing
print(torch.allclose(F.mse_loss(cx, cy), jit_mse(cx, cy)))

var.grad = None
F.mse_loss(var, cy).backward()
fg = var.grad.clone()

var.grad = None
jit_mse(var, cy).backward()
torch.cuda.synchronize()

print(torch.allclose(fg, var.grad))

True
True


In [20]:
t0 = benchmark.Timer(
    stmt='jit_mse(x, y).backward()',
    setup='from __main__ import jit_mse',
    globals={'x': var, 'y': cy})

t1 = benchmark.Timer(
    stmt='mse_loss(x, y).backward()',
    setup='from torch.nn.functional import mse_loss',
    globals={'x': var, 'y': cy})

print(t0.timeit(100))
print(t1.timeit(100))

<torch.utils.benchmark.utils.common.Measurement object at 0x2aabf4cb6d30>
jit_mse(x, y).backward()
setup: from __main__ import jit_mse
  1.84 ms
  1 measurement, 100 runs , 1 thread
<torch.utils.benchmark.utils.common.Measurement object at 0x2aabf4cb6ee0>
mse_loss(x, y).backward()
setup: from torch.nn.functional import mse_loss
  1.75 ms
  1 measurement, 100 runs , 1 thread


**TorchScript functions/modules can be easily saved, opened with PyTorch C++ API (LibTorch) and further optimized for inference with NVIDIA TensorRT.**

If JIT is not enough for your use case, it is also possible to write your own kernels.
1. [PyTorch C++ Extension](https://pytorch.org/tutorials/advanced/cpp_extension.html)
2. [OpenAI Triton](https://triton-lang.org/getting-started/tutorials/02-fused-softmax.html)