In [2]:
import torch
from torch.profiler import profile, record_function, ProfilerActivity

class MLPModule(torch.nn.Module):
    """Feed-forward block: Linear -> ReLU -> Linear, every layer ``d_hid`` wide."""

    def __init__(self, d_hid: int):
        """Create the two square linear projections and the activation.

        Args:
            d_hid: Input and output feature size of both linear layers.
        """
        super().__init__()
        # Both projections are square, so the block preserves the feature width.
        width = d_hid
        self.net1 = torch.nn.Linear(in_features=width, out_features=width)
        self.relu = torch.nn.ReLU()
        self.net2 = torch.nn.Linear(in_features=width, out_features=width)

    def forward(self, x):
        """Return ``net2(relu(net1(x)))``."""
        hidden = self.relu(self.net1(x))
        return self.net2(hidden)

class MultiMLP(torch.nn.Module):
    """Sequential stack of ``n_layers`` ``MLPModule`` blocks sharing one width."""

    def __init__(self, d_hid: int, n_layers: int = 10):
        """Build the stack.

        Args:
            d_hid: Feature width handed to every ``MLPModule``.
            n_layers: Number of blocks to create (10 by default).
        """
        super().__init__()
        blocks = []
        for _ in range(n_layers):
            blocks.append(MLPModule(d_hid))
        # ModuleList registers each block so its parameters belong to this module.
        self.layers = torch.nn.ModuleList(blocks)

    def forward(self, x):
        """Feed ``x`` through every block in order and return the final result."""
        out = x
        for block in self.layers:
            out = block(out)
        return out

# Shared fixtures for the profiling cells below: a MultiMLP with the default
# number of layers at hidden width 1024, and one random batch of 64 rows.
d_hid = 1024
model = MultiMLP(d_hid=d_hid)
input_data = torch.randn((64, d_hid))

In [3]:
def run_model(model, input_data, device="cpu", warmup_iters=5, trace_path="trace.json"):
    """Profile one inference forward pass of ``model`` and return a summary table.

    The model and input are moved to ``device``, ``warmup_iters`` un-profiled
    forward passes are executed, and then exactly one forward pass is recorded
    with ``torch.profiler``. Warm-up and the profiled pass both run under
    ``torch.no_grad()`` so the measured pass is the same workload that was
    warmed up.

    Args:
        model: Module to profile. It is moved with ``model.to(device)``; per
            the PyTorch docs ``Module.to`` modifies the module in place, so the
            caller's module is left on ``device`` afterwards.
        input_data: Tensor passed to ``model`` as its only argument.
        device: Target device as a string or ``torch.device`` (for example
            ``"cpu"``, ``"cuda"``, ``"cuda:0"``). CUDA activity is recorded
            whenever the device *type* is ``cuda``.
        warmup_iters: Number of forward passes to run before profiling.
        trace_path: Where the Chrome trace is written. Defaults to
            ``"trace.json"``; pass a distinct path per run to avoid
            overwriting an earlier trace.

    Returns:
        str: The profiler's key-averages table, sorted by total CPU time and
        limited to 10 rows.
    """
    # Normalise so "cuda", "cuda:0" and torch.device("cuda") are all recognised.
    target = torch.device(device)
    on_cuda = target.type == "cuda"

    model = model.to(target)
    input_data = input_data.to(target)

    # Warm-up iterations (not profiled).
    with torch.no_grad():
        for _ in range(warmup_iters):
            model(input_data)
    if on_cuda:
        # Make sure warm-up kernels have finished before the profiler starts.
        torch.cuda.synchronize(target)

    activities = [ProfilerActivity.CPU]
    if on_cuda:
        activities.append(ProfilerActivity.CUDA)

    # Keep grad mode identical to warm-up: flipping it for the measured pass
    # profiles a different workload and is expected to make a torch.compile'd
    # model recompile inside the profiled region.
    with torch.no_grad(), profile(activities=activities) as prof:
        model(input_data)

    display = prof.key_averages().table(sort_by="cpu_time_total", row_limit=10)
    prof.export_chrome_trace(trace_path)
    return display


In [4]:
# Baseline: profile the eager model on the CPU (only CPU activity is recorded).
print(run_model(model=model, input_data=input_data, device="cpu"))

----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                  Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
          aten::linear         0.79%     151.499us        87.05%      16.673ms     833.636us            20  
           aten::addmm        65.43%      12.531ms        84.09%      16.105ms     805.236us            20  
           aten::copy_        18.10%       3.466ms        18.10%       3.466ms     173.299us            20  
            aten::relu         1.27%     242.700us        12.95%       2.480ms     247.959us            10  
       aten::clamp_min        11.68%       2.237ms        11.68%       2.237ms     223.689us            10  
               aten::t         1.45%     277.794us         2.17%     416.493us      20.825us            20  
       aten::transp

In [5]:
# Same model on the GPU: run_model moves it to the device and adds CUDA
# activity to the profiler, which is why the table gains CUDA columns.
print(run_model(model=model, input_data=input_data, device="cuda"))

--------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
--------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
        aten::linear        10.60%     486.700us        89.74%       4.119ms     205.965us     258.000us         4.50%       5.542ms     277.100us            20  
         aten::addmm        43.10%       1.978ms        57.30%       2.630ms     131.520us       2.244ms        39.14%       3.117ms     155.850us            20  
             aten::t        13.64%     626.100us        21.83%       1.002ms      50.110us     242.000us         4.22%       2.167ms     108.350us            20  
         aten::empty 

In [6]:
import copy
import types

# Work on an independent copy so patching `forward` below leaves `model` untouched.
new_model = copy.deepcopy(model)
def new_forward(self, x):
    """Replacement ``forward`` that labels each layer's work for the profiler.

    Iterates ``self.layers`` in order and runs every layer inside a
    ``record_function`` scope named ``"Layer <index> Forward"``.

    Args:
        self: Object exposing an iterable ``layers`` attribute of callables.
        x: Input handed to the first layer.

    Returns:
        The output of the last layer, or ``x`` unchanged when there are none.
    """
    out = x
    for idx, block in enumerate(self.layers):
        label = f"Layer {idx} Forward"
        with record_function(label):
            out = block(out)
    return out

# Bind new_forward to this one instance only; the MultiMLP class and the
# original `model` keep their un-instrumented forward.
new_model.forward = types.MethodType(new_forward, new_model)

In [7]:
# Profile the instrumented copy: each "Layer <i> Forward" scope now shows up
# as its own row in the table.
print(run_model(model=new_model, input_data=input_data, device="cuda"))

--------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
--------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
        aten::linear        15.59%     972.000us        65.91%       4.108ms     205.400us     429.000us         6.63%       5.737ms     286.850us            20  
     Layer 6 Forward         7.85%     489.400us        37.51%       2.338ms       2.338ms      10.000us         0.15%     352.000us     352.000us             1  
         aten::addmm        25.77%       1.606ms        26.87%       1.675ms      83.750us       3.725ms        57.58%       3.917ms     195.850us            20  
             aten::t 

In [None]:
# Wrap the original model with torch.compile and profile it on the GPU using
# the same helper as the eager runs, so the tables are directly comparable.
compiled_model = torch.compile(model)
print(run_model(model=compiled_model, input_data=input_data, device="cuda"))