In [8]:
import torch
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity
from accelerate import Accelerator, ProfileKwargs

# Pytorch

In [4]:
model = models.resnet18()
inputs = torch.randn(5, 3, 224, 224)

with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
    model(inputs)

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     aten::conv2d         0.17%      60.000us        74.92%      26.740ms       1.337ms      47.37 Mb           0 b            20  
                aten::convolution        -7.76%   -2768.000us        74.75%      26.680ms       1.334ms      47.37 Mb      -4.31 Mb            20  
               aten::_convolution         8.39%       2.996ms        74.35%      26.538ms       1.327ms      47.37 Mb       4.31 Mb            20  
         aten::mkldnn_convolution        73.90%      26.376ms        74.11%      26.452ms       1.323ms      47.

STAGE:2024-08-25 10:12:19 1695051:1695051 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-08-25 10:12:19 1695051:1695051 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-08-25 10:12:19 1695051:1695051 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


In [3]:
print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10))

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls                                                                      Input Shapes  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  
                     aten::conv2d         0.03%      13.000us        17.12%       7.703ms       1.926ms             4                             [[5, 64, 56, 56], [64, 64, 3, 3], [], [], [], [], []]  
                aten::convolution         0.08%      34.000us        17.09%       7.690ms       1.923ms             4                     [[5, 64, 56, 56], [64, 64, 3, 3], [], [], [], [], [], 

In [16]:
model = models.resnet18().cuda()
inputs = torch.randn(5, 3, 224, 224).cuda()

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    model(inputs)

prof.export_chrome_trace("trace.json")

STAGE:2024-08-25 10:22:18 1695051:1695051 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-08-25 10:22:18 1695051:1695051 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-08-25 10:22:18 1695051:1695051 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


In [17]:
from torch.profiler import schedule

my_schedule = schedule(
    skip_first=10,
    wait=5,
    warmup=1,
    active=3,
    repeat=2
)

def trace_handler(p):
    output = p.key_averages().table(sort_by="self_cuda_time_total", row_limit=10)
    print(output)
    p.export_chrome_trace("/tmp/trace_" + str(p.step_num) + ".json")

with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=my_schedule,
    on_trace_ready=trace_handler
) as p:
    for idx in range(8):
        model(inputs)
        p.step()

In [18]:
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    with_flops=True
) as prof:
    model(inputs)

print(prof.key_averages().table(sort_by="flops", row_limit=10))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  Total MFLOPs  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                           aten::conv2d         0.62%      97.000us        77.06%      12.057ms     602.850us       0.000us         0.00%       1.353ms      67.650us            20     18135.613  
                                            aten::addmm         0.58%      90.000us         0.68%     107.000us     107.000us       9.000us         0.4

STAGE:2024-08-25 10:23:52 1695051:1695051 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-08-25 10:23:52 1695051:1695051 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-08-25 10:23:52 1695051:1695051 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


# Accelerator

In [19]:
from accelerate import Accelerator, ProfileKwargs
import torch
import torchvision.models as models

model = models.resnet18()
inputs = torch.randn(5, 3, 224, 224)

profile_kwargs = ProfileKwargs(
    activities=["cpu"],
    record_shapes=True
)

accelerator = Accelerator(cpu=True, kwargs_handlers=[profile_kwargs])
model = accelerator.prepare(model)

with accelerator.profile() as prof:
    with torch.no_grad():
        model(inputs)

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     aten::conv2d         1.03%     429.000us        75.71%      31.463ms       1.573ms            20  
                aten::convolution         0.22%      93.000us        75.56%      31.403ms       1.570ms            20  
               aten::_convolution         0.25%     103.000us        75.34%      31.310ms       1.565ms            20  
         aten::mkldnn_convolution        74.91%      31.133ms        75.09%      31.207ms       1.560ms            20  
                 aten::batch_norm         0.11%      47.000us        11.78%       4.894ms     244.700us            20  
     aten::_batch_norm_impl_index       

STAGE:2024-08-25 10:24:55 1695051:1695051 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-08-25 10:24:55 1695051:1695051 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-08-25 10:24:55 1695051:1695051 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


In [20]:
model = models.resnet18()
inputs = torch.randn(5, 3, 224, 224)

profile_kwargs = ProfileKwargs(
    activities=["cpu"],
    profile_memory=True,
    record_shapes=True
)

accelerator = Accelerator(cpu=True, kwargs_handlers=[profile_kwargs])
model = accelerator.prepare(model)

with accelerator.profile() as prof:
    model(inputs)

print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                      aten::empty         0.35%     129.000us         0.35%     129.000us       0.645us      94.83 Mb      94.83 Mb           200  
    aten::max_pool2d_with_indices         8.05%       2.966ms         8.05%       2.966ms       2.966ms      11.48 Mb      11.48 Mb             1  
                     aten::conv2d         3.39%       1.248ms        71.49%      26.337ms       1.317ms      47.37 Mb       1.91 Mb            20  
                      aten::addmm         0.46%     170.000us         0.48%     178.000us     178.000us      19.

STAGE:2024-08-25 10:25:13 1695051:1695051 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-08-25 10:25:13 1695051:1695051 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-08-25 10:25:13 1695051:1695051 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


In [21]:
profile_kwargs = ProfileKwargs(
    activities=["cpu", "cuda"],
    output_trace_dir="trace"
)

accelerator = Accelerator(kwargs_handlers=[profile_kwargs])
model = accelerator.prepare(model)

with accelerator.profile() as prof:
    model(inputs)

# The trace will be saved to the specified directory

STAGE:2024-08-25 10:25:23 1695051:1695051 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-08-25 10:25:23 1695051:1695051 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-08-25 10:25:23 1695051:1695051 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


In [22]:
def trace_handler(p):
    output = p.key_averages().table(sort_by="self_cuda_time_total", row_limit=10)
    print(output)
    p.export_chrome_trace("/tmp/trace_" + str(p.step_num) + ".json")

profile_kwargs = ProfileKwargs(
    activities=["cpu", "cuda"],
    schedule_option={"wait": 5, "warmup": 1, "active": 3, "repeat": 2, "skip_first": 10},
    on_trace_ready=trace_handler
)

accelerator = Accelerator(kwargs_handlers=[profile_kwargs])
model = accelerator.prepare(model)

with accelerator.profile() as prof:
    for idx in range(8):
        model(inputs)
        prof.step()

In [23]:
profile_kwargs = ProfileKwargs(
    with_flops=True
)
accelerator = Accelerator(kwargs_handlers=[profile_kwargs])

with accelerator.profile() as prof:
    model(inputs)

print(prof.key_averages().table(sort_by="flops", row_limit=10))

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  Total MFLOPs  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     aten::conv2d         3.41%     906.000us        70.51%      18.734ms     936.700us            20     18135.613  
                      aten::addmm         0.52%     138.000us         0.55%     147.000us     147.000us             1         5.120  
                aten::convolution        -4.64%   -1234.000us        70.30%      18.678ms     933.900us            20            --  
               aten::_convolution         5.59%       1.486ms        69.71%      18.520ms     926.000us            20            --  
         aten::mkldnn_convolution        69.09%      18.355ms 

STAGE:2024-08-25 10:25:38 1695051:1695051 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-08-25 10:25:38 1695051:1695051 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-08-25 10:25:38 1695051:1695051 ActivityProfilerController.cpp:324] Completed Stage: Post Processing
