In [2]:
import torch

In [61]:
# 1x1 leaf tensor holding 1.0; requires_grad=True makes autograd track it and fill in x.grad.
x = torch.ones(1, 1, requires_grad=True)

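Gradients accumulate across `backward()` calls: `x.grad` grows by 2 on every iteration because it is never zeroed, while `x` itself stays at 1 because nothing ever applies an update to it.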

In [63]:
# Ten forward/backward passes with no zeroing of x.grad and no update of x.
for _ in range(10):
    y = x.pow(2)
    y.backward()  # adds dy/dx = 2x into x.grad on top of whatever is already stored there
    print(x, x.grad)

tensor([[1.]], requires_grad=True) tensor([[2.]])
tensor([[1.]], requires_grad=True) tensor([[4.]])
tensor([[1.]], requires_grad=True) tensor([[6.]])
tensor([[1.]], requires_grad=True) tensor([[8.]])
tensor([[1.]], requires_grad=True) tensor([[10.]])
tensor([[1.]], requires_grad=True) tensor([[12.]])
tensor([[1.]], requires_grad=True) tensor([[14.]])
tensor([[1.]], requires_grad=True) tensor([[16.]])
tensor([[1.]], requires_grad=True) tensor([[18.]])
tensor([[1.]], requires_grad=True) tensor([[20.]])


In [64]:
# Run the same square-and-backprop loop under the autograd profiler; the recorded events end up in `prof`.
with torch.autograd.profiler.profile() as prof:
    for _ in range(10):
        y = x.pow(2)
        y.backward()

In [67]:
# Print the per-operator averages, ordered by self CPU time (largest first), capped at 100 rows.
print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=100))

-----------------------------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  
Name                                 Self CPU total %  Self CPU total   CPU total %      CPU total        CPU time avg     CUDA total %     CUDA total       CUDA time avg    Number of Calls  
-----------------------------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  
pow                                  50.58%           219.194us        50.58%           219.194us        10.960us         NaN              0.000us          0.000us          20               
mul                                  28.96%           125.497us        28.96%           125.497us        6.275us          NaN              0.000us          0.000us          20               
PowBackward0                         9.34%  

In [66]:
# Total self CPU time over the profiled region. The value is on the same scale as the
# microsecond figures in the table (e.g. pow's 219.194us is 50.58% of it).
prof.self_cpu_time_total

433.3860000000004

### torchprof Package

* https://pypi.org/project/torchprof/
* All metrics are derived using the PyTorch autograd profiler.

In [75]:
# One-time setup: uncomment to install the two helper packages into the kernel's environment.
# %pip install torchprof
# %pip install torchsummary

In [77]:
from torchsummary import summary
import torch
import torchvision
import torchprof

In [78]:
# Randomly initialised AlexNet, moved to the GPU (this cell needs a CUDA device).
# The constructor default already means "no pretrained weights" on every torchvision
# release, so the `pretrained=False` keyword -- deprecated since torchvision 0.13 in
# favour of `weights=` and warned about when passed -- is dropped.
model = torchvision.models.alexnet().cuda()

In [85]:
# Bare expression so the notebook echoes the module tree (layer types and their hyper-parameters).
model

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
 

In [83]:
# Print a per-layer table of output shapes and parameter counts for a 3x224x224 input.
summary(model, (3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 64, 55, 55]          23,296
              ReLU-2           [-1, 64, 55, 55]               0
         MaxPool2d-3           [-1, 64, 27, 27]               0
            Conv2d-4          [-1, 192, 27, 27]         307,392
              ReLU-5          [-1, 192, 27, 27]               0
         MaxPool2d-6          [-1, 192, 13, 13]               0
            Conv2d-7          [-1, 384, 13, 13]         663,936
              ReLU-8          [-1, 384, 13, 13]               0
            Conv2d-9          [-1, 256, 13, 13]         884,992
             ReLU-10          [-1, 256, 13, 13]               0
           Conv2d-11          [-1, 256, 13, 13]         590,080
             ReLU-12          [-1, 256, 13, 13]               0
        MaxPool2d-13            [-1, 256, 6, 6]               0
AdaptiveAvgPool2d-14            [-1, 25

In [84]:
# Random stand-in for one 3x224x224 image, moved onto the GPU.
# NOTE: this rebinds `x` (previously the 1x1 autograd tensor) and, below, `prof`.
x = torch.rand(1, 3, 224, 224).cuda()

# Time a single forward pass; the report is broken down per submodule of the model.
with torchprof.Profile(model, use_cuda=True) as prof:
    model(x)

print(prof.display(show_events=False))

Module         | Self CPU total | CPU total | CUDA total | Occurrences
---------------|----------------|-----------|------------|------------
AlexNet        |                |           |            |            
├── features   |                |           |            |            
│├── 0         |      236.892us | 878.269us |  902.720us |           1
│├── 1         |       36.998us |  36.998us |   37.920us |           1
│├── 2         |       52.498us |  93.996us |   93.248us |           1
│├── 3         |      113.596us | 404.885us |  912.320us |           1
│├── 4         |       25.799us |  25.799us |   26.624us |           1
│├── 5         |       38.398us |  68.696us |   68.576us |           1
│├── 6         |      126.795us | 454.784us |  612.320us |           1
│├── 7         |       23.899us |  23.899us |   24.608us |           1
│├── 8         |      120.195us | 437.683us |  782.400us |           1
│├── 9         |       25.199us |  25.199us |   25.600us |           1
│├── 1

In [86]:
import torch.nn as nn


In [88]:
# IPython-only syntax: a trailing `?` shows the signature and docstring of nn.Linear.
nn.Linear?

Init signature: nn.Linear(in_features, out_features, bias=True)
Docstring:
Applies a linear transformation to the incoming data: :math:`y = xA^T + b`

Args:
    in_features: size of each input sample
    out_features: size of each output sample
    bias: If set to ``False``, the layer will not learn an additive bias.
        Default: ``True``

Shape:
    - Input: :math:`(N, *, H_{in})` where :math:`*` means any number of
      additional dimensions and :math:`H_{in} = \text{in\_features}`
    - Output: :math:`(N, *, H_{out})` where all but the last dimension
      are the same shape as the input and :math:`H_{out} = \text{out\_features}`.

Attributes:
    weight: the learnable weights of the module of shape
        :math:`(\text{out\_features}, \text{in\_features})`. The values are
        initialized from :math:`\math