In [1]:
# Dependencies
from typing import Optional

import torch
import torch.nn as nn
import torch.optim as optim

import numpy as np

%matplotlib inline

In [2]:
# Notebook-wide sizes, listed in the order data flows through the model.
input_dim: int = 10   # number of features in each input sample
hidden_dim: int = 5   # width of the hidden layer
output_dim: int = 2   # number of values predicted per sample
batch_size: int = 4   # number of samples in the toy batch

In [3]:
class Model(nn.Module):
    r"""A two-layer perceptron: ``Linear -> ReLU -> Linear``.

    By default the layer sizes are taken from the notebook-level constants ``input_dim``,
    ``hidden_dim`` and ``output_dim``; each of them can be overridden per instance.
    """

    #
    def __init__(
            self,
            in_features: Optional[int] = None,
            hidden_features: Optional[int] = None,
            out_features: Optional[int] = None,
    ):
        r"""The initializer.

        Parameters
        ----------
        in_features:
            Size of the last dimension of the input. Defaults to the global ``input_dim``.
        hidden_features:
            Width of the hidden layer. Defaults to the global ``hidden_dim``.
        out_features:
            Size of the last dimension of the output. Defaults to the global ``output_dim``.

        Returns
        -------
        None
        """
        super().__init__()
        # Resolve the defaults lazily (at call time, not at class-definition time) so that
        # `Model()` keeps following the current values of the notebook constants.
        if in_features is None:
            in_features = input_dim
        if hidden_features is None:
            hidden_features = hidden_dim
        if out_features is None:
            out_features = output_dim
        self.layer_1 = nn.Linear(in_features=in_features, out_features=hidden_features, bias=True)
        self.act_1 = nn.ReLU()
        self.layer_2 = nn.Linear(in_features=hidden_features, out_features=out_features, bias=True)

    #
    def forward(self, x_: torch.Tensor) -> torch.Tensor:
        r"""Implements the forward pass.

        Parameters
        ----------
        x_:
            The input tensor.
            SHAPE: [*<batch_dims>, input_dim].

        Returns
        -------
        out:
            The output tensor.
            SHAPE: [*<batch_dims>, output_dim].
        """
        # Hidden representation after the first affine map and the ReLU non-linearity.
        hidden = self.act_1(self.layer_1(x_))
        # The second affine map produces the prediction; no output activation is applied.
        return self.layer_2(hidden)

In [4]:
# Instantiate the network with its default sizes and show its layer structure.
model = Model()
print('model:\n{}'.format(str(model)))

model:
Model(
  (layer_1): Linear(in_features=10, out_features=5, bias=True)
  (act_1): ReLU()
  (layer_2): Linear(in_features=5, out_features=2, bias=True)
)


In [5]:
# A random toy batch of inputs and regression targets; both tensors track gradients.
input_data = torch.randn(size=(batch_size, input_dim), requires_grad=True)
output_data = torch.randn(size=(batch_size, output_dim), requires_grad=True)
print('input data shape: {}'.format(input_data.size()))
print('output data shape: {}'.format(output_data.size()))

input data shape: torch.Size([4, 10])
output data shape: torch.Size([4, 2])


In [6]:
# Inspect the first linear layer. Every report below lists the weight first and the bias
# second, so each pair of values is produced by one generator over `(weight, bias)`.
print('model layer 1 data details:')
# The parameters themselves are `nn.Parameter` objects ...
print('\tdata type:\n\t\tweight: {}\n\t\tbias: {}'.format(
    *(type(param) for param in (model.layer_1.weight, model.layer_1.bias))
))
print('\tdata shape:\n\t\tweight: {}\n\t\tbias: {}'.format(
    *(param.shape for param in (model.layer_1.weight, model.layer_1.bias))
))
# ... whereas their `.data` attribute is a plain tensor of the same shape.
print('\tdata attr type:\n\t\tweight: {}\n\t\tbias: {}'.format(
    *(type(param.data) for param in (model.layer_1.weight, model.layer_1.bias))
))
print('\tdata attr shape:\n\t\tweight: {}\n\t\tbias: {}'.format(
    *(param.data.shape for param in (model.layer_1.weight, model.layer_1.bias))
))
print('\tdata:\n\t\tweight: {}\n\t\tbias: {}'.format(
    *(param.data for param in (model.layer_1.weight, model.layer_1.bias))
))
print('\tdata -> numpy:\n\t\tweight: {}\n\t\tbias: {}'.format(
    *(param.data.numpy() for param in (model.layer_1.weight, model.layer_1.bias))
))
# No backward pass has run yet at this point of the notebook.
print('\tgradients:\n\t\tweight: {}\n\t\tbias: {}'.format(
    *(param.grad for param in (model.layer_1.weight, model.layer_1.bias))
))

model layer 1 data details:
	data type:
		weight: <class 'torch.nn.parameter.Parameter'>
		bias: <class 'torch.nn.parameter.Parameter'>
	data shape:
		weight: torch.Size([5, 10])
		bias: torch.Size([5])
	data attr type:
		weight: <class 'torch.Tensor'>
		bias: <class 'torch.Tensor'>
	data attr shape:
		weight: torch.Size([5, 10])
		bias: torch.Size([5])
	data:
		weight: tensor([[-0.2757, -0.2238,  0.1012,  0.2820,  0.3134, -0.0605, -0.1855,  0.2337,
          0.2572,  0.2192],
        [-0.3035,  0.1361, -0.1711,  0.3041, -0.0623,  0.0329, -0.3068,  0.2005,
         -0.1149,  0.1247],
        [-0.3055, -0.2741,  0.1318,  0.0795,  0.0851, -0.0998, -0.0532,  0.2881,
         -0.2898, -0.2112],
        [ 0.2005, -0.2470, -0.1415,  0.2311, -0.1457, -0.0029, -0.0783,  0.1730,
         -0.1699,  0.0685],
        [ 0.0116, -0.2378, -0.2751,  0.2066,  0.1237, -0.2924,  0.1969, -0.1509,
         -0.3160, -0.0388]])
		bias: tensor([-0.1250,  0.3048, -0.2829,  0.2584, -0.1077])
	data -> numpy:
		w

In [7]:
# Inspect the second linear layer. Every report below lists the weight first and the bias
# second, so each pair of values is produced by one generator over `(weight, bias)`.
print('model layer 2 data details:')
# The parameters themselves are `nn.Parameter` objects ...
print('\tdata type:\n\t\tweight: {}\n\t\tbias: {}'.format(
    *(type(param) for param in (model.layer_2.weight, model.layer_2.bias))
))
print('\tdata shape:\n\t\tweight: {}\n\t\tbias: {}'.format(
    *(param.shape for param in (model.layer_2.weight, model.layer_2.bias))
))
# ... whereas their `.data` attribute is a plain tensor of the same shape.
print('\tdata attr type:\n\t\tweight: {}\n\t\tbias: {}'.format(
    *(type(param.data) for param in (model.layer_2.weight, model.layer_2.bias))
))
print('\tdata attr shape:\n\t\tweight: {}\n\t\tbias: {}'.format(
    *(param.data.shape for param in (model.layer_2.weight, model.layer_2.bias))
))
print('\tdata:\n\t\tweight: {}\n\t\tbias: {}'.format(
    *(param.data for param in (model.layer_2.weight, model.layer_2.bias))
))
print('\tdata -> numpy:\n\t\tweight: {}\n\t\tbias: {}'.format(
    *(param.data.numpy() for param in (model.layer_2.weight, model.layer_2.bias))
))
# The recorded output shows `None` for both gradients: no backward pass has run yet.
print('\tgradients:\n\t\tweight: {}\n\t\tbias: {}'.format(
    *(param.grad for param in (model.layer_2.weight, model.layer_2.bias))
))

model layer 2 data details:
	data type:
		weight: <class 'torch.nn.parameter.Parameter'>
		bias: <class 'torch.nn.parameter.Parameter'>
	data shape:
		weight: torch.Size([2, 5])
		bias: torch.Size([2])
	data attr type:
		weight: <class 'torch.Tensor'>
		bias: <class 'torch.Tensor'>
	data attr shape:
		weight: torch.Size([2, 5])
		bias: torch.Size([2])
	data:
		weight: tensor([[-0.2746, -0.2589, -0.0818,  0.1747, -0.3733],
        [-0.0527, -0.0410, -0.2782,  0.4263,  0.3732]])
		bias: tensor([0.0305, 0.1637])
	data -> numpy:
		weight: [[-0.2746404  -0.25892842 -0.08181432  0.17469496 -0.3732766 ]
 [-0.05265009 -0.04097661 -0.27816778  0.42629427  0.37320584]]
		bias: [0.030462   0.16367447]
	gradients:
		weight: None
		bias: None


In [8]:
# Overwriting a parameter's values through its `.data` attribute (this bypasses autograd).
print('*'*79)
print('original model:\n{}'.format(model.layer_1.weight))
# Build the replacement from the parameter's own tensor so that its shape and dtype always
# match the layer (instead of repeating the literal sizes), then fill it with U[0, 1) samples.
model.layer_1.weight.data = model.layer_1.weight.data.clone().uniform_()
print('update model:\n{}'.format(model.layer_1.weight))
print('*'*79)
# The same update, this time starting from a NumPy array of matching shape.
np_weights = np.random.random(size=tuple(model.layer_1.weight.shape)).astype(np.float32)
print('numpy data:\n{}'.format(np_weights))
# `torch.from_numpy` shares memory with the array. Clone the tensor so that later in-place
# changes to the weights (e.g. an optimizer step) do not silently modify `np_weights`.
model.layer_1.weight.data = torch.from_numpy(np_weights).clone()
print('update model from numpy data:\n{}'.format(model.layer_1.weight))
# The parameter itself is a leaf tensor and has no `grad_fn`; a slice of it is the result of
# an operation and therefore carries one. The slice is a non-leaf tensor, so reading its
# `.grad` does not give a populated gradient (PyTorch may emit a warning about this access).
print('*'*79)
print('model layer 1:\n\t{}\nmodel layer 1 slice:\n\t{}'.format(
    model.layer_1.weight, model.layer_1.weight[0:2, 0:2]
))
print('model layer 1 grad fn:\n\t{}\nmodel layer 1 slice grad fn:\n\t{}'.format(
    model.layer_1.weight.grad_fn, model.layer_1.weight[0:2, 0:2].grad_fn
))
print('model layer 1 grad:\n\t{}\nmodel layer 1 slice grad:\n\t{}'.format(
    model.layer_1.weight.grad, model.layer_1.weight[0:2, 0:2].grad
))


*******************************************************************************
original model:
Parameter containing:
tensor([[-0.2757, -0.2238,  0.1012,  0.2820,  0.3134, -0.0605, -0.1855,  0.2337,
          0.2572,  0.2192],
        [-0.3035,  0.1361, -0.1711,  0.3041, -0.0623,  0.0329, -0.3068,  0.2005,
         -0.1149,  0.1247],
        [-0.3055, -0.2741,  0.1318,  0.0795,  0.0851, -0.0998, -0.0532,  0.2881,
         -0.2898, -0.2112],
        [ 0.2005, -0.2470, -0.1415,  0.2311, -0.1457, -0.0029, -0.0783,  0.1730,
         -0.1699,  0.0685],
        [ 0.0116, -0.2378, -0.2751,  0.2066,  0.1237, -0.2924,  0.1969, -0.1509,
         -0.3160, -0.0388]], requires_grad=True)
update model:
Parameter containing:
tensor([[0.7269, 0.0331, 0.4493, 0.7921, 0.7990, 0.2386, 0.0191, 0.7845, 0.4033,
         0.7511],
        [0.4514, 0.5769, 0.9506, 0.3311, 0.5953, 0.0848, 0.7213, 0.4370, 0.5277,
         0.8182],
        [0.5869, 0.0522, 0.3129, 0.3035, 0.6376, 0.3869, 0.0780, 0.2773, 0.0634,
 

  model.layer_1.weight.grad, model.layer_1.weight[0:2, 0:2].grad


In [9]:
# Plain stochastic gradient descent over every parameter of the model.
optimizer = optim.SGD(model.parameters(), lr=1e-3)
print('optimizer:\n{}'.format(str(optimizer)))

optimizer:
SGD (
Parameter Group 0
    dampening: 0
    lr: 0.001
    momentum: 0
    nesterov: False
    weight_decay: 0
)


In [10]:
# Switching between training and evaluation mode while ignoring the return value of each
# call. The mode switch happens in place on `model`; `model.training` reports the current mode.
print('is model training?: {}'.format(model.training))
# `train(mode=False)` puts the model into evaluation mode ...
model.train(mode=False)
print('is model training?: {}'.format(model.training))
# ... and `train(mode=True)` switches it back to training mode.
model.train(mode=True)
print('is model training?: {}'.format(model.training))
# `eval()` also leaves the model in evaluation mode (`model.training` is False afterwards).
model.eval()
print('is model training?: {}'.format(model.training))
# Leave the model in training mode. As the last expression of the cell, the returned model is
# displayed as the cell's output.
model.train()

is model training?: True
is model training?: False
is model training?: True
is model training?: False


Model(
  (layer_1): Linear(in_features=10, out_features=5, bias=True)
  (act_1): ReLU()
  (layer_2): Linear(in_features=5, out_features=2, bias=True)
)

In [11]:
# The same mode switches as in the previous cell, but this time the value returned by each
# call is assigned back to `model`. The printed modes are the same as before.
print('is model training?: {}'.format(model.training))
model = model.train(mode=False)
print('is model training?: {}'.format(model.training))
model = model.train(mode=True)
print('is model training?: {}'.format(model.training))
model = model.eval()
print('is model training?: {}'.format(model.training))
# Leave the model in training mode; the assignment suppresses the cell output.
model = model.train()

is model training?: True
is model training?: False
is model training?: True
is model training?: False


In [12]:
# One forward pass: predictions for the toy batch, then the summed squared error
# against the targets.
pred_data = model(input_data)
loss = ((pred_data - output_data) ** 2).sum()
print('loss:\n{}'.format(loss))

loss:
15.137835502624512


In [13]:
# Backprop.
# Shows that `loss.backward()` populates the `.grad` of every parameter but leaves the
# parameter values themselves untouched. `loss` comes from the forward pass in the cell above.
print('*'*79)
print('data before backprop')
print('\tlayer 1:\n\t\tweight: {}\n\t\tbias: {}'.format(
    model.layer_1.weight.data, model.layer_1.bias.data
))
print('\tlayer 2:\n\t\tweight: {}\n\t\tbias: {}'.format(
    model.layer_2.weight.data, model.layer_2.bias.data
))
# Snapshot the values with `.clone()`. `.data` alone refers to the parameter's own storage,
# so un-cloned "before" and "after" snapshots would alias the same memory and the comparison
# at the end of this cell could never detect a change.
w1_before, b1_before = model.layer_1.weight.data.clone(), model.layer_1.bias.data.clone()
w2_before, b2_before = model.layer_2.weight.data.clone(), model.layer_2.bias.data.clone()
print('gradients before backprop')
print('\tlayer 1:\n\t\tweight: {}\n\t\tbias: {}'.format(
    model.layer_1.weight.grad, model.layer_1.bias.grad
))
print('\tlayer 2:\n\t\tweight: {}\n\t\tbias: {}'.format(
    model.layer_2.weight.grad, model.layer_2.bias.grad
))
# The backward pass: computes d(loss)/d(parameter) for every parameter that requires grad.
loss.backward()
print('*'*79)
print('data after backprop')
print('\tlayer 1:\n\t\tweight: {}\n\t\tbias: {}'.format(
    model.layer_1.weight.data, model.layer_1.bias.data
))
print('\tlayer 2:\n\t\tweight: {}\n\t\tbias: {}'.format(
    model.layer_2.weight.data, model.layer_2.bias.data
))
# Independent copies again, for the same reason as the "before" snapshots.
w1_after, b1_after = model.layer_1.weight.data.clone(), model.layer_1.bias.data.clone()
w2_after, b2_after = model.layer_2.weight.data.clone(), model.layer_2.bias.data.clone()
print('gradients after backprop')
print('\tlayer 1:\n\t\tweight: {}\n\t\tbias: {}'.format(
    model.layer_1.weight.grad, model.layer_1.bias.grad
))
print('\tlayer 2:\n\t\tweight: {}\n\t\tbias: {}'.format(
    model.layer_2.weight.grad, model.layer_2.bias.grad
))
print('*'*79)
print('gradient shapes after backprop')
print('\tlayer 1:\n\t\tweight: {}\n\t\tbias: {}'.format(
    model.layer_1.weight.grad.shape, model.layer_1.bias.grad.shape
))
print('\tlayer 2:\n\t\tweight: {}\n\t\tbias: {}'.format(
    model.layer_2.weight.grad.shape, model.layer_2.bias.grad.shape
))
# Each gradient has exactly the shape of the parameter it belongs to.
print('gradient shapes match:')
print('\tlayer 1:\n\t\tweight: {}\n\t\tbias: {}'.format(
    model.layer_1.weight.grad.shape==model.layer_1.weight.shape, model.layer_1.bias.grad.shape==model.layer_1.bias.shape
))
print('\tlayer 2:\n\t\tweight: {}\n\t\tbias: {}'.format(
    model.layer_2.weight.grad.shape==model.layer_2.weight.shape, model.layer_2.bias.grad.shape==model.layer_2.bias.shape
))
# Element-wise comparison of the independent snapshots taken around `loss.backward()`.
print('weights unchanged before and after backprop:')
print('\tlayer 1:\n\t\tweight: {}\n\t\tbias: {}'.format(
    torch.eq(w1_before, w1_after).all(), torch.eq(b1_before, b1_after).all()
))
print('\tlayer 2:\n\t\tweight: {}\n\t\tbias: {}'.format(
    torch.eq(w2_before, w2_after).all(), torch.eq(b2_before, b2_after).all()
))

*******************************************************************************
data before backprop
	layer 1:
		weight: tensor([[0.9790, 0.0416, 0.6085, 0.8745, 0.1410, 0.5263, 0.3361, 0.4033, 0.5472,
         0.3635],
        [0.5340, 0.3305, 0.5293, 0.2510, 0.0277, 0.3494, 0.2465, 0.2335, 0.8834,
         0.3390],
        [0.3491, 0.5322, 0.3330, 0.5291, 0.7240, 0.6802, 0.1524, 0.7553, 0.4154,
         0.6962],
        [0.1079, 0.3055, 0.4225, 0.6795, 0.8758, 0.6380, 0.7228, 0.1495, 0.9096,
         0.8255],
        [0.9953, 0.3955, 0.2228, 0.5578, 0.2041, 0.4756, 0.0886, 0.6081, 0.2021,
         0.5918]])
		bias: tensor([-0.1250,  0.3048, -0.2829,  0.2584, -0.1077])
	layer 2:
		weight: tensor([[-0.2746, -0.2589, -0.0818,  0.1747, -0.3733],
        [-0.0527, -0.0410, -0.2782,  0.4263,  0.3732]])
		bias: tensor([0.0305, 0.1637])
gradients before backprop
	layer 1:
		weight: None
		bias: None
	layer 2:
		weight: None
		bias: None
*******************************************************

In [14]:
# Snapshot the gradients currently stored on the parameters (weight first, then bias).
g_w1, g_b1 = (param.grad.data.clone() for param in (model.layer_1.weight, model.layer_1.bias))
g_w2, g_b2 = (param.grad.data.clone() for param in (model.layer_2.weight, model.layer_2.bias))
# Run 4 more forward/backward passes without resetting the gradients in between.
for _ in range(4):
    pred_data = model(input_data)
    loss = ((pred_data - output_data) ** 2).sum()
    loss.backward()
# Snapshot the gradients again after the additional passes.
g_w1_5epochs, g_b1_5epochs = (
    param.grad.data.clone() for param in (model.layer_1.weight, model.layer_1.bias)
)
g_w2_5epochs, g_b2_5epochs = (
    param.grad.data.clone() for param in (model.layer_2.weight, model.layer_2.bias)
)
# Distance between five times the first snapshot and the second snapshot.
print('gradients accumulated correctly:')
print('\tlayer 1:\n\t\tweight: {}\n\t\tbias: {}'.format(
    torch.dist(5*g_w1, g_w1_5epochs), torch.dist(5*g_b1, g_b1_5epochs)
))
print('\tlayer 2:\n\t\tweight: {}\n\t\tbias: {}'.format(
    torch.dist(5*g_w2, g_w2_5epochs), torch.dist(5*g_b2, g_b2_5epochs)
))
# The same comparison as an exact element-wise equality test.
print('gradients match correctly:')
print('\tlayer 1:\n\t\tweight: {}\n\t\tbias: {}'.format(
    (5*g_w1 == g_w1_5epochs).all(), (5*g_b1 == g_b1_5epochs).all()
))
print('\tlayer 2:\n\t\tweight: {}\n\t\tbias: {}'.format(
    (5*g_w2 == g_w2_5epochs).all(), (5*g_b2 == g_b2_5epochs).all()
))


gradients accumulated correctly:
	layer 1:
		weight: 0.0
		bias: 0.0
	layer 2:
		weight: 0.0
		bias: 0.0
gradients match correctly:
	layer 1:
		weight: True
		bias: True
	layer 2:
		weight: True
		bias: True


In [15]:
# To stop gradients from accumulating across backward passes, call `.zero_grad()` on the
# `optimizer`: it resets the gradient of every parameter the optimizer manages.
# NOTE(review): the recorded output shows zero-filled tensors after the reset; newer PyTorch
# releases may reset `.grad` to `None` instead -- confirm against the installed version.
print('current gradients')
print('\tlayer 1:\n\t\tweight: {}\n\t\tbias: {}'.format(
    *(param.grad for param in (model.layer_1.weight, model.layer_1.bias))
))
print('\tlayer 2:\n\t\tweight: {}\n\t\tbias: {}'.format(
    *(param.grad for param in (model.layer_2.weight, model.layer_2.bias))
))
print('gradients after `.zero_grad()` on the `optimizer`')
optimizer.zero_grad()
print('\tlayer 1:\n\t\tweight: {}\n\t\tbias: {}'.format(
    *(param.grad for param in (model.layer_1.weight, model.layer_1.bias))
))
print('\tlayer 2:\n\t\tweight: {}\n\t\tbias: {}'.format(
    *(param.grad for param in (model.layer_2.weight, model.layer_2.bias))
))

current gradients
	layer 1:
		weight: tensor([[ -0.2770,   0.0454,  -0.0317,   0.4463,  -0.1871,  -0.0287,   0.6350,
          -0.5172,  -0.1002,  -0.5640],
        [  1.1041,  -5.3218,   5.5335,   8.1680,   0.7223,  -7.4554,  -4.7095,
          -1.5210,   0.4657,  -5.1708],
        [ -2.8811,  -5.4991,   5.9507,  16.1975,  -1.9732,  -9.0926,   4.0751,
          -9.5416,  -0.9658, -14.4852],
        [  4.1099,   9.3287, -10.0497, -25.9778,   2.8215,  15.1561,  -5.1845,
          14.6457,   1.3587,  22.8053],
        [  6.8560,  -1.4497,   1.1280, -10.4184,   4.6294,   0.2324, -15.8565,
          12.5758,   2.4854,  13.4919]])
		bias: tensor([ -0.4912,  -4.9239, -13.1039,  20.6798,  11.7236])
	layer 2:
		weight: tensor([[-0.9997,  6.5129, 10.5165, 19.9080,  2.4107],
        [ 7.7488, 35.2996, 48.5269, 32.6213, 43.1756]])
		bias: tensor([-0.4204, 50.4751])
gradients after `.zero_grad()` on the `optimizer`
	layer 1:
		weight: tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 

In [16]:
# Verify that one plain-SGD step changes every parameter by `-lr * gradient`.
# Reset the tracked gradients.
optimizer.zero_grad()
# Save old weights for reference.
w1_before, b1_before = model.layer_1.weight.data.clone(), model.layer_1.bias.data.clone()
w2_before, b2_before = model.layer_2.weight.data.clone(), model.layer_2.bias.data.clone()
# Perform one forward-backward pass.
pred_data = model(input_data)
loss = (pred_data - output_data).pow(2).sum()
loss.backward()
# Save gradients for reference.
g_w1, g_b1 = model.layer_1.weight.grad.data.clone(), model.layer_1.bias.grad.data.clone()
g_w2, g_b2 = model.layer_2.weight.grad.data.clone(), model.layer_2.bias.grad.data.clone()
# In order to actually optimize the weights, call `.step()` on the `optimizer`.
optimizer.step()
w1_after, b1_after = model.layer_1.weight.data.clone(), model.layer_1.bias.data.clone()
w2_after, b2_after = model.layer_2.weight.data.clone(), model.layer_2.bias.data.clone()
# Read the step size from the optimizer instead of repeating the literal used when it was
# created, so this check stays valid if the learning rate is changed. The optimizer was built
# from `model.parameters()`, i.e. with a single parameter group.
learning_rate = optimizer.param_groups[0]['lr']
# Check. NOTE THAT IF THERE IS MOMENTUM OR OTHER MODIFICATIONS TO GRADIENT DESCENT, THE FOLLOWING MATCH WILL FAIL!!
# Each value printed below is the distance between the expected update (`-lr * gradient`)
# and the observed update (`after - before`); values near zero mean the two agree.
print('gradient updates match correctly:')
print('\tlayer 1:\n\t\tweight: {}\n\t\tbias: {}'.format(
    torch.dist(-learning_rate*g_w1, w1_after - w1_before), torch.dist(-learning_rate*g_b1, b1_after - b1_before)
))
print('\tlayer 2:\n\t\tweight: {}\n\t\tbias: {}'.format(
    torch.dist(-learning_rate*g_w2, w2_after - w2_before), torch.dist(-learning_rate*g_b2, b2_after - b2_before)
))

gradient updates match correctly:
	layer 1:
		weight: 1.0349069157200574e-07
		bias: 1.3445788482613352e-08
	layer 2:
		weight: 2.0795969390974278e-08
		bias: 5.600729657828651e-09


In [17]:
# Four ways of controlling whether a computation is tracked by autograd.
print('*'*79)
x = torch.randn(size=(3, 2), requires_grad=True)  # Equivalent: create the tensor, then call `.requires_grad_()`.
# 1) Default behaviour: `x` requires grad, so the result `y` carries a `grad_fn`.
y = x**2
print('Computation of y with gradients:')
print('x:\n{}\ny:\n{}'.format(x, y))
print('*'*79)
# 2) Inside `torch.no_grad()` the same computation is not tracked: `z` has no `grad_fn`.
print('Computation of z without gradients:')
with torch.no_grad():
    z = x**2
print('x:\n{}\nz:\n{}'.format(x, z))
print('*'*79)
# 3) Compute with tracking, then cut the result out of the graph afterwards.
print('Computation of w without gradients:')
w = x**2
# w.requires_grad = False  # THIS WILL NOT WORK: `w` is a non-leaf tensor, the flag can only be changed on leaf tensors.
w = w.detach()  # Returns a new tensor that is not part of the autograd graph. It shares storage with the original (it is NOT a copy); add `.clone()` for an independent copy.
print('x:\n{}\nw:\n{}'.format(x, w))
# 4) Switching gradient tracking off on the input tensor itself.
print('*'*79)
print('x:\n{}'.format(x))
# While `x` still requires grad, computing inside `torch.enable_grad()` is tracked as usual.
with torch.enable_grad():
    u = x**2
print('x:\n{}\nu:\n{}'.format(x, u))
# `x` is a leaf tensor, so its `requires_grad` flag may be switched off directly.
x.requires_grad = False
print('*'*39)
print('x:\n{}'.format(x))
# `enable_grad()` only controls the grad mode; with an input that no longer requires grad
# there is nothing to track, so `u` is expected to have no `grad_fn` here.
with torch.enable_grad():
    u = x**2
print('x:\n{}\nu:\n{}'.format(x, u))

*******************************************************************************
Computation of y with gradients:
x:
tensor([[ 0.1745,  0.5423],
        [ 0.1308, -2.0104],
        [ 1.6899,  0.0594]], requires_grad=True)
y:
tensor([[3.0438e-02, 2.9408e-01],
        [1.7108e-02, 4.0419e+00],
        [2.8556e+00, 3.5254e-03]], grad_fn=<PowBackward0>)
*******************************************************************************
Computation of z without gradients:
x:
tensor([[ 0.1745,  0.5423],
        [ 0.1308, -2.0104],
        [ 1.6899,  0.0594]], requires_grad=True)
z:
tensor([[3.0438e-02, 2.9408e-01],
        [1.7108e-02, 4.0419e+00],
        [2.8556e+00, 3.5254e-03]])
*******************************************************************************
Computation of w without gradients:
x:
tensor([[ 0.1745,  0.5423],
        [ 0.1308, -2.0104],
        [ 1.6899,  0.0594]], requires_grad=True)
w:
tensor([[3.0438e-02, 2.9408e-01],
        [1.7108e-02, 4.0419e+00],
        [2.8556e+00, 3.5

In [18]:
# Autograd profiler.
# Query CUDA availability once and derive the device from it.
# NOTE(review): `device` is not used in this cell -- neither the model nor the data is moved to it here.
is_cuda = torch.cuda.is_available()
device = torch.device('cuda' if is_cuda else 'cpu')
# Profile ten forward/backward passes. Gradients are not reset between the passes.
with torch.autograd.profiler.profile(use_cuda=is_cuda) as profile:
    for _ in range(10):
        pred_data = model(input_data)
        loss = ((pred_data - output_data) ** 2).sum()
        loss.backward()
# Print the per-operator averages, ordered by the `self_cpu_time_total` column.
print('profile:\n{}'.format(
    profile.key_averages().table(sort_by='self_cpu_time_total')
))

profile:
-----------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                               Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-----------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                          aten::pow        11.16%     236.558us        12.67%     268.717us      13.436us            20  
                        aten::addmm         9.33%     197.754us        12.76%     270.492us      13.525us            20  
                            aten::t         7.72%     163.624us        15.12%     320.553us       3.206us           100  
                      AddmmBackward         6.85%     145.339us        21.01%     445.532us      22.277us            20  
                          aten::mul         5.85%     124.023us         8.85%     187.583us       6.253us            30  
               

In [27]:
# Computing Jacobians and Hessians for a computation.
# For this, we need to wrap our computation as a function.
def calculate_loss(input_data_):
    r"""Computes the element-wise squared error of the model's predictions.

    Uses the notebook-level ``model`` and ``output_data``.

    Parameters
    ----------
    input_data_:
        The tensor that is passed to ``model``.

    Returns
    -------
    squared_error:
        ``(model(input_data_) - output_data)**2``; the result is not reduced to a scalar.
    """
    squared_error = (model(input_data_) - output_data) ** 2
    return squared_error
print('input:\n{}'.format(input_data))
# `jacobian` is given a tuple of inputs and is indexed per input below, so the single
# input is wrapped in a 1-tuple.
inputs = (input_data,)
jacobian = torch.autograd.functional.jacobian(func=calculate_loss, inputs=inputs)
print('d Loss / d Inputs')
# Entry 0 is the Jacobian of the output with respect to `input_data`.
print('\tshape: {}'.format(jacobian[0].size()))
print('\tdata: {}'.format(jacobian[0]))
# A simple example.
print('*' * 79)
def fn(input_1, input_2):
    r"""Returns ``input_1 + input_2**2``.

    Parameters
    ----------
    input_1:
        The first operand, added as is.
    input_2:
        The second operand, squared before the addition.

    Returns
    -------
    out:
        The sum of ``input_1`` and the square of ``input_2``.
    """
    squared_term = input_2 ** 2
    return input_1 + squared_term
# Standard-normal inputs created directly with gradient tracking enabled. `torch.randn` is
# used instead of the legacy `torch.Tensor(...)` constructor followed by an in-place fill.
input_1_data = torch.randn(size=(3, 4), requires_grad=True)
input_2_data = torch.randn(size=(4, ), requires_grad=True)
# With a tuple of inputs, `jacobian` returns one Jacobian per input, in the same order.
input_pair = (input_1_data, input_2_data)
jacobian = torch.autograd.functional.jacobian(fn, input_pair)
print('jacobian:')
# Jacobian of the output of `fn` with respect to `input_1`.
print('\tinput 1:')
print('\t\tshape: {}'.format(jacobian[0].shape))
print('\t\tdata: {}'.format(jacobian[0]))
# Jacobian of the output of `fn` with respect to `input_2`.
print('\tinput 2:')
print('\t\tshape: {}'.format(jacobian[1].shape))
print('\t\tdata: {}'.format(jacobian[1]))

input:
tensor([[-0.2604,  1.1346, -1.1783, -1.6933, -0.1709,  1.5809,  1.0596,  0.2748,
         -0.1083,  1.0456],
        [-0.0447,  0.2723, -0.1933, -0.9828, -0.5130, -1.9057,  1.0917, -0.7817,
         -0.6409, -1.6574],
        [ 0.5639, -0.0925,  0.0646, -0.9086,  0.3809,  0.0585, -1.2927,  1.0529,
          0.2041,  1.1482],
        [ 0.3414, -0.1821, -0.6738,  0.4276, -0.6007, -0.5217,  0.0145,  0.2583,
          0.0299, -2.6737]], requires_grad=True)
d Loss / d Inputs
	shape: torch.Size([4, 2, 4, 10])
	data: tensor([[[[-1.6328, -0.7112, -0.5510, -0.6306,  0.0173, -0.6822,  0.0381,
           -1.0199, -0.5788, -0.7113],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
            0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
            0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
            0.0000,  0.0000,  0.0000]],

         [[ 0.8130,  0.