In [1]:
import numpy as np
import torch
import torch.utils.data as torchdata
import torch.nn as nn
import torch.nn.functional as F
import torchvision

import utility as util
import utility.doc

## Numerical Operations

In [61]:
# Create a tensor directly from ndarray using torch.tensor()
A = np.arange(4).reshape(2,2)
A, torch.tensor(A)

(array([[0, 1],
        [2, 3]]),
 tensor([[0, 1],
         [2, 3]]))

In [90]:
# create a tensor of size (2, 2)
# Usually, use torch.tensor() instead of torch.Tensor() as the former is the (default) factory method.
A = torch.tensor([[1,2],[3,4]])
B = torch.arange(4).reshape((2,2))
# Loop through the first axis of a tensor
for a in A:
    print(a)
A.shape, B.shape, A, B

tensor([1, 2])
tensor([3, 4])


(torch.Size([2, 2]),
 torch.Size([2, 2]),
 tensor([[1, 2],
         [3, 4]]),
 tensor([[0, 1],
         [2, 3]]))

In [91]:
# Inverting a tensor of booleans
a = torch.tensor([True,False,True])
~a

tensor([False,  True, False])

In [92]:
# casting one type to the torch.DoubleTensor (aka. float64)
a = torch.tensor([True,False,True])
a.double(), a.double().type()

(tensor([1., 0., 1.], dtype=torch.float64), 'torch.DoubleTensor')

In [4]:
# Boolean array operations
a = torch.tensor([True,False,True,False])
b = torch.tensor([False,True,True,False])
a | b, a & b

(tensor([ True,  True,  True, False]), tensor([False, False,  True, False]))

In [12]:
# stacking along a new axis 0 and axis 1
a = torch.Tensor([1,2]); b = torch.Tensor([3,4]); c = torch.Tensor([5,6])
torch.stack([a,b,c]), torch.stack([a,b,c], dim=1)

(tensor([[1., 2.],
         [3., 4.],
         [5., 6.]]),
 tensor([[1., 3., 5.],
         [2., 4., 6.]]))

In [11]:
# concatenating tensors together
# no new dimension is introduced
a = torch.Tensor([1,2]); b = torch.Tensor([3,4]); c = torch.Tensor([5,6])
torch.cat([a, b, c])

tensor([1., 2., 3., 4., 5., 6.])

In [13]:
# convenience function to remove a row from a tensor
A = (torch.arange(8) + 1).reshape((4, 2))
a = 2
A, torch.cat([A[:a], A[a + 1:]])

(tensor([[1, 2],
         [3, 4],
         [5, 6],
         [7, 8]]),
 tensor([[1, 2],
         [3, 4],
         [7, 8]]))

In [22]:
A = torch.arange(6).reshape((2,3))
A, A.shape, A.unsqueeze(0).shape, A.unsqueeze(1).shape, A.unsqueeze(2).shape

(tensor([[0, 1, 2],
         [3, 4, 5]]),
 torch.Size([2, 3]),
 torch.Size([1, 2, 3]),
 torch.Size([2, 1, 3]),
 torch.Size([2, 3, 1]))

In [25]:
A = torch.arange(6).reshape((2,3))
A, A.shape, A.unsqueeze(-1), A.unsqueeze(-1).shape, A.unsqueeze(-2).shape

(tensor([[0, 1, 2],
         [3, 4, 5]]),
 torch.Size([2, 3]),
 tensor([[[0],
          [1],
          [2]],
 
         [[3],
          [4],
          [5]]]),
 torch.Size([2, 3, 1]),
 torch.Size([2, 1, 3]))

In [35]:
A = torch.arange(6).reshape((2,3))
A, A.repeat((2,1)), A.repeat((1,2))

(tensor([[0, 1, 2],
         [3, 4, 5]]),
 tensor([[0, 1, 2],
         [3, 4, 5],
         [0, 1, 2],
         [3, 4, 5]]),
 tensor([[0, 1, 2, 0, 1, 2],
         [3, 4, 5, 3, 4, 5]]))

In [40]:
# torch.tile() is the same as Tensor.repeat()
A = torch.arange(6).reshape((2,3))
A, torch.tile(A, (2,1)), torch.tile(A, (1,2))

(tensor([[0, 1, 2],
         [3, 4, 5]]),
 tensor([[0, 1, 2],
         [3, 4, 5],
         [0, 1, 2],
         [3, 4, 5]]),
 tensor([[0, 1, 2, 0, 1, 2],
         [3, 4, 5, 3, 4, 5]]))

In [16]:
# Duplicating a matrix along a new axis using torch.tile()
A = torch.arange(12).reshape((3,2,2))
B = torch.tile(A.unsqueeze(2), (2, 1))
A.shape, B.shape, A, B

(torch.Size([3, 2, 2]),
 torch.Size([3, 2, 2, 2]),
 tensor([[[ 0,  1],
          [ 2,  3]],
 
         [[ 4,  5],
          [ 6,  7]],
 
         [[ 8,  9],
          [10, 11]]]),
 tensor([[[[ 0,  1],
           [ 0,  1]],
 
          [[ 2,  3],
           [ 2,  3]]],
 
 
         [[[ 4,  5],
           [ 4,  5]],
 
          [[ 6,  7],
           [ 6,  7]]],
 
 
         [[[ 8,  9],
           [ 8,  9]],
 
          [[10, 11],
           [10, 11]]]]))

In [12]:
# using sums
A = (torch.arange(8) + 1).reshape((4, 2))
torch.sum(A), torch.sum(A, dim=0), torch.sum(A, dim=1)

(tensor(36), tensor([16, 20]), tensor([ 3,  7, 11, 15]))

In [2]:
A = torch.Tensor([[1, 2], [0, 1]])
torch.matrix_exp(A)

tensor([[2.7183, 5.4366],
        [0.0000, 2.7183]])

In [3]:
A = torch.Tensor([[[1, 2], [0, 1]], [[1, 2], [0, 1]], [[1, 2], [0, 1]]])
torch.matrix_exp(A)

tensor([[[2.7183, 5.4366],
         [0.0000, 2.7183]],

        [[2.7183, 5.4366],
         [0.0000, 2.7183]],

        [[2.7183, 5.4366],
         [0.0000, 2.7183]]])

In [42]:
# Batched Matrix-matrix multiplication using np.einsum()
# Matrix-matrix multiplication
A = torch.Tensor([
    [
        [1, 2],
        [4, 2]
    ],[
        [3, 1],
        [2,-1]
    ]
])
B = torch.Tensor([
    [
        [2, 0],
        [0, -1]
    ],[
        [2, 1],
        [1, 0]
    ]
])
A[0] @ B[0], A[1] @ B[1], torch.einsum("...ij,...jk->...ik", A, B)

(tensor([[ 2., -2.],
         [ 8., -2.]]),
 tensor([[7., 3.],
         [3., 2.]]),
 tensor([[[ 2., -2.],
          [ 8., -2.]],
 
         [[ 7.,  3.],
          [ 3.,  2.]]]))

In [14]:
# Trace
A1 = torch.Tensor([[ 1, 2], [3, 4]])
A2 = torch.Tensor([[-1, 2], [3, 2]])
A3 = torch.Tensor([[ 2,-1], [3, 1]])
A4 = torch.Tensor([[ 2, 1], [1, 0]])
A = torch.stack([
    torch.stack([A1, A2]),
    torch.stack([A3, A4])])
torch.trace(A1), torch.trace(A2), torch.trace(A3), \
        torch.trace(A4), torch.einsum("...ii", A)

(tensor(5.),
 tensor(1.),
 tensor(3.),
 tensor(2.),
 tensor([[5., 1.],
         [3., 2.]]))

In [98]:
A = torch.randn(2,2,2)
values, indices = A.max(0)
values

tensor([[-1.3450,  0.3728],
        [ 0.4188, -0.8023]])

In [104]:
A = torch.randn(2,2,2).unsqueeze(0)
values, indices = A.max(0)
values

tensor([[[ 1.8009,  0.5166],
         [ 0.4627, -0.4415]],

        [[ 0.0510,  2.5605],
         [-0.6070,  0.3585]]])

## Indexing, selection and manipulating tensor shapes

In [52]:
A = torch.tensor([
        [1,2],
        [3,4],
        [5,6],
        [7,8]])
indices = torch.tensor([0, 2, 1, 0], dtype=torch.long)
A[indices]

tensor([[1, 2],
        [5, 6],
        [3, 4],
        [1, 2]])

In [83]:
# Indicing tensors.
# Cannot use ndarray or lists as indices. They must be integer/long tensors.
A = torch.tensor([
        [1,2],
        [3,4],
        [5,6],
        [7,8]])
ind1 = torch.tensor([[0],[1],[0],[1]])
ind2 = torch.tensor([[3],[1]])
torch.gather(A, 0, ind2), torch.gather(A, 1, ind1), torch.gather(A, 1, ind1).squeeze(1)

(tensor([[7],
         [3]]),
 tensor([[1],
         [4],
         [5],
         [8]]),
 tensor([1, 4, 5, 8]))

In [36]:
# Q_values has shape (batch=2, history=3, 2)
Q_values = torch.tensor([
    # batch
    [
        # history
        [1, 2],
        [3, 4],
        [5, 6],
    ],[
        [ 7,  8],
        [ 9, 10],
        [11, 12]
    ]
])
# actions has shape (batch=2,history=3,1)
actions = torch.tensor([
    # batch
    [
        # history
        [0],
        [1],
        [0]
    ],[
        [0],
        [0],
        [1]
    ]
])
_Q_values = Q_values.gather(2, actions).squeeze(2)
_Q_values.shape

torch.Size([2, 3])

In [69]:
# Advanced usage of torch.gather()
# Documentation for torch.gather()
# https://stackoverflow.com/questions/50999977/what-does-the-gather-function-do-in-pytorch-in-layman-terms
A = torch.tensor([
        [1,2],
        [3,4],
        [5,6],
        [7,8]])
ind1 = torch.tensor([[0,1],[3,2]])
ind2 = torch.tensor([[0,1],[1,0],[0,1]])
torch.gather(A, 0, ind1), torch.gather(A, 1, ind2)

(tensor([[1, 4],
         [7, 6]]),
 tensor([[1, 2],
         [4, 3],
         [5, 6]]))

In [75]:
# Using masks to select subtensors
A = torch.arange(6).reshape(-1, 2)
mask = torch.tensor([True,False,True])
A, A[mask]

(tensor([[0, 1],
         [2, 3],
         [4, 5]]),
 tensor([[0, 1],
         [4, 5]]))

In [77]:
# reshaping tensors
A = torch.Tensor([[[1,2],[3,4]],[[5,6],[7,8]]])
A, A.reshape((2,-1))

(tensor([[[1., 2.],
          [3., 4.]],
 
         [[5., 6.],
          [7., 8.]]]),
 tensor([[1., 2., 3., 4.],
         [5., 6., 7., 8.]]))

In [9]:
# Selecting entries from a tensor as a vector
"""
tensor([[0, 1, 2],
        [3, 4, 5],
        [6, 7, 8]])
"""
A = torch.arange(9).reshape(3,3)
B = torch.zeros(A.shape, dtype=torch.bool)
B[0,1] = True
B[0,2] = True
B[1,1] = True
B[2,0] = True
B, A[B]

(tensor([[False,  True,  True],
         [False,  True, False],
         [ True, False, False]]),
 tensor([1, 2, 4, 6]))

In [4]:
# sorting each sample in a batch of values
A = torch.rand(3, 5)
values, indices = torch.sort(A, dim=1)
A, values, indices

(tensor([[0.9266, 0.6806, 0.9140, 0.8071, 0.2495],
         [0.8059, 0.1397, 0.9016, 0.3367, 0.5274],
         [0.5550, 0.8124, 0.9493, 0.2014, 0.6831]]),
 tensor([[0.2495, 0.6806, 0.8071, 0.9140, 0.9266],
         [0.1397, 0.3367, 0.5274, 0.8059, 0.9016],
         [0.2014, 0.5550, 0.6831, 0.8124, 0.9493]]),
 tensor([[4, 1, 3, 2, 0],
         [1, 3, 4, 0, 2],
         [3, 0, 4, 1, 2]]))

In [77]:
# Sort batched tensors of shape (batch size, length, n dimensions) along the first column 
A = torch.tensor([
    [
        [2, 100],
        [1, -1],
        [4, 1],
        [3, 0]
    ],[
        [1, 1],
        [4, 0],
        [2, 100],
        [3, -1]
    ],[
        [3, 0],
        [2, 1],
        [1, 100],
        [4, -1]
    ]
]) # has shape (3, 4, 2)
# sort A by first column by selecting that column
# and get indices to apply to A
# indices has shape (3, 4)
_, indices = torch.sort(A[...,0], dim=1)
# repeat indices since we want to apply indices to both columns
# indices has shape (3, 4, 2)
indices = indices.unsqueeze(-1).repeat(1,1,2)
# use gather to sort A itself
torch.gather(A, 1, indices)

tensor([[[  1,  -1],
         [  2, 100],
         [  3,   0],
         [  4,   1]],

        [[  1,   1],
         [  2, 100],
         [  3,  -1],
         [  4,   0]],

        [[  1, 100],
         [  2,   1],
         [  3,   0],
         [  4,  -1]]])

## Tensor Manipulations

In [15]:
"""Padding tensor at the end of an array
tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]]])
"""
A = torch.arange(12).reshape(2,3,2)
# second argument specifies padding at
# (front of n-th axis, back of n-th axis, front of axis n-1, back of axis n-1, ...)
F.pad(A, (0,3,0,2,0,1), mode='constant', value=0)

tensor([[[ 0,  1,  0,  0,  0],
         [ 2,  3,  0,  0,  0],
         [ 4,  5,  0,  0,  0],
         [ 0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0]],

        [[ 6,  7,  0,  0,  0],
         [ 8,  9,  0,  0,  0],
         [10, 11,  0,  0,  0],
         [ 0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0]],

        [[ 0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0]]])

In [18]:
"""Padding tensor at some parts of an array"""
A = torch.arange(12).reshape(2,3,2)
# second argument specifies padding at
# (front of n-th axis, back of n-th axis, front of axis n-1, back of axis n-1, ...)
F.pad(A, (0,3,1,0), mode='constant', value=0)

tensor([[[ 0,  0,  0,  0,  0],
         [ 0,  1,  0,  0,  0],
         [ 2,  3,  0,  0,  0],
         [ 4,  5,  0,  0,  0]],

        [[ 0,  0,  0,  0,  0],
         [ 6,  7,  0,  0,  0],
         [ 8,  9,  0,  0,  0],
         [10, 11,  0,  0,  0]]])

In [19]:
"""Padding tensor at the front of an array"""
A = torch.arange(12).reshape(2,3,2)
# second argument specifies padding at
# (front of n-th axis, back of n-th axis, front of axis n-1, back of axis n-1, ...)
F.pad(A, (3,0,2,0,1,0), mode='constant', value=0)

tensor([[[ 0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0]],

        [[ 0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  1],
         [ 0,  0,  0,  2,  3],
         [ 0,  0,  0,  4,  5]],

        [[ 0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0],
         [ 0,  0,  0,  6,  7],
         [ 0,  0,  0,  8,  9],
         [ 0,  0,  0, 10, 11]]])

In [20]:
A = torch.arange(12).reshape(2,3,2)
F.pad(A, (3,0), mode='constant', value=0)

tensor([[[ 0,  0,  0,  0,  1],
         [ 0,  0,  0,  2,  3],
         [ 0,  0,  0,  4,  5]],

        [[ 0,  0,  0,  6,  7],
         [ 0,  0,  0,  8,  9],
         [ 0,  0,  0, 10, 11]]])

In [33]:
# splitting the array using np.where()
A = torch.arange(12).reshape(3, 4) - 6
A, torch.where(A < 0, -1, 0) + torch.where(A > 0, 1, 0)

(tensor([[-6, -5, -4, -3],
         [-2, -1,  0,  1],
         [ 2,  3,  4,  5]]),
 tensor([[-1, -1, -1, -1],
         [-1, -1,  0,  1],
         [ 1,  1,  1,  1]]),
 (tensor([0, 0, 0, 0, 1, 1, 1, 1]), tensor([0, 1, 2, 3, 0, 1, 2, 3])))

In [40]:
# assigning arrays using masks
A = torch.arange(12, dtype=torch.int).reshape(3, 4) - 6
B = torch.arange(12, dtype=torch.int).reshape(3, 4)
C = torch.zeros(A.shape, dtype=torch.int)
C[A < 0] = B[A < 0]
A, B, C

(tensor([[-6, -5, -4, -3],
         [-2, -1,  0,  1],
         [ 2,  3,  4,  5]], dtype=torch.int32),
 tensor([[ 0,  1,  2,  3],
         [ 4,  5,  6,  7],
         [ 8,  9, 10, 11]], dtype=torch.int32),
 tensor([[0, 1, 2, 3],
         [4, 5, 0, 0],
         [0, 0, 0, 0]], dtype=torch.int32))

## Sampling

There is no `torch` equivalent of `np.random.choice()`. See:
<https://discuss.pytorch.org/t/torch-equivalent-of-numpy-random-choice/16146/4>

In [58]:
# sample indices without replacement
N = 10 # we will sample indices from [0, 1, 2, ..., N - 1]
k = 5 # we will sample k elements
generator = torch.Generator()
torch.multinomial(torch.ones(N, dtype=torch.float), k, replacement=False, generator=generator)

tensor([0, 2, 6, 3, 5])

In [83]:
# using datasets, index sampling and data loader
class MyDataset(torchdata.Dataset):
    def __init__(self, data, labels):
        self.__data = data
        self.__labels = labels
        
    def __getitem__(self, idx):
        return (
            torch.tensor(self.__data[idx], dtype=torch.float),
            torch.tensor(self.__labels[idx], dtype=torch.long)
        )
    
    def __len__(self):
        return len(self.__labels)

data   = np.arange(24)
labels = [0]*12 + [1]*12
dataset = MyDataset(data, labels)
generator = torch.Generator()
split = [18, 6]
trainset, testset = torchdata.random_split(dataset, split, generator=generator)

trainsampler = torchdata.SubsetRandomSampler(range(split[0]), generator=generator)
trainloader = torchdata.DataLoader(trainset, sampler=trainsampler, batch_size=3, num_workers=1)

testsampler = torchdata.SubsetRandomSampler(range(split[1]), generator=generator)
testloader = torchdata.DataLoader(testset, sampler=testsampler, batch_size=3, num_workers=1)

print("Retrieve samples and labels of the training set in the training loop.")
for samples, labels in trainloader:
    print(samples, labels)
print("Retrieve samples and labels in the testing set in the evaluation loop.")
for samples, labels in testloader:
    print(samples, labels)
print("Sampling indices from SubsetRandomSampler")
for i in testsampler:
    print(i, end=' ')
print()

Retrieve samples and labels of the training set in the training loop.
tensor([8., 9., 4.]) tensor([0, 0, 0])
tensor([ 3., 19., 22.]) tensor([0, 1, 1])
tensor([23., 17., 13.]) tensor([1, 1, 1])
tensor([11.,  6., 20.]) tensor([0, 0, 1])
tensor([ 5., 18.,  0.]) tensor([0, 1, 0])
tensor([16.,  1., 14.]) tensor([1, 0, 1])
Retrieve samples and labels in the testing set in the evaluation loop.
tensor([ 7., 12.,  2.]) tensor([0, 1, 0])
tensor([21., 10., 15.]) tensor([1, 0, 1])
Sampling indices from SubsetRandomSampler
0 2 5 3 4 1 


## Torch Modules

In [54]:
# L1 loss function
criterion = nn.SmoothL1Loss()
a = torch.tensor([0,1,1,2], dtype=torch.float).reshape(2,2)
b = torch.tensor([0,1,2,0], dtype=torch.float).reshape(2,2)
loss = criterion(a, b)
loss

tensor(0.5000)

In [13]:
# Linear takes in input with multiple dimensions, and linear only applies the last dimension.
mlp = nn.Sequential()
mlp.add_module("linear", nn.Linear(2, 8))
mlp.add_module("relu", nn.ReLU(inplace=True))

def do_mlp(x):
    x = torch.tensor(x)
    h = mlp(x)
    return h.detach().numpy()

h1 = do_mlp([0.8, 0.2])
h2 = do_mlp([[0.8, 0.2],[0.4, 0.6]])
h3 = do_mlp([
    [
        [0.8, 0.2],
        [0.4, 0.6],
        [0.1, 0.9]
    ],[
        [0.1, 0.2],
        [0.3, 0.9],
        [0.3, 0.5]
    ]
])
util.doc.results(
    "output of mlp given [0.8, 0.2]", np.round(h1, 2),
    "shape of output", h1.shape,
    "output of mlp given [[0.8, 0.2],[0.4, 0.6]]", np.round(h2, 2),
    "shape of output", h2.shape,
    "output of mlp given input of shape (2,3,2)", np.round(h3, 2),
    "shape of output", h3.shape
)

output of mlp given [0.8, 0.2]
[0.   0.   0.   0.85 0.   0.   0.08 0.75]

shape of output
(8,)

output of mlp given [[0.8, 0.2],[0.4, 0.6]]
[[0.   0.   0.   0.85 0.   0.   0.08 0.75]
 [0.09 0.02 0.   0.54 0.04 0.   0.04 0.8 ]]

shape of output
(2, 8)

output of mlp given input of shape (2,3,2)
[[[0.   0.   0.   0.85 0.   0.   0.08 0.75]
  [0.09 0.02 0.   0.54 0.04 0.   0.04 0.8 ]
  [0.41 0.06 0.05 0.3  0.33 0.   0.01 0.84]]

 [[0.05 0.45 0.   0.57 0.   0.   0.11 0.73]
  [0.3  0.   0.03 0.38 0.26 0.   0.   0.84]
  [0.09 0.15 0.   0.54 0.01 0.   0.06 0.78]]]

shape of output
(2, 3, 8)


In [3]:
dist1 = torch.distributions.normal.Normal(loc= 1.0, scale=0.4)
dist2 = torch.distributions.normal.Normal(loc=-1.0, scale=0.2)

def get_x():
    x1 = dist1.sample((10,))
    x2 = dist2.sample((10,))
    return torch.hstack((x1[:,None], x2[:,None]))

# updates running mean and std using exponentially moving average.
# Setting momentum to 0 makes batch norm fail.
bn = nn.BatchNorm1d(2, momentum=0.1)

bn.train()
for _ in range(100):
    x = get_x()
    h = bn(x)
bn.eval()

def do_bn(x):
    x = torch.tensor(x, dtype=torch.float)
    h = bn(x)
    return h.detach().numpy()

print(bn.running_mean, bn.running_var)

h1 = do_bn([[1,-1]])
h2 = do_bn([
    [1.4, -0.8],
    [0.6, -1.2]
])

util.doc.results(
    "batch norm running mean", bn.running_mean,
    "batch norm running variance", bn.running_var,
    "output of bn given [1, -1]", np.round(h1, 2),
    "shape of output", h1.shape,
    "output of bn given data 1 std away", np.round(h2, 2),
    "shape of output", h2.shape,
)

tensor([ 1.0147, -1.0108]) tensor([0.1709, 0.0407])
batch norm running mean
tensor([ 1.0147, -1.0108])

batch norm running variance
tensor([0.1709, 0.0407])

output of bn given [1, -1]
[[-0.04  0.05]]

shape of output
(1, 2)

output of bn given data 1 std away
[[ 0.93  1.04]
 [-1.   -0.94]]

shape of output
(2, 2)


## Back Propagation

In [2]:
# Manual Neural Network, MSE loss, using PyTorch
W1 = torch.tensor([[0.5, 0.55], [0.6, 0.65]], requires_grad=True)
W2 = torch.tensor([[0.7, 0.75], [0.8, 0.85]], requires_grad=True)
b1 = torch.tensor([0.35, 0.35])
b2 = torch.tensor([0.6, 0.6])
x = torch.tensor([0.3, 0.5])
y = torch.tensor([0.01, 0.99])

optimizer = torch.optim.SGD([W1, W2], lr=0.5)
optimizer.zero_grad()

yhat = torch.sigmoid(W2 @ torch.sigmoid(W1 @ x + b1) + b2)
E_total = 0.5*torch.sum((y - yhat)**2)
torch.set_printoptions(precision=6)
print(E_total)
E_total.backward(create_graph=True)
print("Torch gradients")
print(W1.grad)
print(W2.grad)

optimizer.step()
print("Weights after update")
print(W1)
print(W2)

tensor(0.348113, grad_fn=<MulBackward0>)
Torch gradients
tensor([[0.004284, 0.007141],
        [0.004459, 0.007431]], grad_fn=<CopyBackwards>)
tensor([[ 0.078441,  0.080391],
        [-0.012035, -0.012334]], grad_fn=<CopyBackwards>)
Weights after update
tensor([[0.497858, 0.546430],
        [0.597771, 0.646284]], requires_grad=True)
tensor([[0.660779, 0.709805],
        [0.806018, 0.856167]], requires_grad=True)
