Based on
https://pytorch.org/tutorials/beginner/pytorch_with_examples.html

Warm-up: numpy

In [1]:
import numpy as np

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Draw every random number from one explicitly seeded generator so that a
# fresh "Restart Kernel -> Run All" reproduces the same data, the same
# initial weights and therefore the same loss trace.
SEED = 0
rng = np.random.default_rng(SEED)

# Create random input and output data
x = rng.standard_normal((N, D_in))
y = rng.standard_normal((N, D_out))

# Randomly initialize weights
w1 = rng.standard_normal((D_in, H))
w2 = rng.standard_normal((H, D_out))

learning_rate = 1e-6
for t in range(100):
    # Forward pass: compute predicted y (linear -> ReLU -> linear)
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)

    # Compute and print loss (sum of squared errors over the whole batch)
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    # ReLU only lets gradient through where its input was not negative
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)

    # Update weights in place with plain gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
    
    

0 41420513.00630514
1 44881772.73801346
2 52403661.881827965
3 50511008.215685636
4 34325568.738956325
5 15788278.032777727
6 6064683.557893049
7 2718077.5916551803
8 1647155.546591505
9 1215233.815173144
10 974456.4146416697
11 807190.0892431247
12 679301.9612825266
13 577443.3578716603
14 494876.0719067117
15 427195.385702536
16 371022.4900639296
17 324065.00383904495
18 284450.7022410064
19 250764.47124104324
20 222015.40426886937
21 197317.18786830935
22 175975.79694333405
23 157456.02148279763
24 141286.32481612163
25 127105.49414794247
26 114642.11242151276
27 103641.96174659909
28 93871.05368797196
29 85188.10003730448
30 77441.65395536847
31 70518.87695551768
32 64309.792985994674
33 58733.816315638476
34 53715.698523465944
35 49187.69966046176
36 45107.36350855911
37 41428.134318276825
38 38089.90674232443
39 35055.914144801296
40 32295.653871495346
41 29779.730391080087
42 27483.580249599163
43 25384.87304521271
44 23465.156123787958
45 21708.89607628458
46 20097.892671173737

PyTorch: Tensors

In [6]:
import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Seed PyTorch's random number generator before drawing any data so that a
# fresh "Restart Kernel -> Run All" reproduces the same inputs, the same
# initial weights and therefore the same printed losses.
SEED = 0
torch.manual_seed(SEED)

# N is batch size; D_in is input dimension
# H is hidden dimension; D_out is output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y (linear -> ReLU -> linear)
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Compute loss as a Python float and print it every 100th iteration
    loss = (y_pred - y).pow(2).sum().item()
    if t % 100 == 99:
        print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    # ReLU only lets gradient through where its input was not negative
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

99 133.73150634765625
199 0.2830929756164551
299 0.0011858056532219052
399 6.19356069364585e-05
499 1.8011107385973446e-05


Autograd

In [7]:
import torch
dtype = torch.float
device = torch.device("cpu")

# device=torch.device("cuda:0") # Uncomment this to run on GPU
# torch.backends.cuda.matmul.allow_tf32 = False # Uncomment this to run on GPU
#
# The line above disables TensorFloat32. This is a feature that allows
# networks to run at a much faster speed while sacrificing precision.
# Although TensorFloat32 works well on most real models, for our toy model
# in this tutorial, the sacrificed precision causes convergence issues.
# For more information, see:
# https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices

# Seed PyTorch's random number generator before drawing any data so that a
# fresh "Restart Kernel -> Run All" reproduces the same inputs, the same
# initial weights and therefore the same printed losses.
SEED = 0
torch.manual_seed(SEED)

# N is batch size; D_in is input dimension
# H is hidden dimension; D_out is output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs
# requires_grad is left at its default (False), so no gradients are computed
# with respect to these Tensors during the backward pass
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights
# Setting requires_grad=True indicates that we want to compute gradients
# with respect to these Tensors during the backward pass
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear, written as one expression because
    # autograd records the intermediate values needed for the backward pass
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Sum of squared errors; kept as a Tensor so backward() can be called on it
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    # Populate w1.grad and w2.grad with d(loss)/d(w1) and d(loss)/d(w2)
    loss.backward()

    # The weight update itself must not be tracked by autograd
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # backward() accumulates into .grad, so reset before the next iteration
        w1.grad.zero_()
        w2.grad.zero_()

99 774.7705688476562
199 4.301462173461914
299 0.031760912388563156
399 0.0004948644782416523
499 6.213696906343102e-05
