In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt

# Using numpy MLP

In [None]:
N, D_in, H, D_out = 64, 1000, 100, 10

In [None]:
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

In [None]:
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

In [None]:
learning_rate = 1e-6

In [None]:
for t in range(500):
    h = x.dot(w1)
    h_relu = np.maximum(0, h)
    y_pred = h_relu.dot(w2)
    
    loss = np.square(y_pred-y).sum()
    if t % 25 == 0:
        print(t, loss)
    
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = np.where(grad_h_relu > 0, grad_h_relu, 0)
    grad_w1 = x.T.dot(grad_h)
    
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

In [None]:
h = x.dot(w1)
h_relu = np.maximum(0, h)
y_pred = h_relu.dot(w2)
np.mean(y_pred - y < 1e-2)

# Torch version

In [25]:
N, D_in, H, D_out = 64, 1000, 100, 10

In [26]:
gpu_device = torch.device('cuda')

In [27]:
x = torch.randn(N, D_in, device=gpu_device, dtype=torch.float)
y = torch.randn(N, D_out, device=gpu_device, dtype=torch.float)

In [28]:
w1 = torch.randn(D_in, H, device=gpu_device, dtype=torch.float, requires_grad=True)
w2 = torch.randn(H, D_out, device=gpu_device, dtype=torch.float, requires_grad=True)

In [15]:
learning_rate = 1e-6
for t in range(500):
    h = x.mm(w1)
    h_relu = torch.clamp(h, min=0.0)
    y_pred = h_relu.mm(w2)
    
    loss = (y_pred - y).pow(2).sum().item()
    if t % 25 == 0:
        print(t, loss)
    
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = torch.where(grad_h_relu > 0, grad_h_relu, torch.tensor(0.0, device=gpu_device))
    grad_w1 = x.t().mm(grad_h)
    
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 2.504310131072998
25 1.6382091045379639
50 1.079041600227356
75 0.7139986753463745
100 0.47440844774246216
125 0.3161752223968506
150 0.211361363530159
175 0.14173568785190582
200 0.09526404738426208
225 0.06417029350996017
250 0.04329650476574898
275 0.029283910989761353
300 0.019856596365571022
325 0.013500294648110867
350 0.009207150898873806
375 0.0063055590726435184
400 0.00435779569670558
425 0.0030372655019164085
450 0.002139214426279068
475 0.0015211035497486591


In [30]:
h = x.mm(w1)
h_relu = torch.clamp(h, min=0.0)
y_pred = h_relu.mm(w2)
torch.mean(y_pred - y < 1e-2, dtype=torch.float)

tensor(1., device='cuda:0')

In [29]:
for t in range(500):
    y_pred = x.mm(w1).clamp(min=0.0).mm(w2)
    
    loss = (y_pred - y).pow(2).sum()
    if t % 25 == 0:
        print(t, loss)
    
    loss.backward()
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        w1.grad.zero_()
        w2.grad.zero_()

0 tensor(35170824., device='cuda:0', grad_fn=<SumBackward0>)
25 tensor(154213.0156, device='cuda:0', grad_fn=<SumBackward0>)
50 tensor(18509.9883, device='cuda:0', grad_fn=<SumBackward0>)
75 tensor(3558.3235, device='cuda:0', grad_fn=<SumBackward0>)
100 tensor(849.7177, device='cuda:0', grad_fn=<SumBackward0>)
125 tensor(229.7461, device='cuda:0', grad_fn=<SumBackward0>)
150 tensor(67.4826, device='cuda:0', grad_fn=<SumBackward0>)
175 tensor(21.1058, device='cuda:0', grad_fn=<SumBackward0>)
200 tensor(6.9363, device='cuda:0', grad_fn=<SumBackward0>)
225 tensor(2.3706, device='cuda:0', grad_fn=<SumBackward0>)
250 tensor(0.8367, device='cuda:0', grad_fn=<SumBackward0>)
275 tensor(0.3029, device='cuda:0', grad_fn=<SumBackward0>)
300 tensor(0.1118, device='cuda:0', grad_fn=<SumBackward0>)
325 tensor(0.0420, device='cuda:0', grad_fn=<SumBackward0>)
350 tensor(0.0160, device='cuda:0', grad_fn=<SumBackward0>)
375 tensor(0.0063, device='cuda:0', grad_fn=<SumBackward0>)
400 tensor(0.0026, devic