# 1. NeuralNetwork from scratch - numpy

In [1]:
import numpy as np

In [2]:
# Problem dimensions used by the numpy implementation below.
N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

In [3]:
# Synthetic regression problem: standard-normal inputs X and targets y.
X, y = np.random.randn(N, D_in), np.random.randn(N, D_out)

# Report both shapes, one per line.
print(X.shape, y.shape, sep='\n')

(64, 1000)
(64, 10)


#### Model
- Forward

$$\begin{split}
& h = X \cdot w_1 \\
& \text{h_relu} = relu(h) \\
& y\_ = \text{h_relu} \cdot w_2
\end{split}$$

- Loss function:

$$loss = \sum\limits_{i=1}^n(y_i - \hat{y}_i)^2 = (y - y\_)^2.sum()$$

- Backward

$$\begin{split}
& \text{grad_y_} = \frac{\partial\ loss}{\partial y\_} = -2(y - y\_) = 2(y\_ - y) \\
& \text{grad_w}_2 = \frac{\partial\ loss}{\partial w_2} = \frac{\partial\ y\_}{\partial w_2}\ \frac{\partial\ loss}{\partial y\_} = \text{h_relu}^T \cdot \text{grad_y_} \\
& \text{grad_h_relu} = \frac{\partial\ loss}{\partial \text{h_relu}} = \frac{\partial\ loss}{\partial y\_}\ \frac{\partial\ y\_}{\partial \text{h_relu}} = \text{grad_y_} \cdot w_2^T \\
& \cdots \\
& \text{grad_w}_1 = X^T \cdot \text{grad_h}
\end{split}$$

In [4]:
def relu(x):
    """Element-wise rectified linear unit.

    Returns the element-wise maximum of 0 and ``x``, so negative entries
    become 0 and non-negative entries are returned unchanged.
    """
    rectified = np.maximum(0, x)
    return rectified

In [5]:
%%time
# Randomly initialize params
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)
learning_rate = 1e-6

for epoch in range(1, 501):
    # Forward pass: compute y_
    h = np.dot(X, w1)
    h_relu = relu(h)
    y_ = np.dot(h_relu, w2)

    # Compute and print loss
    loss = np.square(y_ - y).sum()
    if epoch % 100 == 0: print(epoch, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_ = 2.0 * (y_ - y)
    grad_w2 = np.dot(h_relu.T, grad_y_)

    grad_h_relu = np.dot(grad_y_, w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = np.dot(X.T, grad_h)

    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

100 609.6494190370581
200 3.423726164027128
300 0.03603351480458575
400 0.0004912619095656394
500 7.551542582423114e-06
CPU times: user 2.25 s, sys: 25.7 ms, total: 2.27 s
Wall time: 287 ms


# 2. NeuralNetwork from scratch - torch tensor

In [6]:
import torch

# Run the tensor version on the CPU in single precision.
device = torch.device("cpu")
dtype = torch.float32

In [7]:
# Problem dimensions for the torch-tensor implementation.
N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

In [8]:
# Synthetic inputs and targets, drawn from a standard normal on `device`.
X = torch.randn((N, D_in), device=device, dtype=dtype)
y = torch.randn((N, D_out), device=device, dtype=dtype)

# Report both sizes, one per line.
print(X.size(), y.size(), sep='\n')

torch.Size([64, 1000])
torch.Size([64, 10])


In [9]:
%%time
# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6

for epoch in range(1, 501):
    # Forward
    h = X.mm(w1)
    h_relu = h.clamp(min=0)
    y_ = h_relu.mm(w2)
    
    # Compute and print loss
    loss = (y_ - y).pow(2).sum().item()
    if epoch % 100 == 0: print(epoch, loss)
    
    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_ = 2.0 * (y_ - y)
    grad_w2 = h_relu.t().mm(grad_y_)

    grad_h_relu = grad_y_.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = X.t().mm(grad_h)
    
    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

100 971.07470703125
200 9.878978729248047
300 0.16335122287273407
400 0.0035316296853125095
500 0.00021612788259517401
CPU times: user 1.92 s, sys: 13.9 ms, total: 1.94 s
Wall time: 243 ms


# 3. NeuralNetwork from scratch - torch Autograd

In [10]:
import torch

# Use the GPU when CUDA is available; otherwise fall back to the CPU so this
# section still runs on machines without a CUDA device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dtype = torch.float

In [11]:
# Problem dimensions for the autograd implementation.
N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

In [12]:
# Synthetic inputs and targets, drawn from a standard normal on `device`.
X = torch.randn((N, D_in), device=device, dtype=dtype)
y = torch.randn((N, D_out), device=device, dtype=dtype)

# Report both sizes, one per line.
print(X.size(), y.size(), sep='\n')

torch.Size([64, 1000])
torch.Size([64, 10])


In [13]:
%%time
# Randomly initialize weights
# requires_grad=True indicates that we want to compute gradients
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6

for epoch in range(1, 501):
    # Forward
    h = X.mm(w1)
    h_relu = h.clamp(min=0)
    y_ = h_relu.mm(w2)
    
    # Compute and print loss
    loss = (y_ - y).pow(2).sum()
    if epoch % 100 == 0: print(epoch, loss.item())
    
    # Backprop to compute gradients of w1 and w2 with respect to loss
    loss.backward()

    # Update weights
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        
        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()

100 742.392578125
200 5.4352569580078125
300 0.060981035232543945
400 0.001105283503420651
500 0.00011072327470174059
CPU times: user 235 ms, sys: 36.1 ms, total: 271 ms
Wall time: 271 ms


# 4. NeuralNetwork with Pytorch nn

In [14]:
import torch

# Run the torch.nn version on the CPU in single precision.
device = torch.device("cpu")
dtype = torch.float32

In [15]:
# Problem dimensions for the torch.nn implementation.
N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

In [16]:
# Synthetic inputs and targets, drawn from a standard normal on `device`.
X = torch.randn((N, D_in), device=device, dtype=dtype)
y = torch.randn((N, D_out), device=device, dtype=dtype)

# Report both sizes, one per line.
print(X.size(), y.size(), sep='\n')

torch.Size([64, 1000])
torch.Size([64, 10])


In [17]:
%%time
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')
learning_rate = 1e-6

for epoch in range(1, 501):
    # Forward
    y_ = model(X)
    
    # Compute and print loss
    loss = loss_fn(y_, y)
    if epoch % 100 == 0: print(epoch, loss.item())
    
    # Backprop to compute gradients of w1 and w2 with respect to loss
    model.zero_grad()
    loss.backward()

    # Update weights
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

100 643.0660400390625
200 597.5169677734375
300 557.6885986328125
400 522.0635986328125
500 490.06805419921875
CPU times: user 2.42 s, sys: 14 ms, total: 2.43 s
Wall time: 305 ms


#### With PyTorch: optim

In [18]:
%%time
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')
learning_rate = 1e-6
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(1, 501):
    # Forward
    y_ = model(X)
    
    # Compute and print loss
    loss = loss_fn(y_, y)
    if epoch % 100 == 0: print(epoch, loss.item())
    
    # Backprop to compute gradients of w1 and w2 with respect to loss
    optimizer.zero_grad()
    loss.backward()

    # Update weights
    optimizer.step()

100 664.26708984375
200 647.3447265625
300 630.9163818359375
400 615.0795288085938
500 599.7923583984375
CPU times: user 2.82 s, sys: 30.1 ms, total: 2.85 s
Wall time: 356 ms


# 5. NeuralNetwork with Custom nn

In [19]:
import torch

# Run the custom-module version on the CPU in single precision.
device = torch.device("cpu")
dtype = torch.float32

In [20]:
# Problem dimensions for the custom-module implementation.
N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

In [21]:
# Synthetic inputs and targets, drawn from a standard normal on `device`.
X = torch.randn((N, D_in), device=device, dtype=dtype)
y = torch.randn((N, D_out), device=device, dtype=dtype)

# Report both sizes, one per line.
print(X.size(), y.size(), sep='\n')

torch.Size([64, 1000])
torch.Size([64, 10])


In [22]:
class TwoLayerNet(torch.nn.Module):
    """Fully connected network: linear layer, clamp at zero, linear layer."""

    def __init__(self, D_in, H, D_out):
        """Create the two linear layers.

        D_in: number of input features
        H: width of the hidden layer
        D_out: number of output features
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, X):
        """Return the network output for the input batch ``X``."""
        # Clamping at zero is the ReLU non-linearity.
        hidden = torch.clamp(self.linear1(X), min=0)
        return self.linear2(hidden)

In [23]:
%%time
model = TwoLayerNet(D_in, H, D_out)
loss_fn = torch.nn.MSELoss(reduction='sum')
learning_rate = 1e-6
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(1, 501):
    # Forward
    y_ = model(X)
    
    # Compute and print loss
    loss = loss_fn(y_, y)
    if epoch % 100 == 0: print(epoch, loss.item())
    
    # Backprop to compute gradients of w1 and w2 with respect to loss
    optimizer.zero_grad()
    loss.backward()

    # Update weights
    optimizer.step()

100 586.2449340820312
200 570.69189453125
300 555.6212768554688
400 541.0654907226562
500 527.0009765625
CPU times: user 2.6 s, sys: 31.5 ms, total: 2.64 s
Wall time: 330 ms
