In [1]:
%matplotlib inline


Warm-up: numpy
--------------

A fully-connected **ReLU network** with one hidden layer and no biases, 

trained to predict y from x using Euclidean error.

This implementation uses numpy to manually compute the forward pass, loss, and
backward pass.

A numpy array is a generic n-dimensional array; it does not know anything about
deep learning or gradients or computational graphs, and is just a way to perform
generic numeric computations.



In [2]:
import numpy as np

# N is batch size; 
# D_in is input dimension;
# H is hidden dimension; 
# D_out is output dimension.

N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialize weights
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6



In [6]:
x.shape

(64, 1000)

In [7]:
y.shape

(64, 10)

In [8]:
w1.shape

(1000, 100)

In [9]:
w2.shape

(100, 10)

In [10]:
for t in range(1):
    # Forward pass: compute predicted y
    h = x.dot(w1)
    print("The Hidden Layer:", h.shape)
    # relu function
    h_relu = np.maximum(h, 0) 
    y_pred = h_relu.dot(w2)

    # Compute and print loss
    # design a loss function
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    # How.
    # Back Forward? or BP
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)

    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

The Hidden Layer: (64, 100)
0 39321473.34467995


## For the Left Looping

In [11]:
for t in range(299):
    # Forward pass: compute predicted y
    h = x.dot(w1)
    print("The Hidden Layer:", h.shape)
    # relu function
    h_relu = np.maximum(h, 0) 
    y_pred = h_relu.dot(w2)

    # Compute and print loss
    # design a loss function
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    # How.
    # Back Forward? or BP
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)

    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

The Hidden Layer: (64, 100)
0 35759938.262073115
The Hidden Layer: (64, 100)
1 30689800.274656665
The Hidden Layer: (64, 100)
2 21935913.535637565
The Hidden Layer: (64, 100)
3 13231310.601608794
The Hidden Layer: (64, 100)
4 7188286.323579557
The Hidden Layer: (64, 100)
5 4004546.442737935
The Hidden Layer: (64, 100)
6 2462180.21517938
The Hidden Layer: (64, 100)
7 1700892.0148032166
The Hidden Layer: (64, 100)
8 1281888.3143455866
The Hidden Layer: (64, 100)
9 1019357.5784821492
The Hidden Layer: (64, 100)
10 835656.3273336892
The Hidden Layer: (64, 100)
11 697437.2716813622
The Hidden Layer: (64, 100)
12 588618.080839402
The Hidden Layer: (64, 100)
13 500891.72488987504
The Hidden Layer: (64, 100)
14 428894.2702448588
The Hidden Layer: (64, 100)
15 369249.22848945926
The Hidden Layer: (64, 100)
16 319305.77039887063
The Hidden Layer: (64, 100)
17 277223.17950520385
The Hidden Layer: (64, 100)
18 241621.51136753068
The Hidden Layer: (64, 100)
19 211305.80296907708
The Hidden Layer: (

The Hidden Layer: (64, 100)
205 0.1460060570999475
The Hidden Layer: (64, 100)
206 0.13708540297803262
The Hidden Layer: (64, 100)
207 0.12871590751568052
The Hidden Layer: (64, 100)
208 0.120868460543514
The Hidden Layer: (64, 100)
209 0.11350564444471448
The Hidden Layer: (64, 100)
210 0.10659632922343161
The Hidden Layer: (64, 100)
211 0.10011371535957038
The Hidden Layer: (64, 100)
212 0.09403016631178793
The Hidden Layer: (64, 100)
213 0.0883236676102162
The Hidden Layer: (64, 100)
214 0.08296664004550552
The Hidden Layer: (64, 100)
215 0.0779391125148069
The Hidden Layer: (64, 100)
216 0.07321938742913253
The Hidden Layer: (64, 100)
217 0.06879101491240927
The Hidden Layer: (64, 100)
218 0.0646327813450837
The Hidden Layer: (64, 100)
219 0.060729510246697524
The Hidden Layer: (64, 100)
220 0.057067382835476244
The Hidden Layer: (64, 100)
221 0.053628804484018626
The Hidden Layer: (64, 100)
222 0.05039896077074102
The Hidden Layer: (64, 100)
223 0.047365861853703256
The Hidden Lay