https://pytorch.org/tutorials/beginner/pytorch_with_examples.html

## via numpy

In [1]:
import numpy as np

In [2]:
# N is batch size; D_in is input dimension (number columns in data set);
# H is hidden dimension (number neurons in hidden layer); D_out is output dimension (1 for regression)
N, D_in, H, D_out = 64, 20, 32, 1

# Create random input and output data
train_X = np.random.randn(N, D_in)
train_y = np.random.randn(N, D_out)

In [3]:
# Randomly initialize weights
w1 = np.random.randn(D_in, H) 
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6

In [4]:
#for t in range(500):
# Forward pass: compute predicted y
h = train_X.dot(w1)

In [5]:
h_relu = np.maximum(h, 0) # apply activation function

In [6]:
y_pred = h_relu.dot(w2)

In [7]:
# Compute and print loss
loss = np.square(y_pred - train_y).sum()
print(loss) # loss is sum of squared errors

19485.285725282018


In [8]:
# Backprop to compute gradients of w1 and w2 with respect to loss
grad_y_pred = 2.0 * (y_pred - train_y) # derivative of x^2 = 2*x

https://www.youtube.com/watch?v=tIeHLnjs5U8

In [9]:
grad_w2 = h_relu.T.dot(grad_y_pred) 
# per link, dCost/dw2 = grad_y_pred * 1 {the derivative of relu} * result of previous layer
# dot product sums over all observations in data set

In [10]:
grad_h_relu = grad_y_pred.dot(w2.T)
grad_h = grad_h_relu.copy()
grad_h[h < 0] = 0 # recall from forward pass which results were zeroed out
grad_w1 = train_X.T.dot(grad_h)

In [11]:
# Update weights
w1 -= learning_rate * grad_w1
w2 -= learning_rate * grad_w2

In [12]:
learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y
    h = train_X.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)

    # Compute and print loss
    loss = np.square(y_pred - train_y).sum()
    if t % 20 == 0:
        print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - train_y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = train_X.T.dot(grad_h)

    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 19255.19118808303
20 472.1624803091429
40 239.59100564649518
60 148.7971666789395
80 103.352680014046
100 77.06826467807062
120 60.16201383849319
140 48.73859486213098
160 40.57118170721073
180 34.41852545608408
200 29.635680546064506
220 25.846811205353816
240 22.701022269610945
260 19.95323527014228
280 17.667779751774923
300 15.74141370075517
320 14.146656939351837
340 12.77885546757338
360 11.587707732131584
380 10.55560708871268
400 9.67335896431487
420 8.901098924726499
440 8.209228124446325
460 7.583181674795384
480 7.013508421024827


## Numpy 
## add hidden layer

In [13]:
# H1 is first hidden layer size
# H2 is second hidden layer size (number neurons in hidden layer)
# D_out is output dimension (1 for regression)
num_rows, num_columns, H1, H2, D_out = 64, 6, 8, 8, 1

# Create random input and output data
train_X = np.random.randn(num_rows, num_columns)
train_y = np.random.randn(num_rows, D_out)

In [14]:
# Randomly initialize weights
w1 = np.random.randn(num_columns, H1) 
w2 = np.random.randn(H1, H2)
w3 = np.random.randn(H2, D_out)

In [15]:
learning_rate = 1e-4

In [16]:
# Forward pass: compute predicted y
h1 = train_X.dot(w1)
h1_relu = np.maximum(h1, 0) # apply activation function

h2 = h1_relu.dot(w2)
h2_relu = np.maximum(h2, 0)

y_pred = h2_relu.dot(w3)

In [17]:
# Compute and print loss
loss = np.square(y_pred - train_y).sum()
print(loss)

47437.70612987318


In [18]:
# Backprop to compute gradients of w1 and w2 with respect to loss
grad_y_pred = 2.0 * (y_pred - train_y)

# gradients on w3
grad_w3 = h2_relu.T.dot(grad_y_pred)

# gradients on w2
grad_h2_relu = grad_y_pred.dot(w3.T)
grad_h2 = grad_h2_relu.copy()
grad_h2[h2 < 0] = 0
grad_w2 = h1_relu.T.dot(grad_h2)

# gradients on w1
grad_h1_relu = grad_h2.dot(w2.T)
grad_h1 = grad_h1_relu.copy()
grad_h1[h1 < 0] = 0
grad_w1 = train_X.T.dot(grad_h1)

In [19]:
# Update weights
w1 -= learning_rate * grad_w1
w2 -= learning_rate * grad_w2
w3 -= learning_rate * grad_w3

In [20]:
learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y
    h1 = train_X.dot(w1)
    h1_relu = np.maximum(h1, 0) # apply activation function

    h2 = h1_relu.dot(w2)
    h2_relu = np.maximum(h2, 0)

    y_pred = h2_relu.dot(w3)

    # Compute and print loss
    loss = np.square(y_pred - train_y).sum()
    if t % 20 == 0:
        print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - train_y)

    # gradients on w3
    grad_w3 = h2_relu.T.dot(grad_y_pred)

    # gradients on w2
    grad_h2_relu = grad_y_pred.dot(w3.T)
    grad_h2 = grad_h2_relu.copy()
    grad_h2[h2 < 0] = 0
    grad_w2 = h1_relu.T.dot(grad_h2)

    # gradients on w1
    grad_h1_relu = grad_h2.dot(w2.T)
    grad_h1 = grad_h1_relu.copy()
    grad_h1[h1 < 0] = 0
    grad_w1 = train_X.T.dot(grad_h1)

    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
    w3 -= learning_rate * grad_w3

0 5734.375059058963
20 151.65888191004706
40 90.82856778087958
60 72.84758759896233
80 63.626916673785026
100 58.27132392860417
120 54.663650899448676
140 52.01462678429429
160 49.82332526724022
180 47.85373371973661
200 46.12789307722356
220 44.7055881761286
240 43.68622642673291
260 42.865341145861166
280 42.20435505875182
300 41.64374119992173
320 41.16982347765008
340 40.78530822934714
360 40.444327356761136
380 40.10547650172527
400 39.79937576889613
420 39.5147586950423
440 39.25162263915564
460 39.00164972635194
480 38.75741056580283


## Numpy
## bias, (3 layers)

In [21]:
# H1 is first hidden layer size
# H2 is second hidden layer size (number neurons in hidden layer)
# D_out is output dimension (1 for regression)
num_rows, num_columns, H1, H2, D_out = 64, 6, 8, 8, 1

# Create random input and output data
train_X = np.random.randn(num_rows, num_columns)
train_y = np.random.randn(num_rows, D_out)

In [22]:
# Randomly initialize weights
w1 = np.random.randn(num_columns, H1)
b1 = np.random.randn(H1)
w2 = np.random.randn(H1, H2)
b2 = np.random.randn(H2)
w3 = np.random.randn(H2, D_out)
b3 = np.random.randn(D_out)

In [23]:
# Forward pass: compute predicted y
h1 = train_X.dot(w1) + b1
h1_relu = np.maximum(h1, 0)

h2 = h1_relu.dot(w2) + b2
h2_relu = np.maximum(h2, 0)

y_pred = h2_relu.dot(w3) + b3

In [24]:
# Backprop to compute gradients of w1 and w2 with respect to loss
grad_y_pred = 2.0 * (y_pred - train_y)

# gradients on w3
grad_w3 = h2_relu.T.dot(grad_y_pred)
grad_b3 = grad_y_pred.T.dot(np.ones((grad_y_pred.shape[0], D_out)))

# gradients on w2
grad_h2_relu = grad_y_pred.dot(w3.T)
grad_h2 = grad_h2_relu.copy()
grad_h2[h2 < 0] = 0
grad_w2 = h1_relu.T.dot(grad_h2)
grad_b2 = grad_h2.T.dot(np.ones((grad_h2.shape[0], 1)))

# gradients on w1
grad_h1_relu = grad_h2.dot(w2.T)
grad_h1 = grad_h1_relu.copy()
grad_h1[h1 < 0] = 0
grad_w1 = train_X.T.dot(grad_h1)
grad_b1 = grad_h1.T.dot(np.ones((grad_h1.shape[0], 1)))

# Update weights
w1 -= learning_rate * grad_w1
w2 -= learning_rate * grad_w2
w3 -= learning_rate * grad_w3
b1 -= learning_rate * grad_b1.reshape(grad_b1.shape[0],)
b2 -= learning_rate * grad_b2.reshape(grad_b2.shape[0],)
b3 -= learning_rate * grad_b3.reshape(grad_b3.shape[0],)

In [25]:
learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y
    h1 = train_X.dot(w1) + b1
    h1_relu = np.maximum(h1, 0)

    h2 = h1_relu.dot(w2) + b2
    h2_relu = np.maximum(h2, 0)

    y_pred = h2_relu.dot(w3) + b3
    
    # Compute and print loss
    loss = np.square(y_pred - train_y).sum()
    if t % 20 == 0:
        print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - train_y)

    # gradients on w3
    grad_w3 = h2_relu.T.dot(grad_y_pred)
    grad_b3 = grad_y_pred.T.dot(np.ones((grad_y_pred.shape[0], D_out)))

    # gradients on w2
    grad_h2_relu = grad_y_pred.dot(w3.T)
    grad_h2 = grad_h2_relu.copy()
    grad_h2[h2 < 0] = 0
    grad_w2 = h1_relu.T.dot(grad_h2)
    grad_b2 = grad_h2.T.dot(np.ones((grad_h2.shape[0], 1)))

    # gradients on w1
    grad_h1_relu = grad_h2.dot(w2.T)
    grad_h1 = grad_h1_relu.copy()
    grad_h1[h1 < 0] = 0
    grad_w1 = train_X.T.dot(grad_h1)
    grad_b1 = grad_h1.T.dot(np.ones((grad_h1.shape[0], 1)))

    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
    w3 -= learning_rate * grad_w3
    b1 -= learning_rate * grad_b1.reshape(grad_b1.shape[0],)
    b2 -= learning_rate * grad_b2.reshape(grad_b2.shape[0],)
    b3 -= learning_rate * grad_b3.reshape(grad_b3.shape[0],)

0 882.3426905838278
20 57.367082511837964
40 53.047094969196976
60 51.49456781339602
80 50.462584685652054
100 49.75402166291342
120 49.219096746397796
140 48.8039078585646
160 48.4794870525936
180 48.220755635755
200 48.00732757023129
220 47.82895409507565
240 47.67699169312294
260 47.54551895863878
280 47.43034892702598
300 47.3282324445873
320 47.23677985073668
340 47.15556042700672
360 47.08243308076378
380 47.0154114212327
400 46.953761147429745
420 46.89799871193284
440 46.84612585075281
460 46.79793265280642
480 46.75249319906141


## Numpy
## sigmoid activation
https://math.stackexchange.com/questions/78575/derivative-of-sigmoid-function-sigma-x-frac11e-x

In [26]:
# H1 is first hidden layer size
# H2 is second hidden layer size (number neurons in hidden layer)
# D_out is output dimension (1 for regression)
num_rows, num_columns, H1, H2, D_out = 64, 6, 8, 8, 1

# Create random input and output data
train_X = np.random.randn(num_rows, num_columns)
train_y = np.random.randn(num_rows, D_out)

In [27]:
# Randomly initialize weights
w1 = np.random.randn(num_columns, H1)
b1 = np.random.randn(H1)
w2 = np.random.randn(H1, H2)
b2 = np.random.randn(H2)
w3 = np.random.randn(H2, D_out)
b3 = np.random.randn(D_out)

In [28]:
learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y
    h1 = train_X.dot(w1) + b1
    h1_sigm = 1 / (1 + np.exp(-1 * h1))
    

    h2 = h1_sigm.dot(w2) + b2
    h2_sigm = 1 / (1 + np.exp(-1 * h1))

    y_pred = h2_sigm.dot(w3) + b3
    
    # Compute and print loss
    loss = np.square(y_pred - train_y).sum()
    if t % 20 == 0:
        print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - train_y)

    # gradients on w3
    grad_w3 = h2_sigm.T.dot(grad_y_pred)
    grad_b3 = grad_y_pred.T.dot(np.ones((grad_y_pred.shape[0], D_out)))

    # gradients on w2
    grad_h2_sigm = grad_y_pred.dot(w3.T)
    grad_h2 = grad_h2_sigm * h2_sigm * (1 - h2_sigm)
    grad_w2 = h1_sigm.T.dot(grad_h2)
    grad_b2 = grad_h2.T.dot(np.ones((grad_h2.shape[0], 1)))

    # gradients on w1
    grad_h1_sigm = grad_h2.dot(w2.T)
    grad_h1 = grad_h1_sigm * h1_sigm * (1 - h1_sigm)
    grad_w1 = train_X.T.dot(grad_h1)
    grad_b1 = grad_h1.T.dot(np.ones((grad_h1.shape[0], 1)))

    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
    w3 -= learning_rate * grad_w3
    b1 -= learning_rate * grad_b1.reshape(grad_b1.shape[0],)
    b2 -= learning_rate * grad_b2.reshape(grad_b2.shape[0],)
    b3 -= learning_rate * grad_b3.reshape(grad_b3.shape[0],)

0 649.0950071597233
20 256.8272289932953
40 159.67962001846217
60 129.89481623398328
80 116.40652001153944
100 107.7124135727588
120 101.07759970797775
140 95.72474966861347
160 91.3354236559858
180 87.71753756112318
200 84.72839281011784
220 82.25412890689996
240 80.20207818737434
260 78.49645699200121
280 77.07523281852238
300 75.88762768167479
320 74.8920790532314
340 74.05456650929935
360 73.34723874900314
380 72.74728839137535
400 72.2360309083358
420 71.79815136749068
440 71.42108884281276
460 71.0945335796994
480 70.81001639201845
