https://pytorch.org/tutorials/beginner/pytorch_with_examples.html

## via numpy

In [1]:
import numpy as np

In [2]:
# N is batch size; D_in is input dimension (number columns in data set);
# H is hidden dimension (number neurons in hidden layer); D_out is output dimension (1 for regression)
N, D_in, H, D_out = 64, 20, 32, 1

# Create random input and output data
train_X = np.random.randn(N, D_in)
train_y = np.random.randn(N, D_out)

In [3]:
# Randomly initialize weights
w1 = np.random.randn(D_in, H) 
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6

In [4]:
#for t in range(500):
# Forward pass: compute predicted y
h = train_X.dot(w1)

In [5]:
h_relu = np.maximum(h, 0) # apply activation function

In [6]:
y_pred = h_relu.dot(w2)

In [7]:
# Compute and print loss
loss = np.square(y_pred - train_y).sum()
print(loss) # loss is sum of squared errors

27267.166583942602


In [8]:
# Backprop to compute gradients of w1 and w2 with respect to loss
grad_y_pred = 2.0 * (y_pred - train_y) # derivative of x^2 = 2*x

https://www.youtube.com/watch?v=tIeHLnjs5U8

In [9]:
grad_w2 = h_relu.T.dot(grad_y_pred) 
# per link, dCost/dw2 = grad_y_pred * 1 {the derivative of relu} * result of previous layer
# dot product sums over all observations in data set

In [10]:
grad_h_relu = grad_y_pred.dot(w2.T)
grad_h = grad_h_relu.copy()
grad_h[h < 0] = 0 # recall from forward pass which results were zeroed out
grad_w1 = train_X.T.dot(grad_h)

In [11]:
# Update weights
w1 -= learning_rate * grad_w1
w2 -= learning_rate * grad_w2

In [12]:
learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y
    h = train_X.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)

    # Compute and print loss
    loss = np.square(y_pred - train_y).sum()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - train_y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = train_X.T.dot(grad_h)

    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 26879.553559643693
1 9686.489363346154
2 7134.277094201403
3 6300.597973708073
4 5830.1294623991425
5 5459.616358428724
6 5091.407536860791
7 4696.509220665534
8 4278.625119776314
9 3844.881532540663
10 3415.537400880732
11 2995.76317633459
12 2606.009590475641
13 2250.7760250556394
14 1934.428537683766
15 1659.5577058518318
16 1422.9602212060354
17 1222.4825143429875
18 1053.5210657741588
19 912.5355459736029
20 795.0788871213431
21 697.8985948174943
22 617.3170390639535
23 550.6602194644934
24 494.9225772709981
25 448.49428868559687
26 409.4830691165106
27 376.539369252628
28 348.4304243429148
29 324.3564258064422
30 303.6386580527181
31 285.6400501417663
32 269.87935377389414
33 255.9450667507933
34 243.44344713914745
35 232.2065218119222
36 222.01883751439138
37 212.7342021874054
38 204.2234435382552
39 196.3763843852099
40 189.10303796175015
41 182.34415886965286
42 176.02113826316045
43 170.09036118374598
44 164.51113875815207
45 159.2421432613081
46 154.24997058870719
47 149.5

## Numpy 
## add hidden layer

In [13]:
# H1 is first hidden layer size
# H2 is second hidden layer size (number neurons in hidden layer)
# D_out is output dimension (1 for regression)
num_rows, num_columns, H1, H2, D_out = 64, 6, 8, 8, 1

# Create random input and output data
train_X = np.random.randn(num_rows, num_columns)
train_y = np.random.randn(num_rows, D_out)

In [14]:
# Randomly initialize weights
w1 = np.random.randn(num_columns, H1) 
w2 = np.random.randn(H1, H2)
w3 = np.random.randn(H2, D_out)

In [15]:
learning_rate = 1e-4

In [16]:
# Forward pass: compute predicted y
h1 = train_X.dot(w1)
h1_relu = np.maximum(h1, 0) # apply activation function

h2 = h1_relu.dot(w2)
h2_relu = np.maximum(h2, 0)

y_pred = h2_relu.dot(w3)

In [17]:
# Compute and print loss
loss = np.square(y_pred - train_y).sum()
print(loss)

415.4068683305306


In [18]:
# Backprop to compute gradients of w1 and w2 with respect to loss
grad_y_pred = 2.0 * (y_pred - train_y)

# gradients on w3
grad_w3 = h2_relu.T.dot(grad_y_pred)

# gradients on w2
grad_h2_relu = grad_y_pred.dot(w3.T)
grad_h2 = grad_h2_relu.copy()
grad_h2[h2 < 0] = 0
grad_w2 = h1_relu.T.dot(grad_h2)

# gradients on w1
grad_h1_relu = grad_h2.dot(w2.T)
grad_h1 = grad_h1_relu.copy()
grad_h1[h1 < 0] = 0
grad_w1 = train_X.T.dot(grad_h1)

In [19]:
# Update weights
w1 -= learning_rate * grad_w1
w2 -= learning_rate * grad_w2
w3 -= learning_rate * grad_w3

In [20]:
learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y
    h1 = train_X.dot(w1)
    h1_relu = np.maximum(h1, 0) # apply activation function

    h2 = h1_relu.dot(w2)
    h2_relu = np.maximum(h2, 0)

    y_pred = h2_relu.dot(w3)

    # Compute and print loss
    loss = np.square(y_pred - train_y).sum()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - train_y)

    # gradients on w3
    grad_w3 = h2_relu.T.dot(grad_y_pred)

    # gradients on w2
    grad_h2_relu = grad_y_pred.dot(w3.T)
    grad_h2 = grad_h2_relu.copy()
    grad_h2[h2 < 0] = 0
    grad_w2 = h1_relu.T.dot(grad_h2)

    # gradients on w1
    grad_h1_relu = grad_h2.dot(w2.T)
    grad_h1 = grad_h1_relu.copy()
    grad_h1[h1 < 0] = 0
    grad_w1 = train_X.T.dot(grad_h1)

    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
    w3 -= learning_rate * grad_w3

0 223.76432649903296
1 183.36191929631218
2 164.30202272092214
3 153.45595947465824
4 146.22280849834675
5 140.66524030437944
6 136.1219532470695
7 132.2766107560008
8 128.86518306792797
9 125.72325222066736
10 122.78490185120677
11 120.0930848950215
12 117.5752540440181
13 115.22691164273972
14 113.03273704808656
15 110.97845941283052
16 109.06775924487431
17 107.25250520758874
18 105.53824065863492
19 103.9185705525553
20 102.38533355545923
21 100.93152605013121
22 99.55039668423179
23 98.236270862764
24 96.98940809457329
25 95.79811164676573
26 94.65166500272868
27 93.5552338916348
28 92.50540759322787
29 91.49911171908121
30 90.53354261216067
31 89.60612454459411
32 88.71451276275073
33 87.85655687283932
34 87.03028016170862
35 86.23386165254101
36 85.46562044956852
37 84.72400201952408
38 84.00756612386252
39 83.31497616577981
40 82.6449897552826
41 81.99645032708949
42 81.36827967188779
43 80.75947126268211
44 80.16908427557334
45 79.59623821898035
46 79.0401080976039
47 78.49992

394 40.57720644634869
395 40.54103958544016
396 40.50868345976825
397 40.47680269664943
398 40.44079501966175
399 40.410235412761
400 40.3760868039299
401 40.34124453366613
402 40.30966543193034
403 40.277788528797664
404 40.24349329318318
405 40.211443289357334
406 40.17850992999544
407 40.145095612389014
408 40.11367444406264
409 40.08038709163411
410 40.04747185687293
411 40.01551689927407
412 39.983654695145
413 39.95132835124266
414 39.91943756727694
415 39.88662768550079
416 39.85394150499065
417 39.82338222848681
418 39.79099298053745
419 39.75768587784494
420 39.72798302176388
421 39.69632691710025
422 39.66331039067331
423 39.63125195750487
424 39.60126262207139
425 39.568775467346924
426 39.536531471143206
427 39.506188150482046
428 39.474262621751265
429 39.44163603464442
430 39.41249906427962
431 39.38176884046789
432 39.349264819400766
433 39.31896869015481
434 39.289257708627886
435 39.25754802247131
436 39.2271016408327
437 39.19626680791259
438 39.16582646790616
439 39.

## Numpy
## bias, (3 layers)

In [21]:
# H1 is first hidden layer size
# H2 is second hidden layer size (number neurons in hidden layer)
# D_out is output dimension (1 for regression)
num_rows, num_columns, H1, H2, D_out = 64, 6, 8, 8, 1

# Create random input and output data
train_X = np.random.randn(num_rows, num_columns)
train_y = np.random.randn(num_rows, D_out)

In [22]:
# Randomly initialize weights
w1 = np.random.randn(num_columns, H1)
b1 = np.random.randn(H1)
w2 = np.random.randn(H1, H2)
b2 = np.random.randn(H2)
w3 = np.random.randn(H2, D_out)
b3 = np.random.randn(D_out)

In [23]:
# Forward pass: compute predicted y
h1 = train_X.dot(w1) + b1
h1_relu = np.maximum(h1, 0)

h2 = h1_relu.dot(w2) + b2
h2_relu = np.maximum(h2, 0)

y_pred = h2_relu.dot(w3) + b3

In [24]:
# Backprop to compute gradients of w1 and w2 with respect to loss
grad_y_pred = 2.0 * (y_pred - train_y)

# gradients on w3
grad_w3 = h2_relu.T.dot(grad_y_pred)
grad_b3 = grad_y_pred.T.dot(np.ones((grad_y_pred.shape[0], D_out)))

# gradients on w2
grad_h2_relu = grad_y_pred.dot(w3.T)
grad_h2 = grad_h2_relu.copy()
grad_h2[h2 < 0] = 0
grad_w2 = h1_relu.T.dot(grad_h2)
grad_b2 = grad_h2.T.dot(np.ones((grad_h2.shape[0], 1)))

# gradients on w1
grad_h1_relu = grad_h2.dot(w2.T)
grad_h1 = grad_h1_relu.copy()
grad_h1[h1 < 0] = 0
grad_w1 = train_X.T.dot(grad_h1)
grad_b1 = grad_h1.T.dot(np.ones((grad_h1.shape[0], 1)))

# Update weights
w1 -= learning_rate * grad_w1
w2 -= learning_rate * grad_w2
w3 -= learning_rate * grad_w3
b1 -= learning_rate * grad_b1.reshape(grad_b1.shape[0],)
b2 -= learning_rate * grad_b2.reshape(grad_b2.shape[0],)
b3 -= learning_rate * grad_b3.reshape(grad_b3.shape[0],)

In [25]:
learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y
    h1 = train_X.dot(w1) + b1
    h1_relu = np.maximum(h1, 0)

    h2 = h1_relu.dot(w2) + b2
    h2_relu = np.maximum(h2, 0)

    y_pred = h2_relu.dot(w3) + b3
    
    # Compute and print loss
    loss = np.square(y_pred - train_y).sum()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - train_y)

    # gradients on w3
    grad_w3 = h2_relu.T.dot(grad_y_pred)
    grad_b3 = grad_y_pred.T.dot(np.ones((grad_y_pred.shape[0], D_out)))

    # gradients on w2
    grad_h2_relu = grad_y_pred.dot(w3.T)
    grad_h2 = grad_h2_relu.copy()
    grad_h2[h2 < 0] = 0
    grad_w2 = h1_relu.T.dot(grad_h2)
    grad_b2 = grad_h2.T.dot(np.ones((grad_h2.shape[0], 1)))

    # gradients on w1
    grad_h1_relu = grad_h2.dot(w2.T)
    grad_h1 = grad_h1_relu.copy()
    grad_h1[h1 < 0] = 0
    grad_w1 = train_X.T.dot(grad_h1)
    grad_b1 = grad_h1.T.dot(np.ones((grad_h1.shape[0], 1)))

    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
    w3 -= learning_rate * grad_w3
    b1 -= learning_rate * grad_b1.reshape(grad_b1.shape[0],)
    b2 -= learning_rate * grad_b2.reshape(grad_b2.shape[0],)
    b3 -= learning_rate * grad_b3.reshape(grad_b3.shape[0],)

0 755.4653338851945
1 434.10708543851166
2 292.3235921023497
3 219.31954143696905
4 179.54524123934252
5 155.83985415029758
6 140.72271272225282
7 130.63490566069822
8 123.52052826900731
9 118.16867541810971
10 113.94865529457192
11 110.47795024397345
12 107.53463948043017
13 104.98091827279276
14 102.73986432505792
15 100.73703091007384
16 98.87349637696789
17 97.17191921117328
18 95.61159351228088
19 94.17595296190716
20 92.84556067153224
21 91.60215391437657
22 90.45005649856114
23 89.43155591109581
24 88.5271622945487
25 87.68247730604043
26 86.89237684062797
27 86.15226264234624
28 85.46354591450239
29 84.82212735678199
30 84.21914471505093
31 83.65154958142642
32 83.11657858938499
33 82.61710581178264
34 82.17816871483242
35 81.76341116038735
36 81.37103546696376
37 80.99557515139082
38 80.63122983905171
39 80.28545689044924
40 79.95695881801069
41 79.64454552102998
42 79.34712407326765
43 79.06368961968809
44 78.79331724339066
45 78.53515468431603
46 78.28841580814493
47 78.0523

## Numpy
## sigmoid activation
https://math.stackexchange.com/questions/78575/derivative-of-sigmoid-function-sigma-x-frac11e-x

In [26]:
# H1 is first hidden layer size
# H2 is second hidden layer size (number neurons in hidden layer)
# D_out is output dimension (1 for regression)
num_rows, num_columns, H1, H2, D_out = 64, 6, 8, 8, 1

# Create random input and output data
train_X = np.random.randn(num_rows, num_columns)
train_y = np.random.randn(num_rows, D_out)

In [27]:
# Randomly initialize weights
w1 = np.random.randn(num_columns, H1)
b1 = np.random.randn(H1)
w2 = np.random.randn(H1, H2)
b2 = np.random.randn(H2)
w3 = np.random.randn(H2, D_out)
b3 = np.random.randn(D_out)

In [28]:
learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y
    h1 = train_X.dot(w1) + b1
    h1_sigm = 1 / (1 + np.exp(-1 * h1))
    

    h2 = h1_sigm.dot(w2) + b2
    h2_sigm = 1 / (1 + np.exp(-1 * h1))

    y_pred = h2_sigm.dot(w3) + b3
    
    # Compute and print loss
    loss = np.square(y_pred - train_y).sum()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - train_y)

    # gradients on w3
    grad_w3 = h2_sigm.T.dot(grad_y_pred)
    grad_b3 = grad_y_pred.T.dot(np.ones((grad_y_pred.shape[0], D_out)))

    # gradients on w2
    grad_h2_sigm = grad_y_pred.dot(w3.T)
    grad_h2 = grad_h2_sigm * h2_sigm * (1 - h2_sigm)
    grad_w2 = h1_sigm.T.dot(grad_h2)
    grad_b2 = grad_h2.T.dot(np.ones((grad_h2.shape[0], 1)))

    # gradients on w1
    grad_h1_sigm = grad_h2.dot(w2.T)
    grad_h1 = grad_h1_sigm * h1_sigm * (1 - h1_sigm)
    grad_w1 = train_X.T.dot(grad_h1)
    grad_b1 = grad_h1.T.dot(np.ones((grad_h1.shape[0], 1)))

    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
    w3 -= learning_rate * grad_w3
    b1 -= learning_rate * grad_b1.reshape(grad_b1.shape[0],)
    b2 -= learning_rate * grad_b2.reshape(grad_b2.shape[0],)
    b3 -= learning_rate * grad_b3.reshape(grad_b3.shape[0],)

0 860.0267744323311
1 791.9521853093061
2 729.9261790140525
3 673.4067373422655
4 621.9013217207298
5 574.9621579059683
6 532.182012205438
7 493.19039938448867
8 457.65017133064816
9 425.25444285924954
10 395.72381708552945
11 368.8038778115043
12 344.2629205797451
13 321.8898975868469
14 301.492554650328
15 282.8957409817857
16 265.93987471433644
17 250.47954902608248
18 236.3822653437412
19 223.5272815421586
20 211.80456430913998
21 201.11383594771945
22 191.36370686172637
23 182.47088583349324
24 174.35946096991182
25 166.96024487740553
26 160.21017823835987
27 154.05178651010218
28 148.43268496026514
29 143.30512769582265
30 138.62559674284483
31 134.35442759492008
32 130.45546797443686
33 126.89576684616479
34 123.64529099004369
35 120.6766666825968
36 117.96494425643343
37 115.48738350710775
38 113.2232580981201
39 111.15367727984372
40 109.26142338820114
41 107.53080372540715
42 105.94751554930468
43 104.49852301088198
44 103.17194498250059
45 101.95695281310844
46 100.843677132