In [3]:
# NN with Numpy

import numpy as np

# Problem dimensions: N examples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and regression targets, drawn from a standard normal.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Both weight matrices start from standard-normal values.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear layer -> ReLU -> linear layer.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squares loss, reported on every iteration.
    residual = y_pred - y
    loss = np.sum(np.square(residual))
    print(t, loss)

    # Backward pass: chain rule applied by hand, from the output back to w1.
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # The ReLU blocks the gradient wherever its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Gradient-descent step, applied to the weights in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
    

0 36295949.2105
1 35949409.1133
2 37821193.7009
3 34800825.3175
4 25708421.7846
5 14825214.6916
6 7483144.02059
7 3811947.37234
8 2228436.87343
9 1517821.38404
10 1153424.03744
11 931359.065799
12 775978.076897
13 657533.605674
14 562862.871976
15 485568.085454
16 421365.190243
17 367472.979051
18 321894.574938
19 283083.199634
20 249857.386181
21 221283.646122
22 196588.019951
23 175166.373118
24 156521.288343
25 140221.856657
26 125922.801482
27 113360.213999
28 102277.024628
29 92463.6676353
30 83753.7179485
31 76007.4412911
32 69102.5924735
33 62934.3746826
34 57406.3100303
35 52449.0871653
36 47988.9102351
37 43969.7332458
38 40341.8714374
39 37063.2328119
40 34091.4144703
41 31394.0758713
42 28942.1920559
43 26708.6835671
44 24672.7812725
45 22813.8802747
46 21114.7480361
47 19559.7053645
48 18135.6821715
49 16828.5758107
50 15628.0591858
51 14524.7979613
52 13511.3404322
53 12577.3627297
54 11716.0309222
55 10921.454953
56 10187.6661952
57 9509.02362537
58 8880.7732518
59 8298.9

In [12]:
import torch

# Tensor type that every tensor below is converted to.
dtype = torch.FloatTensor
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets.
x = torch.randn(N, D_in).type(dtype)
y = torch.randn(N, D_out).type(dtype)

# Random initial weights for the two layers.
w1 = torch.randn(D_in, H).type(dtype)
w2 = torch.randn(H, D_out).type(dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: matrix multiply, clamp negatives to zero, matrix multiply.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss, reported on every iteration.
    residual = y_pred - y
    loss = residual.pow(2).sum()
    print(t, loss)

    # Backward pass, written out by hand.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # Copy first so grad_h_relu stays intact, then zero the entries where
    # the pre-activation h was negative.
    grad_h = grad_h_relu.clone()
    grad_h.masked_fill_(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step, applied to the weights in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

    

0 25552808.77518153
1 19622766.089334488
2 18596027.872022808
3 19414749.791037023
4 20206698.19304973
5 19447612.53035757
6 16656473.62121182
7 12479663.36962932
8 8378497.592381572
9 5199069.559185211
10 3141437.846303877
11 1921600.7211936247
12 1232556.3225783622
13 841440.0371630229
14 613536.8661676862
15 473710.47849618783
16 382366.9731452693
17 318536.660516527
18 271227.93058264995
19 234355.31701010233
20 204565.00715913926
21 179868.5258516057
22 158999.53506275802
23 141144.29636382067
24 125714.08207448717
25 112308.3259396049
26 100594.78592208619
27 90302.36980494531
28 81253.8971186295
29 73250.92753199651
30 66162.5491358346
31 59865.694881136835
32 54251.87683805646
33 49242.039245102846
34 44756.11052798468
35 40736.42727230082
36 37125.09567382629
37 33876.142202667805
38 30947.200362021435
39 28303.218849896202
40 25918.588514929754
41 23762.247210425456
42 21812.852989831623
43 20041.755990463862
44 18432.332796087132
45 16967.27387241189
46 15632.117422828356
47

In [17]:
import torch
from torch.autograd import Variable

# Tensor type for every tensor in this cell. It has to be bound to the name
# `dtype`, because that is the name the lines below read; binding it to any
# other name makes the cell depend on a `dtype` left over from an earlier cell.
dtype = torch.FloatTensor

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Fixed training data. Gradients with respect to x and y are never read, so
# autograd is told not to track them.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Trainable weights: backward() accumulates d(loss)/d(w) into w.grad.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU (clamp at zero) -> linear.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Sum-of-squares loss. The result is a scalar; .item() extracts it as a
    # Python number (indexing it with [0] fails on zero-dimensional tensors).
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Autograd fills w1.grad and w2.grad.
    loss.backward()

    # Update through .data so the step itself is not recorded by autograd.
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    # Manually zero the gradients after updating the weights; backward()
    # accumulates into .grad rather than overwriting it.
    w1.grad.data.zero_()
    w2.grad.data.zero_()
    
    
    

0 34715840.0
1 34271996.0
2 37943816.0
3 38277512.0
4 31180912.0
5 19365578.0
6 9763338.0
7 4648878.0
8 2458257.75
9 1548441.875
10 1128362.875
11 895739.125
12 742843.9375
13 629674.875
14 540234.3125
15 467230.65625
16 406553.125
17 355566.71875
18 312387.75
19 275554.125
20 243960.9375
21 216757.234375
22 193188.921875
23 172701.234375
24 154882.078125
25 139243.15625
26 125467.2734375
27 113298.203125
28 102519.8203125
29 92956.90625
30 84448.6015625
31 76855.6875
32 70058.5546875
33 63960.1015625
34 58477.328125
35 53536.33203125
36 49076.46875
37 45043.87890625
38 41389.89453125
39 38076.13671875
40 35066.2109375
41 32326.482421875
42 29829.216796875
43 27552.9296875
44 25476.44140625
45 23576.888671875
46 21836.919921875
47 20240.91015625
48 18775.556640625
49 17428.61328125
50 16190.1181640625
51 15049.7314453125
52 13998.2001953125
53 13028.333984375
54 12133.0234375
55 11305.572265625
56 10540.1328125
57 9831.587890625
58 9175.50390625
59 8567.1689453125
60 8003.98095703125
6

In [22]:
import torch
from torch.autograd import Variable

class MyReLu(torch.autograd.Function):
    """Hand-written ReLU with an explicit backward rule for autograd.

    Use it through ``MyReLu.apply(tensor)``.
    """

    @staticmethod
    def forward(ctx, input):
        """Return ``input`` with every negative entry replaced by zero.

        ``input`` is saved on ``ctx`` so that ``backward`` can tell which
        entries were negative.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient of the loss with respect to ``input``.

        The result is a fresh tensor equal to ``grad_output`` except that
        entries where the saved input was negative are set to zero.
        """
        (saved_input,) = ctx.saved_tensors
        return grad_output.masked_fill(saved_input < 0, 0)
    
# Tensor type for every tensor in this cell.
dtype = torch.FloatTensor

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Fixed training data. Gradients with respect to x and y are never read, so
# autograd is told not to track them.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Trainable weights: backward() accumulates d(loss)/d(w) into w.grad.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

# Custom autograd functions are invoked through .apply; the alias does not
# change between iterations, so it is bound once outside the loop.
relu = MyReLu.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> custom ReLU -> linear.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Sum-of-squares loss. The result is a scalar; .item() extracts it as a
    # Python number (indexing it with [0] fails on zero-dimensional tensors).
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Autograd fills w1.grad and w2.grad, calling MyReLu.backward on the way.
    loss.backward()

    # Update through .data so the step itself is not recorded by autograd.
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    # Zero the gradients after the update; backward() accumulates into .grad
    # rather than overwriting it.
    w1.grad.data.zero_()
    w2.grad.data.zero_()
    
    

0 34305396.0
1 31699938.0
2 31692994.0
3 29218394.0
4 22978078.0
5 15014931.0
6 8618489.0
7 4731575.5
8 2716808.75
9 1714676.75
10 1199318.125
11 909284.3125
12 727271.4375
13 601028.75
14 506762.34375
15 432779.09375
16 372764.8125
17 323141.25
18 281631.1875
19 246569.3125
20 216787.21875
21 191325.84375
22 169392.296875
23 150450.796875
24 134012.8125
25 119701.8828125
26 107178.4609375
27 96187.0078125
28 86509.4375
29 77968.0546875
30 70417.140625
31 63710.1640625
32 57738.3515625
33 52411.85546875
34 47648.8515625
35 43380.41796875
36 39547.1796875
37 36100.30859375
38 32992.74609375
39 30188.220703125
40 27652.30078125
41 25356.37109375
42 23283.265625
43 21403.767578125
44 19695.119140625
45 18138.505859375
46 16720.6015625
47 15430.162109375
48 14250.6171875
49 13171.77734375
50 12183.7431640625
51 11278.2998046875
52 10446.3935546875
53 9682.46875
54 8980.5927734375
55 8335.2939453125
56 7741.02734375
57 7193.6103515625
58 6689.546875
59 6224.72705078125
60 5795.59130859375
6

In [1]:
import tensorflow as tf
import numpy as np

# Layer sizes: N examples, D_in inputs, H hidden units, D_out outputs.
N, D_in, H, D_out = 64, 1000, 100, 10

# Graph inputs, fed with concrete arrays at run time; the leading dimension
# is left unspecified.
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

# Trainable weights with random-normal initial values.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

# Forward graph: matmul -> element-wise maximum with zero -> matmul.
h = tf.matmul(x, w1)
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# Sum-of-squares loss and its symbolic gradients for both weight matrices.
loss = tf.reduce_sum((y - y_pred) ** 2)
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# Assignment ops that perform one gradient-descent step when run.
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # One fixed random batch, reused for every step.
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)

    # Fetching the assign ops alongside the loss is what applies the update.
    fetches = [loss, new_w1, new_w2]
    feed = {x: x_value, y: y_value}
    for _ in range(500):
        loss_value, _, _ = sess.run(fetches, feed_dict=feed)
        print(loss_value)



3.08207e+07
2.8813e+07
3.02769e+07
3.02747e+07
2.6011e+07
1.82087e+07
1.07211e+07
5.73048e+06
3.11943e+06
1.86565e+06
1.25865e+06
937382.0
744406.0
613432.0
516047.0
439450.0
377179.0
325585.0
282358.0
245888.0
214879.0
188407.0
165707.0
146161.0
129264.0
114625.0
101883.0
90777.1
81049.7
72508.0
64989.7
58360.6
52501.1
47303.8
42690.0
38586.8
34930.6
31665.9
28746.6
26129.6
23782.2
21672.3
19774.6
18067.7
16528.1
15138.3
13880.3
12741.1
11707.1
10768.4
9914.46
9137.01
8429.36
7783.61
7193.59
6653.97
6160.17
5707.79
5292.83
4911.97
4561.78
4240.08
3943.86
3670.89
3419.15
3186.76
2972.01
2773.47
2589.84
2419.77
2262.19
2116.08
1980.54
1854.63
1737.63
1628.88
1527.71
1433.57
1345.85
1264.08
1187.84
1116.72
1050.4
988.499
930.663
876.568
825.963
778.59
734.211
692.63
653.665
617.122
582.836
550.7
520.517
492.162
465.523
440.461
416.873
394.683
373.793
354.113
335.582
318.113
301.643
286.11
271.458
257.63
244.568
232.23
220.573
209.555
199.141
189.292
179.976
171.156
162.814
154.913
147.42

In [4]:
import torch

# Batch size, input width, hidden width and output width.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random input batch and random targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)



In [6]:
# IPython help lookup for the name `nn`. The recorded output below is
# "Object `nn` not found.", i.e. `nn` was not bound when this cell ran.
# NOTE(review): presumably `torch.nn` was intended - bind it first
# (e.g. `from torch import nn`) and confirm.
?nn


Object `nn` not found.
