In [3]:
# NN with Numpy

import numpy as np

# N is batch size; D_in is input dimension
# H is hidden dimension; D_out is output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialize weights
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)
    
    # Compute and print loss
    loss = np.square(y_pred - y).sum()
    print(t, loss)
    
    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)
    
    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate *grad_w2
    

0 36295949.2105
1 35949409.1133
2 37821193.7009
3 34800825.3175
4 25708421.7846
5 14825214.6916
6 7483144.02059
7 3811947.37234
8 2228436.87343
9 1517821.38404
10 1153424.03744
11 931359.065799
12 775978.076897
13 657533.605674
14 562862.871976
15 485568.085454
16 421365.190243
17 367472.979051
18 321894.574938
19 283083.199634
20 249857.386181
21 221283.646122
22 196588.019951
23 175166.373118
24 156521.288343
25 140221.856657
26 125922.801482
27 113360.213999
28 102277.024628
29 92463.6676353
30 83753.7179485
31 76007.4412911
32 69102.5924735
33 62934.3746826
34 57406.3100303
35 52449.0871653
36 47988.9102351
37 43969.7332458
38 40341.8714374
39 37063.2328119
40 34091.4144703
41 31394.0758713
42 28942.1920559
43 26708.6835671
44 24672.7812725
45 22813.8802747
46 21114.7480361
47 19559.7053645
48 18135.6821715
49 16828.5758107
50 15628.0591858
51 14524.7979613
52 13511.3404322
53 12577.3627297
54 11716.0309222
55 10921.454953
56 10187.6661952
57 9509.02362537
58 8880.7732518
59 8298.9

In [12]:
import torch

dtype = torch.FloatTensor
N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in).type(dtype)
y = torch.randn(N, D_out).type(dtype)

w1 = torch.randn(D_in, H).type(dtype)
w2 = torch.randn(H, D_out).type(dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)
    
    # Compute loss function
    loss = (y_pred - y).pow(2).sum()
    print(t, loss)
    
    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)
    
    # Update the weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

    

0 25552808.77518153
1 19622766.089334488
2 18596027.872022808
3 19414749.791037023
4 20206698.19304973
5 19447612.53035757
6 16656473.62121182
7 12479663.36962932
8 8378497.592381572
9 5199069.559185211
10 3141437.846303877
11 1921600.7211936247
12 1232556.3225783622
13 841440.0371630229
14 613536.8661676862
15 473710.47849618783
16 382366.9731452693
17 318536.660516527
18 271227.93058264995
19 234355.31701010233
20 204565.00715913926
21 179868.5258516057
22 158999.53506275802
23 141144.29636382067
24 125714.08207448717
25 112308.3259396049
26 100594.78592208619
27 90302.36980494531
28 81253.8971186295
29 73250.92753199651
30 66162.5491358346
31 59865.694881136835
32 54251.87683805646
33 49242.039245102846
34 44756.11052798468
35 40736.42727230082
36 37125.09567382629
37 33876.142202667805
38 30947.200362021435
39 28303.218849896202
40 25918.588514929754
41 23762.247210425456
42 21812.852989831623
43 20041.755990463862
44 18432.332796087132
45 16967.27387241189
46 15632.117422828356
47

In [77]:
import torch
from torch.autograd import Variable

d_type = torch.FloatTensor

N, D_in, H, D_out = 64, 1000, 100, 10

x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=True)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=True)

w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.data[0])
    
    loss.backward()
    
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data
    
    # Manually zero the gradiens after updating the weights
    
    w1.grad.data.zero_()
    w2.grad.data.zero_()
    
    
    

0 26858338.0
1 20468032.0
2 17081984.0
3 14420581.0
4 11781621.0
5 9154579.0
6 6791733.0
7 4881607.5
8 3466695.0
9 2476137.5
10 1803168.75
11 1350091.75
12 1042109.375
13 828524.75
14 676122.75
15 563822.6875
16 478240.875
17 411044.8125
18 356978.40625
19 312573.25
20 275460.46875
21 244057.828125
22 217115.84375
23 193806.46875
24 173519.828125
25 155786.09375
26 140224.015625
27 126529.8203125
28 114394.4921875
29 103617.5703125
30 94014.5703125
31 85438.1484375
32 77775.8203125
33 70899.859375
34 64718.58984375
35 59153.0546875
36 54132.03125
37 49593.6640625
38 45485.66796875
39 41761.421875
40 38380.6796875
41 35312.0078125
42 32518.16796875
43 29973.189453125
44 27651.896484375
45 25531.416015625
46 23592.626953125
47 21818.220703125
48 20191.978515625
49 18700.0546875
50 17330.18359375
51 16071.560546875
52 14913.8740234375
53 13848.4033203125
54 12867.1328125
55 11963.4755859375
56 11129.6865234375
57 10359.7744140625
58 9648.4013671875
59 8990.958984375
60 8382.7255859375
61 

In [2]:
import torch
from torch.autograd import Variable

class MyReLu(torch.autograd.Function):
    
    @staticmethod
    def forward(ctx, input):
        ctx.save_for_backward(input)
        return input.clamp(min=0)
    
    @staticmethod
    def backward(ctx, grad_output):
        input, = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input
    
dtype = torch.FloatTensor

N, D_in, H, D_out = 64, 1000, 100, 10

x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=True)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=True)

w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)


learning_rate = 1e-6
for t in range(500):
    relu = MyReLu.apply
    y_pred = relu(x.mm(w1)).mm(w2)
    
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.data[0])
    
    loss.backward()
    
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data
    
    w1.grad.data.zero_()
    w2.grad.data.zero_()
    
    

0 40113516.0
1 37232036.0
2 33322738.0
3 24987808.0
4 15478014.0
5 8350853.0
6 4494623.5
7 2644875.5
8 1766043.375
9 1303142.75
10 1024972.625
11 835984.8125
12 696437.8125
13 587873.5625
14 500672.28125
15 429364.34375
16 370426.40625
17 321341.5
18 280142.96875
19 245335.53125
20 215689.46875
21 190310.59375
22 168470.703125
23 149605.796875
24 133262.140625
25 119013.1640625
26 106558.9140625
27 95680.9375
28 86103.8828125
29 77646.40625
30 70154.140625
31 63505.25
32 57595.859375
33 52323.40625
34 47608.51171875
35 43388.34375
36 39598.6328125
37 36189.4296875
38 33117.48046875
39 30343.13671875
40 27834.751953125
41 25563.544921875
42 23506.0859375
43 21637.197265625
44 19937.255859375
45 18388.306640625
46 16977.41796875
47 15688.6435546875
48 14510.5556640625
49 13435.7919921875
50 12450.8779296875
51 11547.2861328125
52 10717.0732421875
53 9954.267578125
54 9252.8349609375
55 8606.8994140625
56 8011.6728515625
57 7460.8740234375
58 6952.18212890625
59 6482.080078125
60 6047.415

In [78]:
import tensorflow as tf
import numpy as np

N, D_in, H, D_out = 64, 1000, 100, 10
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

h = tf.matmul(x, w1)
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

loss = tf.reduce_sum((y-y_pred) ** 2)
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)
    for _ in range(500):
        loss_value, _, _ = sess.run([loss, new_w1, new_w2], 
                                   feed_dict={x:x_value, y:y_value})
        print(loss_value)



3.0921e+07
2.57268e+07
2.38796e+07
2.18049e+07
1.83364e+07
1.37219e+07
9.3134e+06
5.93368e+06
3.73895e+06
2.4234e+06
1.65941e+06
1.20824e+06
930450.0
748549.0
621390.0
527114.0
453732.0
394585.0
345706.0
304596.0
269606.0
239531.0
213523.0
190914.0
171137.0
153757.0
138445.0
124918.0
112919.0
102243.0
92722.4
84254.2
76668.9
69857.0
63731.5
58208.7
53222.3
48719.6
44641.3
40942.6
37582.9
34529.1
31747.1
29211.4
26898.2
24782.5
22847.7
21075.9
19454.0
17970.7
16610.5
15361.3
14213.5
13157.9
12186.2
11291.4
10466.3
9706.14
9005.61
8358.73
7761.4
7209.2
6698.38
6226.2
5789.46
5384.83
5010.06
4662.59
4340.62
4042.09
3765.01
3507.8
3268.98
3047.17
2841.09
2649.52
2471.48
2305.84
2151.74
2008.39
1874.96
1750.73
1635.08
1527.51
1427.34
1333.93
1246.88
1165.71
1090.01
1019.39
953.529
892.088
834.72
781.158
731.143
684.431
640.788
600.033
561.926
526.323
493.066
461.952
432.846
405.632
380.187
356.376
334.103
313.259
293.753
275.499
258.399
242.388
227.399
213.357
200.209
187.888
176.349
165.53

In [4]:
# -*- coding: utf-8 -*-
import torch

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Tensors for its weight and bias.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function.
loss_fn = torch.nn.MSELoss(size_average=False)

learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model. Module objects
    # override the __call__ operator so you can call them like functions. When
    # doing so you pass a Tensor of input data to the Module and it produces
    # a Tensor of output data.
    y_pred = model(x)

    # Compute and print loss. We pass Tensors containing the predicted and true
    # values of y, and the loss function returns a Tensor containing the
    # loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute gradient of the loss with respect to all the learnable
    # parameters of the model. Internally, the parameters of each Module are stored
    # in Tensors with requires_grad=True, so this call will compute gradients for
    # all learnable parameters in the model.
    loss.backward()

    # Update the weights using gradient descent. Each parameter is a Tensor, so
    # we can access and gradients like we did before.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

0 738.8372192382812
1 682.7576904296875
2 634.4929809570312
3 592.3621215820312
4 555.3797607421875
5 522.37158203125
6 492.2303771972656
7 464.51177978515625
8 438.9720764160156
9 415.2369384765625
10 393.1602478027344
11 372.447509765625
12 352.86004638671875
13 334.3628845214844
14 316.8018493652344
15 300.10882568359375
16 284.2737731933594
17 269.2071533203125
18 254.7886962890625
19 240.99827575683594
20 227.8546142578125
21 215.30416870117188
22 203.29664611816406
23 191.79144287109375
24 180.84054565429688
25 170.41439819335938
26 160.4705047607422
27 150.9790802001953
28 141.95260620117188
29 133.40464782714844
30 125.29436492919922
31 117.61279296875
32 110.34683227539062
33 103.48133087158203
34 97.01724243164062
35 90.9166259765625
36 85.16425323486328
37 79.75335693359375
38 74.66755676269531
39 69.88975524902344
40 65.40353393554688
41 61.19770431518555
42 57.25973892211914
43 53.575286865234375
44 50.12944412231445
45 46.90837478637695
46 43.89864730834961
47 41.08258819

380 0.0003001687291543931
381 0.000292800716124475
382 0.0002856171631719917
383 0.0002786156546790153
384 0.00027178580057807267
385 0.0002651275717653334
386 0.0002586418704595417
387 0.0002523124567233026
388 0.0002461511467117816
389 0.00024012825451791286
390 0.0002342667430639267
391 0.00022854891722090542
392 0.0002229672682005912
393 0.00021752959582954645
394 0.00021222600480541587
395 0.00020706460054498166
396 0.00020201501320116222
397 0.00019709777552634478
398 0.00019230657198932022
399 0.0001876246533356607
400 0.0001830705878091976
401 0.0001786275242920965
402 0.00017428633873350918
403 0.00017005619884002954
404 0.0001659346598898992
405 0.0001619099930394441
406 0.00015798448293935508
407 0.0001541609817650169
408 0.0001504257961641997
409 0.00014678554725833237
410 0.00014323455980047584
411 0.00013977332855574787
412 0.00013639275857713073
413 0.00013310412759892642
414 0.00012988390517421067
415 0.00012674939353019
416 0.0001236919197253883
417 0.00012070787488482

In [5]:
# -*- coding: utf-8 -*-
import torch

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(size_average=False)

# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use Adam; the optim package contains many other
# optimization algoriths. The first argument to the Adam constructor tells the
# optimizer which Tensors it should update.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute and print loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (which are the learnable
    # weights of the model). This is because by default, gradients are
    # accumulated in buffers( i.e, not overwritten) whenever .backward()
    # is called. Checkout docs of torch.autograd.backward for more details.
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model
    # parameters
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters
    optimizer.step()

0 701.6347045898438
1 684.1663208007812
2 667.2534790039062
3 650.814697265625
4 634.86669921875
5 619.3699340820312
6 604.3070068359375
7 589.697998046875
8 575.5385131835938
9 561.7133178710938
10 548.2904663085938
11 535.257080078125
12 522.6295166015625
13 510.41876220703125
14 498.50067138671875
15 486.9172668457031
16 475.6256103515625
17 464.617431640625
18 453.8919677734375
19 443.476318359375
20 433.4750671386719
21 423.7264709472656
22 414.21209716796875
23 404.9141540527344
24 395.8508605957031
25 387.00592041015625
26 378.3743896484375
27 369.9579162597656
28 361.7677001953125
29 353.763671875
30 345.9626159667969
31 338.3416442871094
32 330.9053955078125
33 323.64630126953125
34 316.5596618652344
35 309.6396179199219
36 302.8757019042969
37 296.2629699707031
38 289.8002014160156
39 283.4808654785156
40 277.27642822265625
41 271.1831359863281
42 265.19964599609375
43 259.34478759765625
44 253.59886169433594
45 247.9722137451172
46 242.46043395996094
47 237.05763244628906
48

368 5.80901323701255e-05
369 5.507028254214674e-05
370 5.220809907768853e-05
371 4.949582216795534e-05
372 4.6925502829253674e-05
373 4.4491996959550306e-05
374 4.218668618705124e-05
375 3.999900945927948e-05
376 3.792683492065407e-05
377 3.596592796384357e-05
378 3.41018385370262e-05
379 3.2338659366359934e-05
380 3.066861245315522e-05
381 2.9082966648275033e-05
382 2.7581503672990948e-05
383 2.61539298662683e-05
384 2.4805651264614426e-05
385 2.3525564756710082e-05
386 2.2312051441986114e-05
387 2.1160931282793172e-05
388 2.0070627215318382e-05
389 1.9036187950405292e-05
390 1.8055428881780244e-05
391 1.712457014946267e-05
392 1.624231481400784e-05
393 1.5404597434098832e-05
394 1.4611317055823747e-05
395 1.385916766594164e-05
396 1.3145329830877017e-05
397 1.2469095054257195e-05
398 1.1828022252302617e-05
399 1.1218389772693627e-05
400 1.0640491382218897e-05
401 1.0091734111483674e-05
402 9.572197996021714e-06
403 9.08029505808372e-06
404 8.612532838014886e-06
405 8.168678505171556e

In [9]:
import torch

class TwoLayerNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)
    
    def forward(self, x):
        h_relu = self.linear1(x).clamp(min=0)
        y_pred = self.linear2(h_relu)
        return(y_pred)
    
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = TwoLayerNet(D_in, H, D_out)
criterion = torch.nn.MSELoss(size_average=False)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

for t in range(500):
    y_pred = model(x)
    
    loss = criterion(y_pred, y)
    print(t, loss.item())
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    


0 651.2144775390625
1 600.8587646484375
2 557.6687622070312
3 519.6417846679688
4 486.0440368652344
5 455.8727722167969
6 428.50311279296875
7 403.5574645996094
8 380.53253173828125
9 359.29290771484375
10 339.3811340332031
11 320.5867919921875
12 302.8896179199219
13 286.2348327636719
14 270.5379638671875
15 255.73362731933594
16 241.68865966796875
17 228.30087280273438
18 215.58163452148438
19 203.49069213867188
20 191.98294067382812
21 181.05589294433594
22 170.7010040283203
23 160.87939453125
24 151.57725524902344
25 142.74510192871094
26 134.4022674560547
27 126.52899932861328
28 119.10369873046875
29 112.10894012451172
30 105.53253173828125
31 99.31809997558594
32 93.46755981445312
33 87.9507827758789
34 82.75624084472656
35 77.84317779541016
36 73.2240982055664
37 68.88752746582031
38 64.80941772460938
39 60.97124099731445
40 57.35755157470703
41 53.965476989746094
42 50.76846694946289
43 47.76904296875
44 44.95333480834961
45 42.306522369384766
46 39.82438278198242
47 37.490699

351 0.00013522087829187512
352 0.000131371125462465
353 0.00012763359700329602
354 0.00012400263221934438
355 0.0001204809159389697
356 0.0001170619361801073
357 0.00011374968744348735
358 0.00011052589252358302
359 0.00010740460129454732
360 0.00010437592572998255
361 0.00010143655526917428
362 9.857915574684739e-05
363 9.5810042694211e-05
364 9.311526810051873e-05
365 9.05072083696723e-05
366 8.796990005066618e-05
367 8.550664642825723e-05
368 8.311602869071066e-05
369 8.079559484031051e-05
370 7.854338036850095e-05
371 7.635572546860203e-05
372 7.423091301461682e-05
373 7.216884841909632e-05
374 7.016178278718144e-05
375 6.821808347012848e-05
376 6.632873555645347e-05
377 6.449143984355032e-05
378 6.270722951740026e-05
379 6.0974420193815604e-05
380 5.9293484810041264e-05
381 5.76580387132708e-05
382 5.607245839200914e-05
383 5.452967525343411e-05
384 5.303017678670585e-05
385 5.157406485523097e-05
386 5.0161717808805406e-05
387 4.878570689470507e-05
388 4.745407204609364e-05
389 4.

In [56]:
x = torch.autograd.Variable(torch.FloatTensor([1]), requires_grad=True)

In [57]:
model = torch.nn.Sequential(
    torch.nn.Linear(1,1)
)
y = model(x)

In [70]:
y.backward(retain_graph=True)

In [71]:
x.grad

Variable containing:
-3.8339
[torch.FloatTensor of size 1]

In [72]:
list(model.modules())[0][0].gradInput

AttributeError: 'Linear' object has no attribute 'gradInput'

In [73]:
?nn


Object `nn` not found.
