# PyTorch Neural Network Tutorial

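A walkthrough of the official PyTorch tutorial [Learning PyTorch with Examples](https://pytorch.org/tutorials/beginner/pytorch_with_examples.html): the same small two-layer network is trained with NumPy, raw PyTorch tensors, autograd, a custom autograd Function, the `nn` and `optim` packages and custom modules, plus a TensorFlow static-graph version for comparison.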

## Using numpy

In [1]:
import numpy as np

In [2]:
# Problem dimensions shared by every example in this notebook.
N = 64      # number of samples
D1 = 1000   # input dimension
H = 100     # number of nodes in the hidden layer
D2 = 10     # output dimension

In [3]:
# Synthetic regression data drawn from a standard normal distribution:
# X holds the inputs (N x D1), Y the targets (N x D2). X is drawn before Y.
X, Y = np.random.randn(N, D1), np.random.randn(N, D2)

In [4]:
# Randomly initialised weight matrices (this hand-written network has no biases).
layer_shapes = [(D1, H), (H, D2)]  # (rows, cols) of the first and second layer
W1, W2 = [np.random.randn(rows, cols) for rows, cols in layer_shapes]

In [5]:
# Learning rate (step size) for the manual gradient-descent updates
# `W -= gamma * grad` in the training loops below.
# NOTE: `gamma` is rebound to 1e-4 in the "Using nn package" section, so re-run
# this cell before re-running any of the manual loops.
gamma = 1e-6

In [6]:
# Gradient descent on the two-layer ReLU network with hand-derived gradients.
for step in range(500):
    # forward pass: linear -> ReLU -> linear
    hidden_pre = np.dot(X, W1)
    phi = np.maximum(0, hidden_pre)
    pred = np.dot(phi, W2)

    # sum-of-squares loss, printed on every iteration
    loss = np.sum((Y - pred) ** 2)
    print('Iteration: {}, loss {}'.format(step, loss))

    # backward pass (chain rule, output layer first)
    delta_l2 = 2 * (pred - Y)                      # d loss / d pred
    grad_l2 = np.dot(phi.T, delta_l2)              # d loss / d W2
    delta_l1 = np.dot(delta_l2, W2.T)              # d loss / d phi
    delta_l1 = np.where(phi == 0, 0.0, delta_l1)   # no gradient through inactive ReLU units
    grad_l1 = np.dot(X.T, delta_l1)                # d loss / d W1

    # in-place gradient-descent step on both weight matrices
    W1 -= gamma * grad_l1
    W2 -= gamma * grad_l2

Iteration: 0, loss 28485072.36897279
Iteration: 1, loss 21134344.020322382
Iteration: 2, loss 16190726.429168245
Iteration: 3, loss 12115100.345132908
Iteration: 4, loss 8710814.104630012
Iteration: 5, loss 6099054.674983146
Iteration: 6, loss 4232124.953973087
Iteration: 7, loss 2970356.2743779104
Iteration: 8, loss 2135890.968554906
Iteration: 9, loss 1585644.5430753976
Iteration: 10, loss 1216481.250552211
Iteration: 11, loss 961714.8800290413
Iteration: 12, loss 779751.486102947
Iteration: 13, loss 645503.0704887168
Iteration: 14, loss 542971.6157200529
Iteration: 15, loss 462413.8171251416
Iteration: 16, loss 397706.5507901727
Iteration: 17, loss 344738.5471359048
Iteration: 18, loss 300708.45241367666
Iteration: 19, loss 263659.83502893243
Iteration: 20, loss 232196.59065438036
Iteration: 21, loss 205262.15254597244
Iteration: 22, loss 182070.06859280923
Iteration: 23, loss 161993.1431249
Iteration: 24, loss 144562.64248700213
Iteration: 25, loss 129344.67006491194
Iteration: 26,

Iteration: 286, loss 0.2642157367308056
Iteration: 287, loss 0.2547469291341164
Iteration: 288, loss 0.24562419195857496
Iteration: 289, loss 0.236834198269199
Iteration: 290, loss 0.22836383556076675
Iteration: 291, loss 0.2202017671259056
Iteration: 292, loss 0.21233746097186568
Iteration: 293, loss 0.2047591494424767
Iteration: 294, loss 0.19745597078186777
Iteration: 295, loss 0.19041693787950487
Iteration: 296, loss 0.1836334098742534
Iteration: 297, loss 0.17709551072258806
Iteration: 298, loss 0.17079405886365465
Iteration: 299, loss 0.16472030013182784
Iteration: 300, loss 0.15886665654571963
Iteration: 301, loss 0.15322402310696437
Iteration: 302, loss 0.14778550908693677
Iteration: 303, loss 0.14254268046375915
Iteration: 304, loss 0.13748928053649423
Iteration: 305, loss 0.1326170693647056
Iteration: 306, loss 0.12792018272854697
Iteration: 307, loss 0.12339220414535056
Iteration: 308, loss 0.11902727772244849
Iteration: 309, loss 0.11481871739108632
Iteration: 310, loss 0.1

## Using PyTorch tensors

In [7]:
import torch

In [8]:
# Tensor options passed to the torch.randn calls that create the weights below.
device = torch.device("cpu")   # run on the CPU
dtype = torch.float32          # same dtype object as torch.float

In [9]:
# Random inputs (N x D1) and targets (N x D2) for the PyTorch examples.
# They are created with the same `device` and `dtype` as the weight tensors so
# that `X.mm(W1)` keeps working when `device` is changed (e.g. to a GPU).
X = torch.randn(N, D1, device=device, dtype=dtype)
Y = torch.randn(N, D2, device=device, dtype=dtype)

In [10]:
# Weight matrices for the manual-gradient example. The gradients are derived
# by hand in the loop below, so requires_grad is left at its default.
W1 = torch.randn((D1, H), dtype=dtype, device=device)   # first layer
W2 = torch.randn((H, D2), dtype=dtype, device=device)   # second layer

In [11]:
# The NumPy training loop again, written with torch tensors and hand-derived
# gradients (autograd is not used here).
for i in range(500):
    # forward pass: linear -> ReLU (clamp at 0) -> linear
    Z = X.mm(W1)
    phi = Z.clamp(min=0)
    pred = phi.mm(W2)
    # compute/print loss; .item() extracts the Python float, as the later loops
    # do, so the message does not depend on how a 0-dim tensor is formatted
    loss = (Y - pred).pow(2).sum()
    print('Iteration: {}, loss {}'.format(i, loss.item()))
    # backward pass (chain rule, output layer first)
    delta_l2 = 2 * (pred - Y)          # d loss / d pred
    grad_l2 = phi.t().mm(delta_l2)     # d loss / d W2
    delta_l1 = delta_l2.mm(W2.t())     # d loss / d phi
    delta_l1[phi == 0] = 0             # no gradient through inactive ReLU units
    grad_l1 = X.t().mm(delta_l1)       # d loss / d W1
    # updates: in-place gradient-descent step
    W1 -= gamma * grad_l1
    W2 -= gamma * grad_l2

Iteration: 0, loss 30572468.0
Iteration: 1, loss 26421654.0
Iteration: 2, loss 24728238.0
Iteration: 3, loss 22120718.0
Iteration: 4, loss 17691872.0
Iteration: 5, loss 12465142.0
Iteration: 6, loss 7932702.5
Iteration: 7, loss 4810428.5
Iteration: 8, loss 2937997.0
Iteration: 9, loss 1884814.25
Iteration: 10, loss 1295121.25
Iteration: 11, loss 953224.8125
Iteration: 12, loss 741899.5625
Iteration: 13, loss 601415.75
Iteration: 14, loss 501177.84375
Iteration: 15, loss 425482.21875
Iteration: 16, loss 365699.25
Iteration: 17, loss 317017.84375
Iteration: 18, loss 276613.40625
Iteration: 19, loss 242635.203125
Iteration: 20, loss 213730.578125
Iteration: 21, loss 188977.75
Iteration: 22, loss 167647.765625
Iteration: 23, loss 149178.546875
Iteration: 24, loss 133107.0
Iteration: 25, loss 119055.578125
Iteration: 26, loss 106724.609375
Iteration: 27, loss 95873.2109375
Iteration: 28, loss 86295.5546875
Iteration: 29, loss 77812.5234375
Iteration: 30, loss 70290.9765625
Iteration: 31, lo

Iteration: 267, loss 0.04189303144812584
Iteration: 268, loss 0.03978254646062851
Iteration: 269, loss 0.03777296096086502
Iteration: 270, loss 0.03588613495230675
Iteration: 271, loss 0.034075140953063965
Iteration: 272, loss 0.0323699526488781
Iteration: 273, loss 0.030730124562978745
Iteration: 274, loss 0.029187865555286407
Iteration: 275, loss 0.027723176404833794
Iteration: 276, loss 0.026325102895498276
Iteration: 277, loss 0.02500830590724945
Iteration: 278, loss 0.02375432848930359
Iteration: 279, loss 0.02256469987332821
Iteration: 280, loss 0.021428614854812622
Iteration: 281, loss 0.020362328737974167
Iteration: 282, loss 0.01932922936975956
Iteration: 283, loss 0.018361927941441536
Iteration: 284, loss 0.01745038665831089
Iteration: 285, loss 0.016581248492002487
Iteration: 286, loss 0.015749681740999222
Iteration: 287, loss 0.014980156905949116
Iteration: 288, loss 0.014232998713850975
Iteration: 289, loss 0.013532746583223343
Iteration: 290, loss 0.012848522514104843
Ite

## Using PyTorch Autograd package 

In [12]:
# Fresh weights that autograd tracks: with requires_grad=True, loss.backward()
# fills W1.grad and W2.grad.
W1 = torch.randn((D1, H), dtype=dtype, device=device, requires_grad=True)
W2 = torch.randn((H, D2), dtype=dtype, device=device, requires_grad=True)

In [13]:
for step in range(500):
    # forward pass, one operation per line
    hidden = X.mm(W1)
    activated = hidden.clamp(min=0)
    pred = activated.mm(W2)

    # sum-of-squares loss; .item() gives a plain Python number for printing
    residual = Y - pred
    loss = residual.pow(2).sum()
    print('Iteration: {}, loss {}'.format(step, loss.item()))

    # backward pass: autograd accumulates d(loss)/dW into W1.grad and W2.grad
    loss.backward()

    # gradient-descent step; no_grad keeps the in-place updates out of the graph
    with torch.no_grad():
        for weight in (W1, W2):
            weight.sub_(gamma * weight.grad)
            weight.grad.zero_()  # reset, otherwise the next backward() adds to it

Iteration: 0, loss 31276690.0
Iteration: 1, loss 29065940.0
Iteration: 2, loss 29749312.0
Iteration: 3, loss 28901852.0
Iteration: 4, loss 24143488.0
Iteration: 5, loss 16741321.0
Iteration: 6, loss 9938664.0
Iteration: 7, loss 5472923.0
Iteration: 8, loss 3072420.75
Iteration: 9, loss 1878972.375
Iteration: 10, loss 1277651.25
Iteration: 11, loss 950733.125
Iteration: 12, loss 752579.0
Iteration: 13, loss 619021.0
Iteration: 14, loss 520922.6875
Iteration: 15, loss 444675.53125
Iteration: 16, loss 383197.59375
Iteration: 17, loss 332514.40625
Iteration: 18, loss 290149.5
Iteration: 19, loss 254345.78125
Iteration: 20, loss 223846.953125
Iteration: 21, loss 197738.203125
Iteration: 22, loss 175247.03125
Iteration: 23, loss 155801.375
Iteration: 24, loss 138910.53125
Iteration: 25, loss 124193.390625
Iteration: 26, loss 111320.515625
Iteration: 27, loss 100015.9140625
Iteration: 28, loss 90070.2734375
Iteration: 29, loss 81291.2265625
Iteration: 30, loss 73563.140625
Iteration: 31, loss

Iteration: 315, loss 0.016767950728535652
Iteration: 316, loss 0.01603885553777218
Iteration: 317, loss 0.015351833775639534
Iteration: 318, loss 0.014693260192871094
Iteration: 319, loss 0.014064528048038483
Iteration: 320, loss 0.013465954922139645
Iteration: 321, loss 0.012892507016658783
Iteration: 322, loss 0.012342375703155994
Iteration: 323, loss 0.011808909475803375
Iteration: 324, loss 0.011302454397082329
Iteration: 325, loss 0.010830039158463478
Iteration: 326, loss 0.010377568192780018
Iteration: 327, loss 0.00992952473461628
Iteration: 328, loss 0.009507437236607075
Iteration: 329, loss 0.009107199497520924
Iteration: 330, loss 0.00872884877026081
Iteration: 331, loss 0.008364317007362843
Iteration: 332, loss 0.008008880540728569
Iteration: 333, loss 0.00767037458717823
Iteration: 334, loss 0.0073513793759047985
Iteration: 335, loss 0.00704905204474926
Iteration: 336, loss 0.006754837464541197
Iteration: 337, loss 0.006480515003204346
Iteration: 338, loss 0.006214209366589

## Defining and using a custom Function 

In [14]:
class MyReLu(torch.autograd.Function):
    """ReLU written as a custom autograd Function.

    Call it through ``MyReLu.apply(tensor)``.
    """

    @staticmethod
    def forward(ctx, input):
        """Return ``input`` with every negative entry replaced by 0.

        The input is stored on ``ctx`` because ``backward`` needs its sign.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient of the loss with respect to the forward input.

        The incoming gradient is zeroed wherever the saved input was negative
        and passed through unchanged everywhere else. ``grad_output`` itself
        is not modified; a new tensor is returned.
        """
        (saved_input,) = ctx.saved_tensors
        return grad_output.masked_fill(saved_input < 0, 0)

In [15]:
# Re-draw the autograd-tracked weights so this example starts from scratch.
W1 = torch.randn((D1, H), dtype=dtype, device=device, requires_grad=True)   # first layer
W2 = torch.randn((H, D2), dtype=dtype, device=device, requires_grad=True)   # second layer

In [16]:
# Custom Functions are invoked through .apply; the alias does not change
# between iterations, so it is bound once before the loop.
relu = MyReLu.apply
for i in range(500):
    # forward pass: linear -> custom ReLU -> linear
    pred = relu(X.mm(W1)).mm(W2)
    # compute/print loss (.item() extracts the Python float)
    loss = (Y - pred).pow(2).sum()
    print('Iteration: {}, loss {}'.format(i, loss.item()))
    # backward pass: autograd calls MyReLu.backward and fills W1.grad / W2.grad
    loss.backward()
    # updates
    with torch.no_grad():  # do not record the in-place updates in the autograd graph
        W1 -= gamma * W1.grad
        W2 -= gamma * W2.grad
        # reset the accumulated gradients before the next backward()
        W1.grad.zero_()
        W2.grad.zero_()

Iteration: 0, loss 31672936.0
Iteration: 1, loss 26050776.0
Iteration: 2, loss 23715592.0
Iteration: 3, loss 21074248.0
Iteration: 4, loss 17157604.0
Iteration: 5, loss 12431002.0
Iteration: 6, loss 8225142.5
Iteration: 7, loss 5148345.0
Iteration: 8, loss 3213393.25
Iteration: 9, loss 2074633.0
Iteration: 10, loss 1418945.75
Iteration: 11, loss 1031025.9375
Iteration: 12, loss 790009.5625
Iteration: 13, loss 630517.875
Iteration: 14, loss 518066.1875
Iteration: 15, loss 434290.75
Iteration: 16, loss 369253.75
Iteration: 17, loss 317299.78125
Iteration: 18, loss 274748.75
Iteration: 19, loss 239268.375
Iteration: 20, loss 209326.5625
Iteration: 21, loss 183902.8125
Iteration: 22, loss 162099.984375
Iteration: 23, loss 143333.1875
Iteration: 24, loss 127103.703125
Iteration: 25, loss 113002.03125
Iteration: 26, loss 100703.4140625
Iteration: 27, loss 89949.546875
Iteration: 28, loss 80511.046875
Iteration: 29, loss 72205.9453125
Iteration: 30, loss 64878.6328125
Iteration: 31, loss 5839

Iteration: 250, loss 0.02635190263390541
Iteration: 251, loss 0.02489776723086834
Iteration: 252, loss 0.023511137813329697
Iteration: 253, loss 0.02219097688794136
Iteration: 254, loss 0.020958226174116135
Iteration: 255, loss 0.01980147510766983
Iteration: 256, loss 0.018708545714616776
Iteration: 257, loss 0.0176742784678936
Iteration: 258, loss 0.01669210195541382
Iteration: 259, loss 0.015768706798553467
Iteration: 260, loss 0.014898039400577545
Iteration: 261, loss 0.01406997349113226
Iteration: 262, loss 0.013298267498612404
Iteration: 263, loss 0.012568983249366283
Iteration: 264, loss 0.01188431866466999
Iteration: 265, loss 0.01122964732348919
Iteration: 266, loss 0.010617049410939217
Iteration: 267, loss 0.010043379850685596
Iteration: 268, loss 0.009492401033639908
Iteration: 269, loss 0.00897474866360426
Iteration: 270, loss 0.00848957896232605
Iteration: 271, loss 0.008027318865060806
Iteration: 272, loss 0.007595446892082691
Iteration: 273, loss 0.00718553364276886
Itera

Iteration: 480, loss 2.272463643748779e-05
Iteration: 481, loss 2.251465048175305e-05
Iteration: 482, loss 2.2362833988154307e-05
Iteration: 483, loss 2.203566145908553e-05
Iteration: 484, loss 2.184337063226849e-05
Iteration: 485, loss 2.168760693166405e-05
Iteration: 486, loss 2.145038524759002e-05
Iteration: 487, loss 2.1298281353665516e-05
Iteration: 488, loss 2.1144713173271157e-05
Iteration: 489, loss 2.0923022020724602e-05
Iteration: 490, loss 2.0737546947202645e-05
Iteration: 491, loss 2.0456540369195864e-05
Iteration: 492, loss 2.0280522221582942e-05
Iteration: 493, loss 2.0098324966966175e-05
Iteration: 494, loss 1.9868366507580504e-05
Iteration: 495, loss 1.9653172785183415e-05
Iteration: 496, loss 1.9660314137581736e-05
Iteration: 497, loss 1.944763062056154e-05
Iteration: 498, loss 1.9109635104541667e-05
Iteration: 499, loss 1.8999880921910517e-05


## Using static computational graphs via TensorFlow

In [17]:
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [18]:
# Graph inputs: float32 placeholders of shape (N, D1) and (N, D2). Their values
# are supplied through feed_dict when the graph is run in the session cell below.
# NOTE: this rebinds X and Y, which held torch tensors in the sections above.
# NOTE(review): tf.placeholder looks like the TensorFlow 1.x graph-mode API —
# confirm the installed TensorFlow version still provides it.
X = tf.placeholder(tf.float32, shape=(N, D1))
Y = tf.placeholder(tf.float32, shape=(N, D2))

In [19]:
# Weights as graph variables with random-normal initial values of shape
# (D1, H) and (H, D2). The session cell below runs
# tf.global_variables_initializer() before they are used.
# NOTE: this rebinds W1 and W2, which held torch tensors above.
W1 = tf.Variable(tf.random_normal((D1, H)))
W2 = tf.Variable(tf.random_normal((H, D2)))

In [20]:
# Forward pass expressed as graph ops on the placeholders and variables:
# linear -> ReLU -> linear. The values are produced when a session runs them.
Z = tf.matmul(X, W1)
# ReLU as an element-wise maximum with zero
# (tf.zeros(1) is presumably broadcast against Z — TODO confirm)
phi = tf.maximum(Z, tf.zeros(1))
pred = tf.matmul(phi, W2)

In [21]:
# Sum of squared differences between targets and predictions, reduced over
# all elements (no axis is given to reduce_sum).
# NOTE: this rebinds `loss`, which held a torch tensor above.
loss = tf.reduce_sum((Y - pred) ** 2.0) 

In [22]:
# compute the gradients: tf.gradients returns one gradient tensor per entry of
# [W1, W2], i.e. d(loss)/dW1 and d(loss)/dW2, as further graph nodes
grad_l1, grad_l2 = tf.gradients(loss, [W1, W2])

In [23]:
# define update rules: each assign op writes the gradient-descent result
# W - gamma * grad back into its variable. W1_new and W2_new are the ops that
# the session cell below passes to sess.run to perform one training step.
W1_new = W1.assign(W1 - gamma * grad_l1)
W2_new = W2.assign(W2 - gamma * grad_l2)

In [24]:
# run the computational graph
with tf.Session() as sess:
    # run the initializer op for the variables (W1, W2) before training
    sess.run(tf.global_variables_initializer())
    # concrete NumPy data that is fed into the X and Y placeholders
    X_np = np.random.randn(N, D1)
    Y_np = np.random.randn(N, D2)
    for i in range(500):
        # one training step: a single sess.run evaluates the loss and runs both
        # assign ops; only the loss value is kept, the assign results are discarded
        loss_value, _, _ = sess.run([loss, W1_new, W2_new],
                             feed_dict={X: X_np, Y: Y_np})
        print('Iteration: {}, loss {}'.format(i, loss_value))

Iteration: 0, loss 31025410.0
Iteration: 1, loss 29603118.0
Iteration: 2, loss 33416816.0
Iteration: 3, loss 38626540.0
Iteration: 4, loss 38374956.0
Iteration: 5, loss 29441512.0
Iteration: 6, loss 17120376.0
Iteration: 7, loss 8508116.0
Iteration: 8, loss 4687544.0
Iteration: 9, loss 2653951.75
Iteration: 10, loss 1641374.75
Iteration: 11, loss 1127887.75
Iteration: 12, loss 849153.0
Iteration: 13, loss 681059.0
Iteration: 14, loss 568065.6875
Iteration: 15, loss 485065.3125
Iteration: 16, loss 420107.625
Iteration: 17, loss 367178.375
Iteration: 18, loss 322931.625
Iteration: 19, loss 285438.25
Iteration: 20, loss 253331.125
Iteration: 21, loss 225630.359375
Iteration: 22, loss 201605.75
Iteration: 23, loss 180672.59375
Iteration: 24, loss 162334.53125
Iteration: 25, loss 146223.03125
Iteration: 26, loss 132008.421875
Iteration: 27, loss 119431.484375
Iteration: 28, loss 108281.265625
Iteration: 29, loss 98368.640625
Iteration: 30, loss 89542.5625
Iteration: 31, loss 81649.625
Itera

Iteration: 290, loss 0.1321861296892166
Iteration: 291, loss 0.1266450583934784
Iteration: 292, loss 0.12134191393852234
Iteration: 293, loss 0.11625947058200836
Iteration: 294, loss 0.11140196025371552
Iteration: 295, loss 0.10674446821212769
Iteration: 296, loss 0.10227891057729721
Iteration: 297, loss 0.09799011051654816
Iteration: 298, loss 0.09389135241508484
Iteration: 299, loss 0.08999259024858475
Iteration: 300, loss 0.0862569734454155
Iteration: 301, loss 0.08264756202697754
Iteration: 302, loss 0.07923401892185211
Iteration: 303, loss 0.07592567801475525
Iteration: 304, loss 0.07278864085674286
Iteration: 305, loss 0.06975817680358887
Iteration: 306, loss 0.06686851382255554
Iteration: 307, loss 0.06408379226922989
Iteration: 308, loss 0.06143498793244362
Iteration: 309, loss 0.05890418961644173
Iteration: 310, loss 0.05645819753408432
Iteration: 311, loss 0.05414268374443054
Iteration: 312, loss 0.05191560089588165
Iteration: 313, loss 0.04977763071656227
Iteration: 314, los

## Using nn package from PyTorch

In [25]:
# New torch data for the nn/optim examples: X and Y were rebound to TensorFlow
# placeholders in the previous section. X is drawn before Y.
X, Y = torch.randn(N, D1), torch.randn(N, D2)

In [26]:
# define the neural network: Linear -> ReLU -> Linear, built layer by layer
hidden_layer = torch.nn.Linear(D1, H)
output_layer = torch.nn.Linear(H, D2)
model = torch.nn.Sequential(hidden_layer, torch.nn.ReLU(), output_layer)

# squared error summed over all elements (reduction='sum'), not averaged
loss_function = torch.nn.MSELoss(reduction='sum')

# learning rate for the nn/optim examples.
# NOTE: this overwrites the 1e-6 used by the manual training loops above.
gamma = 1e-4

In [27]:
for step in range(500):
    pred = model(X)                  # forward pass through the Sequential model
    loss = loss_function(pred, Y)    # summed squared error
    print('Iteration: {}, loss {}'.format(step, loss.item()))

    model.zero_grad()   # clear the gradients left over from the previous iteration
    loss.backward()     # compute gradients for all learnable parameters in the model

    # manual gradient-descent step, kept out of the autograd graph
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(gamma * param.grad)

Iteration: 0, loss 714.2217407226562
Iteration: 1, loss 661.5736083984375
Iteration: 2, loss 615.9337158203125
Iteration: 3, loss 576.0433959960938
Iteration: 4, loss 540.3504638671875
Iteration: 5, loss 508.2291259765625
Iteration: 6, loss 478.7377624511719
Iteration: 7, loss 451.75323486328125
Iteration: 8, loss 427.0273742675781
Iteration: 9, loss 404.0857849121094
Iteration: 10, loss 382.3077697753906
Iteration: 11, loss 361.88568115234375
Iteration: 12, loss 342.5139465332031
Iteration: 13, loss 324.1628723144531
Iteration: 14, loss 306.7515563964844
Iteration: 15, loss 290.18792724609375
Iteration: 16, loss 274.5047302246094
Iteration: 17, loss 259.52587890625
Iteration: 18, loss 245.2332763671875
Iteration: 19, loss 231.57936096191406
Iteration: 20, loss 218.574462890625
Iteration: 21, loss 206.22418212890625
Iteration: 22, loss 194.47549438476562
Iteration: 23, loss 183.2827606201172
Iteration: 24, loss 172.63693237304688
Iteration: 25, loss 162.53482055664062
Iteration: 26, lo

Iteration: 298, loss 0.001186998444609344
Iteration: 299, loss 0.001146293361671269
Iteration: 300, loss 0.001107013551518321
Iteration: 301, loss 0.0010691044153645635
Iteration: 302, loss 0.0010325420880690217
Iteration: 303, loss 0.0009972580010071397
Iteration: 304, loss 0.000963167636655271
Iteration: 305, loss 0.0009302593534812331
Iteration: 306, loss 0.0008985032327473164
Iteration: 307, loss 0.0008678358281031251
Iteration: 308, loss 0.0008382269297726452
Iteration: 309, loss 0.0008096633828245103
Iteration: 310, loss 0.0007820841856300831
Iteration: 311, loss 0.0007554820040240884
Iteration: 312, loss 0.0007297936244867742
Iteration: 313, loss 0.0007049776031635702
Iteration: 314, loss 0.000681019970215857
Iteration: 315, loss 0.0006579100154340267
Iteration: 316, loss 0.0006355892401188612
Iteration: 317, loss 0.0006140602054074407
Iteration: 318, loss 0.0005932322819717228
Iteration: 319, loss 0.000573124154470861
Iteration: 320, loss 0.0005537171382457018
Iteration: 321, l

## Using the optim package from PyTorch

In [28]:
# Start again from freshly initialised weights so the optimizer example does
# not continue from the model trained above.
layers = [torch.nn.Linear(D1, H), torch.nn.ReLU(), torch.nn.Linear(H, D2)]
model = torch.nn.Sequential(*layers)

In [29]:
# Adam takes over the parameter updates; its learning rate is the notebook-level gamma.
optimizer = torch.optim.Adam(params=model.parameters(), lr=gamma)

In [30]:
for step in range(500):
    optimizer.zero_grad()            # start every iteration with cleared gradients

    pred = model(X)                  # forward pass
    loss = loss_function(pred, Y)
    print('Iteration: {}, loss {}'.format(step, loss.item()))

    loss.backward()                  # compute gradients for all learnable parameters
    optimizer.step()                 # let the optimizer update the parameters

Iteration: 0, loss 730.1437377929688
Iteration: 1, loss 711.9232177734375
Iteration: 2, loss 694.1687622070312
Iteration: 3, loss 676.9307861328125
Iteration: 4, loss 660.1820068359375
Iteration: 5, loss 643.9859619140625
Iteration: 6, loss 628.30517578125
Iteration: 7, loss 613.11083984375
Iteration: 8, loss 598.380126953125
Iteration: 9, loss 584.1588134765625
Iteration: 10, loss 570.2985229492188
Iteration: 11, loss 556.867431640625
Iteration: 12, loss 543.822998046875
Iteration: 13, loss 531.1405029296875
Iteration: 14, loss 518.7693481445312
Iteration: 15, loss 506.6921081542969
Iteration: 16, loss 494.9648132324219
Iteration: 17, loss 483.5303955078125
Iteration: 18, loss 472.3996276855469
Iteration: 19, loss 461.5086364746094
Iteration: 20, loss 450.9468688964844
Iteration: 21, loss 440.619873046875
Iteration: 22, loss 430.5333557128906
Iteration: 23, loss 420.7386474609375
Iteration: 24, loss 411.2206115722656
Iteration: 25, loss 401.9309387207031
Iteration: 26, loss 392.898101

Iteration: 214, loss 0.43494558334350586
Iteration: 215, loss 0.41430142521858215
Iteration: 216, loss 0.39466017484664917
Iteration: 217, loss 0.3759506344795227
Iteration: 218, loss 0.35813891887664795
Iteration: 219, loss 0.3411835730075836
Iteration: 220, loss 0.325062096118927
Iteration: 221, loss 0.30968374013900757
Iteration: 222, loss 0.2950601279735565
Iteration: 223, loss 0.2811462879180908
Iteration: 224, loss 0.2678896486759186
Iteration: 225, loss 0.2552884817123413
Iteration: 226, loss 0.24328161776065826
Iteration: 227, loss 0.2318524569272995
Iteration: 228, loss 0.2209801822900772
Iteration: 229, loss 0.21062764525413513
Iteration: 230, loss 0.2007783055305481
Iteration: 231, loss 0.1913951188325882
Iteration: 232, loss 0.18246501684188843
Iteration: 233, loss 0.17398026585578918
Iteration: 234, loss 0.16590918600559235
Iteration: 235, loss 0.15822042524814606
Iteration: 236, loss 0.15090550482273102
Iteration: 237, loss 0.14393408596515656
Iteration: 238, loss 0.13729

Iteration: 412, loss 5.3507927077589557e-05
Iteration: 413, loss 5.091095590614714e-05
Iteration: 414, loss 4.843486749450676e-05
Iteration: 415, loss 4.607634400599636e-05
Iteration: 416, loss 4.382687984616496e-05
Iteration: 417, loss 4.1683175368234515e-05
Iteration: 418, loss 3.964348434237763e-05
Iteration: 419, loss 3.769612158066593e-05
Iteration: 420, loss 3.584220394259319e-05
Iteration: 421, loss 3.407773328945041e-05
Iteration: 422, loss 3.239541911170818e-05
Iteration: 423, loss 3.0793846235610545e-05
Iteration: 424, loss 2.9267521313158795e-05
Iteration: 425, loss 2.781357579806354e-05
Iteration: 426, loss 2.6431531296111643e-05
Iteration: 427, loss 2.511354432499502e-05
Iteration: 428, loss 2.3858783606556244e-05
Iteration: 429, loss 2.2665399228571914e-05
Iteration: 430, loss 2.15300933632534e-05
Iteration: 431, loss 2.0449449948500842e-05
Iteration: 432, loss 1.9420604076003656e-05
Iteration: 433, loss 1.8440916392137296e-05
Iteration: 434, loss 1.7509544704807922e-05
I

## Defining and using custom nn modules

In [31]:
class TwoLayerNet(torch.nn.Module):
    """Two-layer fully connected network: Linear -> ReLU -> Linear."""

    def __init__(self, D1, H, D2):
        """Create the two linear layers.

        D1 is the input dimension, H the hidden-layer width and D2 the
        output dimension.
        """
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(in_features=D1, out_features=H)
        self.linear2 = torch.nn.Linear(in_features=H, out_features=D2)

    def forward(self, X):
        """Return the network output for the batch ``X``."""
        # ReLU written as a clamp at zero
        hidden = torch.clamp(self.linear1(X), min=0)
        return self.linear2(hidden)

In [32]:
# Instantiate the custom module with the notebook-wide dimensions.
model = TwoLayerNet(D1=D1, H=H, D2=D2)

In [33]:
# Plain stochastic gradient descent over the custom module's parameters.
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-4)

In [34]:
for step in range(500):
    optimizer.zero_grad()            # start every iteration with cleared gradients

    pred = model(X)                  # forward pass through TwoLayerNet
    loss = loss_function(pred, Y)
    print('Iteration: {}, loss {}'.format(step, loss.item()))

    loss.backward()                  # compute gradients for all learnable parameters
    optimizer.step()                 # SGD parameter update

Iteration: 0, loss 724.108154296875
Iteration: 1, loss 673.79248046875
Iteration: 2, loss 630.09619140625
Iteration: 3, loss 591.4645385742188
Iteration: 4, loss 556.9235229492188
Iteration: 5, loss 525.6498413085938
Iteration: 6, loss 497.08795166015625
Iteration: 7, loss 470.97906494140625
Iteration: 8, loss 446.9306945800781
Iteration: 9, loss 424.6529541015625
Iteration: 10, loss 403.61328125
Iteration: 11, loss 383.6639709472656
Iteration: 12, loss 364.6640319824219
Iteration: 13, loss 346.5650634765625
Iteration: 14, loss 329.2579345703125
Iteration: 15, loss 312.80084228515625
Iteration: 16, loss 297.0255432128906
Iteration: 17, loss 281.9476623535156
Iteration: 18, loss 267.5547790527344
Iteration: 19, loss 253.77894592285156
Iteration: 20, loss 240.56871032714844
Iteration: 21, loss 227.87045288085938
Iteration: 22, loss 215.7568817138672
Iteration: 23, loss 204.234375
Iteration: 24, loss 193.14144897460938
Iteration: 25, loss 182.53933715820312
Iteration: 26, loss 172.3983306

Iteration: 243, loss 0.008868599310517311
Iteration: 244, loss 0.00857487041503191
Iteration: 245, loss 0.008291061967611313
Iteration: 246, loss 0.008017173036932945
Iteration: 247, loss 0.0077527230605483055
Iteration: 248, loss 0.00749761750921607
Iteration: 249, loss 0.007251030765473843
Iteration: 250, loss 0.007012960501015186
Iteration: 251, loss 0.006782988086342812
Iteration: 252, loss 0.00656092306599021
Iteration: 253, loss 0.006346302106976509
Iteration: 254, loss 0.006139026954770088
Iteration: 255, loss 0.005938739515841007
Iteration: 256, loss 0.0057452660985291
Iteration: 257, loss 0.005558326840400696
Iteration: 258, loss 0.005377717781811953
Iteration: 259, loss 0.005203227512538433
Iteration: 260, loss 0.005034560803323984
Iteration: 261, loss 0.004871495999395847
Iteration: 262, loss 0.00471392460167408
Iteration: 263, loss 0.004561764188110828
Iteration: 264, loss 0.0044146329164505005
Iteration: 265, loss 0.00427234498783946
Iteration: 266, loss 0.0041347299702465

Iteration: 438, loss 2.0404375391080976e-05
Iteration: 439, loss 1.980578963411972e-05
Iteration: 440, loss 1.922477713378612e-05
Iteration: 441, loss 1.8663065930013545e-05
Iteration: 442, loss 1.811381480365526e-05
Iteration: 443, loss 1.758452708600089e-05
Iteration: 444, loss 1.7068123270291835e-05
Iteration: 445, loss 1.6567179045523517e-05
Iteration: 446, loss 1.6081217836472206e-05
Iteration: 447, loss 1.5611496564815752e-05
Iteration: 448, loss 1.5154952052398585e-05
Iteration: 449, loss 1.4710301911691204e-05
Iteration: 450, loss 1.4278889466368128e-05
Iteration: 451, loss 1.3862682862963993e-05
Iteration: 452, loss 1.3457021850626916e-05
Iteration: 453, loss 1.3064175618637819e-05
Iteration: 454, loss 1.268195228476543e-05
Iteration: 455, loss 1.230992711498402e-05
Iteration: 456, loss 1.1950630323553924e-05
Iteration: 457, loss 1.1602055565163027e-05
Iteration: 458, loss 1.1264321074122563e-05
Iteration: 459, loss 1.0935286809399258e-05
Iteration: 460, loss 1.061502734955865

## An example using Control Flow + Weight Sharing via PyTorch

In [35]:
class DynamicNet(torch.nn.Module):
    """Network whose depth is chosen at random on every forward pass.

    The input goes through ``linear1`` + ReLU, then through ``linear2`` + ReLU
    zero, one or two times (always with the same ``linear2`` weights), and
    finally through ``linear3``.
    """

    def __init__(self, D1, H, D2):
        """Create the input (D1 -> H), shared middle (H -> H) and output (H -> D2) layers."""
        super(DynamicNet, self).__init__()
        self.linear1 = torch.nn.Linear(in_features=D1, out_features=H)
        self.linear2 = torch.nn.Linear(in_features=H, out_features=H)
        self.linear3 = torch.nn.Linear(in_features=H, out_features=D2)

    def forward(self, X):
        """Return predictions for ``X`` using a randomly chosen number of middle layers."""
        phi = torch.clamp(self.linear1(X), min=0)
        # np.random.randint excludes the upper bound, so this is 0, 1 or 2
        n_repeats = np.random.randint(0, 3)
        for _ in range(n_repeats):
            # weight sharing: the same linear2 module is applied each time
            phi = torch.clamp(self.linear2(phi), min=0)
        return self.linear3(phi)

In [36]:
# Instantiate the dynamic-depth network with the notebook-wide dimensions.
model = DynamicNet(D1=D1, H=H, D2=D2)

In [37]:
# SGD with momentum for the dynamic-depth network.
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-4, momentum=0.9)

In [38]:
for step in range(500):
    optimizer.zero_grad()            # start every iteration with cleared gradients

    pred = model(X)                  # forward pass; the depth is re-drawn on each call
    loss = loss_function(pred, Y)
    print('Iteration: {}, loss {}'.format(step, loss.item()))

    loss.backward()                  # compute gradients for all learnable parameters
    optimizer.step()                 # SGD-with-momentum parameter update

Iteration: 0, loss 677.4867553710938
Iteration: 1, loss 698.7186279296875
Iteration: 2, loss 674.6714477539062
Iteration: 3, loss 672.2929077148438
Iteration: 4, loss 677.2782592773438
Iteration: 5, loss 668.3834838867188
Iteration: 6, loss 663.9821166992188
Iteration: 7, loss 641.7876586914062
Iteration: 8, loss 658.2225341796875
Iteration: 9, loss 481.8358154296875
Iteration: 10, loss 601.582275390625
Iteration: 11, loss 649.9966430664062
Iteration: 12, loss 398.4241638183594
Iteration: 13, loss 561.011474609375
Iteration: 14, loss 639.14111328125
Iteration: 15, loss 298.14892578125
Iteration: 16, loss 627.719482421875
Iteration: 17, loss 226.00283813476562
Iteration: 18, loss 611.1524658203125
Iteration: 19, loss 156.58444213867188
Iteration: 20, loss 446.5323181152344
Iteration: 21, loss 573.2296142578125
Iteration: 22, loss 552.8634643554688
Iteration: 23, loss 365.3282470703125
Iteration: 24, loss 494.82330322265625
Iteration: 25, loss 455.7518005371094
Iteration: 26, loss 130.20

Iteration: 223, loss 0.20190034806728363
Iteration: 224, loss 0.19052660465240479
Iteration: 225, loss 0.07675222307443619
Iteration: 226, loss 0.9073489308357239
Iteration: 227, loss 0.8573487997055054
Iteration: 228, loss 0.1881355345249176
Iteration: 229, loss 0.6658128499984741
Iteration: 230, loss 0.23005148768424988
Iteration: 231, loss 0.11765623092651367
Iteration: 232, loss 0.12717142701148987
Iteration: 233, loss 0.11684507876634598
Iteration: 234, loss 0.09383505582809448
Iteration: 235, loss 0.41961756348609924
Iteration: 236, loss 0.06217259168624878
Iteration: 237, loss 0.30967479944229126
Iteration: 238, loss 0.3754677474498749
Iteration: 239, loss 0.30436986684799194
Iteration: 240, loss 0.07144042104482651
Iteration: 241, loss 0.28662416338920593
Iteration: 242, loss 0.38532277941703796
Iteration: 243, loss 0.25485536456108093
Iteration: 244, loss 0.2425464242696762
Iteration: 245, loss 0.22224998474121094
Iteration: 246, loss 0.1965772807598114
Iteration: 247, loss 0.

Iteration: 424, loss 0.009740139357745647
Iteration: 425, loss 0.011780476197600365
Iteration: 426, loss 0.07630881667137146
Iteration: 427, loss 0.06987674534320831
Iteration: 428, loss 0.06270827353000641
Iteration: 429, loss 0.021902523934841156
Iteration: 430, loss 0.09731347858905792
Iteration: 431, loss 0.09522297233343124
Iteration: 432, loss 0.01769595593214035
Iteration: 433, loss 0.05272418633103371
Iteration: 434, loss 0.012199736200273037
Iteration: 435, loss 0.05659615620970726
Iteration: 436, loss 0.0559467189013958
Iteration: 437, loss 0.008888212963938713
Iteration: 438, loss 0.06985138356685638
Iteration: 439, loss 0.006937732454389334
Iteration: 440, loss 0.005917607806622982
Iteration: 441, loss 0.05449575558304787
Iteration: 442, loss 0.05216658115386963
Iteration: 443, loss 0.004408626351505518
Iteration: 444, loss 0.004428423009812832
Iteration: 445, loss 0.07483286410570145
Iteration: 446, loss 0.0713181346654892
Iteration: 447, loss 0.0629725307226181
Iteration: