# Learning Pytorch with Examples

地址: http://pytorch.org/tutorials/beginner/pytorch_with_examples.html

特点：
+ Tensor: n-dimensional Tensor，与numpy的数组类似，不过可以在GPU上运行
+ 在建立和训练神经网络的过程中，自动地进行了微分计算

## 一、tensor
### warm-up： numpy
以下用numpy定义神经网络的训练过程，一个隐藏层，使用relu作为激活函数

In [2]:
import numpy as np

# N is the batch size, D_in the input dimension,
# H the hidden dimension and D_out the output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input batch and random regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialised weights for the two layers.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6  # gradient-descent step size
for t in range(500):
    # Forward pass: linear layer -> ReLU -> linear layer.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squared-errors loss, printed once per iteration.
    loss = np.square(y_pred - y).sum()
    print('%d %.3f' % (t, loss))

    # Backward pass: gradients of the loss with respect to w1 and w2.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU passes no gradient where its input was negative.
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = np.dot(x.T, grad_h)

    # Gradient-descent update, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 27738170.233
1 23004569.956
2 22235502.265
3 22243672.385
4 21071532.563
5 17824287.247
6 13259423.525
7 8795917.039
8 5449931.573
9 3311109.942
10 2069897.110
11 1368955.506
12 968745.847
13 729353.057
14 576942.076
15 472763.792
16 396925.486
17 338929.907
18 292774.566
19 254955.220
20 223405.181
21 196693.464
22 173832.758
23 154168.776
24 137167.359
25 122402.469
26 109505.475
27 98190.995
28 88230.104
29 79435.503
30 71646.440
31 64734.532
32 58585.547
33 53104.700
34 48226.378
35 43859.357
36 39939.983
37 36414.240
38 33238.507
39 30373.657
40 27784.694
41 25441.635
42 23318.355
43 21392.603
44 19641.835
45 18049.525
46 16603.895
47 15286.194
48 14083.542
49 12985.329
50 11980.928
51 11061.358
52 10218.863
53 9446.284
54 8737.354
55 8086.410
56 7487.946
57 6937.467
58 6430.778
59 5964.108
60 5534.247
61 5137.754
62 4771.999
63 4434.194
64 4122.069
65 3833.560
66 3566.769
67 3319.768
68 3091.114
69 2879.293
70 2682.990
71 2500.943
72 2332.077
73 2175.362
74 2029.890
75 1894.789

### Pytorch：Tensors
PyTorch可以利用GPU进行数值计算；

In [8]:
import torch

dtype = torch.FloatTensor  # use torch.cuda.FloatTensor to run on a GPU

# N is the batch size, D_in the input dimension,
# H the hidden dimension and D_out the output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input batch and random targets.
x = torch.randn(N, D_in).type(dtype)
y = torch.randn(N, D_out).type(dtype)

# Randomly initialised weights for the two layers.
w1 = torch.randn(D_in, H).type(dtype)
w2 = torch.randn(H, D_out).type(dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear layer -> ReLU -> linear layer.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squared-errors loss, printed once per iteration.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss)

    # Backward pass, written out by hand: gradients of the loss
    # with respect to w1 and w2.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU passes no gradient where its input was negative.
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent update, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

(0, 31722876.45816654)
(1, 29271825.155434847)
(2, 31499788.745518453)
(3, 32987204.901493803)
(4, 29711372.11199583)
(5, 21459894.740109563)
(6, 12572167.933773607)
(7, 6486681.690743292)
(8, 3358820.051641375)
(9, 1930535.4947736037)
(10, 1277501.3703937773)
(11, 948615.8304938318)
(12, 757647.215241018)
(13, 630146.5795451179)
(14, 535987.0738979937)
(15, 462016.22295444726)
(16, 401763.88623003085)
(17, 351677.4613928657)
(18, 309437.2385102692)
(19, 273436.3193599299)
(20, 242574.57448416986)
(21, 215999.59478717155)
(22, 193005.67338942585)
(23, 172969.45412969854)
(24, 155449.9672924392)
(25, 140076.6709764854)
(26, 126548.06246980504)
(27, 114578.2116462418)
(28, 103954.09307577371)
(29, 94499.27540051984)
(30, 86071.90781065637)
(31, 78538.29487033724)
(32, 71783.97115309228)
(33, 65707.39761513378)
(34, 60229.46708867896)
(35, 55281.81839130964)
(36, 50805.26670372207)
(37, 46741.50786072278)
(38, 43056.340874272355)
(39, 39715.562143852)
(40, 36667.581972629705)
(41, 33886.0

## Autograd

上面是自己写迭代的过程，当然我们有框架

## pytorch: Variables and autograd
autograd其实也是建立了计算图，节点就是variables,边就是function

Variable是这样一个class：它包装了一个Tensor，同时也是计算图中的一个node。若x表示一个Variable，那么x.data就是一个Tensor，而x.grad也是一个Variable，其中存放了某个标量（例如loss）关于x的梯度值。

In [19]:
import torch
from torch.autograd import Variable

dtype = torch.FloatTensor

N, D_in, H, D_out = 64, 1000, 100, 10


def _random_variable(rows, cols, trainable):
    """Wrap a rows x cols standard-normal tensor of type `dtype` in a Variable.

    `trainable` is passed through as requires_grad, i.e. it selects whether
    gradients are computed for this Variable during the backward pass.
    """
    return Variable(torch.randn(rows, cols).type(dtype), requires_grad=trainable)


# Input and target data: no gradients are needed with respect to them.
x = _random_variable(N, D_in, False)
y = _random_variable(N, D_out, False)

# Weights: gradients with respect to them are required.
w1 = _random_variable(D_in, H, True)
w2 = _random_variable(H, D_out, True)

learning_rate = 1e-6

In [20]:
x

Variable containing:
-3.2492e-01 -7.8384e-01 -9.9097e-02  ...  -1.2246e+00 -3.9574e-02  1.1883e+00
 1.2497e-01 -5.6167e-01  8.3817e-01  ...   5.7982e-01 -5.8394e-01  4.5226e-02
 1.2286e+00 -1.3853e-01  7.7313e-02  ...  -1.5991e+00  1.4460e+00  1.3473e+00
                ...                   ⋱                   ...                
 1.3601e-01 -1.5036e+00 -6.2605e-01  ...   6.1462e-01 -1.6344e+00  7.4872e-01
-3.7263e-01  7.3490e-01 -4.4305e-01  ...   2.9885e+00 -8.0849e-01  9.9337e-01
 5.2417e-01  2.2697e+00  1.4125e+00  ...  -6.8662e-01 -4.1499e-01 -1.8078e-02
[torch.FloatTensor of size 64x1000]

In [23]:
# Training loop
for t in range(500):
    # Forward pass: multiply the input by w1, apply ReLU, multiply by w2.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute the loss with autograd-tracked operations. The sum is a
    # zero-dimensional tensor, so its value is read with loss.item();
    # indexing it as loss.data[0] fails on current PyTorch.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Backward pass: computes the gradient of the loss with respect to every
    # tensor created with requires_grad=True. Afterwards w1.grad and w2.grad
    # hold the gradients of the loss with respect to w1 and w2.
    loss.backward()

    # The update itself must not be recorded in the computation graph,
    # so it is performed with gradient tracking disabled.
    with torch.no_grad():
        # Gradient-descent step.
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Gradients accumulate across backward() calls, so they have to be
        # cleared by hand before the next iteration.
        w1.grad.zero_()
        w2.grad.zero_()

(0, 42370964.0)
(1, 39746400.0)
(2, 37719820.0)
(3, 30059538.0)
(4, 18920020.0)
(5, 9890899.0)
(6, 5029859.0)
(7, 2850178.75)
(8, 1886663.375)
(9, 1403682.5)
(10, 1117067.625)
(11, 920324.625)
(12, 772466.3125)
(13, 655601.6875)
(14, 560829.5625)
(15, 482714.125)
(16, 417619.53125)
(17, 362914.375)
(18, 316758.40625)
(19, 277498.53125)
(20, 243959.21875)
(21, 215142.015625)
(22, 190282.78125)
(23, 168749.796875)
(24, 150022.9375)
(25, 133716.171875)
(26, 119463.5078125)
(27, 106953.0)
(28, 95936.59375)
(29, 86241.6171875)
(30, 77671.3125)
(31, 70073.15625)
(32, 63323.46875)
(33, 57308.55859375)
(34, 51937.25390625)
(35, 47130.75)
(36, 42824.25)
(37, 38955.6015625)
(38, 35478.12109375)
(39, 32346.517578125)
(40, 29521.396484375)
(41, 26967.3203125)
(42, 24658.083984375)
(43, 22566.345703125)
(44, 20667.87109375)
(45, 18944.37890625)
(46, 17380.78125)
(47, 15959.0283203125)
(48, 14664.0048828125)
(49, 13483.7490234375)
(50, 12407.03515625)
(51, 11422.671875)
(52, 10522.634765625)
(53, 96

## Pytorch: define new autograd functions


In [40]:
# Custom autograd function
class MyReLU(torch.autograd.Function):
    """ReLU implemented as a custom autograd Function.

    forward and backward are static methods that receive a context object
    (ctx) used to carry tensors from the forward to the backward pass. The
    function is invoked through ``MyReLU.apply(tensor)``; the older style of
    instantiating the class and calling the instance is rejected by current
    PyTorch releases.
    """

    @staticmethod
    def forward(ctx, inp):
        """Return ``inp`` with negative entries clamped to zero.

        The input is saved on ``ctx`` because backward needs to know where
        it was negative.
        """
        ctx.save_for_backward(inp)
        return inp.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient of the loss with respect to the input.

        ``grad_output`` is the gradient with respect to the output; it is
        passed through unchanged except where the saved input was negative,
        where it is set to zero.
        """
        inp, = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input[inp < 0] = 0
        return grad_input


dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Create random Tensors for weights, and wrap them in Variables.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Custom Functions are applied through their `apply` method.
    relu = MyReLU.apply

    # Forward pass: compute predicted y; ReLU is computed with our custom
    # autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss. The loss is a zero-dimensional tensor, so its
    # value is read with .item() rather than by indexing.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # The update must not be tracked by autograd.
    with torch.no_grad():
        # Update weights using gradient descent
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()

(0, 31479708.0)
(1, 27399938.0)
(2, 24686312.0)
(3, 20535026.0)
(4, 15248111.0)
(5, 10079246.0)
(6, 6266191.5)
(7, 3850696.5)
(8, 2468281.75)
(9, 1686805.0)
(10, 1235129.125)
(11, 957120.5625)
(12, 773299.0625)
(13, 642673.0)
(14, 544366.9375)
(15, 467057.71875)
(16, 404479.0)
(17, 352698.25)
(18, 309301.40625)
(19, 272569.78125)
(20, 241189.96875)
(21, 214247.8125)
(22, 190907.984375)
(23, 170630.046875)
(24, 153013.859375)
(25, 137552.625)
(26, 123963.7578125)
(27, 111964.4375)
(28, 101316.2265625)
(29, 91847.421875)
(30, 83407.3671875)
(31, 75869.1484375)
(32, 69125.34375)
(33, 63075.5234375)
(34, 57633.1875)
(35, 52728.28125)
(36, 48302.9140625)
(37, 44301.33203125)
(38, 40673.75390625)
(39, 37385.3828125)
(40, 34399.4921875)
(41, 31683.951171875)
(42, 29209.041015625)
(43, 26951.705078125)
(44, 24892.158203125)
(45, 23010.794921875)
(46, 21290.580078125)
(47, 19713.865234375)
(48, 18268.05859375)
(49, 16939.64453125)
(50, 15719.3759765625)
(51, 14597.3525390625)
(52, 13565.0527343

## Tensorflow: static graphs

相较于tf，pytorch使用的是动态计算图


In [1]:
import tensorflow as tf
import numpy as np

import tensorflow as tf
import numpy as np

# Step 1: describe the computation as a graph. Nothing is evaluated yet.

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Placeholders stand in for the input and target data; real values are
# supplied each time the graph is executed.
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

# The weights are Variables initialised with random data. A TensorFlow
# Variable keeps its value from one execution of the graph to the next.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

# Forward pass expressed as graph operations: linear -> ReLU -> linear.
# These lines only add nodes to the graph; no numbers are computed here.
h = tf.matmul(x, w1)
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# Sum-of-squared-errors loss, also as a graph node.
loss = tf.reduce_sum((y - y_pred) ** 2.0)

# Graph nodes for the gradient of the loss with respect to w1 and w2.
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# Gradient-descent update. In TensorFlow the act of updating the weights is
# itself part of the computational graph (in PyTorch it happens outside the
# graph), so new_w1 and new_w2 must be evaluated for the update to happen.
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

# Step 2: open a session and actually execute the graph.
with tf.Session() as sess:
    # Run the initializer once so that w1 and w2 receive their values.
    sess.run(tf.global_variables_initializer())

    # Numpy arrays holding the actual data for the inputs x and targets y.
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)

    # The same data is bound to the placeholders on every execution, and
    # the same three tensors are evaluated each time.
    feed = {x: x_value, y: y_value}
    fetches = [loss, new_w1, new_w2]
    for _ in range(500):
        # Evaluating new_w1 and new_w2 performs the weight update; only the
        # loss value (first fetched result, a numpy value) is kept.
        loss_value = sess.run(fetches, feed_dict=feed)[0]
        print(loss_value)

3.15738e+07
3.05755e+07
3.13902e+07
2.92008e+07
2.30278e+07
1.49094e+07
8.45488e+06
4.54605e+06
2.58244e+06
1.6276e+06
1.1475e+06
879927.0
712279.0
595227.0
506476.0
436013.0
378400.0
330326.0
289791.0
255306.0
225822.0
200429.0
178471.0
159363.0
142651.0
127994.0
115089.0
103684.0
93582.8
84619.0
76643.6
69526.6
63161.1
57458.0
52336.8
47730.2
43580.8
39836.6
36453.2
33391.4
30617.1
28099.4
25812.0
23732.7
21840.1
20114.4
18540.5
17102.3
15788.2
14586.2
13484.4
12474.2
11547.1
10695.9
9913.67
9194.01
8531.88
7921.97
7359.65
6841.05
6362.56
5920.47
5511.74
5134.49
4785.38
4462.17
4162.5
3884.61
3626.84
3387.54
3165.3
2958.85
2766.85
2588.3
2422.12
2267.42
2123.34
1989.24
1864.2
1747.57
1638.73
1537.15
1442.3
1353.68
1270.88
1193.48
1121.08
1053.35
989.971
930.626
875.069
823.01
774.229
728.512
685.64
645.437
607.716
572.326
539.11
507.908
478.608
451.09
425.228
400.928
378.089
356.604
336.401
317.395
299.515
282.689
266.849
251.937
237.891
224.659
212.198
200.458
189.388
178.958
169.12

## nn module

### PyTorch:nn
However, for large neural networks, raw autograd can be a bit too low-level.

In [2]:
import torch
from torch.autograd import Variable

N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

# Random input and target tensors wrapped in Variables; neither one
# requires gradients.
x = Variable(torch.randn(N, D_in), requires_grad=False)
y = Variable(torch.randn(N, D_out), requires_grad=False)

In [3]:
# Build the layers one by one, then chain them: linear -> ReLU -> linear.
hidden_layer = torch.nn.Linear(D_in, H)
activation = torch.nn.ReLU()
output_layer = torch.nn.Linear(H, D_out)
model = torch.nn.Sequential(hidden_layer, activation, output_layer)

# Bare expression: in a notebook cell this displays the model structure.
model

Sequential (
  (0): Linear (1000 -> 100)
  (1): ReLU ()
  (2): Linear (100 -> 10)
)

In [4]:
# Sum (rather than average) the squared errors over all elements.
# reduction='sum' is the replacement for the deprecated size_average=False.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4

for t in range(500):
    # Forward pass: feed the input through the model.
    y_pred = model(x)

    # The loss is a zero-dimensional tensor; .item() reads its value as a
    # Python number (indexing it with [0] fails on current PyTorch).
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear the gradients left over from the previous iteration.
    model.zero_grad()

    # Backward pass: compute the gradient of the loss with respect to all
    # parameters of the model.
    loss.backward()

    # Gradient-descent step on every parameter, with gradient tracking
    # disabled so the update is not recorded in the graph.
    with torch.no_grad():
        for params in model.parameters():
            params -= learning_rate * params.grad
        

(0, 584.8945922851562)
(1, 540.41748046875)
(2, 501.9297180175781)
(3, 468.1427917480469)
(4, 438.3111267089844)
(5, 411.27655029296875)
(6, 386.68792724609375)
(7, 364.04931640625)
(8, 343.27142333984375)
(9, 323.9523010253906)
(10, 306.0048828125)
(11, 289.1953430175781)
(12, 273.3241271972656)
(13, 258.3806457519531)
(14, 244.35943603515625)
(15, 231.22491455078125)
(16, 218.80455017089844)
(17, 207.00038146972656)
(18, 195.7586669921875)
(19, 185.0979461669922)
(20, 174.98509216308594)
(21, 165.4166717529297)
(22, 156.34658813476562)
(23, 147.78314208984375)
(24, 139.70664978027344)
(25, 132.06488037109375)
(26, 124.81795501708984)
(27, 117.95894622802734)
(28, 111.46029663085938)
(29, 105.3214111328125)
(30, 99.52008819580078)
(31, 94.02864074707031)
(32, 88.84369659423828)
(33, 83.92688751220703)
(34, 79.28556823730469)
(35, 74.91802978515625)
(36, 70.78887176513672)
(37, 66.89456939697266)
(38, 63.21010971069336)
(39, 59.73413848876953)
(40, 56.44281005859375)
(41, 53.3472976684

**注**：总结上面这个过程，就是先建立一个计算图model，里面已经包含了激活函数等信息，之后通过model计算预测值，指定一个loss function作为目标，之后每次先清空梯度，之后再自动计算梯度，利用梯度下降进行参数的迭代

### Pytorch： optim
除了普通的梯度下降之外，还可以使用哪些optimizer呢？
optim包提供了一系列优化算法

In [5]:
import torch
from torch.autograd import Variable

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
# Sum the squared errors instead of averaging them. reduction='sum' is the
# replacement for the deprecated size_average=False.
loss_fn = torch.nn.MSELoss(reduction='sum')

In [7]:
# Use the optim package to define the optimizer. Here we use Adam; its first
# argument is the collection of parameters that it should update.

learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: predict y by passing x to the model.
    y_pred = model(x)

    # The loss is a zero-dimensional tensor; .item() reads its value as a
    # Python number (indexing it with [0] fails on current PyTorch).
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear the gradients of the parameters handled by the optimizer.
    optimizer.zero_grad()

    # Backward pass: compute the gradients of the loss.
    loss.backward()

    # Let the optimizer perform one update step on the parameters.
    optimizer.step()

(0, 659.6001586914062)
(1, 642.6619873046875)
(2, 626.1334838867188)
(3, 610.0513305664062)
(4, 594.4301147460938)
(5, 579.3054809570312)
(6, 564.59619140625)
(7, 550.328369140625)
(8, 536.456787109375)
(9, 522.900390625)
(10, 509.7463684082031)
(11, 496.99810791015625)
(12, 484.6109313964844)
(13, 472.6658935546875)
(14, 461.1160583496094)
(15, 449.943115234375)
(16, 439.1109924316406)
(17, 428.5198059082031)
(18, 418.2001953125)
(19, 408.1749572753906)
(20, 398.3612060546875)
(21, 388.7661437988281)
(22, 379.43609619140625)
(23, 370.35009765625)
(24, 361.4988098144531)
(25, 352.8426513671875)
(26, 344.3965759277344)
(27, 336.1748046875)
(28, 328.14666748046875)
(29, 320.299072265625)
(30, 312.6243896484375)
(31, 305.17071533203125)
(32, 297.9551696777344)
(33, 290.9435119628906)
(34, 284.0993957519531)
(35, 277.42510986328125)
(36, 270.91357421875)
(37, 264.5422058105469)
(38, 258.29412841796875)
(39, 252.1604766845703)
(40, 246.16867065429688)
(41, 240.29547119140625)
(42, 234.53248

## Pytorch:自定义 nn 模块

In [12]:
import torch
from torch.autograd import Variable

class TwoLayerNet(torch.nn.Module):
    """Two linear layers with a ReLU non-linearity in between."""

    def __init__(self, D_in, H, D_out):
        """Create the two linear layers.

        D_in is the input width, H the hidden width and D_out the output
        width.
        """
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Return linear2(relu(linear1(x)))."""
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)
    
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input and target data wrapped in Variables.
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Instantiate the custom module.
model = TwoLayerNet(D_in, H, D_out)

# Summed squared error (reduction='sum' replaces the deprecated
# size_average=False) and plain SGD over the model's parameters.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

for t in range(400):
    # Forward pass and loss. The loss is a zero-dimensional tensor; .item()
    # reads its value (indexing it with [0] fails on current PyTorch).
    y_pred = model(x)
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Clear old gradients, backpropagate, then update the parameters.
    optimizer.zero_grad()

    loss.backward()
    optimizer.step()

(0, 636.857666015625)
(1, 585.065185546875)
(2, 541.4152221679688)
(3, 503.3388671875)
(4, 469.7135925292969)
(5, 439.6905212402344)
(6, 412.68634033203125)
(7, 388.1914978027344)
(8, 365.7237548828125)
(9, 344.98992919921875)
(10, 325.8662109375)
(11, 308.068359375)
(12, 291.5516662597656)
(13, 276.0881652832031)
(14, 261.58489990234375)
(15, 247.86024475097656)
(16, 234.88848876953125)
(17, 222.6004638671875)
(18, 210.9210968017578)
(19, 199.821533203125)
(20, 189.2509002685547)
(21, 179.1573028564453)
(22, 169.5316619873047)
(23, 160.3347625732422)
(24, 151.60289001464844)
(25, 143.29525756835938)
(26, 135.4075164794922)
(27, 127.9195327758789)
(28, 120.79448699951172)
(29, 114.0289535522461)
(30, 107.60701751708984)
(31, 101.52124786376953)
(32, 95.75874328613281)
(33, 90.2998275756836)
(34, 85.12461853027344)
(35, 80.23416900634766)
(36, 75.59407043457031)
(37, 71.20939636230469)
(38, 67.0674819946289)
(39, 63.148399353027344)
(40, 59.45237731933594)
(41, 55.966270446777344)
(42, 

### Pytorch: control Flow + weight sharing


In [14]:
import random
import torch
from torch.autograd import Variable


class DynamicNet(torch.nn.Module):
    """Network whose depth is chosen at random on every forward pass.

    One hidden linear layer is shared: it is applied between zero and three
    times per call, which demonstrates both Python control flow inside the
    forward pass and reuse of the same module (weight sharing).
    """

    def __init__(self, D_in, H, D_out):
        """Create the input, middle (shared) and output linear layers."""
        super(DynamicNet, self).__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Compute the prediction for ``x`` with a randomly chosen depth.

        After the input layer and a ReLU, ``random.randint(0, 3)`` decides
        how many times the shared middle layer (each followed by a ReLU) is
        applied before the output layer. Because the computation graph is
        built anew on each call, an ordinary Python loop is enough to
        express this.
        """
        hidden = torch.clamp(self.input_linear(x), min=0)
        repeats = random.randint(0, 3)
        for _ in range(repeats):
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
        return self.output_linear(hidden)


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Construct our model by instantiating the class defined above
model = DynamicNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum.
# reduction='sum' replaces the deprecated size_average=False.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss. The loss is a zero-dimensional tensor; .item()
    # reads its value (indexing it with [0] fails on current PyTorch).
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

(0, 627.1441650390625)
(1, 634.2400512695312)
(2, 623.0227661132812)
(3, 671.3903198242188)
(4, 616.3870239257812)
(5, 612.2980346679688)
(6, 544.0466918945312)
(7, 603.7003173828125)
(8, 559.228271484375)
(9, 550.2503662109375)
(10, 611.5838623046875)
(11, 590.873779296875)
(12, 586.4982299804688)
(13, 506.43585205078125)
(14, 605.05810546875)
(15, 476.99591064453125)
(16, 456.77288818359375)
(17, 311.3760986328125)
(18, 537.850341796875)
(19, 259.3171691894531)
(20, 509.2892761230469)
(21, 566.8246459960938)
(22, 322.1181335449219)
(23, 447.46514892578125)
(24, 161.2103729248047)
(25, 146.1382598876953)
(26, 123.00212860107422)
(27, 360.5242614746094)
(28, 233.76382446289062)
(29, 69.33521270751953)
(30, 313.17877197265625)
(31, 190.68319702148438)
(32, 270.4389343261719)
(33, 241.81105041503906)
(34, 325.2544250488281)
(35, 126.492431640625)
(36, 164.0634307861328)
(37, 141.58729553222656)
(38, 192.88784790039062)
(39, 161.54933166503906)
(40, 124.7602310180664)
(41, 147.91360473632