## PyTorch Tutorial

IFT6135 – Representation Learning

A Deep Learning Course, January 2020

By Chin-Wei Huang 

(Adapted from Sandeep Subramanian's MILA tutorial)

## Creating Your Own Modules

### `torch.nn.Module`

In [0]:
import numpy as np

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter

import math
import numpy as np

``nn.Module`` is the base class for all neural network modules.

You should also write your own modules as subclasses of ``nn.Module``, so that they inherit the following properties:

* *Recursive structure*: you can wrap an instance of a Module class inside another one, which registers the inner one as its child (submodule)

* *Cudafiability*: you can easily cudafy the whole sequence of modules using `model.cuda()`

* *Serializable*: you can save your trained model (checkpoint, early stopping ...) using ``torch.save``, ``torch.load``

* *Parameters*: you can call ``model.parameters()`` to access all parameters at the same time.

etc. 


In [0]:
# modified from https://pytorch.org/docs/master/_modules/torch/nn/modules/linear.html#Linear
class Linear(nn.Module):
    r"""Fully connected layer: :math:`y = x A^T + b`.

    A trimmed-down re-implementation of ``torch.nn.Linear`` used to show how
    a custom module registers and initialises its parameters.

    Args:
        in_features: number of features of each input sample
        out_features: number of features of each output sample
        bias: when False, no additive bias is learned. Default: True

    Shape:
        - Input: :math:`(N, in\_features)`
        - Output: :math:`(N, out\_features)`

    Attributes:
        weight: learnable tensor of shape ``(out_features, in_features)``
        bias: learnable tensor of shape ``(out_features,)``, or None
    """

    def __init__(self, in_features, out_features, bias=True):
        super().__init__()
        self.in_features, self.out_features = in_features, out_features
        # torch.Tensor(...) only allocates storage; the actual values are
        # written by reset_parameters() below.
        self.weight = Parameter(torch.Tensor(out_features, in_features))
        # Registering None keeps ``self.bias`` a valid attribute even when
        # the layer has no bias.
        self.register_parameter(
            'bias', Parameter(torch.Tensor(out_features)) if bias else None
        )
        self.reset_parameters()

    def reset_parameters(self):
        """Draw the weight uniformly from ±1/sqrt(fan_in) and zero the bias."""
        fan_in = self.weight.size(1)
        limit = 1. / math.sqrt(fan_in)
        self.weight.data.uniform_(-limit, limit)
        if self.bias is None:
            return
        self.bias.data.zero_()

    def forward(self, input):
        """Apply the affine map to ``input`` and return the result."""
        return F.linear(input, self.weight, bias=self.bias)

    def extra_repr(self):
        """Return the summary string shown inside ``repr(module)``."""
        has_bias = self.bias is not None
        template = 'in_features={}, out_features={}, bias={}'
        return template.format(self.in_features, self.out_features, has_bias)

In [0]:
class MyLinear(nn.Module):
    r"""Linear layer computing ``input @ weight + bias``.

    Unlike ``nn.Linear``, the weight is stored as
    ``(in_features, out_features)``, i.e. transposed.

    Args:
        in_features: size of the last dimension of the input
        out_features: size of the last dimension of the output
        bias: If set to False, the layer will not learn an additive bias.
            Default: True

    Shape:
        - Input: :math:`(*, in\_features)`
        - Output: :math:`(*, out\_features)`
    """

    def __init__(self, in_features, out_features, bias=True):
        super(MyLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.Tensor(in_features, out_features))
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)
        # torch.Tensor(...) returns uninitialised memory (possibly NaN/inf),
        # so the parameters must be given real values before first use.
        self.reset_parameters()

    def reset_parameters(self):
        """Draw the weight uniformly from ±1/sqrt(in_features); zero the bias."""
        # The weight is (in_features, out_features), so fan-in is dim 0.
        fan_in = self.weight.size(0)
        bound = 1. / math.sqrt(fan_in) if fan_in > 0 else 0.
        with torch.no_grad():
            self.weight.uniform_(-bound, bound)
            if self.bias is not None:
                self.bias.zero_()

    def forward(self, input):
        """Return ``input @ weight`` plus the bias, if the layer has one."""
        # matmul equals mm for 2-D input and also accepts extra leading
        # batch dimensions.
        output = torch.matmul(input, self.weight)
        if self.bias is not None:
            output = output + self.bias
        return output

    def extra_repr(self):
        """Return the summary string shown inside ``repr(module)``."""
        return 'in_features={}, out_features={}, bias={}'.format(
            self.in_features, self.out_features, self.bias is not None
        )

        

In [11]:
# Random batch of 2 samples with 3 features each, as float32.
x = torch.as_tensor(np.random.randn(2, 3), dtype=torch.float32)

linear1 = nn.Linear(3, 4)
linear2 = MyLinear(3, 4)

# Copy linear1's parameters into linear2. nn.Linear keeps its weight as
# (out_features, in_features) while MyLinear uses (in_features, out_features),
# hence the transpose.
linear2.weight.data = linear1.weight.data.t()
linear2.bias.data = linear1.bias.data

# Element-wise comparison of the two layers' outputs.
print(linear1(x) == linear2(x))



tensor([[True, True, True, True],
        [True, True, True, True]])


### Resnet example

* Resnet blocks let the gradient flow through the hidden unit more directly and at the same time increase expressiveness

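Res(x) = F(x, {W}) + x  (when the input and output dimensions match)

Res(x) = F(x, {W1}) + W2 x  (when they differ, W2 projects x to the output dimension)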



In [0]:
# Q: write the resnet block, which projects the data properly if in_features != out_features
class ResLinear(nn.Module):
    r"""Residual linear block.

    Computes ``activation(linear(x)) + skip(x)``, where ``skip`` is the
    identity when ``in_features == out_features`` and a bias-free linear
    projection otherwise::

        Res(x) = F(x, {W}) + x          (same dimensions)
        Res(x) = F(x, {W1}) + W2 x      (different dimensions)

    Args:
        in_features: size of the last dimension of the input
        out_features: size of the last dimension of the output
        activation: module applied to the output of the inner linear layer.
            Default: ``nn.ReLU()``

    Shape:
        - Input: :math:`(N, in\_features)`
        - Output: :math:`(N, out\_features)`
    """

    def __init__(self, in_features, out_features, activation=nn.ReLU()):
        super(ResLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.activation = activation
        # Residual branch F(x, {W1}).
        self.linear = nn.Linear(in_features, out_features)
        # Skip branch: x cannot be added to an output of a different width,
        # so project it with W2 (no bias, matching the formula above).
        if in_features != out_features:
            self.project_linear = nn.Linear(
                in_features, out_features, bias=False
            )
        else:
            self.project_linear = None

    def forward(self, x):
        """Return the residual branch plus the (possibly projected) input."""
        inner = self.activation(self.linear(x))
        skip = x if self.project_linear is None else self.project_linear(x)
        return inner + skip

In [16]:
# Random batch of 2 samples with 3 features each, as float32.
x = torch.as_tensor(np.random.randn(2, 3), dtype=torch.float32)

res1 = nn.Linear(3, 3)
res2 = ResLinear(3, 5)

# Print the output shape of each layer for the same input batch.
print(res1(x).shape)
print(res2(x).shape)

torch.Size([2, 3])
torch.Size([2, 5])


### Putting things all together: Sequential, Parameter updates

In [0]:
class MyModel(nn.Module):
    """Three-layer classifier mapping 784 input features to 10 class scores.

    Args:
        Linear: callable ``Linear(in_features, out_features)`` returning the
            module used for each layer. Default: ``ResLinear``
    """

    def __init__(self, Linear=ResLinear):
        super(MyModel, self).__init__()

        # The network outputs raw class scores (logits); no softmax here
        # because nn.CrossEntropyLoss applies log-softmax itself.
        self.predict_ = nn.Sequential(
            Linear(784, 328),
            nn.ReLU(),
            Linear(328, 328),
            nn.ReLU(),
            Linear(328, 10),
        )

        self.criterion = nn.CrossEntropyLoss()

    def predict_proba(self, x):
        """Return the class probabilities the network assigns to ``x``."""
        # Run the network first, then normalise over the class dimension.
        return F.softmax(self.predict_(x), dim=-1)

    def predict(self, x):
        """Return the index of the most probable class for each sample."""
        # With ``dim`` given, ``max`` returns (max_value, argmax).
        return torch.max(self.predict_proba(x), dim=-1)[1]

    def loss(self, x, target):
        """Return the cross-entropy between the network output and ``target``."""
        logits = self.predict_(x)
        return self.criterion(logits, target)

# CrossEntropyLoss -> feed it raw logits (no output nonlinearity)
# NLLLoss -> feed it log-softmax outputs
        







Caveat: ``CrossEntropyLoss`` versus ``NLLLoss``


* ``CrossEntropyLoss`` takes *pre-softmax* values (logits) as input

* ``NLLLoss`` takes *log-softmax* values (log-probabilities) as input


In [18]:
# One sample of 10 normally distributed logits and one random class index
# drawn from {0, ..., 9}.
y = torch.empty(1, 10).normal_()  # logit
t = torch.as_tensor(np.random.choice(10, size=1))

loss1 = nn.CrossEntropyLoss()
loss2 = nn.NLLLoss()

# CrossEntropyLoss on logits should match NLLLoss on log-softmax of the
# same logits.
print(loss1(y, t))
print(loss2(F.log_softmax(y, dim=1), t))


tensor(2.1038)
tensor(2.1038)


In [19]:
# Random batch: 64 float32 samples with 784 features each, and 64 class
# labels drawn from {0, ..., 9}.
x = torch.as_tensor(np.random.randn(64, 784), dtype=torch.float32)
t = torch.as_tensor(np.random.choice(10, size=64))

# Loss of a freshly initialised model on that batch.
model = MyModel()
print(model.loss(x, t))

tensor(2.4775, grad_fn=<NllLossBackward>)


### Updating Parameters (Manually)

In [36]:
# Random batch: 64 float32 samples with 784 features each, and 64 class
# labels drawn from {0, ..., 9}.
x = torch.from_numpy(np.random.randn(64, 784)).float()
t = torch.from_numpy(np.random.choice(10, size=64))
model = MyModel()

lr = 0.1

for i in range(10):
    loss = model.loss(x, t)
    loss.backward()

    # Plain gradient descent: param <- param - lr * grad.
    # no_grad() keeps the update itself out of the autograd graph; it is the
    # supported replacement for writing through ``param.data``.
    with torch.no_grad():
        for param in model.parameters():
            if param.grad is None:
                # Parameter took no part in the loss; nothing to update.
                continue
            param.sub_(lr * param.grad)
            # backward() accumulates into .grad, so clear it for the next
            # iteration.
            param.grad.zero_()

    print(loss.item())

2.4042789936065674
1.5861577987670898
1.0716038942337036
0.7304816246032715
0.47881975769996643
0.3232457637786865
0.23248334228992462
0.17822784185409546
0.14187084138393402
0.11654175817966461


### Updating Parameters (``torch.optim``)

In [38]:
# Random batch: 64 float32 samples with 784 features each, and 64 class
# labels drawn from {0, ..., 9}.
x = torch.as_tensor(np.random.randn(64, 784), dtype=torch.float32)
t = torch.as_tensor(np.random.choice(10, size=64))
model = MyModel()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.0)

for i in range(10):
    # backward() accumulates into .grad, so clear the previous step's
    # gradients before computing new ones.
    optimizer.zero_grad()
    loss = model.loss(x, t)
    loss.backward()
    # Let the optimizer apply the SGD update to every parameter.
    optimizer.step()
    print(loss)

# what happens if you use momentum? 

tensor(2.4058, grad_fn=<NllLossBackward>)
tensor(1.5623, grad_fn=<NllLossBackward>)
tensor(1.0356, grad_fn=<NllLossBackward>)
tensor(0.7061, grad_fn=<NllLossBackward>)
tensor(0.4813, grad_fn=<NllLossBackward>)
tensor(0.3393, grad_fn=<NllLossBackward>)
tensor(0.2466, grad_fn=<NllLossBackward>)
tensor(0.1892, grad_fn=<NllLossBackward>)
tensor(0.1511, grad_fn=<NllLossBackward>)
tensor(0.1243, grad_fn=<NllLossBackward>)
