# Tutoriel pytorch - TP3 - IFT725

Tel que mentionné dans l'énoncé du travail, vous devez recopier les blocs de code du tutoriel suivant

https://pytorch.org/tutorials/beginner/pytorch_with_examples.html

en donnant, pour chaque bloc, une description en format "markdown" de son contenu.

### Entrainement d'un réseau de neurones pleinement connecté
1. On initialise les valeurs du réseau (taille des 'batchs', dimension de la couche d'entrée, dimension de la couche cachée, dimension de la couche de sortie).
2. On crée des données et leurs étiquettes (labels) de façon aléatoire.
3. On initialise les poids de façon aléatoire.
4. On fait une 'forward pass' avec ReLU comme fonction d'activation.
5. On calcule la loss (somme des carrés des écarts entre les prédictions et les étiquettes).
6. On fait une 'backward pass' (avec ReLU comme fonction d'activation toujours) afin d'obtenir les gradients de W1 et W2.
7. On met à jour les poids W1 et W2 en fonction du taux d'apprentissage et des gradients (et sans régularisation).

In [1]:
# -*- coding: utf-8 -*-
import numpy as np

# Network dimensions: N samples per batch, D_in input features,
# H hidden units and D_out output values.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and regression targets sampled from a standard normal.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Initial weights of the two layers, also sampled from a standard normal.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear layer -> ReLU -> linear layer.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # The loss is the sum of squared errors over the whole batch.
    residual = y_pred - y
    loss = np.sum(np.square(residual))
    print(t, loss)

    # Backward pass written out by hand with the chain rule.
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU lets no gradient through where its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Gradient-descent step; the arrays are modified in place.
    for weight, grad in ((w1, grad_w1), (w2, grad_w2)):
        weight -= learning_rate * grad

0 44543076.11211847
1 48060038.22883322
2 52334266.97398868
3 45096419.3872671
4 27344384.99037444
5 12075277.247493777
6 5034078.902778637
7 2616866.4558130093
8 1762369.6566922711
9 1369691.406615529
10 1128535.3743750846
11 952524.8560839768
12 814201.841072473
13 701768.9703973496
14 608894.8301627148
15 531336.0492049719
16 466026.5979096919
17 410704.23179648275
18 363603.5040583771
19 323156.23040114134
20 288195.8669594285
21 257846.82033409853
22 231363.89356698972
23 208166.1580249479
24 187776.90990070396
25 169770.8198814658
26 153833.8464597719
27 139666.81245820975
28 127040.79305966565
29 115757.92319555196
30 105688.09342307775
31 96641.76039025458
32 88503.30399712356
33 81171.50274316638
34 74544.97376850905
35 68542.45695557096
36 63097.18034291592
37 58148.90498370045
38 53646.65165201956
39 49539.7831404281
40 45793.2850076242
41 42370.38294055263
42 39236.73784058924
43 36364.307568508986
44 33729.26055901945
45 31307.908437882656
46 29081.782588646416
47 27034.26

413 0.0022504563804405004
414 0.0021661979486458003
415 0.002085046932558557
416 0.002006910427895247
417 0.0019317214461915572
418 0.001859341948524716
419 0.0017896849250123798
420 0.0017226428653642476
421 0.0016581210449204396
422 0.001596024400100057
423 0.0015362592687780548
424 0.0014787427692579467
425 0.0014233917421365833
426 0.001370116960131576
427 0.0013188616609226592
428 0.001269525154447575
429 0.0012220592126084783
430 0.0011763696555039695
431 0.001132367509731854
432 0.0010900297577996813
433 0.0010492626099354633
434 0.0010100261123990922
435 0.0009722578280022097
436 0.0009359126361690312
437 0.0009009397066040977
438 0.0008672673669433623
439 0.0008348563019272218
440 0.0008036638672425367
441 0.0007736407198174308
442 0.0007447393014425804
443 0.0007169183720789684
444 0.0006901420723990367
445 0.0006644008316221699
446 0.0006395919352479946
447 0.0006157239959306783
448 0.000592736824650436
449 0.0005706077855467801
450 0.0005493092059694425
451 0.00052880546288

### Entrainement d'un réseau de neurones pleinement connecté
1. On fixe le 'type' de processeur à utiliser (CPU / GPU).
2. On initialise les valeurs du réseau (taille des 'batchs', dimension de la couche d'entrée, dimension de la couche cachée, dimension de la couche de sortie).
3. On crée des données et leurs étiquettes (labels) de façon aléatoire.
4. On initialise les poids de façon aléatoire.
5. On fait une 'forward pass' avec ReLU comme fonction d'activation.
6. On calcule la loss (somme des carrés des écarts entre les prédictions et les étiquettes).
7. On fait une 'backward pass' (avec ReLU comme fonction d'activation toujours) afin d'obtenir les gradients de W1 et W2.
8. On met à jour les poids W1 et W2 en fonction du taux d'apprentissage et des gradients (et sans régularisation).

In [5]:
# -*- coding: utf-8 -*-

import torch


dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Batch size, input width, hidden width and output width of the network.
N, D_in, H, D_out = 64, 1000, 100, 10

# Every tensor below is created on the same device with the same dtype.
tensor_options = {"device": device, "dtype": dtype}

# Random inputs and regression targets.
x = torch.randn(N, D_in, **tensor_options)
y = torch.randn(N, D_out, **tensor_options)

# Random initial weights for the two linear layers.
w1 = torch.randn(D_in, H, **tensor_options)
w2 = torch.randn(H, D_out, **tensor_options)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear layer, ReLU (clamp at zero), linear layer.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum of squared errors, converted to a Python float for printing.
    error = y_pred - y
    loss = torch.sum(error.pow(2)).item()
    if (t + 1) % 100 == 0:
        print(t, loss)

    # Hand-written backward pass (chain rule through both layers).
    grad_y_pred = 2.0 * error
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU lets no gradient through where its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Plain gradient-descent step, applied in place.
    w1.sub_(learning_rate * grad_w1)
    w2.sub_(learning_rate * grad_w2)

99 633.27734375
199 2.265718698501587
299 0.013112684711813927
399 0.0002729864208959043
499 4.573642218019813e-05


### Entrainement d'un réseau de neurones pleinement connecté
1. On fixe le 'type' de processeur à utiliser (CPU / GPU).
2. On initialise les valeurs du réseau (taille des 'batchs', dimension de la couche d'entrée, dimension de la couche cachée, dimension de la couche de sortie).
3. On crée des données et leurs étiquettes (labels) de façon aléatoire (à l'aide de tenseurs).
4. On initialise les poids de façon aléatoire (à l'aide de tenseurs créés avec 'requires_grad=True', pour lesquels autograd calculera les gradients lors de la 'backward pass').
5. On fait une 'forward pass' avec ReLU comme fonction d'activation.
6. On calcule la loss (somme des carrés des écarts entre les prédictions et les étiquettes).
7. On fait une 'backward pass' (avec ReLU comme fonction d'activation toujours) afin d'obtenir les gradients de W1 et W2. On utilise la fonction 'backward()' d'autograd avec les tenseurs requis (ici, W1 et W2).
8. On met à jour les poids W1 et W2 en fonction du taux d'apprentissage et des gradients (et sans régularisation). Puis on remet à zéro les gradients.

In [6]:
# -*- coding: utf-8 -*-
import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Batch size, input width, hidden width and output width of the network.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and targets: requires_grad is left at its default (False), so
# autograd does not compute gradients with respect to the data.
data_options = {"device": device, "dtype": dtype}
x = torch.randn(N, D_in, **data_options)
y = torch.randn(N, D_out, **data_options)

# Weights: requires_grad=True asks autograd to accumulate d(loss)/d(weight)
# into the .grad attribute of each tensor during the backward pass.
weight_options = {"device": device, "dtype": dtype, "requires_grad": True}
w1 = torch.randn(D_in, H, **weight_options)
w2 = torch.randn(H, D_out, **weight_options)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear layer, ReLU (clamp at zero), linear layer.
    # Autograd records these operations so that the backward pass does not
    # have to be written by hand.
    hidden = torch.mm(x, w1)
    activated = torch.clamp(hidden, min=0)
    y_pred = torch.mm(activated, w2)

    # Sum of squared errors. The result is a scalar (0-dim) tensor;
    # .item() extracts the Python number it holds.
    loss = torch.sum((y_pred - y).pow(2))
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Backpropagate: afterwards w1.grad and w2.grad hold the gradient of the
    # loss with respect to w1 and w2.
    loss.backward()

    # The update itself must not be recorded by autograd, hence no_grad().
    # Gradients accumulate across backward() calls, so each one is reset
    # right after it has been used. torch.optim.SGD could do the same job.
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)
            weight.grad.zero_()


99 349.55938720703125
199 1.19576096534729
299 0.0078320587053895
399 0.00020937970839440823
499 3.979222310590558e-05


### Entrainement d'un réseau de neurones pleinement connecté
1. On fixe le 'type' de processeur à utiliser (CPU / GPU).
2. On définit notre propre fonction autograd (MyReLU) en implémentant ses méthodes 'forward' et 'backward'.
3. On initialise les valeurs du réseau (taille des 'batchs', dimension de la couche d'entrée, dimension de la couche cachée, dimension de la couche de sortie).
4. On crée des données et leurs étiquettes (labels) de façon aléatoire (à l'aide de tenseurs).
5. On initialise les poids de façon aléatoire (à l'aide de tenseurs créés avec 'requires_grad=True', pour lesquels autograd calculera les gradients lors de la 'backward pass').
6. On fait une 'forward pass' avec ReLU comme fonction d'activation.
7. On calcule la loss (somme des carrés des écarts entre les prédictions et les étiquettes).
8. On fait une 'backward pass' (avec ReLU comme fonction d'activation toujours) afin d'obtenir les gradients de W1 et W2. On utilise la fonction 'backward()' d'autograd avec les tenseurs requis (ici, W1 et W2).
9. On met à jour les poids W1 et W2 en fonction du taux d'apprentissage et des gradients (et sans régularisation). Puis on remet à zéro les gradients.

In [10]:
# -*- coding: utf-8 -*-
import torch


class MyReLU(torch.autograd.Function):
    """
    ReLU written as a custom autograd Function.

    A custom Function subclasses torch.autograd.Function and provides static
    forward and backward methods that operate on Tensors. It is invoked
    through MyReLU.apply(...).
    """

    @staticmethod
    def forward(ctx, input):
        """
        Return the input with every negative entry replaced by zero.

        ctx is the context object shared with backward; the input is stored
        on it with save_for_backward because backward needs to know where the
        input was negative.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Return the gradient of the loss with respect to the forward input.

        grad_output is the gradient of the loss with respect to the forward
        output. It is passed through unchanged, except that it is zeroed
        wherever the saved input was negative.
        """
        (saved_input,) = ctx.saved_tensors
        # Out-of-place fill: grad_output itself is left untouched.
        return grad_output.masked_fill(saved_input < 0, 0)


dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets; no gradients are needed for the data.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Random initial weights; requires_grad=True makes autograd accumulate
# their gradients into w1.grad and w2.grad.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# A custom Function is invoked through its apply method. The alias does not
# change between iterations, so it is bound once, outside the training loop.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear layer, custom ReLU, linear layer.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Sum of squared errors, printed every 100 iterations.
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    # Autograd computes the gradients, calling MyReLU.backward for the
    # activation.
    loss.backward()

    # Gradient-descent step, excluded from autograd tracking.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Gradients accumulate across backward() calls, so reset them
        # after each update.
        w1.grad.zero_()
        w2.grad.zero_()


99 503.5501708984375
199 2.6580700874328613
299 0.021978724747896194
399 0.00043362771975807846
499 6.140151526778936e-05


In [11]:
# -*- coding: utf-8 -*-
import torch

# Batch size, input width, hidden width and output width of the network.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# nn.Sequential chains Modules and applies them in order. Each Linear layer
# holds its own weight and bias Tensors; ReLU is the activation in between.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# Squared-error loss; reduction='sum' adds the per-element errors up
# instead of averaging them.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # Forward pass: a Module is callable, so calling it on the input
    # Tensor returns the output Tensor.
    y_pred = model(x)

    # The loss function takes predictions and targets and returns a Tensor
    # holding the loss.
    loss = loss_fn(y_pred, y)
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Clear the gradients left over from the previous iteration, since
    # backward() accumulates into them.
    model.zero_grad()

    # Backward pass: fills .grad for every learnable parameter of the model
    # (they are stored as Tensors with requires_grad=True).
    loss.backward()

    # Gradient-descent step on each parameter, excluded from autograd
    # tracking.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)

99 2.7904064655303955
199 0.04972470924258232
299 0.0017450560117140412
399 9.0595189249143e-05
499 5.853259608556982e-06
