# PyTorch Introduction
- From video: https://www.youtube.com/watch?v=OIenNRt2bjg&list=LL&index=2

## Torch Basic Syntax

In [None]:
import torch
import numpy as np

# empty 2x2x3 tensor
x = torch.empty(2, 2, 3)
print(x)

tensor([[[6.1633e-32, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00]],

        [[6.0988e-38, 6.2815e-38, 0.0000e+00],
         [0.0000e+00, 7.4056e-37, 4.1703e-42]]])


In [None]:
# 5x3 tensor with random values
x = torch.rand(5, 3)
print(x)

tensor([[0.7602, 0.0658, 0.1487],
        [0.1816, 0.8014, 0.9906],
        [0.3217, 0.0724, 0.4876],
        [0.5805, 0.5373, 0.5290],
        [0.4587, 0.6754, 0.7414]])


In [None]:
x = torch.zeros(5, 3)
print(x)
# can also do torch.ones

tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])


In [None]:
print(x.shape)
# can also do this
print(x.size())
# to get size of first dimension (number of rows)
print(x.size(0))
# to get size of second dimension (number of columns)
print(x.size(1))

torch.Size([5, 3])
torch.Size([5, 3])
5
3


In [None]:
print(x.dtype)

torch.float32


In [None]:
# construct torch tensor from an array
x = torch.tensor([2., 4., 3.])
print(x)

tensor([2, 4, 3])


In [None]:
# tensor argument requires_grad=True tells pytorch that it needs to calculate gradients which is needed for optimization
# by default it is false so need to explicitly set to true
# for requires_grad to be applicable data type needs to be float not int
x = torch.tensor([2.0, 4.0, 3.0], requires_grad=True)
print(x)

tensor([2., 4., 3.], requires_grad=True)


In [None]:
x = torch.ones(2, 2)
y = torch.rand(2, 2)
# elementwise addition
z = x + y
# can also do z = torch.add(x, y)
print(x)
print(y)
print(z)

# or in place addition
z.add_(x)
print(z)

# multiplication elementwise
z = x * y
print(z)

tensor([[1., 1.],
        [1., 1.]])
tensor([[0.5830, 0.0762],
        [0.6074, 0.1773]])
tensor([[1.5830, 1.0762],
        [1.6074, 1.1773]])
tensor([[2.5830, 2.0762],
        [2.6074, 2.1773]])
tensor([[0.5830, 0.0762],
        [0.6074, 0.1773]])


In [None]:
# slicing works like numpy
x = torch.rand(5, 3)
print(x)
print(x[:,0]) # print full column 0
print(x[1,:]) # print full row 1
print(x[1, 1]) # element access, but it returns a tensor with the single value
# to get the item itself append .item()
print(x[1, 1].item())

tensor([[0.3936, 0.4779, 0.2744],
        [0.9081, 0.6503, 0.1617],
        [0.0140, 0.8471, 0.8964],
        [0.7152, 0.4342, 0.3703],
        [0.7504, 0.1638, 0.4709]])
tensor([0.3936, 0.9081, 0.0140, 0.7152, 0.7504])
tensor([0.9081, 0.6503, 0.1617])
tensor(0.6503)
0.6503470540046692


In [None]:
# convert to numpy
a = torch.ones(5)
print(a)
b = a.numpy()
print(b)
print(type(b))

tensor([1., 1., 1., 1., 1.])
[1. 1. 1. 1. 1.]
<class 'numpy.ndarray'>


In [None]:
# note that if the tensor is on the CPU, then both objects share same memory
a += 1
print(a)
print(b)

tensor([2., 2., 2., 2., 2.])
[2. 2. 2. 2. 2.]


In [None]:
# convert from numpy to torch
a = np.ones(5)
b = torch.from_numpy(a) #from_numpy will share same memory as a
c = torch.tensor(a) #tensor will create a separate copy
print(a)
print(b)
print(c)
a += 1
print(a)
print(b)
print(c)

[1. 1. 1. 1. 1.]
tensor([1., 1., 1., 1., 1.], dtype=torch.float64)
tensor([1., 1., 1., 1., 1.], dtype=torch.float64)
[2. 2. 2. 2. 2.]
tensor([2., 2., 2., 2., 2.], dtype=torch.float64)
tensor([1., 1., 1., 1., 1.], dtype=torch.float64)


In [None]:
# GPU support
# tensors by default created on CPU, but can also move them to GPU or create them on GPU directly
device = torch.device('cude' if torch.cude.is_available() else 'cpu')

x = torch.rand(2,2).to(device) # first creates on CPU, and then moves to GPU if available
# or move to CPU or GPU
x = x.to('cpu')
x = x.to('cuda')

# or create directly on GPU
x = torch.rand(2, 2, device=device)

## Autograd
- Autograd package provides automatic differentiation for operations on tensors
- Compute partial derivatives while applying chain rule
- Must set requires_grad=True
- PyTorch uses a computation graph to track the sequence of operations applied to tensors during forward pass in order to compute gradients during .backward()
- Any manual operations performed on the tensors outside of the "learning logic" should not be recorded
- For instance, after computing the gradients, the weights are manually updated. This operation should not be tracked on the computation graph because a) increases memory usage, b) breaks the gradient logic, c) slows training
- So when updating weights, must set requires_grad=False before, and then set back to requires_grad=True after

In [None]:
import torch

x = torch.randn(3, requires_grad=True)
y = x + 2

print(x)
print(y)
print(y.grad_fn)
# y has an attribute grad_fn which is set to addbackward
# since y is an addition function it stores what the function is for the backpropagation
# also note that since y is a function of x, y also has requires_grad as true
print(y.requires_grad)


z = y ** 2
print(z)
print(z.grad_fn)
# compute the mean of the tensor
z = z.mean()
print(z)

tensor([0.7214, 0.7776, 0.3249], requires_grad=True)
tensor([2.7214, 2.7776, 2.3249], grad_fn=<AddBackward0>)
<AddBackward0 object at 0x7fce25b49720>
True
tensor([7.4060, 7.7150, 5.4051], grad_fn=<PowBackward0>)
<PowBackward0 object at 0x7fce25b49720>
tensor(6.8420, grad_fn=<MeanBackward0>)


In [None]:
# to compute gradients with backpropagation, call .backward() and the gradients compute automatically
# the gradients will be accumulated into the .grad attribute, which is the partial derivative of the function wrt the tensor
print(x.grad)

# backpropagate
# note this only works if z is a scalar, not a tensor
# for instance if I didn't do z.mean(), it would give an error
z.backward()
print(x.grad)
# IMPORTANT NOTE: .backward() ACCUMULATES into the .grad attribute
# so if in a loop the .grad isn't cleared, it will keep adding
# make sure to do optimizer.zero_grad() to reset gradients every loop
# or do:
x.grad.zero_()
print(x.grad)

None
tensor([1.8143, 1.8517, 1.5499])
tensor([0., 0., 0.])


- Sometimes we want to stop a tensor from tracking the gradient history, ex) after training during evaluation, or when manually updating the weights
- Use x.requires_grad_(False)
- Or use x.detach()
- Or wrap in "with torch.no_grad():"

In [None]:
# using .requires_grad_(False)
a = torch.randn(2, 2, requires_grad=True)
b = (a * a).sum()
print(a.requires_grad)
print(b.grad_fn)

a.requires_grad_(False)
b = (a * a).sum()
print(a.requires_grad)
print(b.grad_fn)

True
<SumBackward0 object at 0x7fce25b27160>
False
None


In [None]:
# detach creates a new copy tensor with requires_grad false
a = torch.randn(2, 2, requires_grad=True)
b = a.detach()
print(a.requires_grad)
print(b.requires_grad)

True
False


In [None]:
# wrap in with torch.no_grad():
a = torch.randn(2, 2, requires_grad=True)
print(a.requires_grad)
b = a ** 2
print(b.requires_grad)
with torch.no_grad():
  b = a ** 2
  print(b.requires_grad)

True
True
False


## Gradient Descent in Autograd
- Linear regression
- Weight times input plus bias
- $ f(x) = w * x + b$
- Want to approximate $f(x) = 2 * x$

In [None]:
import torch

# training samples, 1 input 1 output, 8 samples
X = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32)
# y values are 2x
Y = torch.tensor([2, 4, 6, 8, 10, 12, 14, 16], dtype=torch.float32)

# weight initialized to zero
# requires_grad because during backpropagation we need to compute gradient of loss wrt w
w = torch.tensor(0.0, dtype=torch.float32, requires_grad=True)

# model output
def forward(x):
  return w * x

def loss(y, y_pred):
  return ((y_pred - y)**2).mean()

X_test = 5.0
print(f"Prediction before training: f({X_test}) = {forward(X_test).item():.3f}")

# training
learning_rate = 0.01
n_epochs = 100
for epoch in range(n_epochs):
  # calculate predictions using forward pass
  y_pred = forward(X)
  # compute the loss
  l = loss(Y, y_pred)
  # calculate the gradients wrt weight
  l.backward()

  # during the update of the weights we don't want to track these operations
  # so wrap in with torch.no_grad() so the operations aren't added to computation graph
  with torch.no_grad():
    w -= learning_rate * w.grad

  # remember to reset the gradients in the weight .grad attribute
  w.grad.zero_()

  if (epoch+1) % 10 == 0:
    print(f'epoch {epoch+1}: w = {w.item():.3f}, loss= {l.item():.3f}')

# correctly predicts the output
print(f"Prediction after training: f({X_test}) = {forward(X_test).item():.3f}")

Prediction before training: f(5.0) = 0.000
epoch 10: w = 1.998, loss= 0.000
epoch 20: w = 2.000, loss= 0.000
epoch 30: w = 2.000, loss= 0.000
epoch 40: w = 2.000, loss= 0.000
epoch 50: w = 2.000, loss= 0.000
epoch 60: w = 2.000, loss= 0.000
epoch 70: w = 2.000, loss= 0.000
epoch 80: w = 2.000, loss= 0.000
epoch 90: w = 2.000, loss= 0.000
epoch 100: w = 2.000, loss= 0.000
Prediction after training: f(5.0) = 10.000


## Model, Loss and Optimizer
- Typical PyTorch pipeline:
1. Design model (input, output, forward pass with different layers)
2. Construct loss and optimizer
3. Training loop with forward and backward propagation

In [None]:
# Neural network for a linear regression model

import torch
import torch.nn as nn

# torch neural network object expects tensors in a certain shape
# this is like a one column 8 row matrix
X = torch.tensor([[1],[2],[3],[4],[5],[6],[7],[8]], dtype=torch.float32)
Y = torch.tensor([[2],[4],[6],[8],[10],[12],[14],[16]], dtype=torch.float32)

# number of samples is number of rows, number of features is columns
n_samples, n_features = X.shape

X_test = torch.tensor([10], dtype=torch.float32)

# neural network classes that you define must inherit from the nn.Module

class LinearRegression(nn.Module):
  def __init__(self, input_dim, output_dim):
    # call the init of the nn parent
    super().__init__()
    # define different layers
    # for linear regression we add only one Linear layer
    self.lin = nn.Linear(input_dim, output_dim)

  def forward(self, x):
    # returns the predictions
    return self.lin(x)

# 1. create model
input_size, output_size = n_features, n_features
model = LinearRegression(input_size, output_size)
print(f"Prediction before training: f({X_test}) = {model(X_test).item():.3f}")

# 2. define the loss and optimizer
learning_rate = 0.01
n_epochs = 100

# mean square error loss
loss = nn.MSELoss()
# SGD optimizer
# optimizer always gets model.parameters() which are the weights and bias parameters
# learning rate optional hyperparameter
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# 3. define training loop
for epoch in range(n_epochs):
  # when model is called, PyTorch automatically triggers a __call__ method which calls the forward class
  # so no need to call model.forward()
  y_pred = model(X)

  # calculate loss, pass in the expected then predicted
  l = loss(Y, y_pred)
  # calculate gradients
  l.backward()

  # update weights
  optimizer.step()

  # zero gradients
  optimizer.zero_grad()

  if (epoch+1) % 10 == 0:
    w, b = model.parameters() # returns weights and biases
    print(f'epoch {epoch+1}, w = {w[0][0].item()}, loss = {l.item()}')

print(f"Prediction after training: f({X_test.item()}) = {model(X_test).item():.3f}")

Prediction before training: f(tensor([10.])) = -2.488
epoch 10, w = 2.086758852005005, loss = 0.0510685071349144
epoch 20, w = 2.084592819213867, loss = 0.046957120299339294
epoch 30, w = 2.0812761783599854, loss = 0.04334663599729538
epoch 40, w = 2.0780892372131348, loss = 0.04001369699835777
epoch 50, w = 2.0750269889831543, loss = 0.03693711757659912
epoch 60, w = 2.072084903717041, loss = 0.034096989780664444
epoch 70, w = 2.069258213043213, loss = 0.031475286930799484
epoch 80, w = 2.066542387008667, loss = 0.02905518375337124
epoch 90, w = 2.0639328956604004, loss = 0.026821179315447807
epoch 100, w = 2.0614259243011475, loss = 0.02475883439183235
Prediction after training: f(10.0) = 20.269


## Training a Neural Network
- Follows same pipeline as the single neuron


In [3]:
# training a NN to recognize digits 0-9

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt

# configure the device for GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# define the hyperparameters
input_size = 784 # images of shape 28x28
hidden_size = 500 # number of neurons in hidden layer
num_classes = 10 # 10 because 10 digits from 0-9
num_epochs = 2
batch_size = 100
learning_rate = 0.001

# MNIST dataset built in from torch
# this returns set of 600000 images
train_dataset = torchvision.datasets.MNIST(root='./data',
                                            train=True,
                                            transform=transforms.ToTensor(),
                                            download=True)
test_dataset = torchvision.datasets.MNIST(root='./data',
                                            train=True,
                                            transform=transforms.ToTensor())

# data loader built in from torch
# provides optimized way to iterate over the dataset
# since batch_size was defined to be 100, the length of train_loader is 600 instead of 600000 (600 batches of 100)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)

'''
# example of the images
examples = iter(test_loader)
example_data, example_targets = next(examples)

for i in range(6):
  plt.subplot(2,3,i+1)
  plt.imshow(example_data[i][0], cmap='gray')
plt.show()
'''

# create the network model
class NeuralNet(nn.Module):
  def __init__(self, input_size, hidden_size, num_classes):
    super().__init__()
    self.l1 = nn.Linear(input_size, hidden_size) # this is the weighted sum plus biases, if bias=True
    self.relu = nn.ReLU() # the activation function for layer 1
    self.l2 = nn.Linear(hidden_size, num_classes)
  def forward(self, x):
    out = self.l1(x)
    out = self.relu(out)
    out = self.l2(out)
    # no activation applied at output. PyTroch crossentropy requires values with no activation applied
    return out

# to use the GPU, the model needs to be pushed to the device
model = NeuralNet(input_size, hidden_size, num_classes).to(device)

# define loss and optimizer
# crossentropy used for classification
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# define the training loop
# have 2 epochs (number of iterations over the entire training set)
# each epoch, define a loop to train on each batch in train_loader
# each batch contain 100 images. So model will perform forward/backward pass on a batch of 100, update params, move to next batch of 100
# and repeat 600 times to look at 600000 images
# then repeat for another epoch
n_total_steps = len(train_loader)
for epoch in range(num_epochs):
  # unpack each batch of 100 images and labels (expected output)
  for i, (images, labels) in enumerate(train_loader):
    # the images need to be reshaped into the shape the NN expects
    # the original shape is a 4D matrix of size [100, 1, 28, 28]
    # the NN expects [100, 784] where there are 100 rows (number of images in batch) and 784 pixels per image
    # since the model was pushed to the GPU, the tensors must also be pushed to GPU
    images = images.reshape(-1, 28*28).to(device)
    labels = labels.to(device)

    # forward pass by calling model with the inputs
    outputs = model(images)
    # average loss in the batch of 100 images
    loss = criterion(outputs, labels)

    # calculate gradients
    loss.backward()
    # update parameters
    optimizer.step()
    # reset the gradients
    optimizer.zero_grad()

    if (i+1) % 100 == 0:
      print(f'Epoch {epoch+1}/{num_epochs}, Step [{i+1}/{n_total_steps}], Loss: {loss}')

# evaluate the performance of the model
# all model evaluation should be done in torch.no_grad to prevent gradient tracking
with torch.no_grad():
  n_correct = 0
  n_samples = len(test_loader.dataset)

  for images, labels in test_loader:
    images = images.reshape(-1, 28*28).to(device)
    labels = labels.to(device)

    outputs = model(images)

    # outputs is the raw output
    # need to find the max value per image, and its index which indicates the prediction
    # torch.max returns the maximum output value and its index
    _, predicted = torch.max(outputs, 1)
    n_correct += (predicted == labels).sum().item()

  acc = n_correct / n_samples
  print(f'Accuracy of the network on the {n_samples} test images: {acc*100} %')

PATH = './digit_id.path'
torch.save(model.state_dict(), PATH)

Epoch 1/2, Step [100/600], Loss: 0.2334097921848297
Epoch 1/2, Step [200/600], Loss: 0.398881733417511
Epoch 1/2, Step [300/600], Loss: 0.22981984913349152
Epoch 1/2, Step [400/600], Loss: 0.16696451604366302
Epoch 1/2, Step [500/600], Loss: 0.1656610369682312
Epoch 1/2, Step [600/600], Loss: 0.22521807253360748
Epoch 2/2, Step [100/600], Loss: 0.06800297647714615
Epoch 2/2, Step [200/600], Loss: 0.11497944593429565
Epoch 2/2, Step [300/600], Loss: 0.06505492329597473
Epoch 2/2, Step [400/600], Loss: 0.09721935540437698
Epoch 2/2, Step [500/600], Loss: 0.04298058897256851
Epoch 2/2, Step [600/600], Loss: 0.06278544664382935
Accuracy of the network on the 60000 test images: 97.82499999999999 %
