# In this notebook, an implementation of manual gradient descent is presented.

In [1]:
import torch

In [2]:
import numpy as np

In [3]:
X = np.array([1,2,3,4], dtype=np.float32)
Y = np.array([2,4,6,8], dtype=np.float32)

w = 0.0

In [4]:
# Model prediction
def forward(x):
    return w*x

In [5]:
# Loss = MSE, Mean Squared Error
def loss(y, y_predicted):
    return ((y_predicted-y)**2).mean()

In [6]:
# Gradients
# MSE: 1/N * (w*x - y)**2
# dJ/dw = 1/N 2x (w*x - y) - this is our computated derivative
def gradient(x,y,y_predicted):
    return np.dot(2*x, y_predicted-y).mean()

In [7]:
# The solution
print(f'Prediction before training: f(5) = {forward(5):.3f}')

# Training
learning_rate = 0.01
iter = 20

for epoch in range(iter):
    # prediction = forward pass
    y_pred = forward(X)
    
    # loss
    l = loss(Y, y_pred)
    
    # gradients
    dw = gradient(X, Y, y_pred)
    
    # update weights
    w -= learning_rate * dw
    
    if epoch % 2 == 0:
        print(f'Epoch {epoch+1}: w = {w:.3f}, loss={l:.8f}')

print(f'Prediction after training: {forward(5):.3f}')

Prediction before training: f(5) = 0.000
Epoch 1: w = 1.200, loss=30.00000000
Epoch 3: w = 1.872, loss=0.76800019
Epoch 5: w = 1.980, loss=0.01966083
Epoch 7: w = 1.997, loss=0.00050332
Epoch 9: w = 1.999, loss=0.00001288
Epoch 11: w = 2.000, loss=0.00000033
Epoch 13: w = 2.000, loss=0.00000001
Epoch 15: w = 2.000, loss=0.00000000
Epoch 17: w = 2.000, loss=0.00000000
Epoch 19: w = 2.000, loss=0.00000000
Prediction after training: 10.000


# Now, we will do the same, but using PyTorch only.

In [8]:
# Model prediction
# Nothing changed according to the previous approach, so remains unmodified.

In [9]:
# Loss
# Nothing changed according to the previous approach, so remains unmodified.

In [10]:
# Gradients
# We use PyTorch to compute it.

In [11]:
X = torch.tensor([1,2,3,4], dtype=torch.float32)
Y = torch.tensor([2,4,6,8], dtype=torch.float32)
w = torch.tensor(0.0, dtype=torch.float32, requires_grad=True)

In [12]:
# The solution

print(f'Prediction before training: f(5) = {forward(5):.3f}')

# Training
learning_rate = 0.01
iter = 70

for epoch in range(iter+1):
    # prediction = forward pass
    y_pred = forward(X)
    
    # loss
    l = loss(Y, y_pred)
    
    # gradients = backward pass
    l.backward() # gradient of our loss with respect to W -> dl/dw
    
    # update weights
    with torch.no_grad():  # can not be part of our computational loop, so we wrap it inside the proper 'with' statement
        w -= learning_rate * w.grad
    
    # zero gradients
    w.grad.zero_()
    
    if epoch % 10 == 0:
        print(f'Epoch {epoch}: w = {w:.3f}, loss={l:.8f}')

print(f'Prediction after training: {forward(5):.3f}')

Prediction before training: f(5) = 0.000
Epoch 0: w = 0.300, loss=30.00000000
Epoch 10: w = 1.665, loss=1.16278565
Epoch 20: w = 1.934, loss=0.04506890
Epoch 30: w = 1.987, loss=0.00174685
Epoch 40: w = 1.997, loss=0.00006770
Epoch 50: w = 1.999, loss=0.00000262
Epoch 60: w = 2.000, loss=0.00000010
Epoch 70: w = 2.000, loss=0.00000000
Prediction after training: 10.000


# Now, we will replace manual loss computation and parameter updates with PyTorch functions, as well as replace our manually constructed model with a PyTorch model class.

### Classic 3-step approach in a training pipeline:
1. Design the model (input/output size, forward pass)
2. Construct loss and optimizer
3. Training loop:
- forward pass: compute prediction
- backward pass: get gradients
- update weights 

In [1]:
import torch
import torch.nn as nn  # Neural Network module from PyTorch
import torch.optim as opt # Optimization module from PyTorch

In [2]:
X = torch.tensor([[1],
                  [2],
                  [3],
                  [4]], dtype=torch.float32)
Y = torch.tensor([[2],
                  [4],
                  [6],
                  [8]], dtype=torch.float32)
print(f'Input shape is ({X.shape[0]},{X.shape[1]}) which means our data has {X.shape[0]} sample(s) and {X.shape[1]} feature(s).')
# the representation is different - amount of rows is equal to amount of samples, and amount of columns is equal to the amount of features

Input shape is (4,1) which means our data has 4 sample(s) and 1 feature(s).


In [3]:
n_samples, n_features = X.shape  # define our number of samples and features according to our training data

input_size = n_features  # what we put into the model
output_size = n_features  # what gets out of the model

X_test = torch.tensor([5], dtype=torch.float32)  # a tensor to test our model's output

We do not define any loss function anymore - we simply get it from PyTorch. Same goes for the optimizer.

In [4]:
w = torch.tensor(0.0, dtype=torch.float32, requires_grad=True)
learning_rate = 0.1
iter = 200
model = nn.Linear(input_size, output_size)
loss = nn.MSELoss()
optimizer = opt.SGD(model.parameters(), lr=learning_rate)

In [5]:
print(f'Prediction before training: {model(X_test).item():.3f}')
for epoch in range(iter+1):
    #prediction - forward pass
    y_pred = model(X)
    # compute loss
    l = loss(Y, y_pred)
    # compute gradients
    l.backward()
    # update weights
    optimizer.step()
    # zero gradients
    optimizer.zero_grad()
    # print results of ongoing training
    if epoch % 25 == 0:
        [w,b] = model.parameters()
        print(f'Epoch {epoch}: w = {w[0][0].item():.3f}, loss={l:.8f}')

print(f'Prediction after training: {model(X_test).item():.3f}')

Prediction before training: -4.672
Epoch 0: w = 3.830, loss=70.14845276
Epoch 25: w = 1.996, loss=0.00002533
Epoch 50: w = 1.998, loss=0.00000551
Epoch 75: w = 1.999, loss=0.00000121
Epoch 100: w = 2.000, loss=0.00000026
Epoch 125: w = 2.000, loss=0.00000006
Epoch 150: w = 2.000, loss=0.00000001
Epoch 175: w = 2.000, loss=0.00000000
Epoch 200: w = 2.000, loss=0.00000000
Prediction after training: 10.000


### We can also define our own model which derives from PyTorch model, and use it instead.

In [6]:
class LinearRegression(nn.Module):
    
    def __init__(self, input_dim, output_dim):
        super(LinearRegression, self).__init__()
        # define layers
        self.lin = nn.Linear(input_dim, output_dim)
        
    def forward(self, x):
        return self.lin(x)

In [7]:
w = torch.tensor(0.0, dtype=torch.float32, requires_grad=True)
learning_rate = 0.1
iter = 200
model = LinearRegression(input_size, output_size)
loss = nn.MSELoss()
optimizer = opt.SGD(model.parameters(), lr=learning_rate)

In [8]:
print(f'Prediction before training: {model(X_test).item():.3f}')
for epoch in range(iter+1):
    #prediction - forward pass
    y_pred = model(X)
    # compute loss
    l = loss(Y, y_pred)
    # compute gradients
    l.backward()
    # update weights
    optimizer.step()
    # zero gradients
    optimizer.zero_grad()
    # print results of ongoing training
    if epoch % 25 == 0:
        [w,b] = model.parameters()
        print(f'Epoch {epoch}: w = {w[0][0].item():.3f}, loss={l:.8f}')

print(f'Prediction after training: {model(X_test).item():.3f}')

Prediction before training: 3.493
Epoch 0: w = 2.576, loss=12.22595215
Epoch 25: w = 1.911, loss=0.01208516
Epoch 50: w = 1.958, loss=0.00264364
Epoch 75: w = 1.981, loss=0.00057831
Epoch 100: w = 1.991, loss=0.00012651
Epoch 125: w = 1.996, loss=0.00002767
Epoch 150: w = 1.998, loss=0.00000605
Epoch 175: w = 1.999, loss=0.00000132
Epoch 200: w = 2.000, loss=0.00000029
Prediction after training: 9.999
