All the resources are from:
https://www.kaggle.com/aakashns/pytorch-basics-linear-regression-from-scratch 

Start getting used to the notebook keyboard shortcuts.
In command mode: dd, c, a, b, v, etc.

# 1. Linear regression from scratch

In [122]:
import numpy as np
import torch

In [123]:
# Features: one row per sample, columns are (temp, rainfall, humidity).
# sw: the dtype is set to float32 explicitly instead of being inferred.
inputs = np.array(
    [
        [73, 67, 43],
        [91, 88, 64],
        [87, 134, 58],
        [102, 43, 37],
        [69, 96, 70],
    ],
    dtype=np.float32,
)

# Targets: one row per sample, columns are (apples, oranges).
targets = np.array(
    [
        [56, 70],
        [81, 101],
        [119, 133],
        [22, 37],
        [103, 119],
    ],
    dtype=np.float32,
)

In [124]:
# Wrap both NumPy arrays as torch tensors, then echo them one after the other.
inputs, targets = torch.from_numpy(inputs), torch.from_numpy(targets)
print(inputs, targets, sep='\n')

tensor([[ 73.,  67.,  43.],
        [ 91.,  88.,  64.],
        [ 87., 134.,  58.],
        [102.,  43.,  37.],
        [ 69.,  96.,  70.]])
tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.]])


In [125]:
# Trainable parameters, randomly initialised with torch.randn.
# sw: requires_grad=True makes autograd track these tensors, so that
# backward() can fill in w.grad and b.grad.
w = torch.randn((2, 3), requires_grad=True)  # weight matrix, shape (2, 3)
b = torch.randn((2,), requires_grad=True)  # bias vector, shape (2,)
print(w, b, sep='\n')

tensor([[-0.6331,  0.6867, -0.7845],
        [-1.0127,  0.7896,  0.7273]], requires_grad=True)
tensor([0.5178, 1.1966], requires_grad=True)


In [126]:
# Linear model: predictions = x @ w^T + b
# sw: w is transposed before the matrix product so its 3-wide rows line up
# with the 3 feature columns of x.
def model(x):
    """Apply the affine map given by the module-level `w` and `b` to `x`."""
    projected = torch.matmul(x, w.t())
    return projected + b

In [127]:
# Generate predictions
preds = model(inputs)
print(preds)

tensor([[-33.4223,  11.4463],
        [-46.8718,  25.0722],
        [ -8.0430,  61.0824],
        [-63.5563, -41.2366],
        [-32.1570,  58.0322]], grad_fn=<AddBackward0>)


In [128]:
# Compare with targets
print(targets)

tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.]])


In [130]:
# Mean squared error
# sw: torch.mul is the element-wise product; numel() counts all elements.
def mse(t1, t2):
    """Return the mean of the squared element-wise differences of t1 and t2."""
    residual = t1 - t2
    squared = torch.mul(residual, residual)
    return squared.sum() / residual.numel()

In [100]:
# Compute loss
loss = mse(preds, targets)
print(loss)

tensor(25721.3359, grad_fn=<DivBackward0>)


In [101]:
# Compute gradients
# sw: retain_graph=True keeps the autograd graph alive after this call, so the
# cell can be re-run and loss.backward() called again on the same loss.
# Caveat: each extra backward() call adds to w.grad / b.grad instead of
# replacing them, so re-running without zeroing the gradients inflates them.
loss.backward(retain_graph=True)

In [102]:
# Gradients for weights and bias
print(w)
print(w.grad)
print(b)
print(b.grad)

tensor([[ 1.4049,  0.5676,  0.4691],
        [-1.2773,  0.3566, -0.4104]], requires_grad=True)
tensor([[ 10219.0469,   9534.1172,   6155.8140],
        [-16078.5820, -16947.4434, -10616.2383]])
tensor([0.9501, 1.1454], requires_grad=True)
tensor([ 117.4340, -190.4592])


In [103]:
# Adjust weights & reset gradients
# sw Q: Why do we call .grad.zero_() all the time in torch?
# sw A: backward() adds the new gradients to whatever is already stored in
#       .grad instead of overwriting it. Without zeroing, the gradients of
#       successive backward() calls (e.g. different batches) accumulate.
with torch.no_grad():
    # One gradient-descent step with a learning rate of 1e-5.
    w.sub_(1e-5 * w.grad)
    b.sub_(1e-5 * b.grad)
    # Clear the stored gradients before the next backward() call.
    w.grad.zero_()
    b.grad.zero_()

print(w, w.grad, b, b.grad, sep='\n')

tensor([[ 1.3027,  0.4723,  0.4075],
        [-1.1165,  0.5261, -0.3043]], requires_grad=True)
tensor([[0., 0., 0.],
        [0., 0., 0.]])
tensor([0.9489, 1.1473], requires_grad=True)
tensor([0., 0.])


In [107]:
# Train for 100 epochs
# sw Q: Why do we need "with torch.no_grad()"?
# sw A: w and b are leaf tensors created with requires_grad=True, and torch
#       refuses in-place updates (e.g. w -= ...) on such tensors while
#       autograd is recording.
# sw A: So gradient tracking is temporarily switched off while the parameter
#       values are changed.
# sw A: Try removing "with torch.no_grad()": you will see the error
#       "a leaf Variable that requires grad has been used in an in-place operation."
for i in range(100):
    preds = model(inputs)
    loss = mse(preds, targets)
    loss.backward()
    # sw: always wrap the parameter update in torch.no_grad().
    # Q: Can you use optim.SGD for the defined model?
    with torch.no_grad():
        w.sub_(1e-5 * w.grad)
        b.sub_(1e-5 * b.grad)
        w.grad.zero_()
        b.grad.zero_()
        

In [110]:
# Calculate loss
preds = model(inputs)
loss = mse(preds, targets)
print(loss)

tensor(91.4183, grad_fn=<DivBackward0>)


In [111]:
# Print predictions
print(preds)
print(targets)

tensor([[ 61.0419,  70.3206],
        [ 81.9842,  95.3163],
        [112.9701, 145.1200],
        [ 42.4160,  35.4168],
        [ 89.2679, 110.9849]], grad_fn=<AddBackward0>)
tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.]])


# 2. Use torch DataLoader for linear regression

In [132]:
# Imports
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

In [133]:
# The same five samples as in section 1, stacked three times (15 rows);
# presumably so that there are enough rows to split into batches.
# Input (temp, rainfall, humidity)
inputs = np.tile(
    np.array(
        [[73, 67, 43], [91, 88, 64], [87, 134, 58], [102, 43, 37], [69, 96, 70]],
        dtype='float32',
    ),
    (3, 1),
)
# Targets (apples, oranges)
targets = np.tile(
    np.array(
        [[56, 70], [81, 101], [119, 133], [22, 37], [103, 119]],
        dtype='float32',
    ),
    (3, 1),
)
# sw: note again that dtype 'float32' is specified!

In [134]:
inputs = torch.from_numpy(inputs)
targets = torch.from_numpy(targets)
print(inputs)
print(targets)

tensor([[ 73.,  67.,  43.],
        [ 91.,  88.,  64.],
        [ 87., 134.,  58.],
        [102.,  43.,  37.],
        [ 69.,  96.,  70.],
        [ 73.,  67.,  43.],
        [ 91.,  88.,  64.],
        [ 87., 134.,  58.],
        [102.,  43.,  37.],
        [ 69.,  96.,  70.],
        [ 73.,  67.,  43.],
        [ 91.,  88.,  64.],
        [ 87., 134.,  58.],
        [102.,  43.,  37.],
        [ 69.,  96.,  70.]])
tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.],
        [ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.],
        [ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.]])


In [135]:
# Define dataset
# sw: put inputs and targets together like this.
train_ds = TensorDataset(inputs, targets)
train_ds[:]

(tensor([[ 73.,  67.,  43.],
         [ 91.,  88.,  64.],
         [ 87., 134.,  58.],
         [102.,  43.,  37.],
         [ 69.,  96.,  70.],
         [ 73.,  67.,  43.],
         [ 91.,  88.,  64.],
         [ 87., 134.,  58.],
         [102.,  43.,  37.],
         [ 69.,  96.,  70.],
         [ 73.,  67.,  43.],
         [ 91.,  88.,  64.],
         [ 87., 134.,  58.],
         [102.,  43.,  37.],
         [ 69.,  96.,  70.]]), tensor([[ 56.,  70.],
         [ 81., 101.],
         [119., 133.],
         [ 22.,  37.],
         [103., 119.],
         [ 56.,  70.],
         [ 81., 101.],
         [119., 133.],
         [ 22.,  37.],
         [103., 119.],
         [ 56.,  70.],
         [ 81., 101.],
         [119., 133.],
         [ 22.,  37.],
         [103., 119.]]))

In [136]:
# sw: Define the data loader; it serves train_ds in shuffled batches.
batch_size = 5
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
next(iter(train_dl))  # sw: pulls a single (inputs, targets) batch from the loader

[tensor([[ 91.,  88.,  64.],
         [ 87., 134.,  58.],
         [ 87., 134.,  58.],
         [ 91.,  88.,  64.],
         [102.,  43.,  37.]]), tensor([[ 81., 101.],
         [119., 133.],
         [119., 133.],
         [ 81., 101.],
         [ 22.,  37.]])]

In [137]:
# Define model
model = nn.Linear(3, 2) # sw: 3 is the input dim; 2 is the output dim. 
print(model.weight)
print(model.bias)

Parameter containing:
tensor([[ 0.1456, -0.5061,  0.2145],
        [-0.4573,  0.3152, -0.4416]], requires_grad=True)
Parameter containing:
tensor([ 0.4150, -0.3458], requires_grad=True)


In [138]:
# Define optimizer
opt = torch.optim.SGD(model.parameters(), lr=1e-5)

In [139]:
# Import nn.functional
import torch.nn.functional as F

In [140]:
# Define loss function
loss_fn = F.mse_loss
loss = loss_fn(model(inputs), targets)
print(loss)

tensor(14272.5908, grad_fn=<MseLossBackward>)


In [141]:
# Define a utility function to train the model
def fit(num_epochs, model, loss_fn, opt):
    """Train `model` with mini-batch gradient descent and print the final loss.

    Batches are read from the module-level `train_dl`; the final loss is
    computed on the module-level `inputs` and `targets`.

    Args:
        num_epochs: Number of full passes over `train_dl`.
        model: Callable mapping a batch of inputs to predictions.
        loss_fn: Callable taking `(predictions, targets)` and returning the
            loss that `backward()` is called on.
        opt: Optimizer providing `zero_grad()` and `step()`.

    Returns:
        None. The final training loss is printed, not returned.
    """
    for _ in range(num_epochs):
        # sw: iterating over train_dl yields one (inputs, targets) batch at a time.
        for xb, yb in train_dl:
            # sw: zero_grad -> backward -> step is the standard sequence.
            # Gradients are cleared *before* backward() because backward()
            # accumulates into .grad: anything left over from an earlier
            # backward() call would otherwise leak into this update.
            opt.zero_grad()
            loss = loss_fn(model(xb), yb)
            loss.backward()
            opt.step()
    print('Training loss: ', loss_fn(model(inputs), targets))

In [142]:
# Train the model for 100 epochs
fit(100, model, loss_fn, opt)

Training loss:  tensor(40.9138, grad_fn=<MseLossBackward>)


In [143]:
# Generate predictions
preds = model(inputs)
print(preds)
print(targets)

tensor([[ 58.8666,  71.1075],
        [ 83.1082,  95.5647],
        [114.2812, 142.5411],
        [ 30.2479,  42.0078],
        [ 98.1939, 107.3295],
        [ 58.8666,  71.1075],
        [ 83.1082,  95.5647],
        [114.2812, 142.5411],
        [ 30.2479,  42.0078],
        [ 98.1939, 107.3295],
        [ 58.8666,  71.1075],
        [ 83.1082,  95.5647],
        [114.2812, 142.5411],
        [ 30.2479,  42.0078],
        [ 98.1939, 107.3295]], grad_fn=<AddmmBackward>)
tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.],
        [ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.],
        [ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.]])


# 3. Simple nn

In [144]:
class SimpleNet(nn.Module):
    """Small feed-forward network: Linear(3, 3) -> ReLU -> Linear(3, 2)."""

    def __init__(self):
        super().__init__()
        # First layer: 3 input features to 3 hidden units.
        self.linear1 = nn.Linear(in_features=3, out_features=3)
        # Non-linearity applied between the two linear layers.
        self.act1 = nn.ReLU()
        # Second layer: 3 hidden units to 2 outputs.
        self.linear2 = nn.Linear(in_features=3, out_features=2)

    def forward(self, x):
        """Pass `x` through linear1, the ReLU and linear2, in that order."""
        hidden = self.act1(self.linear1(x))
        return self.linear2(hidden)

In [145]:
# Same loss and optimizer settings as in section 2, now applied to SimpleNet.
# NOTE(review): the recorded run of fit(100, ...) on this setup ends at a
# training loss of ~8225, far above the ~41 reached by the single nn.Linear
# model with the same learning rate and epoch count. Presumably an unlucky
# random initialisation of the ReLU layer or too few steps; worth re-running.
loss_fn = F.mse_loss
model = SimpleNet()
opt = torch.optim.SGD(model.parameters(), lr=1e-5)

In [146]:
fit(100, model, loss_fn, opt)

Training loss:  tensor(8225.7412, grad_fn=<MseLossBackward>)
