In [None]:
import numpy as np
import sklearn
import torch
import torch.optim as optim
import torch.nn as nn

### **Create data in NumPy**

Each point is generated from the 'ideal' line $y = 2x + 1$, plus a small amount of random noise. The goal is to find the line that fits the data as well as possible; that line should end up very close to the ideal one.

In [None]:
# --- Synthetic data ---------------------------------------------------
# Fix NumPy's global seed so every run draws exactly the same samples
np.random.seed(42)
# 100 inputs as a column vector
x = np.random.rand(100, 1)
# Targets follow y = 1 + 2x, perturbed by Gaussian noise scaled by 0.1
y = 1 + 2 * x + .1 * np.random.randn(100, 1)

# --- Train / validation split -----------------------------------------
# Permute the row indices in place, then cut the permutation at 80:
# the first 80 indices go to training, the remaining 20 to validation
idx = np.arange(100)
np.random.shuffle(idx)
train_idx, val_idx = idx[:80], idx[80:]

# Index the arrays with each index set to materialise the two splits
x_train, x_val = x[train_idx], x[val_idx]
y_train, y_val = y[train_idx], y[val_idx]

### **The last model we created**

In [None]:
class LayerLinearRegression(nn.Module):
    """Linear regression model built from a single ``nn.Linear`` layer."""

    def __init__(self):
        super().__init__()
        # A Linear layer with one input and one output replaces hand-written
        # parameters: it holds both a (the weight) and b (the intercept)
        self.linear = nn.Linear(1, 1)

    def forward(self, x):
        """Return the layer's prediction for the input ``x``."""
        prediction = self.linear(x)
        return prediction


In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed PyTorch's global RNG so the randomly initialised weight and bias are
# the same on every Restart & Run All; the NumPy seed used for data
# generation does not cover torch
torch.manual_seed(42)

# let's instantiate a model and move its parameters to the chosen device
model = LayerLinearRegression().to(device)

# define loss function: mean squared error, averaged over the batch
loss_fn = nn.MSELoss(reduction='mean')

# define optimizer: plain SGD over the model's parameters, learning rate 0.1
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [None]:
def make_train_step(model, loss_fn, optimizer):
    """Build a function that performs one step of the training loop.

    Args:
        model: the module to train; it is called as ``model(x)``.
        loss_fn: callable invoked as ``loss_fn(prediction, target)`` that
            returns a scalar tensor supporting ``backward()`` and ``item()``.
        optimizer: optimizer whose ``step()`` / ``zero_grad()`` are called
            after the gradients have been computed.

    Returns:
        A function ``train_step(x, y)`` that runs a single update on the
        batch ``(x, y)`` and returns the batch loss as a Python number.
    """
    def train_step(x, y):
        # Sets model to TRAIN mode
        model.train()
        # Makes predictions
        yhat = model(x)
        # Computes loss. PyTorch losses take (input, target), so the
        # prediction goes first; the order is irrelevant for MSE but not
        # for asymmetric losses
        loss = loss_fn(yhat, y)
        # Computes gradients
        loss.backward()
        # Updates parameters, then clears the gradients so they do not
        # accumulate into the next step
        optimizer.step()
        optimizer.zero_grad()
        # Returns the loss as a plain Python number
        return loss.item()

    # Returns the function that will be called inside the train loop
    return train_step

# Training configuration: number of epochs and an empty loss history
n_epochs = 1000
losses = []
# Bind our model, loss function and optimizer into a single-step callable
train_step = make_train_step(model, loss_fn, optimizer)


### **The Dataset Class**

We will now see how to handle bigger data and do gradient descent and minibatch-based optimization in PyTorch. 

In [None]:
from torch.utils.data import Dataset, TensorDataset

# Here we subclass the class Dataset that allows manipulation of data
#
# __init__, __getitem__, __len__ are basic methods that need to be implemented

class CustomDataset(Dataset):
    """Dataset that pairs a feature tensor with a label tensor, item by item."""

    def __init__(self, x_tensor, y_tensor):
        # Keep references to both tensors; nothing is copied here
        self.x, self.y = x_tensor, y_tensor

    def __len__(self):
        # The dataset size is the length of the feature tensor
        return len(self.x)

    def __getitem__(self, index):
        # Return the (features, label) pair stored at the given index
        features = self.x[index]
        label = self.y[index]
        return features, label

# Note the missing .to(device): both tensors are created as CPU tensors here
x_train_tensor, y_train_tensor = (
    torch.from_numpy(array).float() for array in (x_train, y_train)
)

# Wrap the pair in our dataset and look at the first (features, label) sample
train_data = CustomDataset(x_train_tensor, y_train_tensor)
print(train_data[0])



(tensor([0.7713]), tensor([2.4745]))


In [None]:
from torch.utils.data import Dataset, TensorDataset

# Writing our own Dataset subclass lets us build very generic datasets,
# but simply pairing up tensors is so common that PyTorch ships a
# ready-made class for it: TensorDataset
x_train_tensor = torch.from_numpy(x_train).to(torch.float32)
y_train_tensor = torch.from_numpy(y_train).to(torch.float32)

# Same usage as before: build the dataset and look at its first sample
train_data = TensorDataset(x_train_tensor, y_train_tensor)
print(train_data[0])

(tensor([0.7713]), tensor([2.4745]))


**Important:** Unlike what we did before, here we do not send the training tensors to the GPU. A full dataset may be too big to fit in GPU memory, so the data stays on the CPU and only one mini-batch at a time is sent to the device.

### **The Dataloader**

The DataLoader takes as input a dataset (such as the one we created earlier), a batch size, and a flag that says whether the data should be shuffled.

In [None]:
from torch.utils.data import DataLoader

# Serve train_data in shuffled mini-batches of 16 samples
train_loader = DataLoader(
    train_data,
    batch_size=16,
    shuffle=True,
)

Here is how the train_loader can be used to write a training loop over mini-batches. The inner loop grabs mini-batches from the train loader,
sends them to the device where the model is, and performs a training step on each. It also records each loss, so we can inspect it later.

In [None]:
# Rebuild the train-step function (factory defined earlier) and start a fresh loss history
train_step = make_train_step(model, loss_fn, optimizer)
losses = []

for epoch in range(n_epochs):
    for x_batch, y_batch in train_loader:
        # The dataset "lives" on the CPU, and so do the mini-batches drawn
        # from it, so each batch is sent to the device where the model "lives"
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        # One optimisation step on this batch; keep its loss for later inspection
        loss = train_step(x_batch, y_batch)
        losses.append(loss)

print(model.state_dict())

OrderedDict([('linear.weight', tensor([[1.9700]])), ('linear.bias', tensor([1.0246]))])


### **Finally: How to do validation/evaluation**

First, we need to split the dataset. 

In [None]:
from torch.utils.data.dataset import random_split

# Wrap the complete (unsplit) NumPy arrays as float32 tensors in one dataset
x_tensor = torch.from_numpy(x).to(torch.float32)
y_tensor = torch.from_numpy(y).to(torch.float32)
dataset = TensorDataset(x_tensor, y_tensor)

## random_split operates on a Dataset object: 80 samples for training, 20 for validation
train_dataset, val_dataset = random_split(dataset, lengths=[80, 20])

## one data loader per split: batches of 16 for training, one batch of 20 for validation
train_loader = DataLoader(train_dataset, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=20)

This is the updated training loop, which now includes validation.

In [None]:
# Per-batch training losses and per-batch validation losses
losses = []
val_losses = []
train_step = make_train_step(model, loss_fn, optimizer)

for epoch in range(n_epochs):
    # --- training pass: train_step puts the model in train mode itself ---
    for x_batch, y_batch in train_loader:
        # mini-batches come from the CPU; send them to the model's device
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        loss = train_step(x_batch, y_batch)
        losses.append(loss)

    # --- validation pass ---
    # Switch to eval mode once per epoch instead of once per validation batch
    model.eval()
    # torch no_grad makes sure that the nested-below computations happen without gradients,
    # since these are not needed for evaluation
    with torch.no_grad():
        # Dedicated loop names: reusing x_val / y_val here would overwrite the
        # NumPy validation arrays created in the data-generation cell
        for x_val_batch, y_val_batch in val_loader:
            x_val_batch = x_val_batch.to(device)
            y_val_batch = y_val_batch.to(device)

            yhat = model(x_val_batch)
            # PyTorch losses take (input, target): prediction first
            val_loss = loss_fn(yhat, y_val_batch)
            val_losses.append(val_loss.item())

print(model.state_dict())

OrderedDict([('linear.weight', tensor([[1.9505]])), ('linear.bias', tensor([1.0272]))])


In [None]:
# First recorded validation loss, i.e. the one appended during the first epoch
val_losses[0]

0.008780090138316154