# Dataset Transforms

Datasets and DataLoaders help manage, load, and preprocess data efficiently, making it possible to handle large datasets with limited memory by splitting the data into batches. Transforms allow for on-the-fly preprocessing, ensuring that data is normalized and in the right format for training the model, which leads to faster convergence. Neural networks, even in their simplest form, showcase the key steps in defining a model, training it, and evaluating it. The optimizer updates the model's weights to minimize the loss, allowing the model to improve its predictions over time.

In [70]:
import numpy as np
import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

In [71]:
class TabularDataset(Dataset):
    """Map-style dataset over an in-memory, indexable collection of samples.

    Args:
        data: Indexable collection that supports ``len()``; each element is
            one sample and is passed to ``transform`` as stored.
        transform: Optional callable applied to a sample on every access.
            ``None`` (the default) returns samples unchanged.
    """

    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform

    def __len__(self):
        """Return the number of samples in ``data``."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the sample at ``index``, transformed if a transform is set."""
        sample = self.data[index]
        # Compare against None instead of relying on truthiness: a callable
        # that defines __len__ or __bool__ can be falsy and would otherwise
        # be skipped without any warning.
        if self.transform is not None:
            sample = self.transform(sample)
        return sample

# Transform Functions

In [72]:
class ToTensor:
    """Transform turning a ``(features, label)`` sample into float32 tensors."""

    def __call__(self, sample):
        """Return a dict with ``'features'`` and ``'label'`` tensors.

        Only ``sample[0]`` and ``sample[1]`` are read; each is converted with
        ``torch.tensor(..., dtype=torch.float32)``.
        """
        named_parts = (('features', sample[0]), ('label', sample[1]))
        return {
            key: torch.tensor(value, dtype=torch.float32)
            for key, value in named_parts
        }

In [73]:
class Normalise:
    """Standardise a sample's feature vector to zero mean and unit spread.

    NOTE: the mean and standard deviation are computed over the sample's own
    feature values, not over the whole dataset. For a two-element feature
    vector with distinct values the result is therefore always -1 and +1.
    """

    def __call__(self, sample):
        """Return ``(normalised_features, label)`` for a ``(features, label)`` sample.

        Args:
            sample: Sequence whose first element is the feature vector
                (NumPy array or plain sequence of numbers) and whose second
                element is the label. The label is passed through untouched.

        Returns:
            Tuple of the normalised feature array and the original label.
        """
        features, label = sample[0], sample[1]
        # Accept plain lists/tuples too; list - float would raise TypeError.
        features = np.asarray(features)

        normalised_features = features - np.mean(features)
        spread = np.std(features)
        # A constant feature vector has zero spread; dividing would produce
        # NaN, so leave the (all-zero) centred values as they are.
        if spread > 0:
            normalised_features = normalised_features / spread

        return (normalised_features, label)

# Create Dummy Data

In [74]:
# Dummy dataset: 100 samples, each a (2-element feature array, scalar label)
# pair drawn with np.random.rand.
tabular_data = [
    (np.random.rand(2), np.random.rand())
    for _ in range(100)
]

# Preprocessing pipeline: normalise each sample first, then convert to tensors.
transform = transforms.Compose([
    Normalise(),
    ToTensor(),
])

In [75]:
# Wrap the raw samples so the transform pipeline runs on every item access.
dataset = TabularDataset(tabular_data, transform=transform)

# Create Dataloader

In [76]:
# Serve the dataset in shuffled mini-batches of 16 samples so that only one
# small chunk of the data is processed at a time.
dataloader = DataLoader(
    dataset=dataset,
    batch_size=16,
    shuffle=True,
)

# Create Neural Network Class

In [77]:
class SimpleNN(nn.Module):
    """Single linear layer mapping ``input_size`` features to one output."""

    def __init__(self, input_size):
        super().__init__()
        self.fc = nn.Linear(in_features=input_size, out_features=1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.fc(x)

In [78]:
# Linear model over the two input features, scored with mean squared error.
model = SimpleNN(input_size=2)
criterion = nn.MSELoss()
# Plain SGD adjusts the model's weights and biases.
optimiser = optim.SGD(params=model.parameters(), lr=0.1)

# Training Loop

In [79]:
num_epochs = 50

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0    # sum of per-sample losses seen this epoch
    samples_seen = 0

    for batch in dataloader:
        features, labels = batch['features'], batch['label']

        # Clear gradients left over from the previous step before the
        # forward pass.
        optimiser.zero_grad()
        outputs = model(features)

        # Labels arrive as a flat batch; reshape to a column to match the
        # model's (batch, 1) output.
        loss = criterion(outputs, labels.view(-1, 1))

        # Backward pass, then parameter update.
        loss.backward()
        optimiser.step()

        # The criterion reports a per-batch mean, and the final batch can be
        # smaller than the rest. Weight each batch by its size so the epoch
        # figure is a true per-sample mean, not a mean of batch means.
        batch_len = features.size(0)
        total_loss += loss.item() * batch_len
        samples_seen += batch_len

    average_loss = total_loss / samples_seen

    print(f'epoch: [{epoch+1}/{num_epochs}], loss: {average_loss}')

epoch: [1/50], loss: 0.6319631093314716
epoch: [2/50], loss: 0.09454137778707913
epoch: [3/50], loss: 0.0973713504416602
epoch: [4/50], loss: 0.08549400525433677
epoch: [5/50], loss: 0.0965449448142733
epoch: [6/50], loss: 0.08456701732107572
epoch: [7/50], loss: 0.08976713355098452
epoch: [8/50], loss: 0.09015543439558574
epoch: [9/50], loss: 0.09109764812248093
epoch: [10/50], loss: 0.09897306774343763
epoch: [11/50], loss: 0.09502991821084704
epoch: [12/50], loss: 0.09395231200116021
epoch: [13/50], loss: 0.09477764368057251
epoch: [14/50], loss: 0.09328674525022507
epoch: [15/50], loss: 0.09275693659271513
epoch: [16/50], loss: 0.09107466201697077
epoch: [17/50], loss: 0.09230842334883553
epoch: [18/50], loss: 0.08357284058417593
epoch: [19/50], loss: 0.0810104174805539
epoch: [20/50], loss: 0.09151668207986015
epoch: [21/50], loss: 0.08867836743593216
epoch: [22/50], loss: 0.0856957526079246
epoch: [23/50], loss: 0.10875747778585979
epoch: [24/50], loss: 0.08751359635165759
epoch:

# Evaluate the model

In [80]:
model.eval()

# NOTE: this iterates the same dataloader the training loop uses, so the
# printed figure is a training-set loss rather than a held-out estimate.
with torch.no_grad():
    total_loss = 0.0    # sum of per-sample losses
    samples_seen = 0

    for batch in dataloader:
        features, labels = batch['features'], batch['label']
        outputs = model(features)
        loss = criterion(outputs, labels.view(-1, 1))

        # The criterion reports a per-batch mean, and the final batch can be
        # smaller than the rest. Weight each batch by its size so the result
        # is a true per-sample mean, not a mean of batch means.
        batch_len = features.size(0)
        total_loss += loss.item() * batch_len
        samples_seen += batch_len

    average_loss = total_loss / samples_seen

    print(average_loss)

0.10247238831860679
