In [1]:
# Batch size:
#    - [VI] https://stats.stackexchange.com/questions/153531/what-is-batch-size-in-neural-network
#    - Optimizing the model (gradient calculation) on the whole dataset at once can be very time-consuming (and take too much memory).
#    - A better way for large datasets is to divide the samples into smaller batches.

In [2]:
'''
epoch = 1 forward and backward pass of ALL training samples
batch_size = number of training samples in one forward & backward pass
num_iterations = number of passes, each pass using [batch_size] number of samples

e.g. 100 samples, batch_size=20. Then, 100/20 = 5 iterations for 1 epoch
'''

'\nepoch = 1 forward and backward pass of ALL training samples\nbatch_size = number of training samples in one forward & backward pass\nnum_iterations = number of passes, each pass using [batch_size] number of samples\n\ne.g. 100 samples, batch_size=20. Then, 100/20 = 5 iterations for 1 epoch\n'

In [3]:
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
import numpy as np
import math

In [4]:
class WineDataset(Dataset):
    """Map-style dataset backed by a comma-separated file with one header row.

    Column 0 of every data row is used as the label; all remaining columns
    are used as the features. The whole file is loaded into memory as
    float32 when the dataset is constructed.

    Args:
        csv_path: Path of the CSV file to load. Defaults to './wine.csv'.
    """

    def __init__(self, csv_path='./wine.csv'):
        # Data loading. 'skiprows=1' skips the header row; 'ndmin=2' keeps the
        # array two-dimensional even when the file holds a single data row
        # (otherwise loadtxt returns a 1-D array and the column slicing below fails).
        xy = np.loadtxt(csv_path, delimiter=",", dtype=np.float32, skiprows=1, ndmin=2)
        self.x = torch.from_numpy(xy[:, 1:])   # every column except the first -> [n_samples, n_features]
        self.y = torch.from_numpy(xy[:, [0]])  # first column, kept 2-D -> [n_samples, 1]
        self.n_samples = xy.shape[0]           # number of rows

    def __getitem__(self, index):
        """Return the (features, label) pair stored at ``index``."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of samples (data rows) in the dataset."""
        return self.n_samples

In [5]:
# Build the dataset and inspect its first sample.
dataset = WineDataset()
first_data = dataset[0]  # indexing goes through WineDataset.__getitem__ and yields a (features, labels) pair
features, labels = first_data[0], first_data[1]
print(features, labels)

tensor([1.4230e+01, 1.7100e+00, 2.4300e+00, 1.5600e+01, 1.2700e+02, 2.8000e+00,
        3.0600e+00, 2.8000e-01, 2.2900e+00, 5.6400e+00, 1.0400e+00, 3.9200e+00,
        1.0650e+03]) tensor([1.])


In [14]:
batch_size = 4

# Wrap the dataset in a DataLoader that serves shuffled mini-batches.
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
dataiterator = iter(dataloader)  # https://www.w3schools.com/python/ref_func_iter.asp
features, labels = next(dataiterator)
# One batch of `batch_size` (4) samples; with shuffle=True these are not necessarily the first 4 rows.
print(features, labels)

tensor([[1.2080e+01, 1.1300e+00, 2.5100e+00, 2.4000e+01, 7.8000e+01, 2.0000e+00,
         1.5800e+00, 4.0000e-01, 1.4000e+00, 2.2000e+00, 1.3100e+00, 2.7200e+00,
         6.3000e+02],
        [1.2290e+01, 2.8300e+00, 2.2200e+00, 1.8000e+01, 8.8000e+01, 2.4500e+00,
         2.2500e+00, 2.5000e-01, 1.9900e+00, 2.1500e+00, 1.1500e+00, 3.3000e+00,
         2.9000e+02],
        [1.3070e+01, 1.5000e+00, 2.1000e+00, 1.5500e+01, 9.8000e+01, 2.4000e+00,
         2.6400e+00, 2.8000e-01, 1.3700e+00, 3.7000e+00, 1.1800e+00, 2.6900e+00,
         1.0200e+03],
        [1.3490e+01, 3.5900e+00, 2.1900e+00, 1.9500e+01, 8.8000e+01, 1.6200e+00,
         4.8000e-01, 5.8000e-01, 8.8000e-01, 5.7000e+00, 8.1000e-01, 1.8200e+00,
         5.8000e+02]]) tensor([[2.],
        [2.],
        [1.],
        [3.]])


In [15]:
# Training loop
num_epochs = 2
total_n_samples = len(dataset)
n_iterations = math.ceil(total_n_samples / batch_size)  # 178 / 4
print(total_n_samples, n_iterations)

178 45


In [16]:
for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(dataloader):
        # forward pass, backward pass, update (placeholder: would go here, before the logging guard)

        # Only report progress on every 5th step.
        if (i+1) % 5 != 0:
            continue
        print(f'epoch: {epoch+1}/{num_epochs}, step: {i+1}/{n_iterations}, inputs: {inputs.shape}')


epoch: 1/2, step: 5/45, inputs: torch.Size([4, 13])
epoch: 1/2, step: 10/45, inputs: torch.Size([4, 13])
epoch: 1/2, step: 15/45, inputs: torch.Size([4, 13])
epoch: 1/2, step: 20/45, inputs: torch.Size([4, 13])
epoch: 1/2, step: 25/45, inputs: torch.Size([4, 13])
epoch: 1/2, step: 30/45, inputs: torch.Size([4, 13])
epoch: 1/2, step: 35/45, inputs: torch.Size([4, 13])
epoch: 1/2, step: 40/45, inputs: torch.Size([4, 13])
epoch: 1/2, step: 45/45, inputs: torch.Size([2, 13])
epoch: 2/2, step: 5/45, inputs: torch.Size([4, 13])
epoch: 2/2, step: 10/45, inputs: torch.Size([4, 13])
epoch: 2/2, step: 15/45, inputs: torch.Size([4, 13])
epoch: 2/2, step: 20/45, inputs: torch.Size([4, 13])
epoch: 2/2, step: 25/45, inputs: torch.Size([4, 13])
epoch: 2/2, step: 30/45, inputs: torch.Size([4, 13])
epoch: 2/2, step: 35/45, inputs: torch.Size([4, 13])
epoch: 2/2, step: 40/45, inputs: torch.Size([4, 13])
epoch: 2/2, step: 45/45, inputs: torch.Size([2, 13])


In [None]:
# torchvision.datasets.MNIST() # Loads the famous MNIST dataset (we will use it in a future tutorial)
# Other famous datasets include Fashion-MNIST, CIFAR, COCO, etc.