<a href="https://colab.research.google.com/github/cluePrints/fastai-v3-notes/blob/master/fastai3_part2_03_minibatch_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from fastai import datasets
import torch
import gzip, pickle
MNIST_URL='http://deeplearning.net/data/mnist/mnist.pkl'

def get_data():
  path = datasets.download_data(MNIST_URL, ext='.gz')
  with gzip.open(path) as f:
    ((train_X, train_Y), (valid_X, valid_Y), (test_X, test_Y)) = pickle.load(f, encoding='latin')

  train_X = torch.tensor(train_X)
  train_Y = torch.tensor(train_Y)
  valid_X = torch.tensor(valid_X)
  valid_Y = torch.tensor(valid_Y)
  return train_X, train_Y, valid_X, valid_Y

import operator

def test(a,b,cmp,cname=None):
    if cname is None: cname=cmp.__name__
    assert cmp(a,b),f"{cname}:\n{a}\n{b}"

def test_eq(a,b): test(a,b,operator.eq,'==')
def near(a,b): return torch.allclose(a, b, rtol=1e-3, atol=1e-5)
def test_near(a,b): test(a,b,near)

In [0]:
train_X, train_Y, valid_X, valid_Y = get_data()

In [3]:
train_X.shape, train_Y.shape

(torch.Size([50000, 784]), torch.Size([50000]))

In [4]:
n_classes = len(train_Y.unique())
n_classes

10

In [5]:
n_inputs = train_X.shape[-1]
n_inputs

784

In [6]:
from torch import nn
import torch.nn.functional as F
class Model(nn.Module):
  def __init__(self, n_in, n_hidden, n_out):
    super(Model, self).__init__()
    self.lin1 = nn.Linear(n_in, n_hidden)
    self.relu = nn.ReLU()
    self.lin2 = nn.Linear(n_hidden, n_out)
    self.layers = [self.lin1, self.relu, self.lin2]
    
  def __call__(self, x):
    result = x
    for layer in self.layers:
      result = layer(result)
    
    return result

n_hidden = 100
model = Model(n_inputs, n_hidden, n_classes)
preds = model(train_X)
preds.shape

torch.Size([50000, 10])

In [7]:
preds[:,train_Y].shape

torch.Size([50000, 50000])

In [8]:
preds[train_Y].shape

torch.Size([50000, 10])

In [9]:
preds[0,:], train_Y[0], preds[train_Y][0], preds[0,train_Y[0]]

(tensor([-0.0603, -0.0133,  0.1480, -0.0486,  0.1027,  0.0019, -0.0256,  0.0478,
         -0.0038,  0.0125], grad_fn=<SliceBackward>),
 tensor(5),
 tensor([-0.0559, -0.1007,  0.1279, -0.0858,  0.1533,  0.0603,  0.0099,  0.1177,
         -0.0433,  0.1035], grad_fn=<SelectBackward>),
 tensor(0.0019, grad_fn=<SelectBackward>))

In [0]:
def my_softmax(row):
  return row.exp()/row.exp().sum()

row = preds[train_Y][0]
test_near(my_softmax(row), row.softmax(dim=0))

In [0]:
def my_logsfotmax(row):
  # log(row.exp()/row.exp().sum()) = 
  # row.exp().log() - row.exp().sum().log()
  # row - row.exp().sum().log()
  # row - ((row-max).exp().sum() + max.exp()).log()       <-- Note to self: without this guy the below results differs like 10%
  return row - row.exp().sum(dim=-1, keepdim=True).log()

test_near(my_logsfotmax(row), row.log_softmax(dim=0))

In [0]:
def my_logsfotmax(matrix):
  # log(row.exp()/row.exp().sum()) = 
  # row.exp().log() - row.exp().sum().log()
  # row - row.exp().sum().log()
  # row - ((row-max).exp().sum() + max.exp()).log()
  rowsmax = matrix.max(dim=-1)
  rowsmax_vals = rowsmax[0]
  rowsmax_vals = rowsmax_vals[:,None]
  logsumexp = rowsmax_vals + (matrix-rowsmax_vals).exp().sum(dim=-1, keepdim=True).log()
  return matrix - logsumexp

test_near(my_logsfotmax(preds), preds.log_softmax(dim=-1))

In [0]:
def my_nll(lsmax, y):
  # Note to self: had to put range here, colon was a bad idea because it means broadcasting
  return -lsmax[range(len(y)), y].mean()

test_near(my_nll(my_logsfotmax(preds), train_Y),
          F.nll_loss(preds.log_softmax(dim=-1), train_Y))

In [0]:
test_near(my_nll(my_logsfotmax(preds), train_Y),
          F.cross_entropy(preds, train_Y))

In [15]:
model = Model(n_inputs, n_hidden, n_classes)

batch_size = 4096
# Note to self: be careful not to have last batch of size 0
n_batches = (len(train_X) - 1) // batch_size + 1
n_epochs = 1
lr = 0.8
for epoch in range(n_epochs):
  for batch_idx in range(n_batches):
    print(f"Training batch {batch_idx+1} of {n_batches}")
    start_idx = batch_size * batch_idx
    end_idx = batch_size * (batch_idx + 1)
    batch_x = train_X[start_idx:end_idx]
    batch_y = train_Y[start_idx:end_idx]
    preds = model(batch_x)
    loss = F.cross_entropy(preds, batch_y)
    loss.backward()

    print(f"  {loss.item()}")
    with torch.no_grad():
      model.lin1.weight -= model.lin1.weight.grad * lr
      model.lin1.bias   -= model.lin1.bias.grad * lr
      model.lin2.weight -= model.lin2.weight.grad * lr
      model.lin2.bias   -= model.lin2.bias.grad * lr
      model.lin1.weight.grad.zero_()
      model.lin2.weight.grad.zero_()
      model.lin1.bias.grad.zero_()
      model.lin2.bias.grad.zero_()


Training batch 1 of 13
  2.2996182441711426
Training batch 2 of 13
  2.17569637298584
Training batch 3 of 13
  2.011753559112549
Training batch 4 of 13
  1.8213253021240234
Training batch 5 of 13
  1.583311915397644
Training batch 6 of 13
  1.338706612586975
Training batch 7 of 13
  1.2961739301681519
Training batch 8 of 13
  1.875916600227356
Training batch 9 of 13
  1.8761314153671265
Training batch 10 of 13
  1.5657845735549927
Training batch 11 of 13
  1.4264986515045166
Training batch 12 of 13
  1.0599159002304077
Training batch 13 of 13
  1.070555329322815


In [16]:
model = Model(n_inputs, n_hidden, n_classes)

batch_size = 4096
n_batches = (len(train_X) - 1) // batch_size + 1
n_epochs = 1
lr = 0.8
for epoch in range(n_epochs):
  for batch_idx in range(n_batches):
    print(f"Training batch {batch_idx+1} of {n_batches}")
    start_idx = batch_size * batch_idx
    end_idx = batch_size * (batch_idx + 1)
    batch_x = train_X[start_idx:end_idx]
    batch_y = train_Y[start_idx:end_idx]
    preds = model(batch_x)
    loss = F.cross_entropy(preds, batch_y)
    loss.backward()

    print(f"  {loss.item()}")
    with torch.no_grad():
      for layer in model.layers:
        if not isinstance(layer, nn.Linear):
          continue
        
      layer.weight -= layer.weight.grad * lr
      layer.bias   -= layer.bias.grad * lr

      layer.bias.grad.zero_()
      layer.bias.grad.zero_()

Training batch 1 of 13
  2.3082618713378906
Training batch 2 of 13
  2.2934176921844482
Training batch 3 of 13
  2.266172409057617
Training batch 4 of 13
  2.22622013092041
Training batch 5 of 13
  2.1741018295288086
Training batch 6 of 13
  2.1078577041625977
Training batch 7 of 13
  2.036240816116333
Training batch 8 of 13
  1.9903026819229126
Training batch 9 of 13
  1.884812593460083
Training batch 10 of 13
  1.797560691833496
Training batch 11 of 13
  1.7426574230194092
Training batch 12 of 13
  1.6193019151687622
Training batch 13 of 13
  1.601128339767456


In [17]:
class Model(nn.Module):
  def __init__(self, n_in, n_hidden, n_out):
    super(Model, self).__init__()
    self.lin1 = nn.Linear(n_in, n_hidden)
    self.relu = nn.ReLU()
    self.lin2 = nn.Linear(n_hidden, n_out)
    self.layers = [self.lin1, self.relu, self.lin2]
    
  def __call__(self, x):
    result = x
    for layer in self.layers:
      result = layer(result)
    
    return result
  
  def params(self):
    for layer in self.layers:
      for param in layer.parameters():
        yield param;

model = Model(n_inputs, n_hidden, n_classes)

batch_size = 4096
n_batches = (len(train_X) - 1) // batch_size + 1
n_epochs = 1
lr = 0.8
for epoch in range(n_epochs):
  for batch_idx in range(n_batches):
    print(f"Training batch {batch_idx+1} of {n_batches}")
    start_idx = batch_size * batch_idx
    end_idx = batch_size * (batch_idx + 1)
    batch_x = train_X[start_idx:end_idx]
    batch_y = train_Y[start_idx:end_idx]
    preds = model(batch_x)
    loss = F.cross_entropy(preds, batch_y)
    loss.backward()

    print(f"  {loss.item()}")
    with torch.no_grad():
      for param in model.params():
        param.sub_(param.grad * lr)
      model.zero_grad()

Training batch 1 of 13
  2.3152432441711426
Training batch 2 of 13
  2.1860506534576416
Training batch 3 of 13
  2.0262773036956787
Training batch 4 of 13
  1.852264404296875
Training batch 5 of 13
  1.6146730184555054
Training batch 6 of 13
  1.3679393529891968
Training batch 7 of 13
  1.3271557092666626
Training batch 8 of 13
  2.006197214126587
Training batch 9 of 13
  1.8821645975112915
Training batch 10 of 13
  1.7415269613265991
Training batch 11 of 13
  1.4774370193481445
Training batch 12 of 13
  1.19278085231781
Training batch 13 of 13
  1.0323290824890137


In [18]:
# registration
class Model():
  def __init__(self, n_in, n_hidden, n_out):
    self._layers = []
    self._layers_by_name = {}
    self.lin1 = nn.Linear(n_in, n_hidden)
    self.relu = nn.ReLU()
    self.lin2 = nn.Linear(n_hidden, n_out)
    
  def __setattr__(self, name, val):
    super().__setattr__(name, val)
    if (name.startswith("_")):
      return

    self._layers.append(val)
  
  def __call__(self, x):
    result = x
    for layer in self._layers:
      result = layer(result)
    
    return result
  
  def params(self):
    for layer in self._layers:
      for param in layer.parameters():
        yield param;

model = Model(n_inputs, n_hidden, n_classes)

batch_size = 4096
n_batches = (len(train_X) - 1) // batch_size + 1
n_epochs = 1
lr = 0.8
for epoch in range(n_epochs):
  for batch_idx in range(n_batches):
    print(f"Training batch {batch_idx+1} of {n_batches}")
    start_idx = batch_size * batch_idx
    end_idx = batch_size * (batch_idx + 1)
    batch_x = train_X[start_idx:end_idx]
    batch_y = train_Y[start_idx:end_idx]
    preds = model(batch_x)
    loss = F.cross_entropy(preds, batch_y)
    loss.backward()

    print(f"  {loss.item()}")
    with torch.no_grad():
      for param in model.params():
        param.sub_(param.grad * lr)
        param.grad.zero_()

Training batch 1 of 13
  2.3169772624969482
Training batch 2 of 13
  2.194561243057251
Training batch 3 of 13
  2.040031909942627
Training batch 4 of 13
  1.862878680229187
Training batch 5 of 13
  1.6215133666992188
Training batch 6 of 13
  1.3944119215011597
Training batch 7 of 13
  1.3434137105941772
Training batch 8 of 13
  1.7931064367294312
Training batch 9 of 13
  1.5168453454971313
Training batch 10 of 13
  1.2937639951705933
Training batch 11 of 13
  1.1940127611160278
Training batch 12 of 13
  1.2923635244369507
Training batch 13 of 13
  1.4793000221252441


In [19]:
# torch registration
class Model(nn.Module):
  def __init__(self, n_in, n_hidden, n_out):
    super(Model, self).__init__()
    lin1 = nn.Linear(n_in, n_hidden)
    relu = nn.ReLU()
    lin2 = nn.Linear(n_hidden, n_out)
    layers = [lin1, relu, lin2]
    for idx, layer in enumerate(layers):
      self.add_module(f"{idx}", layer)
  
  def __call__(self, x):
    result = x
    for layer in self.children():
      result = layer(result)
    
    return result
  
  def params(self):
    for layer in self.children():
      for param in layer.parameters():
        yield param;

model = Model(n_inputs, n_hidden, n_classes)
model(train_X);
model

Model(
  (0): Linear(in_features=784, out_features=100, bias=True)
  (1): ReLU()
  (2): Linear(in_features=100, out_features=10, bias=True)
)

In [20]:
# the above is very much like nn.Sequential
model = nn.Sequential(
   nn.Linear(n_inputs, n_hidden),
   nn.ReLU(),
   nn.Linear(n_hidden, n_classes)
)
model

Sequential(
  (0): Linear(in_features=784, out_features=100, bias=True)
  (1): ReLU()
  (2): Linear(in_features=100, out_features=10, bias=True)
)

In [21]:
model = nn.Sequential(
   nn.Linear(n_inputs, n_hidden),
   nn.ReLU(),
   nn.Linear(n_hidden, n_classes)
)

batch_size = 4096
n_batches = (len(train_X) - 1) // batch_size + 1
n_epochs = 1
lr = 0.8
for epoch in range(n_epochs):
  for batch_idx in range(n_batches):
    start_idx = batch_size * batch_idx
    end_idx = batch_size * (batch_idx + 1)
    batch_x = train_X[start_idx:end_idx]
    batch_y = train_Y[start_idx:end_idx]
    preds = model(batch_x)
    loss = F.cross_entropy(preds, batch_y)
    loss.backward()

    print(f"Training batch {batch_idx+1} of {n_batches} --> {loss.item():.2f}")
    with torch.no_grad():
      for param in model.parameters():
        param.sub_(param.grad * lr)
        param.grad.zero_()

Training batch 1 of 13 --> 2.33
Training batch 2 of 13 --> 2.20
Training batch 3 of 13 --> 2.06
Training batch 4 of 13 --> 1.89
Training batch 5 of 13 --> 1.65
Training batch 6 of 13 --> 1.40
Training batch 7 of 13 --> 1.25
Training batch 8 of 13 --> 1.51
Training batch 9 of 13 --> 1.92
Training batch 10 of 13 --> 1.98
Training batch 11 of 13 --> 1.81
Training batch 12 of 13 --> 1.30
Training batch 13 of 13 --> 1.09


In [22]:
# using optim
from torch import optim

model = nn.Sequential(
   nn.Linear(n_inputs, n_hidden),
   nn.ReLU(),
   nn.Linear(n_hidden, n_classes)
)

lr = 0.8
opt = optim.SGD(params = model.parameters(), lr = lr)
batch_size = 4096
n_batches = (len(train_X) - 1) // batch_size + 1
n_epochs = 1

for epoch in range(n_epochs):
  for batch_idx in range(n_batches):
    start_idx = batch_size * batch_idx
    end_idx = batch_size * (batch_idx + 1)
    batch_x = train_X[start_idx:end_idx]
    batch_y = train_Y[start_idx:end_idx]
    preds = model(batch_x)
    loss = F.cross_entropy(preds, batch_y)
    loss.backward()

    opt.step()
    opt.zero_grad()
    print(f"Training batch {batch_idx+1} of {n_batches} --> {loss.item():.2f}")

Training batch 1 of 13 --> 2.30
Training batch 2 of 13 --> 2.17
Training batch 3 of 13 --> 2.00
Training batch 4 of 13 --> 1.81
Training batch 5 of 13 --> 1.57
Training batch 6 of 13 --> 1.38
Training batch 7 of 13 --> 1.57
Training batch 8 of 13 --> 1.69
Training batch 9 of 13 --> 1.23
Training batch 10 of 13 --> 0.94
Training batch 11 of 13 --> 0.86
Training batch 12 of 13 --> 0.96
Training batch 13 of 13 --> 1.68


In [23]:
[param.shape for param in model.parameters()]

[torch.Size([100, 784]),
 torch.Size([100]),
 torch.Size([10, 100]),
 torch.Size([10])]

In [24]:
class Dataset():
  def __init__(self, x, target):
    self.x = x
    self.target = target
    
  def __getitem__(self, idx):
    return self.x[idx], self.target[idx]
  
  def __len__(self):
    return len(self.target)

dataset = Dataset(train_X, train_Y)
x,y = dataset[1:2]
x.shape, y.shape

(torch.Size([1, 784]), torch.Size([1]))

In [25]:
dataset = Dataset(train_X, train_Y)

model = nn.Sequential(
   nn.Linear(n_inputs, n_hidden),
   nn.ReLU(),
   nn.Linear(n_hidden, n_classes)
)

lr = 0.8
opt = optim.SGD(params = model.parameters(), lr = lr)
batch_size = 4096
n_batches = (len(dataset) - 1) // batch_size + 1
n_epochs = 1

for epoch in range(n_epochs):
  for batch_idx in range(n_batches):
    start_idx = batch_size * batch_idx
    end_idx = batch_size * (batch_idx + 1)
    batch_x, batch_y = dataset[start_idx:end_idx]
    preds = model(batch_x)
    loss = F.cross_entropy(preds, batch_y)
    loss.backward()

    opt.step()
    opt.zero_grad()
    print(f"Training batch {batch_idx+1} of {n_batches} --> {loss.item():.2f}")

Training batch 1 of 13 --> 2.30
Training batch 2 of 13 --> 2.20
Training batch 3 of 13 --> 2.05
Training batch 4 of 13 --> 1.88
Training batch 5 of 13 --> 1.64
Training batch 6 of 13 --> 1.39
Training batch 7 of 13 --> 1.30
Training batch 8 of 13 --> 1.82
Training batch 9 of 13 --> 1.71
Training batch 10 of 13 --> 1.34
Training batch 11 of 13 --> 1.10
Training batch 12 of 13 --> 1.11
Training batch 13 of 13 --> 1.29


In [26]:
class Dataloader():
  def __init__(self, dataset, batch_size=100):
    self.dataset = dataset
    self.batch_size = batch_size
    
  def __iter__(self):
    for batch_idx in range(n_batches):
      start_idx = batch_size * batch_idx
      end_idx = batch_size * (batch_idx + 1)
      batch_x, batch_y = dataset[start_idx:end_idx]
      yield (batch_x, batch_y)
      
dataloader = Dataloader(dataset)
batch_x, batch_y = next(iter(dataloader))
batch_x.shape, batch_y.shape

(torch.Size([4096, 784]), torch.Size([4096]))

In [27]:
from torch import tensor
tensor([[0, 1, 1],[1,0,0]]).argmax(dim=1)

tensor([2, 0])

In [28]:
tensor([[0, 1, 1.5],[13,0,0]])[range(2),tensor([2, 0])]

tensor([ 1.5000, 13.0000])

In [0]:
def accuracy(actual, expected):
  idx = actual.argmax(dim=1)
  return (idx == expected).float().mean()

from torch import tensor
test_eq(accuracy(tensor([[0, 1, 0.1]]), tensor([1])), 1)
test_eq(accuracy(tensor([[0, 0.3, 0.7]]), tensor([2])), 1)
test_eq(accuracy(tensor([[0, 0.3, 0]]), tensor([2])), 0)

In [30]:
tensor([[0, 1, 1],[1,0,0]]).argmax(dim=1)

tensor([2, 0])

In [31]:
dataset = Dataset(train_X, train_Y)
dataloader = Dataloader(dataset)

model = nn.Sequential(
   nn.Linear(n_inputs, n_hidden),
   nn.ReLU(),
   nn.Linear(n_hidden, n_classes)
)

lr = 0.5
opt = optim.SGD(params = model.parameters(), lr = lr)
batch_size = 4096
n_batches = (len(dataset) - 1) // batch_size + 1
n_epochs = 1

for epoch in range(n_epochs):
  for (batch_x, batch_y) in dataloader:
    preds = model(batch_x)
    loss = F.cross_entropy(preds, batch_y)
    loss.backward()

    opt.step()
    opt.zero_grad()
    print(f"Training batch {batch_idx+1} of {n_batches} --> {loss.item():.3f}")
    
  t_acc = accuracy(model(train_X), train_Y)
  v_acc = accuracy(model(valid_X), valid_Y)
  print(f"Accuracy train: {t_acc:.3f}, validation: {v_acc:.3f}")

Training batch 13 of 13 --> 2.314
Training batch 13 of 13 --> 2.229
Training batch 13 of 13 --> 2.134
Training batch 13 of 13 --> 2.038
Training batch 13 of 13 --> 1.910
Training batch 13 of 13 --> 1.754
Training batch 13 of 13 --> 1.593
Training batch 13 of 13 --> 1.494
Training batch 13 of 13 --> 1.301
Training batch 13 of 13 --> 1.177
Training batch 13 of 13 --> 1.104
Training batch 13 of 13 --> 0.985
Training batch 13 of 13 --> 1.012
Accuracy train: 0.655, validation: 0.670


In [32]:
class Sampler():
  def __init__(self, dataset, batch_size = 64, shuffle=False):
    self.dataset = dataset
    self.len = len(dataset)
    self.shuffle = shuffle
    self.batch_size = batch_size
    
  def __iter__(self):
    self.idxs = torch.randperm(self.len) if self.shuffle else torch.arange(self.len)
    for i in range(0, self.len, self.batch_size): yield self.idxs[i:i+self.batch_size]

s = Sampler(Dataset(train_X[:5], train_Y[:5]), 3, False)
[o for o in s]

[tensor([0, 1, 2]), tensor([3, 4])]

In [33]:
s = Sampler(Dataset(train_X[:5], train_Y[:5]), 3, False)
[o for o in s]

[tensor([0, 1, 2]), tensor([3, 4])]

In [34]:
s = Sampler(Dataset(train_X[:5], train_Y[:5]), 3, True)
[o for o in s]

[tensor([3, 0, 2]), tensor([1, 4])]

In [35]:
s = Sampler(Dataset(train_X[:5], train_Y[:5]), 3, False)
[o for o in s]

[tensor([0, 1, 2]), tensor([3, 4])]

In [36]:
class Dataloader():
  def __init__(self, dataset, sampler, batch_size=100):
    self.dataset = dataset
    self.batch_size = batch_size
    self.sampler = sampler
    
  def __iter__(self):
    for batch_idx in self.sampler:
      batch_x, batch_y = self.dataset[batch_idx]
      yield (batch_x, batch_y)

dataset = Dataset(train_X, train_Y)
sampler = Sampler(dataset, shuffle = True)
dataloader = Dataloader(dataset, sampler)

x,y = next(iter(dataloader))
x.shape, y.shape

(torch.Size([64, 784]), torch.Size([64]))

In [37]:
dataset = Dataset(train_X, train_Y)
sampler = Sampler(dataset, shuffle = True, batch_size=batch_size)
dataloader = Dataloader(dataset, sampler, batch_size=batch_size)

model = nn.Sequential(
   nn.Linear(n_inputs, n_hidden),
   nn.ReLU(),
   nn.Linear(n_hidden, n_classes)
)

lr = 0.5
opt = optim.SGD(params = model.parameters(), lr = lr)
n_batches = (len(dataset) - 1) // batch_size + 1
n_epochs = 1

for epoch in range(n_epochs):
  for idx, (batch_x, batch_y) in enumerate(dataloader):
    preds = model(batch_x)
    loss = F.cross_entropy(preds, batch_y)
    loss.backward()

    opt.step()
    opt.zero_grad()
    print(f"Training batch {idx+1}. Length {len(batch_x)} --> {loss.item():.3f}")
    
  t_acc = accuracy(model(train_X), train_Y)
  v_acc = accuracy(model(valid_X), valid_Y)
  print(f"Accuracy train: {t_acc:.3f}, validation: {v_acc:.3f}")

Training batch 1. Length 4096 --> 2.315
Training batch 2. Length 4096 --> 2.242
Training batch 3. Length 4096 --> 2.163
Training batch 4. Length 4096 --> 2.074
Training batch 5. Length 4096 --> 1.962
Training batch 6. Length 4096 --> 1.826
Training batch 7. Length 4096 --> 1.667
Training batch 8. Length 4096 --> 1.530
Training batch 9. Length 4096 --> 1.375
Training batch 10. Length 4096 --> 1.242
Training batch 11. Length 4096 --> 1.123
Training batch 12. Length 4096 --> 1.067
Training batch 13. Length 848 --> 1.076
Accuracy train: 0.615, validation: 0.628


In [38]:
batch_size = 64
dataset = Dataset(train_X, train_Y)
sampler = Sampler(dataset, shuffle = True, batch_size=batch_size)
dataloader = Dataloader(dataset, sampler, batch_size=batch_size)

v_dataset = Dataset(valid_X, valid_Y)
v_sampler = Sampler(v_dataset, shuffle = False, batch_size=batch_size)
v_dataloader = Dataloader(v_dataset, v_sampler, batch_size=batch_size)

n_hidden = 50
model = nn.Sequential(
   nn.Linear(n_inputs, n_hidden),
   nn.ReLU(),
   nn.Linear(n_hidden, n_classes)
)

lr = 0.5
opt = optim.SGD(params = model.parameters(), lr = lr)
n_batches = (len(dataset) - 1) // batch_size + 1
n_epochs = 5

for epoch in range(n_epochs):
  model.train()
  for (batch_x, batch_y) in dataloader:
    preds = model(batch_x)
    loss = F.cross_entropy(preds, batch_y)
    loss.backward()

    opt.step()
    opt.zero_grad()
  
  model.eval()
  accuracy_exp_avg = 0
  loss_exp_avg = 0
  for idx, (v_batch_x, v_batch_y) in enumerate(v_dataloader):
    with torch.no_grad():
      v_preds = model(v_batch_x)
      loss = F.cross_entropy(v_preds, v_batch_y)
      avg_coeff = 0.5
      accuracy_exp_avg = accuracy_exp_avg * avg_coeff + accuracy(v_preds, v_batch_y) * (1 - avg_coeff)
      loss_exp_avg     = loss_exp_avg * avg_coeff + loss * (1 - avg_coeff)
 
  print(f"Epoch {epoch}/{n_epochs}. Validation metrics. Loss: {loss_exp_avg:.3f}, accuracy: {accuracy_exp_avg:.3f}")
  
model.eval()

t_preds = model(train_X)
v_preds = model(valid_X)
t_acc = accuracy(t_preds, train_Y)
v_acc = accuracy(v_preds, valid_Y)
t_loss = F.cross_entropy(t_preds, train_Y)
v_loss = F.cross_entropy(v_preds, valid_Y)
print(f"Accuracy train: {t_acc:.3f} (loss: {t_loss:.3f}), validation: {v_acc:.3f} (loss: {v_loss:.3f})")

Epoch 0/5. Validation metrics. Loss: 0.166, accuracy: 0.964
Epoch 1/5. Validation metrics. Loss: 0.302, accuracy: 0.928
Epoch 2/5. Validation metrics. Loss: 0.092, accuracy: 0.986
Epoch 3/5. Validation metrics. Loss: 0.109, accuracy: 0.986
Epoch 4/5. Validation metrics. Loss: 0.094, accuracy: 0.986
Accuracy train: 0.960 (loss: 0.131), validation: 0.955 (loss: 0.168)


In [39]:
len(v_dataset)

10000

In [0]:
# get data
# lin-relu-lin
# log_softmax 
# nll
# logsumexp in log_softmax
# basica training loop, epochs, batches, manual changes to weights
# iterate over params
# registering the modules
# nn.Sequential
# optimizer
# grad accumulation cleanup
# dataset & dataloader
# random sampling
# validation

In [0]:
torch.utils.data.TensorDataset??