### 32 Training Loop Run Builder

In [1]:
from collections import OrderedDict, namedtuple
from itertools import product

In [3]:
class RunBuilder:
    @staticmethod
    def get_runs(params):

        Run = namedtuple('Run', params.keys())

        runs = []
        for v in product(*params.values()):
            runs.append(Run(*v))

        return runs

In [4]:
params = OrderedDict(
    lr = [0.01, 0.001],
    batch_size = [32, 64]
)

In [None]:
params.keys()

odict_keys(['lr', 'batch_size'])

In [5]:
runs = RunBuilder.get_runs(params)
runs

[Run(lr=0.01, batch_size=32),
 Run(lr=0.01, batch_size=64),
 Run(lr=0.001, batch_size=32),
 Run(lr=0.001, batch_size=64)]

In [7]:
runs[0]

Run(lr=0.01, batch_size=32)

In [8]:
print(runs[0].lr, runs[0].batch_size)

0.01 32


In [9]:
for run in runs:
    print(run.lr, run.batch_size)

0.01 32
0.01 64
0.001 32
0.001 64


#### Two parameters

In [10]:
params = OrderedDict(
    lr = [0.01, 0.001],
    batch_size = [32, 64],

)
runs = RunBuilder.get_runs(params)
runs

[Run(lr=0.01, batch_size=32),
 Run(lr=0.01, batch_size=64),
 Run(lr=0.001, batch_size=32),
 Run(lr=0.001, batch_size=64)]

In [13]:
params = OrderedDict(
    lr = [0.01, 0.001],
    batch_size = [32, 64],
    device = ['cpu', 'cuda']
)

runs = RunBuilder.get_runs(params)
runs

[Run(lr=0.01, batch_size=32, device='cpu'),
 Run(lr=0.01, batch_size=32, device='cuda'),
 Run(lr=0.01, batch_size=64, device='cpu'),
 Run(lr=0.01, batch_size=64, device='cuda'),
 Run(lr=0.001, batch_size=32, device='cpu'),
 Run(lr=0.001, batch_size=32, device='cuda'),
 Run(lr=0.001, batch_size=64, device='cpu'),
 Run(lr=0.001, batch_size=64, device='cuda')]

In [14]:
params.keys()

odict_keys(['lr', 'batch_size', 'device'])

In [15]:
params.values()

odict_values([[0.01, 0.001], [32, 64], ['cpu', 'cuda']])

In [16]:
Run = namedtuple('Run', params.keys())
Run

__main__.Run

In [17]:
runs = []
for v in product(*params.values()):
    runs.append(Run(*v))
runs

[Run(lr=0.01, batch_size=32, device='cpu'),
 Run(lr=0.01, batch_size=32, device='cuda'),
 Run(lr=0.01, batch_size=64, device='cpu'),
 Run(lr=0.01, batch_size=64, device='cuda'),
 Run(lr=0.001, batch_size=32, device='cpu'),
 Run(lr=0.001, batch_size=32, device='cuda'),
 Run(lr=0.001, batch_size=64, device='cpu'),
 Run(lr=0.001, batch_size=64, device='cuda')]

In [18]:
for fun in RunBuilder.get_runs(params):
    comment = f'-{run}'

    # Training process given the set of parameters
    print(comment)

-Run(lr=0.001, batch_size=64)
-Run(lr=0.001, batch_size=64)
-Run(lr=0.001, batch_size=64)
-Run(lr=0.001, batch_size=64)
-Run(lr=0.001, batch_size=64)
-Run(lr=0.001, batch_size=64)
-Run(lr=0.001, batch_size=64)
-Run(lr=0.001, batch_size=64)


### 33 CNN Training Loop Refactoring - Simultaneous Hyperparameter Testing

In [26]:
import time
import torchvision
import pandas as pd
import torch
import json
from torch.utils.tensorboard import SummaryWriter

from IPython.display import display, clear_output

class RunManager:
    def __init__(self):
        self.epoch_count = 0
        self.epoch_loss = 0
        self.epoch_num_correct = 0
        self.epoch_start_time = None

        self.run_params = None
        self.run_count = 0
        self.run_data = []
        self.run_start_time = None

        self.model = None
        self.loader = None
        self.tb = None

    def begin_run(self, hyper_params, model, loader):
        self.run_start_time = time.time()
        self.run_params = hyper_params
        self.run_count += 1

        self.model = model
        self.loader = loader
        self.tb = SummaryWriter(comment=f'-{hyper_params}')

        images, labels = next(iter(self.loader))
        grid = torchvision.utils.make_grid(images)
        
        self.tb.add_image('images', grid)
        self.tb.add_graph(self.model, images)

    def end_run(self):
        self.tb.close()
        self.epoch_count = 0

    def begin_epoch(self, epoch_no):
        self.epoch_start_time = time.time()

        self.epoch_count += 1
        self.epoch_loss = 0
        self.epoch_num_correct = 0

        # print(f"Epoch {epoch_no} started ...", end=" ")

    def end_epoch(self):
        epoch_duration = time.time() - self.epoch_start_time
        run_duration = time.time() - self.run_start_time

        loss = self.epoch_loss / len(self.loader.dataset)
        accuracy = self.epoch_num_correct / len(self.loader.dataset)

        self.tb.add_scalar('Loss', loss, self.epoch_count)
        self.tb.add_scalar('Accuracy', accuracy, self.epoch_count)

        for name, param in self.model.named_parameters():
            self.tb.add_histogram(name, param, self.epoch_count)
            self.tb.add_histogram(f'{name}.grad', param.grad, self.epoch_count)

        results = OrderedDict()
        results["run"] = self.run_count
        results["epoch"] = self.epoch_count
        results["loss"] = loss
        results["accuracy"] = accuracy
        results["epoch_duration"] = epoch_duration
        results["run duration"] = run_duration

        for k, v in self.run_params._asdict().items():
            results[k] = v

        self.run_data.append(results)
        df = pd.DataFrame.from_dict(self.run_data, orient='columns')

        clear_output(wait=True)
        display(df)
        # print("Ended")

    def track_loss(self, loss):
        self.epoch_loss += loss.item() * self.loader.batch_size

    def track_num_correct(self, preds, labels):
        self.epoch_num_correct += self._get_num_correct(preds, labels)

    @torch.no_grad()
    def _get_num_correct(self, preds, labels):
        return preds.argmax(dim=1).eq(labels).sum().item()

    def save(self, fileName):
        pd.DataFrame.from_dict(self.run_data, orient='columns', ).to_csv(f'{fileName}.csv')
        with open(f'{fileName}.json', 'w', encoding='utf-8') as f:
            json.dump(self.run_data, f, ensure_ascii=False, indent=4)

In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchvision
import torchvision.transforms as transform

torch.set_printoptions(linewidth=120)
torch.set_grad_enabled(True)

from torch.utils.tensorboard import SummaryWriter # <-- new

print(torch.__version__)
print(torchvision.__version__)

class Network(nn.Module):
    def __init__(self, channels=1): # default grayscale
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=channels, out_channels=6, kernel_size=5) 
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)
        
        self.fc1 = nn.Linear(in_features=12*4*4, out_features=120) # ((28-5+1)/2 -5 +1)/2 = 4
        self.fc2 = nn.Linear(in_features=120, out_features=60)
        self.out = nn.Linear(in_features=60, out_features=10)
        
    def forward(self, t):
        t = F.relu(self.conv1(t))
        t = F.max_pool2d(t, (2, 2), stride=2)

        t = F.relu(self.conv2(t))
        t = F.max_pool2d(t, (2, 2), stride=2)

        t = t.reshape(-1, 12*4*4)
        t = F.relu(self.fc1(t))

        t = F.relu(self.fc2(t))

        t = self.out(t)
        
        return t


2.5.1+cu124
0.20.1+cu124


In [24]:
train_set = torchvision.datasets.FashionMNIST(
    root='./data/FashionMNIST',
    download=True,
    transform=transform.Compose([
        transform.ToTensor()
    ]))

# train_loader = torch.utils.data.DataLoader(train_set, batch_size=100, shuffle=True) 

In [25]:
from torch.utils.data import DataLoader

params = OrderedDict(
    lr = [.01],
    batch_size = [1000, 2000],
)

m = RunManager()

for run in RunBuilder.get_runs(params):

    network = Network()
    loader = DataLoader(train_set, batch_size=run.batch_size) 
    optimizer = optim.Adam(network.parameters(), lr=run.lr)

    m.begin_run(run, network, loader)
    for epoch in range(5):
        m.begin_epoch(epoch)

        for batch in loader:
            images, labels = batch

            preds = network(images)
            loss = F.cross_entropy(preds, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            m.track_loss(loss)
            m.track_num_correct(preds, labels)

        m.end_epoch()
    m.end_run()

m.save('results')


Unnamed: 0,run,epoch,loss,accuracy,epoch_duration,run duration,lr,batch_size
0,1,1,0.942568,0.64145,8.328573,8.866978,0.01,1000
1,1,2,0.479108,0.822633,8.365075,17.380928,0.01,1000
2,1,3,0.394044,0.856033,8.501794,26.004044,0.01,1000
3,1,4,0.357565,0.869117,8.453928,34.573614,0.01,1000
4,1,5,0.336215,0.87705,8.282996,42.974124,0.01,1000
5,2,1,1.201559,0.5446,8.301674,9.384692,0.01,2000
6,2,2,0.60349,0.764267,8.401149,17.909029,0.01,2000
7,2,3,0.508192,0.80295,8.356733,26.389409,0.01,2000
8,2,4,0.4593,0.828233,8.414301,34.92125,0.01,2000
9,2,5,0.421466,0.844983,8.332051,43.375174,0.01,2000


Ended


In [27]:
params = OrderedDict(
    lr = [.01],
    batch_size = [1000, 2000],
    shuffle = [True, False]
)

m = RunManager()

for run in RunBuilder.get_runs(params):

    network = Network()
    loader = DataLoader(train_set, batch_size=run.batch_size, shuffle=run.shuffle) 
    optimizer = optim.Adam(network.parameters(), lr=run.lr)

    m.begin_run(run, network, loader)
    for epoch in range(5):
        m.begin_epoch(epoch)

        for batch in loader:
            images, labels = batch

            preds = network(images)
            loss = F.cross_entropy(preds, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            m.track_loss(loss)
            m.track_num_correct(preds, labels)

        m.end_epoch()
    m.end_run()

m.save('results')

Unnamed: 0,run,epoch,loss,accuracy,epoch_duration,run duration,lr,batch_size,shuffle
0,1,1,1.017841,0.61295,7.645893,8.197881,0.01,1000,True
1,1,2,0.521381,0.797317,7.493823,15.809482,0.01,1000,True
2,1,3,0.420959,0.8469,7.524096,23.443556,0.01,1000,True
3,1,4,0.376354,0.861817,7.458082,31.018337,0.01,1000,True
4,1,5,0.343932,0.872367,7.551965,38.690419,0.01,1000,True
5,2,1,0.901488,0.661833,8.522249,9.061235,0.01,1000,False
6,2,2,0.465672,0.826367,8.366937,17.577125,0.01,1000,False
7,2,3,0.386732,0.85775,8.584581,26.274714,0.01,1000,False
8,2,4,0.351575,0.869317,8.387501,34.794836,0.01,1000,False
9,2,5,0.328205,0.8788,8.31312,43.234395,0.01,1000,False


### 34 DataLoader num_workers - Speed Limit Increase

In [28]:
params = OrderedDict(
    lr = [.01],
    batch_size = [1000, 2000],
    num_workers = [0, 1, 2, 4, 8]
    # shuffle = [True, False]
)

m = RunManager()

for run in RunBuilder.get_runs(params):

    network = Network()
    loader = DataLoader(train_set, batch_size=run.batch_size, num_workers=run.num_workers) 
    optimizer = optim.Adam(network.parameters(), lr=run.lr)

    m.begin_run(run, network, loader)
    for epoch in range(1):
        m.begin_epoch(epoch)

        for batch in loader:
            images, labels = batch

            preds = network(images)
            loss = F.cross_entropy(preds, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            m.track_loss(loss)
            m.track_num_correct(preds, labels)

        m.end_epoch()
    m.end_run()

m.save('results')

Unnamed: 0,run,epoch,loss,accuracy,epoch_duration,run duration,lr,batch_size,num_workers
0,1,1,1.019271,0.6058,7.442017,7.997392,0.01,1000,0
1,2,1,0.944378,0.642233,6.439095,7.192991,0.01,1000,1
2,3,1,1.011829,0.607883,3.713509,4.448341,0.01,1000,2
3,4,1,1.043159,0.589583,4.108103,4.917751,0.01,1000,4
4,5,1,1.057763,0.593317,4.104699,5.086607,0.01,1000,8
5,6,1,1.318678,0.507533,8.413423,9.549574,0.01,2000,0
6,7,1,1.398434,0.463,6.7057,8.063636,0.01,2000,1
7,8,1,1.380551,0.478533,4.442457,5.815802,0.01,2000,2
8,9,1,1.30549,0.494817,4.645245,6.101749,0.01,2000,4
9,10,1,1.269757,0.52385,4.509929,6.315699,0.01,2000,8
