In [1]:
import numpy as np 
from tqdm import tqdm as notebook_tqdm

import torch
import torchvision
from torch import nn, optim
from torch.nn import init
from torchvision import datasets, transforms
from accelerate import Accelerator

import os
from accelerate.utils import write_basic_config
from accelerate import notebook_launcher

# write_basic_config()  # Write a config file
# os._exit(00)  # Restart the notebook

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
random_seed = 1
torch.backends.cudnn.enabled = False
torch.manual_seed(random_seed)

<torch._C.Generator at 0x7fb867fdf4d0>

In [3]:
def get_loaders(batch_size):
    transformer = transforms.Compose([torchvision.transforms.ToTensor()])
    train_loader = torch.utils.data.DataLoader(datasets.MNIST('./files/', train=True, download=True, transform=transformer),
                                               batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(datasets.MNIST('./files/', train=False, download=True, transform=transformer),
                                                  batch_size=batch_size, shuffle=True)
    
    return train_loader, test_loader

def save_model(mstate, optimizer, epoch, name):
    torch.save({'model_state_dict': mstate,
                'optimizer': optimizer.state_dict(),
                'epoch': epoch},
                f'model_{name}.pt')

In [4]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 10, kernel_size=5, stride=2),
            nn.ReLU()
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(10, 20, kernel_size=5, stride=2),
            nn.ReLU(),
            nn.Dropout2d(),
            nn.Flatten()
        )
        self.fc1 = nn.Sequential(
            nn.Linear(320, 50),
            nn.ReLU(),
            nn.Dropout()
        )
        self.fc2 = nn.Linear(50, 10)
        
        self.apply(self._init_weights)
        
    def forward(self, x, f):
        if f: print(f'\nIn Model: input size {x.size()}')
            
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.fc1(x)
        x = self.fc2(x)

        return x
    
    def _init_weights(self, m):
        if isinstance(m, (torch.nn.Conv2d, torch.nn.Linear)):
            init.kaiming_normal_(m.weight.data, a=0, mode='fan_in', nonlinearity='relu')
            if hasattr(m, 'bias') and m.bias is not None:
                init.constant_(m.bias.data, 0.0)

In [5]:
def training_HFA(mixed_precision="fp16", batch_size: int = 64, n_epochs=1):
    accelerator = Accelerator(mixed_precision=mixed_precision, split_batches=True)
    
    model = Net()
    optimizer = optim.Adam(model.parameters(), lr=1e-2)
    loss_fn = torch.nn.CrossEntropyLoss()
    
    train_loader, test_loader = get_loaders(batch_size)
    
    model, optimizer, train_loader, test_loader = accelerator.prepare(model, optimizer, train_loader, test_loader)
    
    highest_acc = 0
    for epoch in range(1, n_epochs + 1):
        train_losses = []
        model.train()
        train_bar = notebook_tqdm(train_loader)
        for batch_idx, (data, target) in enumerate(train_bar):
            optimizer.zero_grad()

            f = True if not train_losses and epoch==1 else False    
            output = model(data.to(accelerator.device), f)
            loss = loss_fn(output, target.to(accelerator.device))
            accelerator.backward(loss)
            optimizer.step()
            
            train_losses.append(loss.item())
            train_bar.set_description(f'Train Epoch: {epoch} Loss: {np.mean(train_losses):.6f}')
        
        model.eval()
        acc = []
        test_bar = notebook_tqdm(test_loader)
        for data, target in test_bar:
            with torch.no_grad():
                output = model(data.to(accelerator.device), False)
            
            pred = torch.argmax(output.softmax(1), dim=1)
            acc.extend((accelerator.gather(pred) == accelerator.gather(target)).cpu()) #.data.cpu().numpy()
            test_bar.set_description(f'Test set: Accuracy: {100. * np.mean(acc):.0f}%')
        
        if np.mean(acc) > highest_acc:
            highest_acc = np.mean(acc)
            save_model(model.state_dict(), optimizer, epoch, 'HFA')

            
notebook_launcher(training_HFA, ("fp16", 250, 5), num_processes=2)


Launching training on 2 GPUs.


  0%|                                                   | 0/240 [00:00<?, ?it/s]


In Model: input size torch.Size([125, 1, 28, 28])
In Model: input size torch.Size([125, 1, 28, 28])



Train Epoch: 1 Loss: 0.733179: 100%|██████████| 240/240 [00:08<00:00, 27.66it/s]

Test set: Accuracy: 96%: 100%|██████████████████| 40/40 [00:01<00:00, 26.78it/s]

Train Epoch: 2 Loss: 0.361338: 100%|██████████| 240/240 [00:08<00:00, 29.98it/s]

Test set: Accuracy: 97%: 100%|██████████████████| 40/40 [00:01<00:00, 30.22it/s]

Train Epoch: 3 Loss: 0.301721:  99%|█████████▉| 238/240 [00:07<00:00, 31.69it/s]
Train Epoch: 3 Loss: 0.301721: 100%|██████████| 240/240 [00:07<00:00, 30.54it/s]
Test set: Accuracy: 97%: 100%|██████████████████| 40/40 [00:01<00:00, 27.88it/s]
Test set: Accuracy: 97%: 100%|██████████████████| 40/40 [00:01<00:00, 27.52it/s]
Train Epoch: 4 Loss: 0.272888: 100%|██████████| 240/240 [00:07<00:00, 29.04it/s]
Train Epoch: 4 Loss: 0.272888: 100%|██████████| 240/240 [00:07<00:00, 30.00it/s]
Test set: Accuracy: 98%: 100%|██████████████████| 40/40 [00:01<00:00, 27.11it/s]

Train Epoch: 5 Loss: 0.269545: 100%|██████████| 240/240 [00:08<00:00, 29.92it/s]
Train Epoch: 5 Loss: 0.