# Tutorial

1. Data versioning
1. Experiment tracking
1. Hyperparameter tuning

In [1]:
import os
import wandb

In [2]:
PROJECT_NAME = "soict-2022"

## 1. Data versioning

### 1.1. Log a dataset

In [3]:
import random 

import torch
import torchvision
from torch.utils.data import TensorDataset
from tqdm.notebook import tqdm

# Ensure deterministic behavior
torch.backends.cudnn.deterministic = True
random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)

# Device configuration
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Data parameters
num_classes = 10
input_shape = (1, 28, 28)
n_train_valid = 1000
n_test = 200

# drop slow mirror from list of MNIST mirrors
torchvision.datasets.MNIST.mirrors = [mirror for mirror in torchvision.datasets.MNIST.mirrors
                                        if not mirror.startswith("http://yann.lecun.com")]

def load():
    # split between train and test sets
    train = torchvision.datasets.MNIST("./", train=True, download=True)
    test = torchvision.datasets.MNIST("./", train=False, download=True)
    (x_train, y_train), (x_test, y_test) = (train.data, train.targets), (test.data, test.targets)
    x_train = x_train[:n_train_valid]
    y_train = y_train[:n_train_valid]
    x_test = x_test[:n_test]
    y_test = y_test[:n_test]

    # split off a validation set for hyperparameter tuning
    train_size = int(n_train_valid * 0.75)
    x_train, x_val = x_train[:train_size], x_train[train_size:]
    y_train, y_val = y_train[:train_size], y_train[train_size:]

    training_set = TensorDataset(x_train, y_train)
    validation_set = TensorDataset(x_val, y_val)
    test_set = TensorDataset(x_test, y_test)
    datasets = [training_set, validation_set, test_set]
    return datasets

def load_and_log():
    # start a run, with a type to label it and a project name
    with wandb.init(project=PROJECT_NAME, job_type="load-data") as run:
        datasets = load()  # separate code for loading the datasets
        names = ["training", "validation", "test"]

        # create our Artifact
        raw_data = wandb.Artifact(
            "mnist-raw", type="dataset",
            description="Raw MNIST dataset, split into train/val/test",
            metadata={"source": "torchvision.datasets.MNIST",
                        "sizes": [len(dataset) for dataset in datasets]})

        for name, data in zip(names, datasets):
            # Store a new file in the artifact, and write data
            with raw_data.new_file(name + ".pt", mode="wb") as file:
                x, y = data.tensors
                torch.save((x, y), file)

        # Save the artifact to W&B.
        run.log_artifact(raw_data)

load_and_log()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdemo[0m. Use [1m`wandb login --relogin`[0m to force relogin


### 1.2. Preprocess a logged dataset artifact

In [4]:
def preprocess(dataset, normalize=True, expand_dims=True):
    x, y = dataset.tensors
    if normalize:
        # Scale images to the [0, 1] range
        x = x.type(torch.float32) / 255
    if expand_dims:
        # Make sure images have shape (1, 28, 28)
        x = torch.unsqueeze(x, 1)
    return TensorDataset(x, y)

def preprocess_and_log(steps):
    with wandb.init(project=PROJECT_NAME, job_type="preprocess-data") as run:
        processed_data = wandb.Artifact(
            "mnist-preprocess", type="dataset",
            description="Preprocessed MNIST dataset",
            metadata=steps)

        # declare which artifact we'll be using
        raw_data_artifact = run.use_artifact('mnist-raw:latest')

        # if need be, download the artifact
        raw_dataset = raw_data_artifact.download()
        
        for split in ["training", "validation", "test"]:
            raw_split = read(raw_dataset, split)
            processed_dataset = preprocess(raw_split, **steps)

            with processed_data.new_file(split + ".pt", mode="wb") as file:
                x, y = processed_dataset.tensors
                torch.save((x, y), file)

        run.log_artifact(processed_data)


def read(data_dir, split):
    filename = split + ".pt"
    x, y = torch.load(os.path.join(data_dir, filename))
    return TensorDataset(x, y)

steps = {"normalize": True, "expand_dims": True}

preprocess_and_log(steps)

[34m[1mwandb[0m: Downloading large artifact mnist-raw:latest, 98.19MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:0.1


VBox(children=(Label(value='4.721 MB of 4.721 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

## 2. Experiment tracking

### 2.1. Initialize a model

In [5]:
from math import floor

import torch.nn as nn

class ConvNet(nn.Module):
      def __init__(self, hidden_layer_sizes=[32, 64],
            kernel_sizes=[3],
            activation="ReLU",
            pool_sizes=[2],
            dropout=0.5,
            num_classes=num_classes,
            input_shape=input_shape):
            super(ConvNet, self).__init__()

            self.layer1 = nn.Sequential(
                  nn.Conv2d(in_channels=input_shape[0], out_channels=hidden_layer_sizes[0], kernel_size=kernel_sizes[0]),
                  getattr(nn, activation)(),
                  nn.MaxPool2d(kernel_size=pool_sizes[0])
            )
            self.layer2 = nn.Sequential(
                  nn.Conv2d(in_channels=hidden_layer_sizes[0], out_channels=hidden_layer_sizes[-1], kernel_size=kernel_sizes[-1]),
                  getattr(nn, activation)(),
                  nn.MaxPool2d(kernel_size=pool_sizes[-1])
            )
            self.layer3 = nn.Sequential(
                  nn.Flatten(),
                  nn.Dropout(dropout)
            )

            fc_input_dims = floor((input_shape[1] - kernel_sizes[0] + 1) / pool_sizes[0]) # layer 1 output size
            fc_input_dims = floor((fc_input_dims - kernel_sizes[-1] + 1) / pool_sizes[-1]) # layer 2 output size
            fc_input_dims = fc_input_dims*fc_input_dims*hidden_layer_sizes[-1] # layer 3 output size

            self.fc = nn.Linear(fc_input_dims, num_classes)

      def forward(self, x):
            x = self.layer1(x)
            x = self.layer2(x)
            x = self.layer3(x)
            x = self.fc(x)
            return x

def build_model_and_log(config):
      with wandb.init(project=PROJECT_NAME, job_type="initialize", config=config) as run:
            config = wandb.config
            model = ConvNet(**config)
            model_artifact = wandb.Artifact(
                  "convnet", type="model",
                  description="Simple AlexNet style CNN",
                  metadata=dict(config))

            with model_artifact.new_file("initialized_model.pth", mode="wb") as file:
                  torch.save(model.state_dict(), file)

            run.log_artifact(model_artifact)

model_config = {"hidden_layer_sizes": [32, 64],
                  "kernel_sizes": [3],
                  "activation": "ReLU",
                  "pool_sizes": [2],
                  "dropout": 0.5,
                  "num_classes": 10}
build_model_and_log(model_config)

### 2.2. Log an experiment

In [6]:
import torch.nn.functional as F

def train(model, train_loader, valid_loader, config):
    optimizer = getattr(torch.optim, config.optimizer)(model.parameters())
    model.train()
    for epoch in range(config.epochs):
        train_epoch(model, train_loader, valid_loader, config.batch_log_interval, optimizer, epoch)

def train_epoch(model, train_loader, valid_loader, batch_log_interval, optimizer, epoch):
    example_ct = epoch * len(train_loader)
    cumu_loss = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.cross_entropy(output, target)
        cumu_loss += float(loss)
        loss.backward()
        optimizer.step()

        example_ct += len(data)
        if batch_idx % batch_log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0%})]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                batch_idx / len(train_loader), loss.item()))
            train_log(loss, example_ct, epoch)

    if not valid_loader is None:
        # evaluate the model on the validation set at each epoch
        loss, accuracy = test(model, valid_loader)
        test_log(loss, accuracy, example_ct, epoch)

    return cumu_loss / len(train_loader)

def test(model, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.cross_entropy(output, target, reduction='sum')  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum()

    test_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / len(test_loader.dataset)
    return test_loss, accuracy

def train_log(loss, example_ct, epoch):
    loss = float(loss)
    # where the magic happens
    wandb.log({"epoch": epoch, "train/loss": loss}, step=example_ct)
    print(f"Loss after " + str(example_ct).zfill(5) + f" examples: {loss:.3f}")
    
def test_log(loss, accuracy, example_ct, epoch):
    loss = float(loss)
    accuracy = float(accuracy)
    # where the magic happens
    wandb.log({"epoch": epoch, "validation/loss": loss, "validation/accuracy": accuracy}, step=example_ct)
    print(f"Loss/accuracy after " + str(example_ct).zfill(5) + f" examples: {loss:.3f}/{accuracy:.3f}")

def evaluate(model, test_loader):
    loss, accuracy = test(model, test_loader)
    highest_losses, hardest_examples, true_labels, predictions = get_hardest_k_examples(model, test_loader.dataset)
    return loss, accuracy, highest_losses, hardest_examples, true_labels, predictions

def get_hardest_k_examples(model, testing_set, k=32):
    model.eval()
    loader = DataLoader(testing_set, 1, shuffle=False)
    # get the losses and predictions for each item in the dataset
    losses = None
    predictions = None
    with torch.no_grad():
        for data, target in loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss = F.cross_entropy(output, target)
            pred = output.argmax(dim=1, keepdim=True)
            
            if losses is None:
                losses = loss.view((1, 1))
                predictions = pred
            else:
                losses = torch.cat((losses, loss.view((1, 1))), 0)
                predictions = torch.cat((predictions, pred), 0)

    argsort_loss = torch.argsort(losses, dim=0)
    highest_k_losses = losses[argsort_loss[-k:]]
    hardest_k_examples = testing_set[argsort_loss[-k:]][0]
    true_labels = testing_set[argsort_loss[-k:]][1]
    predicted_labels = predictions[argsort_loss[-k:]]
    return highest_k_losses, hardest_k_examples, true_labels, predicted_labels

from torch.utils.data import DataLoader

def train_and_log(config):
    with wandb.init(project=PROJECT_NAME, job_type="train", config=config) as run:
        config = wandb.config
        data = run.use_artifact('mnist-preprocess:latest')
        data_dir = data.download()

        training_dataset = read(data_dir, "training")
        validation_dataset = read(data_dir, "validation")
        train_loader = DataLoader(training_dataset, batch_size=config.batch_size)
        validation_loader = DataLoader(validation_dataset, batch_size=config.batch_size)
        
        model_artifact = run.use_artifact("convnet:latest")
        model_dir = model_artifact.download()
        model_path = os.path.join(model_dir, "initialized_model.pth")
        model_config = model_artifact.metadata
        config.update(model_config)

        model = ConvNet(**model_config)
        model.load_state_dict(torch.load(model_path))
        model = model.to(device)

        train(model, train_loader, validation_loader, config)
        model_artifact = wandb.Artifact(
            "trained-model", type="model",
            description="Trained NN model",
            metadata=dict(model_config))

        with model_artifact.new_file("trained_model.pth", mode="wb") as file:
            torch.save(model.state_dict(), file)

        run.log_artifact(model_artifact)

    return model

    
def evaluate_and_log(config=None):
    with wandb.init(project=PROJECT_NAME, job_type="report", config=config) as run:
        data = run.use_artifact('mnist-preprocess:latest')
        data_dir = data.download()
        testing_set = read(data_dir, "test")
        test_loader = torch.utils.data.DataLoader(testing_set, batch_size=128, shuffle=False)

        model_artifact = run.use_artifact("trained-model:latest")
        model_dir = model_artifact.download()
        model_path = os.path.join(model_dir, "trained_model.pth")
        model_config = model_artifact.metadata

        model = ConvNet(**model_config)
        model.load_state_dict(torch.load(model_path))
        model.to(device)

        loss, accuracy, highest_losses, hardest_examples, true_labels, preds = evaluate(model, test_loader)
        run.summary.update({"loss": loss, "accuracy": accuracy})

        wandb.log({"high-loss-examples":
            [wandb.Image(hard_example, caption=str(int(pred)) + "," +  str(int(label)))
                for hard_example, pred, label in zip(hardest_examples, preds, true_labels)]})

train_config = {"batch_size": 128,
                "epochs": 3,
                "batch_log_interval": 25,
                "optimizer": "Adam"}

model = train_and_log(train_config)
evaluate_and_log()

[34m[1mwandb[0m:   3 of 3 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  


Loss after 00128 examples: 2.331
Loss/accuracy after 00750 examples: 2.149/59.600
Loss after 00134 examples: 2.078
Loss/accuracy after 00756 examples: 1.804/65.600
Loss after 00140 examples: 1.677
Loss/accuracy after 00762 examples: 1.285/74.800


VBox(children=(Label(value='0.272 MB of 0.272 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▅█
train/loss,▁
validation/accuracy,▁▄█
validation/loss,█▅▁

0,1
epoch,2.0
train/loss,2.33091
validation/accuracy,74.8
validation/loss,1.28515


[34m[1mwandb[0m:   3 of 3 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  


0,1
accuracy,77.0
loss,1.3132


## 3. Hyperparameter tuning

### 3.1. Define Sweep config

In [7]:
sweep_config = {
    'method': 'random',
    # For bayesian Sweeps only: need to know what to minimize
    'metric': {
        'name': 'loss',
        'goal': 'minimize',
    },
    'parameters': {
        # epochs var doesnt vary, but we still want it here
        'epochs': {
            'value': 1,
        },
        'optimizer': {
            'values': ['adam', 'sgd'],
        },
        'hidden_layer_1_size': {
            'values': [16, 32],
        },
        'hidden_layer_2_size': {
            'values': [32, 64],
        },
        'dropout': {
            'values': [0.4, 0.5],
        },
        'learning_rate': {
            # a flat distribution between 0 and 0.1
            'distribution': 'uniform',
            'min': 0,
            'max': 0.1
        },
        'batch_size': {
            # integers between 32 and 256
            # with evenly-distributed logarithms 
            'distribution': 'q_log_uniform_values',
            'q': 8,
            'min': 32,
            'max': 256,
        }
    },
}
sweep_config

{'method': 'random',
 'metric': {'name': 'loss', 'goal': 'minimize'},
 'parameters': {'epochs': {'value': 1},
  'optimizer': {'values': ['adam', 'sgd']},
  'hidden_layer_1_size': {'values': [16, 32]},
  'hidden_layer_2_size': {'values': [32, 64]},
  'dropout': {'values': [0.4, 0.5]},
  'learning_rate': {'distribution': 'uniform', 'min': 0, 'max': 0.1},
  'batch_size': {'distribution': 'q_log_uniform_values',
   'q': 8,
   'min': 32,
   'max': 256}}}

### 3.2. Run the Sweep

In [8]:
sweep_id = wandb.sweep(sweep_config, project=PROJECT_NAME)

Create sweep with ID: 09s1x4pa
Sweep URL: http://localhost:8080/demo/soict-2022/sweeps/09s1x4pa


In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_log_interval = 25

def train_sweep(config=None):
    with wandb.init(config=config) as run:
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config

        loader = build_dataset(run, config)
        model = build_model(run, config)
        optimizer = build_optimizer(model, config)

        for epoch in range(config.epochs):
            avg_loss = train_epoch(model, loader, None, batch_log_interval, optimizer, epoch)
            wandb.log({"loss": avg_loss, "epoch": epoch})

def build_dataset(run, config):
    batch_size = config.batch_size
    data = run.use_artifact('mnist-preprocess:latest')
    data_dir = data.download()
    training_dataset = read(data_dir, "training")
    sub_dataset = torch.utils.data.Subset(
        training_dataset, indices=range(0, len(training_dataset), 5))
    train_loader = DataLoader(sub_dataset, batch_size=batch_size)
    return train_loader

def build_model(run, config):
    model_config = {
        'hidden_layer_sizes': [
            config.hidden_layer_1_size,
            config.hidden_layer_2_size,
        ],
        'dropout': config.dropout,
    }
    model = ConvNet(**model_config)
    model = model.to(device)
    return model
        
def build_optimizer(model, config):
    optimizer = config.optimizer
    learning_rate = config.learning_rate
    if optimizer == "sgd":
        optimizer = torch.optim.SGD(model.parameters(),
                                lr=learning_rate, momentum=0.9)
    elif optimizer == "adam":
        optimizer = torch.optim.Adam(model.parameters(),
                                lr=learning_rate)
    return optimizer

In [10]:
wandb.agent(sweep_id, train_sweep, count=5)

[34m[1mwandb[0m: Agent Starting Run: 5r5l7ys2 with config:
[34m[1mwandb[0m: 	batch_size: 144
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	hidden_layer_1_size: 32
[34m[1mwandb[0m: 	hidden_layer_2_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.015102956640100463
[34m[1mwandb[0m: 	optimizer: adam
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m:   3 of 3 files downloaded.  


Loss after 00144 examples: 2.307


VBox(children=(Label(value='0.137 MB of 0.137 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁
loss,▁
train/loss,▁

0,1
epoch,0.0
loss,2.92674
train/loss,2.30697


[34m[1mwandb[0m: Agent Starting Run: 8s7gjy7q with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	hidden_layer_1_size: 16
[34m[1mwandb[0m: 	hidden_layer_2_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.007609440830379311
[34m[1mwandb[0m: 	optimizer: sgd
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m:   3 of 3 files downloaded.  


Loss after 00064 examples: 2.296


VBox(children=(Label(value='0.137 MB of 0.137 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁
loss,▁
train/loss,▁

0,1
epoch,0.0
loss,2.30869
train/loss,2.29599


[34m[1mwandb[0m: Agent Starting Run: ted3giw8 with config:
[34m[1mwandb[0m: 	batch_size: 48
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	hidden_layer_1_size: 16
[34m[1mwandb[0m: 	hidden_layer_2_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.051657374679097805
[34m[1mwandb[0m: 	optimizer: adam
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m:   3 of 3 files downloaded.  


Loss after 00048 examples: 2.298


VBox(children=(Label(value='0.137 MB of 0.137 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁
loss,▁
train/loss,▁

0,1
epoch,0.0
loss,4.37686
train/loss,2.29794


[34m[1mwandb[0m: Agent Starting Run: aumhgvcv with config:
[34m[1mwandb[0m: 	batch_size: 40
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	hidden_layer_1_size: 16
[34m[1mwandb[0m: 	hidden_layer_2_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.06045335483128189
[34m[1mwandb[0m: 	optimizer: sgd
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m:   3 of 3 files downloaded.  


Loss after 00040 examples: 2.300


VBox(children=(Label(value='0.123 MB of 0.123 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁
loss,▁
train/loss,▁

0,1
epoch,0.0
loss,2.28561
train/loss,2.29983


[34m[1mwandb[0m: Agent Starting Run: i7qbqsmd with config:
[34m[1mwandb[0m: 	batch_size: 152
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	hidden_layer_1_size: 32
[34m[1mwandb[0m: 	hidden_layer_2_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0011304934838524017
[34m[1mwandb[0m: 	optimizer: adam
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m:   3 of 3 files downloaded.  


Loss after 00150 examples: 2.306


VBox(children=(Label(value='0.123 MB of 0.123 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁
loss,▁
train/loss,▁

0,1
epoch,0.0
loss,2.30643
train/loss,2.30643


In [11]:
# For self-hosted Wandb server
!wandb sweep --stop "demo/$PROJECT_NAME/$sweep_id"

# For Wandb cloud server
# Stop sweep at https://wandb.ai/<wandb-user>/soict-2022/sweeps/<sweep-id>/controls

[34m[1mwandb[0m: Stopping sweep demo/soict-2022/09s1x4pa.
[34m[1mwandb[0m: Done.
