# Using wandb to track experiments.

Demo task: multi-class image classification using CIFAR10 dataset.

In [1]:
from sklearn.metrics import average_precision_score
from torch.utils.data import DataLoader
from torchvision import datasets, models
from torchvision import transforms as T
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# The next cell includes-
- Collecting the CIFAR10 dataset and defining data loaders.
- Methods to load model, criterion, optimizer and schedulers.
- Definition of AverageMeter

In [2]:
# Downloading CIFAR10 dataset.
# Inputs become tensors normalized per channel (these are the widely used ImageNet mean/std values).
inp_transforms = T.Compose([T.ToTensor(),
                            T.Normalize(mean=[0.485, 0.456, 0.406],
                                        std=[0.229, 0.224, 0.225])])
# Integer class label -> one-hot vector of length 10 (dtype long). The one-hot form is what
# average_precision_score consumes later; the training loops recover the class index with argmax.
tgt_transforms = T.Lambda(lambda y: torch.zeros(10, dtype=torch.long).scatter_(0, torch.tensor(y), value=1))
# Download into a directory relative to the notebook instead of the filesystem root ("/."),
# which is only writable when the kernel runs as root.
cifar10 = datasets.CIFAR10(root = "./data",
                           transform = inp_transforms,
                           target_transform = tgt_transforms,
                           download = True)

# Defining dataset split (80-20).
# The validation size is the remainder so the two lengths always sum to len(cifar10),
# and the generator is seeded so every run (and every sweep agent) sees the same split.
num_train_samples = int(len(cifar10) * 0.80)
num_val_samples = len(cifar10) - num_train_samples
train_dataset, val_dataset = torch.utils.data.random_split(cifar10,
                                                           [num_train_samples, num_val_samples],
                                                           generator=torch.Generator().manual_seed(42))

# Defining the dataloaders (only the training set is shuffled)
train_dataloader = DataLoader(train_dataset,
                              batch_size=200,
                              shuffle=True)
val_dataloader = DataLoader(val_dataset,
                            batch_size=200,
                            shuffle=False)


# Method to get model based on config param model_type
def get_model(model_type):
    """Build a ResNet18 with 10 outputs and move it to the module-level `device`.

    Args:
        model_type: "pretrained" loads torchvision's pretrained ResNet18 and
            replaces its final fc layer with a new 10-output linear layer;
            "scratch" builds an untrained ResNet18 with 10 output classes.

    Returns:
        The model, already moved to `device`.

    Raises:
        NotImplementedError: if `model_type` is neither "pretrained" nor "scratch".
    """
    if model_type == "pretrained":
        # Pretrained ResNet18 whose final fc layer is swapped for a 10-class head.
        model = models.resnet18(pretrained=True)
        model.fc = nn.Linear(model.fc.in_features, 10)
    elif model_type == "scratch":
        # Randomly initialised ResNet18 that produces 10 outputs.
        model = models.resnet18(num_classes=10)
    else:
        # `NotImplemented` is a comparison sentinel, not an exception; raising it
        # produces a confusing TypeError. Raise the real exception class instead.
        raise NotImplementedError(
            f"Unsupported model_type {model_type!r}; expected 'pretrained' or 'scratch'."
        )
    return model.to(device)


# Method to get criterion, optimizer and scheduler based on config params.
def get_criterion_optimizer_scheduler(config, model):
    """Create the loss, optimizer and LR scheduler used for training.

    Args:
        config: mapping that provides "optimizer" (one of "adam", "SGD",
            "RMSprop"), "lr", "scheduler_patience" and "scheduler_thresh".
        model: module whose parameters the optimizer will update.

    Returns:
        Tuple `(criterion, optimizer, scheduler)`: a cross-entropy loss, the
        selected optimizer, and a ReduceLROnPlateau scheduler (factor 0.1)
        wrapped around that optimizer.

    Raises:
        KeyError: if a required config key is missing or the optimizer name
            is not one of the supported ones.
    """
    supported_optimizers = dict(adam=optim.Adam, SGD=optim.SGD, RMSprop=optim.RMSprop)
    optimizer_cls = supported_optimizers[config["optimizer"]]
    optimizer = optimizer_cls(model.parameters(), lr=config["lr"])

    # Cuts the learning rate by 10x once the monitored value stops improving.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        factor=0.1,
        patience=config["scheduler_patience"],
        threshold=config["scheduler_thresh"],
    )

    return nn.CrossEntropyLoss(), optimizer, scheduler



# Remainder of this cell includes definition of AverageMeter (can be ignored)
"""
Code taken from Pytorch ImageNet examples
https://github.com/pytorch/examples/blob/main/imagenet/main.py#L375
"""
class Summary():
    # Integer codes selecting which statistic AverageMeter.summary() reports.
    NONE = 0
    AVERAGE = 1
    SUM = 2
    COUNT = 3

class AverageMeter(object):
    """Computes and stores the average and current value.

    Attributes set by `reset`/`update`:
        val: most recent value passed to `update`.
        sum: running weighted sum of values (each weighted by its `n`).
        count: running total of the weights `n`.
        avg: `sum / count` (0 while `count` is 0).
        val_history: every value passed to `update` since the last reset.
    """
    def __init__(self, name, fmt=':f', summary_type=Summary.AVERAGE):
        """Create a meter.

        Args:
            name: label used in `__str__` and `summary`.
            fmt: format spec (e.g. ':.5f') applied to `val` and `avg` in `__str__`.
            summary_type: one of the `Summary` codes; selects what `summary` prints.
        """
        self.name = name
        self.fmt = fmt
        self.summary_type = summary_type
        self.reset()  # also creates val_history

    def reset(self):
        """Clear all running statistics and the value history."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0
        self.val_history = list()

    def update(self, val, n=1):
        """Record `val`, weighted by `n` (e.g. the batch size it was averaged over)."""
        self.val = val
        self.sum += val * n
        self.count += n
        # Guard against a zero total weight (e.g. an empty batch on the first
        # update) instead of raising ZeroDivisionError.
        self.avg = self.sum / self.count if self.count else 0
        self.val_history.append(val)

    def __str__(self):
        """Return '<name> <val> (<avg>)' with both numbers formatted using `fmt`."""
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)
    
    def summary(self):
        """Return a one-line summary chosen by `summary_type`.

        Raises:
            ValueError: if `summary_type` is not one of the `Summary` codes.
        """
        # Compare by value: `is` on integers only works by accident of
        # CPython's small-integer caching.
        if self.summary_type == Summary.NONE:
            fmtstr = ''
        elif self.summary_type == Summary.AVERAGE:
            fmtstr = '{name} {avg:.3f}'
        elif self.summary_type == Summary.SUM:
            fmtstr = '{name} {sum:.3f}'
        elif self.summary_type == Summary.COUNT:
            fmtstr = '{name} {count:.3f}'
        else:
            raise ValueError('invalid summary type %r' % self.summary_type)        
        return fmtstr.format(**self.__dict__)


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /./cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:01<00:00, 86761595.88it/s]


Extracting /./cifar-10-python.tar.gz to /.


# Following cell includes-
- Defining the train and eval loops.
- Method to trigger training loops based on config parameters.

In [3]:
# The train function without wandb logging

def train(model, criterion, optimizer, scheduler, epochs, train_dataloader, val_dataloader, device):
    """Train `model` for `epochs` epochs, evaluating on the validation set after each one.

    Args:
        model: network to optimise; called as `model(data)` and expected to return logits.
        criterion: loss called as `criterion(logits, class_indices)`.
        optimizer: optimizer stepping the model parameters.
        scheduler: scheduler stepped once per epoch with the average validation loss.
        epochs: number of passes over `train_dataloader`.
        train_dataloader: yields `(data, tgt_vec)` batches; the class index is
            recovered from `tgt_vec` with argmax over dim 1.
        val_dataloader: loader handed to `evaluate`.
        device: device the batches are moved to.

    Returns:
        A list with one dict per epoch holding "epoch", "train_loss",
        "eval_loss", "train_ap_score", "eval_ap_score" and "lr" (the same
        dicts that are printed).
    """
    history = []
    for epoch in range(epochs):
        model.train()
        loss_meter = AverageMeter("train_loss", ":.5f")
        epoch_probs, epoch_tgt = list(), list()
        for data, tgt_vec in tqdm(train_dataloader):
            data, tgt_vec = data.to(device), tgt_vec.to(device)
            targets = torch.argmax(tgt_vec, dim=1)
            optimizer.zero_grad()
            out = model(data)
            loss = criterion(out, targets)
            loss_meter.update(loss.item(), data.shape[0])
            loss.backward()
            optimizer.step()
            # Store detached CPU copies only. Keeping `out` itself would hold every
            # batch's autograd graph (and its activations) on the device until the
            # end of the epoch.
            epoch_probs.append(torch.softmax(out.detach(), dim=1).cpu())
            epoch_tgt.append(tgt_vec.cpu())
        predictions = torch.cat(epoch_probs, dim=0).numpy()
        targets = torch.cat(epoch_tgt, dim=0).numpy()
        ap_score = average_precision_score(targets, predictions)
        eval_loss_meter, eval_ap_score = evaluate(model, criterion, val_dataloader, device)
        data_to_log = {
            "epoch": epoch+1,
            "train_loss": loss_meter.avg,
            "eval_loss": eval_loss_meter.avg,
            "train_ap_score": ap_score,
            "eval_ap_score": eval_ap_score,
            # Read before scheduler.step so the logged value is the LR used this epoch.
            "lr": optimizer.param_groups[0]["lr"],
        }
        scheduler.step(eval_loss_meter.avg)
        print(data_to_log)
        history.append(data_to_log)
    return history
        


@torch.no_grad()
def evaluate(model, criterion, val_dataloader, device):
    """Run one pass over `val_dataloader` without tracking gradients.

    Puts `model` in eval mode, accumulates the sample-weighted loss in an
    AverageMeter named "eval_loss", and scores the softmax outputs against the
    target vectors yielded by the loader with `average_precision_score`.

    Returns:
        Tuple `(loss_meter, ap_score)`.
    """
    model.eval()
    meter = AverageMeter("eval_loss", ":.5f")
    batch_probs = []
    batch_targets = []
    for images, target_vec in val_dataloader:
        images = images.to(device)
        target_vec = target_vec.to(device)
        logits = model(images)
        # The criterion takes class indices, recovered from the target vectors by argmax.
        batch_loss = criterion(logits, torch.argmax(target_vec, dim=1))
        meter.update(batch_loss.item(), images.shape[0])
        batch_probs.append(torch.softmax(logits, dim=1))
        batch_targets.append(target_vec)
    y_score = torch.vstack(batch_probs).detach().cpu().numpy()
    y_true = torch.cat(batch_targets, dim=0).detach().cpu().numpy()
    return meter, average_precision_score(y_true, y_score)


def trigger_training(config):
    """Build the model and training objects from `config`, then run training.

    Args:
        config: mapping with "model_type", "num_epochs" and the keys read by
            `get_criterion_optimizer_scheduler`.

    Returns:
        Whatever `train` returns, so that callers writing
        `logs = trigger_training(config)` receive the training result instead
        of an unconditional None.
    """
    model = get_model(config["model_type"])
    criterion, optimizer, scheduler = get_criterion_optimizer_scheduler(config, model)
    epochs = config["num_epochs"]

    # Uses the module-level dataloaders and device defined in the earlier cells.
    return train(model, criterion, optimizer, scheduler, epochs, train_dataloader, val_dataloader, device)


# Complete the config file, edit the cells in this notebook to log data to wandb and trigger training loops!

In [11]:
# Fill the Config file below and log the experiment at wandb
# Hyper-parameters for a single run; this dict is passed to trigger_training and to wandb.init.
config = {
    "lr": 0.0,  # optimizer learning rate. NOTE(review): 0.0 is a placeholder -- with it the weights never change; fill in a positive value
    "model_type": "scratch", # pretrained/scratch (see get_model)
    "optimizer": "SGD", # adam/SGD/RMSprop (keys of the optimizer table in get_criterion_optimizer_scheduler)
    "criterion": "ce",  # recorded with the run only; get_criterion_optimizer_scheduler does not read this key
    "scheduler_patience": 3,  # `patience` of ReduceLROnPlateau
    "scheduler_thresh": 0.001,  # `threshold` of ReduceLROnPlateau
    "num_epochs": 5, # CHANGE
    "gpu_id": 0,  # NOTE(review): not read by any code visible in this notebook -- confirm whether it is still needed
    "wandb_run_name": "DeafSpy" ### FILL YOUR NAME HERE
}


In [12]:
logs = trigger_training(config)


100%|██████████| 200/200 [00:16<00:00, 12.30it/s]


{'epoch': 1, 'train_loss': 2.6121673250198363, 'eval_loss': 2.6223618030548095, 'train_ap_score': 0.09586990352175055, 'eval_ap_score': 0.09630837712451573, 'lr': 0.0}


100%|██████████| 200/200 [00:15<00:00, 12.99it/s]


{'epoch': 2, 'train_loss': 2.6113621950149537, 'eval_loss': 2.6178168869018554, 'train_ap_score': 0.09597457049806499, 'eval_ap_score': 0.09648182590204175, 'lr': 0.0}


100%|██████████| 200/200 [00:19<00:00, 10.42it/s]


{'epoch': 3, 'train_loss': 2.612925395965576, 'eval_loss': 2.618176474571228, 'train_ap_score': 0.09590416435765645, 'eval_ap_score': 0.09676841501870921, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 12.35it/s]


{'epoch': 4, 'train_loss': 2.611471564769745, 'eval_loss': 2.617351980209351, 'train_ap_score': 0.09598106560597251, 'eval_ap_score': 0.0964839701379921, 'lr': 0.0}


100%|██████████| 200/200 [00:15<00:00, 12.61it/s]


{'epoch': 5, 'train_loss': 2.6119293749332426, 'eval_loss': 2.615071907043457, 'train_ap_score': 0.09584674984179686, 'eval_ap_score': 0.0967355246086297, 'lr': 0.0}


In [20]:
logs

# WandB Steps

In [13]:
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.15.3-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.31-py3-none-any.whl (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.24.0-py2.py3-none-any.whl (206 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m206.5/206.5 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting pathtools (from wandb)
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting

In [14]:
### Step 1: Import WandB in your code

import wandb

### Step 1 ends

In [15]:
### Step 2:
# Initiate wandb in your script. The moment we trigger wandb.init(), an active
# socket connection is established between your machine and wandb server.
# We specify the entity (wandb username) and project (which wandb project to use for logging)

# Start a wandb run; the config dict is attached to the run so the hyper-parameters are stored with it.
wandb.init(entity = "dhruv_sri",   # wandb username. Optional: when omitted, wandb falls back to the account used at login.
           project = "wandb_demo", # wandb project name. A new project is created if the given one does not exist.
           config = config         # Config dict
          )
# Give the run the display name chosen in the config cell.
wandb.run.name = config["wandb_run_name"]

### Step 2 ends.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [22]:

# with wandb
def train(model, criterion, optimizer, scheduler, epochs, train_dataloader, val_dataloader, device):
    """Train `model` for `epochs` epochs and log each epoch's metrics to wandb.

    Same loop as the earlier `train`, plus a `wandb.log` call per epoch; a
    wandb run must therefore be active (see `wandb.init`) when this is called.

    Args:
        model: network to optimise; called as `model(data)` and expected to return logits.
        criterion: loss called as `criterion(logits, class_indices)`.
        optimizer: optimizer stepping the model parameters.
        scheduler: scheduler stepped once per epoch with the average validation loss.
        epochs: number of passes over `train_dataloader`.
        train_dataloader: yields `(data, tgt_vec)` batches; the class index is
            recovered from `tgt_vec` with argmax over dim 1.
        val_dataloader: loader handed to `evaluate`.
        device: device the batches are moved to.

    Returns:
        A list with one dict per epoch holding "epoch", "train_loss",
        "eval_loss", "train_ap_score", "eval_ap_score" and "lr" (the same
        dicts that are printed and sent to wandb).
    """
    history = []
    for epoch in range(epochs):
        model.train()
        loss_meter = AverageMeter("train_loss", ":.5f")
        epoch_probs, epoch_tgt = list(), list()
        for data, tgt_vec in tqdm(train_dataloader):
            data, tgt_vec = data.to(device), tgt_vec.to(device)
            targets = torch.argmax(tgt_vec, dim=1)
            optimizer.zero_grad()
            out = model(data)
            loss = criterion(out, targets)
            loss_meter.update(loss.item(), data.shape[0])
            loss.backward()
            optimizer.step()
            # Store detached CPU copies only. Keeping `out` itself would hold every
            # batch's autograd graph (and its activations) on the device until the
            # end of the epoch.
            epoch_probs.append(torch.softmax(out.detach(), dim=1).cpu())
            epoch_tgt.append(tgt_vec.cpu())
        predictions = torch.cat(epoch_probs, dim=0).numpy()
        targets = torch.cat(epoch_tgt, dim=0).numpy()
        ap_score = average_precision_score(targets, predictions)
        eval_loss_meter, eval_ap_score = evaluate(model, criterion, val_dataloader, device)
        data_to_log = {
            "epoch": epoch+1,
            "train_loss": loss_meter.avg,
            "eval_loss": eval_loss_meter.avg,
            "train_ap_score": ap_score,
            "eval_ap_score": eval_ap_score,
            # Read before scheduler.step so the logged value is the LR used this epoch.
            "lr": optimizer.param_groups[0]["lr"],
        }
        scheduler.step(eval_loss_meter.avg)
        print(data_to_log)
        # Send this epoch's metrics to the active wandb run.
        wandb.log(data_to_log)
        history.append(data_to_log)
    return history


        


In [23]:
### Step 3: Trigger wandb log
# This step is responsible for sending the logs to wandb

# wandb.log(data_to_log)
# wandb.log(logs)
trigger_training(config)

### Step 3 ends.


100%|██████████| 200/200 [00:16<00:00, 12.17it/s]


{'epoch': 1, 'train_loss': 2.5488260221481323, 'eval_loss': 2.5454543161392214, 'train_ap_score': 0.0964605902520718, 'eval_ap_score': 0.09681210689878099, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 12.19it/s]


{'epoch': 2, 'train_loss': 2.548868427276611, 'eval_loss': 2.5438941383361815, 'train_ap_score': 0.09660769741713146, 'eval_ap_score': 0.09692571856864027, 'lr': 0.0}


100%|██████████| 200/200 [00:17<00:00, 11.29it/s]


{'epoch': 3, 'train_loss': 2.5486118924617767, 'eval_loss': 2.546280069351196, 'train_ap_score': 0.09670727821120961, 'eval_ap_score': 0.09677012791898944, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 11.81it/s]


{'epoch': 4, 'train_loss': 2.548195925951004, 'eval_loss': 2.546732659339905, 'train_ap_score': 0.09677725060826162, 'eval_ap_score': 0.09669797484665624, 'lr': 0.0}


100%|██████████| 200/200 [00:17<00:00, 11.58it/s]


{'epoch': 5, 'train_loss': 2.5483759248256685, 'eval_loss': 2.5432587623596192, 'train_ap_score': 0.09679393498379227, 'eval_ap_score': 0.09694530755298585, 'lr': 0.0}


In [24]:
### Step 4 (Optional)
# This closes the active socket connection to wandb server. Optional since wandb destructor does the same.

wandb.finish()

### Step 4 ends.


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▃▅▆█
eval_ap_score,▄▇▃▁█
eval_loss,▅▂▇█▁
lr,▁▁▁▁▁
train_ap_score,▁▄▆██
train_loss,██▅▁▃

0,1
epoch,5.0
eval_ap_score,0.09695
eval_loss,2.54326
lr,0.0
train_ap_score,0.09679
train_loss,2.54838


# WandB sweeps related steps

In [36]:
# ### Step 1:
# # Create a WandB sweep config file.
# # This config file will be used at the WandB website to initialize a sweep server
# Sweep definition handed to wandb.sweep().
# "value" pins a hyper-parameter to a single setting; "values" lists the settings to sweep over.
# With method "grid" every combination is tried: 3 lr x 2 model_type x 3 optimizer = 18 runs.
sweep_configuration = {
  # NOTE(review): presumably only relevant when runs are launched as a script through the
  # `wandb agent` CLI; this notebook passes a Python function to wandb.agent instead -- confirm.
  "program": "demo.py",
  "method": "grid",
  # Metric the sweep ranks runs by; the name must match a key passed to wandb.log in train().
  "metric":
    {
        "name": "eval_ap_score",
        "goal": "maximize"
    },
  "parameters": {
      "criterion": 
        { "value": "ce" },
      "gpu_id":
        { "value": 0 },
      "lr":
        { "values": [0.1, 0.001, 0.0001] },
      "model_type":
        { "values": ["scratch", "pretrained"]},
      "num_epochs":
        { "value": 25 },
      "optimizer":
        { "values": ["adam", "SGD", "RMSprop"] },
      "scheduler_patience":
        {"value": 3},
      "scheduler_thresh":
        { "value": 0.01 }
  }
}

# sweep_configuration = {
#   "program": "demo.py",
#   "method": "grid",
#   "metric":
#     {
#         "name": "eval_ap_score",
#         "goal": "maximize"
#     },
#   "parameters": {
#       "criterion": 
#         { "value": "ce" },
#       "gpu_id":
#         { "value": 0 },
#       "lr":
#         { "values": [0.1, 0.001, 0.0001] },
#       "model_type":
#         { "value": "scratch" },
#       "num_epochs":
#         { "value": 6 },
#       "optimizer":
#         { "value": "SGD" },
#       "scheduler_patience":
#         {"value": 3},
#       "scheduler_thresh":
#         { "value": 0.01 }
#   }
# }
        
        
### A sample sweep config file if bayes method is used-
# program: wandb_demo.py
# method: bayes
# metric:
#   name: "eval_ap_score"
#   goal: maximize
# parameters:
#   lr:
#     distribution: uniform
#     min: 0.00001
#     max: 0.1
#   criterion:
#     distribution: categorical
#     values:
#       - ce
#   optimizer:
#     distribution: categorical
#     values:
#       - adam
#       - SGD
#       - RMSprop
#   model_type:
#     distribution: categorical
#     values:
#       - pretrained
#       - scratch
#   num_epochs:
#     value: 30
#   scheduler_thresh:
#     distribution: uniform
#     min: 0.001
#     max: 0.01
#   scheduler_patience:
#     distribution: int_uniform
#     min: 2
#     max: 10


In [26]:
### Step 3
# Notice that the wandb.agent(...) call in Step 2 (the next cell) takes an argument named "function".
# Wandb agents must trigger a function where they can initiate a socket to wandb and get a config.
# So, we will use the following sweep_agent_manager function here-

def sweep_agent_manager():
    """Entry point executed by the wandb agent for each sweep run.

    Starts a wandb run, copies the run's config into a plain dict, names the
    run "<model_type>_<optimizer>_<lr>" and launches training with that config.
    """
    wandb.init()
    sweep_params = dict(wandb.config)
    wandb.run.name = "{}_{}_{}".format(
        sweep_params["model_type"],
        sweep_params["optimizer"],
        sweep_params["lr"],
    )
    trigger_training(sweep_params)


In [37]:
### Step 2
# After using the above config on wandb website, you will get a sweep id in return.
# E.g. sweep id- dhruv_sri/wandb_demo/hbyp0tl8
#
# Add the following agent line in your code-
# Use the generated sweep id in the below code
# Register the sweep defined by sweep_configuration and keep the id that wandb returns.
sweep_id = wandb.sweep(sweep_configuration)
print(sweep_id)

# Start an agent in this process: it calls sweep_agent_manager once per run.
# `count` caps the number of runs this agent executes.
# NOTE(review): the grid above has 18 combinations, so the cap of 100 is presumably never reached -- confirm.
wandb.agent(sweep_id=sweep_id, function=sweep_agent_manager, count=100)


 10%|█         | 20/200 [00:01<00:14, 12.69it/s]

Create sweep with ID: p8uogj8h
Sweep URL: https://wandb.ai/dhruv_sri/wandb_demo/sweeps/p8uogj8h
p8uogj8h


 30%|███       | 60/200 [00:04<00:10, 13.03it/s]Exception in thread ChkStopThr:
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/wandb_run.py", line 278, in check_stop_status
Exception in thread NetStatThr    self._loop_check_status(
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/wandb_run.py", line 216, in _loop_check_status
:
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    local_handle = request()
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/interface/interface.py", line 787, in deliver_stop_status
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
        self._target(*self._args, **self._kwargs)return self._deliver_stop_status(status)
 

 60%|██████    | 120/200 [00:10<00:11,  6.87it/s]
 60%|██████    | 121/200 [00:10<00:13,  5.83it/s]
 61%|██████    | 122/200 [00:10<00:13,  5.60it/s]
 62%|██████▏   | 123/200 [00:11<00:14,  5.26it/s]
 62%|██████▏   | 124/200 [00:11<00:14,  5.29it/s]
 62%|██████▎   | 125/200 [00:11<00:14,  5.06it/s]
 63%|██████▎   | 126/200 [00:11<00:15,  4.81it/s]
 64%|██████▎   | 127/200 [00:12<00:15,  4.72it/s]
 64%|██████▍   | 128/200 [00:12<00:16,  4.46it/s]
 64%|██████▍   | 129/200 [00:12<00:16,  4.30it/s]
 65%|██████▌   | 130/200 [00:12<00:20,  3.39it/s]
 66%|██████▌   | 131/200 [00:13<00:18,  3.67it/s]
 66%|██████▌   | 132/200 [00:13<00:17,  3.87it/s]
 66%|██████▋   | 133/200 [00:13<00:16,  4.10it/s]
 67%|██████▋   | 134/200 [00:13<00:15,  4.28it/s]
 68%|██████▊   | 136/200 [00:14<00:14,  4.49it/s]
 68%|██████▊   | 137/200 [00:14<00:14,  4.48it/s]
 69%|██████▉   | 138/200 [00:14<00:14,  4.39it/s]
 70%|██████▉   | 139/200 [00:14<00:14,  4.27it/s]
 70%|███████   | 140/200 [00:15<00:13,  4.31it/s]


{'epoch': 10, 'train_loss': 1.2358965146541596, 'eval_loss': 1.3058311581611632, 'train_ap_score': 0.5801390558450106, 'eval_ap_score': 0.5800861633742634, 'lr': 0.1}


  0%|          | 0/200 [00:00<?, ?it/s]
  0%|          | 1/200 [00:00<00:50,  3.94it/s]
  1%|          | 2/200 [00:00<00:51,  3.87it/s]
  2%|▏         | 3/200 [00:00<00:48,  4.04it/s]
  2%|▏         | 4/200 [00:00<00:45,  4.30it/s]
  2%|▎         | 5/200 [00:01<00:43,  4.51it/s]
  3%|▎         | 6/200 [00:01<00:43,  4.49it/s]
  4%|▎         | 7/200 [00:01<00:42,  4.55it/s]
  4%|▍         | 8/200 [00:01<00:42,  4.48it/s]
  4%|▍         | 9/200 [00:02<00:42,  4.47it/s]
  6%|▌         | 11/200 [00:02<00:42,  4.46it/s]
  6%|▌         | 12/200 [00:02<00:43,  4.32it/s]
  6%|▋         | 13/200 [00:03<00:44,  4.22it/s]
  7%|▋         | 14/200 [00:03<00:44,  4.17it/s]
  8%|▊         | 15/200 [00:03<00:44,  4.12it/s]
  8%|▊         | 16/200 [00:03<00:43,  4.25it/s]
  8%|▊         | 17/200 [00:03<00:41,  4.38it/s]
  9%|▉         | 18/200 [00:04<00:41,  4.44it/s]
 10%|▉         | 19/200 [00:04<00:40,  4.47it/s]
 10%|█         | 20/200 [00:04<00:39,  4.56it/s]
 10%|█         | 21/200 [00:04<00:39, 

{'epoch': 1, 'train_loss': 2.7956596767902373, 'eval_loss': 2.307637987136841, 'train_ap_score': 0.10002845240328802, 'eval_ap_score': 0.10011031804939374, 'lr': 0.1}



 64%|██████▎   | 127/200 [00:28<00:16,  4.50it/s]
 64%|██████▍   | 128/200 [00:28<00:16,  4.36it/s]
 64%|██████▍   | 129/200 [00:28<00:16,  4.23it/s]
 65%|██████▌   | 130/200 [00:28<00:16,  4.37it/s]
 66%|██████▌   | 131/200 [00:29<00:15,  4.53it/s]
 66%|██████▌   | 132/200 [00:29<00:14,  4.63it/s]
 66%|██████▋   | 133/200 [00:29<00:14,  4.62it/s]
 67%|██████▋   | 134/200 [00:29<00:13,  4.74it/s]
 68%|██████▊   | 135/200 [00:29<00:13,  4.74it/s]
 68%|██████▊   | 136/200 [00:30<00:13,  4.74it/s]
 68%|██████▊   | 137/200 [00:30<00:13,  4.78it/s]
 70%|██████▉   | 139/200 [00:30<00:12,  4.78it/s]
 70%|███████   | 140/200 [00:31<00:12,  4.72it/s]
 70%|███████   | 141/200 [00:31<00:12,  4.66it/s]
 71%|███████   | 142/200 [00:31<00:12,  4.77it/s]
 72%|███████▏  | 143/200 [00:31<00:11,  4.80it/s]
 72%|███████▏  | 144/200 [00:31<00:12,  4.64it/s]
 72%|███████▎  | 145/200 [00:32<00:11,  4.79it/s]
 73%|███████▎  | 146/200 [00:32<00:11,  4.88it/s]
 74%|███████▎  | 147/200 [00:32<00:11,  4.81it/s]

{'epoch': 11, 'train_loss': 1.2820205175876618, 'eval_loss': 1.3758660769462585, 'train_ap_score': 0.5650849779650308, 'eval_ap_score': 0.5632464936517302, 'lr': 0.1}


  0%|          | 0/200 [00:00<?, ?it/s]
  0%|          | 1/200 [00:00<00:52,  3.79it/s]
  1%|          | 2/200 [00:00<00:54,  3.64it/s]
  2%|▏         | 3/200 [00:00<00:51,  3.84it/s]
  2%|▏         | 4/200 [00:01<00:48,  4.05it/s]
  2%|▏         | 4/200 [00:02<02:05,  1.56it/s]


 62%|██████▏   | 123/200 [00:29<00:18,  4.17it/s]
[34m[1mwandb[0m: [32m[41mERROR[0m Run sis1v8c4 errored: RuntimeError('CUDA error: misaligned address\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1.\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n')
[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


Error in callback <function _WandbInit._pause_backend at 0x7f8803fc1e10> (for post_run_cell):


BrokenPipeError: ignored

In [None]:
### Done.
# Now execute your training script on multiple machines.
# Each run will request the config file from wandb and related experiments will be logged.
# 
# NOTE: wandb.log(data_to_log) must be called inside the training code; otherwise the sweep receives no metrics to record or compare.


# ------------------------------ Ends ------------------------------