## Introduction to Weights & Biases

- Integrate quickly, track & version automatically.
- Visualize your data and uncover critical insights.
- Improve performance so you can evaluate and deploy with confidence.

In [1]:
# Install the Weights & Biases client. %pip (rather than !pip) installs into
# the environment of the running kernel, so the later `import wandb` sees it.
%pip install wandb --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.5/188.5 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.6/215.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


In [2]:
# Get Data
# -nc (no-clobber) skips the download when sprites-data.zip is already present,
# so re-running this cell does not pile up sprites-data.zip.1, .2, ... copies.
!wget -nc https://github.com/eulerianKnight/deeplearningai_short_courses/raw/main/evaluating_debugging_genAI/sprites-data.zip

--2023-08-02 21:21:03--  https://github.com/eulerianKnight/deeplearningai_short_courses/raw/main/evaluating_debugging_genAI/sprites-data.zip
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/eulerianKnight/deeplearningai_short_courses/main/evaluating_debugging_genAI/sprites-data.zip [following]
--2023-08-02 21:21:03--  https://raw.githubusercontent.com/eulerianKnight/deeplearningai_short_courses/main/evaluating_debugging_genAI/sprites-data.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20629644 (20M) [application/zip]
Saving to: ‘sprites-data.zip’


2023-08-02 21:21:04 (155 MB/s) - ‘sprites-d

In [3]:
# Extract the archive that wget saved into the current working directory.
# A relative path keeps this working outside Colab's /content, and -o overwrites
# existing files so a re-run does not stop at unzip's interactive replace prompt.
!unzip -o sprites-data.zip

Archive:  /content/sprites-data.zip
  inflating: sprites-data/sprite_labels_nc_1788_16x16.npy  
  inflating: sprites-data/sprites_1788_16x16.npy  


In [4]:
# Get utility file
# -nc (no-clobber) keeps a single utilities.py: without it a re-run saves the
# new copy as utilities.py.1 while `from utilities import ...` still loads the old file.
!wget -nc https://github.com/eulerianKnight/deeplearningai_short_courses/raw/main/evaluating_debugging_genAI/utilities.py

--2023-08-02 21:21:05--  https://github.com/eulerianKnight/deeplearningai_short_courses/raw/main/evaluating_debugging_genAI/utilities.py
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/eulerianKnight/deeplearningai_short_courses/main/evaluating_debugging_genAI/utilities.py [following]
--2023-08-02 21:21:05--  https://raw.githubusercontent.com/eulerianKnight/deeplearningai_short_courses/main/evaluating_debugging_genAI/utilities.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19709 (19K) [text/plain]
Saving to: ‘utilities.py’


2023-08-02 21:21:05 (25.0 MB/s) - ‘utilities.py’ saved [19709/19709

In [5]:
import math
from pathlib import Path
from types import SimpleNamespace
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from utilities import get_dataloaders

import wandb

In [6]:
# Global parameters shared by the cells below.

# --- Runtime / data location ---
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Directory handed to get_dataloaders; relative to the working directory.
DATA_DIR = Path('./sprites-data/')
# Not referenced by any cell visible in this notebook.
NUM_WORKERS = 2

# --- Labels ---
CLASSES = [
    'hero',
    'non-hero',
    'food',
    'spell',
    'side-facing',
]

# --- Network dimensions (consumed by get_model) ---
# Flattened input length; presumably 3 channels x 16 x 16 pixels, matching the
# "16x16" in the data file names -- TODO confirm the channel count.
INPUT_SIZE = 3 * 16 * 16
# Width of the single hidden layer.
HIDDEN_SIZE = 256
# Number of output logits; equals the number of entries in CLASSES.
OUTPUT_SIZE = 5

In [7]:
# Define a Simple MLP Model
def get_model(dropout):
    """Build the one-hidden-layer MLP classifier and place it on DEVICE.

    The network flattens its input, then applies
    Linear(INPUT_SIZE -> HIDDEN_SIZE), BatchNorm1d, ReLU, Dropout and a final
    Linear(HIDDEN_SIZE -> OUTPUT_SIZE) that produces the output logits.

    Args:
        dropout: Value passed straight to ``nn.Dropout``.

    Returns:
        The assembled ``nn.Sequential`` after ``.to(DEVICE)``.
    """
    # Layers are created in forward order; the list is unpacked into Sequential.
    layers = [
        nn.Flatten(),
        nn.Linear(INPUT_SIZE, HIDDEN_SIZE),
        nn.BatchNorm1d(HIDDEN_SIZE),
        nn.ReLU(),
        nn.Dropout(dropout),
        nn.Linear(HIDDEN_SIZE, OUTPUT_SIZE),
    ]
    mlp = nn.Sequential(*layers)
    return mlp.to(DEVICE)

In [8]:
# Hyperparameters for a training run, gathered in one attribute-style object.
# train_model reads every field below and also forwards the object to wandb.init.
config = SimpleNamespace(**{
    'epochs': 5,           # passes over the training dataloader
    'batch_size': 128,     # forwarded to get_dataloaders
    'lr': 1e-5,            # Adam learning rate
    'dropout': 0.5,        # value given to nn.Dropout in get_model
    'slice_size': 10_000,  # forwarded to get_dataloaders (presumably caps the number of examples -- confirm in utilities.py)
    'valid_pct': 0.2,      # forwarded to get_dataloaders (presumably the validation fraction -- confirm in utilities.py)
})

In [9]:
# Train the model
def train_model(config):
    """Train the MLP for ``config.epochs`` epochs and log metrics to W&B.

    Starts a W&B run in the ``dlai_intro`` project, builds the dataloaders and
    a fresh model, then for every epoch runs one pass over the training
    dataloader (logging the loss after each optimizer step) followed by one
    call to ``validate_model`` (logging validation loss and accuracy).

    Args:
        config: Object exposing ``epochs``, ``batch_size``, ``slice_size``,
            ``valid_pct``, ``dropout`` and ``lr`` attributes. It is also passed
            to ``wandb.init`` as the run configuration.

    Returns:
        None. Results are only reported through W&B.
    """
    # Using the run as a context manager guarantees it is finished even when
    # training raises, so a failed cell does not leave a dangling active run
    # that the next train_model call would have to deal with.
    with wandb.init(project='dlai_intro', config=config):
        # Get the data
        train_dl, valid_dl = get_dataloaders(DATA_DIR,
                                             config.batch_size,
                                             config.slice_size,
                                             config.valid_pct)

        # A Simple MLP Model
        model = get_model(config.dropout)

        # Create Loss and Optimizer
        loss_func = nn.CrossEntropyLoss()
        optimizer = Adam(model.parameters(), lr=config.lr)

        # Running count of training examples processed across all epochs.
        example_ct = 0

        for epoch in tqdm(range(config.epochs), total=config.epochs):
            # validate_model leaves the model in eval mode, so training mode
            # has to be re-enabled at the start of every epoch.
            model.train()

            for images, labels in train_dl:
                images, labels = images.to(DEVICE), labels.to(DEVICE)

                outputs = model(images)
                train_loss = loss_func(outputs, labels)
                optimizer.zero_grad()
                train_loss.backward()
                optimizer.step()

                example_ct += len(images)
                metrics = {
                    # Log a plain Python float rather than a tensor that is
                    # still attached to the autograd graph.
                    'train/train_loss': train_loss.item(),
                    'train/epoch': epoch + 1,
                    'train/example_ct': example_ct
                }
                wandb.log(metrics)

            # Compute validation metrics once per epoch.
            val_loss, accuracy = validate_model(model, valid_dl, loss_func)
            val_metrics = {
                # float() accepts both a Python number and a one-element tensor.
                "val/val_loss": float(val_loss),
                "val/val_accuracy": accuracy
            }
            wandb.log(val_metrics)

In [10]:
def validate_model(model, valid_dl, loss_func):
    """Evaluate ``model`` on ``valid_dl`` and return (mean loss, accuracy).

    The model is switched to eval mode and is left in that mode on return.
    Each batch loss is multiplied by the batch size before being summed, so
    the final division by the dataset length gives a per-example average
    (this assumes ``loss_func`` returns the batch mean, as the default
    ``nn.CrossEntropyLoss`` does).

    Args:
        model: Network to evaluate; called as ``model(images)``.
        valid_dl: Iterable of ``(images, labels)`` batches with a ``dataset``
            attribute that supports ``len``.
        loss_func: Callable taking ``(outputs, labels)``.

    Returns:
        Tuple ``(loss, accuracy)``: the summed weighted loss and the number of
        correct predictions, each divided by ``len(valid_dl.dataset)``.
    """
    model.eval()
    n_correct = 0
    val_loss = 0.0

    # No gradients are needed for evaluation.
    with torch.inference_mode():
        for images, labels in valid_dl:
            images = images.to(DEVICE)
            labels = labels.to(DEVICE)

            # Forward pass
            logits = model(images)
            batch_len = labels.size(0)
            val_loss += loss_func(logits, labels) * batch_len

            # Predicted class = index of the largest logit along dim 1.
            predicted = logits.argmax(dim=1)
            hits = (predicted == labels).sum()
            n_correct += int(hits)

    n_total = len(valid_dl.dataset)
    return val_loss / n_total, n_correct / n_total

In [11]:
# Authenticate this session with Weights & Biases before any run is started;
# train_model calls wandb.init, which needs a logged-in session to upload to.
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [12]:
# Train the model
# Baseline run: uses whatever hyperparameters `config` currently holds.
train_model(config)

[34m[1mwandb[0m: Currently logged in as: [33meulerianknight[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/5 [00:00<?, ?it/s]

0,1
train/epoch,▁▁▁▁▁▁▁▁▃▃▃▃▃▃▃▃▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆████████
train/example_ct,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/train_loss,▇█▇▇▇▇▇▆▇▅▅▅▅▅▅▅▄▅▄▄▄▃▃▄▃▃▃▃▃▃▂▂▁▃▃▂▂▁▁▂
val/val_accuracy,▁▄▆▇█
val/val_loss,█▆▄▂▁

0,1
train/epoch,5.0
train/example_ct,40000.0
train/train_loss,0.85351
val/val_accuracy,0.846
val/val_loss,0.83672


In [13]:
# Try increasing learning_rate
# NOTE: this mutates the shared `config` object in place, so the new lr stays
# in effect for every later cell that reuses `config` (and for earlier cells
# if they are re-run without re-creating `config` first).
config.lr = 1e-4
train_model(config)

  0%|          | 0/5 [00:00<?, ?it/s]

0,1
train/epoch,▁▁▁▁▁▁▁▁▃▃▃▃▃▃▃▃▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆████████
train/example_ct,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/train_loss,█▇▆▅▅▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁
val/val_accuracy,▁▅▇▇█
val/val_loss,█▄▃▂▁

0,1
train/epoch,5.0
train/example_ct,40000.0
train/train_loss,0.14694
val/val_accuracy,0.9875
val/val_loss,0.12862


In [14]:
# Try changing other hyperparameters
# Only dropout and epochs are set here; lr is whatever the shared `config`
# already holds, so this run depends on the cells executed before it.
config.dropout = 0.1
config.epochs = 10
train_model(config)

  0%|          | 0/10 [00:00<?, ?it/s]

0,1
train/epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇████
train/example_ct,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/train_loss,█▇▅▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val/val_accuracy,▁▅▆▇▇█████
val/val_loss,█▅▃▃▂▂▁▁▁▁

0,1
train/epoch,10.0
train/example_ct,80000.0
train/train_loss,0.03019
val/val_accuracy,1.0
val/val_loss,0.03572
