In [1]:
# !pip install wandb
# !wandb login

## basics

```
import wandb
wandb.init()
```

In [1]:
import argparse
import random # to set the python random seed
import numpy # to set the numpy random seed
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torchvision import models
from datetime import datetime
# Ignore excessive warnings
import logging
logging.propagate = False 
logging.getLogger().setLevel(logging.ERROR)

# WandB – Import the wandb library
import wandb

## summary

```
!pip install wandb
wandb login
# api_key
~/.netrc
```

- WandB: weights & biases

```
wandb.init(project="wandb-demo-0423")
# 字典（dict）
config = wandb.config
config[k] = v

# 实例化模型
model = Net().to(device)
train_dataset
test_dataset
train_dataloader
test_dataloader

# 监控模型，histogram weights and biases
wandb.watch(model, log="all")


for epoch in range(n_epochs):
    train_loss, train_acc = train(model, train_dataloader)
    # 字典的形式
    wandb.log({"train_loss": train_loss, "train_acc": train_acc})
    # 评估，不进行参数的更新
    test_loss, test_acc = test(model, test_dataloader)
    wandb.log({"test_loss": test_loss, "test_acc": train_acc})
```

## model, train & test

In [2]:
def train(train_dataloader, model, criterion, optimizer, device):
    total_loss = 0
    total_correct = 0
    total_batch = len(train_dataloader)
    for batch_idx, (images, labels) in enumerate(train_dataloader):
        images = images.to(device)
        labels = labels.to(device)

        # forward
        out = model(images)
        loss = criterion(out, labels)

        # 标准的处理，用 validate data；这个过程是监督训练过程，用于 early stop
        n_corrects = (out.argmax(axis=1) == labels).sum().item()
        acc = n_corrects/labels.size(0)

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()   # 更细 模型参数
        
        total_loss += loss.item()
        total_correct += n_corrects
        
        if (batch_idx+1) % 200 == 0:
            print(f'{datetime.now()}, {batch_idx+1}/{total_batch}: {loss.item():.4f}, acc: {acc}')
    total_errors = len(train_dataloader.dataset) - total_correct
    return total_loss, total_correct/len(train_dataloader.dataset), total_errors

In [4]:
def test(test_dataloader, model, criterion, device, classes):
    total_loss = 0
    total_correct = 0
    example_images = []
    model.eval()
    for images, labels in test_dataloader:
        images = images.to(device)
        labels = labels.to(device)
        out = model(images)
        loss = criterion(out, labels)
        total_loss += loss.item()
        preds = torch.argmax(out, dim=1)
        total_correct += (preds == labels).sum().item()
        
        mis_preds_indice = torch.flatten((preds != labels).nonzero())
        mis_preds = preds[mis_preds_indice]
        mis_labels = labels[mis_preds_indice]
        mis_images = images[mis_preds_indice]
        
        # 13*8 + 4 == 108
        for index in range(len(mis_preds)):
            example_images.append(wandb.Image(mis_images[index], 
                                              caption="Pred: {} Truth: {}".format(classes[mis_preds[index].item()],
                                                                                  classes[mis_labels[index]])))
    total_errors = len(test_loader.dataset) - total_correct
    return example_images, total_loss, total_correct / len(test_loader.dataset), total_errors


## wandb config & dataset

In [5]:
import os
# os.environ["WANDB_API_KEY"] = ''
os.environ["WANDB_MODE"] = "online"

# WandB – Initialize a new run
# 一个 project 可以 run 多次
wandb.init(project="wandb-demo-0423")
wandb.watch_called = False # Re-run the model without restarting the runtime, unnecessary after our next release

[34m[1mwandb[0m: Currently logged in as: [33mlanchunhui[0m ([33mloveresearch[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
# empty dict
wandb.config

{}

In [7]:
# WandB – Config is a variable that holds and saves hyperparameters and inputs
config = wandb.config          # Initialize config
config.batch_size = 64          # input batch size for training (default: 64)
config.test_batch_size = 32    # input batch size for testing (default: 1000)
config.epochs = 30             # number of epochs to train (default: 10)
config.lr = 1e-3              # learning rate (default: 0.01)
config.momentum = 0.9         # SGD momentum (default: 0.5) 
config.weight_decay = 5e-4
config.no_cuda = False         # disables CUDA training
config.seed = 42               # random seed (default: 42)
config.log_interval = 10     # how many batches to wait before logging training status

In [14]:
use_cuda = not config.no_cuda and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(device)
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
print(kwargs)

cuda
{'num_workers': 1, 'pin_memory': True}


In [9]:
model = models.resnet18(pretrained=False)
in_features = model.fc.in_features
model.fc = nn.Linear(in_features, 10)
model = model.to(device)



In [10]:
transform = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010))
])
    
# Now we load our training and test datasets and apply the transformations defined above
train_dataset = datasets.CIFAR10(root='./data', train=True,
                                 download=True, transform=transform)
test_dataset = datasets.CIFAR10(root='./data', train=False,
                                download=True, transform=transform)

train_loader = torch.utils.data.DataLoader(train_dataset, 
                                           batch_size=config.batch_size,
                                           shuffle=True, 
                                           **kwargs)
test_loader = torch.utils.data.DataLoader(test_dataset, 
                                          batch_size=config.test_batch_size,
                                          shuffle=False, 
                                          **kwargs)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Files already downloaded and verified
Files already downloaded and verified


In [15]:
print(len(train_dataset))
print(len(train_dataset)//config.batch_size)
print(len(train_loader))

50000
781
782


In [16]:
print(len(test_dataset))
print(len(test_dataset)//config.test_batch_size)
print(len(test_loader))

10000
312
313


## training pipeline

In [13]:

# Set random seeds and deterministic pytorch for reproducibility
# random.seed(config.seed)       # python random seed
torch.manual_seed(config.seed) # pytorch random seed
# numpy.random.seed(config.seed) # numpy random seed
torch.backends.cudnn.deterministic = True

# Load the dataset: We're training our CNN on CIFAR10 (https://www.cs.toronto.edu/~kriz/cifar.html)
# First we define the tranformations to apply to our images


# Initialize our model, recursively go over all modules and convert their parameters and buffers to CUDA tensors (if device is set to cuda)
optimizer = torch.optim.SGD(model.parameters(), lr=config.lr, momentum=config.momentum, weight_decay=config.weight_decay)
criterion = nn.CrossEntropyLoss()

# WandB – wandb.watch() automatically fetches all layer dimensions, gradients, model parameters and logs them automatically to your dashboard.
# Using log="all" log histograms of parameter values in addition to gradients
wandb.watch(model, log="all")

for epoch in range(1, config.epochs + 1):
    train_loss, train_acc, train_errors = train(train_loader, model, criterion, optimizer, device)
    wandb.log({"train_loss": train_loss, "train_acc": train_acc, "train_errors": train_errors})
    # test_dataloader, model, criterion, device, classes
    example_images, test_loss, test_acc, test_errors = test(test_loader, model, criterion, device, classes)
    wandb.log({'example_images': example_images, 'test_loss': test_loss, 'test_acc': test_acc, 'test_errors': test_errors})
    print()
    print(f'{datetime.now()}, epoch: {epoch}, train_loss: {train_loss:.4f}, train_acc: {train_acc:.2f}, test_loss: {test_loss:.4f}, test_acc: {test_acc:.2f}')
    print()


2023-04-23 22:02:47.546538, 200/782: 1.7578, acc: 0.3125
2023-04-23 22:03:01.684265, 400/782: 1.5881, acc: 0.40625
2023-04-23 22:03:15.786362, 600/782: 1.7024, acc: 0.359375

2023-04-23 22:04:48.248923, epoch: 1, train_loss: 1312.3292, train_acc: 0.38, test_loss: 451.0153, test_acc: 0.47

2023-04-23 22:05:03.120458, 200/782: 1.3696, acc: 0.578125
2023-04-23 22:05:17.786350, 400/782: 1.7198, acc: 0.390625
2023-04-23 22:05:31.756241, 600/782: 1.3919, acc: 0.46875

2023-04-23 22:06:59.056604, epoch: 2, train_loss: 1111.5958, train_acc: 0.48, test_loss: 430.7115, test_acc: 0.50

2023-04-23 22:07:14.636778, 200/782: 1.3315, acc: 0.453125
2023-04-23 22:07:29.614013, 400/782: 1.1030, acc: 0.515625
2023-04-23 22:07:43.630946, 600/782: 0.9127, acc: 0.5625

2023-04-23 22:09:02.521576, epoch: 3, train_loss: 951.4182, train_acc: 0.56, test_loss: 373.4743, test_acc: 0.56

2023-04-23 22:09:18.509211, 200/782: 1.1813, acc: 0.5625
2023-04-23 22:09:32.542236, 400/782: 0.8315, acc: 0.734375
2023-04-23 2

2023-04-23 22:50:33.631395, 600/782: 0.0246, acc: 0.984375

2023-04-23 22:51:20.359464, epoch: 29, train_loss: 22.9649, train_acc: 0.99, test_loss: 302.0734, test_acc: 0.79

2023-04-23 22:51:35.176612, 200/782: 0.0075, acc: 1.0
2023-04-23 22:51:49.426564, 400/782: 0.0012, acc: 1.0
2023-04-23 22:52:03.474307, 600/782: 0.0045, acc: 1.0

2023-04-23 22:52:50.013599, epoch: 30, train_loss: 6.8951, train_acc: 1.00, test_loss: 355.9989, test_acc: 0.80



In [None]:
# WandB – Save the model checkpoint. This automatically saves a file to the cloud and associates it with the current run.
torch.save(model.state_dict(), "model.ckpt")
wandb.save('model.ckpt')