# Train sample notebook

## Settings
Set up all hyperparameters and configuration values here.

In [5]:
# Root directory of the dataset; setup_training passes it to utils.data.split_dataset.
DATA_ROOT = "data"
# File path intended for the saved model weights (presumably a PyTorch checkpoint -- confirm).
MODEL_PATH = "model.pth"

class Hyperparameters:
    """Every tunable setting of a training run, stored as plain instance attributes.

    The instance is handed to ``setup_training`` (directly, or via ``wandb.config``
    after being serialized with :meth:`to_dict`).
    """

    def __init__(self):
        # Data
        self.val_ratio = 0.2  # forwarded to utils.data.split_dataset as val_ratio

        # Training hyperparameters
        self.batch_size = 128  # batch size of both data loaders
        self.epochs = 600  # number of training epochs; also sizes the cosine schedule

        # Optimizer & LR scheme
        self.opt_name = 'sgd'  # setup_training only accepts 'sgd'
        self.momentum = 0.9
        self.lr = 0.5
        self.lr_scheduler_name = 'cosine'  # setup_training only accepts 'cosine'
        self.lr_warmup_epochs = 5  # 0 disables the warm-up scheduler
        self.lr_warmup_method = 'linear'  # setup_training only accepts 'linear'
        self.lr_warmup_decay = 0.01  # start_factor of the linear warm-up

        # Regularization and Augmentation
        self.weight_decay = 2e-05
        self.norm_weight_decay = 0.0  # forwarded to utils.train.set_weight_decay
        self.label_smoothing = 0.1  # label_smoothing of the cross-entropy loss

        # Resizing
        self.val_crop_size = 224
        self.train_crop_size = 176

    def to_dict(self):
        """Return the hyperparameters as a new ``{attribute_name: value}`` dict.

        A shallow copy is returned (not the live ``__dict__``) so that a caller
        modifying the result cannot silently change this instance.
        """
        return dict(vars(self))


## Import & Setup

In [19]:
import importlib
import utils; importlib.reload(utils)
import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
from torchvision.transforms import transforms as T
from model import ResNet50Wrapper
import numpy as np
from tqdm import tqdm
import wandb
from typing import Tuple

# Resolve the compute device once; train_one_epoch and evaluate move every batch to DEVICE.
DEVICE = utils.torch.detect_device(verbose=True)
# Bare expression: the notebook displays the resulting torch.device as the cell output.
torch.device(DEVICE)

CUDA is available device: NVIDIA GeForce RTX 3070


device(type='cuda')

In [8]:

def _build_preprocess(crop_size):
    """Return the resize -> center-crop -> tensor -> normalize transform for one split.

    Args:
        crop_size: side length handed to ``T.CenterCrop``.
    """
    return T.Compose([
        T.Resize([232, ]),
        T.CenterCrop(crop_size),
        T.PILToTensor(),
        T.ConvertImageDtype(torch.float),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])


def setup_training(config: Hyperparameters) -> Tuple[DataLoader, DataLoader, nn.Module, nn.Module, torch.optim.Optimizer, torch.optim.lr_scheduler._LRScheduler]:
    """Build every object needed for a training run from ``config``.

    Splits the dataset under ``DATA_ROOT``, creates the train/validation loaders,
    the model, the loss, the SGD optimizer and the (optionally warmed-up) cosine
    learning-rate scheduler.

    Args:
        config: object exposing the ``Hyperparameters`` attributes (a
            ``Hyperparameters`` instance or ``wandb.config``).

    Returns:
        ``(train_loader, val_loader, model, criterion, optimizer, lr_scheduler)``.

    Raises:
        NotImplementedError: if ``opt_name`` is not ``'sgd'``, ``lr_scheduler_name``
            is not ``'cosine'``, or warm-up is enabled with a method other than
            ``'linear'``.
    """
    print("splitting dataset...")
    train_dir, val_dir = utils.data.split_dataset(DATA_ROOT, val_ratio=config.val_ratio)

    print("Loading training data...")
    train_dataset = ImageFolder(root=train_dir, transform=_build_preprocess(config.train_crop_size))

    print("Loading validation data...")
    # Validation uses its own crop size; the original code reused train_crop_size
    # here, which left config.val_crop_size without any effect.
    val_dataset = ImageFolder(root=val_dir, transform=_build_preprocess(config.val_crop_size))

    print("Creating data loaders...")
    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, num_workers=16, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=config.batch_size, shuffle=False, num_workers=16, pin_memory=True)

    print("Creating model...")
    model = ResNet50Wrapper(num_classes=len(train_dataset.classes))
    # The training/evaluation loops move every batch to DEVICE, so the model has to
    # live there too. Done before the optimizer is created; this is a no-op if the
    # wrapper already placed itself on DEVICE.
    model = model.to(DEVICE)

    print("Creating criterion...")
    criterion = nn.CrossEntropyLoss(label_smoothing=config.label_smoothing)

    print("Creating optimizer...")
    if config.opt_name != 'sgd':
        raise NotImplementedError("Only SGD is supported")
    parameters = utils.train.set_weight_decay(
        model,
        config.weight_decay,
        norm_weight_decay=config.norm_weight_decay
    )
    optimizer = torch.optim.SGD(
        parameters,
        lr=config.lr,
        momentum=config.momentum,
        weight_decay=config.weight_decay
    )

    print("Creating learning rate scheduler...")
    if config.lr_scheduler_name != 'cosine':
        raise NotImplementedError("Only cosine is supported")
    # The cosine phase only covers the epochs that remain after warm-up.
    main_lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer,
        config.epochs - config.lr_warmup_epochs,
        eta_min=0
    )

    if config.lr_warmup_epochs > 0:
        if config.lr_warmup_method != 'linear':
            raise NotImplementedError("Only linear is supported")
        warmup_lr_scheduler = optim.lr_scheduler.LinearLR(
            optimizer,
            start_factor=config.lr_warmup_decay,
            total_iters=config.lr_warmup_epochs
        )
        # Run the warm-up first, then switch to the cosine schedule at the milestone.
        lr_scheduler = optim.lr_scheduler.SequentialLR(
            optimizer,
            schedulers=[warmup_lr_scheduler, main_lr_scheduler],
            milestones=[config.lr_warmup_epochs]
        )
    else:
        lr_scheduler = main_lr_scheduler

    return train_loader, val_loader, model, criterion, optimizer, lr_scheduler

## Training

In [9]:

def train_one_epoch(model, criterion, optimizer, data_loader, epoch):
    """Train ``model`` for one pass over ``data_loader`` and log to wandb.

    Every 10th batch the loss, top-1/top-3 accuracy, learning rate and epoch are
    logged. The wandb step is the number of samples consumed before the logged
    batch (across all epochs so far).

    Args:
        model: network to optimize; called as ``model(images)``.
        criterion: loss callable taking ``(outputs, labels)``.
        optimizer: optimizer stepping the model's parameters.
        data_loader: iterable of ``(images, labels)`` batches with a sized
            ``dataset`` attribute.
        epoch: zero-based index of the current epoch.
    """
    model.train()
    # Running sample count. The original computed `i * len(images)`, which uses the
    # size of the *current* batch for all previous ones; with a short final batch
    # the step could jump backwards, and wandb rejects non-monotonic steps.
    samples_seen = epoch * len(data_loader.dataset)
    for i, (images, labels) in enumerate(data_loader):
        images, labels = images.to(DEVICE), labels.to(DEVICE)
        outputs = model(images)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # wandb log every 10 batches
        if i % 10 == 0:
            acc1, acc3 = utils.train.accuracy(outputs, labels, topk=(1, 3))
            wandb.log({
                "train/loss": loss.item(),
                "train/acc1": acc1,
                "train/acc3": acc3,
                "train/lr": optimizer.param_groups[0]['lr'],
                "train/epoch": epoch,
            }, step=samples_seen)

        samples_seen += len(images)



def evaluate(model, criterion, data_loader, example_ct, num_samples=10):
    """Evaluate ``model`` on ``data_loader`` and log metrics plus sample predictions.

    Args:
        model: network to evaluate; called as ``model(images)``.
        criterion: loss callable taking ``(outputs, labels)``.
        data_loader: iterable of ``(images, labels)`` batches whose ``dataset`` is
            sized, indexable and exposes ``classes``.
        example_ct: wandb step under which everything is logged.
        num_samples: maximum number of randomly chosen dataset items whose
            prediction is logged as an image (default 10, as before).

    Returns:
        ``(acc1_avg, acc3_avg, loss_avg)`` -- the sample-weighted averages over
        the whole loader.
    """
    model.eval()
    acc1_avg, acc3_avg, loss_avg = utils.train.AverageMeter(), utils.train.AverageMeter(), utils.train.AverageMeter()
    with torch.inference_mode():
        for images, labels in data_loader:
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            outputs = model(images)
            loss_value = criterion(outputs, labels).item()

            # calculate metrics, weighted by the batch size
            acc1, acc3 = utils.train.accuracy(outputs, labels, topk=(1, 3))
            batch_size = images.size(0)
            acc1_avg.update(acc1, batch_size)
            acc3_avg.update(acc3, batch_size)
            loss_avg.update(loss_value, batch_size)

        # log to wandb
        wandb.log({
            "val/loss": loss_avg.avg,
            "val/acc1": acc1_avg.avg,
            "val/acc3": acc3_avg.avg,
        }, step=example_ct)

        # Randomly pick dataset items (not batches) to visualize. The original drew
        # indices from len(data_loader), i.e. the batch count, which only ever
        # reached the first few items and raised ValueError with fewer than 10 batches.
        dataset = data_loader.dataset
        sample_count = min(num_samples, len(dataset))
        sample_images = []
        if sample_count > 0:
            sample_indices = np.random.choice(len(dataset), sample_count, replace=False)
            for idx in map(int, sample_indices):
                image, target = dataset[idx]
                image = image.unsqueeze(0).to(DEVICE)
                outputs = model(image)
                _, preds = torch.max(outputs, 1)
                preds = preds.item()
                label = dataset.classes[target]
                pred = dataset.classes[preds]
                # get pred confidence
                pred_confidence = torch.softmax(outputs, dim=1)[0, preds].item()
                sample_images.append(
                    wandb.Image(image[0], caption=f"pred: {pred}({pred_confidence:.2f}), label: {label}")
                )

        # One log call for all images: repeated calls with the same key and step
        # would overwrite each other and keep only the last image.
        if sample_images:
            wandb.log({"val/sample": sample_images}, step=example_ct)

    return acc1_avg.avg, acc3_avg.avg, loss_avg.avg


def pipline(hyperparameters: Hyperparameters):
    """Run a complete training experiment inside a wandb run.

    Builds all training objects from ``hyperparameters``, then for every epoch:
    trains, steps the learning-rate scheduler and evaluates on the validation
    loader. Whenever the validation top-1 accuracy improves, the model weights
    are written to ``MODEL_PATH`` so that the result of the run is not lost.

    Args:
        hyperparameters: settings of the run; logged to wandb as the run config.
    """
    with wandb.init(project="new-sota-model", config=hyperparameters.to_dict()):
        config: Hyperparameters = wandb.config
        train_loader, val_loader, model, criterion, optimizer, lr_scheduler = setup_training(config)

        print("Start training...")
        wandb.watch(model, criterion, log="all", log_freq=10)
        best_acc1 = float("-inf")
        for epoch in tqdm(range(config.epochs)):
            train_one_epoch(model, criterion, optimizer, train_loader, epoch)
            lr_scheduler.step()
            # Validation is logged at the number of training samples seen so far.
            example_ct = (epoch + 1) * len(train_loader.dataset)
            acc1, _, _ = evaluate(model, criterion, val_loader, example_ct)

            # Keep the best-performing weights on disk.
            # assumes acc1 is a float or a one-element tensor -- TODO confirm
            acc1_value = float(acc1)
            if acc1_value > best_acc1:
                best_acc1 = acc1_value
                torch.save(model.state_dict(), MODEL_PATH)

In [11]:
def main():
    """Entry point: run the training pipeline with the default hyperparameters."""
    pipline(Hyperparameters())

In [14]:
# Build the training objects directly, outside of a wandb run, to check the setup.
# `hyperparameters` is created here because it is otherwise only a local variable
# of main(), so this cell would raise NameError on a freshly restarted kernel.
hyperparameters = Hyperparameters()
train_loader, val_loader, model, criterion, optimizer, lr_scheduler = setup_training(hyperparameters)

splitting dataset...
Aborting.
Loading training data...
Loading validation data...
Creating data loaders...
Creating model...
Creating criterion...
Creating optimizer...
Creating learning rate scheduler...
