# Scene Grounding Model

## 1. Imports and Instantiation

In [None]:
!pip install torch torchvision transformers Pillow tqdm

In [None]:
# Mount Google Drive when running on Colab. Outside Colab the `google.colab`
# package does not exist, so the mount is skipped instead of raising; this
# keeps "Restart & Run All" working on a local machine.
try:
    from google.colab import drive
except ImportError:
    print("google.colab not available - skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

Imports

In [None]:
import os
import torch
import torchvision
import torch.nn as nn, torch.nn.functional as F
from torch.utils.data import DataLoader
from pathlib import Path

from transformers import AutoTokenizer, RobertaModel

from scripts.utils import RefCOCODataset, FinalModel, box_loss, contrastive_loss, generalized_iou

Version Check

In [None]:
# Report the installed torch / torchvision versions and the file torchvision
# was loaded from, one item per line.
print(
    f"torch : {torch.__version__}",
    f"torchvision : {torchvision.__version__}",
    f"torchvision file : {Path(torchvision.__file__).resolve()}",
    sep="\n",
)

Instantiate Text Encoder and Tokenizer

In [None]:
# Load the pretrained RoBERTa tokenizer and encoder.
tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)
text_encoder = RobertaModel.from_pretrained("roberta-base")

# Freeze the text encoder: turn off gradient tracking for all of its parameters.
text_encoder.requires_grad_(False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 2. Processing Begins

Variables

In [None]:
# Run configuration: everything a reader might want to tune lives in this cell.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE  = 16
EPOCHS      = 20
LR          = 1e-4
ALPHA_CTR   = 0.1          # weight for contrastive term
MARGIN_CTR  = 0.2          # margin passed to contrastive_loss
PATIENCE    = 3            # epochs to wait before LR drop
FACTOR      = 0.5          # LR multiplier on plateau
CLIP_NORM   = 1.0          # max gradient norm used for clipping

DATA_DIR = Path("")                     # change to your dataset directory (default: current working directory)
# Directory that receives the saved weights. This must be a real value: the
# previous bare annotation (`CHECKPOINT_PATH : Path`) never bound the name, so
# the directory creation below failed with NameError. You are free to edit it.
CHECKPOINT_PATH = Path("checkpoints")
CHECKPOINT_PATH.mkdir(parents=True, exist_ok=True)

Instantiating Datasets and Loaders

In [None]:
# Build the three dataset splits from DATA_DIR, sharing the same tokenizer.
train_set = RefCOCODataset(DATA_DIR, "train", tokenizer)
val_set = RefCOCODataset(DATA_DIR, "val", tokenizer)
test_set = RefCOCODataset(DATA_DIR, "test", tokenizer)

# Pinned host memory only helps host-to-GPU copies; on CPU-only runs it is
# useless and makes DataLoader emit a warning, so enable it only for CUDA.
pin = DEVICE == "cuda"

# Only the training split is shuffled. Evaluation splits are iterated in a
# fixed order so that validation/test runs are deterministic and comparable.
train_load = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, pin_memory=pin)
val_load = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False, pin_memory=pin)
test_load = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False, pin_memory=pin)

Instantiating Final Model

In [None]:
# Build the grounding model around the frozen text encoder and move it to DEVICE.
model = FinalModel(textbackbone=text_encoder).to(DEVICE)

# lower LR for backbone, normal LR for the rest
# NOTE(review): the second group is selected by substring match, so any
# parameter whose qualified name contains "backbone" is excluded from it -
# confirm this matches FinalModel's attribute names.
param_groups = [
    {"params": model.backbone.parameters(), "lr": 0.25*LR},
    {"params": [p for n,p in model.named_parameters() if "backbone" not in n],
     "lr": LR}
]
optim     = torch.optim.AdamW(param_groups, weight_decay=1e-4)

# Halve-on-plateau schedule driven by the validation metric (lower is better).
# The deprecated `verbose` argument is intentionally not passed: the training
# loop already prints the current learning rate every epoch.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optim, mode="min", factor=FACTOR, patience=PATIENCE)

# Gradient scaler used together with torch.autocast in the training step.
scaler = torch.amp.GradScaler(device=DEVICE)

  scaler = torch.cuda.amp.GradScaler()


## 3. Training Begins

In [None]:
def train_one_epoch():
    """Run one optimisation pass over ``train_load`` and return the mean loss.

    For every batch the model is run under autocast, the box loss and the
    contrastive loss (weighted by ``ALPHA_CTR``) are summed, and the optimizer
    is stepped through the gradient scaler after clipping gradients to
    ``CLIP_NORM``.

    Returns:
        The per-batch loss weighted by batch size, summed over the epoch and
        divided by ``len(train_set)``.
    """
    model.train()
    loss_sum = 0.
    for batch in train_load:
        images = batch["image"].to(DEVICE)
        target_boxes = batch["bbox"].to(DEVICE)  # expected (B,4) per the original annotation
        tokens = {name: tensor.to(DEVICE) for name, tensor in batch["tokens"].items()}

        optim.zero_grad(set_to_none=True)
        with torch.autocast(device_type=DEVICE):
            pred_boxes, decoder_out, image_tokens = model(images, tokens)
            bbox_term = box_loss(pred_boxes, target_boxes)
            ctr_term = contrastive_loss(decoder_out, image_tokens, margin=MARGIN_CTR)
            total = bbox_term + ALPHA_CTR * ctr_term

        scaler.scale(total).backward()
        # Gradients must be unscaled before clipping so the norm is measured
        # in true (unscaled) units.
        scaler.unscale_(optim)
        torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_NORM)
        scaler.step(optim)
        scaler.update()

        # Weight by batch size so a smaller final batch does not skew the mean.
        loss_sum += total.item() * images.size(0)
    return loss_sum / len(train_set)

In [None]:
@torch.no_grad()
def evaluate(loader):
    """Evaluate the model on ``loader`` without tracking gradients.

    For each sample, ``1 - generalized_iou(pred, gt)`` and the summed L1
    distance are computed per query; the per-sample maximum of the former and
    minimum of the latter are kept and then averaged over all samples.

    NOTE(review): whether the first value is "higher is better" depends on
    what ``generalized_iou`` returns (a similarity or a loss). That helper is
    not visible here - confirm, since callers add the two values together and
    minimise the sum.

    Args:
        loader: iterable of batches with "image", "bbox" and "tokens" entries.

    Returns:
        Tuple ``(giou_value, l1_value)`` of Python floats.
    """
    model.eval()
    giou_per_sample = []
    l1_per_sample = []
    for batch in loader:
        images = batch["image"].to(DEVICE)
        target = batch["bbox"].to(DEVICE).unsqueeze(1)  # (B,1,4) per the original annotation
        tokens = {name: tensor.to(DEVICE) for name, tensor in batch["tokens"].items()}

        pred, _, _ = model(images, tokens)

        giou_term = 1. - generalized_iou(pred, target).squeeze(-1)  # (B,Q) per the original annotation
        l1_term = F.l1_loss(pred, target.expand_as(pred), reduction="none").sum(-1)

        # Reduce over the query dimension: one scalar per sample.
        giou_per_sample.append(giou_term.max(dim=1).values)
        l1_per_sample.append(l1_term.min(dim=1).values)

    mean_giou = torch.cat(giou_per_sample).mean().item()
    mean_l1 = torch.cat(l1_per_sample).mean().item()
    return mean_giou, mean_l1

In [None]:
# Main training loop: train for one epoch, validate, step the plateau
# scheduler on the combined validation metric, then persist checkpoints.
#
# Each checkpoint also stores optimizer / scheduler / scaler state and the
# validation metrics, so an interrupted session (e.g. a Colab disconnect) can
# resume training rather than restart. The original "epoch" and
# "model_state_dict" keys are unchanged.
#
# NOTE(review): the value printed as "val mIoU" is the first output of
# evaluate() and is *added* to the L1 term and minimised; confirm that label
# and direction against generalized_iou's definition.
best_val = float("inf")

for ep in range(1, EPOCHS+1):
    train_loss = train_one_epoch()
    val_giou, val_l1 = evaluate(val_load)
    val_metric = val_giou + val_l1            # scheduler uses this

    scheduler.step(val_metric)                # ReduceLRonPlateau

    lr_now = optim.param_groups[1]["lr"]      # group 1 = non-backbone parameters
    print(f"Epoch {ep:02d} │ train L {train_loss:.4f} │ "
          f"val mIoU {val_giou:.3f} │ val L1 {val_l1:.3f} │ "
          f"LR {lr_now:.2e}")

    # Decide "best" before building the checkpoint so the stored best_val is
    # already up to date for this epoch.
    is_best = val_metric < best_val
    if is_best:
        best_val = val_metric

    checkpoint = {
        "epoch": ep,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optim.state_dict(),
        "scheduler_state_dict": scheduler.state_dict(),
        "scaler_state_dict": scaler.state_dict(),
        "val_metric": val_metric,
        "best_val": best_val,
    }

    # Always overwrite the rolling "latest" checkpoint.
    torch.save(checkpoint, CHECKPOINT_PATH / "final_weights.pth")

    if is_best:
        torch.save(checkpoint, CHECKPOINT_PATH / "best_weights.pth")
        print(f"New best validation metric: {best_val:.4f} - saved to best_weights.pth")

Epoch 01 │ train L 1.4163 │ val mIoU 0.792 │ val L1 0.528 │ LR 1.00e-04
Epoch 02 │ train L 1.2094 │ val mIoU 0.783 │ val L1 0.511 │ LR 1.00e-04
Epoch 03 │ train L 1.1308 │ val mIoU 0.749 │ val L1 0.520 │ LR 1.00e-04
Epoch 04 │ train L 1.0702 │ val mIoU 0.753 │ val L1 0.487 │ LR 1.00e-04
Epoch 05 │ train L 1.0234 │ val mIoU 0.744 │ val L1 0.483 │ LR 1.00e-04
Epoch 06 │ train L 0.9841 │ val mIoU 0.745 │ val L1 0.495 │ LR 1.00e-04
Epoch 07 │ train L 0.9449 │ val mIoU 0.727 │ val L1 0.486 │ LR 1.00e-04
Epoch 08 │ train L 0.9114 │ val mIoU 0.749 │ val L1 0.487 │ LR 1.00e-04
Epoch 09 │ train L 0.8815 │ val mIoU 0.713 │ val L1 0.464 │ LR 1.00e-04
Epoch 10 │ train L 0.8641 │ val mIoU 0.725 │ val L1 0.477 │ LR 1.00e-04
Epoch 11 │ train L 0.8345 │ val mIoU 0.726 │ val L1 0.466 │ LR 1.00e-04
Epoch 12 │ train L 0.8155 │ val mIoU 0.718 │ val L1 0.457 │ LR 1.00e-04
Epoch 13 │ train L 0.7967 │ val mIoU 0.732 │ val L1 0.477 │ LR 1.00e-04
Epoch 14 │ train L 0.7779 │ val mIoU 0.715 │ val L1 0.476 │ LR 1

In [None]:
# Evaluate on the held-out test split with the model as it stands in memory
# (the weights after the last completed epoch) and print a short report.
test_giou, test_l1 = evaluate(test_load)
print(
    "\n──────── TEST REPORT ────────",
    f"mIoU  (best query) : {test_giou:.3f}",
    f"Mean L1 (cxcywh)   : {test_l1:.3f}",
    "─────────────────────────────",
    sep="\n",
)


──────── TEST REPORT ────────
mIoU  (best query) : 0.700
Mean L1 (cxcywh)   : 0.445
─────────────────────────────
