In [None]:
import random
import numpy as np

import torch
from torch.utils.data import SequentialSampler, BatchSampler, DataLoader

import datasets.transforms as T
from datasets.coco import CocoDetection, collate_fn
from models.backbone import ResNetBackbone
from models.transformer import TransformerBitLinear, Transformer
from models.detr import DETR, SetCriterion
from models.matcher import HungarianMatcher
from util.misc import rescale_bboxes, plot_results

In [None]:
# Fix the seed of every RNG imported by this notebook (torch, NumPy, stdlib random)
# so that repeated runs start from the same random state.
seed = 42
for seed_rng in (torch.manual_seed, np.random.seed, random.seed):
  seed_rng(seed)

class Args:
  """Plain attribute bundle holding the dataset settings for this experiment."""
  # Dataset root; the train2017 image folder and the annotation file used below live under it.
  coco_path: str = "/workspace/coco"
  # NOTE(review): not read by any visible cell; presumably mirrors the DETR CLI options — confirm.
  dataset_file: str = "coco"
  masks: bool = False
args = Args()

In [None]:

# Candidate sizes handed to the multi-scale RandomResize steps.
scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]

# First option given to RandomSelect: a single multi-scale resize.
resize_only = T.RandomResize(scales, max_size=1333)

# Second option given to RandomSelect: coarse resize, random crop, then the multi-scale resize.
resize_crop_resize = T.Compose([
  T.RandomResize([400, 500, 600]),
  T.RandomSizeCrop(384, 600),
  T.RandomResize(scales, max_size=1333),
])

# Tensor conversion followed by per-channel normalisation
# (mean/std values are the commonly used ImageNet statistics).
to_normalized_tensor = T.Compose([
  T.ToTensor(),
  T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

# Full training pipeline: augmentation first, normalisation last.
# NOTE(review): RandomSelect presumably picks one of its two arguments per sample — confirm in datasets/transforms.py.
transform_train = T.Compose([
  T.RandomHorizontalFlip(),
  T.RandomSelect(resize_only, resize_crop_resize),
  to_normalized_tensor,
])

In [None]:
# Build the COCO train2017 dataset from the shared `args` config instead of repeating
# the absolute paths and the masks flag as literals (the resulting values are unchanged).
train_images_dir = f"{args.coco_path}/train2017"
train_annotations = f"{args.coco_path}/annotations/instances_train2017.json"
dataset = CocoDetection(train_images_dir, train_annotations, transform_train, return_masks=args.masks)

# Sequential (unshuffled) sampling in fixed batches of 8; the incomplete final batch is dropped.
sampler_train = SequentialSampler(dataset)
batch_sampler_train = BatchSampler(sampler_train, batch_size=8, drop_last=True)
data_loader_train = DataLoader(dataset, batch_sampler=batch_sampler_train, collate_fn=collate_fn, num_workers=1)

# Keep an explicit iterator so a later cell can pull batches with next().
batch_fetcher = iter(data_loader_train)

In [None]:
# Pull one batch from the loader iterator; the visible cells below only ever use this batch.
# x and x_mask are tensors (moved with .to(device) later); y is a list of per-image target dicts
# whose "size", "labels" and "boxes" entries are read by the inspection cells.
x, x_mask, y = next(batch_fetcher)

## DETR Model

In [None]:
device = "cuda"
num_classes = 91  # shared by the DETR head and the loss below

# Model: ResNet backbone + BitLinear transformer wrapped in DETR.
# NOTE(review): the positional transformer arguments presumably follow the reference DETR order
# (d_model, nhead, encoder layers, decoder layers, dim_feedforward, dropout) — confirm in models/transformer.py.
backbone = ResNetBackbone()
transformer = TransformerBitLinear(256, 8, 6, 6, 2048, 0.1)
model = DETR(backbone, transformer, num_classes=num_classes, num_queries=100)
model = model.to(device)

# Loss: set criterion driven by Hungarian matching between predictions and targets.
matcher = HungarianMatcher()
criterion = SetCriterion(num_classes, matcher)
criterion = criterion.to(device)

# Optimisation: AdamW plus a gradient scaler for the float16 autocast training loop.
adamw_options = dict(lr=1e-4, weight_decay=1e-4)
optimizer = torch.optim.AdamW(model.parameters(), **adamw_options)
scaler = torch.cuda.amp.GradScaler()

In [None]:
# List every parameter whose qualified name contains "transformer" and report their
# combined element count (parameters outside the transformer are not included in the total).
transformer_named_params = [
  (name, param) for name, param in model.named_parameters() if "transformer" in name
]
for name, _ in transformer_named_params:
  print(name)
n_parameters = sum(param.numel() for _, param in transformer_named_params)
print('number of params:', n_parameters)

In [None]:
# Switch the model to training mode and move the cached batch onto the training device.
model = model.train()
x = x.to(device)
x_mask = x_mask.to(device)
# Each target is a dict of tensors; move every value, keep the keys.
y = [
  {key: value.to(device) for key, value in target.items()}
  for target in y
]

In [None]:
from torch.utils.tensorboard import SummaryWriter

# Repeatedly optimise on the single cached batch (x, x_mask, y) and log the loss to TensorBoard.
writer = SummaryWriter(log_dir="run/transformer-bitlinear-no_quant_post_norm")
max_grad_norm = 0.1  # clipping threshold, expressed in true (unscaled) gradient units

# Note: `epochs` counts optimisation steps on the same batch, not passes over the dataset.
for epochs in range(500):
  optimizer.zero_grad()
  with torch.amp.autocast(device_type="cuda", dtype=torch.float16):
    outputs_logits, outputs_boxes = model(x, x_mask)
    loss = criterion(outputs_logits, outputs_boxes, y)

  writer.add_scalar("Loss/train", loss, epochs)

  scaler.scale(loss).backward()
  # The gradients are still multiplied by the loss scale at this point. They must be
  # unscaled before clipping, otherwise max_grad_norm is compared against scaled norms
  # and the effective threshold changes with the scaler's current scale factor.
  scaler.unscale_(optimizer)
  torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
  # scaler.step() knows unscale_ was already called for this optimizer and skips the
  # step if the gradients contain inf/NaN.
  scaler.step(optimizer)
  scaler.update()

# Make sure all pending events are written to disk once the loop is done.
writer.flush()

## Inspect predictions

In [None]:
i = 1  # index of the batch element to visualise
with torch.no_grad():
  h, w = y[i]["size"]
  # Channel-first image tensor -> channel-last NumPy array for plotting.
  im = x[i].cpu().numpy().transpose((1, 2, 0))
  # Softmax over the class dimension, then drop the last class column.
  # NOTE(review): the last column is presumably DETR's "no object" class — confirm in models/detr.py.
  class_prob = outputs_logits[i].softmax(-1)
  prob = class_prob[:, :-1]
  # Keep only the queries whose best class probability exceeds 0.25.
  keep = prob.max(-1).values > 0.25
  boxes = rescale_bboxes(outputs_boxes[i][keep, :], (w, h))
  plot_results(im, prob[keep], boxes)

## Inspect ground truth

In [None]:
i = 1  # index of the batch element to visualise
target = y[i]
h, w = target["size"]
# Channel-first image tensor -> channel-last NumPy array for plotting.
im = x[i].cpu().numpy().transpose((1, 2, 0))

# Build one-hot "probabilities" (one row per annotated object, 91 columns) so the
# ground truth can be drawn with the same plot_results helper as the predictions.
labels = target["labels"].cpu()
num_objects = len(labels)
prob = torch.zeros((num_objects, 91))
prob[torch.arange(num_objects), labels] = 1

plot_results(im, prob, rescale_bboxes(target["boxes"], (w, h)))

In [None]:
import pathlib

import torch
import torch.utils.data

from torchvision import models, datasets, tv_tensors
from torchvision.transforms import v2
from helpers import plot

In [None]:
	# Rebuild the training transform pipeline (no dataset is created in this cell).
	# This repeats the `scales` / `transform_train` definition from the earlier transform cell.
	# NOTE(review): every line carries a leading tab; this presumably only runs because the
	# notebook front end strips a cell's common leading indentation — confirm before moving to a .py file.
	scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
	transform_train = T.Compose([
	  # augmentation: horizontal flip, then one of two resize strategies
	  T.RandomHorizontalFlip(),
	  T.RandomSelect(
	    T.RandomResize(scales, max_size=1333),
	    T.Compose([
	      T.RandomResize([400, 500, 600]),
	      T.RandomSizeCrop(384, 600),
	      T.RandomResize(scales, max_size=1333),
	    ])
	  ),
	  # normalize: convert to tensor, then apply per-channel mean/std
	  T.Compose([
	    T.ToTensor(),
	    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
	  ])
	])

In [None]:
class NormalizeBoundingBox:
  """Pass-through transform that prints its inputs.

  Despite the name, no normalisation is performed: the image(s) and target(s) are
  printed, in that order, and returned unchanged as a 2-tuple.
  """

  def __call__(self, images, targets):
    for item in (images, targets):
      print(item)
    return images, targets


In [None]:
# Re-seed torch so the random v2 transforms below start from a known state.
torch.manual_seed(0)

# torchvision v2 pipeline applied jointly to the image and its target.
shortest_side_choices = (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
pipeline_steps = [
  v2.ToImage(),
  v2.RandomHorizontalFlip(p=0.5),
  v2.RandomShortestSize(min_size=shortest_side_choices, max_size=1333),
  v2.ToDtype(torch.float32, scale=True),
  v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
  v2.ConvertBoundingBoxFormat("CXCYWH"),
  v2.SanitizeBoundingBoxes(),
  # Debug hook defined in this notebook: prints its inputs and returns them unchanged.
  NormalizeBoundingBox(),
  v2.ToPureTensor(),
]
transforms = v2.Compose(pipeline_steps)

# COCO train2017 wrapped for the v2 transforms, exposing only the boxes and labels of each target.
dataset = datasets.wrap_dataset_for_transforms_v2(
  datasets.CocoDetection(
    "/workspace/coco/train2017",
    "/workspace/coco/annotations/instances_train2017.json",
    transforms=transforms,
  ),
  target_keys=("boxes", "labels"),
)

In [None]:
# Fetch the first sample and print the Python types of the image, the target,
# and the target's "boxes" / "labels" entries, plus the target's keys.
img, target = dataset[0]
print(f"{type(img) = }\n{type(target) = }\n{target.keys() = }")
print(f"{type(target['boxes']) = }\n{type(target['labels']) = }")

In [None]:
# Bare expression: the notebook displays the target fetched from dataset[0] in the previous cell.
target

In [None]:
# Visualise the first two samples with the `plot` helper imported from `helpers`.
# NOTE(review): each dataset[...] access presumably re-runs the random transform pipeline, so
# NormalizeBoundingBox prints its inputs again and the images may differ from the sample above — confirm.
plot([dataset[0], dataset[1]])