# Setup

In [1]:
# Colab-only: mount Google Drive at /content/gdrive. The data and checkpoint
# paths used throughout this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import os
import urllib.request

# Helper modules from the torchvision detection reference scripts. They are
# fetched next to the notebook so that `import utils` / `from engine import ...`
# resolve as local modules.
# NOTE(review): the URL tracks the `main` branch, so the downloaded code can
# change between runs - consider pinning a tag or commit.
REFERENCE_BASE_URL = "https://raw.githubusercontent.com/pytorch/vision/main/references/detection"
REFERENCE_FILES = ("engine.py", "utils.py", "coco_utils.py", "coco_eval.py", "transforms.py")

for file_name in REFERENCE_FILES:
    # urlretrieve overwrites an existing file and raises on HTTP/network
    # errors, so a re-run never leaves a stale copy behind silently
    # (plain `wget` would write `engine.py.1` and keep the old `engine.py`).
    urllib.request.urlretrieve(f"{REFERENCE_BASE_URL}/{file_name}", file_name)

0

In [3]:
import json
import math
import os
import sys

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Subset, DataLoader
from torchvision import tv_tensors
from torchvision.io import decode_image
from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.transforms import v2 as T
from torchvision.transforms.v2.functional import to_dtype, get_size, convert_bounding_box_format
from tqdm import tqdm

import utils
from engine import train_one_epoch, evaluate

# Data

In [4]:
class AerialImageryDataset(torch.utils.data.Dataset):
    """Object-detection dataset over image tiles with rectangle annotations.

    Each entry of the annotation JSON must have an ``"image"`` path and may have
    a ``"label"`` list whose items carry ``x``, ``y``, ``width``, ``height`` and
    ``rectanglelabels``. Entries without ``"label"`` are background-only images.

    Args:
        image_dir_path: Directory containing the image files.
        annotation_path: Path to the JSON annotation file (a list of records).
        background_weight: Sampling weight stored for records without "label".
        train: When True, random horizontal/vertical flips are applied.

    Attributes:
        weights: One sampling weight per record: 3 if the record contains a
            "curb_extensions" box, ``background_weight`` if it has no "label"
            key, otherwise 1.
    """

    def __init__(self, image_dir_path, annotation_path, background_weight, train=False):
        self.image_dir_path = image_dir_path

        # Flips are training-only augmentation; dtype conversion and unwrapping
        # to plain tensors always run last.
        steps = []
        if train:
            steps.append(T.RandomHorizontalFlip(p=0.5))
            steps.append(T.RandomVerticalFlip(p=0.5))
        steps.append(T.ToDtype(torch.float, scale=True))
        steps.append(T.ToPureTensor())
        self.transforms = T.Compose(steps)

        # Class ids start at 1 (0 is not assigned to any annotation class here).
        self.label_map = {"daylighting": 1,
                          "safety_islands": 2,
                          "curb_extensions": 3}

        with open(annotation_path) as json_file:
            self.annotations = json.load(json_file)

        self.weights = []

        for item in tqdm(self.annotations):
            curr_weight = 1

            if "label" in item:
                # Up-weight any image that contains at least one curb extension.
                for label in item["label"]:
                    if label["rectanglelabels"][0] == "curb_extensions":
                        curr_weight = 3
                        break
            else:
                curr_weight = background_weight

            self.weights.append(curr_weight)

    @staticmethod
    def _percent_xywh_to_pixel_xyxy(regions, height, width):
        """Convert annotation rectangles from percent XYWH to pixel XYXY.

        Args:
            regions: Sequence of dicts with "x", "y", "width", "height" given
                on a 0-100 scale relative to the image (presumably a Label
                Studio export - verify against the annotation tool).
            height: Image height in pixels.
            width: Image width in pixels.

        Returns:
            A float32 tensor of shape (len(regions), 4) holding
            ``[x_min, y_min, x_max, y_max]`` in pixels.
        """
        if len(regions) == 0:
            return torch.zeros((0, 4), dtype=torch.float32)

        xywh = torch.tensor(
            [[r["x"], r["y"], r["width"], r["height"]] for r in regions],
            dtype=torch.float32)
        xyxy = xywh.clone()
        xyxy[:, 2:] += xywh[:, :2]

        # Scale x-coordinates by the width and y-coordinates by the height
        # instead of assuming a fixed 416x416 tile.
        scale = torch.tensor([width, height, width, height], dtype=torch.float32)
        return xyxy * scale / 100

    def __getitem__(self, idx):
        """Return ``(image, target)`` for record ``idx`` after transforms.

        ``target`` contains "boxes", "labels", "image_id", "area" and "iscrowd".
        """
        record = self.annotations[idx]
        # "%5C" is a URL-encoded backslash; only the final path component
        # (the file name) is kept.
        file_name = record["image"].split("%5C")[-1]

        img = decode_image(os.path.join(self.image_dir_path, file_name), mode="RGB")

        if img.dtype != torch.float:
            img = to_dtype(img, dtype=torch.float, scale=True)

        img = tv_tensors.Image(img)
        height, width = get_size(img)

        regions = record["label"] if "label" in record else []

        if len(regions) > 0:
            pixel_boxes = self._percent_xywh_to_pixel_xyxy(regions, height, width)
            # Wrapped as BoundingBoxes so the flip transforms move the boxes
            # together with the image.
            boxes = tv_tensors.BoundingBoxes(pixel_boxes, format="XYXY", canvas_size=(height, width))
            labels = torch.tensor([self.label_map[r["rectanglelabels"][0]] for r in regions])
        else:
            pixel_boxes = torch.zeros((0, 4), dtype=torch.float32)
            boxes = pixel_boxes
            labels = torch.zeros((0,), dtype=torch.int64)

        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        # The id is the file stem with underscores removed, parsed as an int.
        target["image_id"] = int("".join(file_name.split(".")[0].split("_")))
        target["area"] = (pixel_boxes[:, 2] - pixel_boxes[:, 0]) * (pixel_boxes[:, 3] - pixel_boxes[:, 1])
        # Always int64, including when there are no boxes.
        target["iscrowd"] = torch.zeros((len(pixel_boxes),), dtype=torch.int64)

        img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        """Number of annotation records."""
        return len(self.annotations)

In [5]:
from tqdm import tqdm

data_root = "/content/gdrive/My Drive/Grad School/Dissertation/Data"
image_path = os.path.join(data_root, "test")
label_path = os.path.join(data_root, "test.json")

batch_size = 16

# Two views over the same images/annotations: one with training-time flips,
# one without for validation and test.
dataset_train = AerialImageryDataset(
    image_dir_path=image_path,
    annotation_path=label_path,
    background_weight=0.001,
    train=True)
dataset_valtest = AerialImageryDataset(
    image_dir_path=image_path,
    annotation_path=label_path,
    background_weight=0.001,
    train=False)

# Seeded permutation shared by all three splits (80% / 10% / 10%).
g = torch.Generator()
g.manual_seed(2025)
indices = torch.randperm(n=len(dataset_train), generator=g).tolist()

train_end = int(len(dataset_train) * 0.8)
val_start = int(len(dataset_valtest) * 0.8)
val_end = int(len(dataset_valtest) * 0.9)

train_indices = indices[:train_end]

subset_train = Subset(dataset_train, train_indices)
subset_val = Subset(dataset_valtest, indices[val_start:val_end])
subset_test = Subset(dataset_valtest, indices[val_end:])

# Sampler weights are listed in subset order, since the sampler draws
# positions within `subset_train`.
train_weights = [dataset_train.weights[idx] for idx in train_indices]
sampler_train = torch.utils.data.WeightedRandomSampler(train_weights, len(train_weights))

data_loader_train = DataLoader(
    subset_train,
    batch_size=batch_size,
    sampler=sampler_train,
    collate_fn=utils.collate_fn
)
data_loader_val = DataLoader(
    subset_val,
    batch_size=batch_size,
    collate_fn=utils.collate_fn
)
data_loader_test = DataLoader(
    subset_test,
    batch_size=batch_size,
    collate_fn=utils.collate_fn
)

# Split the validation indices by whether the record has a "label" key.
val_records = subset_val.dataset.annotations
annotated_indices = [i for i in subset_val.indices if "label" in val_records[i]]
unannotated_indices = [i for i in subset_val.indices if "label" not in val_records[i]]

subset_val_annotated = Subset(dataset_valtest, annotated_indices)
subset_val_unannotated = Subset(dataset_valtest, unannotated_indices)

data_loader_val_annotated = DataLoader(
    subset_val_annotated,
    batch_size=batch_size,
    collate_fn=utils.collate_fn
)
data_loader_val_unannotated = DataLoader(
    subset_val_unannotated,
    batch_size=batch_size,
    collate_fn=utils.collate_fn
)

100%|██████████| 5000/5000 [00:00<00:00, 3161217.97it/s]
100%|██████████| 5000/5000 [00:00<00:00, 3085408.27it/s]


In [6]:
train_label_path = os.path.join("/content/gdrive/My Drive/Grad School/Dissertation/Data", "test.json")

with open(train_label_path) as json_file:
    test = json.load(json_file)

# Keep the records between the 80% and 90% marks of the file, in file order.
# NOTE(review): the data loaders take their 80-90% slice from a shuffled
# permutation of the records, so this slice may not be the validation split -
# confirm which set these counts are meant to describe.
slice_start = int(len(test) * 0.8)
slice_stop = int(len(test) * 0.9)
test = test[slice_start:slice_stop]

# Box counts per class, plus the number of records with no "label" key.
class_counts = {"daylighting": 0, "safety_islands": 0, "curb_extensions": 0}
empty_count = 0

for item in test:
    if "label" not in item:
        empty_count += 1
        continue

    for annot in item["label"]:
        class_name = annot["rectanglelabels"][0]
        if class_name in class_counts:
            class_counts[class_name] += 1

daylighting_count = class_counts["daylighting"]
safety_islands_count = class_counts["safety_islands"]
curb_extensions_count = class_counts["curb_extensions"]

print(f"Daylighting: {daylighting_count}")
print(f"Safety Islands: {safety_islands_count}")
print(f"Curb Extensions: {curb_extensions_count}")
print(f"Empty: {empty_count}")

Daylighting: 127
Safety Islands: 53
Curb Extensions: 37
Empty: 402


# Training

In [7]:
def train_one_epoch_no_schedule(model, optimizer, data_loader, device, epoch, print_freq):
    """Run one training epoch at the optimizer's current learning rate.

    Args:
        model: Detection model that returns a dict of losses when called with
            ``(images, targets)`` in train mode.
        optimizer: Optimizer stepped once per batch.
        data_loader: Iterable yielding ``(images, targets)`` batches.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, used only in the log header.
        print_freq: Logging interval passed to ``MetricLogger.log_every``.

    Returns:
        The ``utils.MetricLogger`` holding the running loss and lr meters.

    Exits the process via ``sys.exit(1)`` if the summed loss is not finite.
    """
    model.train()

    logger = utils.MetricLogger(delimiter="  ")
    logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value:.6f}"))

    for images, targets in logger.log_every(data_loader, print_freq, f"Epoch: [{epoch}]"):
        images = [image.to(device) for image in images]
        # Only tensor entries are moved; other target values (e.g. ints) stay as-is.
        targets = [
            {key: (value.to(device) if isinstance(value, torch.Tensor) else value)
             for key, value in target.items()}
            for target in targets
        ]

        # Mixed precision is explicitly disabled for the forward pass.
        with torch.amp.autocast('cuda', enabled=False):
            loss_dict = model(images, targets)
            losses = sum(loss_dict.values())

        # Reduced copies of the losses are used for logging and the finiteness check.
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print(f"Loss is {loss_value}, stopping training")
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        logger.update(loss=losses_reduced, **loss_dict_reduced)
        logger.update(lr=optimizer.param_groups[0]["lr"])

    return logger

In [8]:
def training(model, optimizer, num_epochs, lr, save_loc, starting_epoch=0,):
    """Train ``model`` for ``num_epochs`` epochs, evaluating and checkpointing each one.

    Uses the notebook-level ``data_loader_train``, ``data_loader_val`` and
    ``data_loader_val_annotated`` loaders.

    Args:
        model: Detection model to train; it is moved to CUDA when available.
        optimizer: Optimizer already bound to the model's parameters.
        num_epochs: Number of epochs to run.
        lr: Used only to build the checkpoint file name; the learning rate
            actually applied is the one configured on ``optimizer``.
        save_loc: Prefix of the checkpoint file name.
        starting_epoch: First epoch number (affects logging and file names).
    """
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    model.to(device)

    for epoch in range(starting_epoch, starting_epoch + num_epochs):
        print("EPOCH " + str(epoch))
        train_one_epoch_no_schedule(model, optimizer, data_loader_train, device, epoch, print_freq=50)

        # Evaluate the model being trained (the `model` argument), not a
        # notebook-level global that may point at a different network.
        print("VALIDATION EVALUATION:")
        evaluate(model, data_loader_val, device=device)
        print("VALIDATION ANNOTATED EVALUATION:")
        evaluate(model, data_loader_val_annotated, device=device)

        # One checkpoint per epoch: <save_loc>_<lr>_<epoch>.pt in the Drive data folder.
        save_path = save_loc + "_" + str(lr) + "_" + str(epoch) + ".pt"
        save_path = os.path.join("/content/gdrive/My Drive/Grad School/Dissertation/Data", save_path)

        checkpoint = {
            'epoch': epoch,
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),}
        torch.save(checkpoint, save_path)

    print("Training complete!")

# Experiments

## Experiment 1: Fine-tuning

In [9]:
lr = 1e-5
num_classes = 4

# Start from the pretrained detector (weights="DEFAULT").
model_tune = fasterrcnn_resnet50_fpn_v2(weights="DEFAULT")

# Replace the pre-trained box predictor with a fresh `num_classes`-way head.
in_features = model_tune.roi_heads.box_predictor.cls_score.in_features
model_tune.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

# Optimise every parameter that requires gradients.
params = [p for p in model_tune.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(params, lr=lr)

training(
    model=model_tune,
    optimizer=optimizer,
    num_epochs=30,
    lr=lr,
    save_loc="pretrain_transform_0.001",
    starting_epoch=0,
)

Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_v2_coco-dd69338a.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_v2_coco-dd69338a.pth


100%|██████████| 167M/167M [00:00<00:00, 224MB/s]


EPOCH 0
Epoch: [0]  [  0/250]  eta: 2:43:44  lr: 0.000010  loss: 2.0085 (2.0085)  loss_classifier: 1.6165 (1.6165)  loss_box_reg: 0.0519 (0.0519)  loss_objectness: 0.3217 (0.3217)  loss_rpn_box_reg: 0.0184 (0.0184)  time: 39.2964  data: 35.0275  max mem: 18863
Epoch: [0]  [ 50/250]  eta: 0:24:09  lr: 0.000010  loss: 0.4976 (0.7429)  loss_classifier: 0.2076 (0.4293)  loss_box_reg: 0.1350 (0.1044)  loss_objectness: 0.1345 (0.1867)  loss_rpn_box_reg: 0.0189 (0.0226)  time: 5.1834  data: 3.1988  max mem: 19193
Epoch: [0]  [100/250]  eta: 0:13:48  lr: 0.000010  loss: 0.4641 (0.6179)  loss_classifier: 0.1982 (0.3224)  loss_box_reg: 0.1531 (0.1316)  loss_objectness: 0.0846 (0.1427)  loss_rpn_box_reg: 0.0152 (0.0212)  time: 3.4323  data: 1.4269  max mem: 19193
Epoch: [0]  [150/250]  eta: 0:07:46  lr: 0.000010  loss: 0.4866 (0.5754)  loss_classifier: 0.2216 (0.2880)  loss_box_reg: 0.1856 (0.1473)  loss_objectness: 0.0693 (0.1200)  loss_rpn_box_reg: 0.0150 (0.0201)  time: 2.7275  data: 0.7216  m

# Evaluation

In [None]:
# Resolve the device first so the checkpoint can be mapped onto it while loading.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Rebuild the same architecture used for training (4-way box predictor) so the
# saved state dict matches.
model_tune = fasterrcnn_resnet50_fpn_v2(
    weights="DEFAULT")
num_classes = 4
in_features = model_tune.roi_heads.box_predictor.cls_score.in_features
model_tune.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

checkpoint_path = os.path.join("/content/gdrive/My Drive/Grad School/Dissertation/Data", "pretrain_transform_0.001_0.001_21.pt")
# map_location lets a checkpoint saved from a CUDA run load on a CPU-only runtime.
checkpoint = torch.load(checkpoint_path, map_location=device)

model_tune.load_state_dict(checkpoint['model'])
model_tune.to(device)

print(f"Model loaded successfully from {checkpoint_path}")

Model loaded successfully from /content/gdrive/My Drive/Grad School/Dissertation/Data/pretrain_transform_0.001_0.001_21.pt


In [None]:
class AerialImageryDatasetOneClass(torch.utils.data.Dataset):
    """Detection dataset that keeps only the ground-truth boxes of one class.

    Records are read from the same annotation format as the multi-class
    dataset, but boxes whose first ``rectanglelabels`` entry differs from
    ``label`` are dropped, so images with no box of that class yield an empty
    target.

    Args:
        image_dir_path: Directory containing the image files.
        annotation_path: Path to the JSON annotation file (a list of records).
        background_weight: Accepted for signature parity; not used by this class.
        label: Class name whose boxes are kept (a key of ``label_map``).
        train: When True, random horizontal/vertical flips are applied.

    Attributes:
        weights: One entry per record, always 1.
    """

    def __init__(self, image_dir_path, annotation_path, background_weight, label, train=False):
        self.image_dir_path = image_dir_path

        # Flips are training-only augmentation; dtype conversion and unwrapping
        # to plain tensors always run last.
        steps = []
        if train:
            steps.append(T.RandomHorizontalFlip(p=0.5))
            steps.append(T.RandomVerticalFlip(p=0.5))
        steps.append(T.ToDtype(torch.float, scale=True))
        steps.append(T.ToPureTensor())
        self.transforms = T.Compose(steps)

        self.label_map = {"daylighting": 1,
                          "safety_islands": 2,
                          "curb_extensions": 3}

        with open(annotation_path) as json_file:
            self.annotations = json.load(json_file)

        # Uniform weights: every record counts the same.
        self.weights = []

        for item in tqdm(self.annotations):
            self.weights.append(1)

        self.label = label

    @staticmethod
    def _percent_xywh_to_pixel_xyxy(regions, height, width):
        """Convert annotation rectangles from percent XYWH to pixel XYXY.

        Args:
            regions: Sequence of dicts with "x", "y", "width", "height" given
                on a 0-100 scale relative to the image (presumably a Label
                Studio export - verify against the annotation tool).
            height: Image height in pixels.
            width: Image width in pixels.

        Returns:
            A float32 tensor of shape (len(regions), 4) holding
            ``[x_min, y_min, x_max, y_max]`` in pixels.
        """
        if len(regions) == 0:
            return torch.zeros((0, 4), dtype=torch.float32)

        xywh = torch.tensor(
            [[r["x"], r["y"], r["width"], r["height"]] for r in regions],
            dtype=torch.float32)
        xyxy = xywh.clone()
        xyxy[:, 2:] += xywh[:, :2]

        # Scale x-coordinates by the width and y-coordinates by the height
        # instead of assuming a fixed 416x416 tile.
        scale = torch.tensor([width, height, width, height], dtype=torch.float32)
        return xyxy * scale / 100

    def __getitem__(self, idx):
        """Return ``(image, target)`` for record ``idx`` with only ``self.label`` boxes.

        ``target`` contains "boxes", "labels", "image_id", "area" and "iscrowd".
        """
        record = self.annotations[idx]
        # "%5C" is a URL-encoded backslash; only the final path component
        # (the file name) is kept.
        file_name = record["image"].split("%5C")[-1]

        img = decode_image(os.path.join(self.image_dir_path, file_name), mode="RGB")

        if img.dtype != torch.float:
            img = to_dtype(img, dtype=torch.float, scale=True)

        img = tv_tensors.Image(img)
        height, width = get_size(img)

        # Filter once; boxes and labels are both derived from this list.
        all_regions = record["label"] if "label" in record else []
        regions = [r for r in all_regions if r["rectanglelabels"][0] == self.label]

        if len(regions) > 0:
            pixel_boxes = self._percent_xywh_to_pixel_xyxy(regions, height, width)
            # Wrapped as BoundingBoxes so the flip transforms move the boxes
            # together with the image.
            boxes = tv_tensors.BoundingBoxes(pixel_boxes, format="XYXY", canvas_size=(height, width))
            labels = torch.full((len(regions),), self.label_map[self.label], dtype=torch.int64)
        else:
            pixel_boxes = torch.zeros((0, 4), dtype=torch.float32)
            boxes = pixel_boxes
            labels = torch.zeros((0,), dtype=torch.int64)

        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        # The id is the file stem with underscores removed, parsed as an int.
        target["image_id"] = int("".join(file_name.split(".")[0].split("_")))
        target["area"] = (pixel_boxes[:, 2] - pixel_boxes[:, 0]) * (pixel_boxes[:, 3] - pixel_boxes[:, 1])
        # Always int64, including when there are no boxes.
        target["iscrowd"] = torch.zeros((len(pixel_boxes),), dtype=torch.int64)

        img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        """Number of annotation records."""
        return len(self.annotations)

In [None]:
from tqdm import tqdm

image_path = os.path.join("/content/gdrive/My Drive/Grad School/Dissertation/Data", "test")
label_path = os.path.join("/content/gdrive/My Drive/Grad School/Dissertation/Data", "test.json")

batch_size = 16


def build_single_class_val_loader(label, seed=2025):
    """Build a validation DataLoader whose ground truth is restricted to ``label``.

    The split is the 80%-90% slice of a permutation seeded with ``seed``; the
    multi-class loaders use the same seed and bounds over the same annotation
    file.

    Args:
        label: Class name passed to ``AerialImageryDatasetOneClass``.
        seed: Seed for the permutation generator.

    Returns:
        A ``DataLoader`` over the validation subset, batched with
        ``utils.collate_fn``.
    """
    dataset = AerialImageryDatasetOneClass(
        image_dir_path=image_path,
        annotation_path=label_path,
        background_weight=0.001,
        label=label,
        train=False)

    generator = torch.Generator()
    generator.manual_seed(seed)
    order = torch.randperm(n=len(dataset), generator=generator).tolist()

    val_indices = order[int(len(dataset) * 0.8):int(len(dataset) * 0.9)]

    return DataLoader(
        Subset(dataset, val_indices),
        batch_size=batch_size,
        collate_fn=utils.collate_fn
    )


# One loader per class. Building them through the helper keeps the training
# cell's dataset/subset variables from being overwritten by this cell.
data_loader_val_curb = build_single_class_val_loader("curb_extensions")
data_loader_val_day = build_single_class_val_loader("daylighting")
data_loader_val_island = build_single_class_val_loader("safety_islands")

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

100%|██████████| 5000/5000 [00:00<00:00, 4317792.88it/s]
100%|██████████| 5000/5000 [00:00<00:00, 4184261.77it/s]
100%|██████████| 5000/5000 [00:00<00:00, 4104818.95it/s]


In [None]:
# Evaluate on the validation loader whose ground truth keeps only "curb_extensions" boxes.
evaluate(model_tune, data_loader_val_curb, device=device)

creating index...
index created!
Test:  [ 0/32]  eta: 0:01:14  model_time: 2.2217 (2.2217)  evaluator_time: 0.0066 (0.0066)  time: 2.3230  data: 0.0857  max mem: 5477
Test:  [31/32]  eta: 0:00:00  model_time: 0.8319 (0.8561)  evaluator_time: 0.0027 (0.0029)  time: 0.9164  data: 0.1039  max mem: 5486
Test: Total time: 0:00:31 (0.9690 s / it)
Averaged stats: model_time: 0.8319 (0.8561)  evaluator_time: 0.0027 (0.0029)
Accumulating evaluation results...
DONE (t=0.01s).
IoU metric: bbox
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.005
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.013
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.008
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.000
 Average Recall     (AR) @[ IoU=

<coco_eval.CocoEvaluator at 0x782c7c890170>

In [None]:
# Evaluate on the validation loader whose ground truth keeps only "daylighting" boxes.
evaluate(model_tune, data_loader_val_day, device=device)

creating index...
index created!
Test:  [ 0/32]  eta: 0:00:29  model_time: 0.8179 (0.8179)  evaluator_time: 0.0028 (0.0028)  time: 0.9130  data: 0.0834  max mem: 5486
Test:  [31/32]  eta: 0:00:00  model_time: 0.8261 (0.8045)  evaluator_time: 0.0031 (0.0032)  time: 0.8958  data: 0.0889  max mem: 5486
Test: Total time: 0:00:29 (0.9069 s / it)
Averaged stats: model_time: 0.8261 (0.8045)  evaluator_time: 0.0031 (0.0032)
Accumulating evaluation results...
DONE (t=0.01s).
IoU metric: bbox
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.036
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.069
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.032
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.002
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.058
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.000
 Average Recall     (AR) @[ IoU=

<coco_eval.CocoEvaluator at 0x782c7c8924b0>

In [None]:
# Evaluate on the validation loader whose ground truth keeps only "safety_islands" boxes.
evaluate(model_tune, data_loader_val_island, device=device)

creating index...
index created!
Test:  [ 0/32]  eta: 0:00:29  model_time: 0.8221 (0.8221)  evaluator_time: 0.0027 (0.0027)  time: 0.9205  data: 0.0866  max mem: 5486
Test:  [31/32]  eta: 0:00:00  model_time: 0.8328 (0.8099)  evaluator_time: 0.0033 (0.0033)  time: 0.9024  data: 0.0895  max mem: 5486
Test: Total time: 0:00:29 (0.9132 s / it)
Averaged stats: model_time: 0.8328 (0.8099)  evaluator_time: 0.0033 (0.0033)
Accumulating evaluation results...
DONE (t=0.01s).
IoU metric: bbox
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.352
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.653
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.316
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.258
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.419
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.000
 Average Recall     (AR) @[ IoU=

<coco_eval.CocoEvaluator at 0x782c7c8aeba0>