# Imports

In [1]:
import sys
from copy import deepcopy
import cv2
import wandb
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchmetrics import JaccardIndex
from peft import LoraConfig, get_peft_model, TaskType
from tqdm import tqdm
import albumentations as A
from hydra import initialize, compose

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Make the project root (the parent of the current working directory)
# importable so that the project-local packages imported below resolve.
cur_dir = Path.cwd()
project_dir = cur_dir.parent
# Only append when missing: re-running this cell would otherwise stack a
# duplicate entry onto sys.path on every execution.
if str(project_dir) not in sys.path:
    sys.path.append(str(project_dir))

from models.tools import CombinedLoss
from models.DinoFPNbn import DinoFPN
from src.validate import evaluate_model
from src.train import train_and_validate
from data.mastr1325.dataset import MaritimeDataset
from utils.others import save_checkpoint, load_checkpoint, get_memory_footprint
from data.kitti360.labels_kitti360 import trainId2label, NUM_CLASSES
from utils.visualization import plot_image_and_masks
from utils.others import save_checkpoint, load_checkpoint

Using device: cuda
Using device: cuda


In [3]:
# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [4]:
# Compose the `lora_config` Hydra configuration from ../configs. The resulting
# `cfg` supplies the augmentation, dataset, training, model and LoRA settings
# read by the cells below.
# The config path is a plain string literal: it has no placeholders, so an
# f-string prefix is unnecessary.
with initialize(version_base=None, config_path="../configs", job_name="train_and_log"):
    cfg = compose(config_name="lora_config")

# Dataloaders

In [5]:
# Training-time augmentation pipeline. The transforms are declared in four
# thematic groups and then chained, in the order listed, into one Compose.
crop_size = (cfg.augmentation.crop_height, cfg.augmentation.crop_width)

# -- Geometric --
geometric_augs = [
    # Always crop to the configured size first (preserves scale/context).
    A.RandomCrop(height=crop_size[0], width=crop_size[1], p=1.0),
    # Scale between 0.8x and 1.0x and roll by up to +/-3 degrees. Exposed
    # borders are filled with the constant 255 in both image and mask
    # (presumably the ignore label for the mask -- confirm against the loss).
    # A +/-5% translate_percent shift is currently disabled (not passed).
    A.Affine(
        scale=(0.8, 1.0),
        rotate=(-3, 3),
        interpolation=cv2.INTER_LINEAR,
        mask_interpolation=cv2.INTER_NEAREST,
        border_mode=cv2.BORDER_CONSTANT,
        fill=255,
        fill_mask=255,
        p=0.7,
    ),
    # Tiny camera-viewpoint warp.
    A.Perspective(scale=(0.01, 0.03), p=0.5),
]

# -- Photometric --
photometric_augs = [
    A.RandomBrightnessContrast(brightness_limit=0.15, contrast_limit=0.15, p=0.5),
    A.RandomGamma(gamma_limit=(90, 110), p=0.5),
    # At most one weather/lighting effect per sample.
    A.OneOf(
        [
            A.RandomFog(fog_coef_range=(0.05, 0.2), p=1.0),
            A.RandomShadow(shadow_roi=(0, 0.5, 1, 1), num_shadows_limit=(1, 2), p=1.0),
            A.RandomSunFlare(src_radius=50, p=1.0),
        ],
        p=0.5,
    ),
]

# -- Occlusions --
occlusion_augs = [
    # Random occlusion: 1-4 holes, each 5-30 px high and wide.
    A.CoarseDropout(
        num_holes_range=(1, 4),
        hole_height_range=(5, 30),
        hole_width_range=(5, 30),
        p=0.5,
    ),
]

# -- Blur & noise: motion, sensor, compression --
degradation_augs = [
    # At most one blur flavour per sample.
    A.OneOf(
        [
            A.MotionBlur(blur_limit=5, p=0.4),
            A.GaussianBlur(blur_limit=(3, 5), p=0.3),
            A.MedianBlur(blur_limit=3, p=0.2),
        ],
        p=0.5,
    ),
    # Zero-mean Gaussian noise with std between 10/255 and 50/255.
    A.GaussNoise(
        std_range=(10.0/255.0, 50.0/255.0),
        mean_range=(0.0, 0.0),
        p=0.5,
    ),
]

train_transform = A.Compose([
    *geometric_augs,
    *photometric_augs,
    *occlusion_augs,
    *degradation_augs,
])

# Validation pipeline: a single deterministic center crop to the configured
# crop size, with no random augmentation.
val_crop_h, val_crop_w = crop_size
val_transform = A.Compose([
    A.CenterCrop(height=val_crop_h, width=val_crop_w),
])

In [6]:
# Datasets and DataLoaders for the train / validation splits.
# Settings shared by both loaders are gathered once so they cannot drift apart.
loader_kwargs = dict(
    batch_size=cfg.train.batch_size,
    num_workers=cfg.dataset.num_workers,
    pin_memory=True,
)

# Training split: random augmentations, reshuffled by the loader.
train_dataset = MaritimeDataset(cfg.dataset.root, train=True, transform=train_transform)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)

# Validation split: deterministic transform, fixed order.
val_dataset = MaritimeDataset(cfg.dataset.root, train=False, transform=val_transform)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")

Train dataset size: 1060
Validation dataset size: 265


# Base model

In [7]:
# Instantiate DinoFPN from the configured class count and the model
# sub-config, then report its parameter / memory footprint. The helper's
# return value is left as the last expression so it is shown as cell output.
model = DinoFPN(cfg.dataset.num_classes, cfg.model)
get_memory_footprint(model, detailed=True)

=== Model Memory Footprint ===
Backbone: 86,580,480 params, 330.28 MB
Head:     3,740,164 params, 14.27 MB
Total:    90,320,644 params, 344.55 MB


(90320644, 361282576)

In [8]:
# model

# LoRA

In [9]:
# Auto-detect the LoRA targets: every Linear / Conv2d layer whose qualified
# module name contains one of the keywords below (plain substring match).
LORA_NAME_KEYWORDS = ('head', 'attention', 'query', 'key', 'value')
target_modules = [
    name
    for name, module in model.named_modules()
    if isinstance(module, (nn.Linear, nn.Conv2d))
    and any(keyword in name for keyword in LORA_NAME_KEYWORDS)
]

# LoRA hyper-parameters (rank, alpha, dropout) come from the Hydra config.
lora_config = LoraConfig(
    # NOTE(review): this task type wraps the model in
    # PeftModelForFeatureExtraction (see the repr printed further down);
    # confirm that wrapper's forward signature suits DinoFPN's inputs.
    task_type=TaskType.FEATURE_EXTRACTION,
    r=cfg.lora.rank,
    lora_alpha=cfg.lora.alpha,
    lora_dropout=cfg.lora.dropout,
    target_modules=target_modules,
    bias="none",               # Don't adapt bias
    use_rslora=False,          # Use standard LoRA
)

# Apply LoRA to the model.
# NOTE(review): PEFT presumably injects the adapters into `model` in place, so
# `model` and `lora_model` would share modules -- confirm before reusing
# `model` as an un-adapted baseline.
lora_model = get_peft_model(model, lora_config)

# Print trainable vs. total parameter counts
lora_model.print_trainable_parameters()

trainable params: 720,928 || all params: 91,041,572 || trainable%: 0.7919


In [10]:
# Re-measure the footprint on the LoRA-wrapped model so it can be compared
# with the base-model numbers reported earlier; the helper's return value is
# displayed as the cell output.
get_memory_footprint(lora_model, detailed=True)

=== Model Memory Footprint ===
Backbone: 87,170,304 params, 332.53 MB
Head:     3,871,268 params, 14.77 MB
Total:    91,041,572 params, 347.30 MB


(91041572, 364166288)

In [11]:
# Display the module tree of the wrapped model to inspect which layers were
# replaced by LoRA layers.
lora_model

PeftModelForFeatureExtraction(
  (base_model): LoraModel(
    (model): DinoFPN(
      (backbone): Dinov2Model(
        (embeddings): Dinov2Embeddings(
          (patch_embeddings): Dinov2PatchEmbeddings(
            (projection): Conv2d(3, 768, kernel_size=(14, 14), stride=(14, 14))
          )
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (encoder): Dinov2Encoder(
          (layer): ModuleList(
            (0-11): 12 x Dinov2Layer(
              (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
              (attention): Dinov2Attention(
                (attention): Dinov2SelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_featur