In [None]:
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms

In [2]:
import sys
from pathlib import Path

# Make the project root (the parent of the directory the notebook runs in)
# importable, so the later `from data...`, `from models...` and `from utils...`
# imports can resolve (presumably those packages live there — confirm).
notebook_dir = Path().resolve()
project_root  = notebook_dir.parent
# Guard the append so that re-running this cell does not stack duplicate
# entries on sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [3]:
# Prefer CUDA when PyTorch can see a GPU, otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


In [4]:
# Hyperparameters
# NOTE(review): the recorded run of the training cell ended in a CUDA
# out-of-memory error on a GPU reporting ~3.7 GB total memory, using this
# batch size with 375x1242 inputs — consider lowering BATCH_SIZE there.
BATCH_SIZE = 4          # batch_size handed to both DataLoaders
LEARNING_RATE = 1e-4    # lr handed to optim.Adam
NUM_EPOCHS = 20         # number of passes made by the training loop

In [5]:
from data.dataset import KittiSemSegDataset

# Target size handed to the dataset, and where the KITTI data lives on disk.
# NOTE(review): dataset_root is an absolute, machine-specific path; it has to
# be edited before the notebook can run anywhere else.
image_size = (375, 1242)
dataset_root = '/home/panos/Documents/data/kitti/data_semantics/training'

# Options common to both loaders; only the shuffling differs between them.
loader_options = dict(batch_size=BATCH_SIZE, num_workers=4, pin_memory=True)

# Training split, reshuffled on every pass.
train_dataset = KittiSemSegDataset(dataset_root, train=True, target_size=image_size)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)

# Validation split, iterated in a fixed order.
# NOTE(review): this is built with exactly the same arguments as train_dataset
# (train=True), so validation presumably sees the training samples — confirm
# whether KittiSemSegDataset offers a held-out split (e.g. train=False).
val_dataset = KittiSemSegDataset(dataset_root, train=True, target_size=image_size)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

Using device: cuda


In [6]:
from models.DinoSeg import DinoSeg

# Number of labels; passed to DinoSeg as `num_labels` and reused by the
# mIoU metric further down.
NUM_CLASSES = 35

# Build the network, then move it onto the selected device.
model = DinoSeg(num_labels=NUM_CLASSES)
model = model.to(device)

# Cross-entropy objective, optimised with Adam over all model parameters.
criterion = nn.CrossEntropyLoss()
trainable_params = model.parameters()
optimizer = optim.Adam(trainable_params, lr=LEARNING_RATE)

  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [7]:
from utils.others import print_model_and_gpu_stats
# Report the model's parameter footprint next to the CUDA memory figures
# (total / allocated / reserved / free) before any batch is pushed through.
print_model_and_gpu_stats(model, device)

Model parameters: 84.19 MB
CUDA total memory   : 3716.69 MB
CUDA allocated      :   84.19 MB
CUDA reserved       :  114.00 MB
CUDA free (est.)    : 3602.69 MB


In [8]:
# Pull a single batch from the training loader; only the images are used here
imgs, _ = next(iter(train_loader))

# Put the batch on the same device as the model and show its shape
imgs = imgs.to(device)
print(imgs.shape)

# Run the model's own process() step on the first image of the batch.
# NOTE(review): the recorded run warns here about rescaling images that are
# already rescaled — confirm whether process() should be given do_rescale=False.
first_image = imgs[0]
processed_input = model.process(first_image)
print(processed_input.shape)

It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.


torch.Size([4, 3, 375, 1242])
torch.Size([1, 3, 224, 224])


In [None]:
# Shape sanity check: run one batch through the model, upsample the logits
# back to the input resolution and take the per-pixel argmax.
#
# NOTE: in the recorded run imgs[0].shape is (3, 375, 1242), so `W` receives
# the first spatial dimension and `H` the second; under PyTorch's (C, H, W)
# convention the two names are swapped. The names are kept as-is because a
# later cell reads C, W and H, and (W, H) is exactly the (rows, cols) order
# that F.interpolate expects for `size`.
C, W, H = imgs[0].shape
original_size = (W, H)

with torch.no_grad():
    # A single forward pass is enough; every later step reuses its result
    # instead of running the network again on the same batch.
    output = model(imgs)
    print(output.shape)

    # Bring the coarse logits back to the original image size
    logits = F.interpolate(output, size=original_size, mode="bilinear", align_corners=False)
    print(logits.shape)

    # Get the predictions: most likely class index per pixel
    preds = logits.argmax(dim=1)
    print(preds[0].shape)
    print(preds[1].shape)

torch.Size([4, 35, 26, 88])
torch.Size([4, 35, 375, 1242])
torch.Size([4, 35, 375, 1242])
torch.Size([4, 375, 1242])


In [10]:
from torchinfo import summary

# Per-layer report for a single image; with batch_size=1 the numbers have to
# be multiplied by the real batch size to get per-step figures.
# NOTE: C, W and H come from the earlier shape-check cell, where they hold
# imgs[0].shape in order (so W is the first spatial dimension, H the second).
single_image_shape = (1, C, W, H)
report_columns = ("output_size", "num_params", "mult_adds")

summary(
  model,
  input_size=single_image_shape,
  device=device.type,
  col_names=report_columns,
  depth=4
)

Layer (type:depth-idx)                                       Output Shape              Param #                   Mult-Adds
DinoSeg                                                      [1, 35, 26, 88]           --                        --
├─Dinov2Model: 1-1                                           [1, 384]                  --                        --
│    └─Dinov2Embeddings: 2-1                                 [1, 2289, 384]            526,848                   --
│    │    └─Dinov2PatchEmbeddings: 3-1                       [1, 2288, 384]            --                        --
│    │    │    └─Conv2d: 4-1                                 [1, 384, 26, 88]          226,176                   517,490,688
│    │    └─Dropout: 3-2                                     [1, 2289, 384]            --                        --
│    └─Dinov2Encoder: 2-2                                    [1, 2289, 384]            --                        --
│    │    └─ModuleList: 3-3                             

In [11]:
import torch
import torch.nn.functional as F
from torchmetrics import JaccardIndex
from tqdm import tqdm

In [13]:
# Train / validate for NUM_EPOCHS epochs, tracking validation mIoU and keeping
# the weights of the best epoch in "best_model.pth".
#
# NOTE(review): the recorded run of this cell stopped with a CUDA
# out-of-memory error during the first training step; if that persists,
# lower BATCH_SIZE in the hyperparameter cell.

# Metric: mean IoU over all classes (ignore_index for void if needed)
miou_metric = JaccardIndex(
    task='multiclass',
    num_classes=NUM_CLASSES,
    average='macro',       # mean over classes
    ignore_index=None      # or your void label
).to(device)


def _full_res_logits(batch_imgs, target_size):
    """Run the model and bilinearly upsample its logits to `target_size`.

    The model returns a plain tensor of coarse, patch-resolution logits
    ([B, 35, 26, 88] in the shape-check cell), so the result must be resized
    before it can be compared with the label masks.

    Args:
        batch_imgs: image batch already on the model's device.
        target_size: (rows, cols) of the masks the logits are compared with.

    Returns:
        Logits tensor whose last two dimensions equal `target_size`.
    """
    coarse_logits = model(batch_imgs)
    return F.interpolate(coarse_logits, size=target_size, mode="bilinear", align_corners=False)


best_val_miou = 0.0

for epoch in range(1, NUM_EPOCHS+1):
    ####### TRAINING #######
    model.train()
    running_loss = 0.0

    train_bar = tqdm(train_loader, desc=f"[Epoch {epoch}/{NUM_EPOCHS}] Train")
    # Count steps ourselves: tqdm only advances `.n` after the loop body has
    # returned, so dividing by it raises ZeroDivisionError on the first step.
    for step, (imgs, masks) in enumerate(train_bar, start=1):
        imgs = imgs.to(device)
        # assumes masks is [B, H, W] holding class indices — TODO confirm
        # against KittiSemSegDataset
        masks = masks.to(device).long()

        # forward + loss, with logits resized to the mask resolution
        outputs = _full_res_logits(imgs, masks.shape[-2:])
        loss    = criterion(outputs, masks)

        # backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        train_bar.set_postfix(loss=running_loss / step)

    avg_train_loss = running_loss / len(train_loader)


    ####### VALIDATION #######
    model.eval()
    running_val_loss = 0.0
    miou_metric.reset()

    with torch.no_grad():
        val_bar = tqdm(val_loader, desc=f"[Epoch {epoch}/{NUM_EPOCHS}]  Val")
        for step, (imgs, masks) in enumerate(val_bar, start=1):
            imgs = imgs.to(device)
            masks = masks.to(device).long()

            # Same path as training: the model output is a tensor, not an
            # object with a `.logits` attribute.
            outputs = _full_res_logits(imgs, masks.shape[-2:])
            loss    = criterion(outputs, masks)
            running_val_loss += loss.item()

            # compute IoU on this batch
            preds = torch.argmax(outputs, dim=1)  # [B, H, W]
            miou_metric.update(preds, masks)

            val_bar.set_postfix(val_loss=running_val_loss / step)

    avg_val_loss = running_val_loss / len(val_loader)
    avg_val_miou = miou_metric.compute().item()


    ####### LOG & CHECKPOINT #######
    print(
        f"Epoch {epoch:02d} | "
        f"Train Loss: {avg_train_loss:.4f} | "
        f"Val   Loss: {avg_val_loss:.4f} | "
        f"Val  mIoU: {avg_val_miou:.4f}"
    )

    # save best
    if avg_val_miou > best_val_miou:
        best_val_miou = avg_val_miou
        torch.save(model.state_dict(), "best_model.pth")
        print(f" → New best! Model saved (mIoU={best_val_miou:.4f})")


[Epoch 1/20] Train:   0%|          | 0/40 [00:00<?, ?it/s]

[Epoch 1/20] Train:   0%|          | 0/40 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 54.00 MiB. GPU 0 has a total capacity of 3.63 GiB of which 8.00 MiB is free. Including non-PyTorch memory, this process has 3.61 GiB memory in use. Of the allocated memory 3.52 GiB is allocated by PyTorch, and 22.27 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)