In [1]:
import torch, math
import torch.nn as nn
import torch.nn.functional as F
from torchvision.transforms import v2

# Hugging Face
from transformers import AutoImageProcessor, AutoModel
from datasets import load_dataset, Image
from transformers.image_utils import load_image
from huggingface_hub import notebook_login

#notebook_login()

In [2]:
import numpy as np

# Open the glacier training archive. The printed summary below lists its
# keys; individual arrays are pulled out by key.
train = np.load("data/glacier_train.npz")

# Show the shape of the X_train array next to the archive's own summary.
sh = np.shape(train["X_train"])
print(sh, train)

(3119, 6, 32, 32) NpzFile 'data/glacier_train.npz' with keys: X_train, y_train, means, stds


In [3]:
# Load the "train" split of the INRIA Aerial Image Labeling dataset.
dataset = load_dataset(
    path="blanchon/INRIA-Aerial-Image-Labeling",
    split="train",
)

Resolving data files:   0%|          | 0/360 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/180 [00:00<?, ?it/s]

In [4]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class TempDataset(Dataset):
    """Tiled image/mask samples cut from a single image dataset.

    The wrapped ``dataset`` stores images and masks in the same ``"image"``
    column: row ``i`` is used as the mask for the image in row
    ``i + mask_offset``. Each pair is split into a grid of non-overlapping
    ``tile`` x ``tile`` crops (partial tiles at the right/bottom edges are
    dropped), and every crop is one sample.

    Args:
        dataset: Row-indexable dataset; ``dataset[i]["image"]`` must return an
            image object exposing ``.size`` (width, height) and ``.crop(box)``.
        processor: Callable invoked as ``processor(images=..., return_tensors="pt")``
            that returns a mapping with a ``"pixel_values"`` tensor.
        tile: Side length, in pixels, of each square crop.
        mask_offset: Distance (in rows) between a mask and its image.

    Note:
        The tile grid is computed from the size of row 0 only; all rows are
        assumed to share that size.
    """

    def __init__(self, dataset, processor, tile=224, mask_offset=180):
        self.dataset = dataset
        self.processor = processor
        self.mask_offset = mask_offset
        self.tile = tile

        # Index the row first and the column second: ``dataset["image"][0]``
        # would go through the entire image column just to read one size.
        w, h = dataset[0]["image"].size

        self.cols = int(w // int(tile))
        self.rows = int(h // int(tile))
        self.tiles_per_image = self.cols * self.rows
        self.num_images = len(dataset) - mask_offset  # paired count

    def __len__(self):
        """Return the total number of tiles across all image/mask pairs."""
        return self.num_images * self.tiles_per_image

    def __getitem__(self, idx):
        """Return the ``idx``-th tile as ``{"image": tensor, "mask": long tensor}``.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        total = self.num_images * self.tiles_per_image
        if idx < 0:
            idx += total
        if not 0 <= idx < total:
            raise IndexError(f"index {idx} out of range for {total} tiles")

        # Split the flat index into (which pair, which tile inside the pair).
        img_idx = idx // self.tiles_per_image + self.mask_offset  # image block
        patch_idx = idx % self.tiles_per_image
        row = patch_idx // self.cols
        col = patch_idx % self.cols
        # (left, upper, right, lower) crop box of the selected tile.
        box = (
            col * self.tile,
            row * self.tile,
            (col + 1) * self.tile,
            (row + 1) * self.tile,
        )

        # Row-first access loads only the two rows this sample needs.
        image = self.dataset[img_idx]["image"].crop(box)
        mask = self.dataset[img_idx - self.mask_offset]["image"].crop(box)

        image = self.processor(images=image, return_tensors="pt")["pixel_values"].squeeze(0)
        mask = np.array(mask)
        if mask.ndim == 3:
            # Multi-channel mask: keep the first channel to make it 2D.
            mask = mask[:, :, 0]

        # Integer-divide by 255 so a 0/255 mask becomes 0/1 class indices.
        mask = torch.tensor(mask // 255).long()
        return {"image": image, "mask": mask}
        

In [5]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2
import torch
# Clear PyTorch's CUDA cache (e.g. memory still cached from earlier runs in
# this kernel) before the model is created.
torch.cuda.empty_cache()

In [None]:
# tqdm.auto picks the notebook widget when available and falls back to the
# plain text bar otherwise.
from tqdm.auto import tqdm
from models.DinoV3.SemanDino import GlacierSegmenter

pretrained_model_name = "facebook/dinov3-vitl16-pretrain-sat493m"
processor = AutoImageProcessor.from_pretrained(pretrained_model_name)

# One device object for the whole cell instead of repeating the "cuda" literal;
# falls back to CPU so the cell still runs on a machine without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Presumably 2 = number of output classes (masks hold 0/1 labels) -- confirm
# against GlacierSegmenter's signature.
model = GlacierSegmenter(2)
model = model.to(device)

dataset2 = TempDataset(dataset, processor)

# Optimize for faster loading
data_loader = DataLoader(dataset2,
                         batch_size=8,
                         shuffle=True,
                         num_workers=2,
                         # Pinned host memory only helps host-to-GPU copies.
                         pin_memory=(device.type == "cuda"),
                         drop_last=True)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# training loop (a single pass over the data)
# Training mode is loop-invariant, so set it once up front. This assumes the
# model's forward pass does not switch the module back to eval mode.
model.train()
pb = tqdm(data_loader, total=len(data_loader))
for batch in pb:
    # non_blocking=True lets the copy from pinned memory overlap with compute.
    images = batch["image"].to(device, non_blocking=True)
    masks = batch["mask"].to(device, non_blocking=True)

    outputs = model(images)

    loss = F.cross_entropy(outputs, masks)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    pb.set_description(f"Loss: {loss.item():.4f}")
    



  0%|          | 0/10890 [00:00<?, ?it/s]