## Google Drive mount

In [None]:

# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Requirements

In [None]:
# Install the notebook dependencies: PyTorch wheels from the cu121 index, then the
# Hugging Face / PEFT packages and tqdm.
# NOTE(review): transformers and accelerate each appear in two of the commands below.
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install -q transformers pillow
!pip install -q --upgrade transformers accelerate
!pip install -q tqdm
!pip install -q peft accelerate


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m124.1 MB/s[0m eta [36m0:00:00[0m
[?25h

# huggingface token login

In [None]:
# Log in to the Hugging Face Hub (see the captured output below about the HF_TOKEN secret).
from huggingface_hub import login
login(new_session=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## `preprocessing.py`

In [None]:

import json
import numpy as np

def load_json(json_path:str)->dict:
    """Parse the UTF-8 encoded JSON file at ``json_path`` and return its content."""
    with open(json_path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

def extract_bbox(json:dict)->tuple[list,list]:
    """
    Collect normalized boxes and heights from one annotation dict.

    Only the first value of ``json`` is read. It must contain
    ``file_attributes['img_height']`` / ``['img_width']`` and may contain a
    ``regions`` list. Each region needs ``shape_attributes`` and
    ``region_attributes``.

    Supported shapes:
        * ``all_points_x`` / ``all_points_y``: the first two vertices are taken
          as the two reference points. Regions with fewer than two vertices
          are skipped.
        * ``x`` / ``y`` / ``width`` / ``height`` (VIA rectangle): the top-left
          and bottom-right corners are taken as the two reference points.
        * Any other shape is skipped.

    returns:
        (bboxes, heights)
        bboxes: one 8-element list per kept region,
            [xmin, ymin, xmax, ymax, x1, y1, x2, y2]. The first four values are
            the padded rectangle from ``get_enclosing_rect``; the last four are
            the two reference points divided by the image width / height.
        heights: ``float(region_attributes['chi_height_m'])`` per kept region.
    """
    data = list(json.values())[0]
    regions = data.get('regions', [])
    bboxes = []
    heights = []

    img_h = float(data["file_attributes"]["img_height"])
    img_w = float(data["file_attributes"]["img_width"])

    for r in regions:
        ra = r['region_attributes']
        sa = r['shape_attributes']

        if 'all_points_x' in sa and 'all_points_y' in sa:
            xs = sa['all_points_x']
            ys = sa['all_points_y']
            if len(xs) < 2 or len(ys) < 2:
                # Not enough vertices to form a point pair: treat as unusable.
                continue
            # NOTE(review): only the first two vertices are read, so this is a
            # two-point line rather than the bounding box of a full polygon --
            # confirm the labels are always two-point lines.
            x1, x2 = xs[0], xs[1]
            y1, y2 = ys[0], ys[1]
        elif all(k in sa for k in ('x','y','width','height')):
            # VIA rectangle format
            x1, y1 = sa['x'], sa['y']
            x2 = sa['x'] + sa['width']
            y2 = sa['y'] + sa['height']
        else:
            # Unknown shape format: skip the region
            continue

        # Parse the height before appending anything so both lists stay aligned.
        h = float(ra['chi_height_m'])

        # Normalize by the image size
        nx1, ny1 = x1 / img_w, y1 / img_h
        nx2, ny2 = x2 / img_w, y2 / img_h

        # Padded enclosing rectangle (padding is get_enclosing_rect's default w),
        # followed by the two normalized reference points.
        bbox = get_enclosing_rect([nx1, ny1, nx2, ny2])
        bbox.extend([nx1, ny1, nx2, ny2])

        bboxes.append(bbox)
        heights.append(h)

    return bboxes, heights






def get_enclosing_rect(pair, w=60 / 512):
    """
    Return the axis-aligned rectangle around two points, padded and clamped.

    Args:
        pair: (x1, y1, x2, y2), the two points.
        w: total padding; half of it is added on every side.

    Returns:
        [xmin, ymin, xmax, ymax], with the lower bounds clamped to 0 and the
        upper bounds clamped to 1.
    """
    pad = w / 2
    x1, y1, x2, y2 = pair

    left = max(min(x1, x2) - pad, 0)
    top = max(min(y1, y2) - pad, 0)
    right = min(max(x1, x2) + pad, 1)
    bottom = min(max(y1, y2) + pad, 1)

    return [left, top, right, bottom]
import numpy as np

def crop(image: np.ndarray, bbox: list) -> np.ndarray:
    """
    Return an array shaped like ``image`` that is zero outside ``bbox``.

    Args:
        image: array whose first two axes are (height, width).
        bbox: normalized [xmin, ymin, xmax, ymax, ...]; only the first four
            entries are read. They are scaled by the image size, truncated to
            integers and clamped to the image bounds.

    Returns:
        A new array with the pixels inside the box copied from ``image`` and
        every other pixel set to zero. If the clamped box is empty the result
        is all zeros.
    """
    masked_image = np.zeros_like(image)

    h, w = image.shape[:2]
    xmin = max(0, int(bbox[0] * w))
    ymin = max(0, int(bbox[1] * h))
    xmax = min(w, int(bbox[2] * w))
    ymax = min(h, int(bbox[3] * h))

    if xmin < xmax and ymin < ymax:
        masked_image[ymin:ymax, xmin:xmax] = image[ymin:ymax, xmin:xmax]

    return masked_image

## Augmentation

In [None]:

import albumentations as A
from albumentations.pytorch import ToTensorV2
import numpy as np
from numpy.random import rand

class Transformer():
    """
    Random augmentation that keeps an 8-value box in sync with the image.

    ``bbox`` is [xmin, ymin, xmax, ymax, x1, y1, x2, y2] in normalized
    coordinates: an axis-aligned rectangle followed by two points.
    """

    def __init__(self):
        pass

    def __call__(self,image:np.ndarray,bbox:list ):
        """Apply horizontal flip, vertical flip and the photometric step in that order."""
        image,bbox = self.hflip(image,bbox,p=0.5)
        image,bbox = self.vflip(image,bbox,p=0.5)
        image,bbox = self.gaussian_noise(image,bbox,p=0.4)

        return image,bbox

    @staticmethod
    def _vflip_bbox(bbox):
        """Mirror the box about the horizontal axis (y -> 1 - y)."""
        return [
            # Rectangle: ymin/ymax swap so that ymin <= ymax still holds.
            bbox[0], 1 - bbox[3], bbox[2], 1 - bbox[1],
            # Points: each point keeps its own x and gets its own mirrored y.
            bbox[4], 1 - bbox[5], bbox[6], 1 - bbox[7],
        ]

    @staticmethod
    def _hflip_bbox(bbox):
        """Mirror the box about the vertical axis (x -> 1 - x)."""
        return [
            # Rectangle: xmin/xmax swap so that xmin <= xmax still holds.
            1 - bbox[2], bbox[1], 1 - bbox[0], bbox[3],
            # Points: each point keeps its own y and gets its own mirrored x.
            1 - bbox[4], bbox[5], 1 - bbox[6], bbox[7],
        ]

    def vflip(self,image,bbox,p):
        """With probability ``p`` flip the image vertically and mirror the box."""
        if(rand() < p):
            image = A.VerticalFlip(p=1)(image=image)['image']
            bbox = self._vflip_bbox(bbox)
        return image,bbox

    def hflip(self,image,bbox,p):
        """With probability ``p`` flip the image horizontally and mirror the box."""
        if(rand() < p):
            image = A.HorizontalFlip(p=1)(image=image)['image']
            bbox = self._hflip_bbox(bbox)
        return image,bbox

    def gaussian_noise(self,image,bbox,p):
        """
        With probability ``p`` run albumentations' RandomBrightnessContrast.

        Despite the method name no Gaussian noise is added. The transform is
        itself constructed with ``p=0.5``, in addition to the outer ``rand() < p``
        gate. The box is returned unchanged.
        """
        if(rand()<p):
            image= A.RandomBrightnessContrast(brightness_limit=0.3, contrast_limit=0.2, p=0.5)(image=image)['image']
        return image,bbox


# dataset

In [None]:

import torch
from torch.utils.data import Dataset
from PIL import Image
from pathlib import Path
import cv2
class ChimneyDataset(Dataset):
    """
    One sample per annotated region: (pixel_values, bbox, height).

    Every ``*.json`` file in ``label_dir`` is paired with the ``.jpg`` of the
    same stem in ``img_dir``; label files without a matching image are
    skipped. Each (bbox, height) pair returned by ``extract_bbox`` becomes its
    own sample, so one image can contribute several samples.
    """

    def __init__(self, img_dir, label_dir, processor, transform=None):
        """
        Args:
            img_dir: directory containing the ``.jpg`` images.
            label_dir: directory containing the ``.json`` annotation files.
            processor: callable used as ``processor(images=img, return_tensors="pt")``;
                its result must provide ``"pixel_values"``.
            transform: optional callable ``(img, bbox) -> (img, bbox)``.
        """
        self.img_dir = Path(img_dir)
        self.label_dir = Path(label_dir)
        self.processor = processor
        self.transform = transform

        self.samples = []
        json_files = list(self.label_dir.glob("*.json"))
        for i, json_path in enumerate(json_files):
            img_path = self.img_dir / json_path.with_suffix(".jpg").name
            if not img_path.exists():
                continue
            print(f"\rprocessing {img_path.name}, {i + 1}/{len(json_files)}", end="")
            json_data = load_json(json_path)
            bboxs, hs = extract_bbox(json_data)
            for bbox, h in zip(bboxs, hs):
                self.samples.append(
                    {
                        "img_path": img_path,
                        "bbox": bbox,
                        "height": h,
                    }
                )
        print('dataset:',len(self.samples))

        # A real newline ends the "\r" progress line; the previous "\\n"
        # printed the two characters backslash and n.
        print(f"\nLoaded {len(self.samples)} chimney samples from {img_dir}")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Load the image, apply the optional transform and return tensors."""
        sample = self.samples[idx]

        height = sample['height']
        bbox = sample['bbox']

        # Pass a plain string so the call does not depend on OpenCV accepting Path objects.
        img = cv2.imread(str(sample['img_path']))
        if img is None:
            # imread signals failure by returning None; fail here with the path
            # instead of with an opaque error inside cvtColor.
            raise FileNotFoundError(f"could not read image: {sample['img_path']}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        if self.transform:
            img, bbox = self.transform(img, bbox)

        inputs = self.processor(images=img, return_tensors="pt")
        pixel_values = inputs["pixel_values"].squeeze(0)
        bbox = torch.tensor(bbox, dtype=torch.float32)
        target = torch.tensor(height, dtype=torch.float32)
        return pixel_values, bbox, target




## model.py

In [None]:
import torch
import torch.nn as nn
from transformers import AutoImageProcessor, AutoModel
from numpy.random import rand
from peft import LoraConfig, get_peft_model
class ChimneyRegressor(nn.Module):
    """
    Height regressor: LoRA-adapted backbone fused with an MLP embedding of the box.

    The backbone's ``pooler_output`` is combined with the embedded 8-value box
    according to ``fusion_type``:

        * ``"baseline"``: concatenate and regress with a plain MLP head.
        * ``"baseline_crop"``: as baseline but with BatchNorm in the head; in
          training mode the whole batch is cropped around the boxes with
          probability ``crop_p``.
        * ``"bbox_gate"``: project both features to 512 dims and add the box
          projection scaled by a sigmoid gate computed from the box embedding.
        * ``"film"``: the ``gamma`` / ``beta`` layers and a head are created,
          but ``forward`` has no implementation for it and raises.
    """

    SUPPORTED_FUSION_TYPES = ("baseline", "baseline_crop", "bbox_gate", "film")

    def __init__(
        self,
        dinov3_name="facebook/dinov3-vitl16-pretrain-sat493m",
        metadata_dim=8, # bbox + all points x,y
        fusion_type="baseline_crop",
        crop_p = 0.25,
        is_train = True
    ):
        """
        Args:
            dinov3_name: checkpoint name passed to ``AutoImageProcessor`` / ``AutoModel``.
            metadata_dim: input size of the ``film`` gamma / beta layers.
            fusion_type: one of ``SUPPORTED_FUSION_TYPES``.
            crop_p: probability of cropping a training batch (``baseline_crop`` only).
            is_train: initial value of the flag that enables the random crop.

        Raises:
            ValueError: if ``fusion_type`` is not supported. Previously such a
                model was built without a head.
        """
        super().__init__()
        if fusion_type not in self.SUPPORTED_FUSION_TYPES:
            raise ValueError(
                f"unknown fusion_type {fusion_type!r}; "
                f"expected one of {self.SUPPORTED_FUSION_TYPES}"
            )
        print(fusion_type)
        # Gate output is rescaled into [gate_eps, 1]; gate_temp divides the gate logits.
        self.gate_eps = 0.2
        self.gate_temp = 1.0
        self.crop_p = crop_p
        self.is_train = is_train
        if is_train:
            print('train mode')
        else:
            print('val mode')
        config = LoraConfig(
            r = 32,
            lora_alpha = 64,
            target_modules=["q_proj","k_proj","v_proj","o_proj"],
            lora_dropout = 0.05,
            bias = "none",
            task_type="FEATURE_EXTRACTION"
        )
        self.processor = AutoImageProcessor.from_pretrained(dinov3_name)
        self.backbone = AutoModel.from_pretrained(dinov3_name)
        self.backbone = get_peft_model(self.backbone, config)

        bbox_input_dim = 8
        bbox_embedded_dim = 512
        self.bbox_encoder = nn.Sequential(
            nn.Linear(bbox_input_dim,32),
            nn.ReLU(),
            nn.Linear(32,bbox_embedded_dim)
        )

        hidden_dim = self.backbone.config.hidden_size
        self.fusion_type = fusion_type

        if fusion_type == "baseline":
            self.head = nn.Sequential(
                nn.Linear((hidden_dim + bbox_embedded_dim), 512),
                nn.ReLU(),
                nn.Linear(512, 256),
                nn.ReLU(),
                nn.Linear(256, 1),
            )
        elif fusion_type == "baseline_crop":
            self.head = nn.Sequential(
                nn.Linear(hidden_dim  + bbox_embedded_dim, 512),
                nn.BatchNorm1d(512),
                nn.ReLU(),
                nn.Linear(512, 256),
                nn.BatchNorm1d(256),
                nn.ReLU(),
                nn.Linear(256, 64),
                nn.BatchNorm1d(64),
                nn.ReLU(),
                nn.Linear(64, 1),
            )
        elif fusion_type == "bbox_gate":
            self.img_proj  = nn.Sequential(nn.LayerNorm(hidden_dim),        nn.Linear(hidden_dim, 512))
            self.bbox_proj = nn.Sequential(nn.LayerNorm(bbox_embedded_dim), nn.Linear(bbox_embedded_dim, 512))
            self.gate = nn.Linear(bbox_embedded_dim, 512)
            self.head = nn.Sequential(
                nn.Linear(512,256),
                nn.ReLU(),
                nn.Linear(256,128),
                nn.ReLU(),
                nn.Linear(128, 1),
            )
        elif fusion_type == "film":
            self.gamma = nn.Linear(metadata_dim, hidden_dim)
            self.beta = nn.Linear(metadata_dim, hidden_dim)
            self.head = nn.Sequential(
                nn.Linear(hidden_dim, 512),
                nn.ReLU(),
                nn.Linear(512, 256),
                nn.ReLU(),
                nn.Linear(256, 1),
            )

    def set_is_train(self,is_train):
        """Set the flag that enables the random crop of ``baseline_crop``."""
        self.is_train = is_train

    def forward(self, pixel_values, bbox):
        """
        Predict one value per sample.

        Args:
            pixel_values: image batch passed to the backbone as ``pixel_values``.
            bbox: box batch fed to ``bbox_encoder`` (whose input size is 8).

        Returns:
            The head output with its last dimension squeezed.

        Raises:
            NotImplementedError: for ``fusion_type == "film"`` (no forward path exists).
            ValueError: for any other unsupported ``fusion_type``.
        """
        ft = self.fusion_type

        # Fail loudly instead of silently returning None.
        if ft == "film":
            raise NotImplementedError(
                "fusion_type 'film' creates gamma/beta layers but has no forward implementation"
            )
        if ft not in ("baseline", "baseline_crop", "bbox_gate"):
            raise ValueError(f"unknown fusion_type {ft!r}")

        if ft == "baseline_crop" and self.is_train and (rand() < self.crop_p):
            # One random draw per batch: either every sample is cropped or none is.
            pixel_values, bbox = self.crop_and_adjust_bbox(pixel_values, bbox)

        outputs = self.backbone(pixel_values=pixel_values)
        cls_token = outputs.pooler_output
        bbox_feature = self.bbox_encoder(bbox)

        if ft == "bbox_gate":
            h_img = self.img_proj(cls_token)
            h_bbox = self.bbox_proj(bbox_feature)
            g = torch.sigmoid(self.gate(bbox_feature) / self.gate_temp)
            # Keep the gate in [gate_eps, 1] so the box branch is never fully closed.
            g = self.gate_eps + (1.0 - self.gate_eps) * g
            h = h_img + g * h_bbox
            preds = self.head(h)
            return preds.squeeze(-1)

        # "baseline" and "baseline_crop": concatenate both features.
        combined = torch.cat([cls_token, bbox_feature], dim=1)
        preds = self.head(combined)
        return preds.squeeze(-1)

    def crop_and_adjust_bbox(
        self,
        imgs: torch.Tensor,
        bbox:torch.Tensor,
        padding_factor: float = 0.25,
        target_size: tuple = (224, 224)
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Crop a square window around each box, resize it, and remap the box.

        Args:
            imgs: image batch indexed as (batch, channel, y, x).
            bbox: per-sample normalized coordinates laid out as
                [x, y, x, y, ...]. The first four values are the rectangle
                (xmin, ymin, xmax, ymax) that defines the crop window; any
                further (x, y) pairs are remapped as well. Both the 4-value
                and the 8-value layout are accepted.
            padding_factor: the window side is the longer rectangle side
                times ``1 + padding_factor``.
            target_size: (height, width) every crop is resized to.

        Returns:
            (crops, boxes): the resized crops concatenated along the batch
            axis, and the coordinates re-normalized to the crop window with
            the same number of values per row as ``bbox``.

        Raises:
            ValueError: if a box row has fewer than four or an odd number of values.
        """
        crops = []
        new_bboxes_list = []
        img_h, img_w = imgs.shape[2:]
        # The window is shifted, never shrunk, so it must fit inside the image.
        max_side = min(img_h, img_w)

        for i in range(imgs.size(0)):
            coords = bbox[i].tolist()
            if len(coords) < 4 or len(coords) % 2:
                raise ValueError(
                    f"expected at least 4 and an even number of box values, got {len(coords)}"
                )

            # Snap every coordinate to whole pixels.
            xs_px = [int(v * img_w) for v in coords[0::2]]
            ys_px = [int(v * img_h) for v in coords[1::2]]
            orig_x1, orig_x2 = xs_px[0], xs_px[1]
            orig_y1, orig_y2 = ys_px[0], ys_px[1]

            bbox_w = orig_x2 - orig_x1
            bbox_h = orig_y2 - orig_y1
            center_x = orig_x1 + bbox_w / 2
            center_y = orig_y1 + bbox_h / 2

            side_len = max(bbox_w, bbox_h)
            padded_side_len = int(side_len * (1 + padding_factor))
            # Guard against empty crops (degenerate box) and oversized windows.
            padded_side_len = max(1, min(padded_side_len, max_side))

            crop_x1 = int(center_x - padded_side_len / 2)
            crop_y1 = int(center_y - padded_side_len / 2)
            crop_x2 = crop_x1 + padded_side_len
            crop_y2 = crop_y1 + padded_side_len

            # Shift the window back inside the image when it crosses a border.
            shift_x = 0
            if crop_x1 < 0:
                shift_x = -crop_x1
            elif crop_x2 > img_w:
                shift_x = img_w - crop_x2

            shift_y = 0
            if crop_y1 < 0:
                shift_y = -crop_y1
            elif crop_y2 > img_h:
                shift_y = img_h - crop_y2

            crop_x1 += shift_x
            crop_x2 += shift_x
            crop_y1 += shift_y
            crop_y2 += shift_y

            cropped_img = imgs[i:i+1, :, crop_y1:crop_y2, crop_x1:crop_x2]

            crop_w = crop_x2 - crop_x1
            crop_h = crop_y2 - crop_y1

            # Express every coordinate relative to the crop window.
            remapped = []
            for x_px, y_px in zip(xs_px, ys_px):
                remapped.append((x_px - crop_x1) / crop_w)
                remapped.append((y_px - crop_y1) / crop_h)
            new_bboxes_list.append(torch.tensor(remapped, device=imgs.device))

            resized_crop = nn.functional.interpolate(
                cropped_img, size=target_size, mode="bilinear", align_corners=False
            )
            crops.append(resized_crop)

        final_crops = torch.cat(crops, dim=0)
        final_bboxes = torch.stack(new_bboxes_list, dim=0)

        return final_crops, final_bboxes

class ModelWrapper:
    """Owns a ChimneyRegressor on one device and bundles mode switching, inference and checkpoint I/O."""

    def __init__(
        self,
        dinov3_name="facebook/dinov3-vitl16-pretrain-sat493m",
        device=None,
        fusion_type="baseline",
    ):
        """Build the regressor on ``device``; a falsy ``device`` selects CUDA when available, else CPU."""
        if device:
            self.device = device
        else:
            fallback = "cuda" if torch.cuda.is_available() else "cpu"
            self.device = torch.device(fallback)

        regressor = ChimneyRegressor(dinov3_name, fusion_type=fusion_type)
        self.model = regressor.to(self.device)
        self.processor = self.model.processor

    def train_mode(self):
        """Enable the model's own train flag and put the module in training mode."""
        self.model.set_is_train(True)
        self.model.train()

    def eval_mode(self):
        """Disable the model's own train flag and put the module in evaluation mode."""
        self.model.set_is_train(False)
        self.model.eval()

    def predict(self, pixel_values, metadata):
        """Switch to evaluation mode and run the model on both inputs without gradients."""
        # eval_mode() already clears the model's train flag.
        self.eval_mode()
        with torch.no_grad():
            images = pixel_values.to(self.device)
            boxes = metadata.to(self.device)
            return self.model(images, boxes)

    def save(self, path):
        """Write the model's state dict to ``path``."""
        weights = self.model.state_dict()
        torch.save(weights, path)

    def load(self, path):
        """Load a state dict from ``path`` non-strictly, mapped onto this wrapper's device."""
        weights = torch.load(path, map_location=self.device)
        self.model.load_state_dict(weights, strict=False)


## train.py

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split
from tqdm.notebook import tqdm
import math
from transformers import get_cosine_schedule_with_warmup



def train_epoch(model_wrapper, dataloader, optimizer, criterion, scheduler):
    """
    Run one optimization pass over ``dataloader``.

    The scheduler is stepped after every optimizer step, i.e. once per batch.

    Returns:
        The sum of ``loss * batch_size`` over all batches divided by
        ``len(dataloader.dataset)``.
    """
    model_wrapper.train_mode()
    device = model_wrapper.device
    weighted_loss_sum = 0.0

    batches = tqdm(dataloader, desc="Training", leave=False)
    for images, boxes, heights in batches:
        images = images.to(device)
        boxes = boxes.to(device)
        heights = heights.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model_wrapper.model(images, boxes), heights)
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        # Weight by batch size so a smaller last batch does not skew the mean.
        weighted_loss_sum += batch_loss.item() * images.size(0)

    sample_count = len(dataloader.dataset)
    return weighted_loss_sum / sample_count

def eval_epoch(model_wrapper, dataloader, criterion):
    """
    Evaluate the model over ``dataloader`` without gradient tracking.

    Returns:
        The sum of ``loss * batch_size`` over all batches divided by
        ``len(dataloader.dataset)``.
    """
    model_wrapper.eval_mode()
    device = model_wrapper.device
    weighted_loss_sum = 0.0

    batches = tqdm(dataloader, desc="Validation", leave=False)
    with torch.no_grad():
        for images, boxes, heights in batches:
            images = images.to(device)
            boxes = boxes.to(device)
            heights = heights.to(device)

            batch_loss = criterion(model_wrapper.model(images, boxes), heights)
            weighted_loss_sum += batch_loss.item() * images.size(0)

    sample_count = len(dataloader.dataset)
    return weighted_loss_sum / sample_count

def train_model(
    train_img_dir,
    train_label_dir,
    val_img_dir,
    val_label_dir,
    epochs=60,
    batch_size=8,
    lr=1e-3,
    fusion_type="baseline",
    warmup_epochs=2,
    lr_decay_factor=0.1,
    save_path=None,
):
    """
    Train a ``ModelWrapper`` and keep the checkpoint with the best validation loss.

    Args:
        train_img_dir, train_label_dir: training images / annotation files.
        val_img_dir, val_label_dir: validation images / annotation files.
        epochs: number of passes over the training set.
        batch_size: batch size of both data loaders.
        lr: accepted for interface compatibility; not read by this function
            (the two learning rates are fixed where the optimizer is built).
        fusion_type: forwarded to ``ModelWrapper``.
        warmup_epochs: number of epochs' worth of scheduler warm-up steps.
        lr_decay_factor: accepted for interface compatibility; not read by
            this function.
        save_path: where the best checkpoint is written. Defaults to
            ``/content/drive/MyDrive/<fusion_type>_best_model.pth`` so that
            runs with different fusion types no longer overwrite each other.

    Returns:
        The trained ``ModelWrapper`` in its state after the last epoch; the
        best weights are the ones saved at ``save_path``.
    """
    if save_path is None:
        save_path = f"/content/drive/MyDrive/{fusion_type}_best_model.pth"

    model_wrapper = ModelWrapper(fusion_type=fusion_type)
    transformer = Transformer()
    print("Loading training dataset...")
    train_dataset = ChimneyDataset(train_img_dir, train_label_dir, model_wrapper.processor,transform=transformer)

    print("Loading validation dataset...")
    val_dataset = ChimneyDataset(val_img_dir, val_label_dir, model_wrapper.processor)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Two parameter groups: LoRA weights injected by PEFT, and every other
    # trainable parameter. Matching only names containing "head" or
    # "bbox_encoder" left the fusion layers (img_proj, bbox_proj, gate,
    # gamma, beta) out of the optimizer, so they were never updated.
    head_params = []
    lora_params = []
    for n, p in model_wrapper.model.named_parameters():
        if not p.requires_grad:
            continue
        if "lora_" in n.lower():
            lora_params.append(p)
        else:
            head_params.append(p)

    optimizer = torch.optim.AdamW(
        [
            {"params": head_params, "lr": 1e-3},
            {"params": lora_params, "lr": 1e-4},
        ],
        weight_decay=0.01,
    )
    criterion = nn.MSELoss()

    # The scheduler is stepped once per batch, so its horizon is counted in batches.
    num_training_steps = len(train_loader) * epochs
    num_warmup_steps = len(train_loader) * warmup_epochs
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
        num_cycles=0.5,
        last_epoch=-1
    )

    best_val_loss = float("inf")

    for epoch in range(epochs):
        # The epoch functions return mean squared error; report its square root.
        train_loss = math.sqrt(train_epoch(model_wrapper, train_loader, optimizer, criterion, scheduler))
        val_loss = math.sqrt(eval_epoch(model_wrapper, val_loader, criterion))

        print(
            f"Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}"
        )

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            model_wrapper.save(save_path)

    return model_wrapper

## data dir load


In [None]:
# Create the local dataset folders, then unpack the label and image archives
# from Google Drive into them.
!mkdir -p "/content/dataset/images/train"
!mkdir -p "/content/dataset/images/val"
!mkdir -p "/content/dataset/labels/train_p2"
!mkdir -p "/content/dataset/labels/val_p2"
!unzip -q "/content/drive/MyDrive/Colab Notebooks/dataset/labels/TL_KS_LINE.zip" -d "/content/dataset/labels/train_p2"
!unzip -q "/content/drive/MyDrive/Colab Notebooks/dataset/labels/VL_KS_LINE.zip" -d "/content/dataset/labels/val_p2"
!unzip -q "/content/drive/MyDrive/Colab Notebooks/dataset/images/TS_KS.zip" -d "/content/dataset/images/train"
!unzip -q "/content/drive/MyDrive/Colab Notebooks/dataset/images/VS_KS.zip" -d "/content/dataset/images/val"


[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m


## load train_model

In [None]:
def demo(
    fusion_type="baseline",
    train_img_dir="/content/dataset/images/train",
    train_label_dir="/content/dataset/labels/train_p2",
    val_img_dir="/content/dataset/images/val",
    val_label_dir="/content/dataset/labels/val_p2",
):
    """
    Train one model with the notebook's fixed settings (60 epochs, batch size 96).

    Args:
        fusion_type: fusion variant forwarded to ``train_model``.
        train_img_dir, train_label_dir: training images / annotation files.
        val_img_dir, val_label_dir: validation images / annotation files.

    Returns:
        The object returned by ``train_model``. It used to be discarded,
        which left the caller without access to the trained model.
    """
    print(f"=== Training {fusion_type} model ===")
    model_wrapper = train_model(
        train_img_dir=train_img_dir,
        train_label_dir=train_label_dir,
        val_img_dir=val_img_dir,
        val_label_dir=val_label_dir,
        epochs=60,
        batch_size=96,
        lr=1e-3, # the learning rates actually used are set inside train_model
        fusion_type=fusion_type,
    )
    return model_wrapper



## clean ram

In [None]:
import gc, torch

# 1) Best effort: move the model to the CPU before dropping the reference.
#    `model` may not exist in this session, so any failure here is ignored on purpose.
try:
    model.to("cpu")
except Exception:
    pass

# Drop leftover notebook globals (model, optimizer, intermediate tensors, loaders) if present.
for name in [
    "model","optimizer","scheduler","outputs","loss",
    "pixel_values","mask_labels","class_labels",
    "train_loader","val_loader","train_data","val_data",
]:
    globals().pop(name, None)

# 2) Garbage collection
gc.collect()

# The CUDA calls are only made when a GPU is present, so this cell also runs
# on a CPU-only runtime.
if torch.cuda.is_available():
    # 3) Release cached CUDA memory
    torch.cuda.empty_cache()              # free PyTorch's cached blocks
    torch.cuda.ipc_collect()              # reclaim memory shared between processes
    torch.cuda.reset_peak_memory_stats()  # reset the peak-memory statistics (optional)

    # 4) Report what is still in use (optional)
    dev = torch.cuda.current_device()
    print("allocated:", torch.cuda.memory_allocated(dev)/1024**2, "MB")
    print("reserved: ", torch.cuda.memory_reserved(dev)/1024**2, "MB")


allocated: 2427.638671875 MB
reserved:  8026.0 MB


## bbox gate ver

In [None]:
# Run the full training with the "bbox_gate" fusion variant.
demo("bbox_gate")

=== Training bbox_gate model ===
bbox_gate
train mode
Loading training dataset...
processing K3_CHN_20190827045936_51.jpg, 8052/8052dataset: 10590
\nLoaded 10590 chimney samples from /content/dataset/images/train
Loading validation dataset...
processing K3_CHN_20221002052256_31.jpg, 1006/1006dataset: 1323
\nLoaded 1323 chimney samples from /content/dataset/images/val


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 1/60 - Train Loss: 95.2159, Val Loss: 48.1529


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 2/60 - Train Loss: 46.0710, Val Loss: 41.2196


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 3/60 - Train Loss: 33.0987, Val Loss: 27.4068


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 4/60 - Train Loss: 23.9604, Val Loss: 26.1833


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 5/60 - Train Loss: 20.4228, Val Loss: 20.0749


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 6/60 - Train Loss: 18.3768, Val Loss: 18.7394


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 7/60 - Train Loss: 17.2955, Val Loss: 20.0842


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 8/60 - Train Loss: 15.7785, Val Loss: 16.6583


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 9/60 - Train Loss: 15.1643, Val Loss: 18.4290


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 10/60 - Train Loss: 14.2597, Val Loss: 16.3693


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 11/60 - Train Loss: 13.9862, Val Loss: 15.1451


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 12/60 - Train Loss: 13.4945, Val Loss: 14.9477


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 13/60 - Train Loss: 12.9649, Val Loss: 14.6304


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 14/60 - Train Loss: 12.7006, Val Loss: 14.7950


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 15/60 - Train Loss: 11.9391, Val Loss: 14.0135


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 16/60 - Train Loss: 12.3762, Val Loss: 13.8877


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 17/60 - Train Loss: 11.6904, Val Loss: 13.3958


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 18/60 - Train Loss: 10.8626, Val Loss: 13.0096


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 19/60 - Train Loss: 10.5543, Val Loss: 13.5783


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 20/60 - Train Loss: 10.2959, Val Loss: 14.0736


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 21/60 - Train Loss: 10.4477, Val Loss: 13.5352


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 22/60 - Train Loss: 9.9658, Val Loss: 13.6741


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 23/60 - Train Loss: 9.8539, Val Loss: 12.1043


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 24/60 - Train Loss: 9.4322, Val Loss: 12.0679


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 25/60 - Train Loss: 9.1616, Val Loss: 12.6779


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 26/60 - Train Loss: 9.0707, Val Loss: 12.0024


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 27/60 - Train Loss: 8.9226, Val Loss: 11.6613


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 28/60 - Train Loss: 8.6404, Val Loss: 12.1123


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 29/60 - Train Loss: 8.7139, Val Loss: 11.8562


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 30/60 - Train Loss: 8.5305, Val Loss: 11.7927


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 31/60 - Train Loss: 8.0440, Val Loss: 11.2682


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 32/60 - Train Loss: 8.1098, Val Loss: 11.2847


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 33/60 - Train Loss: 7.8783, Val Loss: 11.3022


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 34/60 - Train Loss: 8.2125, Val Loss: 11.2415


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 35/60 - Train Loss: 7.7328, Val Loss: 11.5697


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 36/60 - Train Loss: 7.8757, Val Loss: 11.1881


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 37/60 - Train Loss: 7.5120, Val Loss: 11.0099


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 38/60 - Train Loss: 7.5603, Val Loss: 11.0066


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 39/60 - Train Loss: 7.2984, Val Loss: 10.9233


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 40/60 - Train Loss: 7.1699, Val Loss: 10.8734


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 41/60 - Train Loss: 7.2688, Val Loss: 10.9985


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 42/60 - Train Loss: 7.1821, Val Loss: 10.9033


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 43/60 - Train Loss: 7.0502, Val Loss: 10.7965


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 44/60 - Train Loss: 6.9025, Val Loss: 10.8593


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 45/60 - Train Loss: 7.0421, Val Loss: 10.7928


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 46/60 - Train Loss: 6.7642, Val Loss: 10.7282


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 47/60 - Train Loss: 6.8506, Val Loss: 10.8528


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 48/60 - Train Loss: 6.6390, Val Loss: 10.6867


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 49/60 - Train Loss: 6.9339, Val Loss: 10.8220


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 50/60 - Train Loss: 6.6192, Val Loss: 10.7454


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 51/60 - Train Loss: 6.7768, Val Loss: 10.6621


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 52/60 - Train Loss: 6.7500, Val Loss: 10.7363


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 53/60 - Train Loss: 6.4957, Val Loss: 10.7404


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 54/60 - Train Loss: 6.5797, Val Loss: 10.7282


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 55/60 - Train Loss: 6.4191, Val Loss: 10.6907


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 56/60 - Train Loss: 6.5199, Val Loss: 10.6818


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 57/60 - Train Loss: 6.5317, Val Loss: 10.6233


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 58/60 - Train Loss: 6.3626, Val Loss: 10.6132


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 59/60 - Train Loss: 6.5037, Val Loss: 10.6461


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 60/60 - Train Loss: 6.5842, Val Loss: 10.6390


## Simple bbox embedding version

In [None]:
# Train the "baseline" variant.
# Hyperparameters noted for this run: lr (1e-4, 1e-3), r=32, lora_alpha=64,
# batch=96, bbox embedding 64 with a single layer.
# NOTE(review): the note lists two learning rates without saying which
# parameter group each applies to, and "64" is presumably the bbox embedding
# dimension -- confirm both against the training config.
demo("baseline")
# Other variants, disabled for this run:
# demo("cross_attn")
# demo("film")


=== Training baseline model ===
baseline
train mode
Loading training dataset...
processing K3_CHN_20190827045936_51.jpg, 8052/8052dataset: 10590
\nLoaded 10590 chimney samples from /content/dataset/images/train
Loading validation dataset...
processing K3_CHN_20221002052256_31.jpg, 1006/1006dataset: 1323
\nLoaded 1323 chimney samples from /content/dataset/images/val


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 1/30 - Train Loss: 89.7262, Val Loss: 47.8362


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 2/30 - Train Loss: 44.9831, Val Loss: 39.3803


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 3/30 - Train Loss: 33.5049, Val Loss: 29.4855


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 4/30 - Train Loss: 24.8206, Val Loss: 22.4133


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 5/30 - Train Loss: 20.1685, Val Loss: 22.2034


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 6/30 - Train Loss: 18.9616, Val Loss: 18.4077


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 7/30 - Train Loss: 17.2411, Val Loss: 17.8722


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 8/30 - Train Loss: 15.8110, Val Loss: 17.5692


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 9/30 - Train Loss: 15.7767, Val Loss: 17.8634


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 10/30 - Train Loss: 14.5049, Val Loss: 16.5112


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 11/30 - Train Loss: 14.0019, Val Loss: 15.7619


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 12/30 - Train Loss: 13.3324, Val Loss: 15.6009


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 13/30 - Train Loss: 12.5958, Val Loss: 15.1673


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 14/30 - Train Loss: 12.8567, Val Loss: 14.7528


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 15/30 - Train Loss: 12.2549, Val Loss: 14.5245


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 16/30 - Train Loss: 11.7481, Val Loss: 14.2955


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 17/30 - Train Loss: 11.7003, Val Loss: 14.1205


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 18/30 - Train Loss: 11.1894, Val Loss: 14.0126


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 19/30 - Train Loss: 10.9022, Val Loss: 13.8378


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 20/30 - Train Loss: 10.9109, Val Loss: 13.9328


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 21/30 - Train Loss: 10.5316, Val Loss: 13.4716


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 22/30 - Train Loss: 10.3842, Val Loss: 13.3041


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 23/30 - Train Loss: 10.3046, Val Loss: 13.8264


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 24/30 - Train Loss: 10.1609, Val Loss: 13.1224


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 25/30 - Train Loss: 10.1041, Val Loss: 13.0761


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 26/30 - Train Loss: 9.8264, Val Loss: 13.0467


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 27/30 - Train Loss: 9.8596, Val Loss: 13.1224


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 28/30 - Train Loss: 9.7652, Val Loss: 13.0117


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 29/30 - Train Loss: 9.7778, Val Loss: 13.0377


Training:   0%|          | 0/111 [00:00<?, ?it/s]

Validation:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 30/30 - Train Loss: 9.7539, Val Loss: 13.0374
