# Submission Notebook - DINOv2 Forgery Detection

This notebook loads pre-trained weights and generates predictions for the test set.

**Usage:**
- Set `LOCAL_MODE = True` for local testing with validation data
- Set `LOCAL_MODE = False` for Kaggle submission

In [1]:
import json
import math
import os
from pathlib import Path
from typing import Optional

import cv2
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from tqdm import tqdm
from transformers import AutoImageProcessor, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ==================== CONFIG ====================

# Single switch between a local dry run and the Kaggle submission run.
LOCAL_MODE = True  # Set to False when running on Kaggle

# Use the GPU whenever torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if not LOCAL_MODE:
    # Kaggle paths: competition data plus the uploaded weights dataset.
    TEST_DIR = "/kaggle/input/recodai-luc-scientific-image-forgery-detection/test_images"
    SAMPLE_SUB = "/kaggle/input/recodai-luc-scientific-image-forgery-detection/sample_submission.csv"
    WEIGHTS_PATH = "/kaggle/input/msv-v1/model_seg_final.pt"
else:
    # Local paths (the test directory may equally hold validation images).
    BASE_DIR = "./data"
    TEST_DIR = BASE_DIR + "/test_images"
    SAMPLE_SUB = None  # No sample submission locally; rows come from the files found.
    WEIGHTS_PATH = "./model_seg_final.pt"

# Model settings; these have to agree with the values used at training time.
DINO_PATH = "facebook/dinov2-base"
IMG_SIZE = 512
CHANNELS = 4

# Destination of the generated submission file.
OUT_PATH = "submission.csv"

# Echo the effective configuration so the run is self-describing.
print("Mode:", "LOCAL" if LOCAL_MODE else "KAGGLE")
print("Device:", device)
print("Test dir:", TEST_DIR)
print("Weights:", WEIGHTS_PATH)

Mode: LOCAL
Device: cuda
Test dir: ./data/test_images
Weights: ./model_seg_final.pt


In [3]:
# ==================== MODEL DEFINITION ====================

class DinoDecoder(nn.Module):
    """Decoder that progressively upsamples a feature map into per-channel logits.

    Three bilinear 2x upsampling steps are followed by a final bilinear resize
    to the requested output size. Every step is refined by a two-convolution
    block, and a 1x1 convolution maps the last 48 channels to ``out_ch``.
    """

    def __init__(self, in_ch=768, out_ch=CHANNELS, dropout=0.1):
        super().__init__()

        # Channel width halves at every stage. The attribute names (up1..up4,
        # final) are kept as-is because they are part of the state-dict keys.
        widths = (in_ch, 384, 192, 96, 48)
        self.up1 = self._block(widths[0], widths[1], dropout)
        self.up2 = self._block(widths[1], widths[2], dropout)
        self.up3 = self._block(widths[2], widths[3], dropout)
        self.up4 = self._block(widths[3], widths[4], dropout)

        self.final = nn.Conv2d(widths[4], out_ch, kernel_size=1)

    def _block(self, in_ch, out_ch, dropout):
        """Build one refinement block: (conv3x3 -> BN -> ReLU) twice, with
        channel dropout after the first activation."""
        layers = [
            nn.Conv2d(in_ch, out_ch, 3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
            nn.Dropout2d(dropout),
            nn.Conv2d(out_ch, out_ch, 3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
        ]
        return nn.Sequential(*layers)

    def forward(self, f, size):
        """Decode feature map ``f`` into logits with spatial size ``size``."""
        x = f
        # Stages 1-3: double the resolution, then refine.
        for stage in (self.up1, self.up2, self.up3):
            x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=False)
            x = stage(x)

        # Stage 4: jump straight to the requested size, then refine.
        x = F.interpolate(x, size=size, mode='bilinear', align_corners=False)
        x = self.up4(x)

        return self.final(x)


class DinoSegmenter(nn.Module):
    """Encoder plus ``DinoDecoder`` head producing segmentation logits.

    Args:
        encoder: Backbone exposing ``encoder.layer`` (a sequence of blocks),
            ``layernorm`` and, when called, an output with
            ``last_hidden_state`` of shape ``(B, N, C)``.
        processor: Image processor called as
            ``processor(images=[...], return_tensors="pt")``.
        unfreeze_blocks: Number of trailing encoder blocks whose parameters
            are marked trainable (all others are frozen).
    """

    def __init__(self, encoder, processor, unfreeze_blocks=3):
        super().__init__()
        self.encoder, self.processor = encoder, processor

        # Freeze all parameters
        for p in self.encoder.parameters():
            p.requires_grad = False

        # Unfreeze last N blocks. The start index is clamped so that asking for
        # more blocks than exist simply unfreezes every block once.
        num_blocks = len(self.encoder.encoder.layer)
        for i in range(max(0, num_blocks - unfreeze_blocks), num_blocks):
            for p in self.encoder.encoder.layer[i].parameters():
                p.requires_grad = True

        for p in self.encoder.layernorm.parameters():
            p.requires_grad = True

        # Size the head from the encoder's hidden width when it is advertised;
        # 768 (the previous hard-coded value) remains the fallback.
        hidden = getattr(getattr(encoder, "config", None), "hidden_size", 768)
        self.seg_head = DinoDecoder(hidden, CHANNELS)

    def forward_features(self, x):
        """Encode a batch and return the patch tokens as a square feature map.

        Args:
            x: Float tensor ``(B, 3, H, W)`` with values expected in ``[0, 1]``.

        Returns:
            Tensor of shape ``(B, C, s, s)`` where ``s * s`` is the number of
            tokens after the first one.

        Raises:
            ValueError: If the encoder's token count (minus the first token)
                is not a positive perfect square.
        """
        # The processor works on uint8 HWC arrays, so undo the [0, 1] scaling.
        imgs = (x * 255).clamp(0, 255).byte().permute(0, 2, 3, 1).cpu().numpy()
        inputs = self.processor(images=list(imgs), return_tensors="pt").to(x.device)
        # The encoder always runs without gradient tracking here.
        with torch.no_grad():
            feats = self.encoder(**inputs).last_hidden_state
        B, N, C = feats.shape

        # Token 0 is dropped (presumably the CLS token); the rest must tile a
        # square grid. isqrt avoids float rounding and lets us fail loudly
        # instead of surfacing an opaque reshape error.
        n_patches = N - 1
        if n_patches < 1:
            raise ValueError(f"Encoder returned {N} token(s); need at least one patch token")
        s = math.isqrt(n_patches)
        if s * s != n_patches:
            raise ValueError(
                f"Cannot arrange {n_patches} patch tokens into a square feature map"
            )

        fmap = feats[:, 1:, :].permute(0, 2, 1)
        return fmap.reshape(B, C, s, s)

    def forward_seg(self, x):
        """Return segmentation logits resized to ``(IMG_SIZE, IMG_SIZE)``."""
        fmap = self.forward_features(x)
        return self.seg_head(fmap, (IMG_SIZE, IMG_SIZE))

In [4]:
# ==================== LOAD MODEL ====================

# Backbone and its matching image processor, both resolved from DINO_PATH.
print("Loading DINOv2 encoder...")
processor = AutoImageProcessor.from_pretrained(DINO_PATH)
encoder = AutoModel.from_pretrained(DINO_PATH).eval().to(device)

print("Building model...")
model_seg = DinoSegmenter(encoder, processor).to(device)

print(f"Loading weights from {WEIGHTS_PATH}...")
# The checkpoint is fed straight into load_state_dict, so it only needs to be a
# mapping of tensors. weights_only=True restricts unpickling to such data and
# prevents a tampered checkpoint file from executing arbitrary code on load.
model_seg.load_state_dict(
    torch.load(WEIGHTS_PATH, map_location=device, weights_only=True)
)
# Inference only: put dropout / batch-norm layers in evaluation mode.
model_seg.eval()

print("✅ Model loaded successfully!")

Loading DINOv2 encoder...


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Loading weights: 100%|██████████| 223/223 [00:00<00:00, 674.05it/s, Materializing param=layernorm.weight]                                 


Building model...
Loading weights from ./model_seg_final.pt...
✅ Model loaded successfully!


In [5]:
# ==================== INFERENCE FUNCTIONS ====================

@torch.no_grad()
def segment_prob_map_all_channels(pil):
    """Run the segmenter on one PIL image and return sigmoid probabilities.

    The image is resized to ``IMG_SIZE x IMG_SIZE``, scaled to ``[0, 1]`` and
    sent through ``model_seg.forward_seg``. The result is a NumPy array holding
    one probability map per output channel for that single image.
    """
    resized = pil.resize((IMG_SIZE, IMG_SIZE))
    pixels = np.array(resized, np.float32) / 255.
    # HWC -> CHW, then add the batch axis expected by the model.
    batch = torch.from_numpy(pixels).permute(2, 0, 1).unsqueeze(0).to(device)
    logits = model_seg.forward_seg(batch)
    return torch.sigmoid(logits)[0].cpu().numpy()


def enhanced_adaptive_mask(prob, alpha_grad=0.35):
    """Binarize a probability map with an edge-boosted, per-image threshold.

    Args:
        prob: 2-D probability map.
        alpha_grad: Weight of the normalised gradient magnitude in the blend
            with ``prob`` (``prob`` gets ``1 - alpha_grad``).

    Returns:
        ``(mask, thr)``: the uint8 binary mask after morphological clean-up
        and the threshold that produced it.
    """
    # Sobel gradient magnitude, scaled by its maximum (epsilon guards against /0).
    dx = cv2.Sobel(prob, cv2.CV_32F, 1, 0, ksize=3)
    dy = cv2.Sobel(prob, cv2.CV_32F, 0, 1, ksize=3)
    edges = np.sqrt(dx**2 + dy**2)
    edges = edges / (edges.max() + 1e-6)

    # Blend probabilities with edge strength, then smooth lightly.
    blended = (1 - alpha_grad) * prob + alpha_grad * edges
    blended = cv2.GaussianBlur(blended, (3,3), 0)

    # Threshold adapts to the image: mean plus 0.3 standard deviations.
    thr = np.mean(blended) + 0.3 * np.std(blended)
    mask = (blended > thr).astype(np.uint8)

    # Close small holes (5x5) first, then remove specks (3x3 opening).
    for operation, ksize in ((cv2.MORPH_CLOSE, 5), (cv2.MORPH_OPEN, 3)):
        mask = cv2.morphologyEx(mask, operation, np.ones((ksize, ksize), np.uint8))
    return mask, thr


def finalize_mask(prob, orig_size):
    """Threshold ``prob`` and resize the binary mask to ``orig_size``.

    ``orig_size`` is handed to ``cv2.resize`` as its target size.
    Nearest-neighbour interpolation keeps the mask strictly binary.
    Returns ``(mask, thr)`` where ``thr`` is the threshold that was applied.
    """
    small_mask, thr = enhanced_adaptive_mask(prob)
    full_mask = cv2.resize(small_mask, orig_size, interpolation=cv2.INTER_NEAREST)
    return full_mask, thr


def pipeline_final(pil, min_area=400, min_mean=0.35):
    """Classify one image and return the masks of its detected forged regions.

    Each output channel of the segmenter is thresholded independently. A
    channel's mask is kept only if it covers at least ``min_area`` pixels (at
    the original image size) and the mean probability inside it is at least
    ``min_mean``.

    Args:
        pil: PIL image; ``pil.size`` is used as the target mask size.
        min_area: Minimum foreground pixel count for a mask to be kept.
        min_mean: Minimum mean probability inside a mask for it to be kept.

    Returns:
        ``(label, masks, debug)`` where ``label`` is ``"forged"`` or
        ``"authentic"``, ``masks`` is the list of kept masks (empty when
        authentic) and ``debug`` is a dict with the keys ``area``,
        ``mean_inside``, ``thr`` and ``num_masks`` on both outcomes.
    """
    probs = segment_prob_map_all_channels(pil)

    kept_masks, kept_areas, kept_means, kept_thrs = [], [], [], []

    for prob in probs:
        mask, thr = finalize_mask(prob, pil.size)
        area = int(mask.sum())

        # Too small: skip before paying for the probability-map resize.
        if area < min_area:
            continue

        if area > 0:
            prob_resized = cv2.resize(prob, pil.size, interpolation=cv2.INTER_LINEAR)
            mean_inside = float(prob_resized[mask == 1].mean())
        else:
            mean_inside = 0.0

        # Too weak: the region is not confidently forged.
        if mean_inside < min_mean:
            continue

        kept_masks.append(mask)
        kept_areas.append(area)
        kept_means.append(mean_inside)
        kept_thrs.append(thr)

    if not kept_masks:
        # Same key set as the forged branch so consumers can index uniformly.
        return "authentic", [], {"area": 0, "mean_inside": 0.0, "thr": 0.0, "num_masks": 0}

    count = len(kept_masks)
    debug = {
        "area": sum(kept_areas),
        "mean_inside": sum(kept_means) / count,
        "thr": sum(kept_thrs) / count,
        "num_masks": count,
    }
    return "forged", kept_masks, debug

In [6]:
# ==================== RLE ENCODING ====================

def rle_encode_single(mask: np.ndarray, fg_val: int = 1) -> Optional[str]:
    """Encode a single 2D mask as a run-length JSON string.

    Pixels are read in column-major order (the mask is transposed before
    flattening). The output is a JSON list ``[start, length, start, length,
    ...]`` with 1-based start positions.

    Args:
        mask: 2-D array.
        fg_val: Pixel value that marks foreground.

    Returns:
        The JSON string, or ``None`` when the mask has no foreground pixel.
    """
    pixels = np.asarray(mask).T.ravel()
    dots = np.flatnonzero(pixels == fg_val)
    if dots.size == 0:
        return None

    # A new run begins wherever consecutive foreground indices are not adjacent.
    breaks = np.flatnonzero(np.diff(dots) > 1)
    first_idx = np.concatenate(([0], breaks + 1))
    last_idx = np.concatenate((breaks, [dots.size - 1]))

    starts = dots[first_idx] + 1  # RLE positions are 1-based
    lengths = dots[last_idx] - dots[first_idx] + 1

    # Interleave as start, length, start, length, ...
    runs = np.stack((starts, lengths), axis=1).ravel()
    return json.dumps(runs.tolist())


def rle_encode_multi(masks: list, fg_val: int = 1) -> str:
    """Encode multiple masks, joining the per-mask RLE strings with semicolons.

    Args:
        masks: Iterable of 2-D masks.
        fg_val: Foreground marker. With the default ``1`` every non-zero pixel
            counts as foreground (the historical behaviour); for any other
            value only pixels equal to ``fg_val`` do.

    Returns:
        The ``;``-joined encodings of all non-empty masks, or ``"authentic"``
        when no mask contains foreground.
    """
    encoded = []
    for m in masks:
        arr = np.asarray(m)
        foreground = (arr > 0) if fg_val == 1 else (arr == fg_val)
        # The binarised mask only holds 0/1, so the encoder must look for 1.
        # Forwarding fg_val here made every fg_val != 1 encode nothing.
        enc = rle_encode_single(foreground.astype(np.uint8), 1)
        if enc is not None:
            encoded.append(enc)
    return ';'.join(encoded) if encoded else "authentic"

In [7]:
# ==================== GENERATE SUBMISSION ====================

rows = []
# Only regular, non-hidden files are candidates: sub-directories and dotfiles
# (e.g. notebook checkpoints) would otherwise make Image.open fail mid-run.
test_files = sorted(
    p.name for p in Path(TEST_DIR).iterdir()
    if p.is_file() and not p.name.startswith(".")
)
print(f"Processing {len(test_files)} test images...")

for f in tqdm(test_files, desc="Inference"):
    # convert() returns a separate in-memory image, so the file handle can be
    # released right away instead of lingering until garbage collection.
    with Image.open(Path(TEST_DIR)/f) as img:
        pil = img.convert("RGB")
    label, masks, dbg = pipeline_final(pil)

    if label == "authentic" or len(masks) == 0:
        annot = "authentic"
    else:
        annot = rle_encode_multi(masks)

    rows.append({
        "case_id": Path(f).stem,
        "annotation": annot,
    })

# Create submission DataFrame. Explicit columns keep the schema intact even
# when no test file was found (an empty list would yield a column-less frame).
sub = pd.DataFrame(rows, columns=["case_id", "annotation"])

if LOCAL_MODE or SAMPLE_SUB is None:
    # Local mode: just use our predictions directly
    final = sub
else:
    # Kaggle mode: merge with sample submission to ensure correct order.
    # Ids are compared as strings because file stems are strings.
    ss = pd.read_csv(SAMPLE_SUB)
    ss["case_id"] = ss["case_id"].astype(str)
    sub["case_id"] = sub["case_id"].astype(str)
    final = ss[["case_id"]].merge(sub, on="case_id", how="left")
    # Cases without a prediction default to "authentic".
    final["annotation"] = final["annotation"].fillna("authentic")

# Save
final[["case_id", "annotation"]].to_csv(OUT_PATH, index=False)

print(f"\n✅ Saved submission to: {OUT_PATH}")
print(f"Total rows: {len(final)}")
print(f"Forged: {(final['annotation'] != 'authentic').sum()}")
print(f"Authentic: {(final['annotation'] == 'authentic').sum()}")
print("\nFirst 10 rows:")
print(final.head(10))

Processing 929 test images...


Inference: 100%|██████████| 929/929 [02:26<00:00,  6.35it/s]



✅ Saved submission to: submission.csv
Total rows: 929
Forged: 706
Authentic: 223

First 10 rows:
  case_id                                         annotation
0   10398                                          authentic
1   10476                                          authentic
2   10602  [35033, 11, 35516, 24, 36002, 33, 36492, 33, 3...
3   10632  [3044, 7, 3298, 15, 3552, 18, 3807, 19, 4061, ...
4   10636  [108, 58, 205, 40, 468, 58, 565, 40, 828, 58, ...
5   10645  [29, 26, 149, 26, 269, 26, 389, 26, 509, 27, 6...
6   10911  [103, 80, 359, 80, 612, 83, 869, 82, 1125, 82,...
7   10939  [155, 33, 229, 50, 308, 16, 515, 33, 589, 50, ...
8   10951  [1, 67, 172, 73, 349, 73, 526, 69, 708, 61, 88...
9   10980  [179, 63, 691, 63, 1203, 63, 1706, 94, 2214, 9...


In [8]:
# ==================== LOCAL SCORING (only in LOCAL_MODE) ====================

if LOCAL_MODE:
    print("\n" + 60 * "=")
    print("LOCAL SCORING - Comparing with ground truth")
    print(60 * "=")

    # No ground-truth comparison is implemented here yet; this cell only
    # summarises the predictions held in `final`.
    annotations = final['annotation']

    print("\nPrediction stats:")
    print(f"  Total images: {len(final)}")
    print(f"  Predicted forged: {annotations.ne('authentic').sum()}")
    print(f"  Predicted authentic: {annotations.eq('authentic').sum()}")

    # Masks of one image are joined with ';', so separators + 1 gives the count.
    forged_rows = final[annotations.ne('authentic')]
    if len(forged_rows) > 0:
        mask_counts = forged_rows['annotation'].map(lambda ann: ann.count(';') + 1)
        print("\nMask count stats (forged images only):")
        print(f"  Min masks: {mask_counts.min()}")
        print(f"  Max masks: {mask_counts.max()}")
        print(f"  Mean masks: {mask_counts.mean():.2f}")


LOCAL SCORING - Comparing with ground truth

Prediction stats:
  Total images: 929
  Predicted forged: 706
  Predicted authentic: 223

Mask count stats (forged images only):
  Min masks: 1
  Max masks: 2
  Mean masks: 1.36
