# Score Submission

This notebook scores the submission.csv generated by submission.ipynb against the validation ground truth.

**Prerequisites:**
1. Run the "Prepare Fake Kaggle Environment" cell in cnn-dinov2-hybrid.ipynb
2. Run submission.ipynb to generate submission.csv

In [None]:
import os
import json
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import numba
import scipy.optimize
import numpy.typing as npt

In [None]:
# ==================== PATHS ====================
# Locations of the submission to score and of the ground-truth bundle.
SUBMISSION_PATH = "submission.csv"
GT_DIR = "/kaggle/input/recodai-luc-scientific-image-forgery-detection/ground_truth"
GT_CSV = os.path.join(GT_DIR, "ground_truth.csv")

# Check files exist.
# Explicit checks are used instead of `assert`, because assert statements are
# removed when Python runs with -O and the check would silently disappear.
if not os.path.exists(SUBMISSION_PATH):
    raise FileNotFoundError("submission.csv not found. Run submission.ipynb first!")
if not os.path.exists(GT_CSV):
    raise FileNotFoundError(
        "Ground truth not found. Run the setup cell in cnn-dinov2-hybrid.ipynb first!"
    )

print("✅ Found submission.csv")
print("✅ Found ground truth")

In [None]:
# ==================== RLE DECODE ====================

@numba.njit
def _rle_decode_jit(mask_rle: npt.NDArray, height: int, width: int) -> npt.NDArray:
    """Expand a flat run-length encoding into a flat boolean pixel vector.

    ``mask_rle`` holds alternating (start, length) values, with starts being
    1-based. The result has ``height * width`` entries; turning it into a
    2-D mask is left to the caller.

    Raises:
        ValueError: if ``mask_rle`` contains an odd number of values.
    """
    if len(mask_rle) % 2 != 0:
        raise ValueError('Odd number of values')
    # Even positions are starts (shifted to 0-based), odd positions are lengths.
    run_starts = mask_rle[0::2] - 1
    run_stops = run_starts + mask_rle[1::2]
    flat = np.zeros(height * width, dtype=np.bool_)
    for k in range(len(run_starts)):
        flat[run_starts[k]:run_stops[k]] = True
    return flat


def rle_decode(mask_rle: str, shape: tuple) -> npt.NDArray:
    """Decode a JSON-encoded run-length string into a boolean mask.

    Args:
        mask_rle: JSON text of a flat list of alternating start/length values.
        shape: target mask shape; ``shape[0]`` and ``shape[1]`` are passed on
            as height and width.

    Returns:
        Boolean array of ``shape``, filled in column-major (Fortran) order.
    """
    height, width = shape[0], shape[1]
    runs = np.asarray(json.loads(mask_rle), dtype=np.int32)
    flat_mask = _rle_decode_jit(runs, height, width)
    return flat_mask.reshape(shape, order='F')


@numba.jit(nopython=True)
def _rle_encode_jit(x: npt.NDArray, fg_val: int = 1):
    """Run-length encode the pixels of ``x`` that equal ``fg_val``.

    The mask is transposed and flattened before scanning. The returned flat
    list alternates 1-based run starts with run lengths.
    """
    flat = x.T.flatten()
    fg_positions = np.where(flat == fg_val)[0]
    runs = []
    last = -2  # sentinel: guarantees the first foreground pixel opens a run
    for pos in fg_positions:
        if pos > last + 1:
            # Gap since the previous foreground pixel: open a new run
            # (1-based start, length counted just below).
            runs.append(pos + 1)
            runs.append(0)
        runs[-1] += 1
        last = pos
    return runs


def rle_encode(masks: list, fg_val: int = 1) -> str:
    """Encode several masks as one annotation string.

    Each mask is run-length encoded and dumped as JSON; the per-mask strings
    are joined with ``';'``.
    """
    encoded = []
    for mask in masks:
        encoded.append(json.dumps(_rle_encode_jit(mask, fg_val)))
    return ';'.join(encoded)

In [None]:
# ==================== SCORING FUNCTIONS ====================

def calculate_f1_score(pred_mask: npt.NDArray, gt_mask: npt.NDArray):
    """Return the pixel-wise F1 score between a predicted and a ground-truth mask.

    Pixels equal to 1 count as foreground and pixels equal to 0 as background.
    Precision or recall is taken as 0 when its denominator is empty, and the
    function returns 0 when both are 0.
    """
    pred = pred_mask.flatten()
    truth = gt_mask.flatten()
    pred_fg = pred == 1
    truth_fg = truth == 1

    tp = np.sum(pred_fg & truth_fg)
    fp = np.sum(pred_fg & (truth == 0))
    fn = np.sum((pred == 0) & truth_fg)

    precision = 0
    if (tp + fp) > 0:
        precision = tp / (tp + fp)

    recall = 0
    if (tp + fn) > 0:
        recall = tp / (tp + fn)

    # Guard against 0/0 when there is no overlap at all.
    if not (precision + recall) > 0:
        return 0
    return 2 * (precision * recall) / (precision + recall)


def calculate_f1_matrix(pred_masks: list, gt_masks: list):
    """Build the pairwise F1 table between predicted and ground-truth masks.

    Entry ``[i, j]`` is the F1 of prediction ``i`` against ground truth ``j``.
    The table has ``max(len(pred_masks), len(gt_masks))`` rows, so when there
    are fewer predictions than ground-truth masks the extra rows stay zero.
    """
    n_rows = max(len(pred_masks), len(gt_masks))
    table = np.zeros((n_rows, len(gt_masks)))
    for row, pred in enumerate(pred_masks):
        for col, truth in enumerate(gt_masks):
            table[row, col] = calculate_f1_score(pred, truth)
    return table


def oF1_score(pred_masks: list, gt_masks: list):
    """Score a set of predicted masks against a set of ground-truth masks.

    Predictions are matched one-to-one to ground-truth masks so that the
    total F1 is maximal; the mean matched F1 is then scaled down by
    ``len(gt_masks) / max(len(pred_masks), len(gt_masks))`` to penalize
    surplus predictions. Returns 0.0 if either list is empty.
    """
    n_pred, n_gt = len(pred_masks), len(gt_masks)
    if n_pred == 0 or n_gt == 0:
        return 0.0

    pairwise = calculate_f1_matrix(pred_masks, gt_masks)
    # linear_sum_assignment minimizes cost, so negate to maximize F1.
    rows, cols = scipy.optimize.linear_sum_assignment(-pairwise)
    matched_mean = np.mean(pairwise[rows, cols])
    return matched_mean * (n_gt / max(n_pred, n_gt))

In [None]:
# ==================== LOAD DATA ====================

# Predictions; case_id is cast to str so both tables join on the same key type.
submission_df = pd.read_csv(SUBMISSION_PATH).astype({'case_id': str})
print(f"Loaded submission: {len(submission_df)} rows")

# Ground truth, with the same key normalization.
gt_df = pd.read_csv(GT_CSV).astype({'case_id': str})
print(f"Loaded ground truth: {len(gt_df)} rows")

# Left join keeps every ground-truth row; a case missing from the
# submission is scored as an 'authentic' prediction.
merged = gt_df.merge(submission_df, how='left', on='case_id')
merged['annotation'] = merged['annotation'].fillna('authentic')
print(f"Merged: {len(merged)} rows")

In [None]:
# ==================== SCORE ====================
# Per-image scoring: 1.0 / 0.0 for the pure classification outcomes,
# oF1 over the masks when neither side is a plain authentic/forged mismatch.

scores = []
details = []

for idx, row in tqdm(merged.iterrows(), total=len(merged), desc="Scoring"):
    case_id = row['case_id']
    gt_label = row['label']
    pred_annotation = row['annotation']
    shape = row['shape']
    if isinstance(shape, str):
        # Shape is stored as JSON text in the CSV.
        shape = json.loads(shape)

    pred_is_authentic = pred_annotation == 'authentic'

    if gt_label == 'authentic':
        if pred_is_authentic:
            # Both authentic
            score = 1.0
            detail = "TN (both authentic)"
        else:
            # GT authentic, prediction forged (false positive)
            score = 0.0
            detail = "FP (predicted forged, actually authentic)"
    elif gt_label == 'forged' and pred_is_authentic:
        # GT forged, prediction authentic (false negative)
        score = 0.0
        detail = "FN (predicted authentic, actually forged)"
    else:
        # Both forged - compute oF1 over the mask sets.
        mask_file = row['mask_file']
        gt_masks_arr = np.load(os.path.join(GT_DIR, mask_file))
        if gt_masks_arr.ndim == 2:
            # A single 2-D array is one mask.
            gt_masks = [gt_masks_arr]
        else:
            # Otherwise the first axis indexes the individual masks.
            gt_masks = [gt_masks_arr[i] for i in range(gt_masks_arr.shape[0])]

        # One RLE string per predicted mask, separated by ';'.
        mask_shape = tuple(shape)
        pred_masks = [rle_decode(rle, mask_shape) for rle in pred_annotation.split(';')]

        score = oF1_score(pred_masks, gt_masks)
        detail = f"oF1 (pred={len(pred_masks)} masks, gt={len(gt_masks)} masks)"

    scores.append(score)
    details.append(detail)

merged['score'] = scores
merged['detail'] = details

In [None]:
# ==================== RESULTS ====================
# Overall score is the plain mean of the per-image scores.

final_score = np.mean(scores)

print("=" * 60)
print(f"📊 FINAL SCORE: {final_score:.4f}")
print("=" * 60)

# Breakdown
print("\n📈 Score Breakdown:")
print(f"   Total images: {len(merged)}")
print(f"   Mean score: {final_score:.4f}")
print(f"   Std score: {np.std(scores):.4f}")
print(f"   Min score: {np.min(scores):.4f}")
print(f"   Max score: {np.max(scores):.4f}")

# By category: one line per distinct `detail` string, in order of first appearance.
print("\n📋 By Category:")
for detail_type in merged['detail'].unique():
    subset = merged[merged['detail'] == detail_type]
    print(f"   {detail_type}: {len(subset)} images, avg score = {subset['score'].mean():.4f}")

In [None]:
# ==================== WORST PREDICTIONS ====================
# Show the ten lowest-scoring images to guide error analysis.

print("\n❌ Worst 10 Predictions:")
worst = merged.nsmallest(10, 'score')[['case_id', 'label', 'score', 'detail']]
print(worst.to_string(index=False))

In [None]:
# ==================== CONFUSION MATRIX ====================

# Simplified: authentic vs forged prediction.
# Any annotation other than the literal 'authentic' counts as a forged prediction.
# Series.where keeps values where the condition holds and substitutes elsewhere.
merged['pred_label'] = merged['annotation'].where(
    merged['annotation'] == 'authentic', 'forged'
)

print("\n🔢 Confusion Matrix (Classification):")
print(pd.crosstab(merged['label'], merged['pred_label'], margins=True))