# Inference InternVL3-4B

### Load Model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
from qwen_vl_utils import process_vision_info



# default: Load the model on the available device(s)
MODEL_ID = "OpenGVLab/InternVL3_5-4B-HF"

# Load with bfloat16 to fit nicely in 20GB
model = AutoModel.from_pretrained(
    MODEL_ID, 
    dtype=torch.bfloat16, # Use torch_dtype instead of .to() later
    device_map="auto",
    trust_remote_code=True,      # Required for InternVL models
    attn_implementation="eager"
).eval()

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, use_fast=False)



  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.29it/s]


In [4]:
from qwen_vl_utils import process_vision_info


### Methods for manipulation

In [15]:
def zero_shot_inference_internvl(model, tokenizer, image_path, prompt, max_num=12):
    # 1. Use your existing load_image logic
    # This returns a tensor of shape [num_tiles, 3, 448, 448]
    pixel_values = load_image(image_path, max_num=max_num).to(torch.bfloat16).cuda()
    
    # 2. Preparation for inference
    # InternVL needs the <image> token to know where to 'inject' the visual features
    question = f'<image>\n{prompt}'
    generation_config = dict(max_new_tokens=1024, do_sample=True)

    # 3. Inference using the InternVL chat API
    # This handles the internal chat templates and tokenization for you
    response = model.chat(tokenizer, pixel_values, question, generation_config)
    
    # 4. Calculate 'Input' dimensions (The canvas size)
    # InternVL stitches tiles into a grid. We calculate the grid dimensions:
    img = Image.open(image_path)
    aspect_ratio = img.width / img.height
    
    # Re-calculate the target ratio to find the final grid dimensions
    target_ratios = set(
        (i, j) for n in range(1, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= 1)
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, img.width, img.height, 448)
    
    # Final 'canvas' size the model operated on
    input_width = target_aspect_ratio[0] * 448
    input_height = target_aspect_ratio[1] * 448
    
    return response, input_height, input_width

In [None]:
question = '<image>\nDetect all signatures and return their locations and labels in the form of coordinates.'
zero_shot_inference_internvl(model, tokenizer, 'data/images/test_arz92e00_jpg.rf.d032a45166eda3a7b6ca41c47bde7d69_orig.jpg', "Describe the layout of the document.")

AttributeError: 'InternVLModel' object has no attribute 'chat'

: 

Tool for inference zero_shot

In [14]:
import math
import numpy as np
import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images

def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values


In [None]:

# set the max number of tiles in `max_num`
pixel_values = load_image('data/images/test_arz92e00_jpg.rf.d032a45166eda3a7b6ca41c47bde7d69_orig.jpg', max_num=12).to(torch.bfloat16).cuda()
generation_config = dict(max_new_tokens=1024, do_sample=True)

# single-image single-round conversation (单图单轮对话)
question = '<image>\nDetect all signatures and return their locations and labels in the form of coordinates.'
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {response}')


In [5]:
def zero_shot_inference(model, processor, image, prompt):
  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": prompt},
          ],
      }
  ]
  # Preparation for inference
  text = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda")
  # Inference: Generation of the output
  generated_ids = model.generate(**inputs, max_new_tokens=1024)
  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  output_text = processor.batch_decode(
      generated_ids_trimmed, do_sample=True, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]
  input_height = inputs['image_grid_thw'][0][1]*14
  input_width = inputs['image_grid_thw'][0][2]*14
  return output_text, input_height, input_width

For visualization purposes

In [6]:
import json
import matplotlib.pyplot as plt
from PIL import Image
import matplotlib.patches as patches
def plot_bounding_boxes(image, bbox_data, height, width,Ground_T):
  image = image.resize((width, height))
  # Parse the JSON input
  # Plot the image
  fig, ax = plt.subplots(1)
  ax.imshow(image)
  ax.axis('off')
  # Plot the bounding boxes and labels
  for item in bbox_data:
      bbox = item['bbox_2d']
      label = item['label']
      rect = patches.Rectangle((bbox[0], bbox[1]), bbox[2] - bbox[0], bbox[3] - bbox[1], linewidth=2, edgecolor='r', facecolor='none',label='Prediction')
      ax.add_patch(rect)
      plt.text(bbox[0], bbox[1] - 10, label, color='r', fontsize=10)
  for item in Ground_T:
    rect = patches.Rectangle((item[0], item[1]), item[2] - item[0], item[3] - item[1], linewidth=2, edgecolor='g', facecolor='none',label='Ground Truth')
    ax.add_patch(rect)
  plt.legend(loc='upper right')
  plt.show()

### Metrics used to evalute performance
- Hard evaluation metrics: 
    - IoU: The standard academic benchmark.
- Soft evaluation metrics:
    - Iop: Intersection over prediction, how much of your predicted box is inside the ground Truth
    - Center-point hit: if the center of the predicted box is inside the center 
    - Center distance error: The distance between the center of the predicted box and the center of the ground truth

In [7]:
import math

def evaluate_detection(pred_box, gt_box, img_width=1, img_height=1):
    """
    Evaluates prediction against ground truth with distance metrics.
    
    Args:
        pred_box (list): [xmin, ymin, xmax, ymax]
        gt_box (list):   [xmin, ymin, xmax, ymax]
        img_width (int): Width of the image (for normalization)
        img_height (int): Height of the image (for normalization)
        
    Returns:
        dict: IoU, IoP, Center Distance (Pixels), Normalized Center Distance (0-1)
    """
    # --- 1. IoU Calculation (Standard) ---
    xA = max(pred_box[0], gt_box[0])
    yA = max(pred_box[1], gt_box[1])
    xB = min(pred_box[2], gt_box[2])
    yB = min(pred_box[3], gt_box[3])
    
    interArea = max(0, xB - xA) * max(0, yB - yA)
    predArea = (pred_box[2] - pred_box[0]) * (pred_box[3] - pred_box[1])
    gtArea = (gt_box[2] - gt_box[0]) * (gt_box[3] - gt_box[1])
    
    iou = interArea / float(predArea + gtArea - interArea + 1e-6)
    iop = interArea / float(predArea + 1e-6) # Intersection over Prediction

    # --- 2. Center Point Calculation ---
    pred_cx = (pred_box[0] + pred_box[2]) / 2.0
    pred_cy = (pred_box[1] + pred_box[3]) / 2.0
    
    gt_cx = (gt_box[0] + gt_box[2]) / 2.0
    gt_cy = (gt_box[1] + gt_box[3]) / 2.0
    
    # --- 3. Euclidean Distance (Pixels) ---
    # Pythagorean theorem: a^2 + b^2 = c^2
    dist_pixels = math.sqrt((pred_cx - gt_cx)**2 + (pred_cy - gt_cy)**2)
    
    # --- 4. Normalized Distance (0.0 to 1.0) ---
    # Distance relative to the image diagonal. 
    # 0.05 means the center is off by 5% of the image size.
    # This helps compare errors across images of different resolutions.
    img_diagonal = math.sqrt(img_width**2 + img_height**2) + 1e-6
    norm_dist = dist_pixels / img_diagonal

    return {
        "iou": round(iou, 4),
        "iop": round(iop, 4),
        "center_dist_px": round(dist_pixels, 1),
        "norm_center_dist": round(norm_dist, 4)
    }

### Load test images and ground_truth

In [8]:
import json

test_json_path = 'data/test.jsonl'
test_json = []

with open(test_json_path, 'r', encoding='utf-8') as f:
    for line in f:
        data=json.loads(line)
        image_path = 'data/'+data['image']
        groundTruth = json.loads(data['label'])
        test_json.append({
                        'image_path': image_path,
                        'groundTruth': groundTruth
                        })

# Now test_json is a list of dictionaries
print(test_json)

[{'image_path': 'data/images/test_image_152_png_jpg.rf.6336141e7564a9d7e3317f18684228bc_orig.jpg', 'groundTruth': [{'bbox_2d': [251, 504, 417, 547], 'label': 'signatures'}]}, {'image_path': 'data/images/test_qat01f00_jpg.rf.bd6d72fb9d15b1a50aa13dc6c3ae4d92_orig.jpg', 'groundTruth': [{'bbox_2d': [346, 333, 516, 384], 'label': 'signatures'}]}, {'image_path': 'data/images/test_jrk44a00_jpg.rf.49bde548064ee43a487d2f10ce48ff62_orig.jpg', 'groundTruth': [{'bbox_2d': [286, 511, 542, 558], 'label': 'signatures'}]}, {'image_path': 'data/images/test_pvx38c00-page06_6_jpg.rf.c1a201f59050a76046fbf08d79c1d068_orig.jpg', 'groundTruth': [{'bbox_2d': [125, 426, 299, 469], 'label': 'signatures'}, {'bbox_2d': [377, 335, 528, 361], 'label': 'signatures'}]}, {'image_path': 'data/images/test_image_22_png_jpg.rf.08f434b8c2b4f2036d491633b08c205c_orig.jpg', 'groundTruth': [{'bbox_2d': [141, 515, 205, 543], 'label': 'signatures'}, {'bbox_2d': [158, 542, 255, 570], 'label': 'signatures'}, {'bbox_2d': [464, 564,

In [9]:
def parse_ground_truth(Ground_T):
    """
    Extracts GT boxes from Qwen format: <box>(y1,x1),(y2,x2)</box>
    Returns list of [x1, y1, x2, y2] on 0-1000 scale.
    """
    bboxes = []
    for item in Ground_T:
        label = item['label']
        bbox = item['bbox_2d']
        bboxes.append(bbox)
    return bboxes

In [10]:
import json

def parse_and_scale_boxes(output_text, img_width, img_height):
    """
    Parses Qwen2.5-VL output (0-1000 scale) and converts to absolute pixels.
    
    Args:
        output_text (str): The raw string output from the model.
        img_width (int): Original width of the image.
        img_height (int): Original height of the image.
        
    Returns:
        list: A list of dicts with scaled 'bbox_2d' [x1, y1, x2, y2] in pixels.
    """
    try:
        # 1. Clean the output string to get pure JSON
        # This handles cases where the model wraps code in ```json ... ```
        if "```json" in output_text:
            json_str = output_text.split("```json")[1].split("```")[0].strip()
        elif "```" in output_text:
            json_str = output_text.split("```")[1].split("```")[0].strip()
        else:
            json_str = output_text.strip()
            
        # 2. Parse JSON
        data = json.loads(json_str)
        
        # 3. Scale Coordinates
        scaled_results = []
        for item in data:
            if "bbox_2d" in item:
                # Get the normalized 0-1000 coordinates
                # Format is [xmin, ymin, xmax, ymax]
                norm_box = item["bbox_2d"]
                
                # Apply the scaling formula
                abs_box = [
                    (norm_box[0] / 1000.0) * img_width,   # xmin
                    (norm_box[1] / 1000.0) * img_height,  # ymin
                    (norm_box[2] / 1000.0) * img_width,   # xmax
                    (norm_box[3] / 1000.0) * img_height   # ymax
                ]
                
                # Create a new item with the scaled box
                new_item = item.copy()
                new_item["bbox_2d"] = [int(x) for x in abs_box] # Optional: Convert to int for cleaner pixels
                scaled_results.append(new_item)
                
        return scaled_results

    except (json.JSONDecodeError, IndexError, KeyError) as e:
        print(f"Parsing Error: {e}")
        # Return empty list on failure so the pipeline doesn't crash
        return []


In [11]:
import numpy as np

def calculate_iou(boxA, boxB):
    # Standard IoU calculation
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    interArea = max(0, xB - xA) * max(0, yB - yA)
    boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])

    return interArea / float(boxAArea + boxBArea - interArea + 1e-6)

def match_predictions_to_ground_truth(pred_boxes, gt_boxes, iou_threshold=0.5):
    """
    Matches predictions to ground truths using greedy IoU strategy.
    
    Args:
        pred_boxes (list): List of [xmin, ymin, xmax, ymax]
        gt_boxes (list):   List of [xmin, ymin, xmax, ymax]
        iou_threshold (float): Minimum IoU to consider a match valid
        
    Returns:
        matches (list): List of dicts {'pred': box, 'gt': box, 'iou': float}
        unmatched_preds (list): List of pred_boxes that matched nothing
        unmatched_gts (list): List of gt_boxes that were missed
    """
    matches = []
    # pred_boxes = [box['bbox_2d'] for box in pred_boxes]
    
    # Keep track of which indices have been matched
    matched_pred_indices = set()
    matched_gt_indices = set()
    
    # 1. Calculate IoU for ALL pairs
    # Format: (iou, pred_index, gt_index)
    all_pairs = []
    for i, p_box in enumerate(pred_boxes):
        for j, g_box in enumerate(gt_boxes):
            iou = calculate_iou(p_box, g_box)
            if iou > 0.0: # Only consider pairs that overlap at least a little
                all_pairs.append((iou, i, j))
    
    # 2. Sort pairs by IoU (Highest first)
    all_pairs.sort(key=lambda x: x[0], reverse=True)
    
    # 3. Greedy Matching
    for iou, p_idx, g_idx in all_pairs:
        if p_idx not in matched_pred_indices and g_idx not in matched_gt_indices:
            # Found the best remaining match!
            if iou >= iou_threshold:
                matches.append({
                    'pred': pred_boxes[p_idx],
                    'gt': gt_boxes[g_idx],
                    'iou': iou
                })
                matched_pred_indices.add(p_idx)
                matched_gt_indices.add(g_idx)
    
    # 4. Gather leftovers
    unmatched_preds = [p for i, p in enumerate(pred_boxes) if i not in matched_pred_indices]
    unmatched_gts = [g for i, g in enumerate(gt_boxes) if i not in matched_gt_indices]
    
    return matches, unmatched_preds, unmatched_gts

### Running the inference test on single image

In [12]:
from PIL import Image
prompt = "Detect all signatures and return their locations and labels in the form of coordinates. "
for item in test_json:
    image_path = item['image_path']
    Ground_T = parse_ground_truth(item['groundTruth'])
    image = Image.open(image_path).convert('RGB')
    output_text, height, width = zero_shot_inference(model, tokenizer, image, prompt)
    height, width = int(height), int(width)
    #print(f"Output Text: {output_text}")
    #parsing of output
    bbox_data_predict =parse_and_scale_boxes(output_text, image.width, image.height)
    pred_boxes = [x['bbox_2d'] for x in bbox_data_predict if 'bbox_2d' in x]
    print(f"Predicted Bounding Boxes: {pred_boxes}")
    matches, false_positives, misses = match_predictions_to_ground_truth(pred_boxes, Ground_T, iou_threshold=0.1)
    for match in matches:
        # Use your detailed metric function here
        metrics = evaluate_detection(match['pred'], match['gt'], img_width=width, img_height=height)
        
        # Store these specific metrics
        # e.g., results.append(metrics)
        print(f"Matched with IoU: {metrics['iou']}, IoP: {metrics['iop']}, Center Dist: {metrics['center_dist_px']}px, Normalized Dist: {metrics['norm_center_dist']}")

    print(f"False Positives (Hallucinations): {len(false_positives)}")
    print(f"Missed Signatures: {len(misses)}")   
    plot_bounding_boxes(image, bbox_data_predict, image.height, image.width, Ground_T)
    
    break #single image

Keyword arguments {'images': [<PIL.Image.Image image mode=RGB size=644x644 at 0x7FA79A27BCA0>], 'videos': None} not recognized.


AttributeError: 'InternVLModel' object has no attribute 'generate'

In [13]:
import torch

def zero_shot_inference_internvl(model, tokenizer, image, prompt, max_num=12):
    """
    Adapts your inference logic for InternVL models.
    Note: InternVL typically uses a direct 'model.chat' method.
    """
    # 1. Preprocess the image using the dynamic tiling logic
    # This uses the load_image logic we discussed earlier
    pixel_values = load_image_from_pil(image, max_num=max_num).to(torch.bfloat16).cuda()
    
    # 2. Define generation config
    generation_config = dict(
        max_new_tokens=1024,
        do_sample=True,
    )

    # 3. InternVL Chat Interface
    # The <image> token must be in the prompt for the model to 'see' the pixel_values
    full_prompt = f'<image>\n{prompt}'
    
    # model.chat handles the template, tokenization, and decoding internally
    response = model.chat(
        tokenizer, 
        pixel_values, 
        full_prompt, 
        generation_config
    )

    # 4. Calculate Input Height/Width 
    # InternVL scales images into tiles of 448x448.
    # We can derive the 'canvas' size the model saw from the pixel_values shape.
    # pixel_values shape is [num_tiles, 3, 448, 448]
    num_tiles = pixel_values.size(0)
    
    # To get the exact resolution the model processed:
    # We look at the target aspect ratio used during dynamic_preprocess
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height
    target_ratios = set(
        (i, j) for n in range(1, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= 1)
    
    # This helper gets the grid (e.g., 2x3 tiles)
    from .your_script import find_closest_aspect_ratio # Assuming you kept the helper
    target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_width, orig_height, 448)
    
    input_width = target_aspect_ratio[0] * 448
    input_height = target_aspect_ratio[1] * 448

    return response, input_height, input_width

def load_image_from_pil(image, input_size=448, max_num=12):
    """Helper to process a PIL image directly instead of a file path"""
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(img) for img in images]
    return torch.stack(pixel_values)

### Running the inference test images

In [None]:
import numpy as np
import json
from PIL import Image
from tqdm import tqdm

# --- 1. Initialize Accumulators ---
metrics_summary = {
    "iou": [],
    "iop": [],
    "norm_dist": [],
    "false_positives": 0,
    "missed_signatures": 0,
    "total_images": 0
}

prompt = "Detect all signatures and return their locations and labels in the form of coordinates. "

print(f">> Starting evaluation on {len(test_json)} images...")

# --- 2. Main Loop ---
for item in tqdm(test_json):
    image_path = item['image_path']
    
    # Load GT and Image
    try:
        Ground_T = parse_ground_truth(item['groundTruth']) # Ensure this returns list of [x1, y1, x2, y2]
        image = Image.open(image_path).convert('RGB')
        metrics_summary["total_images"] += 1
    except Exception as e:
        print(f"Skipping {image_path}: {e}")
        continue

    # Inference
    try:
        output_text, height, width = zero_shot_inference(model, processor, image, prompt)
                    
        bbox_data_predict = parse_and_scale_boxes(output_text, image.width, image.height)

        
        # Extract just the boxes for matching [x1, y1, x2, y2]
        # Assuming bbox_data_predict is list of dicts: [{'bbox_2d': [...], ...}]
        pred_boxes = [x['bbox_2d'] for x in bbox_data_predict if 'bbox_2d' in x]

    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        pred_boxes = []

    # Match Predictions to Ground Truth
    matches, false_positives, misses = match_predictions_to_ground_truth(pred_boxes, Ground_T, iou_threshold=0.1)
    
    # Update Totals for FP / FN
    metrics_summary["false_positives"] += len(false_positives)
    metrics_summary["missed_signatures"] += len(misses)

    # Collect Metrics for Matches
    for match in matches:
        # Evaluate using the normalized distance metric
        m = evaluate_detection(match['pred'], match['gt'], img_width=width, img_height=height)
        
        metrics_summary["iou"].append(m['iou'])
        metrics_summary["iop"].append(m['iop'])
        metrics_summary["norm_dist"].append(m['norm_center_dist'])

# --- 3. Calculate Averages ---
total_matches = len(metrics_summary["iou"])

if total_matches > 0:
    avg_iou = sum(metrics_summary["iou"]) / total_matches
    avg_iop = sum(metrics_summary["iop"]) / total_matches
    avg_norm_dist = sum(metrics_summary["norm_dist"]) / total_matches
else:
    avg_iou = avg_iop = avg_norm_dist = 0.0

# --- 4. Final Report ---
print("\n" + "="*50)
print(f" FINAL EVALUATION REPORT ({metrics_summary['total_images']} Images)")
print("="*50)
print(f"Total Matches Found:      {total_matches}")
print(f"Total Missed Signatures:  {metrics_summary['missed_signatures']}")
print(f"Total False Positives:    {metrics_summary['false_positives']}")
print("-" * 50)
print(f"Mean IoU (Overlap):             {avg_iou:.4f}")
print(f"Mean IoP (Tightness/Precision): {avg_iop:.4f}")
print(f"Mean Normalized Center Error:   {avg_norm_dist:.4f} ({(avg_norm_dist*100):.2f}% of image diagonal)")
print("="*50)

>> Starting evaluation on 257 images...


100%|██████████| 257/257 [10:58<00:00,  2.56s/it]


 FINAL EVALUATION REPORT (257 Images)
Total Matches Found:      277
Total Missed Signatures:  43
Total False Positives:    34
--------------------------------------------------
Mean IoU (Overlap):             0.5738
Mean IoP (Tightness/Precision): 0.7272
Mean Normalized Center Error:   0.0213 (2.13% of image diagonal)



