In [2]:
import numpy as np       
import pandas as pd      
import os                
from tqdm.notebook import tqdm  
import gc                
from copy import deepcopy 
import sys               

# Custom modules from project structure
# Import individual modules to prevent circular dependency issues
from scripts.utils import dataset, camera, image, metric, submission
from scripts.features import extraction, clustering

In [3]:
# ---------------------------------------------------------------------------
# Notebook configuration: every tunable value is collected in this cell.
# NOTE(review): of the constants below, only the path constants,
# FEATURE_EXTRACTOR_TYPE, MATCHER_TYPE, SIFT_NFEATURES, CLUSTERING_ALGORITHM and
# MIN_CLUSTER_SIZE are referenced by the notebook cells visible here - confirm
# how (or whether) the `scripts` package picks up the remaining values.
# ---------------------------------------------------------------------------

# Paths: dataset root, its train split, and the submission CSV to be written
DATA_DIR = "../data/image-matching-challenge-2025"
TRAIN_DIR = os.path.join(DATA_DIR, "train")
OUTPUT_FILE = "output_graph.csv"  # submission CSV holding scene clusters and camera poses

# Feature extraction: keypoint detection and descriptor computation
FEATURE_EXTRACTOR_TYPE = "SIFT"  # Scale-Invariant Feature Transform
SIFT_NFEATURES = 8000  # feature budget per image; larger = more detail, more memory/time

# Matching: how descriptors are paired between two images
MATCHER_TYPE = "FLANN"  # Fast Library for Approximate Nearest Neighbors
LOWE_RATIO_TEST_THRESHOLD = 0.8  # ratio-test cutoff; smaller is stricter
MIN_INLIER_MATCHES_INITIAL = 15  # matches needed before geometric verification
MIN_INLIER_MATCHES_GRAPH = 10  # matches needed to add an edge to the view graph

# Geometric verification: RANSAC outlier rejection for the fundamental matrix
RANSAC_THRESHOLD = 1.5  # inlier distance limit, in pixels

# Clustering: grouping of images into scenes
CLUSTERING_ALGORITHM = "ConnectedComponents"  # group by view-graph connectivity
MIN_CLUSTER_SIZE = 3  # smaller groups are not treated as a scene

# Structure from Motion (SfM): 3D reconstruction and camera pose estimation
MIN_VIEWS_FOR_TRIANGULATION = 2  # views required to triangulate one 3D point
PNP_RANSAC_THRESHOLD = 5.0  # PnP reprojection-error limit, in pixels
PNP_CONFIDENCE = 0.999  # RANSAC confidence for PnP
MIN_3D_POINTS_FOR_PNP = 6  # 3D-2D correspondences required before attempting PnP

# Camera model fallback for images without calibration
DEFAULT_FOCAL_LENGTH_FACTOR = 1.2  # focal length expressed as a factor of the image dimension

# Echo the key choices so the cell output records the configuration of this run
print(f"Constants defined. Using {FEATURE_EXTRACTOR_TYPE} features and {MATCHER_TYPE} matcher.")
print(f"Data Directory: {DATA_DIR}")

Constants defined. Using SIFT features and FLANN matcher.
Data Directory: ../data/image-matching-challenge-2025


In [4]:
# Read the dataset description found under DATA_DIR.
# `samples` is used below as a mapping: dataset name -> sized collection of
# per-image entries (later cells index it by dataset name and overwrite entries).
samples = dataset.load_dataset(DATA_DIR)

# Print one summary line per dataset with the number of entries loaded for it
for dataset_name, dataset_entries in samples.items():
    print(f'Dataset "{dataset_name}" -> num_images={len(dataset_entries)}')

Dataset "imc2023_haiper" -> num_images=54
Dataset "imc2023_heritage" -> num_images=209
Dataset "imc2023_theather_imc2024_church" -> num_images=76
Dataset "imc2024_dioscuri_baalshamin" -> num_images=138
Dataset "imc2024_lizard_pond" -> num_images=214
Dataset "pt_brandenburg_british_buckingham" -> num_images=225
Dataset "pt_piazzasanmarco_grandplace" -> num_images=168
Dataset "pt_sacrecoeur_trevi_tajmahal" -> num_images=225
Dataset "pt_stpeters_stpauls" -> num_images=200
Dataset "amy_gardens" -> num_images=200
Dataset "fbk_vineyard" -> num_images=163
Dataset "ETs" -> num_images=22
Dataset "stairs" -> num_images=51


In [None]:
# Import matching module separately to avoid circular import issues in dependency graph
from scripts.features import matching

def _store_prediction(prediction, cluster_label, rotation, translation):
    """Write a cluster label and an already-formatted pose onto one prediction object."""
    prediction.cluster_index = cluster_label
    # Copy so the stored values never alias objects owned by the pose helpers
    prediction.rotation = deepcopy(rotation)
    prediction.translation = deepcopy(translation)


def process_dataset(dataset_id, test_image_dir, predictions, extractor, matcher):
    """
    Run one dataset through the pipeline and write the results into `predictions`:
    1. Extract features from all images
    2. Build the view graph from pairwise feature matching
    3. Cluster images into scenes
    4. Mark outlier images (those not belonging to any cluster) with a null pose
    5. Estimate camera poses per cluster

    Args:
        dataset_id: Name/ID of the dataset being processed
        test_image_dir: Root directory containing dataset folders
        predictions: List of prediction objects for this dataset; each must expose
            `filename` and accept `cluster_index`, `rotation` and `translation`.
            The objects are updated in place.
        extractor: Initialized feature extractor object
        matcher: Initialized feature matcher object

    Returns:
        The same `predictions` list, with cluster assignments and poses filled in
        for every image reported by the clustering step. Predictions whose
        filename never appears in a cluster or in the outlier list are left
        untouched.

    Raises:
        KeyError: If the clustering step reports an image id that is not the
            `filename` of any entry in `predictions`.
    """
    print(f"\n--- Processing Dataset: {dataset_id} ---")

    # Image ids reported by the pipeline are looked up as prediction filenames
    filename_to_index = {p.filename: idx for idx, p in enumerate(predictions)}

    # STEP 1: Extract features; both returned dicts are indexed by image id below
    extracted_features, image_dims = extraction.load_and_extract_features_dataset(
        dataset_id, test_image_dir, extractor
    )
    image_ids_in_dataset = list(extracted_features.keys())

    # STEP 2: Build the view graph; pairwise_matches is keyed by (id1, id2) pairs
    G, pairwise_matches = clustering.build_view_graph(image_ids_in_dataset, extracted_features, matcher)

    # STEP 3: Split the graph into clusters of image ids plus a list of outlier ids
    clusters, outliers = clustering.cluster_images(
        G, algorithm=CLUSTERING_ALGORITHM, min_cluster_size=MIN_CLUSTER_SIZE
    )

    # STEP 4: Outliers get the "outliers" label and whatever format_pose yields for a missing pose
    print(f"Marking {len(outliers)} images as outliers.")
    for img_id in outliers:
        r, t = camera.format_pose(None, None)
        _store_prediction(predictions[filename_to_index[img_id]], "outliers", r, t)

    # STEP 5: Estimate poses cluster by cluster
    print(f"Running SfM for {len(clusters)} clusters...")
    for cluster_number, cluster_nodes in enumerate(clusters, start=1):
        cluster_label = f"cluster{cluster_number}"
        print(f"\nProcessing {cluster_label} ({len(cluster_nodes)} images)...")

        # Set view of the cluster: constant-time membership tests while filtering
        # the pairwise matches (the original scanned cluster_nodes for every pair)
        cluster_members = set(cluster_nodes)

        # Restrict the per-image data to this cluster before handing it to the pose step
        cluster_features = {
            img_id: extracted_features[img_id] for img_id in cluster_nodes if img_id in extracted_features
        }
        cluster_dims = {img_id: image_dims[img_id] for img_id in cluster_nodes if img_id in image_dims}

        # Keep only the matches whose two images both belong to this cluster
        cluster_pairwise_matches = {
            (id1, id2): matches
            for (id1, id2), matches in pairwise_matches.items()
            if id1 in cluster_members and id2 in cluster_members
        }

        cluster_poses = camera.estimate_poses_for_cluster(
            cluster_nodes,
            cluster_features,
            cluster_dims,
            matcher,
            cluster_pairwise_matches,
        )

        # Every cluster member gets the cluster label; images without an
        # estimated pose are formatted from (None, None)
        for img_id in cluster_nodes:
            R, T = cluster_poses.get(img_id, (None, None))
            r, t = camera.format_pose(R, T)
            _store_prediction(predictions[filename_to_index[img_id]], cluster_label, r, t)

        # Release the per-cluster data before moving on to the next cluster
        del cluster_members, cluster_features, cluster_dims, cluster_poses, cluster_pairwise_matches
        gc.collect()

    print(f"--- Finished Processing Dataset: {dataset_id} ---")
    return predictions

In [10]:
# --- Main Pipeline Execution ---
# Process train datasets so the results can be scored against ground truth
if os.path.isdir(TRAIN_DIR):
    # Every sub-directory of TRAIN_DIR is a candidate dataset
    train_dirs = [d for d in os.listdir(TRAIN_DIR) if os.path.isdir(os.path.join(TRAIN_DIR, d))]

    # A directory without an entry in `samples` has no predictions to fill in;
    # skip it with a notice instead of failing with a KeyError inside the loop
    unlabeled_dirs = [d for d in train_dirs if d not in samples]
    if unlabeled_dirs:
        print(f"Skipping directories with no entry in samples: {unlabeled_dirs}")
    train_datasets = [d for d in train_dirs if d in samples]

    # Smallest first, so problems surface before the larger datasets are started.
    # Size is the number of directory entries, used as a proxy for image count.
    train_datasets.sort(key=lambda name: len(os.listdir(os.path.join(TRAIN_DIR, name))))

    print("=== Processing Train Datasets ===")
    # Build the extractor and matcher from the configuration cell, so the values
    # printed there are the ones actually used (these were hard-coded before).
    # NOTE(review): the second get_matcher argument is assumed to be the feature
    # type, matching the original ('FLANN', 'SIFT') call - confirm.
    extractor = extraction.get_feature_extractor(FEATURE_EXTRACTOR_TYPE, SIFT_NFEATURES)
    matcher = matching.get_matcher(MATCHER_TYPE, FEATURE_EXTRACTOR_TYPE)

    # Process datasets (limited to the first one for demonstration/development)
    # Remove the slice [:1] to process all datasets
    for dataset_name in train_datasets[:1]:
        samples[dataset_name] = process_dataset(dataset_name, TRAIN_DIR, samples[dataset_name], extractor, matcher)
else:
    print("Train directory not found - check data path configuration")

=== Processing Train Datasets ===

--- Processing Dataset: ETs ---
Extracting features for 22 images in dataset ETs...


Features ETs: 100%|██████████| 22/22 [00:00<00:00, 43.19it/s]


Building view graph for 22 images...


Matching pairs: 100%|██████████| 231/231 [00:04<00:00, 52.50it/s]


View graph built with 22 nodes and 64 edges.
Clustering graph using ConnectedComponents...
Found 1 clusters and 3 potential outliers.
Marking 3 images as outliers.
Running SfM for 1 clusters...

Processing cluster1 (19 images)...
Initializing SfM with pair (another_et_another_et002.png, another_et_another_et001.png) with 484 matches.
Essential matrix inliers: 436
Triangulated 423 initial 3D points.
Attempting to register 17 remaining images...


Registering images: 100%|██████████| 17/17 [00:00<00:00, 495.32it/s]

Successfully registered image another_et_another_et005.png (113 PnP inliers).
Successfully registered image another_et_another_et004.png (109 PnP inliers).
Successfully registered image another_et_another_et003.png (57 PnP inliers).
Successfully registered image another_et_another_et006.png (38 PnP inliers).
Successfully registered image another_et_another_et007.png (20 PnP inliers).
Finished SfM for cluster. Registered 7 out of 19 images.
--- Finished Processing Dataset: ETs ---





In [None]:
# Generate the submission file from `samples`
# Expected format: dataset,image,cluster_index,rotation_matrix,translation_vector
submission.create_submission_file(samples, OUTPUT_FILE)

# Preview the first 10 raw lines to eyeball the format.
# Done in plain Python rather than `!head` so the cell does not depend on a
# POSIX shell utility being available on the machine running the notebook.
PREVIEW_LINES = 10
with open(OUTPUT_FILE, "r", encoding="utf-8", errors="replace") as submission_preview:
    for line_number, line in enumerate(submission_preview):
        if line_number >= PREVIEW_LINES:
            break
        print(line, end="")  # lines keep their own newline

In [None]:
# Score the submission written above against the training ground truth.
# The call returns an overall score plus a per-dataset breakdown.

# Inputs for the scorer, all located under DATA_DIR
train_labels_csv = os.path.join(DATA_DIR, "train_labels.csv")          # ground truth
train_thresholds_csv = os.path.join(DATA_DIR, "train_thresholds.csv")  # per-dataset thresholds

final_score, dataset_scores = metric.score(
    gt_csv=train_labels_csv,
    user_csv=OUTPUT_FILE,                 # our predictions
    thresholds_csv=train_thresholds_csv,
    mask_csv=None,                        # no mask file supplied
    inl_cf=0,                             # NOTE(review): meaning defined in scripts.utils.metric - confirm
    strict_cf=-1,                         # NOTE(review): meaning defined in scripts.utils.metric - confirm
    verbose=True,                         # request detailed output
)