In [1]:
import numpy as np
import torch
import os
from tqdm import tqdm
import cv2
import matplotlib.pyplot as plt
from torch.utils.data import Dataset # Direct import is fine

from gluefactory.settings import DATA_PATH # Assuming this is where datasets are stored
from gluefactory.utils.image import ImagePreprocessor, load_image # Use Glue Factory's image utils
from gluefactory.utils.equirectangular_utils import equirectangular_to_dicemap # Import equirectangular utils
from gluefactory.utils.spherical_utils import standard_spherical_to_pixel, rotate_image, rotate_keypoints, spherical_to_cartesian # Import spherical utils
from gluefactory.utils.xfeat_utils import generate_keypoints # Import xfeat utils
from gluefactory.datasets.base_dataset import BaseDataset # Crucial import

loading weights from: /mnt/d/code/accelerated_features/modules/../weights/xfeat_perm_steer.pth


In [2]:
# --- Run configuration ---------------------------------------------------
# Batch size; adjust based on your GPU memory.
BATCH_SIZE = 8

# Prefer CUDA when torch reports it as available, otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

# Dataset locations.
IMG_DIR = "/mnt/d/code/glue-factory/data/pretraining/images"
OUTPUT_DIR = "/mnt/d/code/glue-factory/data/pretraining/pairs_batched"

# Make sure the output directory exists (no error if it is already there).
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# --- Geometric augmentation ----------------------------------------------
# Half-widths, in degrees, of the rotation ranges (yaw, pitch, roll).
aug_rot_yaw_range, aug_rot_pitch_range, aug_rot_roll_range = 180.0, 90.0, 180.0

# --- Photometric augmentation --------------------------------------------
# Probability of applying photometric augmentations to a view.
photo_aug_prob = 0.8
# Brightness offset range, e.g. adds a value in [-0.3, 0.3].
photo_brightness_range = 0.3
# Contrast factor range: multiplies by a value in [0.5, 1.5].
photo_contrast_range = (0.5, 1.5)
# Range of the standard deviation of the additive Gaussian noise.
photo_gaussian_noise_std_range = (0.0, 0.05)

# --- Keypoint parameters -------------------------------------------------
# Angular tolerance (degrees) for ground-truth matches.
angle_threshold_degrees = 0.5
# Number of keypoints requested per view.
num_keypoints = 2048

# --- Pairs per image -----------------------------------------------------
num_pairs_per_image = 5

In [4]:
def _apply_photometric_augmentations(img_np):
    """Applies a chain of random photometric augmentations to a numpy image."""
    
    # Apply augmentations with a certain probability
    if np.random.rand() > photo_aug_prob:
        return img_np
    
    # --- 1. Brightness & Contrast Adjustment ---
    # A common and effective way is alpha-beta correction: g(x) = alpha*f(x) + beta
    # alpha controls contrast, beta controls brightness.
    
    # Randomly sample contrast factor (alpha)
    alpha = np.random.uniform(photo_contrast_range[0], photo_contrast_range[1])
    # Randomly sample brightness factor (beta)
    beta = np.random.uniform(-photo_brightness_range, photo_brightness_range)
    
    # Apply the transformation
    img_aug = img_np * alpha + beta
    
    # We must clip the values to be in the valid [0, 1] range
    img_aug = np.clip(img_aug, 0.0, 1.0)

    # --- 2. Gaussian Noise ---
    # Add random noise to simulate sensor noise
    
    # Randomly sample the standard deviation of the noise
    noise_std = np.random.uniform(photo_gaussian_noise_std_range[0], photo_gaussian_noise_std_range[1])
    if noise_std > 0:
        # Generate noise with the same shape as the image
        gaussian_noise = np.random.normal(loc=0.0, scale=noise_std, size=img_aug.shape)
        # Add noise to the image
        img_aug = img_aug + gaussian_noise
        
        # Clip again to ensure values are valid
        img_aug = np.clip(img_aug, 0.0, 1.0)
        
    # --- 3. (Optional but Recommended) Gaussian Blur ---
    # Simulates motion blur or out-of-focus cameras
    if np.random.rand() < 0.5: # Apply blur with 50% probability
        # Kernel size must be odd
        sigma = np.random.uniform(0.5, 1.5)
        ksize = int(2 * np.ceil(2 * sigma) + 1)
        img_aug = cv2.GaussianBlur(img_aug, (ksize, ksize), sigmaX=sigma)
        
    return img_aug

def _generate_groundtruth_correspondences(view1, view2, angle_threshold_degrees=3.0):
    """Generates ground truth match between two views keypoints"""

    # Convert keypoints from view1 to of view2 and then compare them to a threshould
    # using information of image rotation from org_view -> view1 and org_view -> view2

    keypoints1 = view1[1][0] # Xfeat Keypoints from view1
    keypoints2 = view2[1][0] # Xfeat keypoints from view2

    N = len(keypoints1)
    M = len(keypoints2)

    # Initialize the outputs
    matches_view1_to_view2 = np.full(N, -1, dtype=int)
    matches_view2_to_view1 = np.full(M, -1, dtype=int)
    matches_pairs = np.empty((0, 2), dtype=int)

    if N == 0 or M == 0: # In case no keypoints detected
        return {
            'matches': matches_pairs,
            'gt_matches0': matches_view1_to_view2,
            'gt_matches1': matches_view2_to_view1
        }

    yaw1, pitch1, roll1 = view1[2] # img rotationg angles from org_view -> view1
    yaw2, pitch2, roll2 = view2[2] # img rotationg angles from org_view -> view2

    # 1. inverse rotation as moving keypoints from view1 -> org_view        
    kpts1_to_kpts = rotate_keypoints(keypoints1, yaw1, pitch1, roll1, inverse=True)
    # direct rotaion as moving keypoints from org_view -> view2
    kpts_to_kpts2 = rotate_keypoints(kpts1_to_kpts, yaw2, pitch2, roll2)

    # 2. Convert to 3D cartesian coordinate for distance calculatoin
    # These are unit vectors on a sphere
    kpts_to_kpts2_xyz = spherical_to_cartesian(kpts_to_kpts2)
    keypoints2_xyz = spherical_to_cartesian(keypoints2)

    # 3. Create a pairwise distance matrix (N x M)
    # we use dot production. Since unit vectors, dot(a, b) = cos(angle) -> angle = arccos(dot(a, b))
    # The einsum computes the dot product for every pair.
    similarity_matrix = np.einsum('ik,jk->ij', kpts_to_kpts2_xyz, keypoints2_xyz)

    # Clip the values to avoid numerical errors with arccos
    similarity_matrix = np.clip(similarity_matrix, -1.0, 1.0)

    # The distance is the angle in degrees
    angular_distance_matrix = np.rad2deg(np.arccos(similarity_matrix))

    # 4. Search for nearest neighbors in both directions
    # For each kpt in view1, find its nearest neighbour in view2
    best_match_for_kpt1 = np.argmin(angular_distance_matrix, axis=1) # Shape: (N,)
    min_distances_for_kpt1 = np.min(angular_distance_matrix, axis=1)

    # For each kpt in view2, find its nearest neighbour in view1
    best_match_for_kpt2 = np.argmin(angular_distance_matrix, axis=0) # Shape: (M,)
    min_distances_for_kpt2 = np.min(angular_distance_matrix, axis=0)

    # 5. Apply the tolerance threshold
    # Find all keypoints in view1 that have a match within the threshold
    valid_matches_mask = min_distances_for_kpt1 < angle_threshold_degrees

    # 6. Enforce mutual consistency (the "Mututual Nearest Neighbor" check --- Lighglue Paper)
    # We create an index array [0, 1, 2, ..., N-1]
    kpt1_indices = np.arange(N)

    # We check if the best match for kpt1's best matach is kpt1 it    # Example: If kpt1[i]'s best match is kpt2[j], we then check if kpt2[j]'s best match is kpt1[i].
    mutual_matches_mask = (best_match_for_kpt2[best_match_for_kpt1] == kpt1_indices)

    # The final mask combines the distance threshold and the mutual check
    final_mask = valid_matches_mask & mutual_matches_mask

    # Create the final list of matches
    # This will be an array of shape (num_matches, 2), where each row is (idx_kpt1, idx_kpt2)
    matches = np.stack([kpt1_indices[final_mask], best_match_for_kpt1[final_mask]], axis=1)

    # Get the indices of keypoints in view1 that have a valid mutual match
    matched_kpt1_indices = kpt1_indices[final_mask]

    # Get the corresponding matched keypoint indices in view2
    matched_kpt2_indices = best_match_for_kpt1[final_mask]

    # 1. Populate the original [num_matches, 2] array
    matches_pairs = np.stack([matched_kpt1_indices, matched_kpt2_indices], axis=1)

    # 2. Populate the gt_matches0 array
    matches_view1_to_view2[matched_kpt1_indices] = matched_kpt2_indices

    # 3. Populate the gt_matches1 array (the inverse mapping)
    matches_view2_to_view1[matched_kpt2_indices] = matched_kpt1_indices

    return {
        'matches': matches_pairs,
        'gt_matches0': matches_view1_to_view2,
        'gt_matches1': matches_view2_to_view1
    }

def _generate_views(img_raw):
    """Build one randomly rotated, photometrically augmented view of an image.

    Returns a tuple ``(image, kpt_data, (yaw, pitch, roll))`` where ``image``
    is the augmented view as a (C, H, W) tensor, ``kpt_data`` is whatever
    ``generate_keypoints`` returns (tuple: keypoint coords, descriptors,
    scores) and the angles (degrees) are the rotation that was applied.
    """
    # Sample yaw, pitch and roll (in that order) uniformly from their
    # symmetric ranges, in degrees.
    yaw, pitch, roll = (
        np.random.uniform(-limit, limit)
        for limit in (aug_rot_yaw_range, aug_rot_pitch_range, aug_rot_roll_range)
    )
    rotation = (yaw, pitch, roll)

    # Geometric rotation first, then the photometric augmentation chain.
    img_view = _apply_photometric_augmentations(
        rotate_image(img_raw, yaw, pitch, roll)
    )

    # xfeat runs on the dicemap layout; per the original note it converts its
    # output back to the equirectangular format internally.
    kpt_data = generate_keypoints(
        equirectangular_to_dicemap(img_view), num_keypoints=num_keypoints
    )

    # Channel-last array -> channel-first (C, H, W) tensor.
    image_chw = torch.tensor(img_view.transpose(2, 0, 1))

    return (image_chw, kpt_data, rotation)

In [None]:
# Generate `num_pairs_per_image` random view pairs for every real-scene image
# and store keypoints, descriptors, scores and ground-truth matches per pair.
image_path = IMG_DIR  # same directory as before, taken from the config cell

# NOTE(review): the config cell defines OUTPUT_DIR (.../pairs_batched), but this
# cell has always written to .../pairs. The location is kept unchanged here;
# confirm which one is intended.
pairs_dir = "/mnt/d/code/glue-factory/data/pretraining/pairs"
# np.savez does not create missing directories, so make sure it exists.
os.makedirs(pairs_dir, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort for a stable run order.
image_files = sorted(os.listdir(image_path))

# Scene-name prefixes (text before the first '_') that are processed.
real = ['berlinStreet', 'church', 'corridors', 'meetingRoom1', 'meetingRoom2', 'stadium', 'townSquare', 'trainStation', 'uni' ]

for img in tqdm(image_files, desc="Processing Images", leave=True, colour='green'):
    img_name = img.split('_')[0]
    if img_name not in real:
        continue

    # Read image
    img_raw_torch = load_image(os.path.join(image_path, img))
    # Strip only the final extension so names containing extra dots stay unique.
    base_name = os.path.splitext(img)[0]

    for i in range(num_pairs_per_image):
        # Generate two independent random views of the same image
        view0 = _generate_views(img_raw_torch)
        view1 = _generate_views(img_raw_torch)

        # Generate groundtruth matches
        gt_data = _generate_groundtruth_correspondences(view0, view1, angle_threshold_degrees=angle_threshold_degrees)

        name = f"{base_name}_{i}"
        # Save to npz file; index arrays are stored as int64.
        np.savez(
            os.path.join(pairs_dir, f"{name}.npz"),
            keypoints0=view0[1][0],
            descriptors0=view0[1][1],
            scores0=view0[1][2],
            # Last two image dims reversed, i.e. (W, H) for a (C, H, W) image.
            image_size0=np.asarray(tuple(view0[0].shape[-2:][::-1]), dtype=np.int64),
            yaw_pitch_roll_0=view0[2],
            keypoints1=view1[1][0],
            descriptors1=view1[1][1],
            scores1=view1[1][2],
            image_size1=np.asarray(tuple(view1[0].shape[-2:][::-1]), dtype=np.int64),
            yaw_pitch_roll_1=view1[2],
            matches=gt_data['matches'].astype(np.int64),  # [num_matches, 2] format, if needed elsewhere
            gt_matches0=gt_data['gt_matches0'].astype(np.int64),
            gt_matches1=gt_data['gt_matches1'].astype(np.int64),
            name=f"{name}"  # Base image name plus pair index
        )