In [9]:
import os
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

# Module-level lock held around print() calls in this file so that each
# progress/error line is written as a whole.
print_lock = Lock()

def validate_npz_file(fpath):
    """Validate a single NPZ file and remove it if its content is corrupted.

    A file is considered valid when ``np.load`` can open it, it contains
    every required key, and every required array can actually be read.

    Files are deleted only when their *content* fails validation. Failures
    that come from the environment (``OSError``: missing path, permission
    denied, too many open files, I/O error, ...) are reported but the file is
    left on disk, because such errors say nothing about the file's content.

    Args:
        fpath: Path to the NPZ file to check.

    Returns:
        A ``(fname, error, was_removed)`` tuple:
        ``fname`` is the base name of ``fpath``; ``error`` is ``None`` for a
        valid file and a non-empty message otherwise; ``was_removed`` is
        ``True`` only when the file was deleted.
    """
    fname = os.path.basename(fpath)

    # Required keys that must be present in the NPZ file
    required_keys = {
        'keypoints0', 'descriptors0', 'scores0', 'image_size0',
        'keypoints1', 'descriptors1', 'scores1', 'image_size1',
        'matches', 'gt_matches0', 'gt_matches1'
    }

    try:
        with np.load(fpath) as data:
            missing_keys = required_keys - set(data.files)
            if missing_keys:
                raise ValueError(f"Missing required keys: {missing_keys}")

            # Read every required array so unreadable entries are caught,
            # not just missing names.
            for key in required_keys:
                _ = data[key]
    except OSError as e:
        # Environmental failure: report it, but never delete the file.
        return fname, str(e) or type(e).__name__, False
    except Exception as e:
        # Anything else means the content could not be parsed or validated.
        # Fall back to the exception class name so the message is never
        # empty (callers treat a missing message as "no error").
        reason = str(e) or type(e).__name__
        try:
            os.remove(fpath)
        except Exception as remove_error:
            # Kept broad on purpose: this function must not raise into the
            # worker pool, whatever went wrong during removal.
            return fname, f"{reason} | Failed to remove: {remove_error}", False
        return fname, reason, True

    return fname, None, False

def process_files_multithreaded(data_dir, max_workers=None):
    """
    Validate every NPZ file in a directory using a thread pool.

    Each entry of ``data_dir`` whose name ends with ``.npz`` is passed to
    ``validate_npz_file``, which may delete files it considers corrupted.
    Errors, periodic progress, and a final summary are printed to stdout.

    Args:
        data_dir: Directory containing NPZ files
        max_workers: Maximum number of threads (None = let
            ThreadPoolExecutor pick its default)

    Returns:
        None. Results are reported via print() only.
    """
    # Get all NPZ files
    npz_files = [
        os.path.join(data_dir, fname)
        for fname in os.listdir(data_dir)
        if fname.endswith('.npz')
    ]
    total = len(npz_files)

    print(f"Found {total} NPZ files to process")
    print(f"Using {max_workers or 'default'} worker threads\n")

    completed = 0
    errors = 0
    removed = 0

    # Process files in parallel
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(validate_npz_file, fpath)
            for fpath in npz_files
        ]

        # Handle results in completion order, not submission order
        for future in as_completed(futures):
            fname, error, was_removed = future.result()
            completed += 1

            # `error` is None only for a valid file. Compare against None
            # instead of relying on truthiness so that a failure carrying an
            # empty message is still counted and reported.
            if error is not None:
                errors += 1
                if was_removed:
                    removed += 1
                    with print_lock:
                        print(f'Error with {fname}: {error} [REMOVED]')
                else:
                    with print_lock:
                        print(f'Error with {fname}: {error}')

            # Print progress every 100 files
            if completed % 100 == 0:
                with print_lock:
                    print(f"Progress: {completed}/{total} files processed")

    print(f"\nCompleted! Processed {completed} files with {errors} errors")
    print(f"Removed {removed} corrupted files")

if __name__ == '__main__':
    # Directory whose *.npz files will be validated (and removed if corrupted).
    data_dir = '/data/code/glue-factory/data/finetuning/finetuning_pairs_spherecraft'

    # max_workers caps the size of the thread pool. Pass None to let
    # ThreadPoolExecutor choose its own default, or any positive integer
    # (e.g. 8) to set it explicitly.
    process_files_multithreaded(data_dir=data_dir, max_workers=10)

Found 88 NPZ files to process
Using 10 worker threads

Error with berlin_00000029_00000074.npz: Missing required keys: {'scores1', 'matches', 'gt_matches0', 'descriptors1', 'image_size1', 'image_size0', 'keypoints1', 'scores0', 'gt_matches1'} [REMOVED]
Error with berlin_00000027_00000205.npz: Missing required keys: {'scores1', 'matches', 'gt_matches0', 'descriptors1', 'image_size1', 'image_size0', 'keypoints1', 'scores0', 'gt_matches1'} [REMOVED]
Error with berlin_00000088_00000149.npz: Missing required keys: {'gt_matches1'} [REMOVED]
Error with berlin_00000196_00000221.npz: Missing required keys: {'scores1', 'matches', 'gt_matches0', 'descriptors1', 'image_size1', 'image_size0', 'keypoints1', 'scores0', 'gt_matches1'} [REMOVED]
Error with berlin_00000207_00000273.npz: Missing required keys: {'scores1', 'matches', 'gt_matches0', 'descriptors1', 'image_size1', 'image_size0', 'keypoints1', 'scores0', 'gt_matches1'} [REMOVED]
Error with berlin_00000209_00000222.npz: Missing required keys: