In [None]:
import numpy as np
import random

def create_npz_subset(input_path, output_path, subset_size, random_seed=None):
    """Save a random subset of the samples in an NPZ archive to a new archive.

    The same randomly chosen indices (drawn without replacement) are applied
    along the first axis of every array in the archive, so row ``i`` of each
    output array comes from the same source row.

    Args:
        input_path: Path of the NPZ archive to read.
        output_path: Path of the compressed NPZ archive to write.
        subset_size: Number of samples to keep.
        random_seed: If not None, seeds both ``random`` and ``np.random``
            (global state) before sampling so the selection is reproducible.

    Raises:
        ValueError: If the archive holds no arrays, an array is 0-dimensional,
            the arrays disagree on their first-axis length, or ``subset_size``
            is negative or larger than the number of samples.
    """
    # Indexing an NpzFile materializes the array, so the archive can be
    # closed before any further processing.
    with np.load(input_path) as data:
        arrays = {key: data[key] for key in data.files}

    if not arrays:
        raise ValueError(f"No arrays found in {input_path}")

    scalar_keys = [key for key, array in arrays.items() if array.ndim == 0]
    if scalar_keys:
        raise ValueError(
            f"Cannot subset 0-dimensional arrays: {scalar_keys}"
        )

    # Every array must have the same number of samples; otherwise shared
    # indices would either silently skip rows of longer arrays or fail with
    # an IndexError on shorter ones.
    lengths = {key: array.shape[0] for key, array in arrays.items()}
    if len(set(lengths.values())) > 1:
        raise ValueError(
            f"Arrays have inconsistent sample counts: {lengths}"
        )
    total_samples = next(iter(lengths.values()))

    if subset_size < 0:
        raise ValueError(f"Subset size ({subset_size}) cannot be negative")
    if subset_size > total_samples:
        raise ValueError(f"Subset size ({subset_size}) cannot be larger than total samples ({total_samples})")

    # The global generators are seeded (rather than a private one) to keep
    # the reseeding side effect existing callers may rely on.
    if random_seed is not None:
        random.seed(random_seed)
        np.random.seed(random_seed)

    # Sample without replacement; an explicit integer dtype keeps an empty
    # selection valid as an index array.
    indices = np.asarray(
        random.sample(range(total_samples), subset_size), dtype=np.intp
    )

    subset_arrays = {key: array[indices] for key, array in arrays.items()}

    np.savez_compressed(output_path, **subset_arrays)
    print(f"Successfully created subset with {subset_size} samples at {output_path}")

# Extract 30 randomly chosen samples from the full dataset into a small
# sample file; the fixed seed makes the selection repeatable.
create_npz_subset(
    input_path='data/scaled_dataset_full.npz',
    output_path='data/sample_data.npz',
    subset_size=30,
    random_seed=42,
)

Successfully created subset with 30 samples at data/sample_data.npz
