# In [None]:
import re

import numpy as np
import pandas as pd

def sample_hours(yearly_data, total_hours=2000, method="random"):
    """
    Extract 2000 hours from hourly data of 1 year while preserving diurnal and seasonal cycles.
    
    Parameters:
    - yearly_data (pd.DataFrame): DataFrame with a DateTime index covering 1 full year of hourly data.
    - total_hours (int): Number of hours to sample (default: 2000).
    - method (str): "random" (random selection) or "systematic" (fixed interval).
    
    Returns:
    - list: Selected timestamps in YYYYMMDDhhmmss format.
    """
    # Ensure full year data (8760 hours for non-leap year)
    assert len(yearly_data) >= 8760, "Input data must contain at least 1 year of hourly values."
    
    # Define seasons
    seasons = {
        "winter": [12, 1, 2],  # Dec, Jan, Feb
        "spring": [3, 4, 5],   # Mar, Apr, May
        "summer": [6, 7, 8],   # Jun, Jul, Aug
        "autumn": [9, 10, 11]  # Sep, Oct, Nov
    }
    
    # Allocate hours per season
    hours_per_season = total_hours // len(seasons)
    
    sampled_indices = []
    
    for season, months in seasons.items():
        # Filter data for current season
        seasonal_data = yearly_data[yearly_data.index.month.isin(months)]
        
        # Ensure diurnal cycle representation (select across 24-hour periods)
        grouped_by_hour = [seasonal_data[seasonal_data.index.hour == h] for h in range(24)]
        
        # Select evenly distributed hours
        if method == "random":
            sampled_hours = [g.sample(n=hours_per_season//24, random_state=42) for g in grouped_by_hour]
        elif method == "systematic":
            sampled_hours = [g.iloc[::max(1, len(g)//(hours_per_season//24))] for g in grouped_by_hour]
        else:
            raise ValueError("Invalid method. Choose 'random' or 'systematic'.")
        
        # Store selected indices
        sampled_indices.extend(pd.concat(sampled_hours).index)
    
    # Convert timestamps to YYYYMMDDhhmmss format
    selected_timestamps = [ts.strftime("%Y%m%d%H%M%S") for ts in sorted(sampled_indices)]
    
    return selected_timestamps


def filter_files_by_timestamps(file_list, selected_timestamps):
    """
    Filters a list of filenames based on the selected timestamps.

    The first run of 14 consecutive digits found in each entry is taken as its
    YYYYMMDDhhmmss timestamp; entries without such a run are dropped.

    Parameters:
    - file_list (list): List of filenames with format YYYYMMDDhhmmss.
    - selected_timestamps (list): List of selected timestamps to filter the files.

    Returns:
    - list: List of filenames that match the selected timestamps, in the same
      order as they appear in ``file_list``.
    """
    # Create a set of selected timestamps for fast lookup
    selected_set = set(selected_timestamps)

    # Use regex to extract the timestamp part from the filenames
    timestamp_pattern = re.compile(r'(\d{8}\d{6})')  # Matches YYYYMMDDhhmmss

    # Filter files based on matching timestamp (one regex search per file)
    selected_files = []
    for file in file_list:
        match = timestamp_pattern.search(file)
        if match is not None and match.group(1) in selected_set:
            selected_files.append(file)

    return selected_files

import os 

# Root of the local data tree; the input and DEM directories are resolved relative to it.
data_dir = "/Users/fquareng/data"
# input_directory = os.path.join(data_dir, "DA/8h-PS-RELHUM_2M-T_2M_cropped_gridded")
input_directory = os.path.join(data_dir, "8h-PS-RELHUM_2M-T_2M_cropped_gridded")
dem_directory =  os.path.join(data_dir, "dem_squares")


# Walk every sub-directory of input_directory and list the data files it holds.
for subdir in sorted(os.listdir(input_directory)):  # Ensure sorted order
    subdir_path = os.path.join(input_directory, subdir)

    # Ensure it's a valid directory
    if not os.path.isdir(subdir_path):
        continue

    # Corresponding DEM file
    # NOTE(review): dem_path is built here but never used in this block.
    dem_file = f"{subdir}_dem.nc"
    dem_path = os.path.join(dem_directory, dem_file)

    # Collect the full paths of all entries whose name ends in ".nz"
    # (presumably compressed NetCDF files - confirm).
    # NOTE(review): `files` is rebound on every iteration and not consumed
    # inside the loop, so only the last sub-directory's list survives.
    files = [os.path.join(subdir_path, f) for f in os.listdir(subdir_path) if f.endswith(".nz")]

# NOTE(review): `yearly_df` is not defined anywhere in this block, so this call
# raises NameError unless it is defined elsewhere before running.
selected_hours = sample_hours(yearly_df, total_hours=2000, method="random")
# NOTE(review): `file_list` and `selected_timestamps` are not defined in this
# block either; presumably `files` and `selected_hours` were intended - confirm.
# The returned list is also discarded.
filter_files_by_timestamps(file_list, selected_timestamps)

# Output: NameError: name 'yearly_df' is not defined