In [None]:
import xarray as xr
import numpy as np
import pandas as pd

import glob
import psutil
import threading
import time
import os

import dask
from dask.diagnostics import ProgressBar

from datetime import datetime
from dateutil.relativedelta import relativedelta
from datetime import date


import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.animation as animation

<div style="background-color: #FFE099; padding: 10px; border: 3px solid #FFC233; text-align: center; font-family: Georgia, serif; font-weight: bold; white-space: pre;">||| -------------------------------------------- |||    NOTES    ||| -------------------------------------------- |||
</div>
<div style="background-color: #EFFAFA; border: 2px solid #A2E2E2; font-family: Georgia, serif; padding: 10px">
    <br>This is the script to <strong>process percentile thresholds</strong> and <strong>climatological means</strong> from <strong>temperatures at various uniform depth levels.</strong><div>
    <br>&#10148;&#xFE0E; You can duplicate this script and run its copies simultaneously to download more percentiles/means.
    <br>&#10148;&#xFE0E; If unusual errors appear after attempting to run a cell again, shut down/restart the kernel (and run the cell again afterward)!
    <br>&#10148;&#xFE0E; After saving percentiles/means, check your saved files and compare their sizes to see which ones ought to be removed/redownloaded! 
    <br>&emsp;&emsp;&emsp;&#9733;&#xFE0E; Usually, extremely small file sizes indicate a dataset was not fully saved.
    <br><br>
    </div>
</div>

In [None]:
## --- IMPORTANT SCRIPT-WIDE CONSTANTS ------------------------------------------------------------------------------------
## These module-level names configure the whole notebook; the cells below read them directly.

## Baseline set up:
# Running the main function for the 1993–2022 baseline period (a 30 year baseline)
baseline_choice = "Baseline9322" # identifier string (presumably passed on as the baseline name embedded in saved file names)
baseline_period_slice_choice = slice('1993-01-01', '2022-12-31') # for slicing time; both bounds are 'YYYY-MM-DD' strings

## Desired constants
chosen_percentile = 90 # full number that is < 100

## Folder identifiers
folder_name_choice = "Full" # A "regional" identifier to save the severity dataset with ("{folder_name_choice}_SST... .zarr")
                            # NOTE(review): "severity"/"SST" wording looks carried over from another script — confirm it applies here.
custom_id = "fgd" # A custom identifier to help further identify the downloaded data; can be left as ""
my_root_directory = "/" # Should be your root directory, from which you access data from and save data to.
                        # The root directory should be of a format: "/name", where "name" corresponds to the name of your root directory
temperature_dataset_id = "" # should be the folder name of the dataset whose raw data you downloaded. By default, this will
                            # also be where your means/percentiles will be saved, although you can change this. 
# NOTE: with the placeholder values above ("/" and "") this evaluates to "//"; fill in both values before running.
temperature_directory = f"{my_root_directory}/{temperature_dataset_id}"
temperature_data_directory = f"{temperature_directory}/Data" # adjust to suit your needs (raw observed data is loaded from here)
temperature_means_directory = f"{temperature_directory}/Clim" # climatological means are saved under this directory
temperature_percentile_directory = f"{temperature_directory}/Thresh{chosen_percentile}th" # e.g. ".../Thresh90th" for the 90th percentile
print(f"Your specified temperature directory: {temperature_directory}")

# Doys (days of the year) set up
starting_day_of_the_year = 1 # Your starting doy point for processing/saving severity (can be 1-366)
ending_day_of_the_year = 366 # Your ending doy point for processing/saving severity (can be 1-366)

# Misc
minutes_choice = 10 # minutes (roughly) per memory update (to keep track of its use and avoid crashing/issues)

<div style="color:#CD6600; padding: 10px; text-align: center; font-family: Georgia, serif; font-weight: bold; white-space: pre;">°º¤ø,¸¸,ø¤º°`°º¤ø,¸,ø¤°º¤ø,¸¸,ø¤º°`°º¤ø,¸    LOADING FULL OBSERVED GLOBAL DATASETS    °º¤ø,¸¸,ø¤º°`°º¤ø,¸,ø¤°º¤ø,¸¸,ø¤º°`°º¤ø,¸
</div>

In [None]:
### This is my personal setup to load all my downloaded datasets in single incomplete OR complete folder/region-based datasets.
## You can choose to use a different setup to load your data.

## Configuration for concatenating the datasets of each folder into our raw "observed" datasets (as opposed to climatology or percentile datasets)
## ------------------------------------------------------------------------------------------

# Each list entry is a one-key dictionary: {folder name: [subset ids stored inside that folder]}.
# This setup allows you to load either one or several subsets for the same region. Ex. the "Mid" region below only has one subset active.
region_dict_list = [
   # {"Atlantic": ["Central", "Top", "Right"]}, # Here, the North Atlantic region data was subsetted into three different parts with unique id's.
   # {"Pacific": ["Center", 
    #             "Left"]},
    {"Mid": [
        #"All", # These two subset id names are bad to use (they are very vague); make sure to keep notes of what your chosen id's mean!
        "Mid"   
    ]}
]

# This function helps show a global-scale plot of the data that you have downloaded and loaded into single datasets!
def show_map(ds_input, chosen_depth=0.494, chosen_date="2003-08-22"):
    """Plot a single-day, single-depth snapshot of a temperature field on a map.

    Parameters
    ----------
    ds_input : xarray.DataArray
        Field indexed by ``time``, ``depth``, ``latitude`` and ``longitude``
        (the caller below passes the ``thetao`` variable).
    chosen_depth : float, optional
        Depth to display; the nearest available depth level is selected.
    chosen_date : str, optional
        Date to display; the nearest available time step is selected.
        Defaults to the date that was previously hard-coded.

    Notes
    -----
    The colorbar label and title always read "Sea Surface Temperature",
    whichever depth is selected. The figure is shown and then closed;
    nothing is returned.
    """
    # Nearest-neighbour selection, so the exact date/depth need not exist in the data.
    snapshot = ds_input.sel(time=chosen_date, depth=chosen_depth, method='nearest')

    projection_choice = ccrs.Mercator()
    fig, ax = plt.subplots(figsize=(10, 6),
                           subplot_kw={'projection': projection_choice})

    # The data coordinates are plain longitude/latitude, hence the PlateCarree transform.
    im = ax.pcolormesh(snapshot.longitude, snapshot.latitude, snapshot,
                       transform=ccrs.PlateCarree(),
                       cmap='RdYlBu_r')
    ax.set_extent([0, 360, -30, 90], crs=ccrs.PlateCarree())
    ax.coastlines()
    ax.gridlines(draw_labels=True)

    cbar = plt.colorbar(im, ax=ax, shrink=0.7)
    cbar.set_label('Sea Surface Temperature (°C)', rotation=270, labelpad=15)
    ax.set_title(f'Sea Surface Temperature - {chosen_date}', fontsize=14)

    plt.show()
    plt.close(fig)  # release the figure so repeated calls do not accumulate open figures


## We need to iterate over all the folders in storage.
## For every folder listed in region_dict_list, each subset is opened and concatenated along time,
## trimmed with its folder-specific adjustments, combined with the other subsets of the same folder,
## rechunked, and finally stored as a module-level variable named "<folder_name>_obs".
for folder in region_dict_list:
    for folder_name, sub_folders in folder.items():

        # Initializing...
        observed_data_directory = ""
        folder_name_datasets = []
        # Reset for every folder so a chunk layout from a previously processed folder is never reused by accident.
        best_chunks = None

        # Actual code to concatenate:
        for sub_folder_name in sub_folders:
            # Your filepath here; this is my setup
            observed_data_directory = f'{temperature_data_directory}/{folder_name}/{folder_name}_{sub_folder_name}'

            # Get the ordered paths...
            paths = sorted(glob.glob(f'{observed_data_directory}/daily_data_*.zarr'))
            if not paths:
                # Fail early with the searched location instead of the opaque error xr.concat raises on an empty list.
                raise FileNotFoundError(
                    f'No "daily_data_*.zarr" stores were found in: {observed_data_directory}'
                )

            # Merge all the datasets in each subfolder...
            datasets = [xr.open_zarr(path) for path in paths]
            full_ds = xr.concat(datasets, dim="time")

            ## This section below lists specific folder_name (dataset) adjustments for my setup; adjustments for your setup may vary
            # Specifically, I made sure that there were no overlapping values along any of my coordinates except for latitude

            if folder_name == "Atlantic":
                if full_ds.longitude.values.min() == -101:
                    full_ds = full_ds.sel(longitude=slice(-101, -14.001))
                if full_ds.latitude.values.min() < 0:
                    max_val = full_ds.latitude.values.max()
                    full_ds = full_ds.sel(latitude=slice(-0.75, max_val))
                best_chunks = {'depth': -1, 'time': 28, 'latitude': 218, 'longitude': 242}

            elif folder_name == "Pacific":
                if full_ds.longitude.values.max() > 259:
                    full_ds = full_ds.sel(longitude=slice(110.001, 258.999))
                if full_ds.latitude.values.min() < -1.5:
                    full_ds = full_ds.sel(latitude=slice(-0.75, 90.001))
                best_chunks = {'depth': -1, 'time': 28, 'latitude': 218, 'longitude': 229}

            elif folder_name == "Mid":
                if sub_folder_name == "Mid":
                    best_chunks = {'depth': -1, 'time': 28, 'latitude': 221, 'longitude': 361}
                else:
                    full_ds = full_ds.sel(latitude=slice(-20.9, -0.75))
                    # Drop the longitude band strictly between 19.999 and 49.001.
                    full_ds = full_ds.where(~((full_ds.longitude > 19.999) & (full_ds.longitude < 49.001)), drop=True)
                    best_chunks = {'depth': -1, 'time': 28, 'latitude': 121, 'longitude': 361}

            # Adding the dataset to the list...
            folder_name_datasets.append(full_ds)


        ## Combining stored datasets for each folder...
        print("Folder:", folder_name, '\n')

        if not folder_name_datasets:
            raise ValueError(f'No subsets are listed for folder "{folder_name}" in region_dict_list.')

        # If we have many datasets appended to the list, we combine them
        if len(folder_name_datasets) > 1:
            folder_combined_ds = xr.combine_by_coords(folder_name_datasets, compat='no_conflicts')
        # If there is just one dataset appended to the list, we take it straight from the list
        # (not from the loop variable, which could belong to a different folder).
        else:
            folder_combined_ds = folder_name_datasets[0]

        if best_chunks is not None:
            folder_combined_ds = folder_combined_ds.chunk(best_chunks)
        else:
            # Folder names without a tuned layout above keep whatever chunking the stored data has.
            print(f'Note: no chunk layout is defined for folder "{folder_name}"; keeping the chunking of the stored data.')

        # Saving this dataset globally...
        globals()[f'{folder_name}_obs'] = folder_combined_ds
        print(globals()[f'{folder_name}_obs'])

        # Showing a quick map of the dataset (to make sure everything came out right!)
        show_map(globals()[f'{folder_name}_obs'].thetao)
        print('\n-----------------------------------------------------------\n')

<div style="color:#104E8B; padding: 10px; text-align: center; font-family: Georgia, serif; font-weight: bold; white-space: pre;">°º¤ø,¸¸,ø¤º°`°º¤ø,¸,ø¤°º¤ø,¸¸,ø¤º°`°º¤ø,¸    CALCULATING MEANS & PERCENTILES    °º¤ø,¸¸,ø¤º°`°º¤ø,¸,ø¤°º¤ø,¸¸,ø¤º°`°º¤ø,¸</div>

In [None]:
## -------------------------------------------------------------------------------------------------------------------------------------------------------
''' ----------------------------------------------- '''
''' Function to keep track of and show memory usage '''
''' ----------------------------------------------- '''

stop_monitoring = True # Monitoring starts switched off; a processing run flips this flag to begin reporting

def monitor_memory(interval_minutes=5, log_file=None):
    """Report this process's memory use repeatedly until ``stop_monitoring`` is set.

    Each pass prints the resident memory of the current process (in GB) and the
    system-wide memory percentage, optionally appends a timestamped line to
    ``log_file``, and then sleeps for ``interval_minutes`` minutes. The
    module-level ``stop_monitoring`` flag is checked once per pass, before
    reporting, so the function returns immediately if the flag is already set.

    Parameters
    ----------
    interval_minutes : int or float, optional
        Minutes to sleep between two reports.
    log_file : str or None, optional
        Path of a text file to append each report to; nothing is logged when falsy.
    """
    seconds_between_reports = interval_minutes * 60

    while True:
        if stop_monitoring:
            break

        # Resident set size of the current process, converted from bytes to GB
        rss_gb = psutil.Process(os.getpid()).memory_info().rss / (1024**3)
        print(f" | Memory usage: {rss_gb:.2f} GB | Memory: {psutil.virtual_memory().percent}% used | ")

        if log_file:
            with open(log_file, 'a') as log_handle:
                log_handle.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')}: {rss_gb:.2f} GB\n")

        time.sleep(seconds_between_reports)

## -------------------------------------------------------------------------------------------------------------------------------------------------------
''' ---------------------------------------------------------------------------------------------------------- '''
''' Function to create a chunk list of missing day of the year datasets based on your own file directory setup '''
''' ---------------------------------------------------------------------------------------------------------- '''

def gather_missing_from_storage(folder_name_arg, sub_folder_name_arg, baseline_name_arg, 
                                chunk_start_arg, chunk_end_arg, 
                                current_percentile, return_check_all=True):
    """List the days of the year whose threshold dataset is absent from storage.

    Every doy from 1 to 366 that lies inside ``[chunk_start_arg, chunk_end_arg]``
    (both ends inclusive) is checked for an existing ``.zarr`` path under
    ``{temperature_directory}/Thresh{current_percentile}th/...``.

    Parameters
    ----------
    folder_name_arg, sub_folder_name_arg : str
        Folder and subset identifiers; joined as "<folder>_<subset>" in the path.
    baseline_name_arg : str
        Baseline identifier embedded in the file name.
    chunk_start_arg, chunk_end_arg : int
        Inclusive doy bounds of the interval to check.
    current_percentile : int
        Percentile used in the "Thresh{percentile}th" directory name.
    return_check_all : bool, optional
        When true, print how many doys are missing.

    Returns
    -------
    list of int or None
        The missing doys in ascending order, or None when nothing is missing.
    """
    subset_id = f"{folder_name_arg}_{sub_folder_name_arg}"

    def threshold_store_path(doy):
        # Same naming scheme the saving code uses for single-doy threshold stores
        store_name = f"{subset_id}_thetao_thresh_300m_subset_{doy}_{baseline_name_arg}.zarr"
        return f'{temperature_directory}/Thresh{current_percentile}th/{folder_name_arg}/{subset_id}/{store_name}'

    # The interval test comes first, so paths are only built for doys inside the requested chunk
    missing_doys = [
        doy for doy in range(1, 366 + 1)
        if chunk_start_arg <= doy <= chunk_end_arg and not os.path.exists(threshold_store_path(doy))
    ]

    # Nothing missing: report it and signal that with None rather than an empty list
    if not missing_doys:
        print("All doys checked and present! (You should also check file sizes to verify everything downloaded correctly!")
        return None

    if return_check_all:
        print(f"All doys checked; you are missing {len(missing_doys)} doy datasets in total!\n")

    return missing_doys

## -------------------------------------------------------------------------------------------------------------------------------------------------------
''' ------------------------------------------------------------------------------------------------ '''
''' Function to normalize the unique day of the year value of each observed day in the format: 1-366 '''
''' ------------------------------------------------------------------------------------------------ '''

def normalize_dayofyear(time_coord):
    """Map each timestamp to a day-of-year on a fixed 366-day calendar.

    In leap years the calendar day-of-year is returned unchanged. In non-leap
    years every day from March 1 onward (calendar doy 60 and later) is shifted
    forward by one, so March 1 is always doy 61 and doy 60 is only ever
    February 29.

    Parameters
    ----------
    time_coord : xarray.DataArray
        Datetime coordinate exposing the ``.dt`` accessor.

    Returns
    -------
    xarray.DataArray
        Normalized day-of-year values in the range 1-366.
    """
    calendar_doy = time_coord.dt.dayofyear
    in_leap_year = time_coord.dt.is_leap_year

    # In a non-leap year, calendar doy 60 is March 1, so doy 60 and later must move up by one day.
    needs_shift = (~in_leap_year) & (calendar_doy >= 60)

    # Shifted values where needed; leap years and Jan 1 - Feb 28 keep their calendar doy.
    return xr.where(needs_shift, calendar_doy + 1, calendar_doy)

## -------------------------------------------------------------------------------------------------------------------------------------------------------
''' ------------------------------------------------------------------------------- '''
''' Function to check whether the inputted time period is at least roughly 30 years '''
''' ------------------------------------------------------------------------------- '''

def rough_30_year_period_check(time_slice, tolerance=0.01):
    """Measure the length of a date slice and test whether it is about 30 years.

    Parameters
    ----------
    time_slice : slice
        Slice whose ``start`` and ``stop`` are 'YYYY-MM-DD' strings.
    tolerance : float, optional
        Largest allowed absolute difference, in years, from 30.

    Returns
    -------
    tuple of (float, bool)
        The span in fractional years (whole years + months/12 + days/365.25)
        and whether that span is within ``tolerance`` of 30.
    """
    date_format = '%Y-%m-%d'
    period_start = datetime.strptime(time_slice.start, date_format)
    period_end = datetime.strptime(time_slice.stop, date_format)

    # Calendar-aware difference, split into years / months / days
    span = relativedelta(period_end, period_start)
    total_years = span.years + span.months/12 + span.days/365.25

    is_roughly_30_years = abs(total_years - 30) <= tolerance
    return total_years, is_roughly_30_years

## -------------------------------------------------------------------------------------------------------------------------------------------------------
''' ------------------------------------------------------------------------------------- '''
''' Function to just save the percentile threshold/climatological mean dataset to storage '''
''' ------------------------------------------------------------------------------------- '''

def save_dataset_to_storage(folder_name_arg, sub_folder_name_arg, baseline_name_arg,
                            clim_arg, show_debug_arg, single_download_arg, 
                            current_percentile=None, current_doy=None, 
                            start_val_arg = None, end_val_arg = None, 
                            ds_to_save=None):
    """Write one climatology subset or one single-doy threshold dataset to a zarr store.

    Parameters
    ----------
    folder_name_arg, sub_folder_name_arg : str
        Folder and subset identifiers; joined as "<folder>_<subset>" in the path.
    baseline_name_arg : str
        Baseline identifier embedded in the file name.
    clim_arg : bool
        True to save a climatological-mean subset, False to save a percentile threshold.
    show_debug_arg : bool
        When true, nothing is written; the function stops after printing the preview.
    single_download_arg : bool
        When true, the function stops right after one successful save.
    current_percentile, current_doy : int, optional
        Percentile and doy of the threshold being saved (threshold mode only).
    start_val_arg, end_val_arg : int, optional
        Inclusive doy bounds of the climatology subset (climatology mode only).
    ds_to_save : xarray.Dataset or xarray.DataArray
        Data to write. In climatology mode it is sliced along ``normalized_doy``
        unless the bounds are exactly 1 and 366.

    Raises
    ------
    ValueError
        Used deliberately as a stop signal: at the end of a debug preview, and
        after a completed save when ``single_download_arg`` is true.
    """
    # Without this declaration the assignment further down would only create a local
    # variable, and the memory-monitor thread would never be told to stop.
    global stop_monitoring

    id_path = f"{folder_name_arg}_{sub_folder_name_arg}"
    
    # We check if we are saving a climatology dataset or percentile one
    if clim_arg:
        print("Starting doy for current subset: ", start_val_arg)
        print("Ending doy for current subset: ", end_val_arg, '\n')

        # The full 1-366 range needs no slicing; any other range is cut out by doy label
        if start_val_arg == 1 and end_val_arg == 366:
            final_ds_to_save = ds_to_save
        else:
            final_ds_to_save = ds_to_save.sel({'normalized_doy': slice(start_val_arg, end_val_arg)})
        print("Current climatology subset to save: ", '\n', final_ds_to_save, '\n')

        file_name = f"{id_path}_thetao_clim_300m_subset_{start_val_arg}_to_{end_val_arg}_{baseline_name_arg}.zarr"
        filepath = f'{temperature_means_directory}/{folder_name_arg}/{id_path}/{file_name}'
        
    else:
        final_ds_to_save = ds_to_save
        print(f"{current_percentile}th Percentile for doy {current_doy} is being saved:\n", final_ds_to_save, '\n')
        file_name = f"{id_path}_thetao_thresh_300m_subset_{current_doy}_{baseline_name_arg}.zarr"
        filepath = f'{temperature_directory}/Thresh{current_percentile}th/{folder_name_arg}/{id_path}/{file_name}'
    
    print("Filepath of subset: ", filepath)

    # Continue to saving or stop (if we are at the end of the debug)
    if show_debug_arg:
        raise ValueError('End of debug. Proceed with the setting "show_debug = False" to start saving the percentile thresholds.')

    # mode='w' overwrites any store that already exists at this path
    with ProgressBar():
        final_ds_to_save.to_zarr(filepath, mode='w', consolidated=True)

    if clim_arg:
        print(f"Saved subset: doys {start_val_arg} to {end_val_arg}")
    else:
        print(f"Saved subset: doys {current_doy}")

    print("Moving on!", "\n")
    print("---------------------------------------------------------------------------------------------------------")

    # Optional: for single downloads
    if single_download_arg:
        stop_monitoring = True  # tell the memory-monitor thread to finish
        raise ValueError("Single dataset file-saving finished. Please enter a new desired chunk starting value to begin from.")
            
## -------------------------------------------------------------------------------------------------------------------------------------------------------
''' ----------------------------------------------------------------- '''
''' Function to actually calculate the percentile threshold dataset to storage '''
''' ----------------------------------------------------------------- '''

# Function to calculate the percentile threshold values for specific depths
def calculate_thetao_thresh_or_clim_given_a_percentile(thetao_data, baseline_slice, 
                                                       folder_name, sub_folder_name, baseline_name, 
                                                       optimal_chunks, window_half_width=5, 
                                                       minutes_per_memory_update=5, percentile=90, 
                                                       start_chunking_doy=1, end_chunking_doy=366,
                                                       show_debug=True, single_download=False, 
                                                       chunk_list=False, finish_at_chunk_end=False):     
    
    if show_debug:
        debug_message_1 = "You have set show_debug to true; this will show how the percentiles/means are processed based on your inputted arguments \nand provide a preview of the output.\n"
        debug_message_2 = "\nIf you are satisfied with the output (and your arguments), compute and save the calculated percentiles/means by setting\nshow_debug to false.\n"
        print(debug_message_1, debug_message_2)
    
        print("---------------------------------------------------------------------------------------------------------")
        print("Part 0: Running a few quick error checks for the provided arguments!")
        print("---------------------------------------------------------------------------------------------------------\n")
              
    # Checking start and end bounds
    if start_chunking_doy < 1 or end_chunking_doy > 366:
        raise ValueError("Please provide a start_chunking_doy that is ≥ 1 and an end_chunking_doy that is ≤ 366.")
        
    if chunk_end < chunk_start:
        raise ValueError("Please make sure your chunk_end is greater than your chunk_start; these are your dataset processing bounds.")
        
    # Running a rough time check for the baseline provided
    total_time, is_time_slice_30_years = rough_30_year_period_check(baseline_slice)
    
    if not is_time_slice_30_years:
        error_message = "Please check that your chosen baseline time slice covers a 30 year period."
        raise ValueError(f"{error_message}.\n            The chosen slice covers roughly {total_time} years.")
    
    # Establishing if we calculating percentiles or means 
    calculate_clim = True if (percentile == None) else False
    chunk_size = end_chunking_doy
    
    # Checking the chunk size for our climatological calculations
    if calculate_clim and chunk_size == 0:
        raise ValueError("Please set an integer value for end_chunking_doy, which sets the size of the doy batch you use to save the means in.")
    
    
    if show_debug: 
        print("All clear!\n")
        print("---------------------------------------------------------------------------------------------------------")
        print("Part 1: Assign normalized unique day of the year (doy) values to the sliced observation dataset")
        print("---------------------------------------------------------------------------------------------------------\n")
    
    print(f"Chosen baseline slice: {baseline_slice}")
    print("Note: the chosen baseline period has been identified as roughly covering a 30 year period. Do ensure this is the case separately.\n")
    print(f"Chosen window half-width: {window_half_width}")
    print(f"(This means we use {window_half_width} days before and after each day of the year for each doy in our climatology/threshold.)", '\n')
    
    if calculate_clim:
        print(f"The selected doy chunk size for climatological mean dataset batches is {chunk_size} doys.\n")
        end_chunk_val = 366 if not single_download else (start_chunking_doy + chunk_size - 1)
        
        print(f"Calculating means in batches of (at most) {chunk_size} doys between {start_chunking_doy} and {end_chunk_val}.\n")
    else:
        percentile_used = percentile/100
        print(f"Final percentile used (in calculations): {percentile_used} ({percentile}th percentile)", '\n')
        print(f"Calculating thresholds individually for doys between {start_chunking_doy} and {end_chunking_doy}.\n")
    
    # Doy values for specific dates (for later)
    feb28_doy = 59
    feb29_doy = 60
    mar1_doy = 61
    
    # We create a missing doys chunk list for percentiles only, if desired
    if not calculate_clim:
        if chunk_list:
            missing_list = gather_missing_from_storage(folder_name, sub_folder_name, baseline_name, 
                                                       start_chunking_doy, end_chunking_doy,
                                                       percentile, return_check_all=show_debug)

            ## Quick check to make sure the returned missing doy list provided is valid.
            if missing_list is not None:
                if type(missing_list) is list:
                    filtered_list = [i for i in missing_list if i >= start_chunking_doy and i <= end_chunking_doy]

                    print("Missing doys within the set chunk interval were found!\n") 
                    print(f"We are processing all missing doys within the filtered chunk_list:\n{filtered_list}\n\n")
            else:
                raise ValueError("No missing doys were found in the doy interval! Turn off chunk_list to proceed anyway!")
    
    ## Extracting baseline period data
    thetao_baseline = thetao_data.sel(time=baseline_slice)
    if show_debug: print("Original Thetao Baseline Period Data: ", '\n', thetao_baseline, '\n')
   
    # Assigning normalized doy values to the baseline period dataset
    thetao_norm = thetao_baseline.assign_coords(
        normalized_doy=('time', normalize_dayofyear(thetao_baseline.time).data))
    if show_debug: print("Thetao with Normalized Doy: ", '\n', thetao_norm, '\n')
        
    '''
    # Totally optional debug option here: show ALL normalized day of the year (doy) values;
    # all years are in the 366-day format, with some missing day 60 (feb 29)
    with np.printoptions(threshold=np.inf):
        print(thetao_norm.normalized_doy.values) 
    '''
    

    if show_debug: 
        print("---------------------------------------------------------------------------------------------------------")
        print("Part 2: Get the actual doy values of the baseline period data (should be 1 - 366)")
        print("---------------------------------------------------------------------------------------------------------\n")
    unique_doys = np.unique(thetao_norm.normalized_doy.data)
    unique_doys = unique_doys[~np.isnan(unique_doys)]  # Remove any NaN values
    unique_doys = unique_doys.astype(int)  # Ensure integer day-of-year values
    if show_debug: print(f"Found {len(unique_doys)} unique day-of-year values!")
    if show_debug: print("Unique doys:", '\n', unique_doys, '\n')

         
    global stop_monitoring
    if show_debug: 
        choice_message = "climatological mean" if calculate_clim else "percentile threshold"
        print("---------------------------------------------------------------------------------------------------------")
        print(f"Part 3: Calculate the desired {choice_message} data for the desired doy(s).")
        print("---------------------------------------------------------------------------------------------------------\n")
        stop_monitoring = True # We don't want to start showing memory use.
    else:
        # We start monitoring here so that it only runs once
        stop_monitoring = False # We do want to start showing memory use.
        monitor_thread = threading.Thread(target=monitor_memory, kwargs={'interval_minutes': minutes_per_memory_update})
        monitor_thread.daemon = True
        monitor_thread.start()
        
    # Error messages for later
    chunk_list_message_finished = "All missing doys within the chunk list (except Feb 29) processed!\n"
    chunk_message_finished = "Set chunk end reached!\n"
    no_feb29_possible_warning = "WARNING: Cannot interpolate Feb 29; missing Feb 28 or Mar 1 data!\n"
    
    # Initialize a dictionary for the climatological means if wanted
    if calculate_clim:
        seas_clim_dict = {}
        
    # Bool for debug purposes
    shown_once = False
 
    # Loop for doys 1 - 366 (excluding Feb 29, doy 60)
    for doy in unique_doys:
        # We skip February 29th (to interpolate later)
        if doy == feb29_doy:  
            continue # Note: doy 60 data is still used within the appropriate window_data when available
        
        # These next checks only run for percentile calculations; all doys are loaded in the climatological dictionary (except Feb 29)
        if not calculate_clim:
            # We skip doys to begin on the desired chunk_start value
            if doy < start_chunking_doy: 
                continue

            # Now we look at doys greater than the end_chunking_doy
            if doy > end_chunking_doy:
                # For percentiles, we check if we have a chunk list and whether we wish to continue past the set end_chunking_doy bound
                if chunk_list:
                    if finish_at_chunk_end: # If we want to stop at the end of the set chunk interval:
                        stop_monitoring = True
                        raise ValueError(chunk_list_message_finished)

                # And run this code otherwise for doys beyond the set chunk interval
                else: 
                    if finish_at_chunk_end:
                        stop_monitoring = True
                        raise ValueError(chunk_message_finished)

            # for percentiles, we skip any doys that are not found in the list of missing doys created above (if the chunk_list was set to true)
            if chunk_list:
                if doy not in missing_list:
                    continue 
        
        # Create window around this DOY
        window_doys = []
        
        for w in range(-window_half_width, window_half_width + 1):
            target_doy = doy + w
            
            if show_debug and not shown_once: 
                print("Day of the year: ", doy, "| Target Window Index: ", w, "| Target Window Value: ", target_doy)

            # Handle year wraparound properly
            if target_doy < 1:
                target_doy += 366
            elif target_doy > 366:
                target_doy -= 366
            
            # Handle year boundaries by keeping only valid doys
            if target_doy in unique_doys:
                window_doys.append(target_doy)
            
            if show_debug and not shown_once: print("Window Doys: ", window_doys, '\n')

        # Now, we select the data for this window
        window_data = thetao_norm.where(thetao_norm.normalized_doy.isin(window_doys), drop=True)
        
        '''
        ### Feature to be added: the ability to tweak the data prior to any percentile calculations in a manner like so:
        window_data = window_data.sel(latitude=slice(-3, 0))
        '''
        
        if show_debug and not shown_once: print("Final window data from the baseline period dataset: ", '\n', window_data, '\n')
        
        # Now, we calculate the percentile threshold/climatological mean across the time dimension
        if window_data.time.size > 0:
            # We calculate the climatological mean if that is what is desired
            if calculate_clim:
                seas_clim_dict[doy] = window_data.mean(dim = 'time', skipna = True).expand_dims(normalized_doy=[doy])
                
                if show_debug and not shown_once:
                    print("---------------------------------------------------------------------------------------------------------")
                    print(f"Part 4: Store the climatological means across all doys in an empty dictionary!")
                    print("---------------------------------------------------------------------------------------------------------\n")

                    print(f"Dictionary updated for doy {doy} with the time-averaged final window dataset in Part 3.\n\nDictionary entry:\n", seas_clim_dict[doy], '\n')
                    shown_once = True
            
            # Otherwise, we calculate the percentile for a doy and rechunk the result
            else:
                doy_to_save = window_data.chunk({'time':-1}).quantile(percentile_used, dim='time', skipna=True).expand_dims(normalized_doy=[doy])
                doy_to_save = doy_to_save.chunk(optimal_chunks)

                if show_debug: 
                    print("---------------------------------------------------------------------------------------------------------")
                    print(f"Part 4: Save the percentile threshold dataset one single doy at a time!")
                    print("---------------------------------------------------------------------------------------------------------\n")

                # Now, we save the percentile threshold dataset to storage
                save_dataset_to_storage(folder_name_arg=folder_name, sub_folder_name_arg=sub_folder_name, baseline_name_arg=baseline_name,
                                        clim_arg=calculate_clim, show_debug_arg=show_debug, single_download_arg=single_download, 
                                        current_percentile=percentile, current_doy=doy, ds_to_save=doy_to_save)
          
        
    # After the for loop over the 1-366 day of the year range, we handle February 29th using linear interpolation
    # doy 60 is not being printed? added to dict...
    
    if (feb29_doy in unique_doys):
        # If we have a dictionary with our climatological means...
        if calculate_clim:
            if feb28_doy in seas_clim_dict and mar1_doy in seas_clim_dict:
                feb_28_ds = seas_clim_dict[feb28_doy].squeeze().drop_vars('normalized_doy')
                mar_1_ds = seas_clim_dict[mar1_doy].squeeze().drop_vars('normalized_doy')
                seas_clim_dict[feb29_doy] = 0.5 * (feb_28_ds + mar_1_ds)
                seas_clim_dict[feb29_doy] = seas_clim_dict[feb29_doy].expand_dims(normalized_doy=[feb29_doy])
                if show_debug: print("Interpolated February 29 dataset (doy 60) in the dictionary:\n", seas_clim_dict[feb29_doy], '\n')
            else:
                print(no_feb29_possible_warning)
        
        # Otherwise, we interpolate percentile thresholds using saved percentile datasets
        else:
            # Quick check to ensure the 60th doy is within our chunk interval
            if (start_chunking_doy < feb29_doy) and (feb29_doy < end_chunking_doy):
                # We check if we have a chunk list with doy 60 among the missing doys
                if chunk_list:
                    if feb29_doy not in missing_list:
                        stop_monitoring = True
                        raise ValueError(chunk_list_message_finished)

                # Function for finding the files necessary to interpolate for Feb 29
                def find_file_and_return_it(folder_name_arg, sub_folder_name_arg, baseline_name_arg, current_doy, current_percentile):
                    id_path = f"{folder_name_arg}_{sub_folder_name_arg}"
                    file_name = f"{id_path}_thetao_thresh_300m_subset_{current_doy}_{baseline_name_arg}.zarr"
                    filepath = f'{temperature_directory}/Thresh{current_percentile}th/{folder_name_arg}/{id_path}/{file_name}'

                    # We check if the path/file exists in the target location
                    if os.path.exists(filepath):
                        ds = xr.open_zarr(filepath).squeeze('normalized_doy', drop=True)
                        return True, ds
                    else:
                        return False, None

                # We check for the files required for interpolation
                file_found_feb28, thresh_feb28_ds = find_file_and_return_it(folder_name, sub_folder_name, baseline_name, feb28_doy, percentile)
                file_found_mar1, thresh_mar1_ds = find_file_and_return_it(folder_name, sub_folder_name, baseline_name, mar1_doy, percentile)

                # We proceed with interpolation if both files are found
                if file_found_feb28 and file_found_mar1:
                    doy_to_save = 0.5 * (thresh_feb28_ds + thresh_mar1_ds)
                    doy_to_save = doy_to_save.expand_dims(normalized_doy=[feb29_doy])
                    doy_to_save = doy_to_save.chunk(optimal_chunks)

                    # Now, we save the percentile threshold dataset to storage
                    save_dataset_to_storage(folder_name_arg=folder_name, sub_folder_name_arg=sub_folder_name, baseline_name_arg=baseline_name,
                                           clim_arg=calculate_clim, show_debug_arg=show_debug, single_download_arg=single_download, 
                                           current_percentile=percentile, current_doy=feb29_doy, ds_to_save=doy_to_save)
                
            # If the two required datasets for interpolating Feb 29 are missing, we run this:
            else: 
                print(no_feb29_possible_warning)
            
    # We proceed with the full climatology dictionary if we are calculating climatological means
    if calculate_clim:
        if show_debug: 
            print("---------------------------------------------------------------------------------------------------------")
            print("Part 5: Creating the complete climatology dataset from the dictionary")
            print("---------------------------------------------------------------------------------------------------------\n")

        # We create the correct coordinates from our dictionary for our final dataset
        doy_coords = np.array(sorted(seas_clim_dict.keys())) # array for full year (1 to 366, if leap)
        if show_debug: print("Dictionary Keys of Registered Unique Day of the Year (doy) Climatological Mean Datasets\n", 
                             "(Should include all values from 1 to 366):\n", doy_coords, '\n')

        # We stack the resulting dictionary datasets while maintaining the correct order
        seas_clim_list = [seas_clim_dict[doy] for doy in doy_coords]
        seas_clim_year = xr.concat(seas_clim_list, dim='normalized_doy')
        seas_clim_year = seas_clim_year.assign_coords(normalized_doy=('normalized_doy', doy_coords))
        seas_clim_year = seas_clim_year.chunk(optimal_chunks)
        
        # Additional chunking that prevents crashing (can be lowered for more stability)
        if chunk_size <= 61:
            seas_clim_year = seas_clim_year.chunk({'normalized_doy': chunk_size})
        else:
            seas_clim_year = seas_clim_year.chunk({'normalized_doy': 10})
            
        print("Final Climatology Dataset:\n", seas_clim_year, '\n')

        # We check if we are saving this datasets fully or in batches; a chunk_size of 366 implies the full dataset is being saved (no batch saving)
        if chunk_size == 366:
            batch_saving = False
        else:
            batch_saving = True
                
        if show_debug: 
            saving_choice_message = "in one go" if not batch_saving else f"via batches of {chunk_size} doys"

            print("---------------------------------------------------------------------------------------------------------")
            print(f"Part 5: Saving the climatology dataset {saving_choice_message}!")
            print("---------------------------------------------------------------------------------------------------------\n")
            
        # Coordinate values for batch saving (not full climatology dataset saving)
        coord_values = seas_clim_year['normalized_doy'].values

        # We are saving the dataset in batches
        if batch_saving:
            for i in range(0, len(coord_values), chunk_size):
                # First, we gather the starting and ending values of the processed chunk
                start_val = coord_values[i]

                # Quick check to see where to begin downloading a batch from...
                if start_val < start_chunking_doy:
                    continue

                # Grab the end index and value
                end_idx = min(i + chunk_size, len(coord_values))
                end_val = coord_values[end_idx - 1]
                
                # Save the dataset (subsetting occurs in the function)              
                save_dataset_to_storage(folder_name_arg=folder_name, sub_folder_name_arg=sub_folder_name, baseline_name_arg=baseline_name,
                                        clim_arg=calculate_clim, show_debug_arg=show_debug, single_download_arg=single_download,
                                        start_val_arg=start_val, end_val_arg=end_val, ds_to_save=seas_clim_year)  
        
        # We are saving the full dataset
        else:
            save_dataset_to_storage(folder_name_arg=folder_name, sub_folder_name_arg=sub_folder_name, baseline_name_arg=baseline_name,
                                    clim_arg=calculate_clim, show_debug_arg=show_debug, single_download_arg=single_download,
                                    start_val_arg=start_chunking_doy, end_val_arg=end_chunking_doy, ds_to_save=seas_clim_year)  
            
        stop_monitoring = True

    stop_monitoring = True # reset the monitoring before the next loop
    
## -------------------------------------------------------------------------------------------------------------------------------------------------------


## Running the main function for the 1993–2022 Baseline # 30 year baseline
threshold_period_Baseline9322 = slice("1993-01-01", "2022-12-31")

## Set up for the for loop to iterate over for different regions

# Chunk sizes per regional folder and sub-folder; each inner dictionary is handed to `.chunk(...)` further below.
# (Set these to divide their coordinates' values by a small number less than 400 that leaves no remainder.)
# A value of -1 keeps that whole dimension in a single chunk.
# Disabled regions stay here as comments so they can be switched back on.
chunk_configs = {
    # "Atlantic": {
    #     "Central": {'depth': -1, 'latitude': 101, 'longitude': 209},
    #     "Right": {'depth': -1, 'latitude': 276, 'longitude': 204},
    #     "Top": {'depth': -1, 'latitude': 196, 'longitude': 209},
    # },
    # "Pacific": {
    #     "Center": {'depth': -1, 'latitude': 218, 'longitude': 257},
    #     "Left": {'depth': -1, 'latitude': 221, 'longitude': 244},
    # },
    "Mid": {
        # "All": {'depth': -1, 'latitude': 253, 'longitude': 270},
        "Mid": dict(depth=-1, latitude=221, longitude=361),
    },
}


# Regions to process (adjust as need be): one {folder: [sub-folders]} dictionary per regional folder
region_dict_list = [
    # {"Atlantic": ["Central", "Top", "Right"]},
    # {"Pacific": ["Center", "Left"]},
    {
        "Mid": [
            "Mid",
            # "All",
        ],
    },
]

# First doy chunk to calculate (raise this if earlier chunks of a dataset have already been saved)
chunk_start = 1

# Last doy chunk to calculate (anything greater than this is excluded)
chunk_end = 366

# For convenience, nested for loops process the climatological means/percentile thresholds for the folders in our directory,
# whose naming structure is implied by the dictionaries above.

# Dictionary entries can be commented out with a # at will to process only select subfolders at a time (within the larger regional folders).

for folder in region_dict_list:
    for folder_filename, sub_folders in folder.items():

        # Initializing...
        observed_data_directory = ""
        optimal_chunking = None
        full_ds = None

        for sub_folder_filename in sub_folders:
            # Your custom subsetted-region filepath here; this is my setup.
            observed_data_directory = f'{temperature_data_directory}/{folder_filename}/{folder_filename}_{sub_folder_filename}'

            # Get the ordered paths...
            paths = sorted(glob.glob(f'{observed_data_directory}/daily_data_*.zarr'))

            # Stop early with a readable message: an empty list cannot be concatenated below
            if not paths:
                raise FileNotFoundError(f"No files found matching the pattern:\n{observed_data_directory}/daily_data_*.zarr\n"
                                        "\nPlease verify your data directory and the folder/sub-folder names in region_dict_list!")

            # Merge all the datasets in the subfolder into one (along time) to pass to the function
            datasets = [xr.open_zarr(path) for path in paths]
            full_ds = xr.concat(datasets, dim="time")

            # Designate the chunks to be used for this folder/sub-folder pair
            optimal_chunking = chunk_configs[folder_filename][sub_folder_filename]
            full_ds = full_ds.chunk(optimal_chunking)

            # Calling the function to calculate the climatological means (percentile=None)
            calculate_thetao_thresh_or_clim_given_a_percentile(thetao_data = full_ds.thetao,
                                                               baseline_slice = threshold_period_Baseline9322,
                                                               folder_name = folder_filename, sub_folder_name = sub_folder_filename,
                                                               baseline_name = baseline_choice,
                                                               optimal_chunks = optimal_chunking, window_half_width = 5,
                                                               minutes_per_memory_update = 45, percentile = None,
                                                               start_chunking_doy = chunk_start, end_chunking_doy = chunk_end,
                                                               show_debug = True, single_download = False,
                                                               chunk_list = False, finish_at_chunk_end = True)

            # Calling the function to calculate the percentile thresholds (percentile=NUMBER); uncomment to use
            # calculate_thetao_thresh_or_clim_given_a_percentile(thetao_data = full_ds.thetao,
            #                                                    baseline_slice = threshold_period_Baseline9322,
            #                                                    folder_name = folder_filename, sub_folder_name = sub_folder_filename,
            #                                                    baseline_name = baseline_choice,
            #                                                    optimal_chunks = optimal_chunking, window_half_width = 5,
            #                                                    minutes_per_memory_update = 45, percentile = chosen_percentile,
            #                                                    start_chunking_doy = chunk_start, end_chunking_doy = chunk_end,
            #                                                    show_debug = True, single_download = False,
            #                                                    chunk_list = False, finish_at_chunk_end = True)

# The message covers both kinds of run, since either call above may be the active one
print("We have finished saving all desired doy climatological mean/percentile threshold datasets completely!")

# NOTE(review): presumably the flag read by the memory monitor started elsewhere in this notebook - confirm
stop_monitoring = True

<div style="color:#008B00; padding: 10px; text-align: center; font-family: Georgia, serif; font-weight: bold; white-space: pre;">ø¤º°`°º¤ø,¸,ø¤°º¤ø,¸¸,ø¤º°`°º¤ø,¸  FILE VALIDATION, VERIFICATION, AND ANIMATIONS  °º¤ø,¸¸,ø¤º°`°º¤ø,¸,ø¤°º¤ø,¸¸,ø¤
</div>

In [None]:
## Check stored percentile threshold or climatological mean datasets (if you want to do so independently from the code above)

def gather_missing_from_storage(baseline_name_arg, folder_name_arg, sub_folder_name_arg=None,
                                check_climatological_means=False, current_percentile=90, return_check_all=True):
    """
    Report which day-of-year (doy) datasets in the 1-366 range are absent from storage.

    Climatological means are looked up under `temperature_means_directory`, where each file name carries a
    `START_to_END` doy range. Percentile thresholds are looked up under `temperature_directory`, one file per doy.
    Only the existence of a path is checked, never its contents or size.

    Parameters
    ----------
    baseline_name_arg : str
        Baseline identifier that ends the stored file names (e.g. "Baseline9322").
    folder_name_arg : str
        Regional folder name.
    sub_folder_name_arg : str, optional
        Sub-folder name; required when checking percentile thresholds, unused for climatological means.
    check_climatological_means : bool
        True checks the climatological mean files; False checks the percentile threshold files.
    current_percentile : int
        Percentile used to build the `Thresh{current_percentile}th` directory name.
    return_check_all : bool
        If True, print one line per doy while checking thresholds; if False, print only the summary.

    Returns
    -------
    list of int or None
        The sorted missing doys, or None when all of 1-366 are covered.

    Raises
    ------
    ValueError
        If percentile thresholds are checked without a sub_folder_name_arg.
    """
    import re  # local import: `re` is not part of the notebook's shared import cell

    check_size_notif = "\n(You should also check file sizes to verify everything downloaded correctly!)"
    all_doys = range(1, 366 + 1)

    if check_climatological_means:
        # Gather the existing filepaths
        clim_data_directory = f'{temperature_means_directory}/{folder_name_arg}'
        paths = sorted(glob.glob(f'{clim_data_directory}/{folder_name_arg}_thetao_clim_300m_subset*{baseline_name_arg}.zarr'))

        # Filename format: folderName_thetao_clim_300m_subset_START_to_END_baselineName.zarr
        # Anchoring on the known prefix keeps the parsing correct when the folder name itself contains underscores.
        range_pattern = re.compile(rf'^{re.escape(folder_name_arg)}_thetao_clim_300m_subset_(\d+)_to_(\d+)_')

        # Extract doy number ranges from existing files
        covered = set()
        for path in paths:
            file_name = os.path.basename(path)
            range_match = range_pattern.match(file_name)

            # A file without a readable START_to_END range cannot vouch for any doy
            if range_match is None:
                print(f'WARNING: No doy range could be read from the file name {file_name}; this file is ignored!\n')
                continue

            start, end = int(range_match.group(1)), int(range_match.group(2))
            covered.update(range(start, end + 1))

        # Full year (1-366) check
        missing_doys = sorted(set(all_doys) - covered)

        if not missing_doys:
            print(f'Files for all {folder_name_arg} climatology doy ranges (a full 1-366 year) exist! {check_size_notif}')
            return None

        print(f'Some {folder_name_arg} climatology doy ranges missing!\n\nMissing: {missing_doys}\n')
        return missing_doys

    # Checking percentile thresholds
    # Without a sub-folder, the path below would point at "<folder>_None" and every doy would look missing
    if sub_folder_name_arg is None:
        raise ValueError("Please provide a valid sub_folder_name!")

    id_path = f"{folder_name_arg}_{sub_folder_name_arg}"
    error_bar = "------------------------------------------------------------------------------"
    error_list = []

    for doy in all_doys:
        # Filepath setup
        file_name = f"{id_path}_thetao_thresh_300m_subset_{doy}_{baseline_name_arg}.zarr"
        filepath = f'{temperature_directory}/Thresh{current_percentile}th/{folder_name_arg}/{id_path}/{file_name}'

        # We check if the path/file exists in the target location
        if os.path.exists(filepath):
            if return_check_all:
                print(f'File for doy {doy} exists at the specified path!\n')
            continue

        if return_check_all:
            print(f'{error_bar}\nWARNING: File for doy {doy} not found at the specified path!\n{error_bar}\n')
        error_list.append(doy)

    # If there are no missing doy datasets in our storage, we return None; otherwise, we return the list of missing doys!
    if not error_list:
        print(f"All {folder_name_arg} {sub_folder_name_arg} doys checked and present! {check_size_notif}")
        return None

    print(f"All {folder_name_arg} {sub_folder_name_arg} doys checked; you are missing {len(error_list)} doy datasets in total!\n")
    print("The missing doys are: ")
    print(error_list)
    return error_list

# Example: which doys are not covered by the stored "Mid" climatological mean files?
missing_list_clim = gather_missing_from_storage(
    baseline_name_arg="Baseline9322",
    folder_name_arg="Mid",
    check_climatological_means=True,
)

# Example: which doys have no stored "Mid"/"Mid" percentile threshold file? (summary output only)
missing_list_thresh = gather_missing_from_storage(
    baseline_name_arg="Baseline9322",
    folder_name_arg="Mid",
    sub_folder_name_arg="Mid",
    return_check_all=False,
)

In [None]:
## Function to create an animation that shows the mean and percentile latitude and longitude maps for the full 1 - 366 period.
def check_processed_datasets_with_an_animation(baseline_name_arg, folder_name_arg,
                                               sub_folder_name_arg=None,
                                               custom_output_filename=None,
                                               show_climatological_means=False,
                                               percentile=0, chosen_depth=0.494):
    """
    Build and save a GIF that steps through every stored day of the year (doy) of a processed dataset.

    Each frame is a longitude/latitude map of `thetao` at the depth level nearest to `chosen_depth`, read from the
    climatological mean stores or the percentile threshold stores found under `temperature_directory`.

    Parameters
    ----------
    baseline_name_arg : str
        Baseline identifier that ends the stored file names (e.g. "Baseline9322").
    folder_name_arg : str
        Regional folder name.
    sub_folder_name_arg : str, optional
        Sub-folder name; required for percentile thresholds, ignored for climatological means.
    custom_output_filename : str, optional
        Output path of the GIF; a descriptive file name is generated when omitted.
    show_climatological_means : bool
        True animates the climatological means; False animates the percentile thresholds.
    percentile : int
        Percentile used to build the `Thresh{percentile}th` directory name; must be non-zero for thresholds.
    chosen_depth : float
        Depth handed to `.sel(depth=..., method='nearest')` (same units as the stored depth coordinate).

    Returns
    -------
    matplotlib.animation.FuncAnimation
        The animation object, after it has been written to disk and its figure closed.

    Raises
    ------
    ValueError
        If thresholds are requested with percentile == 0 or without a sub_folder_name_arg.
    FileNotFoundError
        If no stored dataset matches the given arguments.
    """

    ## Check proper arguments are provided for threshold datasets
    if not show_climatological_means and percentile == 0:
        raise ValueError("Please set a non-zero numeric percentile (based on the percentile you used above in your percentile datasets)!")

    if not show_climatological_means and sub_folder_name_arg is None:
        raise ValueError("Please provide a valid sub_folder_name!")

    if show_climatological_means:
        sub_folder_name_arg = None
        percentile = 0


    ## Gather the stored dataset filepaths
    # NOTE(review): gather_missing_from_storage looks for climatology files under temperature_means_directory;
    # confirm that it is the same location as f'{temperature_directory}/Clim'.
    if show_climatological_means:
        data_type = "Clim"
        data_id   = f"{folder_name_arg}"
        data_path = f"{data_type}/{data_id}"
    else:
        data_type = f"Thresh{percentile}th"
        data_id   = f"{folder_name_arg}_{sub_folder_name_arg}"
        data_path = f"{data_type}/{folder_name_arg}/{data_id}"

    data_directory = f'{temperature_directory}/{data_path}'
    paths = glob.glob(f'{data_directory}/{data_id}_thetao*{baseline_name_arg}.zarr')

    # A quick check to ensure we have located files given our arguments
    if not paths:
        start_error = "No files found matching the pattern"
        cont_error = "\nPlease verify you inputted the proper baseline_name, folder_name, sub_folder_name, percentile, and show_climatological_means arguments!"
        raise FileNotFoundError(f"{start_error}:\n{data_directory}/{data_id}_thetao...300m...{baseline_name_arg}.zarr\n{cont_error}")


    ## Fill a dictionary where all (1 to 366) doys are matched with their corresponding filepaths
    doys_dict = {}

    for stored_path in paths:
        # Open and check what doys are in this file (the store is closed again when the with block ends)
        with xr.open_zarr(stored_path) as stored_ds:
            # In my earlier percentile datasets (calculated in the same manner), my normalized_doys were saved as variable doys instead.
            doy_coord_name = 'doy' if 'doy' in stored_ds.coords else 'normalized_doy'

            # atleast_1d handles both a single (0-d) value and an array of doys
            file_doys = np.atleast_1d(stored_ds[doy_coord_name].values)

        # Map each doy to its file (a later file replaces an earlier one for the same doy)
        for doy in file_doys:
            doys_dict[int(doy)] = stored_path


    ## Use the file of the first available doy and its features to set up the plot
    available_doys = sorted(doys_dict)
    setup_file = doys_dict[available_doys[0]]

    setup_ds = xr.open_zarr(setup_file).thetao
    if not show_climatological_means:
        setup_ds = setup_ds.drop_vars("quantile")

    # Quick fix for my personal, early datasets
    if 'doy' in setup_ds.coords:
        setup_ds = setup_ds.rename({'doy': 'normalized_doy'}).expand_dims('normalized_doy')

    # Depth selection; the depth actually matched is kept for the output filename
    setup_ds = setup_ds.sel(depth=chosen_depth, method='nearest')
    first_depth = round(setup_ds.depth.item(), 1)
    setup_ds = setup_ds.drop_vars("depth")

    lon = setup_ds.longitude.values
    lat = setup_ds.latitude.values

    # Check for single or multiple-doys in the setup dataset, and return the thetao data for just one (the first) doy
    if setup_ds['normalized_doy'].values.size == 1:
        # Single day file
        setup_ds   = setup_ds.drop_vars("normalized_doy").squeeze()
        setup_data = setup_ds.values
    else:
        # Multi-day file
        setup_data = setup_ds.isel(normalized_doy=0).values

    setup_ds.close()


    ## Initialize the plot
    fig, ax = plt.subplots(figsize=(14, 6),
                           subplot_kw={'projection': ccrs.Mercator()})

    pcm = ax.pcolormesh(
        lon, lat, setup_data,
        cmap='RdYlBu_r',
        vmin=-5, vmax=35,
        transform=ccrs.PlateCarree(),
    )

    ax.set_extent([0, 360, -30, 90], crs=ccrs.PlateCarree())
    ax.add_feature(cfeature.LAND, color='lightgray')
    ax.add_feature(cfeature.COASTLINE, linewidth=0.8)
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')

    title = ax.set_title('')
    title_base = '(Relative to 1993-2022)' if baseline_name_arg == 'Baseline9322' else f'({baseline_name_arg})'

    # The layout has to be tightened before the frames are rendered; doing it after saving changes nothing in the GIF
    fig.tight_layout()


    ## Animation function
    def animate(i):
        """Draw frame i: load the data of the i-th available doy and push it into the existing mesh."""
        doy = available_doys[i]
        frame_path = doys_dict[doy]

        # Load the dataset
        ds = xr.open_zarr(frame_path).thetao.sel(depth=chosen_depth, method='nearest').drop_vars("depth")

        # Quick fix for my personal, early datasets
        if 'doy' in ds.coords:
            ds = ds.rename({'doy': 'normalized_doy'}).expand_dims('normalized_doy')

        if not show_climatological_means:
            ds = ds.drop_vars("quantile")

        # Check if this is a single-day or multi-day file
        doy_values = ds['normalized_doy'].values

        # Load the data if available for a single doy or select the correct doy in a dataset
        if np.size(doy_values) == 1:
            frame_data = ds.values
        else:
            doy_idx = np.where(doy_values == doy)[0][0]
            frame_data = ds.isel(normalized_doy=doy_idx).values

        # Update the plot
        pcm.set_array(frame_data.ravel())
        title.set_text(f'{data_type} Day {doy} of the Year\n{title_base}')

        ds.close()
        return pcm, title


    ## Create the resulting animation
    type_message = "climatological means" if show_climatological_means else "percentile thresholds"
    print(f"Began animation for the {type_message} of the {data_id} datasets!")

    chosen_doys = len(available_doys)

    anim = animation.FuncAnimation(
        fig, animate,
        frames=chosen_doys,
        interval=200,
        blit=False,
        repeat=True
    )

    writer = animation.PillowWriter(fps=2)

    if custom_output_filename is None:
        output_filename = f"{data_id}_{data_type}_{baseline_name_arg}_{chosen_doys}_doys_total_with_depth_of_{first_depth}.gif"
    else:
        output_filename = custom_output_filename
    print(f"Saving animation at: {output_filename}")


    ## Save the resulting animation
    def print_frame_progress(current_frame, total_frames):
        """Overwrite one console line with the number of frames written so far."""
        print(f"\r → Doy (Frame) Processed: {current_frame + 1}/{total_frames}", end='', flush=True)

    anim.save(output_filename, writer=writer, dpi=100,
              progress_callback=print_frame_progress)

    plt.close(fig)
    print(f"\nAnimation finished and saved!\n")

    return anim

# ------------------------------------------------------------------------------------------------------------------------------

# Examples for creating the animations: sub-folders to animate the percentile thresholds for
perc_anims_to_make = ["All"]

# Climatological mean example (uncomment to run; the function ignores the sub-folder and percentile for means):
# check_processed_datasets_with_an_animation("Baseline9322", "Atlantic",
#                                            show_climatological_means=True,
#                                            chosen_depth=300)

# One 90th-percentile threshold animation per listed "Atlantic" sub-folder
for sub_folder in perc_anims_to_make:
    check_processed_datasets_with_an_animation(
        baseline_name_arg="Baseline9322",
        folder_name_arg="Atlantic",
        sub_folder_name_arg=sub_folder,
        show_climatological_means=False,
        percentile=90,
        chosen_depth=300,
    )
