In [None]:
''' Main Required Dependencies ''';
import xarray as xr
import numpy as np

''' Optional Dependencies ''';
from datetime import datetime                    # For a ~30 year period check function (Optional)
from dateutil.relativedelta import relativedelta # For a ~30 year period check function (Optional)
from dask.diagnostics import ProgressBar # To download processed datasets with a progress bar (Optional but HIGHLY recommended!)

''' Additional Optional Memory Monitoring Dependencies (memory monitoring is optional but HIGHLY recommended) ''';
import psutil 
import threading
import time
import os

''' Additional Optional Dependencies for the File Validation, Verification, and Animation Section ''';
import glob # For loading many datasets at once (required to produce an animation)
import matplotlib.pyplot as plt # Required to produce an animation
import matplotlib.animation as animation # Required to produce an animation
import cartopy.crs as ccrs # For the chosen map projection (in animations)
import cartopy.feature as cfeature # To add land features and coastlines (Optional if you use something else)

<div style="background-color: #FFE099; padding: 10px; border: 3px solid #FFC233; text-align: center; font-family: Georgia, serif; font-weight: bold; white-space: pre;">||| - - - - - - - - - - - - - - - - - - - - - - - <|       SCRIPT NOTES       |> - - - - - - - - - - - - - - - - - - - - - - - |||</div>

<div style="background-color: #EFFAFA; border: 2px solid #A2E2E2; font-family: Georgia, serif; padding: 10px">
    <div style="text-align: center;">
        This is the script to <strong>process percentile thresholds</strong> and <strong>climatological means</strong> from your temperature data (downloads not performed here).
    </div>
    <br>&#x27A1;&#xFE0E; This code calculates climatological means and percentile thresholds for 366 unique days of the year (including February 29) by default, using all available data. 
    <br>&#x27A1;&#xFE0E; Default constants and settings are provided to show an example of the usage of this code to (ultimately) detect marine heatwaves based closely on Hobday et al. (2016).
    <br>&#x27A1;&#xFE0E; A fixed 30-year historical baseline period is used to calculate climatological means and percentiles by default. Adjust as need be (in the script-wide constants section).  
    <br>&#x27A1;&#xFE0E; You can duplicate this script and run its copies simultaneously to download more percentiles/means at once (if need be).
    <br>&#x27A1;&#xFE0E; If unusual or unexpected errors appear after attempting to run a cell again, restart the kernel!
    <br>&#x27A1;&#xFE0E; If you save many files, you may compare their file sizes to see if any should be removed and redownloaded. Partially saved files will often have drastically different file sizes.
    <br>&#x27A1;&#xFE0E; Lastly, in this script, constants define your root directory and output directories; you may tweak these and any constants 
    (and code) as you see fit to suit your needs!
    <br><br>
</div>

<div style="color:#642CA9; padding: 10px; text-align: center; font-family: Georgia, serif; font-weight: bold; white-space: pre;">°º¤ø,¸¸,ø¤º°`°º¤ø,¸,ø¤°º¤ø,¸¸,ø¤º°`°º¤ø,¸    IMPORTANT SCRIPT-WIDE CONSTANTS AND FUNCTIONS    °º¤ø,¸¸,ø¤º°`°º¤ø,¸,ø¤°º¤ø,¸¸,ø¤º°`°º¤ø,¸
</div>

In [None]:
# Script-wide configuration: percentile choice, baseline period, filename components, and
# input/output directories. Later cells read these module-level names directly.
''' - - - Constants set up - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ''';
percentile = 90             # Desired (temperature) percentile threshold. 90 here indicates the 90th percentile.
minutes_per_mem_update = 10 # For memory monitoring (optional). Minutes (roughly) per memory update (to keep track of its use and avoid crashing/issues).
memory_monitoring = True    # Must be either False or True. Decide whether you want to use the optional memory monitoring feature (True) or not.

''' - - - Historical baseline period setup - - - - - - - - - - - - - - - - - - - - - - - - - - - - ''';
## Pick a FIXED time period whose data you will use to calculate the climatological means/percentiles
baseline_period_slice_choice = slice('1994-01-01', '2023-12-31') # Included time period for mean/percentile calculations
baseline_folder_name = "Baseline9423" # Custom identifier (for you to set based on the chosen baseline period) for your saved means/percentiles

''' - - - Percentile/mean dataset filename/file path setup - - - - - - - - - - - - - - - - - - - - ''';
# This is the default file path convention used here (all the * variables are determined automatically later in the code):
# {region_id_folder_name}_{custom_id}_sst_{*Data Type}_subset_{*min Day of the Year}_to_{*max Day of the Year}_{baseline_folder_name}.zarr
# Output example: Full_fgd_sst_thresh_subset_1_to_366_Baseline9423.zarr

# Note: Please avoid using "." in any of the following custom identifier constants, as it hurts file readability (the same goes
# for any other special characters that could hurt file readability, such as "*" and "/").

# *Data Type is set to be either "clim" or "thresh" for each processed dataset, short for either climatological means or percentile thresholds.
# Instead of saving datasets with "clim" or "thresh", you may change these data type identifiers here.
climatological_means_id = "clim"
percentile_threshold_id = "thresh"

# These are the customizable filename variables; adjust as needed.
region_id_folder_name = "Full" # A "regional" identifier to save the processed datasets with. Example: "North_Atlantic", "NA_top", "N_Atl_1", etc.
custom_id = "fgd"              # A custom identifier to further distinguish the saved data with. You can leave this empty as "".
final_custom_id = f"{custom_id}_" if custom_id != "" else "" # If you set a custom_id (not left as ""), then it adds a "_" to it for the filename.

# These are the components of the filename convention used to produce datasets named similarly to the output example above. Tweak as desired.
# The components are joined with "_" where the full filename is assembled, so none of them ends with an underscore.
processed_filename_start = f"{region_id_folder_name}_{final_custom_id}sst" # example component output: "Full_fgd_sst"
processed_filename_end = f"{baseline_folder_name}.zarr"                    # example component output: "Baseline9423.zarr"
processed_filename_perc_id = f"{percentile_threshold_id}_subset"            # example component output: "thresh_subset"
processed_filename_clim_id = f"{climatological_means_id}_subset"            # example component output: "clim_subset"

''' - - - Root and output directory setup - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ''';
# These are the default directory conventions used here:
# Root directory: {my_root_directory}/
# Raw data directory: {my_root_directory}/{raw_data_folder_name}
# Climatological mean data directory: {my_root_directory}/{clim_data_folder_name}
# Percentile threshold data directory: {my_root_directory}/{perc_data_folder_name}
# In other words, within your root directory, you will have your raw data, climatological mean, and chosen percentile threshold dataset folders.

# Additionally, the climatological mean and percentile threshold directories each contain a folder named after your baseline_folder_name
# variable, which in turn contains a folder named after region_id_folder_name. All the dataset outputs from this script are stored there.
# You can tweak this setup and the output directories.
# Final output path example (with the defaults below):  /d0/Thresh90th/Baseline9423/Full/Full_fgd_sst_thresh_subset_1_to_366_Baseline9423.zarr

my_root_directory = "/d0" # Should be your root directory, from which you access your data and to which you will save data.
raw_data_folder_name = "Data"
clim_data_folder_name = "Clim"
perc_data_folder_name = f"Thresh{percentile}th"
raw_data_directory = f"{my_root_directory}/{raw_data_folder_name}"
clim_data_directory = f"{my_root_directory}/{clim_data_folder_name}/{baseline_folder_name}/{region_id_folder_name}"
perc_data_directory = f"{my_root_directory}/{perc_data_folder_name}/{baseline_folder_name}/{region_id_folder_name}"
## OPTIONAL: You can remove "/{region_id_folder_name}" from the clim and perc data directories if you do not plan
#            on subsetting the desired region of interest into multiple subregions.

''' - - - Print (verification) statements - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ''';
print("CHOSEN CONSTANTS:\n")
print(f"Percentile threshold:                {percentile}th percentile")
print(f"Baseline slice object:               {baseline_period_slice_choice}")
print(f"Corresponding baseline identifier:   {baseline_folder_name}")
print(f"Memory monitoring:                   {memory_monitoring}\n")

print(f"Chosen root directory:                  {my_root_directory}")
print(f"Chosen raw data directory:              {raw_data_directory}")
print(f"Chosen output directory (means):        {clim_data_directory}")
print(f"Chosen output directory (percentiles):  {perc_data_directory}\n")

# Built the same way as the climatological-means filename, with the full 1-366 doy range filled in.
example_full_file_name = f"{processed_filename_start}_{processed_filename_clim_id}_1_to_366_{processed_filename_end}" # just an example
print(f"Example output filename:           {example_full_file_name}")
print(f"Example output file path (full):   {clim_data_directory}/{example_full_file_name}\n")

In [None]:
## REQUIRED FUNCTIONS
''' ------------------------------------------------------------------------------------------------ '''
''' Function to normalize the unique day of the year value of each observed day in the format: 1-366 '''
''' ------------------------------------------------------------------------------------------------ '''

def normalize_dayofyear(time_coord):
    '''
    Convert each timestamp's day of the year to a 1-366 numbering in which March 1 is
    always day 61, whether or not the year is a leap year.

    time_coord: A time coordinate exposing the ".dt" accessor (".dt.dayofyear" and
                ".dt.is_leap_year" are read from it).

    Returns the result of xr.where: the original day of the year for leap years and for
    any day before raw day 60, and the day of the year plus one otherwise.
    '''
    raw_doy = time_coord.dt.dayofyear
    leap_year_mask = time_coord.dt.is_leap_year

    # In a non-leap year, raw day 60 is March 1. Every day from there onward is pushed
    # forward by one so that it lines up with the leap-year numbering; as a consequence,
    # non-leap years simply have no day 60 (the February 29 slot) by design.
    shift_needed = (~leap_year_mask) & (raw_doy >= 60)
    shifted_doy = raw_doy + 1

    return xr.where(shift_needed, shifted_doy, raw_doy)


''' ------------------------------------------------------------------------------------- '''
''' Function to just save the percentile threshold/climatological mean dataset to storage '''
''' ------------------------------------------------------------------------------------- '''
def save_dataset_to_storage(show_debug_arg, single_download_arg, 
                            current_percentile=None, start_val_arg=None, end_val_arg=None, 
                            ds_to_save=None):
    '''
    Subset a climatological mean / percentile threshold dataset to a day-of-year (doy)
    range, build its output path from the script-wide filename and directory constants,
    and write it to a Zarr store (or stop before writing when debugging).

    FUNCTION ARGUMENTS AND THEIR EXPLANATIONS
    show_debug_arg:      If set to True, only shows the output file path and dataset, then raises
                         a ValueError instead of saving anything. It must be False for the dataset
                         to be saved.

    single_download_arg: Accepted for the caller's use (the inputted single_download variable of the
                         main function). It is not read inside this function, so any "stop after one
                         dataset" behaviour has to be handled by the caller.

    current_percentile:  Either the chosen percentile from the main function or None.
                         If it remains None, the saved dataset is treated as a climatological means
                         dataset rather than a percentile threshold dataset, and the filename and
                         output directory are adjusted accordingly.

    start_val_arg:       The lower (minimum) bound for the desired unique day of the year (doy) data
                         to save (within the 1 to 366 range).

    end_val_arg:         The upper (maximum) bound for the desired unique day of the year (doy) data
                         to save (within the 1 to 366 range).

    ds_to_save:          The inputted mean/percentile dataset from the main function to be saved. Only
                         the doys between AND including start_val_arg and end_val_arg are saved.
                         Example: if start_val_arg = 1 and end_val_arg = 30, doys 1 to 30 are saved.

    RAISES
    ValueError:          When show_debug_arg is True (this marks the end of the debug preview).
    Any exception raised while writing the Zarr store is propagated to the caller.
    '''
    # Local import so this function stays self-contained; nullcontext is the no-op
    # stand-in used when the optional ProgressBar dependency is unavailable.
    from contextlib import nullcontext

    print("Starting doy for current subset: ", start_val_arg)
    print("Ending doy for current subset: ", end_val_arg, '\n')

    # The full 1-366 range needs no subsetting; anything else is selected by label
    # along the 'normalized_doy' coordinate (both bounds inclusive).
    if start_val_arg == 1 and end_val_arg == 366:
        final_ds_to_save = ds_to_save
    else:
        final_ds_to_save = ds_to_save.sel({'normalized_doy': slice(start_val_arg, end_val_arg)})
    
    desired_doy_period = f"{start_val_arg}_to_{end_val_arg}" # For use in the filename
    
    # Pick the filename component and output directory for either climatological means
    # or percentile thresholds.
    if current_percentile is None: # climatological means
        print(f"Current climatological means subset to save:\n{final_ds_to_save}\n")
        file_name = f"{processed_filename_start}_{processed_filename_clim_id}_{desired_doy_period}_{processed_filename_end}"
        filepath = f'{clim_data_directory}/{file_name}'
    else: # percentile thresholds
        print(f"Current {current_percentile}th percentile subset to save:\n{final_ds_to_save}\n")
        file_name = f"{processed_filename_start}_{processed_filename_perc_id}_{desired_doy_period}_{processed_filename_end}"
        filepath = f'{perc_data_directory}/{file_name}'
        
    print("Filepath of subset: ", filepath)

    # In debug mode we stop here, before anything is written.
    if show_debug_arg:
        raise ValueError('End of debug. Proceed with the setting "show_debug = False" to start saving the dataset.')

    # Only a missing ProgressBar (the optional dependency was never imported) triggers the
    # fallback. A failure during the write itself is NOT caught: silently retrying would hide
    # the real error and repeat the whole (potentially very long) computation.
    try:
        progress_context = ProgressBar()
    except NameError:
        progress_context = nullcontext()

    with progress_context:
        # mode='w' replaces any existing store at this path.
        final_ds_to_save.to_zarr(filepath, mode='w', consolidated=True)
    
    print(f"Saved subset: doys {start_val_arg} to {end_val_arg}\nMoving on!\n")
    print("---------------------------------------------------------------------------------------------------------")

print("Required functions loaded!\n")

In [None]:
## OPTIONAL
''' ------------------------------------------------------------------------------- '''
''' Function to check whether the inputted time period is at least roughly 30 years '''
''' ------------------------------------------------------------------------------- '''

def rough_30_year_period_check(time_slice, tolerance=0.01):
    '''
    Estimate how many years a baseline slice spans and whether that is about 30.

    time_slice: A slice whose start and stop are date strings in the 'YYYY-MM-DD' format.
    tolerance:  The largest allowed absolute difference (in years) from 30.

    Returns a tuple of (the estimated span in years, True if that span is within
    the tolerance of 30 years and False otherwise).
    '''
    date_format = '%Y-%m-%d'
    period_start, period_end = (
        datetime.strptime(bound, date_format)
        for bound in (time_slice.start, time_slice.stop)
    )

    # relativedelta splits the gap into whole years, months, and days; the months and
    # days are converted into fractions of a year to get a single approximate figure.
    span = relativedelta(period_end, period_start)
    total_years = span.years + span.months / 12 + span.days / 365.25

    within_tolerance = abs(total_years - 30) <= tolerance
    return total_years, within_tolerance

# Run the rough 30-year check on the chosen baseline slice and stop the cell early if it fails.
total_time, is_time_slice_30_years = rough_30_year_period_check(baseline_period_slice_choice)

if not is_time_slice_30_years:
    error_message = "Please check that your chosen baseline time slice covers a 30 year period."
    # error_message already ends with a period, so none is added before the line break.
    # The spaces after the line break indent the second line of the raised message.
    raise ValueError(f"{error_message}\n            The chosen slice covers roughly {total_time} years.")
else:
    period_checked = f"{baseline_period_slice_choice.start} to {baseline_period_slice_choice.stop}"
    check_pass_msg_1 = f"The chosen baseline period of {period_checked} has been identified as roughly covering a 30 year period." 
    print(f"{check_pass_msg_1} Do ensure this is the case separately.\n")

In [None]:
# OPTIONAL
''' ----------------------------------------------- '''
''' Function to keep track of and show memory usage '''
''' ----------------------------------------------- ''' 

if memory_monitoring:
    def monitor_memory(interval_minutes=5, log_file=None):
        '''
        Repeatedly print (and optionally log) this process's memory use until the
        module-level flag stop_monitoring becomes truthy.

        interval_minutes: Roughly how many minutes to wait between memory reports.
        log_file:         Optional path; when given, each report is also appended to this
                          file with a timestamp.

        Note: stop_monitoring is read as a global and is not defined in this cell; it must
        exist before this function runs.
        '''
        interval = interval_minutes * 60  # seconds between reports
        current_process = psutil.Process(os.getpid())
        
        while not stop_monitoring:
            mem = current_process.memory_info().rss / (1024**3)  # in GB
            print(f" | Memory usage: {mem:.2f} GB | Memory: {psutil.virtual_memory().percent}% used | ")
            
            if log_file:
                with open(log_file, 'a') as f:
                    f.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')}: {mem:.2f} GB\n")

            # Wait for the next report in short slices rather than one long sleep, so the
            # loop notices stop_monitoring within about a second instead of only after a
            # full interval has elapsed.
            next_report_time = time.monotonic() + interval
            while not stop_monitoring:
                remaining = next_report_time - time.monotonic()
                if remaining <= 0:
                    break
                time.sleep(min(1.0, remaining))
    print("Memory function loaded!")
    
else:
    # Remove a definition left over from an earlier run so that memory monitoring is
    # genuinely unavailable when it has been switched off.
    if "monitor_memory" in globals():
        del monitor_memory
    
    print('You have chosen to not use memory monitoring. If this was a mistake, update the constant "memory_monitoring" to True!')

<div style="color:#CD6600; padding: 10px; text-align: center; font-family: Georgia, serif; font-weight: bold; white-space: pre;">°º¤ø,¸¸,ø¤º°`°º¤ø,¸,ø¤°º¤ø,¸¸,ø¤º°`°º¤ø,¸    LOADING FULL OBSERVED GLOBAL DATASETS    °º¤ø,¸¸,ø¤º°`°º¤ø,¸,ø¤°º¤ø,¸¸,ø¤º°`°º¤ø,¸
</div>

In [None]:
# Open every matching raw-data file as one lazily loaded dataset, then cut it down to the
# chosen latitude/longitude box and the "sst" variable.
''' - - - Full dataset loading (from the set raw data directory) - - - - - - - - - - - - - - - - - - - - - - - ''';
# Collect all applicable raw data datasets
raw_data_files = f'{raw_data_directory}/sst*.nc' # Adjust to identify your saved files as needed.
#                                                  The * allows us to later grab all the datasets that end with ".nc" and start with "sst" here.

# Set any latitude and longitude bounds for your processed means/percentiles and raw data datasets to use, if desired
# NOTE(review): these slices are applied by label below, which presumably expects "lat" to be stored in
# ascending order and "lon" to use a 0-360 convention -- confirm against your files.
lat_bounds = slice(-15, 90) # latitude bounds
lon_bounds = slice(0, 360)  # longitude bounds

# Load the raw temperature datasets as one with xarray
ds = xr.open_mfdataset(
    raw_data_files,             # Glob pattern (the * grabs all datasets)
    parallel=True,              # Enable parallel file opening
    chunks='auto',              # Let dask choose optimal chunking
    combine='by_coords',        # Merge based on coordinate values
    engine='h5netcdf')          # Specify the engine (may not work with the wrong engine based on your dataset's format)

# The coordinate names "lat"/"lon" and the variable name "sst" are hard-coded here; rename them to match your dataset.
full_ds = ds.sel(lat=lat_bounds, lon=lon_bounds).sst # Make sure to access the appropriate desired variable if it is not called "sst"
print("Full raw data dataset:\n", full_ds, '\n')

# Make sure to perform any further desired filtering/cleaning here.

In [None]:
# Rechunk the subsetted raw data along latitude and longitude before any calculations.
''' - - - Rechunk the raw data dataset - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ''';
# Only "lat" and "lon" chunk sizes are specified here; no chunk size is given for any other dimension.
optimal_chunking = {'lat': 210, 'lon': 160}
# These values were chosen as they were below 300 and divided the observations of my full dataset neatly.
# For instance, there were 420 latitude observations and 1440 longitude observations in my loaded dataset. 420/210 = 2 and 1440/160 = 9.
# It is recommended you check what kind of chunk sizes are best for your system and dataset for efficient saving/storage.

temps_full = full_ds.chunk(optimal_chunking)
print("Final (properly chunked) raw data dataset:\n", temps_full)

<div style="color:#104E8B; padding: 10px; text-align: center; font-family: Georgia, serif; font-weight: bold; white-space: pre;">º°`°º¤ø,¸,ø¤°º¤ø,¸¸,ø¤º°`°º¤ø,¸    CALCULATING CLIMATOLOGICAL MEANS AND PERCENTILE THRESHOLDS    °º¤ø,¸¸,ø¤º°`°º¤ø,¸,ø¤°º¤ø,¸¸,ø</div>

In [None]:
''' ----------------------------------------------------------------- '''
''' Function to actually calculate the percentile threshold dataset to storage '''
''' ----------------------------------------------------------------- '''

# Function to calculate the percentile thresholds (or climatological means) for each unique day of the year
def calculate_temp_thresh_or_clim_given_a_percentile(temp_data, baseline_slice, 
                                                     optimal_chunks, window_half_width=5, 
                                                     minutes_per_memory_update=5, chosen_percentile=None, 
                                                     start_chunking_doy=1, end_chunking_doy=366, doy_batch_size = None, 
                                                     show_debug=True, perform_memory_monitoring=False, 
                                                     single_download=False):     
    '''
    FUNCTION ARGUMENTS AND THEIR EXPLANATIONS
    show_debug:                Setting this to True will go through all the steps up to generating the final percentile 
                               threshold or climatological means dataset output. Setting this to False will proceed with 
                               calculating and saving the desired dataset on your system based on your specified output 
                               directories (from the constants section near the top of the script).
                               
    temp_data:                 Your FULL inputted temperature dataset (loaded in the temperature dataset 
                               loading section before).
    
    baseline_slice:            Your set baseline period slice object (the baseline_period_slice_choice).
    
    optimal_chunks:            The chunks you want to use on your output dataset. They may be the same as the 
                               chunks used for the inputted raw data dataset, following a similar setup: 
                               {'latitude variable name': ###, 'longitude variable name': ###}. The chunks 
                               could be different if desired, and should be if you decide to tweak the code to 
                               modify your output dataset before saving it. Such a feature is not yet 
                               implemented here, but could be done through a modification like: 
                               window_data = window_data.sel(latitude=slice(-3, 0)), for instance.

    window_half_width:         By default, the window half-width is set to 5. This means that the data of the 
                               5 days before and after each unique day of the year are used to calculate the
                               means and percentiles of each "central" unique day of the year. 
                               For example, when show_debug is set to True, for the unique day of the year 1:
                               Window Doys:  [362, 363, 364, 365, 366, 1, 2, 3, 4, 5, 6] 
                               Accordingly, the data from the 5 days before and after day 1 (and day 1's data) 
                               are used to calculate the means or percentiles of day 1. This data window shifts 
                               to 363 to 366 and 1 to 7 for the 2nd unique day of the year, and so on.
                               
    minutes_per_memory_update: The desired number of minutes per each memory usage update from the optional
                               monitor_memory() function. Make sure the appropriate dependencies listed at the top
                               of this script are installed and active, and that the monitor_memory() function is loaded.
                               Reaching 100% memory use will cause your Jupyterhub server to crash and require a restart. 
                               You may avoid this by saving smaller datasets (such as by saving less days of the year in 
                               a single dataset and performing more spatial subsetting of larger regions of interest).
    
    chosen_percentile:         Must be set to a desired percentile (like the defined percentile constant) or None. Setting
                               a numeric percentile (greater than 0 and less than 100) will calculate percentile thresholds,
                               while setting chosen_percentile to None will calculate climatological means instead.
    
    start_chunking_doy:        When batch saving is disabled (doy_batch_size is set to None), start_chunking_doy sets the 
                               earliest possible unique day of the year (doy) from the full (1-366) climatological
                               mean/percentile threshold dataset to be saved (up to the final doy set by end_chunking_doy). 
                               Essentially, with batch saving disabled, you save only the doy interval you set from 
                               start_chunking_doy to end_chunking_doy. 
                               
                               When batch saving is enabled (doy_batch_size is set to a number), start_chunking_doy is used
                               to identify the earliest possible doy batch to be saved out of a pre-determined set of
                               possible doy batches determined by your doy_batch_size (see doy_batch_size for details). 
                               If you had previously saved a doy batch like 21 to 40 (with a doy_batch_size of 20 doys), for 
                               the same doy_batch_size, using any of the numbers present within the 21 to 40 interval will
                               redownload the same doy batch (of doys 21 to 40); accordingly, setting start_chunking_doy to 
                               41 will proceed to save the next batch (of doys 41 to 60) and setting start_chunking_doy to 1 
                               will proceed to save the earlier, initial batch (of doys 1 to 20).
    
    end_chunking_doy:          When batch saving is disabled (doy_batch_size is set to None), end_chunking_doy sets the 
                               latest possible unique day of the year (doy) from the full (1-366) climatological
                               mean/percentile threshold dataset to be saved up to (from the initial doy set by 
                               start_chunking_doy). Essentially, with batch saving disabled, you save only the doy 
                               interval you set from start_chunking_doy to end_chunking_doy.

                               When batch saving is enabled (doy_batch_size is set to a number), end_chunking_doy is used
                               to identify the latest possible doy batch to be saved out of a pre-determined set of
                               possible doy batches determined by your doy_batch_size (see doy_batch_size for details).
                               
                               When saving more than one dataset (single_download is set to False), batches of doys will
                               continue to be saved (in independent files named by default based on the doys they contain,
                               like 21_to_40, 41_to_60, and so on, for a batch size of 20 and similarly for other batch sizes)
                               until the batch with the end_chunking_doy is reached. For instance, for a batch size of 20, 
                               if the start_chunking_doy is set to 1 and the end_chunking_doy is set to 100, 20-doy batches 
                               will be saved until the batch that contains 100, the 81_to_100 batch, is reached; no other
                               batches would be downloaded. 
                               
                               If only one dataset is being saved (single_download is set to True), end_chunking_doy can 
                               be any number after the start_chunking_doy, as only the earliest batch detected to have the 
                               start_chunking_doy will be saved. 

    doy_batch_size:            When this is set to False, only the unique day of the year (doy) interval you set from 
                               start_chunking_doy to end_chunking_doy will be saved. When doy_batch_size is set to 
                               any number between 1 and 366, you enable batch saving. This means that the percentile/mean
                               data of the 366 total doys (by default) are automatically split into consecutive batches that
                               contain the data of (at most) doy_batch_size doys. For instance, for a doy_batch_size of 20, 
                               one may save the full 1-366 mean/percentile dataset in separate, smaller files that contain 
                               the data of 20 doys, such as doys 1 to 20, 21 to 40, 41 to 60, 341 to 360, and the smallest 
                               final batch of 361 to 366. You can set show_debug to True to see how saving differs between 
                               using and not using batch saving, as well as batch saving with different doy_batch_size batches.
                               
    perform_memory_monitoring: If set to True, this will allow the use of the optional memory monitoring feature, where memory 
                               use every minutes_per_memory_update minutes (roughly) is printed out. Of course, the required memory 
                               monitoring function (in the script-wide constants and functions section) must be run and its 
                               dependencies installed and loaded for it to work; if they are not, the code safely proceeds without 
                               memory monitoring (when it is set to True). 
                               
                               Setting perform_memory_monitoring to False will disable the optional memory monitoring feature. 
                               This feature may be helpful in allowing you to identify if your system is close to using up all 
                               (100%) of the available memory based on your chosen settings/constants/inputs, (which ought 
                               to be avoided, as this would crash your Jupyterhub environment, requiring a restart).
    
    single_download:           If set to True, this will ensure only the (earliest) specified dataset is downloaded 
                               (see doy_batch_size and start_chunking_doy). If set to False, the forceful termination of 
                               possibly saving any further datasets at the end of the script is ignored (only for batch 
                               saving; see doy_batch_size).
    ''';
    
    if show_debug:
        debug_message_1 = "You have set show_debug to true; this will show how the percentiles/means are processed based on your inputted arguments \nand provide a preview of the output.\n"
        debug_message_2 = "\nIf you are satisfied with the output (and your arguments), compute and save the calculated percentiles/means by setting\nshow_debug to false.\n"
        print(debug_message_1, debug_message_2)
    
        print("---------------------------------------------------------------------------------------------------------")
        print("Part 0: Running a few quick error checks for the provided arguments!")
        print("---------------------------------------------------------------------------------------------------------\n")
              
    # Checking start and end bounds
    if start_chunking_doy < 1 or end_chunking_doy > 366:
        raise ValueError("Please provide a start_chunking_doy that is ≥ 1 and an end_chunking_doy that is ≤ 366.")
        
    if end_chunking_doy < start_chunking_doy: ## bug in other code, this is named chunk_end, not explicitly called.
        raise ValueError("Please make sure your end_chunking_doy is greater than your start_chunking_doy; these are your dataset processing bounds.")
        
    # Establishing if we calculating percentiles or means 
    calculate_means = True if (chosen_percentile == None) else False    
    
    # Checking if we are performing batch-saving (or saving everything all at once)
    if doy_batch_size == None: # We are saving everything at once, no need to batch save
        batch_saving = False
        doy_batch_size = end_chunking_doy - start_chunking_doy + 1 # Setting this to the output day of the year coordinate (size) value 
        
    else:
        batch_saving = True # We are not saving everything at once but want to do so in batches!
        
    
    if show_debug: 
        print("All clear!\n")
        print("---------------------------------------------------------------------------------------------------------")
        print("Part 1: Assign normalized unique day of the year (doy) values to the sliced observation dataset")
        print("---------------------------------------------------------------------------------------------------------\n")
    
    print(f"Chosen baseline slice: {baseline_slice}")
    print(f"Chosen window half-width: {window_half_width}")
    print(f"(This means we use the data in the {window_half_width} days before and after each day of the year (doy) to calculate the means/percentiles of each doy.)", '\n')
    
    if calculate_means:
        print(f"Calculating means in batches of (at most) {doy_batch_size} doys between {start_chunking_doy} and {end_chunking_doy}.\n")
    else:
        percentile_used = chosen_percentile/100
        print(f"Final percentile used (in calculations): {percentile_used} ({chosen_percentile}th percentile)", '\n')
        print(f"Calculating percentiles in batches of (at most) {doy_batch_size} doys between {start_chunking_doy} and {end_chunking_doy}.\n")
    
    
    # Doy values for specific dates (for later)
    feb28_doy = 59
    feb29_doy = 60
    mar1_doy = 61
    
    ## Extracting baseline period data
    temp_baseline_data = temp_data.sel(time=baseline_slice)
    if show_debug: print("Original Temperature Baseline Period Data: ", '\n', temp_baseline_data, '\n')
   
    # Assigning normalized doy values to the baseline period dataset
    temp_norm = temp_baseline_data.assign_coords(
        normalized_doy=('time', normalize_dayofyear(temp_baseline_data.time).data))
    if show_debug: print("Temperatures with Normalized Unique Days of the Year (1-366): ", '\n', temp_norm, '\n')
        
    '''
    # Totally optional debug here: show ALL normalized day of the year (doy) values;
    # all years are in the 366-day format, with some missing day 60 (feb 29)
    with np.printoptions(threshold=np.inf):
        print(temp_norm.normalized_doy.values) 
    '''
    
    if show_debug: 
        print("---------------------------------------------------------------------------------------------------------")
        print("Part 2: Get the actual doy values of the baseline period data (should be 1 - 366)")
        print("---------------------------------------------------------------------------------------------------------\n")
    unique_doys = np.unique(temp_norm.normalized_doy.data)
    unique_doys = unique_doys[~np.isnan(unique_doys)]  # Remove any NaN values
    unique_doys = unique_doys.astype(int)  # Ensure integer day-of-year values
    if show_debug: print(f"Found {len(unique_doys)} unique day-of-year values!")
    if show_debug: print("Unique doys:", '\n', unique_doys, '\n')

         
    global stop_monitoring
    if show_debug: 
        choice_message = "climatological mean" if calculate_means else "percentile threshold"
        print("---------------------------------------------------------------------------------------------------------")
        print(f"Part 3: Calculate the desired {choice_message} data for the desired doy(s).")
        print("---------------------------------------------------------------------------------------------------------\n")
        stop_monitoring = True # We don't want to start showing memory use.

        if perform_memory_monitoring:
            msgmem = "Please ensure the memory monitoring functions are run and their dependencies are installed."
            print(f"Memory monitoring is enabled. When saving the dataset, the memory usage will be displayed!\n{msgmem}\n")
        else:
            mem1msg = "The optional memory monitoring feature is disabled"
            mem2msg = "To display memory usage:"
            mem3msg = "set perform_memory_monitoring to True and the desired minutes_per_memory_update after running the memory functions."
            print(f"{mem1msg}. {mem2msg}\n{mem3msg}\n")
    else:
        # We start monitoring here so that it only runs once
        stop_monitoring = False # We do want to start showing memory use.
        
        if perform_memory_monitoring:
            try:
                monitor_thread = threading.Thread(target=monitor_memory, kwargs={'interval_minutes': minutes_per_memory_update})
                monitor_thread.daemon = True
                monitor_thread.start()
            except Exception as e:
                msgafter = "Proceeding without memory monitoring until the issues are resolved!"
                print(f"The required memory monitoring functions were not run and/or its required dependencies are missing.\n{msgafter}\n") 
        else:
            print("Proceeding without monitoring memory!\n")
        
    # Error messages for later
    chunk_message_finished = "Set chunk end reached!\n"
    no_feb29_possible_warning = "WARNING: Cannot interpolate Feb 29; missing Feb 28 or Mar 1 data!\n"
    
    # Initialize a dictionary for the climatological means or percentile thresholds
    seas_dict = {}
        
    # Bool for debug purposes
    shown_once = False
 
    # Loop for doys 1 - 366 (excluding Feb 29, doy 60)
    for doy in unique_doys:
        # We skip February 29th (to interpolate later)
        if doy == feb29_doy:  
            continue # Note: doy 60 data is still used within the appropriate window_data when available
        
        # Create window around this DOY
        window_doys = []
        
        for w in range(-window_half_width, window_half_width + 1):
            target_doy = doy + w
            
            if show_debug and not shown_once: 
                print("Day of the year: ", doy, "| Target Window Index: ", w, "| Target Window Value: ", target_doy)

            # Handle year wraparound properly
            if target_doy < 1:
                target_doy += 366
            elif target_doy > 366:
                target_doy -= 366
            
            # Handle year boundaries by keeping only valid doys
            if target_doy in unique_doys:
                window_doys.append(target_doy)
            
            if show_debug and not shown_once: print("Window Doys: ", window_doys, '\n')

        # Now, we select the data for this window
        window_data = temp_norm.where(temp_norm.normalized_doy.isin(window_doys), drop=True)
        
        if show_debug and not shown_once: print(f"Final window data for doy {doy} from the baseline period dataset: ", '\n', window_data, '\n')
        
        # Now, we calculate the percentile threshold/climatological mean across the time dimension
        if window_data.time.size > 0:
            # We calculate the climatological mean if that is what is desired
            if calculate_means:
                doy_to_save = window_data.mean(dim = 'time', skipna = True).expand_dims(normalized_doy=[doy])
            # Otherwise, we calculate the percentile for a doy 
            else:
                doy_to_save = window_data.chunk({'time':-1}).quantile(percentile_used, dim='time', skipna=True).expand_dims(normalized_doy=[doy])
            seas_dict[doy] = doy_to_save

            if show_debug and not shown_once:
                message_type = "climatological means" if calculate_means else "percentile thresholds"
                print("---------------------------------------------------------------------------------------------------------")
                print(f"Part 4: Store the {message_type} across all doys in an empty dictionary!")
                print("---------------------------------------------------------------------------------------------------------\n")

                first_part = f"Dictionary presently updated for doy {doy} with the time-averaged final window dataset in Part 3."
                print(f"{first_part}\n\nDictionary entry:\n", seas_dict[doy], '\n')
                shown_once = True  
                
    # After the for loop over the 1-366 day of the year range, we handle February 29th using linear interpolation
    if (feb29_doy in unique_doys):
        # If we have a dictionary with our percentiles/climatologies...
        if feb28_doy in seas_dict and mar1_doy in seas_dict:
            feb_28_ds = seas_dict[feb28_doy].squeeze().drop_vars('normalized_doy')
            mar_1_ds = seas_dict[mar1_doy].squeeze().drop_vars('normalized_doy')
            seas_dict[feb29_doy] = 0.5 * (feb_28_ds + mar_1_ds)
            seas_dict[feb29_doy] = seas_dict[feb29_doy].expand_dims(normalized_doy=[feb29_doy])
            if show_debug: print("Interpolated February 29 dataset (doy 60) in the dictionary:\n", seas_dict[feb29_doy], '\n')
        else:
            print(no_feb29_possible_warning)
            
    # Now we proceed with the full climatology dictionary
    if show_debug: 
        print("---------------------------------------------------------------------------------------------------------")
        print("Part 5: Creating the complete dataset from the dictionary")
        print("---------------------------------------------------------------------------------------------------------\n")

    # We create the correct coordinates from our dictionary for our final dataset
    doy_coords = np.array(sorted(seas_dict.keys())) # array for full year (1 to 366, if leap)
    if show_debug: print("Dictionary Keys of All Calculated Unique Day of the Year Datasets\n", 
                         "(Should include all values from 1 to 366):\n", doy_coords, '\n')

    # We stack the resulting dictionary datasets while maintaining the correct order
    seas_list = [seas_dict[doy] for doy in doy_coords]
    seas_year = xr.concat(seas_list, dim='normalized_doy')
    seas_year = seas_year.assign_coords(normalized_doy=('normalized_doy', doy_coords))
    seas_year = seas_year.chunk(optimal_chunks)
    
    # Additional chunking of the resulting doy mean/percentile dataset to prevent crashes during saving.
    max_doy_chunking_val = 61 # Can be reduced for more stability (or increased to allow larger chunk size values)

    if doy_batch_size <= max_doy_chunking_val: # For small datasets, it's fine to chunk by the total data available
        seas_year = seas_year.chunk({'normalized_doy': doy_batch_size})
        
    else: # For larger datasets with means/percentiles for many doys, we find the best, largest value to chunk by (that produces no remainders)
        best_chunk_val = max_doy_chunking_val # Default; may lead to non-perfect chunking if no best (largest) divisor (under the max allowed) is found
        
        for divisor in range(max_doy_chunking_val, 0, -1): # Iterate from the maximum allowed divisor down to 1 to determine the best chunk size value
            if doy_batch_size % divisor == 0: # The current largest divisor is found to leave no remainder
                if divisor != 1: # If we find a divisor that is not 1 (the smallest possible value), we set that as the best chunk size to use
                    best_chunk_val = divisor
                break # End the best chunk-size (divisor) calculator for loop

        # Lastly, apply the best largest chunk size determined
        seas_year = seas_year.chunk({'normalized_doy': best_chunk_val})

    print("\nFinal Climatology Dataset:\n", seas_year, '\n')

    
    if show_debug: 
        print("---------------------------------------------------------------------------------------------------------")
        print(f"Part 5: Saving the output dataset!")
        print("---------------------------------------------------------------------------------------------------------\n")
        
    # Coordinate values for batch saving (not full climatology dataset saving)
    coord_values = seas_year['normalized_doy'].values

    # We are saving the dataset in batches
    if batch_saving:
        if show_debug: 
            print(f"Proceeding with batch saving in batches of (at most) {doy_batch_size} doys between {start_chunking_doy} and {end_chunking_doy}!\n")
            
        for i in range(0, len(coord_values), doy_batch_size):
            # First, we gather the starting and ending values of the processed chunk
            start_val = coord_values[i]

            # Figure out when to stop downloading data (based on the set end_chunking_doy value)
            if start_val > end_chunking_doy:
                continue
            
            # Grab the end index and value
            end_idx = min(i + doy_batch_size, len(coord_values))
            end_val = coord_values[end_idx - 1]

             # Quick check to see where to begin downloading a batch from...
            if start_val < start_chunking_doy: # We skip any batches where the starting doy is earlier than the start_chunking_doy.
                # However, if the start_chunking_doy value is within the current batch, we allow it to be the first to be saved.
                if start_val <= start_chunking_doy <= end_val:
                    pass 
                else: # If the batch does not contain the start_chunking_doy and only contains doys before it, then we skip saving it!
                    continue 

            # Save the dataset (subsetting occurs in the function)              
            save_dataset_to_storage(show_debug_arg=show_debug, single_download_arg=single_download,
                                    current_percentile=chosen_percentile, 
                                    start_val_arg=start_val, end_val_arg=end_val, 
                                    ds_to_save=seas_year) 
            
            # If we only want to download a single dataset:
            if single_download:     
                stop_monitoring = True # stop memory monitoring
                raise ValueError("Single dataset file-saving finished. Please enter a new desired chunk starting value to begin from.")
            
    # We are saving the full dataset
    else:   
        if show_debug: 
            print("Proceeding with saving the desired doy interval data all at once!\n")
            
        save_dataset_to_storage(show_debug_arg=show_debug, single_download_arg=single_download,
                                current_percentile=chosen_percentile, start_val_arg=start_chunking_doy, 
                                end_val_arg=end_chunking_doy, ds_to_save=seas_year) 

        
    stop_monitoring = True # Reset the monitoring before the next loop
    
# ---------------------------------------------------------------------------------------------------------------------------------
# Calling the main function to calculate and store the 90th percentiles and climatological means!
#
# Both calls share the same inputs (full temperature data, baseline slice, chunking, an 11-day window via
# window_half_width=5, and the full 1-366 doy range) and differ only in chosen_percentile:
#   - chosen_percentile=None       -> the function computes climatological means.
#   - chosen_percentile=percentile -> the function computes percentile thresholds for that percentile
#                                     (`percentile` is defined elsewhere in the notebook; presumably 90 -- confirm).
# doy_batch_size=None means the requested doy interval is saved all at once rather than in batches.
# NOTE: show_debug=True prints the step-by-step preview; the function's own debug banner advises setting
#       show_debug to False once you are satisfied, in order to compute and save the results.

# Call 1: climatological means
calculate_temp_thresh_or_clim_given_a_percentile(temp_data=temps_full, baseline_slice=baseline_period_slice_choice,
                                                 optimal_chunks=optimal_chunking, window_half_width=5, 
                                                 minutes_per_memory_update=minutes_per_mem_update, chosen_percentile=None, 
                                                 start_chunking_doy=1, end_chunking_doy=366, doy_batch_size=None,
                                                 show_debug=True, perform_memory_monitoring=True, single_download=False)

# Call 2: percentile thresholds
calculate_temp_thresh_or_clim_given_a_percentile(temp_data=temps_full, baseline_slice=baseline_period_slice_choice,
                                                 optimal_chunks=optimal_chunking, window_half_width=5,
                                                 minutes_per_memory_update=minutes_per_mem_update, chosen_percentile=percentile,
                                                 start_chunking_doy=1, end_chunking_doy=366, doy_batch_size=None,
                                                 show_debug=True, perform_memory_monitoring=True, single_download=False)

print("We have finished saving all desired doy datasets completely!")
# Module-level flag that the function above also sets through `global stop_monitoring`; it is set to True here
# once more after both runs (presumably the memory-monitoring thread polls it to know when to stop -- confirm).
stop_monitoring = True

<div style="color:#008B00; padding: 10px; text-align: center; font-family: Georgia, serif; font-weight: bold; white-space: pre;">ø¤º°`°º¤ø,¸,ø¤°º¤ø,¸¸,ø¤º°`°º¤ø,¸  FILE VALIDATION, VERIFICATION, AND ANIMATIONS  °º¤ø,¸¸,ø¤º°`°º¤ø,¸,ø¤°º¤ø,¸¸,ø¤
</div>

In [None]:
''' ---------------------------------------------------------------------------------------------------------- '''
''' Function to make an animation of the calculated climatological mean/percentile latitude and longitude maps '''
''' ---------------------------------------------------------------------------------------------------------- '''

'''
NOTES
- Make sure that your climatological mean/percentile threshold dataset folders contain unique (no repeating) datasets that encapsulate
 the full 1-366 days of the year (doy). Making animations with some doys missing has not been tested yet, but this code may be tweaked to suit
 your needs. If need be, you can edit the code after the "## ***" line to access specific files, as that code section determines the files to be read.
 
- By default, animations are saved in the current working directory (usually the folder containing this script). Pass custom_output_filename to save them elsewhere.
''';

def check_processed_datasets_with_an_animation(baseline_name_arg, region_id_arg, custom_id_arg,
                                               custom_output_filename=None,
                                               chosen_percentile=None):
    '''
    Build and save a GIF that steps through the stored climatological-mean or percentile-threshold
    maps, one frame per available day of the year (doy).

    The stored Zarr datasets are located with a glob pattern built from the script-wide constants
    my_root_directory, clim_data_folder_name, and perc_data_folder_name (defined elsewhere in this
    script) together with the arguments below. Each dataset is expected to hold an `sst` variable
    with `lat`, `lon`, and `normalized_doy` coordinates.

    Arguments:
        baseline_name_arg      : Baseline folder name; also the suffix of the dataset filenames.
        region_id_arg          : Region folder name; also the prefix of the dataset filenames.
        custom_id_arg          : Custom identifier placed between the region prefix and "sst_" in the filenames.
        custom_output_filename : Output filename for the GIF. If None, a name is built from the region,
                                 data type, baseline, and number of doys animated.
        chosen_percentile      : None animates the climatological means; a positive number animates the
                                 percentile thresholds stored for that percentile.

    Returns:
        The matplotlib FuncAnimation object (its figure has already been closed).

    Raises:
        ValueError        : chosen_percentile is given but is not greater than zero.
        FileNotFoundError : no stored datasets match the constructed glob pattern.
    '''

    ## Check proper arguments are provided for threshold datasets
    show_climatological_means = chosen_percentile is None

    if show_climatological_means:
        data_type = "Means"
    else:
        if chosen_percentile <= 0:
            raise ValueError("Please set a positive, non-zero numeric percentile (based on the percentile you used above in your percentile datasets)!")
        data_type = f"{chosen_percentile}th_Percentiles"


    ## ***
    ## Gather the stored dataset filepaths
    data_folder_name = clim_data_folder_name if show_climatological_means else perc_data_folder_name
    data_directory = f"{my_root_directory}/{data_folder_name}/{baseline_name_arg}/{region_id_arg}"
    glob_path = f"{data_directory}/{region_id_arg}_{custom_id_arg}sst_*_{baseline_name_arg}.zarr"

    # Gather file paths from the glob pattern; sorted so that the doy -> file mapping below does not
    # depend on the (arbitrary) order in which the filesystem lists the files
    paths = sorted(glob.glob(glob_path))

    # A quick check to ensure we have located files given our arguments
    if not paths:
        start_error = "No files found matching the pattern"
        cont_error = "\nPlease verify you inputted the proper baseline_name_arg, region_id_arg, custom_id_arg, and chosen_percentile arguments!"
        raise FileNotFoundError(f"{start_error}:\n{glob_path}\n{cont_error}")


    ## Fill a dictionary where all (1 to 366) doys are matched with their corresponding filepaths
    doys_dict = {}

    for filepath in paths:
        # Open the dataset itself in a `with` block so that the store that was opened is the one closed
        with xr.open_zarr(filepath) as file_ds:
            # `.values` of a scalar coordinate is a 0-d array (not iterable, and not a "scalar" for
            # np.isscalar), so we promote it to 1-d to handle single-doy and multi-doy files alike
            file_doys = np.atleast_1d(file_ds.sst['normalized_doy'].values)

        # Map each doy to its file (if a doy appears in several files, the last file listed wins)
        for doy in file_doys:
            doys_dict[int(doy)] = filepath


    ## Use a file and its features to set up the plot
    available_doys = sorted(doys_dict)
    setup_file = doys_dict[available_doys[0]] # The file that holds the first available doy

    with xr.open_zarr(setup_file) as setup_file_ds:
        setup_ds = setup_file_ds.sst

        if not show_climatological_means:
            setup_ds = setup_ds.drop_vars("quantile")

        # Quick fix for my personal, early datasets (should be unnecessary for you; you could name your variables differently too, as I did here)
        if 'doy' in setup_ds.coords:
            setup_ds = setup_ds.rename({'doy': 'normalized_doy'}).expand_dims('normalized_doy')

        lon = setup_ds.lon.values
        lat = setup_ds.lat.values

        # Check for single or multiple doys in the setup dataset, and load the data for just one (the first) doy.
        # A size of 1 covers both a scalar (0-d) coordinate and a length-1 dimension.
        if setup_ds['normalized_doy'].values.size == 1:
            # Single day file
            setup_data = setup_ds.drop_vars("normalized_doy").squeeze().values
        else:
            # Multi-day file
            setup_data = setup_ds.isel(normalized_doy=0).values


    ## Initialize the plot
    fig, ax = plt.subplots(figsize=(14, 6),
                           subplot_kw={'projection': ccrs.Mercator()})

    pcm = ax.pcolormesh(
        lon, lat, setup_data,
        cmap='RdYlBu_r',
        vmin=-5, vmax=35,
        transform=ccrs.PlateCarree(),
    )

    ax.set_extent([0, 360, -5, 90], crs=ccrs.PlateCarree())
    ax.add_feature(cfeature.LAND, color='lightgray')
    ax.add_feature(cfeature.COASTLINE, linewidth=0.8)
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')

    title = ax.set_title('')
    title_base = '(Relative to 1993-2022)' if baseline_name_arg == 'Baseline9322' else f'({baseline_name_arg})'


    ## Animation function
    def animate(i):
        # Frame i shows the i-th available doy, read from whichever file holds it
        doy = available_doys[i]

        with xr.open_zarr(doys_dict[doy]) as frame_ds:
            frame_da = frame_ds.sst

            if not show_climatological_means:
                frame_da = frame_da.drop_vars("quantile")

            # Check if this is a single-day or multi-day file
            doy_values = frame_da['normalized_doy'].values

            # Load the data if available for a single doy or select the correct doy in a dataset
            if doy_values.size == 1:
                frame_data = frame_da.values
            else:
                doy_idx = np.where(doy_values == doy)[0][0]
                frame_data = frame_da.isel(normalized_doy=doy_idx).values

        # Update the plot (the data were loaded into memory above, before the dataset was closed)
        pcm.set_array(frame_data.ravel())
        title.set_text(f'{data_type}: Day {doy} of the Year\n{title_base}')

        return pcm, title


    ## Create the resulting animation
    type_message = "climatological means" if show_climatological_means else "percentile thresholds"
    print(f"Began animation for the {type_message} of the {region_id_arg} datasets!")

    chosen_doys = len(available_doys)

    anim = animation.FuncAnimation(
        fig, animate,
        frames=chosen_doys,
        interval=200,
        blit=False,
        repeat=True
    )

    writer = animation.PillowWriter(fps=2)

    if custom_output_filename is None:
        output_filename = f"{region_id_arg}_{data_type}_{baseline_name_arg}_{chosen_doys}_doys_total.gif"
    else:
        output_filename = custom_output_filename
    print(f"Saving animation at: {output_filename}")


    ## Save the resulting animation
    def print_frame_progress(current_frame, total_frames): # This shows the saving progress (in frames)!
        print(f"\r → Doy (Frame) Processed: {current_frame + 1}/{total_frames}", end='', flush=True)

    anim.save(output_filename, writer=writer, dpi=100,
              progress_callback=print_frame_progress)

    # NOTE: this runs after the file has been written, so it does not change the saved animation.
    # It is kept in its original position on purpose: applying it before saving would compute the
    # layout while the title is still empty.
    plt.tight_layout()
    plt.close(fig)
    print("\nAnimation finished and saved!\n")

    return anim

# -----------------------------------------------------------------------------------------------------------------------------

# Note: passing a percentile as chosen_percentile produces an animation of the percentile thresholds, while
# leaving it as None (as done here) produces an animation of the climatological means.
# With custom_output_filename=None, the function builds the output GIF filename itself.
check_processed_datasets_with_an_animation(baseline_name_arg=baseline_folder_name, region_id_arg=region_id_folder_name, custom_id_arg=final_custom_id,
                                           custom_output_filename=None, chosen_percentile=None)

In [None]:
# Code to verify against the outputs of marineHeatWaves to come in the future...