In [None]:
import xarray as xr
import numpy as np
import pandas as pd

import glob
import psutil
import threading
import time
import os

import dask
from dask.diagnostics import ProgressBar

from datetime import datetime
from dateutil.relativedelta import relativedelta
from datetime import date

#from marineHeatWaves import marineHeatWaves

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.animation as animation

<div style="background-color: #FFE099; padding: 10px; border: 3px solid #FFC233; text-align: center; font-family: Georgia, serif; font-weight: bold; white-space: pre;">||| -------------------------------------------- |||    NOTES    ||| -------------------------------------------- |||
</div>
<div style="background-color: #EFFAFA; border: 2px solid #A2E2E2; font-family: Georgia, serif; padding: 10px">
    <br>This is the script to <strong>process percentile thresholds</strong> and <strong>climatological means</strong> from the data downloaded in <strong>data_downloader_script.ipynb</strong>.
    <br>&#x27A1;&#xFE0E; You can duplicate this script and run its copies simultaneously to download more percentiles/means.
    <br>&#x27A1;&#xFE0E; If unusual errors appear after attempting to run a cell again, restart the kernel (and run the cell again afterward)!
    <br>&#x27A1;&#xFE0E; After saving percentiles/means, check your file directories and compare their file sizes to see which ones ought to be removed/redownloaded!
    <br>&#x27A1;&#xFE0E; Data directories that can be further modified by you may be identified by searching for "NOTE: POTENTIAL DIRECTORY TWEAKING HERE" in this script.
    <br><br>
</div>

<div style="color:#CD6600; padding: 10px; text-align: center; font-family: Georgia, serif; font-weight: bold; white-space: pre;">°º¤ø,¸¸,ø¤º°`°º¤ø,¸,ø¤°º¤ø,¸¸,ø¤º°`°º¤ø,¸    LOADING FULL OBSERVED GLOBAL DATASETS    °º¤ø,¸¸,ø¤º°`°º¤ø,¸,ø¤°º¤ø,¸¸,ø¤º°`°º¤ø,¸
</div>

In [None]:
## --- IMPORTANT SCRIPT-WIDE CONSTANTS ------------------------------------------------------------------------------------
## Baseline set up:
# Running the main function for the 1993–2022 baseline period (a 30 year baseline)
baseline_choice = "Baseline9322" # identifier; used as a sub-folder name and as the suffix of every saved file name
baseline_period_slice_choice = slice('1993-01-01', '2022-12-31') # for slicing time; bounds must be 'YYYY-MM-DD' strings

## Folder identifiers
folder_name_choice = "Full" # A "regional" identifier used as the file-name prefix of the saved datasets ("{folder_name_choice}_..._sst_... .zarr")
custom_id = "fgd" # A custom identifier inserted after the prefix to help further identify the saved data; can be left as ""
my_root_directory = "" # Should be your root directory, from which you access data from and save data to

# Doys (days of the year) set up
starting_day_of_the_year = 1 # Your starting doy point for processing/saving the thresholds/means (can be 1-366)
# Your ending doy point (can be 1-366). NOTE: the main function also uses this value as the size of each
# saved doy batch; 366 means the selected range is saved as a single dataset.
ending_day_of_the_year = 366

# Misc
minutes_choice = 10 # minutes (roughly) per memory update (to keep track of its use and avoid crashing/issues)

In [None]:
# --- Full dataset loading (from my raw data) --------------------------------------------------------------------------
# Your dataset directory here; use a * to collect ALL the applicable datasets for a given naming set up (using glob).
# NOTE: POTENTIAL DIRECTORY TWEAKING HERE
raw_data_directory = f'{my_root_directory}/OISST/Data/sst.day.mean.*.nc'

# Your data's maximum bounds
# NOTE(review): label-based slices like these only select data when the coordinate is stored in the same
# (ascending) order, and the longitude slice presumes a 0-360 convention; confirm against your files.
lat_bounds = slice(-15, 90)
lon_bounds = slice(0, 360)

# Load the datasets as one (lazily; nothing is read into memory until it is computed/saved)
ds = xr.open_mfdataset(
    raw_data_directory,         # Glob pattern (the * grabs all datasets)
    parallel=True,              # Enable parallel file opening 
    chunks='auto',              # Let dask choose optimal chunking 
    combine='by_coords',        # Merge based on coordinate values
    engine='netcdf4')           # Specify the engine (may crash without this; restart the kernel if it happens)

# Keep only the 'sst' variable within the chosen latitude/longitude bounds
full_ds = ds.sel(lat=lat_bounds, lon=lon_bounds).sst
print("Full raw data dataset:\n", full_ds, '\n')

In [None]:
# Rechunk the raw data dataset
# Only 'lat' and 'lon' are re-chunked here; this same dictionary is later passed to the main function
# (as optimal_chunks) and re-applied to the final thresholds/means dataset before saving.
optimal_chunking = {'lat': 210, 'lon': 160}
sst_full = full_ds.chunk(optimal_chunking)
print("Final (properly chunked) raw data dataset:\n", sst_full)

# NOTE: Check the printed dataset above and determine the optimal chunksize(s) for your data!

<div style="color:#104E8B; padding: 10px; text-align: center; font-family: Georgia, serif; font-weight: bold; white-space: pre;">°º¤ø,¸¸,ø¤º°`°º¤ø,¸,ø¤°º¤ø,¸¸,ø¤º°`°º¤ø,¸    CALCULATING CLIMATOLOGY    °º¤ø,¸¸,ø¤º°`°º¤ø,¸,ø¤°º¤ø,¸¸,ø¤º°`°º¤ø,¸</div>

Notes:
* My code does not include the padding for missing values found in Eric Oliver's marineHeatWaves code.
* Percentiles are calculated and saved exclusively for single unique days of the year.
* Means are calculated and saved for unique days of the year within custom batches.
* Saving larger/"complete" mean datasets for a desired region is recommended over saving smaller subsets for the region, at least time-wise.

In [None]:
## -------------------------------------------------------------------------------------------------------------------------------------------------------
''' ----------------------------------------------- '''
''' Function to keep track of and show memory usage '''
''' ----------------------------------------------- '''

stop_monitoring = True # We begin by NOT showing any memory usage

def monitor_memory(interval_minutes=5, log_file=None):
    """
    Periodically print (and optionally log) this process' memory usage until told to stop.

    Intended to run in a background thread. The loop keeps going for as long as the
    module-level flag `stop_monitoring` is False.

    Parameters
    ----------
    interval_minutes : int or float
        Minutes to wait between two memory reports.
    log_file : str or None
        Optional path of a text file; when given, each report is also appended to it
        with a timestamp.

    Returns
    -------
    None
    """
    interval = interval_minutes * 60
    poll_seconds = 0.25  # how often the stop flag is re-checked while waiting

    while not stop_monitoring:
        mem = psutil.Process(os.getpid()).memory_info().rss / (1024**3)  # in GB
        print(f" | Memory usage: {mem:.2f} GB | Memory: {psutil.virtual_memory().percent}% used | ")

        if log_file:
            with open(log_file, 'a') as f:
                f.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')}: {mem:.2f} GB\n")

        # Wait for the next report in short slices instead of one long sleep, so that the thread
        # exits shortly after stop_monitoring is set to True. With one long sleep, a stopped monitor
        # could still be asleep when the next run sets the flag back to False, leaving two monitor
        # threads printing at once.
        deadline = time.monotonic() + interval
        while not stop_monitoring:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            time.sleep(min(poll_seconds, remaining))

## -------------------------------------------------------------------------------------------------------------------------------------------------------
''' ------------------------------------------------------------------------------------------------ '''
''' Function to normalize the unique day of the year value of each observed day in the format: 1-366 '''
''' ------------------------------------------------------------------------------------------------ '''

def normalize_dayofyear(time_coord):
    """
    Map every timestamp to a day-of-year value on a fixed 366-day calendar.

    On the returned scale March 1 is always day 61: dates of non-leap years that fall on
    or after March 1 (raw day-of-year 60 and above) are moved forward by one day, so day 60
    (February 29) only occurs in leap years.

    Parameters
    ----------
    time_coord :
        Time coordinate exposing the `.dt.dayofyear` and `.dt.is_leap_year` accessors.

    Returns
    -------
    The normalized day-of-year values, as produced by `xr.where`.
    """
    raw_doy = time_coord.dt.dayofyear
    leap_year_mask = time_coord.dt.is_leap_year

    # Only non-leap-year dates from March 1 onward (raw doy 60+) need to be pushed forward by a day;
    # leap years and January 1 - February 28 already sit on the 366-day scale.
    needs_shift = (~leap_year_mask) & (raw_doy >= 60)

    return xr.where(needs_shift, raw_doy + 1, raw_doy)

## -------------------------------------------------------------------------------------------------------------------------------------------------------
''' ------------------------------------------------------------------------------- '''
''' Function to check whether the inputted time period is at least roughly 30 years '''
''' ------------------------------------------------------------------------------- '''

def rough_30_year_period_check(time_slice, tolerance=0.01):
    """
    Estimate the length (in years) of a time slice and test whether it is roughly 30 years.

    Parameters
    ----------
    time_slice : slice
        Slice whose `start` and `stop` are date strings in the 'YYYY-MM-DD' format.
    tolerance : float
        Largest accepted absolute difference (in years) between the slice length and 30.

    Returns
    -------
    tuple
        (length of the slice in years, True if that length is within `tolerance` of 30).
    """
    date_format = '%Y-%m-%d'
    period_start = datetime.strptime(time_slice.start, date_format)
    period_end = datetime.strptime(time_slice.stop, date_format)

    # Calendar-aware difference, folded into fractional years
    span = relativedelta(period_end, period_start)
    total_years = span.years + span.months/12 + span.days/365.25

    is_roughly_30_years = abs(total_years - 30) <= tolerance
    return total_years, is_roughly_30_years

## -------------------------------------------------------------------------------------------------------------------------------------------------------
''' ------------------------------------------------------------------------------------- '''
''' Function to just save the percentile threshold/climatological mean dataset to storage '''
''' ------------------------------------------------------------------------------------- '''
def save_dataset_to_storage(folder_name_arg, baseline_name_arg, custom_file_id_arg,
                            show_debug_arg, single_download_arg, 
                            current_percentile=None,
                            start_val_arg=None, end_val_arg=None, 
                            ds_to_save=None):
    """
    Subset a thresholds/means dataset to a doy range and write it to a zarr store.

    Parameters
    ----------
    folder_name_arg : str
        Prefix of the saved file name.
    baseline_name_arg : str
        Baseline identifier; used as a sub-folder name and as the suffix of the file name.
    custom_file_id_arg : str
        Optional extra identifier placed after the prefix ("" to omit it).
    show_debug_arg : bool
        If True, nothing is written: the subset and target filepath are printed and a
        ValueError is raised to end the debug run.
    single_download_arg : bool
        If True, memory monitoring is switched off and a ValueError is raised right after
        the first successful save, so that only one dataset is written per run.
    current_percentile : number or None
        None saves under the climatological-means directory; otherwise the value selects
        the "Thresh{percentile}th" directory.
    start_val_arg, end_val_arg : int
        First and last 'normalized_doy' values (inclusive) of the subset to save. The
        combination 1 and 366 saves `ds_to_save` without subsetting.
    ds_to_save :
        Dataset/array with a 'normalized_doy' coordinate, exposing `.sel` and `.to_zarr`.

    Raises
    ------
    ValueError
        If `ds_to_save` is missing, at the end of a debug run, or after a single-download save.
    """
    # Without this declaration the assignment in the single-download branch below would only create a
    # local variable and the background memory monitor would never be told to stop.
    global stop_monitoring

    if ds_to_save is None:
        raise ValueError("No dataset was provided to save (ds_to_save is None).")

    # We check if we are saving a climatology dataset or percentile one after subsetting the datasets appropriately
    print("Starting doy for current subset: ", start_val_arg)
    print("Ending doy for current subset: ", end_val_arg, '\n')

    # Set up a filepath/file identifier, if one is provided
    file_id = f"{custom_file_id_arg}_" if custom_file_id_arg != "" else ""

    # The full 1-366 range needs no subsetting
    if start_val_arg == 1 and end_val_arg == 366:
        final_ds_to_save = ds_to_save
    else:
        final_ds_to_save = ds_to_save.sel({'normalized_doy': slice(start_val_arg, end_val_arg)})

    # Here, we load the filepath destinations for the climatology datasets, the percentiles or means
    # This runs for a climatological means dataset; NOTE: POTENTIAL DIRECTORY TWEAKING HERE
    if current_percentile is None:
        print(f"Current climatological means subset to save:\n{final_ds_to_save}\n")
        file_name = f"{folder_name_arg}_{file_id}sst_clim_subset_{start_val_arg}_to_{end_val_arg}_{baseline_name_arg}.zarr"
        filepath = f'{my_root_directory}/OISST/Clim/{baseline_name_arg}/{file_name}'
    # This runs for a percentile dataset; NOTE: POTENTIAL DIRECTORY TWEAKING HERE
    else:
        print(f"Current {current_percentile}th percentile subset to save:\n{final_ds_to_save}\n")
        file_name = f"{folder_name_arg}_{file_id}sst_thresh_subset_{start_val_arg}_to_{end_val_arg}_{baseline_name_arg}.zarr"
        filepath = f'{my_root_directory}/OISST/Thresh{current_percentile}th/{baseline_name_arg}/{file_name}'
    print("Filepath of subset: ", filepath)

    # Lastly, we either finish by saving or stop if we are at the end of the debug
    if show_debug_arg:
        raise ValueError('End of debug. Proceed with the setting "show_debug = False" to start saving the percentile thresholds.')

    # mode='w' overwrites any existing store at this filepath
    with ProgressBar():
        final_ds_to_save.to_zarr(filepath, mode='w', consolidated=True)

    print(f"Saved subset: doys {start_val_arg} to {end_val_arg}\nMoving on!\n")
    print("---------------------------------------------------------------------------------------------------------")

    # Optional: for single downloads, we stop after the first saved dataset
    if single_download_arg:
        stop_monitoring = True
        raise ValueError("Single dataset file-saving finished. Please enter a new desired chunk starting value to begin from.")
            
## -------------------------------------------------------------------------------------------------------------------------------------------------------
''' ----------------------------------------------------------------- '''
''' Function to actually calculate the percentile threshold dataset to storage '''
''' ----------------------------------------------------------------- '''

# Function to calculate the percentile thresholds or climatological means for each day of the year (doy)
def calculate_sst_thresh_or_clim_given_a_percentile(sst_data, baseline_slice,
                                                    folder_name, baseline_name, 
                                                    optimal_chunks, custom_file_id="", window_half_width=5,
                                                    minutes_per_memory_update=5, percentile=None, 
                                                    start_chunking_doy=1, end_chunking_doy=366,
                                                    show_debug=True, single_download=False):     
    """
    Calculate and save day-of-year (doy) climatological means or percentile thresholds.

    For every doy found in the baseline period (on a normalized 1-366 scale), all baseline
    observations whose doy lies within `window_half_width` days of it are pooled, and either
    their mean or the requested percentile is taken over time. February 29 (doy 60) is not
    calculated directly: it is set to the average of the February 28 and March 1 results.
    The result is handed to `save_dataset_to_storage`, either in one go or in doy batches.

    Parameters
    ----------
    sst_data :
        Observations with a 'time' dimension (presumably the chunked xarray DataArray built
        earlier in this notebook).
    baseline_slice : slice
        Baseline period, with 'YYYY-MM-DD' strings as start/stop; must span roughly 30 years.
    folder_name, baseline_name, custom_file_id : str
        Identifiers passed through to `save_dataset_to_storage` for the output file path.
    optimal_chunks : dict
        Chunk sizes applied to the final dataset before saving.
    window_half_width : int
        Number of days before and after each doy that are pooled with it.
    minutes_per_memory_update : int or float
        Minutes between memory-usage reports while saving (not used in debug mode).
    percentile : number or None
        None calculates climatological means. Otherwise a percentile on the 0-100 scale
        (e.g. 90); it is divided by 100 before being used as a quantile.
    start_chunking_doy : int
        First doy (>= 1) to save.
    end_chunking_doy : int
        Last doy (<= 366) to save. This value is ALSO used as the size of each saved doy
        batch; 366 means the selected range is saved as a single dataset.
    show_debug : bool
        If True, intermediate steps are printed and the run ends with a ValueError raised by
        `save_dataset_to_storage` instead of writing anything.
    single_download : bool
        If True, the run ends (with a ValueError raised by `save_dataset_to_storage`) after
        the first saved dataset.

    Raises
    ------
    ValueError
        For invalid doy bounds, a percentile outside 0-100, a baseline that does not span
        roughly 30 years, and at the end of debug/single-download runs.
    """
    # Declared up front so that every assignment below targets the module-level flag read by monitor_memory
    global stop_monitoring

    divider = "---------------------------------------------------------------------------------------------------------"

    if show_debug:
        debug_message_1 = "You have set show_debug to true; this will show how the percentiles/means are processed based on your inputted arguments \nand provide a preview of the output.\n"
        debug_message_2 = "\nIf you are satisfied with the output (and your arguments), compute and save the calculated percentiles/means by setting\nshow_debug to false.\n"
        print(debug_message_1, debug_message_2)

        print(divider)
        print("Part 0: Running a few quick error checks for the provided arguments!")
        print(divider + "\n")

    # Checking start and end bounds
    if start_chunking_doy < 1 or end_chunking_doy > 366:
        raise ValueError("Please provide a start_chunking_doy that is ≥ 1 and an end_chunking_doy that is ≤ 366.")

    if end_chunking_doy < start_chunking_doy:
        raise ValueError("Please make sure your end_chunking_doy is greater than your start_chunking_doy; these are your dataset processing bounds.")

    # Running a rough time check for the baseline provided
    total_time, is_time_slice_30_years = rough_30_year_period_check(baseline_slice)

    if not is_time_slice_30_years:
        error_message = "Please check that your chosen baseline time slice covers a 30 year period."
        raise ValueError(f"{error_message}.\n            The chosen slice covers roughly {total_time} years.")

    # Establishing if we are calculating percentiles or means
    calculate_means = percentile is None

    # The percentile is given on the 0-100 scale (it is divided by 100 further below)
    if not calculate_means and not (0 <= percentile <= 100):
        raise ValueError("Please provide a percentile between 0 and 100 (e.g., 90 for the 90th percentile), or None to calculate climatological means.")

    # The ending doy doubles as the doy batch size. It cannot be 0 here: the bound checks above
    # already guarantee end_chunking_doy >= start_chunking_doy >= 1.
    chunk_size = end_chunking_doy

    if show_debug: 
        print("All clear!\n")
        print(divider)
        print("Part 1: Assign normalized unique day of the year (doy) values to the sliced observation dataset")
        print(divider + "\n")

    print(f"Chosen baseline slice: {baseline_slice}")
    print("Note: the chosen baseline period has been identified as roughly covering a 30 year period. Do ensure this is the case separately.\n")
    print(f"Chosen window half-width: {window_half_width}")
    print(f"(This means we use {window_half_width} days before and after each day of the year for each doy in our climatology/threshold.)", '\n')

    if calculate_means:
        print(f"The selected doy chunk size for climatological mean dataset batches is {chunk_size} doys.\n")
        # if we are downloading only a single dataset, we adjust the upper/maximum limit to the data calculated/saved accordingly
        end_chunk_val = 366 if not single_download else (start_chunking_doy + chunk_size - 1)

        print(f"Calculating means in batches of (at most) {chunk_size} doys between {start_chunking_doy} and {end_chunk_val}.\n")
    else:
        percentile_used = percentile/100
        print(f"Final percentile used (in calculations): {percentile_used} ({percentile}th percentile)", '\n')
        print(f"Calculating thresholds individually for doys between {start_chunking_doy} and {end_chunking_doy}.\n")

    # Doy values for specific dates on the normalized 366-day scale (for later)
    feb28_doy = 59
    feb29_doy = 60
    mar1_doy = 61

    ## Extracting baseline period data
    sst_baseline = sst_data.sel(time=baseline_slice)
    if show_debug: print("Original SST Baseline Period Data: ", '\n', sst_baseline, '\n')

    # Assigning normalized doy values to the baseline period dataset
    sst_norm = sst_baseline.assign_coords(
        normalized_doy=('time', normalize_dayofyear(sst_baseline.time).data))
    if show_debug: print("SST with Normalized Doy: ", '\n', sst_norm, '\n')

    '''
    # Totally optional debug option here: show ALL normalized day of the year (doy) values;
    # all years are in the 366-day format, with some missing day 60 (feb 29)
    with np.printoptions(threshold=np.inf):
        print(sst_norm.normalized_doy.values) 
    '''

    if show_debug: 
        print(divider)
        print("Part 2: Get the actual doy values of the baseline period data (should be 1 - 366)")
        print(divider + "\n")
    unique_doys = np.unique(sst_norm.normalized_doy.data)
    unique_doys = unique_doys[~np.isnan(unique_doys)]  # Remove any NaN values
    unique_doys = unique_doys.astype(int)  # Ensure integer day-of-year values
    if show_debug: print(f"Found {len(unique_doys)} unique day-of-year values!")
    if show_debug: print("Unique doys:", '\n', unique_doys, '\n')

    if show_debug: 
        choice_message = "climatological mean" if calculate_means else "percentile threshold"
        print(divider)
        print(f"Part 3: Calculate the desired {choice_message} data for the desired doy(s).")
        print(divider + "\n")
        stop_monitoring = True # We don't want to start showing memory use.
    else:
        # We start monitoring here so that it only runs once
        stop_monitoring = False # We do want to start showing memory use.
        monitor_thread = threading.Thread(target=monitor_memory, kwargs={'interval_minutes': minutes_per_memory_update})
        monitor_thread.daemon = True
        monitor_thread.start()

    # Everything from here on runs inside try/finally so that the memory monitor is always switched off,
    # including when the run ends through an exception (single-download stop, a failed save, an interrupt...).
    try:
        # Error messages for later
        no_feb29_possible_warning = "WARNING: Cannot interpolate Feb 29; missing Feb 28 or Mar 1 data!\n"

        # Initialize a dictionary for the climatological means or percentile thresholds
        seas_dict = {}

        # Bool for debug purposes (the per-doy details are only printed for the first processed doy)
        shown_once = False

        # Loop for doys 1 - 366 (excluding Feb 29, doy 60)
        for doy in unique_doys:
            # We skip February 29th (to interpolate later)
            if doy == feb29_doy:  
                continue # Note: doy 60 data is still used within the appropriate window_data when available

            # Create window around this DOY
            window_doys = []

            for w in range(-window_half_width, window_half_width + 1):
                target_doy = doy + w

                if show_debug and not shown_once: 
                    print("Day of the year: ", doy, "| Target Window Index: ", w, "| Target Window Value: ", target_doy)

                # Handle year wraparound properly
                if target_doy < 1:
                    target_doy += 366
                elif target_doy > 366:
                    target_doy -= 366

                # Handle year boundaries by keeping only valid doys
                if target_doy in unique_doys:
                    window_doys.append(target_doy)

                if show_debug and not shown_once: print("Window Doys: ", window_doys, '\n')

            # Now, we select the data for this window
            window_data = sst_norm.where(sst_norm.normalized_doy.isin(window_doys), drop=True)

            '''
            ### Feature to be added: the ability to tweak the data prior to any percentile calculations in a manner like so:
            window_data = window_data.sel(latitude=slice(-3, 0))
            '''

            if show_debug and not shown_once: print(f"Final window data for doy {doy} from the baseline period dataset: ", '\n', window_data, '\n')

            # Now, we calculate the percentile threshold/climatological mean across the time dimension
            if window_data.time.size > 0:
                # We calculate the climatological mean if that is what is desired
                if calculate_means:
                    doy_to_save = window_data.mean(dim = 'time', skipna = True).expand_dims(normalized_doy=[doy])
                # Otherwise, we calculate the percentile for a doy (time is put in a single chunk first)
                else:
                    doy_to_save = window_data.chunk({'time':-1}).quantile(percentile_used, dim='time', skipna=True).expand_dims(normalized_doy=[doy])
                seas_dict[doy] = doy_to_save

                if show_debug and not shown_once:
                    message_type = "climatological means" if calculate_means else "percentile thresholds"
                    print(divider)
                    print(f"Part 4: Store the {message_type} across all doys in an empty dictionary!")
                    print(divider + "\n")

                    first_part = f"Dictionary presently updated for doy {doy} with the time-averaged final window dataset in Part 3."
                    print(f"{first_part}\n\nDictionary entry:\n", seas_dict[doy], '\n')
                    shown_once = True  

        # After the for loop over the 1-366 day of the year range, we handle February 29th using linear interpolation
        if (feb29_doy in unique_doys):
            # If we have a dictionary with our percentiles/climatologies...
            if feb28_doy in seas_dict and mar1_doy in seas_dict:
                feb_28_ds = seas_dict[feb28_doy].squeeze().drop_vars('normalized_doy')
                mar_1_ds = seas_dict[mar1_doy].squeeze().drop_vars('normalized_doy')
                seas_dict[feb29_doy] = 0.5 * (feb_28_ds + mar_1_ds)
                seas_dict[feb29_doy] = seas_dict[feb29_doy].expand_dims(normalized_doy=[feb29_doy])
                if show_debug: print("Interpolated February 29 dataset (doy 60) in the dictionary:\n", seas_dict[feb29_doy], '\n')
            else:
                print(no_feb29_possible_warning)

        # Now we proceed with the full climatology dictionary
        if show_debug: 
            print(divider)
            print("Part 5: Creating the complete climatology dataset from the dictionary")
            print(divider + "\n")

        # We create the correct coordinates from our dictionary for our final dataset
        doy_coords = np.array(sorted(seas_dict.keys())) # array for full year (1 to 366, if leap)
        if show_debug: print("Dictionary Keys of Registered Unique Day of the Year (doy) Climatological Mean Datasets\n", 
                             "(Should include all values from 1 to 366):\n", doy_coords, '\n')

        # We stack the resulting dictionary datasets while maintaining the correct order
        seas_list = [seas_dict[doy] for doy in doy_coords]
        seas_year = xr.concat(seas_list, dim='normalized_doy')
        seas_year = seas_year.assign_coords(normalized_doy=('normalized_doy', doy_coords))
        seas_year = seas_year.chunk(optimal_chunks)

        # Additional chunking that prevents crashing (can be lowered for more stability)
        if chunk_size <= 61:
            seas_year = seas_year.chunk({'normalized_doy': chunk_size})
        else:
            seas_year = seas_year.chunk({'normalized_doy': 61})

        print("\nFinal Climatology Dataset:\n", seas_year, '\n')

        # We check if we are saving this datasets fully or in batches; a chunk_size of 366 implies the full dataset is being saved (no batch saving)
        batch_saving = chunk_size != 366

        if show_debug: 
            saving_choice_message = "in one go" if not batch_saving else f"via batches of {chunk_size} doys"
            print(divider)
            print(f"Part 6: Saving the climatology dataset {saving_choice_message}!")
            print(divider + "\n")

        # Coordinate values for batch saving (not full climatology dataset saving)
        coord_values = seas_year['normalized_doy'].values

        # We are saving the dataset in batches
        if batch_saving:
            for i in range(0, len(coord_values), chunk_size):
                # First, we gather the starting and ending values of the processed chunk
                start_val = coord_values[i]

                # Quick check to see where to begin downloading a batch from...
                if start_val < start_chunking_doy:
                    continue

                # Grab the end index and value
                end_idx = min(i + chunk_size, len(coord_values))
                end_val = coord_values[end_idx - 1]

                # Save the dataset (subsetting occurs in the function)              
                save_dataset_to_storage(folder_name_arg=folder_name, baseline_name_arg=baseline_name, custom_file_id_arg=custom_file_id,
                                        show_debug_arg=show_debug, single_download_arg=single_download,
                                        current_percentile=percentile, start_val_arg=start_val, 
                                        end_val_arg=end_val, ds_to_save=seas_year) 

        # We are saving the full dataset
        else:                      
            save_dataset_to_storage(folder_name_arg=folder_name, baseline_name_arg=baseline_name, custom_file_id_arg=custom_file_id,
                                    show_debug_arg=show_debug, single_download_arg=single_download,
                                    current_percentile=percentile, start_val_arg=start_chunking_doy, 
                                    end_val_arg=end_chunking_doy, ds_to_save=seas_year) 
    finally:
        stop_monitoring = True # reset the monitoring before the next run, whether or not this one succeeded
    
# ---------------------------------------------------------------------------------------------------------------------------------

# Calling the function to calculate the climatological datasets
# Remember to update their arguments if you changed their names above!

'''
Argument explanations:
* percentile=None  -  if percentile is set to None, the function processes CLIMATOLOGICAL MEANS
* percentile=##  -  if a numeric percentile on the 0-100 scale is set (e.g., 90 for the 90th percentile; it is divided by 100 internally), it processes the CLIMATOLOGICAL PERCENTILES (with that percentile) 
* optimal_chunking  -  this was set up earlier when loading the observed data!  
''';

# First call: climatological means (percentile=None).
# NOTE: with show_debug=True, the saving function raises a ValueError ("End of debug...") instead of writing
# anything, so with the settings above this cell stops inside this first call. The statements below it only
# run once show_debug is set to False.
calculate_sst_thresh_or_clim_given_a_percentile(sst_data=sst_full, baseline_slice=baseline_period_slice_choice,
                                                folder_name=folder_name_choice, baseline_name=baseline_choice,
                                                optimal_chunks=optimal_chunking, custom_file_id=custom_id, window_half_width=5, 
                                                minutes_per_memory_update=minutes_choice, percentile=None, 
                                                start_chunking_doy=starting_day_of_the_year, end_chunking_doy=ending_day_of_the_year,
                                                show_debug=True, single_download=False)

# Second call: 90th percentile thresholds (the percentile is given on the 0-100 scale).
calculate_sst_thresh_or_clim_given_a_percentile(sst_data=sst_full, baseline_slice=baseline_period_slice_choice,
                                                folder_name=folder_name_choice, baseline_name=baseline_choice,
                                                optimal_chunks=optimal_chunking, custom_file_id=custom_id, window_half_width=5, 
                                                minutes_per_memory_update=minutes_choice, percentile=90,
                                                start_chunking_doy=starting_day_of_the_year, end_chunking_doy=ending_day_of_the_year,
                                                show_debug=True, single_download=False)

# NOTE(review): leftover development note (original wording: "disable chunk list, not need. for now, just test if works.")
print("We have finished saving all desired doy datasets completely!")
# Make sure the background memory monitor is switched off once everything has been saved
stop_monitoring = True

<div style="color:#008B00; padding: 10px; text-align: center; font-family: Georgia, serif; font-weight: bold; white-space: pre;">ø¤º°`°º¤ø,¸,ø¤°º¤ø,¸¸,ø¤º°`°º¤ø,¸  FILE VALIDATION, VERIFICATION, AND ANIMATIONS  °º¤ø,¸¸,ø¤º°`°º¤ø,¸,ø¤°º¤ø,¸¸,ø¤
</div>

In [None]:
## Function to create an animation that shows the mean and percentile latitude and longitude maps for the full 1 - 366 period. 
def check_processed_datasets_with_an_animation(baseline_name_arg, folder_name_arg,
                                               custom_output_filename=None, 
                                               percentile=None):
    """
    Build and save a GIF that steps through the saved doy maps (means or percentile thresholds).

    The saved zarr stores are located, every doy they contain is mapped to the store holding
    it, and one animation frame is drawn per doy in ascending order.

    Parameters
    ----------
    baseline_name_arg : str
        Baseline identifier that was used when saving (sub-folder name and file-name suffix).
    folder_name_arg : str
        File-name prefix that was used when saving.
    custom_output_filename : str or None
        File name of the GIF; when None a name is built from the other arguments.
    percentile : number or None
        None animates the climatological means; otherwise the positive percentile that was
        used when saving the thresholds (selects the "Thresh{percentile}th" directory).

    Returns
    -------
    matplotlib.animation.FuncAnimation
        The animation object (its figure has already been closed).

    Raises
    ------
    ValueError
        If a non-positive percentile is given.
    FileNotFoundError
        If no saved dataset matches the arguments.
    """
    ## Check proper arguments are provided for threshold datasets
    show_climatological_means = percentile is None
    if not show_climatological_means and percentile <= 0:
        raise ValueError("Please set a positive, non-zero numeric percentile (based on the percentile you used above in your percentile datasets)!")

    ## Gather the stored dataset filepaths
    data_type = "Clim" if show_climatological_means else f"Thresh{percentile}th"

    # Layout written by save_dataset_to_storage:
    #   {root}/OISST/{data_type}/{baseline}/{folder}_{optional custom id}_sst_..._{baseline}.zarr
    saved_pattern = f'{my_root_directory}/OISST/{data_type}/{baseline_name_arg}/{folder_name_arg}_*sst_*{baseline_name_arg}.zarr'
    # Layout this function originally searched; kept as a fallback for datasets stored that way
    data_directory = f'{my_root_directory}/OISST/{data_type}/{folder_name_arg}'
    legacy_pattern = f'{data_directory}_sst*{baseline_name_arg}.zarr'

    paths = sorted(glob.glob(saved_pattern)) or sorted(glob.glob(legacy_pattern))

    # A quick check to ensure we have located files given our arguments
    if not paths:
        start_error = "No files found matching the patterns"
        cont_error = "\nPlease verify you inputted the proper baseline_name_arg, folder_name_arg, and percentile arguments!"
        raise FileNotFoundError(f"{start_error}:\n{saved_pattern}\n{legacy_pattern}\n{cont_error}")

    def _open_sst(path):
        # Open one store and return its 'sst' array with a 'normalized_doy' dimension and no 'quantile' coordinate
        da = xr.open_zarr(path).sst
        if 'quantile' in da.coords:
            da = da.drop_vars("quantile")
        # Quick fix for my personal, early datasets (they used 'doy' instead of 'normalized_doy')
        if 'doy' in da.coords:
            da = da.rename({'doy': 'normalized_doy'})
        # Single-doy stores may hold the doy as a scalar coordinate; turn it into a length-1 dimension
        if 'normalized_doy' not in da.dims:
            da = da.expand_dims('normalized_doy')
        return da

    def _frame_for_doy(da, doy):
        # Return the data of one doy from an opened store
        doy_values = np.atleast_1d(da['normalized_doy'].values)
        doy_idx = np.where(doy_values == doy)[0][0]
        return da.isel(normalized_doy=doy_idx).values

    ## Fill a dictionary where all (1 to 366) doys are matched with their corresponding filepaths
    doys_dict = {}

    for filepath in paths:
        # Open and check what doys are in this file
        ds = _open_sst(filepath)

        # Map each doy to its file
        for doy in np.atleast_1d(ds['normalized_doy'].values):
            doys_dict[int(doy)] = filepath

        ds.close()

    ## Use the file holding the first available doy (and its features) to set up the plot
    available_doys = sorted(doys_dict.keys())
    first_doy = available_doys[0]
    setup_ds = _open_sst(doys_dict[first_doy])

    lon = setup_ds.lon.values
    lat = setup_ds.lat.values

    # The first frame shows the SST data of the first available doy
    setup_data = _frame_for_doy(setup_ds, first_doy)

    setup_ds.close()

    ## Initialize the plot    
    fig, ax = plt.subplots(figsize=(14, 6), 
                           subplot_kw={'projection': ccrs.Mercator()})

    pcm = ax.pcolormesh(
        lon, lat, setup_data,
        cmap='RdYlBu_r',
        vmin=-5, vmax=35,
        transform=ccrs.PlateCarree(),
    )

    ax.set_extent([0, 360, -5, 90], crs=ccrs.PlateCarree())
    ax.add_feature(cfeature.LAND, color='lightgray')
    ax.add_feature(cfeature.COASTLINE, linewidth=0.8)
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')

    title = ax.set_title('')
    title_base = '(Relative to 1993-2022)' if baseline_name_arg == 'Baseline9322' else f'({baseline_name_arg})'

    ## Animation function
    def animate(i):
        doy = available_doys[i]

        # Load the dataset holding this doy and pick out its data
        ds = _open_sst(doys_dict[doy])
        frame_data = _frame_for_doy(ds, doy)

        # Update the plot
        pcm.set_array(frame_data.ravel())
        title.set_text(f'{data_type} Day {doy} of the Year\n{title_base}')

        ds.close()
        return pcm, title

    ## Create the resulting animation
    type_message = "climatological means" if show_climatological_means else "percentile thresholds"
    print(f"Began animation for the {type_message} of the {folder_name_arg} datasets!")

    chosen_doys = len(available_doys)

    anim = animation.FuncAnimation(
        fig, animate,
        frames=chosen_doys,
        interval=200,
        blit=False,
        repeat=True
    )

    writer = animation.PillowWriter(fps=2)

    if custom_output_filename is None:
        output_filename = f"{folder_name_arg}_{data_type}_{baseline_name_arg}_{chosen_doys}_doys_total.gif"
    else:
        output_filename = custom_output_filename
    print(f"Saving animation at: {output_filename}") 

    ## Save the resulting animation
    def print_frame_progress(current_frame, total_frames):
        print(f"\r → Doy (Frame) Processed: {current_frame + 1}/{total_frames}", end='', flush=True)

    # The layout has to be adjusted BEFORE saving for it to show up in the saved frames
    fig.tight_layout()

    anim.save(output_filename, writer=writer, dpi=100,
              progress_callback=print_frame_progress)

    plt.close(fig)
    print(f"\nAnimation finished and saved!\n")

    return anim

# -----------------------------------------------------------------------------------------------------------------------------

# Note: passing a percentile produces an animation for percentile thresholds, and setting it to None an animation for means
# NOTE(review): folder_name_arg is hard-coded to "Full" here rather than taken from folder_name_choice;
# update it if you changed folder_name_choice at the top of the script.
selected_baseline = baseline_choice # you can tweak this here, or use the one you selected at the top of the script
check_processed_datasets_with_an_animation(baseline_name_arg=selected_baseline, folder_name_arg="Full", 
                                           custom_output_filename=None, percentile=None)

In [None]:
# Code to verify against the outputs of marineHeatWaves to come in the future...