In [23]:
'''
This routine computes the mean and the 80th, 90th, 95th, 98th, and 99th percentiles of daily maximum temperature over a 30-year base period (1995-2024) for heatwave detection.

Daniela Risaro
July 2025
'''


'\nThis routine computes daily temperature percentiles (80th, 90th, and 95th) using a 30-year base period (1995-2024) for heatwave detection.\n\nDaniela Risaro\nJuly 2025\n'

In [1]:
import os
import xarray as xr
import numpy as np
import pandas as pd 


In [2]:
length_clim = 30  # standard base period length for climatology

def select_base_period(years, length=None):
    """Pick the most recent fully covered base period automatically.

    Candidate periods start on a year divisible by five and are scanned from
    the latest possible start year backwards. The preferred length is tried
    first; if no window of that length is fully covered, shorter fallback
    lengths (20, then 10 years) are tried in the same way.

    Parameters
    ----------
    years : iterable of int
        Years for which data is available (any order, duplicates allowed).
    length : int, optional
        Preferred number of consecutive years. Defaults to the module-level
        ``length_clim``.

    Returns
    -------
    tuple of (int, int)
        First and last year (inclusive) of the selected period.

    Raises
    ------
    ValueError
        If ``years`` is empty or no candidate window is fully covered.
    """
    if length is None:
        length = length_clim

    # A set makes each coverage check linear in the window size instead of
    # scanning the whole list once per year.
    available = set(years)
    if not available:
        raise ValueError("No se encontró un periodo base válido con los años disponibles.")

    first, last = min(available), max(available)

    # Fallback lengths are only those shorter than the preferred one; 10 is
    # always kept as a last resort.
    fallback_lengths = [l for l in (20, 10) if l < length] or [10]

    for span in [length, *fallback_lengths]:
        # Walk start years from the latest one that still fits down to the
        # earliest available year.
        for start in range(last - span + 1, first - 1, -1):
            # Multiples of 10 are already multiples of 5, so one test suffices.
            if start % 5 != 0:
                continue
            if available.issuperset(range(start, start + span)):
                return start, start + span - 1

    raise ValueError("No se encontró un periodo base válido con los años disponibles.")


def find_available_periods(years, length=None):
    """List every fully covered base period that starts on a multiple of five.

    Parameters
    ----------
    years : iterable of int
        Years for which data is available (any order, duplicates allowed).
    length : int, optional
        Number of consecutive years each period must span. Defaults to the
        module-level ``length_clim``.

    Returns
    -------
    list of dict
        One ``{'start', 'end', 'description'}`` entry per valid period, in
        ascending order of start year. The list is empty when ``years`` is
        empty or when no window of ``length`` years is fully covered.
    """
    if length is None:
        length = length_clim

    # A set gives constant-time membership tests for the coverage check below.
    available = set(years)
    if not available:
        # Nothing to scan; returning an empty list lets the caller report
        # "no valid periods" instead of failing inside min()/max().
        return []

    available_periods = []
    # The last admissible start is the one whose window ends on the latest year.
    for start in range(min(available), max(available) - length + 2):
        if start % 5 != 0:
            continue
        end = start + length - 1
        if available.issuperset(range(start, end + 1)):
            available_periods.append({
                'start': start,
                'end': end,
                'description': f"{start}-{end}"
            })
    return available_periods


def select_base_period_interactive(years):
    """Ask the user to choose one of the available base periods.

    The valid periods are obtained from ``find_available_periods`` and printed
    as a numbered menu. The prompt is repeated until the user types a number
    that corresponds to one of the listed entries.

    Parameters
    ----------
    years : iterable of int
        Years for which data is available.

    Returns
    -------
    tuple of (int, int)
        Start and end year of the chosen period.

    Raises
    ------
    ValueError
        If no valid period exists for the given years.
    """
    options = find_available_periods(years)
    if not options:
        raise ValueError("No se encontraron periodos de {} años válidos con los años disponibles.".format(length_clim))

    print("\nPeriodos de {} años disponibles para el cálculo de la climatología:".format(length_clim))
    for i, period in enumerate(options, 1):
        print(f"{i}. {period['description']}")

    while True:
        try:
            # The menu is 1-based; convert the answer to a 0-based list index.
            index = int(input("\nSeleccione el número del periodo que desea usar: ")) - 1
        except ValueError:
            print("Por favor, ingrese un número válido.")
            continue

        if index < 0 or index >= len(options):
            print("Por favor, seleccione un número válido de la lista.")
            continue

        picked = options[index]
        return picked['start'], picked['end']


In [3]:
data_dir = "../data/raw/"
output_dir = "../data/processed/"

# Create the folder for processed output up front; a no-op when it already exists.
os.makedirs(output_dir, exist_ok=True)

## read files
variable = "tmax"
# Bounding box as [lat_max (N), lon_min (W), lat_min (S), lon_max (E)], the same
# order as the N/W/S/E labels used when the output file is named.
region = [6, -74, -34, -33]


def _is_requested_file(file):
    """Return True for NetCDF files that name the variable and every region edge."""
    if not file.endswith(".nc"):
        return False
    if variable not in file:
        return False
    # NOTE(review): this is a plain substring test on the absolute value of each
    # edge, so e.g. "6" is satisfied by any "6" in the name -- confirm that the
    # naming scheme makes this selective enough.
    return all(str(abs(coord)) in file for coord in region)


files = sorted(filter(_is_requested_file, os.listdir(data_dir)))

# The year is taken from the fourth "_"-separated field, up to its first ".".
years_from_files = sorted(int(file.split("_")[3].split(".")[0]) for file in files)


In [4]:

# Show the span of years found on disk before a base period is chosen.
print(f"\nAños disponibles: {min(years_from_files)}-{max(years_from_files)}")
# NOTE(review): this call prompts through input(), so a "Run All" pauses here
# until a period number is typed.
start_year, end_year = select_base_period_interactive(years_from_files)
print(f"\nPeriodo base seleccionado para climatología: {start_year}-{end_year}")


Años disponibles: 1995-2025

Periodos de 30 años disponibles para el cálculo de la climatología:
1. 1995-2024

Periodo base seleccionado para climatología: 1995-2024


In [5]:
import re

def _year_in_base_period(name):
    """Return True when the first 19xx/20xx year found in ``name`` lies in the base period."""
    found = re.search(r'(19|20)\d{2}', name)
    if found is None:
        # No year-like token in the file name: leave the file out.
        return False
    return start_year <= int(found.group()) <= end_year


# Restrict the file list to the years of the selected base period.
base_files = [name for name in files if _year_in_base_period(name)]


In [6]:
## open and concatenate files
def _open_base_file(path):
    """Open one NetCDF file and squeeze out the ``number`` dimension if it has one."""
    ds = xr.open_dataset(path)
    # Inspect the dataset that was just opened rather than opening the file a
    # second time only to look at its dimensions (that extra handle was never
    # closed).
    if "number" in ds.dims:
        ds = ds.squeeze("number", drop=True)
    return ds


datasets = [_open_base_file(os.path.join(data_dir, f)) for f in base_files]

combined = xr.concat(datasets, dim="valid_time")
combined["valid_time"] = pd.to_datetime(combined["valid_time"].values)

# Presumably t2m is stored in Kelvin; the result is labelled degrees Celsius below.
t2m_celsius = combined["t2m"] - 273.15
percentiles = [80, 90, 95, 98, 99]
percentile_vars = {}

# Each percentile is reduced along "valid_time", i.e. over every time step of
# the base period pooled together (one value per remaining coordinate).
for p in percentiles:
    t2m_p = (
        t2m_celsius
        .reduce(np.percentile, q=p, dim="valid_time")
        .squeeze()
    )
    t2m_p.name = f"t2m_p{p}"
    t2m_p.attrs["long_name"] = f"{p}th percentile of daily maximum 2m air temperature for period {start_year}-{end_year}"
    t2m_p.attrs["units"] = "degrees Celsius"
    percentile_vars[f"t2m_p{p}"] = t2m_p

# Mean over the same base period, stored alongside the percentiles.
t2m_mean = (
    t2m_celsius
    .mean(dim="valid_time")
    .squeeze()
)
t2m_mean.name = "t2m_mean"
t2m_mean.attrs["long_name"] = "Mean daily maximum 2m air temperature"
t2m_mean.attrs["units"] = "degrees Celsius"

percentile_vars["t2m_mean"] = t2m_mean

ds_out = xr.Dataset(percentile_vars)
ds_out.attrs["period"] = f"{start_year}-{end_year}"
ds_out.attrs["source"] = "ERA5 daily statistics (analysis), downloaded via CDS API"

# `region` is the bounding box already defined when the input files were
# listed; it is reused here instead of being re-declared, so the output name
# cannot drift from the region that selected the inputs.
out_path = os.path.join(output_dir, f"tmax2m_mean_and_percentiles_{start_year}_{end_year}_area_{region[0]}N_{region[1]}W_{region[2]}S_{region[3]}E.nc")
ds_out.to_netcdf(out_path)
print(f"Archivo guardado: {out_path}")

Archivo guardado: ../data/processed/tmax2m_mean_and_percentiles_1995_2024_area_6N_-74W_-34S_-33E.nc
