In [23]:
'''
This routine computes the mean and the 80th, 90th, 95th, 98th, and 99th percentiles of daily maximum 2 m temperature using a 30-year base period (1995-2024) for heatwave detection.

Daniela Risaro
July 2025
'''


'\nThis routine computes daily temperature percentiles (80th, 90th, and 95th) using a 30-year base period (1995-2024) for heatwave detection.\n\nDaniela Risaro\nJuly 2025\n'

In [1]:
import os
import xarray as xr
import numpy as np
import pandas as pd 


In [None]:
# --- Configuration and input-file discovery ---------------------------------
# Input and output locations, relative to the notebook's working directory.
data_dir = "../data/raw/"
output_dir = "../data/processed/"

os.makedirs(output_dir, exist_ok=True)

# `variable` must appear in the file name; `area` holds the four bounding-box
# values that the tag below labels N, W, S, E in that order.
variable = "tmax"
area = [26, -93, 6.5, -56]

# Area tag that the input file names are expected to contain. Only the first
# value keeps its sign; the other three are written as absolute values.
area_str = f"area_{area[0]}N_{abs(area[1])}W_{abs(area[2])}S_{abs(area[3])}E"

# NetCDF files in data_dir whose name contains both the variable and the area tag.
files = sorted(
    f for f in os.listdir(data_dir)
    if f.endswith(".nc")
    and variable in f
    and area_str in f
)

# Fail early with an explicit message: without this guard an empty match only
# surfaces later as an IndexError when the first/last year is read.
if not files:
    raise FileNotFoundError(
        f"No '.nc' files containing '{variable}' and '{area_str}' found in '{data_dir}'"
    )

# The year is taken from the fifth underscore-separated token of each file name.
# NOTE(review): this depends on the naming scheme of the downloaded files, which
# is not visible here -- confirm if the download routine changes.
years_from_files = sorted(
    int(f.split("_")[4])
    for f in files
)

# The analysis period spans the earliest to the latest year found on disk.
start_year = years_from_files[0]
end_year = years_from_files[-1]

print("Files found:")
for f in files:
    print(" ", f)

print("\nYears detected:", years_from_files)
print(f"Start year: {start_year}")
print(f"End year: {end_year}")

In [None]:
import re

# A file is kept when the first 19xx/20xx token found in its name falls inside
# [start_year, end_year]; names without such a token are dropped.
year_token = re.compile(r'(19|20)\d{2}')

base_files = [
    fname for fname in files
    if (hit := year_token.search(fname)) and start_year <= int(hit.group()) <= end_year
]


In [None]:
## Load the selected files, concatenate them along time, compute statistics, save.

# Fail early with an explicit message instead of letting the concat below
# complain about an empty list.
if not base_files:
    raise ValueError("No input files selected for the base period; nothing to concatenate.")

# load_dataset reads each file into memory and closes it immediately, so no file
# handles stay open after this cell (open_dataset would leave one open per file
# on every re-run). When a "number" dimension is present it is squeezed out and
# its coordinate dropped.
datasets = [
    ds.squeeze("number", drop=True) if "number" in ds.dims else ds
    for ds in (xr.load_dataset(os.path.join(data_dir, f)) for f in base_files)
]

combined = xr.concat(datasets, dim="valid_time")
combined["valid_time"] = pd.to_datetime(combined["valid_time"].values)

# Convert to degrees Celsius (assumes t2m is stored in kelvin -- TODO confirm).
t2m_celsius = combined["t2m"] - 273.15
percentiles = [80, 90, 95, 98, 99]
percentile_vars = {}

# Each percentile is taken over the whole "valid_time" axis, i.e. over every
# time step of the period at once, giving one value per grid point.
# np.percentile does not skip NaNs: any missing value along time yields NaN.
for p in percentiles:
    t2m_p = (
        t2m_celsius
        .reduce(np.percentile, q=p, dim="valid_time")
        .squeeze()
    )
    t2m_p.name = f"t2m_p{p}"
    t2m_p.attrs["long_name"] = f"{p}th percentile of daily maximum 2m air temperature for period {start_year}-{end_year}"
    t2m_p.attrs["units"] = "degrees Celsius"
    percentile_vars[f"t2m_p{p}"] = t2m_p

# Mean over the same time axis, stored alongside the percentiles.
t2m_mean = (
    t2m_celsius
    .mean(dim="valid_time")
    .squeeze()
)
t2m_mean.name = "t2m_mean"
t2m_mean.attrs["long_name"] = "Mean daily maximum 2m air temperature"
t2m_mean.attrs["units"] = "degrees Celsius"

percentile_vars["t2m_mean"] = t2m_mean

ds_out = xr.Dataset(percentile_vars)
ds_out.attrs["period"] = f"{start_year}-{end_year}"
ds_out.attrs["source"] = "ERA5 daily statistics (analysis), downloaded via CDS API"

# Reuse the area tag built in the configuration cell rather than redefining
# `variable`/`area` here; the resulting file name is unchanged.
out_path = os.path.join(output_dir, f"tmax2m_mean_and_percentiles_{start_year}_{end_year}_{area_str}.nc")
ds_out.to_netcdf(out_path)
print(f"Archivo guardado: {out_path}")

Archivo guardado: ../data/processed/tmax2m_mean_and_percentiles_1995_2024_area_6N_-74W_-34S_-33E.nc
