##### About this notebook:

In [None]:
#-----------------------------------------------------------------------------------------------------------------------------
# Author:             Erick Rico Esparza
# Dates:              Jan 12 - 20, 2025
# Description:        Data sanity check + seasonality consistency prior to full analysis + diurnal cycle exploration
# Organization:       Tampere University / Institute of Atmospheric Sciences and Climate Change (ICAyCC-UNAM)
#-----------------------------------------------------------------------------------------------------------------------------

# Stage 1

## 1. Setup & constants

In [20]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
import matplotlib.patches as mpatches
from scipy.stats import ttest_ind
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from xarray.coding.variables import SerializationWarning
import warnings
warnings.filterwarnings("ignore", category=SerializationWarning)
import matplotlib
matplotlib.use("Agg")
import calendar

In [23]:
import os
import re
import zipfile
from scipy.stats import linregress
from pathlib import Path

In [2]:
# --- Render floats in displayed tables with a thousands separator and two decimals
pd.set_option("display.float_format", '{:,.2f}'.format)

In [3]:
# --- Domain & constants
# Map extent passed to ax.set_extent: longitude range, then latitude range
LON_MIN, LON_MAX = -120, -85
LAT_MIN, LAT_MAX = 12, 33
# Point drawn as a star marker on every map panel (lon, lat)
LON_CDMX, LAT_CDMX = -99.13, 19.43

# MCMA box
# South-west and north-east corners of the rectangle drawn on the maps
SW_lat, SW_lon = 18.3, -100.9
NE_lat, NE_lon = 20.7, -97.4
# (x0, y0, width, height), the argument layout used for matplotlib.patches.Rectangle below
MCMA_BOX = (SW_lon, SW_lat, NE_lon - SW_lon, NE_lat - SW_lat)

# Column names of the daily PM table that are analysed
pollutants = ["PM2.5", "PM10"]

## 2. Load reanalysis + pre-processing

In [4]:
# 500 hPa
# One variable is extracted from each file: "hgt", "uwnd" and "vwnd".
H500 = xr.open_dataset("hgt500_mex_2012_2024.nc")["hgt"]
U500 = xr.open_dataset("uwnd500_mex_2012_2024.nc")["uwnd"]
V500 = xr.open_dataset("vwnd500_mex_2012_2024.nc")["vwnd"]

# Force datetime format if necessary
# (the time coordinate is replaced by its pandas-parsed equivalent so that
#  later selections can compare it against pandas timestamps)
H500 = H500.assign_coords(time=pd.to_datetime(H500.time.values))
U500 = U500.assign_coords(time=pd.to_datetime(U500.time.values))
V500 = V500.assign_coords(time=pd.to_datetime(V500.time.values))

# Grid coordinates shared by all map panels. The plotting code indexes both
# arrays with two indices (e.g. lon2d[::step, ::step]), so "lon" and "lat"
# are expected to be 2-D arrays in these files.
lon2d, lat2d = H500["lon"].values, H500["lat"].values

## 3. Daily PM + monthly extremes

In [5]:
# Daily city-mean PM table, indexed by date in chronological order
df = (
    pd.read_csv("pm_cdmx_citymean_daily_2012_2024.csv")
    .assign(DATE=lambda table: pd.to_datetime(table["DATE"]))
    .set_index("DATE")
    .sort_index()
)

def p90_events_by_month(series: pd.Series) -> dict:
    """
    Collect high-percentile event days, grouped by calendar month.

    The series is split into month-year groups; inside each group the days
    whose value is at or above that group's 90th percentile are events.
    Events from the same calendar month of different years are pooled.
    Similar to SEE4994's Week 6 logic, but organized by calendar month.

    Parameters
    ----------
    series : pd.Series
        Values indexed by a DatetimeIndex.

    Returns
    -------
    dict
        month (int 1-12) -> sorted, de-duplicated DatetimeIndex of event days
        (empty for months without events).
    """
    per_month = {month: set() for month in range(1, 13)}

    for _, chunk in series.groupby(series.index.to_period("M")):
        if len(chunk) == 0:
            continue
        cutoff = chunk.quantile(0.90)
        hits = chunk[chunk >= cutoff].index
        if len(hits):
            # every timestamp of a month-year group shares the same calendar month
            per_month[hits[0].month].update(hits)

    return {month: pd.DatetimeIndex(sorted(stamps)) for month, stamps in per_month.items()}

# Event days per pollutant: pollutant -> {month -> DatetimeIndex}
events_p90 = {}
for p in pollutants:
    events_p90[p] = p90_events_by_month(df[p].dropna())

print("Event day count (p90 within each month-year):")
for p in pollutants:
    counts = {m: len(days) for m, days in events_p90[p].items()}
    print(p, counts)

Event day count (p90 within each month-year):
PM2.5 {1: 49, 2: 39, 3: 52, 4: 39, 5: 48, 6: 37, 7: 52, 8: 55, 9: 39, 10: 49, 11: 38, 12: 52}
PM10 {1: 49, 2: 38, 3: 52, 4: 40, 5: 48, 6: 37, 7: 51, 8: 51, 9: 37, 10: 49, 11: 39, 12: 51}


In [6]:
# Structural check of the daily PM table: index range, columns, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4555 entries, 2012-01-01 to 2024-12-31
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   PM10    4555 non-null   float64
 1   PM2.5   4555 non-null   float64
dtypes: float64(2)
memory usage: 106.8 KB


## 4. Climatology + anomalies

In [7]:
def daily_climatology(da, window=31):
    """
    Smoothed day-of-year climatology of a time series of fields.

    Steps:
      1. 29 February samples are discarded.
      2. The remaining samples are averaged per calendar day-of-year.
      3. The day-of-year means are smoothed with a centred running mean that
         treats the annual cycle as periodic: the day-of-year axis is wrapped
         before smoothing so that the first and last days of the year use a
         full window instead of a truncated, one-sided one.

    Parameters
    ----------
    da : xarray.DataArray
        Data with a ``time`` dimension.
    window : int, default 31
        Length (in days) of the centred running mean.

    Returns
    -------
    xarray.DataArray
        Climatology with ``time`` replaced by a ``dayofyear`` dimension whose
        labels are the day-of-year values present in the filtered input.

    Notes
    -----
    The grouping key is the calendar day-of-year, so after dropping 29 February
    the days from 1 March onwards carry labels one higher in leap years than in
    common years. The running mean smooths over this one-day offset.
    """
    da = da.assign_coords(time=pd.to_datetime(da.time.values))
    is_feb29 = (da.time.dt.month == 2) & (da.time.dt.day == 29)
    da_noleap = da.sel(time=~is_feb29)
    clim = da_noleap.groupby("time.dayofyear").mean("time")

    half = window // 2
    n_days = clim.sizes["dayofyear"]
    # Wrap the annual cycle so the running mean is periodic across 31 Dec / 1 Jan
    padded = clim.pad(dayofyear=half, mode="wrap")
    smoothed = padded.rolling(dayofyear=window, center=True, min_periods=1).mean()
    # Drop the padding again; slice by explicit bounds so window=1 (half=0) also works
    return smoothed.isel(dayofyear=slice(half, half + n_days))

# Day-of-year climatology of H500, used as the reference for the anomalies below
clim500 = daily_climatology(H500)

## 5. Monthly composites + 3×4 multipanel (12 months)

In [8]:
# Compute H′ anomalies relative to daily climatology
# Using groupby to align daily climatology (dayofyear) with the time dimension:
# each time step has the climatological field of its own day-of-year subtracted.
Hprime500 = H500.groupby("time.dayofyear") - clim500

In [None]:
# --- Helper: composite of specific dates
def composite_dates(ds, dates):
    """
    Time-mean of `ds` over the requested dates.

    Parameters
    ----------
    ds : xarray object with a ``time`` coordinate
    dates : anything accepted by ``pd.to_datetime``

    Returns
    -------
    The mean over ``time`` (NaNs skipped) of the time steps whose timestamp is
    in `dates`, or ``None`` when `dates` is empty or none of them lies between
    the first and last timestamp of `ds`.
    """
    dates = pd.to_datetime(dates)
    if len(dates) == 0:
        return None

    # keep only the dates that fall inside the time span covered by ds
    first = pd.to_datetime(ds.time.min().values)
    last = pd.to_datetime(ds.time.max().values)
    in_range = dates[(dates >= first) & (dates <= last)]
    if len(in_range) == 0:
        return None

    selected = ds.sel(time=ds.time.isin(in_range))
    return selected.mean("time", skipna=True)

# --- t-test: events vs non-events within the SAME month (avoid seasonality leakage)
def ttest_mask_within_month(Hprime, event_dates, month:int):
    """
    Grid-point significance mask: event days vs. all other days of one month.

    Only time steps of `Hprime` that fall in calendar month `month` are used.
    They are split into event days (timestamp in `event_dates`) and control
    days (the rest), and compared with Welch's two-sample t-test
    (``equal_var=False``) along axis 0, omitting NaNs.

    Returns
    -------
    xarray.DataArray of bool
        True where p < 0.05. All False when either group has fewer than
        5 time steps. Coordinates are taken from the first time step of the
        month subset.
    """
    event_dates = pd.to_datetime(event_dates)
    in_month = Hprime.sel(time=Hprime.time.dt.month == month)
    template = in_month.isel(time=0)

    is_event = in_month.time.isin(event_dates)
    evt = in_month.sel(time=is_event)
    ctrl = in_month.sel(time=~is_event)

    # too few samples in either group: report nothing as significant
    if evt.time.size < 5 or ctrl.time.size < 5:
        return xr.zeros_like(template, dtype=bool)

    _, pval = ttest_ind(evt, ctrl, axis=0, equal_var=False, nan_policy="omit")
    return xr.DataArray(pval < 0.05, coords=template.coords)

# --- Main multipanel plot: 12 months in a 3x4 layout for one pollutant
def plot_monthly_multipanel(pol, events_by_month, Hprime, U, V, outname):
    """
    Draw a 3x4 map figure (one panel per calendar month) of event-day composites
    and save it to `outname`.

    Each panel shows, for the event dates of that month:
      - the composite of `Hprime` as a filled field on a colour scale shared by
        all panels (symmetric around zero),
      - contours of the same field every 5 data units (solid positive, dashed negative),
      - arrows of the composited `U` and `V` fields, every 4th grid point,
      - stippling where `ttest_mask_within_month` flags the grid point,
      - the MCMA rectangle and a star at (LON_CDMX, LAT_CDMX).
    Months without a composite get a "No events" placeholder.

    Parameters
    ----------
    pol : str
        Pollutant name, used only in the figure title.
    events_by_month : dict
        month (1-12) -> event dates.
    Hprime, U, V : xarray.DataArray
        Fields passed to `composite_dates`; `Hprime` is also passed to
        `ttest_mask_within_month`.
    outname : str
        Output image path.

    Raises
    ------
    ValueError
        If the largest absolute composite value over all months is zero.

    Notes
    -----
    Reads the module-level names lon2d, lat2d, LON_MIN, LON_MAX, LAT_MIN,
    LAT_MAX, MCMA_BOX, LON_CDMX and LAT_CDMX. The figure is closed after saving.
    """
    proj = ccrs.PlateCarree()
    fig, axes = plt.subplots(3, 4, figsize=(16, 9), dpi=250,
                             subplot_kw={'projection': proj})
    axes = axes.flatten()

    # ---- Precompute monthly anomalies to lock a consistent color scale ----
    Hp_month = {}
    max_abs = 0.0

    for m in range(1, 13):
        dates_m = events_by_month[m]
        Hp = composite_dates(Hprime, dates_m)
        Hp_month[m] = Hp
        if Hp is not None:
            max_abs = max(max_abs, float(np.nanmax(np.abs(Hp.values))))

    if max_abs == 0:
        raise ValueError("max_abs is zero. Are there no events or missing data?")

    # Symmetric limits so that zero sits at the centre of the diverging colormap
    norm = TwoSlopeNorm(vcenter=0, vmin=-max_abs, vmax=max_abs)
    # Last drawn mesh; reused as the mappable for the shared colorbar
    pcm_ref = None

    for i, m in enumerate(range(1, 13)):
        ax = axes[i]
        ax.set_extent([LON_MIN, LON_MAX, LAT_MIN, LAT_MAX], crs=proj)

        ax.coastlines(resolution="50m", linewidth=0.5)
        ax.add_feature(cfeature.BORDERS, linewidth=0.4)
        ax.add_feature(cfeature.STATES.with_scale("50m"), linewidth=0.3)

        dates_m = events_by_month[m]
        # n_evt counts all listed dates, including any outside the data's time range
        n_evt = len(dates_m)
        Hp = Hp_month[m]

        if Hp is None or n_evt == 0:
            ax.set_title(f"{calendar.month_abbr[m]} (n=0)")
            ax.text(0.5, 0.5, "No events", transform=ax.transAxes,
                    ha="center", va="center", fontsize=10)
            continue

        # U and V are composited as passed in (no climatology is removed here)
        Um = composite_dates(U, dates_m)
        Vm = composite_dates(V, dates_m)

        sig = ttest_mask_within_month(Hprime, dates_m, month=m)

        Hp_np = Hp.values
        Um_np = Um.values
        Vm_np = Vm.values
        sig_np = sig.values

        pcm = ax.pcolormesh(lon2d, lat2d, Hp_np, cmap="RdBu_r",
                            norm=norm, shading="auto", transform=proj)
        pcm_ref = pcm

        # contours
        # Levels start at -max_abs and advance in steps of stepc, so they are
        # generally not round numbers.
        stepc = 5
        lev = np.arange(-max_abs, max_abs + stepc, stepc)
        ax.contour(lon2d, lat2d, Hp_np, levels=lev[lev > 0],
                   colors="k", linewidths=0.4, linestyles="solid", transform=proj)
        ax.contour(lon2d, lat2d, Hp_np, levels=lev[lev < 0],
                   colors="k", linewidths=0.4, linestyles="dashed", transform=proj)

        # vectors
        step = 4
        ax.quiver(lon2d[::step, ::step], lat2d[::step, ::step],
                  Um_np[::step, ::step], Vm_np[::step, ::step],
                  scale=700, width=0.002, color="black", transform=proj)

        # stippling
        # Every `thin`-th flagged grid point (in row-major order) gets a dot
        y, x = np.where(sig_np)
        thin = 8
        y = y[::thin]; x = x[::thin]
        ax.scatter(lon2d[y, x], lat2d[y, x], s=2, c="k", alpha=0.25, transform=proj)

        # MCMA box + CDMX star
        rect = mpatches.Rectangle((MCMA_BOX[0], MCMA_BOX[1]),
                                  MCMA_BOX[2], MCMA_BOX[3],
                                  fill=False, edgecolor="k",
                                  linewidth=1, transform=proj)
        ax.add_patch(rect)

        ax.plot(LON_CDMX, LAT_CDMX, marker="*", color="gold",
                markersize=8, markeredgecolor="k", transform=proj)

        # grid labels: only left column + bottom row
        gl = ax.gridlines(draw_labels=True, linewidth=0.2, color="gray",
                          alpha=0.5, linestyle="--")
        gl.top_labels = False
        gl.right_labels = False
        if (i % 4) != 0:
            gl.left_labels = False
        if i < 8:
            gl.bottom_labels = False

        ax.set_title(f"{calendar.month_abbr[m]} (n={n_evt})", fontsize=10, weight="bold")

    # colorbar
    # Placed in a manually positioned axes to the right of the panels
    cbar_ax = fig.add_axes([0.92, 0.20, 0.015, 0.60])
    cb = fig.colorbar(pcm_ref, cax=cbar_ax)
    cb.set_label("Z500 anomaly (m)")

    fig.suptitle(f"{pol}: Monthly Z500′ composites (p90 within month-year), 2012–2024",
                 fontsize=14, weight="bold", y=0.98)

    # NOTE(review): the saved notebook output echoes this line once per figure,
    # which looks like a Matplotlib layout warning (possibly caused by the
    # manually added colorbar axes) — confirm, and consider fig.subplots_adjust.
    plt.tight_layout(rect=[0, 0, 0.9, 0.95])
    plt.savefig(outname, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print("Saved:", outname)


# --- Anomalies relative to the day-of-year climatology
Hprime500 = H500.groupby("time.dayofyear") - clim500

# --- One 12-month multipanel figure per pollutant
for pol in pollutants:
    figure_name = f"Z500_monthly_p90_{pol.replace('.', '')}.png"
    # events_p90[pol] is the dict month -> event dates
    plot_monthly_multipanel(pol, events_p90[pol], Hprime500, U500, V500, figure_name)

  plt.tight_layout(rect=[0, 0, 0.9, 0.95])


Saved: Z500_monthly_p90_PM25.png


  plt.tight_layout(rect=[0, 0, 0.9, 0.95])


Saved: Z500_monthly_p90_PM10.png


## Extra: WHO exceedance diagnostics

In [21]:
# Thresholds compared against the daily values, keyed by pollutant column name
WHO = {"PM2.5": 15.0, "PM10": 45.0}

def who_exceedance_stats(df, pol, thr):
    """
    Monthly exceedance statistics of one pollutant column against a threshold.

    Missing values are dropped first; a day counts as an exceedance when its
    value is greater than or equal to `thr`.

    Parameters
    ----------
    df : pd.DataFrame
        Table indexed by a DatetimeIndex.
    pol : str
        Column to analyse.
    thr : float
        Threshold value.

    Returns
    -------
    exc_days : pd.Series
        Exceedance days per calendar month (index 1-12, all years pooled).
    total_days : pd.Series
        Valid days per calendar month (index 1-12, all years pooled).
    pct : pd.Series
        100 * exc_days / total_days; NaN for months without valid days.
    counts_ym : pd.Series
        Exceedance days per year-month (PeriodIndex); periods without any
        exceedance are absent.
    """
    month_slots = range(1, 13)
    valid = df[pol].dropna()
    above = valid[valid >= thr]

    def _count_per_month(values):
        # pooled over all years; months without data are reported as 0
        return values.groupby(values.index.month).size().reindex(month_slots, fill_value=0)

    exc_days = _count_per_month(above)
    total_days = _count_per_month(valid)

    ratio = exc_days / total_days * 100
    pct = ratio.replace([np.inf, -np.inf], np.nan)

    # by year-month (PeriodIndex)
    counts_ym = above.groupby(above.index.to_period("M")).size()

    return exc_days, total_days, pct, counts_ym

# ---------- 1) PRINT: raw outputs ----------
results = {}  # per-pollutant statistics, reused by the plot and the table
for pol in pollutants:
    thr = WHO[pol]
    exc_days, total_days, pct, counts_ym = who_exceedance_stats(df, pol, thr)
    results[pol] = dict(exc_days=exc_days, total_days=total_days, pct=pct, counts_ym=counts_ym)

    report = (
        (f"\nWHO exceedance days for {pol} (thr={thr}):", exc_days),
        ("\nBy year-month:", counts_ym),
        (f"\n{pol} WHO exceedance percentage by month:", pct.round(1)),
    )
    for heading, payload in report:
        print(heading)
        print(payload)

# ---------- 2) PLOT: % exceedance by month ----------
months = np.arange(1, 13)
labels = [calendar.month_abbr[m] for m in months]
bar_colors = {"PM2.5": "#d55e00", "PM10": "#0072b2"}

# one row per pollutant, sharing the month axis
fig, axes = plt.subplots(2, 1, figsize=(10, 7), dpi=200, sharex=True)

for ax, pol in zip(axes, pollutants):
    pct = results[pol]["pct"].reindex(range(1, 13))
    ax.bar(months, pct.values, color=bar_colors.get(pol))
    ax.set(
        title=f"{pol}: WHO 24h exceedance percentage by month (thr={WHO[pol]} µg/m³)",
        ylabel="% of days exceed",
        ylim=(0, 100),  # percentages
    )
    ax.grid(alpha=0.3)

# month labels only on the bottom panel
bottom_ax = axes[-1]
bottom_ax.set_xticks(months)
bottom_ax.set_xticklabels(labels)
bottom_ax.set_xlabel("Month")

fig.tight_layout()
fig.savefig("WHO_exceedance_pct_by_month.png", dpi=300, bbox_inches="tight")
plt.close(fig)
print("\nSaved: WHO_exceedance_pct_by_month.png")


WHO exceedance days for PM2.5 (thr=15.0):
DATE
1     327
2     313
3     367
4     368
5     361
6     263
7     311
8     251
9     237
10    235
11    324
12    369
Name: PM2.5, dtype: int64

By year-month:
DATE
2012-01    30
2012-02    24
2012-03    28
2012-04    27
2012-05    30
           ..
2024-08    17
2024-09    18
2024-10    17
2024-11    26
2024-12    27
Freq: M, Name: PM2.5, Length: 154, dtype: int64

PM2.5 WHO exceedance percentage by month:
DATE
1    87.70
2    88.70
3    91.50
4    96.60
5    97.00
6    73.10
7    78.10
8    63.40
9    64.10
10   62.30
11   86.60
12   92.20
Name: PM2.5, dtype: float64

WHO exceedance days for PM10 (thr=45.0):
DATE
1     277
2     257
3     292
4     263
5     215
6      80
7      59
8      29
9      30
10     85
11    201
12    298
Name: PM10, dtype: int64

By year-month:
DATE
2012-01    26
2012-02    15
2012-03    27
2012-04    25
2012-05    28
           ..
2024-05    23
2024-06     6
2024-10     3
2024-11     9
2024-12    16
Freq: M,

In [22]:
# ---------- 3) TABLE + CSV export ----------
# One row per (pollutant, month), built from the statistics stored in `results`
rows = [
    {
        "Pollutant": pol,
        "Month": calendar.month_abbr[m],
        "Exceed_days": int(results[pol]["exc_days"].loc[m]),
        "Total_days": int(results[pol]["total_days"].loc[m]),
        "Exceed_%": float(results[pol]["pct"].loc[m]),
    }
    for pol in pollutants
    for m in range(1, 13)
]

who_table = pd.DataFrame(rows).assign(**{"Exceed_%": lambda t: t["Exceed_%"].round(1)})

print("\nWHO exceedance table (monthly, aggregated 2012–2024):")
print(who_table)

who_table.to_csv("WHO_exceedance_table_by_month.csv", index=False)
print("\nSaved: WHO_exceedance_table_by_month.csv")


WHO exceedance table (monthly, aggregated 2012–2024):
   Pollutant Month  Exceed_days  Total_days  Exceed_%
0      PM2.5   Jan          327         373     87.70
1      PM2.5   Feb          313         353     88.70
2      PM2.5   Mar          367         401     91.50
3      PM2.5   Apr          368         381     96.60
4      PM2.5   May          361         372     97.00
5      PM2.5   Jun          263         360     73.10
6      PM2.5   Jul          311         398     78.10
7      PM2.5   Aug          251         396     63.40
8      PM2.5   Sep          237         370     64.10
9      PM2.5   Oct          235         377     62.30
10     PM2.5   Nov          324         374     86.60
11     PM2.5   Dec          369         400     92.20
12      PM10   Jan          277         373     74.30
13      PM10   Feb          257         353     72.80
14      PM10   Mar          292         401     72.80
15      PM10   Apr          263         381     69.00
16      PM10   May         

## Extra2: Exploratory hourly-based diagnostic: daily maxima vs hour of occurrence (RAMA PM)

In [24]:
# -----------------------------
# Settings
# -----------------------------
DATA_DIR = Path("data-icaycc")  # folder containing the RAMA zip files
OUT_DIR = Path(".")  # figures are written here
OUT_DIR.mkdir(exist_ok=True)

# Sentinel replaced by NaN in the station columns when a yearly file is read
MISSING_VALUE = -99

# Years by pollutant
YEARS_PM10 = range(1995, 2024)   # 1995–2023
YEARS_PM25 = range(2003, 2024)   # 2003–2023

# Filenames inside zip: try to match patterns like "2023PM10.csv" or "2023PM25.csv"
# Each tag is inserted into a regular expression as-is (not escaped), so the "."
# in "PM2.5" matches any single character.
POLLUTANT_TAGS = {
    "PM10": ["PM10"],
    "PM25": ["PM25", "PM2.5", "PM2_5"]  # be flexible
}

# Plot controls
FIGSIZE = (16, 10)
DPI = 300

# Single 12-panel multipanel
MONTHS_PER_FIG = None  # None -> single 3x4 multipanel

In [25]:
# -----------------------------
# Helpers: locate zip and member file
# -----------------------------
def zip_path_for_year(year: int) -> Path:
    """
    Return path to the year RAMA zip, e.g., 95RAMA.zip for 1995, 03RAMA.zip for 2003.

    DATA_DIR is searched for ``<yy>RAMA*.zip`` (two-digit year); its subfolders
    are searched only when nothing is found at the top level.

    When several files match, the one with the shortest path is returned;
    paths of equal length are ordered alphabetically, so the choice does not
    depend on the order in which the filesystem lists the files.

    Raises
    ------
    FileNotFoundError
        If no matching zip exists under DATA_DIR.
    """
    yy = f"{year % 100:02d}"
    pattern = f"{yy}RAMA*.zip"

    # Top level first; recursive search only as a fallback
    candidates = list(DATA_DIR.glob(pattern)) or list(DATA_DIR.rglob(pattern))
    if not candidates:
        raise FileNotFoundError(f"Zip for year={year} not found. Expected something like {yy}RAMA.zip inside {DATA_DIR}")

    # Shortest path wins (usually the direct one); the path string breaks ties deterministically
    return min(candidates, key=lambda p: (len(str(p)), str(p)))


def find_member_for_year(zip_obj: zipfile.ZipFile, year: int, pollutant: str) -> str:
    """
    Name of the archive member that holds `pollutant` data for `year`.

    A member qualifies when its name contains the year followed (anywhere
    later) by one of the tags in POLLUTANT_TAGS[pollutant], case-insensitively.
    Names ending in .csv/.xls/.xlsx are tried first; if none qualifies, the
    extension requirement is dropped.

    Among the qualifying names, .csv files are preferred and shorter names win.

    Raises
    ------
    FileNotFoundError
        If no member qualifies.
    """
    year_str = str(year)
    tags = POLLUTANT_TAGS[pollutant]
    members = zip_obj.namelist()

    # First pass: year + tag + known extension (e.g. 2023PM10.csv, 2023PM25.xls)
    strict = [
        re.compile(rf".*{year_str}.*{tag}.*\.(csv|xls|xlsx)$", re.IGNORECASE)
        for tag in tags
    ]
    matches = [name for name in members for pat in strict if pat.match(name)]

    # Second pass: same search without the extension requirement
    if not matches:
        relaxed = [re.compile(rf".*{year_str}.*{tag}.*", re.IGNORECASE) for tag in tags]
        matches = [name for pat in relaxed for name in members if pat.match(name)]

    if not matches:
        raise FileNotFoundError(f"No member found in zip for year={year}, pollutant={pollutant}. Zip members example: {members[:10]}")

    # csv before anything else, then the shortest name; first hit wins on ties
    return min(matches, key=lambda s: (0 if s.lower().endswith(".csv") else 1, len(s)))


def read_year_pollutant_from_zip(zip_path: Path, year: int, pollutant: str) -> pd.DataFrame:
    """
    Load one pollutant-year table from a zip and reduce it to the hourly
    maximum across stations.

    The member is located with `find_member_for_year`. CSV members are read
    with automatic separator detection; any other member is read as Excel.
    Every column other than FECHA and HORA is treated as a station column.

    Returns
    -------
    pd.DataFrame
        Columns: date (datetime, parsed day-first), hour (numeric),
        row_max (maximum over the station columns, NaN when all are missing).
        Rows whose date or hour could not be parsed are dropped.

    Raises
    ------
    ValueError
        If the table lacks a FECHA or HORA column.
    """
    with zipfile.ZipFile(zip_path, "r") as archive:
        member = find_member_for_year(archive, year, pollutant)
        is_csv = member.lower().endswith(".csv")

        with archive.open(member) as handle:
            if is_csv:
                # sep=None lets pandas detect tabs / commas / semicolons
                df = pd.read_csv(handle, sep=None, engine="python")
            else:
                df = pd.read_excel(handle)

    # Column names as stripped strings
    df.columns = [str(c).strip() for c in df.columns]

    if not {"FECHA", "HORA"}.issubset(df.columns):
        raise ValueError(f"Expected columns FECHA and HORA in {zip_path.name}:{member}. Found: {df.columns[:10]}")

    # Unparseable dates / hours become NaT / NaN and are dropped below
    df["FECHA"] = pd.to_datetime(df["FECHA"], dayfirst=True, errors="coerce")
    df["HORA"] = pd.to_numeric(df["HORA"], errors="coerce")

    # Missing-value sentinel -> NaN in every station column
    station_cols = [c for c in df.columns if c not in ["FECHA", "HORA"]]
    df[station_cols] = df[station_cols].replace(MISSING_VALUE, np.nan)

    # Maximum across stations for each hourly row
    df["row_max"] = df[station_cols].max(axis=1, skipna=True)

    return (
        df[["FECHA", "HORA", "row_max"]]
        .dropna(subset=["FECHA", "HORA"])
        .rename(columns={"FECHA": "date", "HORA": "hour"})
    )

In [26]:
# -----------------------------
# Build daily maxima vs hour-of-maximum dataset
# -----------------------------
def build_daily_max_dataset(pollutant: str, years: range) -> pd.DataFrame:
    """
    Daily maximum and the hour at which it occurs, for every year in `years`.

    For each year the hourly station-maximum table is read from its zip; rows
    without a valid maximum are discarded, and for every date the row with the
    largest value is kept (the first such row when several hours tie). Years
    that yield no valid rows are skipped.

    Returns
    -------
    pd.DataFrame
        Columns: date, month, hour_of_max, daily_max; sorted by date.

    Raises
    ------
    RuntimeError
        If no year produced any record.
    """
    columns_out = ["date", "month", "hour_of_max", "daily_max"]
    per_year = []

    for yr in years:
        hourly = read_year_pollutant_from_zip(zip_path_for_year(yr), yr, pollutant)
        hourly = hourly.dropna(subset=["row_max"])
        if hourly.empty:
            continue

        # index label of the peak row for each date
        peak_labels = hourly.groupby("date")["row_max"].idxmax()
        peaks = hourly.loc[peak_labels]
        peaks = peaks.assign(month=peaks["date"].dt.month).rename(
            columns={"hour": "hour_of_max", "row_max": "daily_max"}
        )
        per_year.append(peaks[columns_out])

    if not per_year:
        raise RuntimeError(f"No daily records built for pollutant={pollutant}. Check filenames and missing values.")

    return pd.concat(per_year, ignore_index=True).sort_values("date")

In [39]:
# -----------------------------
# Plot: 12-month multipanel
# -----------------------------
def plot_dailymax_vs_hour_multipanel(
    df_daily,
    pollutant_label,
    year_start,
    year_end,
    out_png,
    months_per_fig=None,
    hour_min=0,
    hour_max=23,
    y_max_cap=None  # numeric cap to remove extreme outliers
):
    """
    Multipanel scatter, one panel per calendar month:
      x = hour_of_max (0–23)
      y = daily_max (units as provided)
    Adds linear trend line, rate and R².

    Parameters
    ----------
    df_daily : pd.DataFrame
        Needs the columns month, hour_of_max and daily_max. The input is
        copied, not modified.
    pollutant_label : str
        Text used in titles and the y-axis label.
    year_start, year_end :
        Only used in the panel titles.
    out_png : str
        Output file name, written inside OUT_DIR.
    months_per_fig : int or None
        None -> one figure with all 12 months (3x4). Otherwise the months are
        split into consecutive chunks of this size, one figure per chunk, and
        "_part<k>" is appended to the file stem.
    hour_min, hour_max : numeric
        Rows with hour_of_max outside [hour_min, hour_max] are dropped; the
        same bounds set the x-axis limits and the span of the trend line.
    y_max_cap : numeric or None
        When given, rows with daily_max above this value are dropped.

    Notes
    -----
    The trend is an ordinary least-squares line (scipy linregress) and is only
    drawn for panels with at least two rows and two distinct hours; other
    panels are annotated "n/a". Uses the module-level FIGSIZE, DPI and OUT_DIR.
    Each figure is saved and then closed.

    NOTE(review): if the source files number the hours 1–24, days whose maximum
    falls at hour 24 are removed by the default 0–23 filter — confirm against
    the data.
    """

    # Filter invalid hours
    df_daily = df_daily.copy()
    df_daily = df_daily[(df_daily["hour_of_max"] >= hour_min) & (df_daily["hour_of_max"] <= hour_max)]

    # Optional outlier cap for readability
    if y_max_cap is not None:
        df_daily = df_daily[df_daily["daily_max"] <= y_max_cap]

    months = list(range(1, 13))

    # Split months if requested
    if months_per_fig is None:
        month_chunks = [months]
    else:
        month_chunks = [months[i:i+months_per_fig] for i in range(0, 12, months_per_fig)]

    for chunk_i, chunk in enumerate(month_chunks, start=1):
        # Grid shape: 3x4 for a full year, 2x2 for four months, otherwise up to 4 columns
        n = len(chunk)
        if n == 12:
            nrows, ncols = 3, 4
        elif n == 4:
            nrows, ncols = 2, 2
        else:
            ncols = min(4, n)
            nrows = int(np.ceil(n / ncols))

        fig, axes = plt.subplots(nrows, ncols, figsize=FIGSIZE, dpi=DPI)
        # plt.subplots returns a bare Axes for a 1x1 grid; normalise to a flat array
        axes = np.array(axes).reshape(-1)

        # Keep handles for a single global legend
        sample_handle = None
        trend_handle = None

        for ax_i, m in enumerate(chunk):
            ax = axes[ax_i]
            sub = df_daily[df_daily["month"] == m].dropna(subset=["hour_of_max", "daily_max"]).copy()

            # Scatter: open red circles
            sc = ax.scatter(
                sub["hour_of_max"], sub["daily_max"],
                s=18, facecolors="none", edgecolors="red", alpha=0.55, linewidths=0.8,
                label="Samples"
            )
            if sample_handle is None:
                sample_handle = sc

            # Trend line
            if len(sub) >= 2 and sub["hour_of_max"].nunique() >= 2:
                lr = linregress(sub["hour_of_max"].values, sub["daily_max"].values)
                slope = lr.slope
                r2 = (lr.rvalue ** 2) if np.isfinite(lr.rvalue) else np.nan

                # The line is drawn across the full hour range, not only where samples exist
                x_line = np.linspace(hour_min, hour_max, 100)
                y_line = lr.intercept + slope * x_line

                ln, = ax.plot(x_line, y_line, color="blue", linewidth=2.0, label="Trend")
                if trend_handle is None:
                    trend_handle = ln

                # Top-right annotation in bold
                ax.text(
                    0.98, 0.95,
                    f"Rate: {slope:.2f} (µg/m³)/h\nR²: {r2:.4f}",
                    transform=ax.transAxes,
                    ha="right", va="top",
                    fontsize=9,
                    fontweight="bold"
                )
            else:
                ax.text(
                    0.98, 0.95,
                    "Rate: n/a\nR²: n/a",
                    transform=ax.transAxes,
                    ha="right", va="top",
                    fontsize=9,
                    fontweight="bold"
                )

            mon_name = calendar.month_abbr[m]
            ax.set_title(
                f"Daily maxima of {pollutant_label} vs hour ({mon_name} {year_start}–{year_end})",
                fontsize=9,
                fontweight="bold"
            )

            ax.set_xlabel("Hour of daily maximum [h]")
            ax.set_ylabel(f"{pollutant_label} [µg/m³]")

            # Force x ticks 0–23
            ax.set_xlim(hour_min, hour_max)
            ax.set_xticks(np.arange(hour_min, hour_max + 1, 2))

            ax.grid(True, alpha=0.35)

        # Turn off extra axes
        for j in range(len(chunk), len(axes)):
            axes[j].axis("off")

        # Global title
        fig.suptitle(
            f"Maximum observed {pollutant_label}: daily maxima vs hour of occurrence",
            fontsize=14,
            fontweight="bold"
        )

        # Global legend (Trend / Samples only)
        handles = []
        labels = []
        if trend_handle is not None:
            handles.append(trend_handle)
            labels.append("Trend")
        if sample_handle is not None:
            handles.append(sample_handle)
            labels.append("Samples")

        fig.legend(handles, labels, loc="upper left", bbox_to_anchor=(0.01, 0.98), frameon=False)

        # Leave the top 5% of the figure for the suptitle
        plt.tight_layout(rect=[0, 0, 1, 0.95])

        # Output name
        if months_per_fig is None:
            fname = out_png
        else:
            stem, ext = os.path.splitext(out_png)
            fname = f"{stem}_part{chunk_i}{ext}"

        plt.savefig(OUT_DIR / fname, dpi=DPI, bbox_inches="tight")
        plt.close(fig)

        print(f"Saved: {fname}")

In [40]:
# -----------------------------
# Run for PM2.5 and PM10
# -----------------------------
# The years shown in the panel titles are taken from the configured year
# ranges, so the titles stay in step with the data that are actually read.

# PM2.5 (PM25)
df_pm25 = build_daily_max_dataset("PM25", YEARS_PM25)
plot_dailymax_vs_hour_multipanel(
    df_pm25,
    pollutant_label="PM$_{2.5}$",
    year_start=YEARS_PM25[0],
    year_end=YEARS_PM25[-1],
    out_png="PM25_dailymax_vs_hour_multipanel.png",
    months_per_fig=MONTHS_PER_FIG
)

# PM10
df_pm10 = build_daily_max_dataset("PM10", YEARS_PM10)
plot_dailymax_vs_hour_multipanel(
    df_pm10,
    pollutant_label="PM$_{10}$",
    year_start=YEARS_PM10[0],
    year_end=YEARS_PM10[-1],
    out_png="PM10_dailymax_vs_hour_multipanel.png",
    months_per_fig=MONTHS_PER_FIG
)

Saved: PM25_dailymax_vs_hour_multipanel.png
Saved: PM10_dailymax_vs_hour_multipanel.png
