##### About this notebook:

In [None]:
#-----------------------------------------------------------------------------------------------------------------------------
# Author:             Erick Rico Esparza
# Dates:              Feb 9 - 20, 2025
# Description:        Implementation of approved core pathway to achieve the objectives of Stage 2 of the thesis project 
# Organization:       Tampere University / Institute of Atmospheric Sciences and Climate Change (ICAyCC-UNAM)
#-----------------------------------------------------------------------------------------------------------------------------

# Stage 2

## 1. Libraries and setup

In [120]:
# --- Core
import numpy as np
import pandas as pd
import xarray as xr

import os
from pathlib import Path
import warnings

# --- Plotting
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
import matplotlib.patches as mpatches
from matplotlib.colors import TwoSlopeNorm
import calendar

# --- Mapping
import cartopy.crs as ccrs
import cartopy.feature as cfeature

In [2]:
warnings.filterwarnings("ignore")

In [3]:
# Display options
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option("display.max_columns", 50)
pd.set_option("display.width", 120)

## 2. Paths, I/O rules, and folder structure

This notebook lives inside `Schedule/Stage 2/`.
All core inputs (CSV + 500 hPa NetCDF files) are located **one level above** (i.e., in `Schedule/`).

The notebook uses `Path` objects and relative paths to keep the project portable.


In [4]:
# --- Project root
HERE = Path.cwd()  # running from within Stage 2
PARENT = HERE.parent  # Schedule/

# --- Inputs (one level above Stage 2)
PM_CSV = PARENT / "pm_cdmx_citymean_daily_2012_2024.csv"
HGT_NC = PARENT / "hgt500_mex_2012_2024.nc"
U_NC   = PARENT / "uwnd500_mex_2012_2024.nc"
V_NC   = PARENT / "vwnd500_mex_2012_2024.nc"

# --- Outputs (inside Stage 2)
OUT_FIG = HERE / "outputs" / "figures"
OUT_TAB = HERE / "outputs" / "tables"

OUT_FIG.mkdir(parents=True, exist_ok=True)
OUT_TAB.mkdir(parents=True, exist_ok=True)

In [5]:
# --- Existence check
paths = {
    "PM_CSV": PM_CSV,
    "HGT_NC": HGT_NC,
    "U_NC": U_NC,
    "V_NC": V_NC,
    "OUT_FIG": OUT_FIG,
    "OUT_TAB": OUT_TAB,
}

for k, p in paths.items():
    print(f"{k}: {p}  | exists={p.exists()}")

PM_CSV: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\pm_cdmx_citymean_daily_2012_2024.csv  | exists=True
HGT_NC: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\hgt500_mex_2012_2024.nc  | exists=True
U_NC: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\uwnd500_mex_2012_2024.nc  | exists=True
V_NC: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\vwnd500_mex_2012_2024.nc  | exists=True
OUT_FIG: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\figures  | exists=True
OUT_TAB: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\tables  | exists=True


In [121]:
# --- Domain & global constants

# Regional domain for composites (Mexico-centered box)
LON_MIN, LON_MAX = -120.0, -85.0
LAT_MIN, LAT_MAX = 12.0, 33.0

# CDMX reference point (marker on maps)
LON_CDMX, LAT_CDMX = -99.13, 19.43

# Valley of Mexico regional box (renamed from MCMA box)
SW_lat, SW_lon = 18.3, -100.9
NE_lat, NE_lon = 20.7, -97.4

VOM_BOX = (
    SW_lon,
    SW_lat,
    NE_lon - SW_lon,
    NE_lat - SW_lat
)

print("Domain and Valley of Mexico box defined.")

# Reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

Domain and Valley of Mexico box defined.


## 3. Load daily PM and 500 hPa reanalysis

This section:
- Loads the city-mean daily PM time series (PM2.5 and PM10)
- Opens the 500 hPa NetCDF fields (Z500, U500, V500)
- Standardizes time coordinates and subsets to the analysis domain
- Performs quick sanity checks (date coverage, missing data)

### 3.1 Load daily PM (CSV)

In [10]:
# --- Load PM data
df_pm = pd.read_csv(PM_CSV)

# Parse dates, sort chronologically, and use DATE as the index
df_pm["DATE"] = pd.to_datetime(df_pm["DATE"])
df_pm = df_pm.sort_values("DATE").set_index("DATE")

# Ensure numeric (coerce errors to NaN)
for col in ["PM10", "PM2.5"]:
    df_pm[col] = pd.to_numeric(df_pm[col], errors="coerce")

print(df_pm.head())
print(df_pm.tail())
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
df_pm.info()
print("\nDate range:", df_pm.index.min(), "to", df_pm.index.max())
print("\nMissing values:\n", df_pm[["PM10", "PM2.5"]].isna().sum())

             PM10  PM2.5
DATE                    
2012-01-01 100.14  66.43
2012-01-02  19.29   6.14
2012-01-03  38.00  17.43
2012-01-04  67.71  35.00
2012-01-05  61.43  28.86
            PM10  PM2.5
DATE                   
2024-12-27 42.86  22.71
2024-12-28 47.83  23.00
2024-12-29 43.83  21.67
2024-12-30 51.40  26.00
2024-12-31 52.00  26.00
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4555 entries, 2012-01-01 to 2024-12-31
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   PM10    4555 non-null   float64
 1   PM2.5   4555 non-null   float64
dtypes: float64(2)
memory usage: 106.8 KB
None

Date range: 2012-01-01 00:00:00 to 2024-12-31 00:00:00

Missing values:
 PM10     0
PM2.5    0
dtype: int64


In [8]:
# QC summary table
qc = pd.DataFrame({
    "start_date": [df_pm.index.min()],
    "end_date": [df_pm.index.max()],
    "n_days": [df_pm.shape[0]],
    "pm10_missing": [df_pm["PM10"].isna().sum()],
    "pm25_missing": [df_pm["PM2.5"].isna().sum()],
})

qc

Unnamed: 0,start_date,end_date,n_days,pm10_missing,pm25_missing
0,2012-01-01,2024-12-31,4555,0,0


### 3.2 Open and standardize NetCDF fields (Z500, U500, V500)

In [45]:
# --- Debugging: check coordinates and dimensions of the Z500 field.
# The file is opened here instead of relying on a global `H` created by a later
# cell, so this cell also runs on a fresh kernel executed top to bottom.
with xr.open_dataset(HGT_NC) as _ds_dbg:
    _h_dbg = _ds_dbg["hgt"]
    print("H dims:", _h_dbg.dims)
    print("H coords:", list(_h_dbg.coords))
    for _name in ("lat", "lon"):
        if _name in _h_dbg.coords:
            print(f"{_name} dims/ndim:", _h_dbg[_name].dims, _h_dbg[_name].ndim)

H dims: ('time', 'y', 'x')
H coords: ['time', 'level', 'y', 'x', 'lat', 'lon']
lat dims/ndim: ('y', 'x') 2
lon dims/ndim: ('y', 'x') 2


In [20]:
def open_da(path: Path, varname: str) -> xr.DataArray:
    """
    Load one variable from a NetCDF file as a time-sorted DataArray.

    Parameters
    ----------
    path : Path
        Location of the NetCDF file.
    varname : str
        Name of the variable to extract from the dataset.

    Returns
    -------
    xr.DataArray
        The requested variable, ordered along its "time" coordinate.
    """
    dataset = xr.open_dataset(path)
    field = dataset[varname]
    return field.sortby("time")


def subset_domain_curvilinear(da: xr.DataArray,
                              lat_min: float, lat_max: float,
                              lon_min: float, lon_max: float) -> xr.DataArray:
    """
    Restrict a field to a lat/lon box using its "lat"/"lon" coordinates.

    A boolean mask of the cells inside the box is passed to ``where(..., drop=True)``:
    cells outside the box are masked, and only coordinate labels whose cells are
    all outside the box are dropped. The returned array can therefore still
    extend beyond the requested limits.

    Parameters
    ----------
    da : xr.DataArray
        Field carrying "lat" and "lon" coordinates.
    lat_min, lat_max : float
        Inclusive latitude limits of the box.
    lon_min, lon_max : float
        Inclusive longitude limits of the box, in the [-180, 180) convention.

    Returns
    -------
    xr.DataArray
        The masked and trimmed field.
    """
    # Wrap longitudes into [-180, 180) when the file stores them as 0..360
    if "lon" in da.coords and float(da["lon"].max()) > 180.0:
        wrapped_lon = ((da["lon"] + 180) % 360) - 180
        da = da.assign_coords(lon=wrapped_lon)

    lats, lons = da["lat"], da["lon"]
    in_lat_band = (lats >= lat_min) & (lats <= lat_max)
    in_lon_band = (lons >= lon_min) & (lons <= lon_max)
    return da.where(in_lat_band & in_lon_band, drop=True)

In [22]:
# --- Open fields (explicit varnames from your files)
H = open_da(HGT_NC, "hgt")
U = open_da(U_NC,   "uwnd")
V = open_da(V_NC,   "vwnd")

print("Opened fields:")
print("H:", H.name, H.dims, H.shape)
print("U:", U.name, U.dims, U.shape)
print("V:", V.name, V.dims, V.shape)

# --- Subset domain using the curvilinear lat/lon mask
H = subset_domain_curvilinear(H, LAT_MIN, LAT_MAX, LON_MIN, LON_MAX)
U = subset_domain_curvilinear(U, LAT_MIN, LAT_MAX, LON_MIN, LON_MAX)
V = subset_domain_curvilinear(V, LAT_MIN, LAT_MAX, LON_MIN, LON_MAX)

print("\nSubset fields:")
print("H:", H.dims, H.shape,
      f"lat[{float(H.lat.min()):.1f},{float(H.lat.max()):.1f}]",
      f"lon[{float(H.lon.min()):.1f},{float(H.lon.max()):.1f}]")

Opened fields:
H: hgt ('time', 'y', 'x') (4749, 100, 145)
U: uwnd ('time', 'y', 'x') (4749, 100, 145)
V: vwnd ('time', 'y', 'x') (4749, 100, 145)

Subset fields:
H: ('time', 'y', 'x') (4749, 90, 140) lat[9.1,35.5] lon[-124.6,-78.1]


In [23]:
# Trim PM and reanalysis to their common time window (for consistency).
# Only the shared start/end dates are enforced here: individual dates are NOT
# intersected, so the PM row count and the number of reanalysis timesteps can
# still differ after this cell (see the two counts printed at the end).

pm_start, pm_end = df_pm.index.min(), df_pm.index.max()

# Convert xarray time to pandas timestamps for slicing
# (first/last values are the extremes because the fields were sorted by time on load)
re_start = pd.to_datetime(H.time.values[0])
re_end   = pd.to_datetime(H.time.values[-1])

# Overlap window = latest start and earliest end of the two sources
start = max(pm_start, re_start)
end   = min(pm_end, re_end)

print("Overlap window:", start.date(), "to", end.date())

# Slice all objects to the overlap window (both endpoints inclusive)
df_pm = df_pm.loc[start:end]
H = H.sel(time=slice(np.datetime64(start), np.datetime64(end)))
U = U.sel(time=slice(np.datetime64(start), np.datetime64(end)))
V = V.sel(time=slice(np.datetime64(start), np.datetime64(end)))

print("PM days:", df_pm.shape[0])
print("Reanalysis timesteps:", H.time.size)

Overlap window: 2012-01-01 to 2024-12-31
PM days: 4555
Reanalysis timesteps: 4749


## 4. Monthly Z500 climatology and Z500′ anomalies

This section recomputes the monthly Z500 climatology and the corresponding anomalies (the Stage 2 baseline):
- `H_clim_monthly`: mean Z500 for each calendar month over 2012–2024
- `H_prime`: Z500 anomaly for each day relative to its calendar-month climatology

Sanity check:
- Z500′ should be roughly centered around 0 (distribution-level check).

In [24]:
# --- Stage 2 baseline

# Monthly climatology (2012–2024): mean Z500 for each calendar month
H_clim_mon = H.groupby("time.month").mean("time")

# Monthly anomalies: Z500′(t) = Z500(t) − climatological mean of the same calendar month
H_prime = H.groupby("time.month") - H_clim_mon

print("Monthly climatology and anomalies computed:")
print("H_clim_mon:", H_clim_mon.dims, H_clim_mon.shape)
print("H_prime   :", H_prime.dims, H_prime.shape)

Monthly climatology and anomalies computed:
H_clim_mon: ('month', 'y', 'x') (12, 90, 140)
H_prime   : ('time', 'y', 'x') (4749, 90, 140)


In [25]:
# Sanity check: anomalies should be roughly centered near zero (domain-wide, all times)
hp_vals = H_prime.values
hp_mean = float(np.nanmean(hp_vals))
hp_std  = float(np.nanstd(hp_vals))
hp_p50  = float(np.nanpercentile(hp_vals, 50))
hp_p95  = float(np.nanpercentile(hp_vals, 95))
hp_p05  = float(np.nanpercentile(hp_vals, 5))

print(f"Z500′ summary (all times, domain-wide): mean={hp_mean:.3f}, std={hp_std:.3f}, "
      f"p05={hp_p05:.3f}, p50={hp_p50:.3f}, p95={hp_p95:.3f}")

Z500′ summary (all times, domain-wide): mean=0.000, std=31.484, p05=-48.815, p50=1.012, p95=46.859


In [86]:
# histogram diagnostic (reproducible via RANDOM_SEED)
sample = hp_vals.ravel()
sample = sample[np.isfinite(sample)]
if sample.size > 0:
    rng = np.random.default_rng(RANDOM_SEED)
    sample = rng.choice(sample, size=min(200_000, sample.size), replace=False)

    plt.figure(figsize=(7, 4), dpi=140)
    plt.hist(sample, bins=80)
    plt.title("Z500′ anomaly distribution (monthly baseline; sample)")
    plt.xlabel("Z500′ (meters)")
    plt.ylabel("Count")
    out = OUT_FIG / "z500_anomaly_hist_monthly_baseline.png"
    plt.tight_layout()
    plt.savefig(out, dpi=200)
    plt.close()
    print(f"Saved: {out}")
else:
    print("No finite Z500′ values found for histogram check.")

Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\figures\z500_anomaly_hist_monthly_baseline.png


## 5. Event definition: p90 by calendar month (daily city-mean) and episode flags

Defining extreme PM event days using the approved rule:
- For each pollutant (PM2.5 and PM10), compute the **90th percentile within each calendar month**
  using the full 2012–2024 daily city-mean time series.
- A day is classified as an **event day** if PM(t) ≥ p90(month of t).

This section produces:
- A table of monthly p90 thresholds
- Boolean event flags for each day (PM2.5 and PM10)
- Basic counts by month/year
- Light persistence diagnostics via run-length episodes

### 5.1 Compute monthly p90 thresholds + event flags

In [27]:
# --- Helper: month-by-month percentile thresholds for a daily series
def monthly_percentile_thresholds(series: pd.Series, q: float = 0.90) -> pd.Series:
    """
    Quantile of a datetime-indexed series within each calendar month.

    Missing values are discarded before grouping. All years are pooled, so the
    result holds one threshold per calendar month present in the data.

    Parameters
    ----------
    series : pd.Series
        Values indexed by a DatetimeIndex.
    q : float
        Quantile in [0, 1]; defaults to 0.90.

    Returns
    -------
    pd.Series
        Thresholds indexed by month number.
    """
    valid = series[series.notna()]
    month_of_obs = valid.index.month
    return valid.groupby(month_of_obs).quantile(q)


def apply_monthly_thresholds(series: pd.Series, thr_by_month: pd.Series) -> pd.Series:
    """
    Map each timestamp to the threshold of its calendar month.

    Parameters
    ----------
    series : pd.Series
        Values indexed by a DatetimeIndex.
    thr_by_month : pd.Series
        Thresholds indexed by month number (1..12). Months may be absent, for
        example when the thresholds come from a series that does not cover the
        whole year.

    Returns
    -------
    pd.Series
        Float series on the index of `series`. It is NaN where the input value
        is NaN and where `thr_by_month` has no entry for that month.
    """
    # Vectorized lookup: months without a threshold become NaN instead of
    # raising KeyError, which a fixed 1..12 loop with .loc[m] would do.
    months = pd.Series(np.asarray(series.index.month), index=series.index)
    thr_aligned = months.map(thr_by_month).astype(float)

    # Positional mask (no label alignment), so a duplicated index is also safe.
    return thr_aligned.where(series.notna().to_numpy())

In [28]:
# --- Compute thresholds (p90 within calendar month)
# monthly_percentile_thresholds groups by series.index.month -> month calendar
p90_pm10 = monthly_percentile_thresholds(df_pm["PM10"], q=0.90) 
p90_pm25 = monthly_percentile_thresholds(df_pm["PM2.5"], q=0.90)

# Align thresholds to each day (based on that day's month)
thr_pm10 = apply_monthly_thresholds(df_pm["PM10"], p90_pm10)
thr_pm25 = apply_monthly_thresholds(df_pm["PM2.5"], p90_pm25)

# Event flags (daily)
evt_pm10 = (df_pm["PM10"] >= thr_pm10)
evt_pm25 = (df_pm["PM2.5"] >= thr_pm25)

# Store in the main dataframe for convenience
df_evt = df_pm.copy()
df_evt["thr_p90_PM10"] = thr_pm10
df_evt["thr_p90_PM2.5"] = thr_pm25
df_evt["evt_PM10_p90"] = evt_pm10.astype("Int64")   # 1/0 with NA allowed
df_evt["evt_PM2.5_p90"] = evt_pm25.astype("Int64")

df_evt.head()

Unnamed: 0_level_0,PM10,PM2.5,thr_p90_PM10,thr_p90_PM2.5,evt_PM10_p90,evt_PM2.5_p90
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2012-01-01,100.14,66.43,78.89,41.0,1,1
2012-01-02,19.29,6.14,78.89,41.0,0,0
2012-01-03,38.0,17.43,78.89,41.0,0,0
2012-01-04,67.71,35.0,78.89,41.0,0,0
2012-01-05,61.43,28.86,78.89,41.0,0,0


### 5.2 Save monthly thresholds table

In [29]:
# Monthly thresholds table
thr_tbl = pd.DataFrame({
    "month": np.arange(1, 13),
    "p90_PM10": [p90_pm10.loc[m] for m in range(1, 13)],
    "p90_PM2.5": [p90_pm25.loc[m] for m in range(1, 13)],
})

# Add month labels for readability
month_names = ["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"]
thr_tbl["month_name"] = month_names

thr_tbl = thr_tbl[["month", "month_name", "p90_PM10", "p90_PM2.5"]]

out_thr = OUT_TAB / "p90_thresholds_by_calendar_month_2012_2024.csv"
thr_tbl.to_csv(out_thr, index=False)
print(f"Saved: {out_thr}")

thr_tbl

Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\tables\p90_thresholds_by_calendar_month_2012_2024.csv


Unnamed: 0,month,month_name,p90_PM10,p90_PM2.5
0,1,Jan,78.89,41.0
1,2,Feb,75.97,35.7
2,3,Mar,68.29,32.75
3,4,Apr,69.63,37.13
4,5,May,72.88,40.26
5,6,Jun,52.17,28.93
6,7,Jul,47.52,25.42
7,8,Aug,42.33,23.29
8,9,Sep,44.01,26.17
9,10,Oct,52.5,28.0


### 5.3 Counts: expected N per month and per year

In [43]:
# Counts by month (event days)
freq_month = pd.DataFrame({
    "PM10_event_days": df_evt.groupby(df_evt.index.month)["evt_PM10_p90"].sum(min_count=1),
    "PM2.5_event_days": df_evt.groupby(df_evt.index.month)["evt_PM2.5_p90"].sum(min_count=1),
})
freq_month.index.name = "month"
freq_month["month_name"] = month_names
freq_month = freq_month[["month_name", "PM10_event_days", "PM2.5_event_days"]]

out_cbm = OUT_TAB / "freq_event_days_by_month_p90.csv"
freq_month.to_csv(out_cbm)
print(f"Saved: {out_cbm}")

freq_month

Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\tables\freq_event_days_by_month_p90.csv


Unnamed: 0_level_0,month_name,PM10_event_days,PM2.5_event_days
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Jan,38,39
2,Feb,36,36
3,Mar,41,41
4,Apr,39,39
5,May,38,38
6,Jun,36,36
7,Jul,40,40
8,Aug,41,40
9,Sep,37,37
10,Oct,39,40


In [44]:
# Counts by year (event days)
freq_year = pd.DataFrame({
    "PM10_event_days": df_evt.groupby(df_evt.index.year)["evt_PM10_p90"].sum(min_count=1),
    "PM2.5_event_days": df_evt.groupby(df_evt.index.year)["evt_PM2.5_p90"].sum(min_count=1),
})
freq_year.index.name = "year"
out_cby = OUT_TAB / "freq_event_days_by_year_p90.csv"
freq_year.to_csv(out_cby)
print(f"Saved: {out_cby}")

freq_year.head()

Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\tables\freq_event_days_by_year_p90.csv


Unnamed: 0_level_0,PM10_event_days,PM2.5_event_days
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2012,94,55
2013,78,75
2014,39,39
2015,47,65
2016,47,54


### 5.4 Light persistence: convert event days into episodes (run lengths)

In [32]:
def compute_runs(event_bool: pd.Series) -> pd.DataFrame:
    """
    Convert a boolean event-day series into episodes (runs of consecutive event days).

    Parameters
    ----------
    event_bool : pd.Series
        Event flags (True/False; NaN is treated as False), indexed by date.

    Returns
    -------
    pd.DataFrame
        One row per episode with columns `episode_id` (1-based), `start`, `end`
        and `duration_days` (number of event days in the run).

    Notes
    -----
    When the index is a DatetimeIndex, a gap of more than one day between
    neighbouring rows ends the current run. Without this, event days on either
    side of missing dates would merge into one episode whose `duration_days`
    is shorter than its start-to-end span.
    """
    s = event_bool.fillna(False).astype(bool)

    # A new run starts wherever the flag differs from the previous row ...
    change = s.ne(s.shift(1, fill_value=False)).to_numpy()

    # ... or wherever the calendar is discontinuous (missing days in the record).
    if isinstance(s.index, pd.DatetimeIndex) and len(s) > 1:
        step = np.diff(s.index.values)
        gap = np.concatenate(([False], step > np.timedelta64(1, "D")))
        change = change | gap

    run_id = np.cumsum(change)

    # Keep only True runs and summarize each one by its first/last date and size
    frame = pd.DataFrame({"when": s.index, "is_event": s.to_numpy(), "run_id": run_id})
    runs = (
        frame.loc[frame["is_event"]]
        .groupby("run_id")["when"]
        .agg(start="min", end="max", duration_days="size")
        .reset_index(drop=True)
    )

    # Add an episode index
    runs.insert(0, "episode_id", np.arange(1, len(runs) + 1))
    return runs

In [33]:
# Compute episodes for each pollutant
epi_pm10 = compute_runs(evt_pm10)
epi_pm25 = compute_runs(evt_pm25)

# Save episode tables
out_epi10 = OUT_TAB / "episodes_runs_PM10_p90.csv"
out_epi25 = OUT_TAB / "episodes_runs_PM25_p90.csv"
epi_pm10.to_csv(out_epi10, index=False)
epi_pm25.to_csv(out_epi25, index=False)
print(f"Saved: {out_epi10}")
print(f"Saved: {out_epi25}")

epi_pm10.head()

Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\tables\episodes_runs_PM10_p90.csv
Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\tables\episodes_runs_PM25_p90.csv


Unnamed: 0,episode_id,start,end,duration_days
0,1,2012-01-01,2012-01-01,1
1,2,2012-01-20,2012-01-21,2
2,3,2012-01-23,2012-01-24,2
3,4,2012-03-01,2012-03-03,3
4,5,2012-03-06,2012-03-09,4


### 5.5 Persistence summaries

In [34]:
def episode_summary(epi: pd.DataFrame) -> pd.Series:
    """
    Summarize the episode durations of one pollutant.

    Parameters
    ----------
    epi : pd.DataFrame
        Episode table with a `duration_days` column.

    Returns
    -------
    pd.Series
        `n_episodes`, `mean_duration`, `median_duration`, `p90_duration` and
        `max_duration`. The duration statistics are NaN when the table is empty.
    """
    n_episodes = len(epi)
    if n_episodes:
        durations = epi["duration_days"]
        duration_stats = {
            "mean_duration": float(durations.mean()),
            "median_duration": float(durations.median()),
            "p90_duration": float(durations.quantile(0.90)),
            "max_duration": float(durations.max()),
        }
    else:
        stat_names = ("mean_duration", "median_duration", "p90_duration", "max_duration")
        duration_stats = {name: np.nan for name in stat_names}

    return pd.Series({"n_episodes": n_episodes, **duration_stats})

In [35]:
sum_pm10 = episode_summary(epi_pm10)
sum_pm25 = episode_summary(epi_pm25)

summary_tbl = pd.DataFrame({"PM10": sum_pm10, "PM2.5": sum_pm25})
out_sum = OUT_TAB / "episode_run_summary_p90.csv"
summary_tbl.to_csv(out_sum)
print(f"Saved: {out_sum}")

summary_tbl

Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\tables\episode_run_summary_p90.csv


Unnamed: 0,PM10,PM2.5
n_episodes,250.0,286.0
mean_duration,1.86,1.62
median_duration,1.0,1.0
p90_duration,3.0,3.0
max_duration,10.0,10.0


In [36]:
# Simple duration histograms
def plot_duration_hist(epi: pd.DataFrame, label: str, outpath: Path):
    """
    Save a histogram of episode durations using one-day-wide bins.

    Parameters
    ----------
    epi : pd.DataFrame
        Episode table with a `duration_days` column.
    label : str
        Pollutant label inserted into the figure title.
    outpath : Path
        Destination of the saved figure.
    """
    durations = epi["duration_days"]
    # Bin edges 1, 2, ..., max+1 (at least one bin even for very short records)
    last_edge = max(2, durations.max() + 2)

    fig, ax = plt.subplots(figsize=(6.5, 4), dpi=140)
    ax.hist(durations.values, bins=np.arange(1, last_edge))
    ax.set_title(f"Episode duration distribution ({label}, p90 event days)")
    ax.set_xlabel("Duration (days)")
    ax.set_ylabel("Number of episodes")
    fig.tight_layout()
    fig.savefig(outpath, dpi=200)
    plt.close(fig)
    print(f"Saved: {outpath}")

In [37]:
if len(epi_pm10):
    plot_duration_hist(epi_pm10, "PM10", OUT_FIG / "episode_duration_hist_PM10_p90.png")
if len(epi_pm25):
    plot_duration_hist(epi_pm25, "PM2.5", OUT_FIG / "episode_duration_hist_PM25_p90.png")

Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\figures\episode_duration_hist_PM10_p90.png
Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\figures\episode_duration_hist_PM25_p90.png


### 5.6 Saving the working event dataframe

In [38]:
# Save a compact daily table with PM, thresholds, and flags for later sections
out_daily = OUT_TAB / "pm_daily_with_p90_flags_2012_2024.csv"
df_evt.reset_index().to_csv(out_daily, index=False)
print(f"Saved: {out_daily}")

Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\tables\pm_daily_with_p90_flags_2012_2024.csv


## 6. Baseline episode climatology: frequency, severity, duration

This section characterizes the p90 event days (and event episodes) using:
- **Frequency**: event-day counts by month and by year, plus a year×month heatmap
- **Severity**: exceedance above the monthly p90 threshold (PM − p90_month), summarized by month/year
- **Duration**: run-length episode statistics (from Section 5), summarized by month/season/year

### 6.1 Frequency: monthly climatology, yearly totals, and year×month heatmap

Already computed in `Section 5`:
- Monthly event-day climatology
- Yearly event-day totals

In [46]:
# --- Heatmap table: year x month (event-day counts)
def year_month_event_table(df: pd.DataFrame, flag_col: str) -> pd.DataFrame:
    """
    Count event days per (year, calendar month).

    Parameters
    ----------
    df : pd.DataFrame
        Daily table indexed by a DatetimeIndex.
    flag_col : str
        Name of the 0/1 event-flag column to sum.

    Returns
    -------
    pd.DataFrame
        Rows are years, columns are the twelve month abbreviations ("Jan".."Dec").
        Year/month combinations without data are filled with 0.
    """
    month_labels = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
                    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

    flagged = df[[flag_col]].assign(year=df.index.year, month=df.index.month)
    counts = flagged.pivot_table(
        values=flag_col,
        index="year",
        columns="month",
        aggfunc="sum",
        fill_value=0,
    )

    # Guarantee all twelve month columns, in calendar order, before relabelling
    counts = counts.reindex(columns=range(1, 13), fill_value=0)
    counts.columns = month_labels
    return counts

hm_pm10 = year_month_event_table(df_evt, "evt_PM10_p90")
hm_pm25 = year_month_event_table(df_evt, "evt_PM2.5_p90")

hm_pm10.to_csv(OUT_TAB / "heatmap_year_month_eventdays_PM10_p90.csv")
hm_pm25.to_csv(OUT_TAB / "heatmap_year_month_eventdays_PM25_p90.csv")
print("Saved heatmap tables for PM10 and PM2.5.")

Saved heatmap tables for PM10 and PM2.5.


In [49]:
# --- Plots: monthly bars + yearly bars + heatmaps

def plot_monthly_bars(freq_month_df: pd.DataFrame, outpath: Path):
    """
    Save the event-day count per calendar month for PM10 and PM2.5.

    Despite the name, the counts are drawn as two marked lines, not bars.

    Parameters
    ----------
    freq_month_df : pd.DataFrame
        Table with columns `month_name`, `PM10_event_days` and `PM2.5_event_days`;
        twelve rows are expected because the x positions are fixed to 1..12.
    outpath : Path
        Destination of the saved figure.
    """
    month_pos = np.arange(1, 13)

    fig, ax = plt.subplots(figsize=(8.8, 4.5), dpi=150)
    for column, legend_label in (("PM10_event_days", "PM10"), ("PM2.5_event_days", "PM2.5")):
        ax.plot(month_pos, freq_month_df[column].values, marker="o", label=legend_label)
    ax.set_xticks(month_pos)
    ax.set_xticklabels(freq_month_df["month_name"].values)
    ax.set_title("Event-day frequency by calendar month (p90 events, 2012–2024)")
    ax.set_xlabel("Month")
    ax.set_ylabel("Number of event days")
    ax.legend()
    fig.tight_layout()
    fig.savefig(outpath, dpi=200)
    plt.close(fig)
    print(f"Saved: {outpath}")

def plot_yearly_bars(freq_year_df: pd.DataFrame, outpath: Path):
    """
    Save the event-day count per year for PM10 and PM2.5.

    Despite the name, the counts are drawn as two marked lines, not bars.

    Parameters
    ----------
    freq_year_df : pd.DataFrame
        Table indexed by year with columns `PM10_event_days` and `PM2.5_event_days`.
    outpath : Path
        Destination of the saved figure.
    """
    year_axis = freq_year_df.index.values

    fig, ax = plt.subplots(figsize=(9.2, 4.6), dpi=150)
    for column, legend_label in (("PM10_event_days", "PM10"), ("PM2.5_event_days", "PM2.5")):
        ax.plot(year_axis, freq_year_df[column].values, marker="o", label=legend_label)
    ax.set_title("Event-day frequency by year (p90 events)")
    ax.set_xlabel("Year")
    ax.set_ylabel("Number of event days")
    ax.legend()
    fig.tight_layout()
    fig.savefig(outpath, dpi=200)
    plt.close(fig)
    print(f"Saved: {outpath}")

def plot_heatmap(table: pd.DataFrame, title: str, outpath: Path):
    """
    Save a year × month heatmap of event-day counts.

    Parameters
    ----------
    table : pd.DataFrame
        Year-indexed table; the x ticks are fixed to twelve positions, so
        twelve (month) columns are expected.
    title : str
        Figure title.
    outpath : Path
        Destination of the saved figure.
    """
    # imshow needs a plain float matrix: coerce nullable/object cells, unparsable -> 0
    numeric_table = table.apply(pd.to_numeric, errors="coerce").fillna(0)
    counts = numeric_table.to_numpy(dtype=float)

    fig, ax = plt.subplots(figsize=(10.5, 5.5), dpi=150)
    image = ax.imshow(counts, aspect="auto")
    fig.colorbar(image, ax=ax, label="Event days")
    ax.set_yticks(np.arange(table.shape[0]))
    ax.set_yticklabels(table.index.values)
    ax.set_xticks(np.arange(12))
    ax.set_xticklabels(table.columns.values)
    ax.set_title(title)
    ax.set_xlabel("Month")
    ax.set_ylabel("Year")
    fig.tight_layout()
    fig.savefig(outpath, dpi=200)
    plt.close(fig)
    print(f"Saved: {outpath}")

In [50]:
plot_monthly_bars(freq_month, OUT_FIG / "freq_event_days_by_month_p90.png")
plot_yearly_bars(freq_year, OUT_FIG / "freq_event_days_by_year_p90.png")
plot_heatmap(hm_pm10, "PM10 event-day heatmap (p90 events)", OUT_FIG / "heatmap_year_month_PM10_p90.png")
plot_heatmap(hm_pm25, "PM2.5 event-day heatmap (p90 events)", OUT_FIG / "heatmap_year_month_PM25_p90.png")

Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\figures\freq_event_days_by_month_p90.png
Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\figures\freq_event_days_by_year_p90.png
Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\figures\heatmap_year_month_PM10_p90.png
Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\figures\heatmap_year_month_PM25_p90.png


### 6.2 Severity: exceedance above monthly p90 threshold

Severity is defined as:

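$$
\mathrm{exceedance}(t) = \mathrm{PM}(t) - p90_{\mathrm{month}(t)}
$$
where $p90_{\mathrm{month}(t)}$ is the p90 threshold of the calendar month of day $t$; it is evaluated for event days only.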

In [51]:
# --- Exceedance time series (keep NaN for non-event days)
df_evt["exc_PM10"] = np.where(df_evt["evt_PM10_p90"] == 1,
                              df_evt["PM10"] - df_evt["thr_p90_PM10"],
                              np.nan)

df_evt["exc_PM2.5"] = np.where(df_evt["evt_PM2.5_p90"] == 1,
                               df_evt["PM2.5"] - df_evt["thr_p90_PM2.5"],
                               np.nan)

# Check
print("Exceedance non-NaN counts (should match event-day totals):")
print("PM10:", np.isfinite(df_evt["exc_PM10"]).sum())
print("PM2.5:", np.isfinite(df_evt["exc_PM2.5"]).sum())

Exceedance non-NaN counts (should match event-day totals):
PM10: 464
PM2.5: 464


In [52]:
# --- Severity by month (event days only)
sev_month = pd.DataFrame({
    "PM10_exc_mean": df_evt.groupby(df_evt.index.month)["exc_PM10"].mean(),
    "PM10_exc_median": df_evt.groupby(df_evt.index.month)["exc_PM10"].median(),
    "PM25_exc_mean": df_evt.groupby(df_evt.index.month)["exc_PM2.5"].mean(),
    "PM25_exc_median": df_evt.groupby(df_evt.index.month)["exc_PM2.5"].median(),
})
sev_month.index.name = "month"
sev_month["month_name"] = ["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"]
sev_month = sev_month[["month_name", "PM10_exc_mean", "PM10_exc_median", "PM25_exc_mean", "PM25_exc_median"]]

out = OUT_TAB / "severity_exceedance_by_month_p90.csv"
sev_month.to_csv(out)
print(f"Saved: {out}")

sev_month

Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\tables\severity_exceedance_by_month_p90.csv


Unnamed: 0_level_0,month_name,PM10_exc_mean,PM10_exc_median,PM25_exc_mean,PM25_exc_median
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Jan,12.73,9.61,10.89,4.67
2,Feb,7.1,4.95,4.52,2.46
3,Mar,10.95,8.71,3.81,3.54
4,Apr,11.38,7.23,4.99,3.54
5,May,11.72,10.25,10.47,6.16
6,Jun,4.41,3.41,4.02,2.94
7,Jul,4.62,2.06,2.97,2.03
8,Aug,6.55,5.3,3.26,2.35
9,Sep,6.42,5.16,3.68,2.83
10,Oct,7.38,5.33,4.38,3.23


In [53]:
# --- Severity by year (event days only)
sev_year = pd.DataFrame({
    "PM10_exc_mean": df_evt.groupby(df_evt.index.year)["exc_PM10"].mean(),
    "PM25_exc_mean": df_evt.groupby(df_evt.index.year)["exc_PM2.5"].mean(),
})
sev_year.index.name = "year"

out = OUT_TAB / "severity_exceedance_by_year_p90.csv"
sev_year.to_csv(out)
print(f"Saved: {out}")

sev_year.head()

Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\tables\severity_exceedance_by_year_p90.csv


Unnamed: 0_level_0,PM10_exc_mean,PM25_exc_mean
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2012,8.46,4.97
2013,12.29,7.02
2014,5.82,4.94
2015,6.8,5.92
2016,8.84,6.22


In [81]:
# --- Plots: severity by month (mean/median) and severity distribution (boxplots)

def plot_severity_by_month(sev_month_df: pd.DataFrame, outpath: Path):
    """
    Save the mean exceedance above the monthly p90 threshold, by calendar month.

    Parameters
    ----------
    sev_month_df : pd.DataFrame
        Table with columns `month_name`, `PM10_exc_mean` and `PM25_exc_mean`;
        twelve rows are expected because the x positions are fixed to 1..12.
    outpath : Path
        Destination of the saved figure.
    """
    month_pos = np.arange(1, 13)
    curves = (
        ("PM10_exc_mean", "PM10 mean exceedance"),
        ("PM25_exc_mean", "PM2.5 mean exceedance"),
    )

    fig, ax = plt.subplots(figsize=(9.0, 4.8), dpi=150)
    for column, legend_label in curves:
        ax.plot(month_pos, sev_month_df[column].values, marker="o", label=legend_label)
    ax.set_xticks(month_pos)
    ax.set_xticklabels(sev_month_df["month_name"].values)

    ax.set_title("Mean exceedance above monthly p90 threshold (event days only)")
    ax.set_xlabel("Month")
    ax.set_ylabel("Exceedance (µg/m³)")
    ax.legend()
    fig.tight_layout()
    fig.savefig(outpath, dpi=200)
    plt.close(fig)
    print(f"Saved: {outpath}")

def plot_exceedance_boxplots(df: pd.DataFrame, col: str, title: str, outpath: Path):
    """
    Save one boxplot of exceedance values per calendar month (outliers hidden).

    Parameters
    ----------
    df : pd.DataFrame
        Daily table indexed by a DatetimeIndex.
    col : str
        Exceedance column to plot; NaN entries are dropped per month.
    title : str
        Figure title.
    outpath : Path
        Destination of the saved figure.
    """
    month_labels = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
                    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
    month_of_day = df.index.month

    # One array of finite exceedances per calendar month, Jan..Dec
    samples = []
    for month in range(1, 13):
        in_month = df.loc[month_of_day == month, col]
        samples.append(in_month.dropna().values)

    fig, ax = plt.subplots(figsize=(10.2, 4.8), dpi=150)
    ax.boxplot(samples, showfliers=False)
    ax.set_xticks(np.arange(1, 13))
    ax.set_xticklabels(month_labels)
    ax.set_title(title)
    ax.set_xlabel("Month")
    ax.set_ylabel("Exceedance (µg/m³)")
    fig.tight_layout()
    fig.savefig(outpath, dpi=200)
    plt.close(fig)
    print(f"Saved: {outpath}")

In [82]:
plot_severity_by_month(sev_month, OUT_FIG / "severity_mean_by_month_p90.png")
plot_exceedance_boxplots(df_evt, "exc_PM10",
                         "PM10 exceedance distribution by month (p90 event days)",
                         OUT_FIG / "severity_boxplot_by_month_PM10_p90.png")
plot_exceedance_boxplots(df_evt, "exc_PM2.5",
                         "PM2.5 exceedance distribution by month (p90 event days)",
                         OUT_FIG / "severity_boxplot_by_month_PM25_p90.png")

Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\figures\severity_mean_by_month_p90.png
Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\figures\severity_boxplot_by_month_PM10_p90.png
Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\figures\severity_boxplot_by_month_PM25_p90.png


### 6.3 Duration: episode/run statistics and timing

The episode tables (`epi_pm10`, `epi_pm25`) were already built in Section 5. This section adds:

- episode start month / season
- counts and mean durations by month/season

In [61]:
# --- Add timing metadata to episode tables
def add_episode_time_fields(epi: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of an episode table with start-date timing columns added.

    `start` and `end` are converted to datetimes, and `start_year`,
    `start_month` and `start_season` (DJF/MAM/JJA/SON, by start date) are
    appended. The input table is not modified.

    Parameters
    ----------
    epi : pd.DataFrame
        Episode table with `start` and `end` columns.

    Returns
    -------
    pd.DataFrame
        The augmented copy.
    """
    months_by_season = {
        "DJF": [12, 1, 2],
        "MAM": [3, 4, 5],
        "JJA": [6, 7, 8],
        "SON": [9, 10, 11],
    }

    out = epi.copy()
    for date_col in ("start", "end"):
        out[date_col] = pd.to_datetime(out[date_col])
    out["start_year"] = out["start"].dt.year
    out["start_month"] = out["start"].dt.month

    # Seasons (DJF/MAM/JJA/SON) by start date; rows matching no month stay NaN
    season = pd.Series(index=out.index, dtype=object)
    for season_label, months in months_by_season.items():
        season[out["start_month"].isin(months)] = season_label
    out["start_season"] = season
    return out

In [62]:
epi_pm10_t = add_episode_time_fields(epi_pm10)
epi_pm25_t = add_episode_time_fields(epi_pm25)

# Save updated tables
epi_pm10_t.to_csv(OUT_TAB / "episodes_runs_PM10_p90_with_timefields.csv", index=False)
epi_pm25_t.to_csv(OUT_TAB / "episodes_runs_PM25_p90_with_timefields.csv", index=False)
print("Saved episode tables with timing fields.")

Saved episode tables with timing fields.


In [56]:
# --- Episode counts and mean duration by start month / season

def episode_stats_by_group(epi: pd.DataFrame, group_col: str) -> pd.DataFrame:
    """
    Summarise episode durations per group.

    Groups `epi` by `group_col` and aggregates `duration_days` into one row
    per group with the columns n_episodes, mean_duration, median_duration,
    p90_duration and max_duration.
    """
    def _p90(durations):
        return durations.quantile(0.90)

    aggregations = {
        "n_episodes": "count",
        "mean_duration": "mean",
        "median_duration": "median",
        "p90_duration": _p90,
        "max_duration": "max",
    }
    durations_by_group = epi.groupby(group_col)["duration_days"]
    return durations_by_group.agg(**aggregations)

def _episode_stats_by_start_month(epi_t: pd.DataFrame) -> pd.DataFrame:
    """Monthly episode stats with an integer month index and a leading month_name column."""
    stats = episode_stats_by_group(epi_t, "start_month")

    # Add month labels
    stats.index = stats.index.astype(int)
    stats["month_name"] = [month_names[m-1] for m in stats.index]

    return stats[["month_name","n_episodes","mean_duration","median_duration","p90_duration","max_duration"]]


# Month stats
epi_stats_month_pm10 = _episode_stats_by_start_month(epi_pm10_t)
epi_stats_month_pm25 = _episode_stats_by_start_month(epi_pm25_t)

# Season stats
epi_stats_seas_pm10 = episode_stats_by_group(epi_pm10_t, "start_season")
epi_stats_seas_pm25 = episode_stats_by_group(epi_pm25_t, "start_season")

# Write all four tables (the index is kept: it holds the month number / season label)
for _table, _fname in (
    (epi_stats_month_pm10, "episode_stats_by_start_month_PM10_p90.csv"),
    (epi_stats_month_pm25, "episode_stats_by_start_month_PM25_p90.csv"),
    (epi_stats_seas_pm10, "episode_stats_by_start_season_PM10_p90.csv"),
    (epi_stats_seas_pm25, "episode_stats_by_start_season_PM25_p90.csv"),
):
    _table.to_csv(OUT_TAB / _fname)

print("Saved episode stats by month and season.")

Saved episode stats by month and season.


In [57]:
# --- Plots: episode count by month and mean duration by month (saved)

def plot_episode_counts_by_month(epi_stats_month_pm10: pd.DataFrame,
                                 epi_stats_month_pm25: pd.DataFrame,
                                 outpath: Path):
    """
    Save a line plot of the number of episodes per start month (PM10 and PM2.5).

    Parameters
    ----------
    epi_stats_month_pm10, epi_stats_month_pm25 : pd.DataFrame
        Monthly episode statistics indexed by month number (1..12) with an
        `n_episodes` column. Months may be missing from the index.
    outpath : Path
        Destination of the saved figure.

    Uses the module-level `month_names` for the x tick labels. The figure is
    written to `outpath` and closed; nothing is returned.
    """
    months = range(1, 13)
    x = np.arange(1, 13)

    fig, ax = plt.subplots(figsize=(9.2, 4.6), dpi=150)
    for stats, series_label in ((epi_stats_month_pm10, "PM10"),
                                (epi_stats_month_pm25, "PM2.5")):
        # A month in which no episode starts is absent from the stats table.
        # Its episode count is zero, so fill with 0 rather than leaving a NaN gap in the line.
        counts = stats["n_episodes"].reindex(months, fill_value=0)
        ax.plot(x, counts.values, marker="o", label=series_label)

    ax.set_xticks(x)
    ax.set_xticklabels(month_names)
    ax.set_title("Number of episodes by start month (p90 events)")
    ax.set_xlabel("Start month")
    ax.set_ylabel("Number of episodes")
    ax.legend()
    fig.tight_layout()
    fig.savefig(outpath, dpi=200)
    plt.close(fig)
    print(f"Saved: {outpath}")

def plot_episode_mean_duration_by_month(epi_stats_month: pd.DataFrame, label: str, outpath: Path):
    """
    Save a line plot of the mean episode duration per start month.

    `epi_stats_month` is indexed by month number and must contain a
    `mean_duration` column; months missing from the index are plotted as gaps.
    `label` is the pollutant name shown in the title. Tick labels come from the
    module-level `month_names`. The figure is written to `outpath` and closed.
    """
    month_numbers = np.arange(1, 13)
    # Ensure all twelve months exist (missing ones become NaN)
    mean_duration = epi_stats_month.reindex(range(1, 13))["mean_duration"]

    fig, ax = plt.subplots(figsize=(9.2, 4.6), dpi=150)
    ax.plot(month_numbers, mean_duration.values, marker="o")
    ax.set_xticks(month_numbers)
    ax.set_xticklabels(month_names)
    ax.set_title(f"Mean episode duration by start month ({label}, p90 events)")
    ax.set_xlabel("Start month")
    ax.set_ylabel("Mean duration (days)")
    fig.tight_layout()
    fig.savefig(outpath, dpi=200)
    plt.close(fig)
    print(f"Saved: {outpath}")

In [58]:
# Episode counts: both pollutants on one figure
plot_episode_counts_by_month(
    epi_stats_month_pm10,
    epi_stats_month_pm25,
    OUT_FIG / "episode_counts_by_start_month_p90.png",
)

# Mean duration: one figure per pollutant
for _stats, _label, _fname in (
    (epi_stats_month_pm10, "PM10", "episode_mean_duration_by_month_PM10_p90.png"),
    (epi_stats_month_pm25, "PM2.5", "episode_mean_duration_by_month_PM25_p90.png"),
):
    plot_episode_mean_duration_by_month(_stats, _label, OUT_FIG / _fname)

Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\figures\episode_counts_by_start_month_p90.png
Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\figures\episode_mean_duration_by_month_PM10_p90.png
Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\figures\episode_mean_duration_by_month_PM25_p90.png


## 7. Seasonal structuring (DJF/MAM/JJA/SON)

This section summarizes p90 event behavior by meteorological season without changing the event definition.

For each pollutant, we compute:
- **Frequency**: number of event days per season (DJF/MAM/JJA/SON)
- **Severity**: exceedance statistics per season (event days only)

Notes:
- Seasons are assigned by calendar month:
  - DJF = Dec–Feb
  - MAM = Mar–May
  - JJA = Jun–Aug
  - SON = Sep–Nov
- For DJF, December is grouped with Jan–Feb (same DJF label), but we don't shift years here because this is descriptive seasonal stratification (not a time-series model).

### 7.1 Add season labels to the daily dataframe

In [63]:
def month_to_season(m: int) -> str:
    """
    Map a calendar month number (1..12) to its meteorological season label.

    Returns "DJF" (Dec-Feb), "MAM" (Mar-May), "JJA" (Jun-Aug) or "SON" (Sep-Nov).

    Raises
    ------
    ValueError
        If `m` is not a month number in 1..12 (this includes NaN, which is what
        a missing date yields), so that bad input is never labelled as "SON".
    """
    if m in (12, 1, 2):
        return "DJF"
    if m in (3, 4, 5):
        return "MAM"
    if m in (6, 7, 8):
        return "JJA"
    if m in (9, 10, 11):
        return "SON"
    raise ValueError(f"month must be in 1..12, got {m!r}")

In [64]:
# Label every day with its season, derived from the calendar month of the index
df_evt["season"] = list(map(month_to_season, df_evt.index.month))

# check
_preview_cols = ["PM10", "PM2.5", "evt_PM10_p90", "evt_PM2.5_p90", "season"]
df_evt[_preview_cols].head()

Unnamed: 0_level_0,PM10,PM2.5,evt_PM10_p90,evt_PM2.5_p90,season
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012-01-01,100.14,66.43,1,1,DJF
2012-01-02,19.29,6.14,0,0,DJF
2012-01-03,38.0,17.43,0,0,DJF
2012-01-04,67.71,35.0,0,0,DJF
2012-01-05,61.43,28.86,0,0,DJF


### 7.2 Seasonal frequency (event days per season)

In [65]:
# Order seasons
season_order = ["DJF", "MAM", "JJA", "SON"]

# Event-day totals per season (min_count=1: a season with no valid flags gives NaN, not 0)
_days_by_season = df_evt.groupby("season")
freq_season = pd.DataFrame({
    out_col: _days_by_season[flag_col].sum(min_count=1)
    for out_col, flag_col in (
        ("PM10_event_days", "evt_PM10_p90"),
        ("PM2.5_event_days", "evt_PM2.5_p90"),
    )
}).reindex(season_order)

out = OUT_TAB / "freq_event_days_by_season_p90.csv"
freq_season.to_csv(out)
print(f"Saved: {out}")

freq_season

Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\tables\freq_event_days_by_season_p90.csv


Unnamed: 0_level_0,PM10_event_days,PM2.5_event_days
season,Unnamed: 1_level_1,Unnamed: 2_level_1
DJF,115,115
MAM,118,118
JJA,117,116
SON,114,115


In [66]:
# Plot: seasonal frequency
x = np.arange(len(season_order))

fig, ax = plt.subplots(figsize=(7.2, 4.2), dpi=150)
for _col, _label in (("PM10_event_days", "PM10"), ("PM2.5_event_days", "PM2.5")):
    ax.plot(x, freq_season[_col].values, marker="o", label=_label)

ax.set_xticks(x)
ax.set_xticklabels(season_order)
ax.set_title("Event-day frequency by season (p90 events, 2012–2024)")
ax.set_xlabel("Season")
ax.set_ylabel("Number of event days")
ax.legend()
fig.tight_layout()

out = OUT_FIG / "freq_event_days_by_season_p90.png"
fig.savefig(out, dpi=200)
plt.close(fig)
print(f"Saved: {out}")

Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\figures\freq_event_days_by_season_p90.png


### 7.3 Seasonal severity (exceedance stats per season; event days only)

In [78]:
def seasonal_severity_stats(df: pd.DataFrame, exc_col: str, prefix: str) -> pd.DataFrame:
    """
    Per-season summary statistics of an exceedance column.

    Groups `df` by its `season` column and summarises `exc_col` (missing values
    are skipped by the pandas reducers). Column names are built as
    `{prefix}_exc_<stat>` for mean, median, std, q25, q75, p90 and IQR
    (q75 - q25). Rows are returned in the order DJF, MAM, JJA, SON; a season
    with no rows in `df` is all-NaN.
    """
    by_season = df.groupby("season")[exc_col]
    lower_quartile = by_season.quantile(0.25)
    upper_quartile = by_season.quantile(0.75)

    summary = {
        f"{prefix}_exc_mean": by_season.mean(),
        f"{prefix}_exc_median": by_season.median(),
        f"{prefix}_exc_std": by_season.std(),
        f"{prefix}_exc_q25": lower_quartile,
        f"{prefix}_exc_q75": upper_quartile,
        f"{prefix}_exc_p90": by_season.quantile(0.90),
        f"{prefix}_exc_IQR": upper_quartile - lower_quartile,
    }
    return pd.DataFrame(summary).reindex(["DJF", "MAM", "JJA", "SON"])


In [80]:
# One severity table per pollutant, then side by side in a single table
sev_season_pm10, sev_season_pm25 = (
    seasonal_severity_stats(df_evt, _exc_col, _prefix)
    for _exc_col, _prefix in (("exc_PM10", "PM10"), ("exc_PM2.5", "PM25"))
)
sev_season = pd.concat([sev_season_pm10, sev_season_pm25], axis=1)

out = OUT_TAB / "severity_exceedance_by_season_p90.csv"
sev_season.to_csv(out)
print(f"Saved: {out}")

sev_season

Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\tables\severity_exceedance_by_season_p90.csv


Unnamed: 0_level_0,PM10_exc_mean,PM10_exc_median,PM10_exc_std,PM10_exc_q25,PM10_exc_q75,PM10_exc_p90,PM10_exc_IQR,PM25_exc_mean,PM25_exc_median,PM25_exc_std,PM25_exc_q25,PM25_exc_q75,PM25_exc_p90,PM25_exc_IQR
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
DJF,9.74,6.78,9.7,3.8,13.35,21.47,9.55,8.01,4.0,10.61,1.55,8.8,23.78,7.25
MAM,11.34,8.82,10.83,2.58,17.3,27.42,14.72,6.34,4.12,6.48,2.01,8.05,16.39,6.04
JJA,5.23,3.67,5.22,1.15,7.63,12.79,6.48,3.39,2.37,3.02,0.8,5.28,8.02,4.48
SON,7.56,5.61,6.54,2.02,12.3,16.06,10.28,4.3,3.33,4.26,1.28,6.0,9.46,4.73


In [85]:
# Plot: seasonal mean exceedance (PM10 and PM2.5)
x = np.arange(len(season_order))

fig, ax = plt.subplots(figsize=(7.6, 4.2), dpi=150)
for _col, _label in (
    ("PM10_exc_mean", "PM10 mean exceedance"),
    ("PM25_exc_mean", "PM2.5 mean exceedance"),
):
    ax.plot(x, sev_season[_col].values, marker="o", label=_label)

ax.set_xticks(x)
ax.set_xticklabels(season_order)
ax.set_title("Mean exceedance above p90 threshold by season (event days only)")
ax.set_xlabel("Season")
ax.set_ylabel("Exceedance (µg/m³)")
ax.legend()
fig.tight_layout()

out = OUT_FIG / "severity_mean_exceedance_by_season_p90.png"
fig.savefig(out, dpi=200)
plt.close(fig)
print(f"Saved: {out}")

Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\figures\severity_mean_exceedance_by_season_p90.png


### 7.4 Seasonal exceedance distributions (boxplots)

In [83]:
def boxplot_by_season(df: pd.DataFrame, col: str, title: str, outpath: Path):
    """
    Save one boxplot per season of the non-missing values of `col`.

    Seasons are taken, in order, from the module-level `season_order`; rows are
    matched on `df["season"]`. Outliers are not drawn (`showfliers=False`).
    The figure is written to `outpath` and closed.
    """
    samples = []
    for season in season_order:
        in_season = df["season"] == season
        samples.append(df.loc[in_season, col].dropna().values)

    fig, ax = plt.subplots(figsize=(7.8, 4.4), dpi=150)
    ax.boxplot(samples, showfliers=False)
    # Boxes are drawn at positions 1..n, so the tick positions start at 1
    ax.set_xticks(np.arange(1, len(season_order) + 1))
    ax.set_xticklabels(season_order)
    ax.set_title(title)
    ax.set_xlabel("Season")
    ax.set_ylabel("Exceedance (µg/m³)")
    fig.tight_layout()
    fig.savefig(outpath, dpi=200)
    plt.close(fig)
    print(f"Saved: {outpath}")

In [84]:
# One seasonal exceedance boxplot per pollutant
for _col, _title, _fname in (
    ("exc_PM10",
     "PM10 exceedance distribution by season (p90 event days)",
     "severity_boxplot_by_season_PM10_p90.png"),
    ("exc_PM2.5",
     "PM2.5 exceedance distribution by season (p90 event days)",
     "severity_boxplot_by_season_PM25_p90.png"),
):
    boxplot_by_season(df_evt, _col, _title, OUT_FIG / _fname)

Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\figures\severity_boxplot_by_season_PM10_p90.png
Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\figures\severity_boxplot_by_season_PM25_p90.png


## 8. Monthly Z500′ composites for p90 (PM10 & PM2.5)

Recomputing the 12-panel monthly composites for:
- PM2.5 p90 event days
- PM10 p90 event days

Fields:
- Shading: Z500′ anomaly (monthly baseline)
- Contours: Z500′ anomaly (positive solid, negative dashed)
- Vectors: mean 500 hPa winds (U, V) during event days
- Stippling: gridpoints where the event composite differs from non-event days
  based on a two-sample Welch t-test (two-sided, p<0.05)

All monthly panels share a consistent color scale (max abs across months).

### 8.1 Helper functions (date selection, composites, p-values)

In [None]:
def dates_for_month_event(df: pd.DataFrame, month: int, flag_col: str) -> pd.DatetimeIndex:
    """
    Select the event days that fall in one calendar month (all years pooled).

    A day is kept when its index month equals `month` and `df[flag_col]` is 1.
    `df` must have a datetime index.
    """
    in_month = df.index.month == month
    is_event = df[flag_col].eq(1)
    return df.index[in_month & is_event]


def dates_for_month_nonevent(df: pd.DataFrame, month: int, flag_col: str) -> pd.DatetimeIndex:
    """
    Select the NON-event days that fall in one calendar month (all years pooled).

    A day is kept when its index month equals `month` and `df[flag_col]` is 0;
    days whose flag is missing are therefore neither event nor non-event days.
    """
    in_month = df.index.month == month
    is_nonevent = df[flag_col].eq(0)
    return df.index[in_month & is_nonevent]


def composite_mean(da: xr.DataArray, dates: pd.DatetimeIndex) -> xr.DataArray:
    """
    Average `da` over the given dates (composite along the 'time' dimension).

    With no dates, an all-NaN field shaped like a single time slice of `da`
    is returned instead, so callers always get a field back.
    """
    if len(dates):
        return da.sel(time=dates).mean("time")
    # Nothing to average: NaNs with the same spatial dims
    return da.isel(time=0) * np.nan


def ttest_pvals_grid(da: xr.DataArray,
                     event_dates: pd.DatetimeIndex,
                     nonevent_dates: pd.DatetimeIndex) -> xr.DataArray:
    """
    Gridpoint-wise two-sample Welch t-test (unequal variances): event vs non-event days.

    Returns the p-values as a DataArray over the non-time dimensions of `da`.
    If either sample has fewer than 3 dates the result is all-NaN.
    NaNs in the samples are omitted by the test (`nan_policy="omit"`).
    NOTE(review): the test runs along axis 0 of the selected arrays, which
    assumes 'time' is the leading dimension of `da` - confirm.
    """
    def _with_latlon(field: xr.DataArray) -> xr.DataArray:
        # Carry over the lat/lon coordinates of the input when it has them
        for coord_name in ["lat", "lon"]:
            if coord_name in da.coords:
                field = field.assign_coords({coord_name: da.coords[coord_name]})
        return field

    # Too few days in either sample for a meaningful test: all-NaN field
    if min(len(event_dates), len(nonevent_dates)) < 3:
        blank = da.isel(time=0).drop_vars("time", errors="ignore") * np.nan
        return _with_latlon(blank)

    spatial_dims = [d for d in da.dims if d != "time"]
    event_sample = da.sel(time=event_dates).values        # (Ne, y, x)
    nonevent_sample = da.sel(time=nonevent_dates).values  # (Nn, y, x)

    # Welch t-test, vectorized over y,x with axis=0 as sample axis
    pvalues = ttest_ind(event_sample, nonevent_sample,
                        axis=0, equal_var=False, nan_policy="omit").pvalue

    p_da = xr.DataArray(
        pvalues,
        coords={d: da.coords[d] for d in spatial_dims},
        dims=spatial_dims,
        name="pval"
    )
    return _with_latlon(p_da)


### 8.2 Precompute monthly composites and global color scale

In [97]:
def compute_monthly_composites(flag_col: str,
                               label: str):
    """
    Build the event-day composites of one pollutant for every calendar month.

    For each month 1..12 the module-level fields H_prime, U and V are averaged
    over the event days flagged by `flag_col` in `df_evt`, and H_prime on event
    days is tested against non-event days of the same month.

    Returns (Hp_month, U_month, V_month, P_month, max_abs): four dicts keyed by
    month number, plus the largest finite |H_prime composite| over all months
    (0.0 if none is finite) for a shared color scale. One summary line per
    month is printed, prefixed with `label`.
    """
    composites = {"Hp": {}, "U": {}, "V": {}, "P": {}}
    finite_peaks = [0.0]  # 0.0 is the floor when no month has finite values

    for m in range(1, 13):
        ev_dates = dates_for_month_event(df_evt, m, flag_col)
        ne_dates = dates_for_month_nonevent(df_evt, m, flag_col)

        composites["Hp"][m] = composite_mean(H_prime, ev_dates)
        composites["U"][m] = composite_mean(U, ev_dates)
        composites["V"][m] = composite_mean(V, ev_dates)
        # Significance (event vs non-event) on H_prime
        composites["P"][m] = ttest_pvals_grid(H_prime, ev_dates, ne_dates)

        # Candidate for the global scale (NaN when the month has no event days)
        this_max = float(np.nanmax(np.abs(composites["Hp"][m].values)))
        if np.isfinite(this_max):
            finite_peaks.append(this_max)

        print(f"{label} | month={m:02d} | N_event={len(ev_dates)} | N_nonevent={len(ne_dates)} | max|Hp|={this_max:.2f}")

    return (composites["Hp"], composites["U"], composites["V"], composites["P"],
            max(finite_peaks))

In [98]:
# Compute for both pollutants (PM2.5 first, then PM10)
(Hp_pm25, U_pm25, V_pm25,
 P_pm25, maxabs_pm25) = compute_monthly_composites("evt_PM2.5_p90", "PM2.5")

(Hp_pm10, U_pm10, V_pm10,
 P_pm10, maxabs_pm10) = compute_monthly_composites("evt_PM10_p90", "PM10")

print("Global maxabs:", "PM2.5", maxabs_pm25, "| PM10", maxabs_pm10)

PM2.5 | month=01 | N_event=39 | N_nonevent=334 | max|Hp|=14.45
PM2.5 | month=02 | N_event=36 | N_nonevent=317 | max|Hp|=34.91
PM2.5 | month=03 | N_event=41 | N_nonevent=360 | max|Hp|=17.64
PM2.5 | month=04 | N_event=39 | N_nonevent=342 | max|Hp|=15.25
PM2.5 | month=05 | N_event=38 | N_nonevent=334 | max|Hp|=56.28
PM2.5 | month=06 | N_event=36 | N_nonevent=324 | max|Hp|=18.53
PM2.5 | month=07 | N_event=40 | N_nonevent=358 | max|Hp|=15.11
PM2.5 | month=08 | N_event=40 | N_nonevent=356 | max|Hp|=18.80
PM2.5 | month=09 | N_event=37 | N_nonevent=333 | max|Hp|=15.90
PM2.5 | month=10 | N_event=40 | N_nonevent=337 | max|Hp|=42.52
PM2.5 | month=11 | N_event=38 | N_nonevent=336 | max|Hp|=26.49
PM2.5 | month=12 | N_event=40 | N_nonevent=360 | max|Hp|=25.98
PM10 | month=01 | N_event=38 | N_nonevent=335 | max|Hp|=20.77
PM10 | month=02 | N_event=36 | N_nonevent=317 | max|Hp|=32.89
PM10 | month=03 | N_event=41 | N_nonevent=360 | max|Hp|=24.96
PM10 | month=04 | N_event=39 | N_nonevent=342 | max|Hp|=27

### 8.3 Plotting (12-panel multipanel w/stippling)

In [None]:
def symmetric_contour_levels(max_abs: float, step: float = 5.0) -> np.ndarray:
    """
    Contour levels at exact multiples of `step`, symmetric about zero.

    The levels span at least [-max_abs, +max_abs] and always include 0 and
    +/-step, so positive and negative contours sit at mirrored values.
    """
    n_steps = max(1, int(np.ceil(max_abs / step)))
    return step * np.arange(-n_steps, n_steps + 1, dtype=float)


def plot_monthly_multipanel(label: str,
                            Hp_month: dict,
                            U_month: dict,
                            V_month: dict,
                            P_month: dict,
                            max_abs: float,
                            outpath: Path,
                            p_thresh: float = 0.05,
                            vector_step: int = 4,
                            stipple_thin: int = 8):
    """
    Stage-1 style multipanel plot (3x4), one panel per calendar month.

    Shading: Z500′ anomaly (TwoSlopeNorm centered at 0, limits ±max_abs)
    Contours: black (pos solid / neg dashed), every 5 units at multiples of 5
    Vectors: black, every `vector_step`-th gridpoint in both directions
    Stippling: black, semi-transparent, where p < p_thresh; only every
               `stipple_thin`-th significant gridpoint is drawn
    + Valley of Mexico box + CDMX star

    Parameters
    ----------
    label : pollutant name; used in the title and to look up the
        `evt_{label}_p90` column of `df_evt` for the per-month event count.
    Hp_month, U_month, V_month, P_month : dicts keyed by month (1..12) holding
        the composite anomaly, wind components and p-values.
    max_abs : symmetric color-scale limit shared by all panels.
    outpath : destination of the saved figure.

    Reads the module-level H_prime (for its lat/lon, which are indexed as 2-D
    arrays), df_evt, LON_MIN/LON_MAX/LAT_MIN/LAT_MAX, VOM_BOX and
    LON_CDMX/LAT_CDMX. The figure is saved and closed; nothing is returned.

    NOTE(review): plot_selected_months (Section 9.3) still builds its contour
    levels with np.arange starting at -max_abs; align it with this function if
    the two figures are compared side by side.
    """
    proj = ccrs.PlateCarree()
    fig, axes = plt.subplots(3, 4, figsize=(16, 9), dpi=250,
                             subplot_kw={"projection": proj})
    axes = axes.flatten()

    lon2d = H_prime["lon"].values
    lat2d = H_prime["lat"].values

    # Center colormap at 0
    norm = TwoSlopeNorm(vcenter=0, vmin=-max_abs, vmax=max_abs)
    pcm_ref = None

    # Contour levels are the same for every panel. Anchoring them at multiples of
    # the step (rather than at -max_abs) keeps +/- contours mirrored about zero.
    stepc = 5
    lev = symmetric_contour_levels(max_abs, step=stepc)
    pos_lev = lev[lev > 0]
    neg_lev = lev[lev < 0]

    for i, m in enumerate(range(1, 13)):
        ax = axes[i]
        ax.set_extent([LON_MIN, LON_MAX, LAT_MIN, LAT_MAX], crs=proj)

        ax.coastlines(resolution="50m", linewidth=0.5)
        ax.add_feature(cfeature.BORDERS, linewidth=0.4)
        ax.add_feature(cfeature.STATES.with_scale("50m"), linewidth=0.3)

        Hp = Hp_month[m].values
        Um = U_month[m].values
        Vm = V_month[m].values
        P  = P_month[m].values

        # Shading
        pcm = ax.pcolormesh(lon2d, lat2d, Hp, cmap="RdBu_r",
                            norm=norm, shading="auto", transform=proj)
        pcm_ref = pcm

        # Contours (black)
        ax.contour(lon2d, lat2d, Hp, levels=pos_lev,
                   colors="k", linewidths=0.4, linestyles="solid", transform=proj)
        ax.contour(lon2d, lat2d, Hp, levels=neg_lev,
                   colors="k", linewidths=0.4, linestyles="dashed", transform=proj)

        # Vectors (black)
        yy = np.arange(0, Um.shape[0], vector_step)
        xx = np.arange(0, Um.shape[1], vector_step)
        ax.quiver(lon2d[np.ix_(yy, xx)], lat2d[np.ix_(yy, xx)],
                  Um[np.ix_(yy, xx)], Vm[np.ix_(yy, xx)],
                  scale=700, width=0.002, color="black", transform=proj)

        # Stippling: p < threshold (black, semi-transparent, thinned)
        sig = np.isfinite(P) & (P < p_thresh)
        y, x = np.where(sig)
        y = y[::stipple_thin]
        x = x[::stipple_thin]
        ax.scatter(lon2d[y, x], lat2d[y, x], s=2, c="k", alpha=0.25, transform=proj)

        # Valley of Mexico box + CDMX star
        # NOTE(review): Rectangle takes (x, y), width, height, so this assumes
        # VOM_BOX is (lon0, lat0, width, height) and not (lon0, lat0, lon1, lat1) - confirm.
        rect = mpatches.Rectangle((VOM_BOX[0], VOM_BOX[1]),
                                  VOM_BOX[2], VOM_BOX[3],
                                  fill=False, edgecolor="k",
                                  linewidth=1, transform=proj)
        ax.add_patch(rect)

        ax.plot(LON_CDMX, LAT_CDMX, marker="*", color="gold",
                markersize=8, markeredgecolor="k", transform=proj)

        # Month title + N (event days of this month, counted from df_evt)
        n_ev = int(df_evt.loc[(df_evt.index.month == m), f"evt_{label}_p90"].sum())
        ax.set_title(f"{calendar.month_abbr[m]} (N={n_ev})", fontsize=10, weight="bold")

        # grid labels: only left column + bottom row
        gl = ax.gridlines(draw_labels=True, linewidth=0.2, color="gray",
                          alpha=0.5, linestyle="--")
        gl.top_labels = False
        gl.right_labels = False
        if (i % 4) != 0:
            gl.left_labels = False
        if i < 8:
            gl.bottom_labels = False

    # Colorbar (own axes to the right of the panels; uses the last panel's mesh,
    # which shares the norm with all others)
    cbar_ax = fig.add_axes([0.92, 0.20, 0.015, 0.60])
    cb = fig.colorbar(pcm_ref, cax=cbar_ax)
    cb.set_label("Z500′ anomaly (m)")

    # Titles
    fig.suptitle(f"{label}: Monthly Z500′ composites during p90 event days (2012–2024)",
                 fontsize=18, weight="bold", y=0.96)
    fig.text(0.5, 0.91,
             "Shading/contours: Z500′ | Vectors: mean 500 hPa winds | Stippling: p<0.05 (t-test)",
             ha="center", fontsize=14, style="italic")

    # Legend elements (outside map area, top-left)
    legend_elements = [
        mpatches.Rectangle((0, 0), 1, 1, fill=False, edgecolor="k", linewidth=1.5,
                           label="Valley of Mexico"),
        plt.Line2D([0], [0], marker="*", color="w", markerfacecolor="gold",
                  markeredgecolor="k", markersize=12, label="Mexico City (CDMX)")
    ]
    fig.legend(handles=legend_elements, loc="upper left", fontsize=11,
              frameon=False, bbox_to_anchor=(0.01, 0.97))

    plt.tight_layout(rect=[0, 0, 0.9, 0.93])
    plt.savefig(outpath, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print("Saved:", outpath)

In [135]:
# Plot PM2.5, then PM10 (same settings for both figures)
for _label, _fields, _max_abs, _fname in (
    ("PM2.5", (Hp_pm25, U_pm25, V_pm25, P_pm25), maxabs_pm25, "Z500_monthly_p90_PM25_stage2.png"),
    ("PM10", (Hp_pm10, U_pm10, V_pm10, P_pm10), maxabs_pm10, "Z500_monthly_p90_PM10_stage2.png"),
):
    _hp, _u, _v, _p = _fields
    plot_monthly_multipanel(
        label=_label,
        Hp_month=_hp,
        U_month=_u,
        V_month=_v,
        P_month=_p,
        max_abs=_max_abs,
        outpath=OUT_FIG / _fname,
        p_thresh=0.05,
        vector_step=6
    )

Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\figures\Z500_monthly_p90_PM25_stage2.png
Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\figures\Z500_monthly_p90_PM10_stage2.png


## 9. Focused composites: selected months

### 9.1 Ranking of months (event days and severity) + top-k selection

In [146]:
# ==========================
# Rank months (transparent ranking tables for month-by-month comparison of frequency and severity)
# ==========================

def month_rank_tables(df_evt: pd.DataFrame, pol_label: str) -> pd.DataFrame:
    """
    Build a 12-row table (index: month 1..12) ranking calendar months.

    Reads three columns of `df_evt`: the pollutant values (`{pol}`), the event
    flag (`evt_{pol}_p90`) and the threshold (`thr_p90_{pol}`).

    Columns of the result:
    - month_name: abbreviated month name
    - event_days: # of p90 event-days in that calendar month
    - mean_severity: mean of (value - threshold) over the month's event days
    - mean_event_value: mean pollutant value over the month's event days
    - p90_threshold_month: threshold on the first row of that month
      (presumably constant within a month; verify against where it is built)
    - rank_by_event_days / rank_by_severity: descending ranks, ties share the
      lowest rank (method="min")

    Months without event days get event_days=0 and NaN means; months absent
    from the index also get a NaN threshold. Raises KeyError if a required
    column is missing.
    """
    flag = f"evt_{pol_label}_p90"
    thr  = f"thr_p90_{pol_label}"
    val  = pol_label

    if flag not in df_evt.columns:
        raise KeyError(f"Missing column: {flag}")
    if thr not in df_evt.columns:
        raise KeyError(f"Missing column: {thr}")
    if val not in df_evt.columns:
        raise KeyError(f"Missing column: {val}")

    month_of_day = df_evt.index.month
    is_event = df_evt[flag] == 1

    def _summarise(m: int) -> dict:
        in_month = month_of_day == m
        event_rows = df_evt.loc[in_month & is_event]
        n_events = int(event_rows[flag].sum())

        if n_events > 0:
            # Severity computed directly
            severity = float((event_rows[val] - event_rows[thr]).mean())
            event_value = float(event_rows[val].mean())
        else:
            severity = event_value = np.nan

        if in_month.any():
            threshold = float(df_evt.loc[in_month, thr].iloc[0])
        else:
            threshold = np.nan

        return {
            "month": m,
            "month_name": calendar.month_abbr[m],
            "event_days": n_events,
            "mean_severity": severity,
            "mean_event_value": event_value,
            "p90_threshold_month": threshold,
        }

    tab = pd.DataFrame([_summarise(m) for m in range(1, 13)]).set_index("month")

    # Rank 1 = most event days / highest severity
    descending = dict(ascending=False, method="min")
    tab["rank_by_event_days"] = tab["event_days"].rank(**descending)
    tab["rank_by_severity"] = tab["mean_severity"].rank(**descending)
    return tab

In [None]:
rank_pm25, rank_pm10 = (month_rank_tables(df_evt, _pol) for _pol in ("PM2.5", "PM10"))

# Save tables
for _table, _fname in (
    (rank_pm25, "rank_months_PM25_p90_eventdays_severity.csv"),
    (rank_pm10, "rank_months_PM10_p90_eventdays_severity.csv"),
):
    _table.to_csv(OUT_TAB / _fname)

# Select top-k months by event-days (primary ranking), with severity as secondary context;
# the selected months are then sorted chronologically (by month number)
TOPK = 4
_by_frequency = dict(by=["event_days", "mean_severity"], ascending=[False, False])
top_months_pm25 = sorted(rank_pm25.sort_values(**_by_frequency).head(TOPK).index.tolist())
top_months_pm10 = sorted(rank_pm10.sort_values(**_by_frequency).head(TOPK).index.tolist())

print("Top months PM2.5 by event-days:", [calendar.month_abbr[m] for m in top_months_pm25])
print("Top months PM10  by event-days:", [calendar.month_abbr[m] for m in top_months_pm10])

Top months PM2.5 by event-days: ['Mar', 'Aug', 'Oct', 'Dec']
Top months PM10  by event-days: ['Mar', 'Jul', 'Aug', 'Dec']


In [167]:
# ==========================
# Select top-k months by SEVERITY (primary), with event-days as secondary criterion
# This provides a complementary perspective: extreme-magnitude events
# ==========================

# The selected months are sorted chronologically (by month number)
_by_severity = dict(by=["mean_severity", "event_days"], ascending=[False, False])
top_months_pm25_sev = sorted(rank_pm25.sort_values(**_by_severity).head(TOPK).index.tolist())
top_months_pm10_sev = sorted(rank_pm10.sort_values(**_by_severity).head(TOPK).index.tolist())

print("\nTop months PM2.5 by SEVERITY:", [calendar.month_abbr[m] for m in top_months_pm25_sev])
print("Top months PM10  by SEVERITY:", [calendar.month_abbr[m] for m in top_months_pm10_sev])


Top months PM2.5 by SEVERITY: ['Jan', 'Apr', 'May', 'Dec']
Top months PM10  by SEVERITY: ['Jan', 'Mar', 'Apr', 'May']


### 9.2 Helper: precompute month composites (same science as Section 8, but only selected months)

In [169]:
# ==========================
# Precompute selected-month composites
# ==========================

def compute_selected_month_composites(flag_col: str, months: list[int], label: str):
    """
    Build event-day composites for the given calendar months only.

    Performs the same per-month computation as compute_monthly_composites():
    the module-level H_prime, U and V are averaged over the event days flagged
    by `flag_col` in `df_evt`, and H_prime on event days is tested against the
    non-event days of the same month.

    Returns (Hp_sel, U_sel, V_sel, P_sel, max_abs): four dicts keyed by month,
    plus the largest finite |H_prime composite| over the selected months
    (0.0 if none is finite) for a shared color scale. One summary line per
    month is printed, prefixed with `label`.
    """
    selected = {"Hp": {}, "U": {}, "V": {}, "P": {}}
    finite_peaks = [0.0]  # 0.0 is the floor when no month has finite values

    for m in months:
        ev_dates = dates_for_month_event(df_evt, m, flag_col)
        ne_dates = dates_for_month_nonevent(df_evt, m, flag_col)

        selected["Hp"][m] = composite_mean(H_prime, ev_dates)
        selected["U"][m] = composite_mean(U, ev_dates)
        selected["V"][m] = composite_mean(V, ev_dates)
        selected["P"][m] = ttest_pvals_grid(H_prime, ev_dates, ne_dates)

        # Candidate for the shared scale (NaN when the month has no event days)
        this_max = float(np.nanmax(np.abs(selected["Hp"][m].values)))
        if np.isfinite(this_max):
            finite_peaks.append(this_max)

        print(f"{label} | {calendar.month_abbr[m]} | N_event={len(ev_dates)} | N_nonevent={len(ne_dates)} | max|Hp|={this_max:.2f}")

    return (selected["Hp"], selected["U"], selected["V"], selected["P"],
            max(finite_peaks))

### 9.3 Plot: “top months”

In [174]:
# ==========================
# Plot selected months (frequency)
# ==========================

def plot_selected_months(label: str,
                         months: list[int],
                         Hp: dict,
                         Uc: dict,
                         Vc: dict,
                         Pc: dict,
                         max_abs: float,
                         rank_df: pd.DataFrame,
                         outpath: Path,
                         rank_label: str | None = None,
                         p_thresh: float = 0.05,
                         vector_step: int = 4,
                         stipple_thin: int = 8):
    """
    Plot k selected months in a larger layout (2 rows).
    Uses the same styling as Section 8.
    rank_df: DataFrame with month ranking info (using mean_severity column)
    """
    proj = ccrs.PlateCarree()
    k = len(months)
    ncols = 2
    nrows = int(np.ceil(k / ncols))

    fig, axes = plt.subplots(nrows, ncols, figsize=(14, 5.8*nrows), dpi=250,
                             subplot_kw={"projection": proj})
    axes = np.array(axes).reshape(-1)  # flatten safely

    lon2d = H_prime["lon"].values
    lat2d = H_prime["lat"].values
    norm = TwoSlopeNorm(vcenter=0, vmin=-max_abs, vmax=max_abs)

    pcm_ref = None

    for i, m in enumerate(months):
        ax = axes[i]
        ax.set_extent([LON_MIN, LON_MAX, LAT_MIN, LAT_MAX], crs=proj)

        ax.coastlines(resolution="50m", linewidth=0.5)
        ax.add_feature(cfeature.BORDERS, linewidth=0.4)
        ax.add_feature(cfeature.STATES.with_scale("50m"), linewidth=0.3)

        Hp_np = Hp[m].values
        Um_np = Uc[m].values
        Vm_np = Vc[m].values
        P_np  = Pc[m].values

        pcm = ax.pcolormesh(lon2d, lat2d, Hp_np, cmap="RdBu_r",
                            norm=norm, shading="auto", transform=proj)
        pcm_ref = pcm

        stepc = 5
        lev = np.arange(-max_abs, max_abs + stepc, stepc)
        ax.contour(lon2d, lat2d, Hp_np, levels=lev[lev > 0],
                   colors="k", linewidths=0.4, linestyles="solid", transform=proj)
        ax.contour(lon2d, lat2d, Hp_np, levels=lev[lev < 0],
                   colors="k", linewidths=0.4, linestyles="dashed", transform=proj)

        yy = np.arange(0, Um_np.shape[0], vector_step)
        xx = np.arange(0, Um_np.shape[1], vector_step)
        ax.quiver(lon2d[np.ix_(yy, xx)], lat2d[np.ix_(yy, xx)],
                  Um_np[np.ix_(yy, xx)], Vm_np[np.ix_(yy, xx)],
                  scale=700, width=0.002, color="black", transform=proj)

        sig = np.isfinite(P_np) & (P_np < p_thresh)
        y, x = np.where(sig)
        y = y[::stipple_thin]; x = x[::stipple_thin]
        ax.scatter(lon2d[y, x], lat2d[y, x], s=2, c="k", alpha=0.25, transform=proj)

        rect = mpatches.Rectangle((VOM_BOX[0], VOM_BOX[1]),
                                  VOM_BOX[2], VOM_BOX[3],
                                  fill=False, edgecolor="k",
                                  linewidth=1, transform=proj)
        ax.add_patch(rect)
        ax.plot(LON_CDMX, LAT_CDMX, marker="*", color="gold",
                markersize=9, markeredgecolor="k", transform=proj)

        n_ev = int(df_evt.loc[(df_evt.index.month == m), f"evt_{label}_p90"].sum())
        sev = rank_df.loc[m, "mean_severity"]
        thr = rank_df.loc[m, "p90_threshold_month"]
        ax.set_title(f"{calendar.month_abbr[m]} (N={n_ev}, sev={sev:.1f} µg/m³, p90 thr={thr:.0f} µg/m³)", 
                    fontsize=12, weight="bold")

        gl = ax.gridlines(draw_labels=True, linewidth=0.2, color="gray",
                          alpha=0.5, linestyle="--")
        gl.top_labels = False
        gl.right_labels = False

    # Turn off extra axes (if k is odd)
    for j in range(k, len(axes)):
        axes[j].axis("off")

    cbar_ax = fig.add_axes([0.92, 0.18, 0.015, 0.64])
    cb = fig.colorbar(pcm_ref, cax=cbar_ax)
    cb.set_label("Z500′ anomaly (m)")

    # Titles
    rank_suffix = f"; {rank_label}" if rank_label else ""
    fig.suptitle(f"{label}: Focused monthly Z500′ composites (top-{k} months{rank_suffix})",
                 fontsize=20, weight="bold", y=0.96)
    fig.text(0.5, 0.92,
             "Shading/contours: Z500′ | Vectors: mean 500 hPa winds | Stippling: p<0.05 (t-test)",
             ha="center", fontsize=16, style="italic")
    
    # Legend elements (outside map area, top-left)
    legend_elements = [mpatches.Rectangle((0, 0), 1, 1, fill=False, edgecolor="k", linewidth=1.5,
                        label="Valley of Mexico"), plt.Line2D([0], [0], marker="*", color="w", 
                        markerfacecolor="gold", markeredgecolor="k", markersize=12, label="Mexico City (CDMX)")
    ]
    fig.legend(handles=legend_elements, loc="upper left", fontsize=11,
              frameon=False, bbox_to_anchor=(0.01, 0.89))

    plt.tight_layout(rect=[0, 0, 0.9, 0.93])
    plt.savefig(outpath, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print("Saved:", outpath)


In [None]:
# Composites for frequency-based rankings
# top_months_pm25 / top_months_pm10 come from an earlier cell (not shown in this
# section) — presumably the months ranked by number of event days; confirm there.
# Each call returns four dicts keyed by month (Z500′, U, V, p-values) plus the largest
# |Z500′| over the selected months, used later as the shared colour range.
Hp25_sel, U25_sel, V25_sel, P25_sel, max25_sel = compute_selected_month_composites(
    flag_col="evt_PM2.5_p90", months=top_months_pm25, label="PM2.5"
)
Hp10_sel, U10_sel, V10_sel, P10_sel, max10_sel = compute_selected_month_composites(
    flag_col="evt_PM10_p90", months=top_months_pm10, label="PM10"
)

PM2.5 | Mar | N_event=41 | N_nonevent=360 | max|Hp|=17.64
PM2.5 | Aug | N_event=40 | N_nonevent=356 | max|Hp|=18.80
PM2.5 | Oct | N_event=40 | N_nonevent=337 | max|Hp|=42.52
PM2.5 | Dec | N_event=40 | N_nonevent=360 | max|Hp|=25.98
PM10 | Mar | N_event=41 | N_nonevent=360 | max|Hp|=24.96
PM10 | Jul | N_event=40 | N_nonevent=358 | max|Hp|=6.41
PM10 | Aug | N_event=41 | N_nonevent=355 | max|Hp|=18.74
PM10 | Dec | N_event=41 | N_nonevent=359 | max|Hp|=13.56


In [175]:
# Figures for frequency-based rankings
# One figure per pollutant, written to OUT_FIG. The colour range of each figure is
# its own max|Z500′| (max25_sel / max10_sel), so the two figures are not on a common scale.
# Panel titles still show the mean severity and monthly p90 threshold read from rank_df.
plot_selected_months(
    label="PM2.5",
    months=top_months_pm25,
    Hp=Hp25_sel, Uc=U25_sel, Vc=V25_sel, Pc=P25_sel,
    max_abs=max25_sel,
    rank_df=rank_pm25,
    rank_label="ranked by event-days",
    outpath=OUT_FIG / "Z500_topmonths_PM25_p90_frequency_stage2.png"
)

plot_selected_months(
    label="PM10",
    months=top_months_pm10,
    Hp=Hp10_sel, Uc=U10_sel, Vc=V10_sel, Pc=P10_sel,
    max_abs=max10_sel,
    rank_df=rank_pm10,
    rank_label="ranked by event-days",
    outpath=OUT_FIG / "Z500_topmonths_PM10_p90_frequency_stage2.png"
)

Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\figures\Z500_topmonths_PM25_p90_frequency_stage2.png
Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\figures\Z500_topmonths_PM10_p90_frequency_stage2.png


In [172]:
# Composites for severity-based rankings
# Same computation as for the frequency-based selection, but over the months in
# top_months_pm25_sev / top_months_pm10_sev (chronologically sorted month numbers).
# Results are stored under *_sev names so the frequency-based results are not overwritten.
Hp25_sel_sev, U25_sel_sev, V25_sel_sev, P25_sel_sev, max25_sel_sev = compute_selected_month_composites(
    flag_col="evt_PM2.5_p90", months=top_months_pm25_sev, label="PM2.5"
)
Hp10_sel_sev, U10_sel_sev, V10_sel_sev, P10_sel_sev, max10_sel_sev = compute_selected_month_composites(
    flag_col="evt_PM10_p90", months=top_months_pm10_sev, label="PM10"
)

PM2.5 | Jan | N_event=39 | N_nonevent=334 | max|Hp|=14.45
PM2.5 | Apr | N_event=39 | N_nonevent=342 | max|Hp|=15.25
PM2.5 | May | N_event=38 | N_nonevent=334 | max|Hp|=56.28
PM2.5 | Dec | N_event=40 | N_nonevent=360 | max|Hp|=25.98
PM10 | Jan | N_event=38 | N_nonevent=335 | max|Hp|=20.77
PM10 | Mar | N_event=41 | N_nonevent=360 | max|Hp|=24.96
PM10 | Apr | N_event=39 | N_nonevent=342 | max|Hp|=27.49
PM10 | May | N_event=38 | N_nonevent=334 | max|Hp|=41.61


In [176]:
# Figures for severity-based rankings
# One figure per pollutant, written to OUT_FIG with a "_severity_" file name so the
# frequency-based figures are kept. Each figure uses its own colour range
# (max25_sel_sev / max10_sel_sev).
plot_selected_months(
    label="PM2.5",
    months=top_months_pm25_sev,
    Hp=Hp25_sel_sev, Uc=U25_sel_sev, Vc=V25_sel_sev, Pc=P25_sel_sev,
    max_abs=max25_sel_sev,
    rank_df=rank_pm25,
    rank_label="ranked by severity",
    outpath=OUT_FIG / "Z500_topmonths_PM25_p90_severity_stage2.png"
)

plot_selected_months(
    label="PM10",
    months=top_months_pm10_sev,
    Hp=Hp10_sel_sev, Uc=U10_sel_sev, Vc=V10_sel_sev, Pc=P10_sel_sev,
    max_abs=max10_sel_sev,
    rank_df=rank_pm10,
    rank_label="ranked by severity",
    outpath=OUT_FIG / "Z500_topmonths_PM10_p90_severity_stage2.png"
)

Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\figures\Z500_topmonths_PM25_p90_severity_stage2.png
Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\figures\Z500_topmonths_PM10_p90_severity_stage2.png


## 10. Seasonal composites (DJF/MAM/JJA/SON)

### 10.1 Helpers: assign season + event/non-event dates per season

In [189]:
# Meteorological seasons mapped to the calendar-month numbers they contain.
SEASON_MONTHS = {
    "DJF": [12, 1, 2],
    "MAM": [3, 4, 5],
    "JJA": [6, 7, 8],
    "SON": [9, 10, 11],
}
# Season labels in processing / panel order (dicts preserve insertion order).
SEASONS = list(SEASON_MONTHS)


def dates_for_season(df: pd.DataFrame, season: str, flag_col: str, is_event: bool) -> pd.DatetimeIndex:
    """
    Return the index entries of `df` that fall in `season` and match the event flag.

    Parameters
    ----------
    df       : frame whose index exposes `.month` (a DatetimeIndex).
    season   : key of SEASON_MONTHS ("DJF", "MAM", "JJA" or "SON").
    flag_col : column holding the event flag.
    is_event : True selects rows where the flag equals 1, False rows where it equals 0.
               Rows whose flag is neither (e.g. NaN) are never selected.

    Raises
    ------
    KeyError : if `season` is not a key of SEASON_MONTHS or `flag_col` is not a column.
    """
    in_season = df.index.month.isin(SEASON_MONTHS[season])
    wanted_flag = 1 if is_event else 0
    has_flag = df[flag_col] == wanted_flag
    return df.index[in_season & has_flag]


def compute_seasonal_composites(flag_col: str, label: str):
    """
    Build event-day composites for each season listed in SEASONS.

    For every season this computes, from the notebook-level fields `H_prime`, `U`
    and `V` and the event table `df_evt`: the composite mean of H′ and of the U/V
    winds over the season's event days, and a grid of t-test p-values comparing
    event vs. non-event days of H′.

    Parameters
    ----------
    flag_col : name of the event-flag column used to split event / non-event days.
    label    : text used only in the progress line printed for each season.

    Returns
    -------
    (Hp, U, V, P, max_abs): four dicts keyed by season, plus the largest finite
    max|Hp| across seasons (0.0 if none is finite) for a shared colour scale.
    """
    hp_by_season = {}
    u_by_season = {}
    v_by_season = {}
    p_by_season = {}
    peak = 0.0

    for season in SEASONS:
        event_days = dates_for_season(df_evt, season, flag_col, is_event=True)
        nonevent_days = dates_for_season(df_evt, season, flag_col, is_event=False)

        # Composites over event days only; p-values contrast event vs. non-event days.
        hp_by_season[season] = composite_mean(H_prime, event_days)
        u_by_season[season] = composite_mean(U, event_days)
        v_by_season[season] = composite_mean(V, event_days)
        p_by_season[season] = ttest_pvals_grid(H_prime, event_days, nonevent_days)

        # Track the overall amplitude; a non-finite season (all-NaN composite) is ignored.
        season_peak = float(np.nanmax(np.abs(hp_by_season[season].values)))
        if np.isfinite(season_peak) and season_peak > peak:
            peak = season_peak

        print(f"{label} | {season} | N_event={len(event_days)} | N_nonevent={len(nonevent_days)} | max|Hp|={season_peak:.2f}")

    return hp_by_season, u_by_season, v_by_season, p_by_season, peak

### 10.2 Seasonal plot 2x2 per pollutant

In [200]:
def plot_seasonal_multipanel(label: str,
                            Hp_s: dict,
                            U_s: dict,
                            V_s: dict,
                            P_s: dict,
                            max_abs: float,
                            outpath: Path,
                            p_thresh: float = 0.05,
                            vector_step: int = 4,
                            stipple_thin: int = 8):
    """
    Plot the four seasonal Z500′ composites (order given by SEASONS) as a 2x2 figure
    and save it to `outpath`. All panels share one symmetric colour scale.

    Parameters
    ----------
    label : pollutant label. Used in the figure title and to build the column names
        ``evt_{label}_p90``, ``exc_{label}`` and ``thr_p90_{label}`` read from the
        global ``df_evt`` for the panel titles.
    Hp_s, U_s, V_s, P_s : dicts keyed by season with the Z500′ composite, the U and V
        wind composites and the p-value grid (objects exposing ``.values``).
    max_abs : half-range of the colour scale (-max_abs .. +max_abs); must be a
        positive finite number.
    outpath : destination file of the figure.
    p_thresh : grid points with p < p_thresh are stippled; the value is also shown
        in the caption.
    vector_step : draw every `vector_step`-th wind vector along both grid axes.
    stipple_thin : keep every `stipple_thin`-th significant grid point.

    Raises
    ------
    ValueError : if `max_abs` is not a positive finite number.

    Notes
    -----
    Reads the notebook globals H_prime, df_evt, SEASONS, SEASON_MONTHS,
    LON_MIN/LON_MAX/LAT_MIN/LAT_MAX, VOM_BOX, LON_CDMX and LAT_CDMX.
    """
    # Fail early with a clear message instead of deep inside matplotlib.
    if not (np.isfinite(max_abs) and max_abs > 0):
        raise ValueError(f"plot_seasonal_multipanel: max_abs must be positive and finite, got {max_abs!r}")

    proj = ccrs.PlateCarree()
    fig, axes = plt.subplots(2, 2, figsize=(14, 10), dpi=250,
                             subplot_kw={"projection": proj})
    axes = axes.flatten()

    # 2-D coordinate arrays (they are indexed with (row, col) pairs below).
    lon2d = H_prime["lon"].values
    lat2d = H_prime["lat"].values
    norm = TwoSlopeNorm(vcenter=0, vmin=-max_abs, vmax=max_abs)

    # Contour levels are the same for every panel, so build them once. They are
    # multiples of `stepc` mirrored around zero (zero itself is not contoured),
    # so positive and negative isolines are directly comparable.
    stepc = 5
    pos_levels = np.arange(stepc, max_abs + stepc, stepc)
    neg_levels = -pos_levels[::-1]  # contour() needs increasing levels

    pcm_ref = None

    for i, s in enumerate(SEASONS):
        ax = axes[i]
        ax.set_extent([LON_MIN, LON_MAX, LAT_MIN, LAT_MAX], crs=proj)

        ax.coastlines(resolution="50m", linewidth=0.5)
        ax.add_feature(cfeature.BORDERS, linewidth=0.4)
        ax.add_feature(cfeature.STATES.with_scale("50m"), linewidth=0.3)

        Hp_np = Hp_s[s].values
        Um_np = U_s[s].values
        Vm_np = V_s[s].values
        P_np  = P_s[s].values

        # Shading: Z500′ composite on the shared colour scale.
        pcm = ax.pcolormesh(lon2d, lat2d, Hp_np, cmap="RdBu_r",
                            norm=norm, shading="auto", transform=proj)
        pcm_ref = pcm  # any panel works as colourbar source since the norm is shared

        # Isolines: solid for positive anomalies, dashed for negative ones.
        ax.contour(lon2d, lat2d, Hp_np, levels=pos_levels,
                   colors="k", linewidths=0.4, linestyles="solid", transform=proj)
        ax.contour(lon2d, lat2d, Hp_np, levels=neg_levels,
                   colors="k", linewidths=0.4, linestyles="dashed", transform=proj)

        # Wind vectors, sub-sampled every `vector_step` grid points in both directions.
        yy = np.arange(0, Um_np.shape[0], vector_step)
        xx = np.arange(0, Um_np.shape[1], vector_step)
        ax.quiver(lon2d[np.ix_(yy, xx)], lat2d[np.ix_(yy, xx)],
                  Um_np[np.ix_(yy, xx)], Vm_np[np.ix_(yy, xx)],
                  scale=700, width=0.002, color="black", transform=proj)

        # Stippling: significant grid points, thinned by taking every n-th hit.
        sig = np.isfinite(P_np) & (P_np < p_thresh)
        y, x = np.where(sig)
        y = y[::stipple_thin]; x = x[::stipple_thin]
        ax.scatter(lon2d[y, x], lat2d[y, x], s=2, c="k", alpha=0.25, transform=proj)

        # Rectangle takes (x, y), width, height, so VOM_BOX is read as
        # (x0, y0, width, height).
        # NOTE(review): confirm VOM_BOX is not stored as (lon_min, lat_min, lon_max, lat_max).
        rect = mpatches.Rectangle((VOM_BOX[0], VOM_BOX[1]),
                                  VOM_BOX[2], VOM_BOX[3],
                                  fill=False, edgecolor="k",
                                  linewidth=1, transform=proj)
        ax.add_patch(rect)
        ax.plot(LON_CDMX, LAT_CDMX, marker="*", color="gold",
                markersize=9, markeredgecolor="k", transform=proj)

        # Seasonal summary using MONTHLY p90 thresholds (consistent with Section 5)
        months = SEASON_MONTHS[s]
        sel_season = df_evt.index.month.isin(months)
        sel_event = sel_season & (df_evt[f"evt_{label}_p90"] == 1)

        n_ev = int(df_evt.loc[sel_event, f"evt_{label}_p90"].sum())
        exc_col = f"exc_{label}"
        thr_col = f"thr_p90_{label}"
        # Severity: mean of the exceedance column over event days only.
        sev_val = float(np.nanmean(df_evt.loc[sel_event, exc_col]))
        # Threshold: mean of the threshold column over ALL rows of the season,
        # i.e. weighted by how many rows each month contributes.
        thr_val = float(np.nanmean(df_evt.loc[sel_season, thr_col]))

        ax.set_title(
            f"{s} (N={n_ev}, sev={sev_val:.1f} µg/m³, p90 thr={thr_val:.0f} µg/m³)",
            fontsize=12, weight="bold"
        )

        gl = ax.gridlines(draw_labels=True, linewidth=0.2, color="gray",
                          alpha=0.5, linestyle="--")
        gl.top_labels = False
        gl.right_labels = False
        # Only the left column keeps latitude labels and only the bottom row keeps
        # longitude labels, to avoid repeating them between panels.
        if (i % 2) != 0:
            gl.left_labels = False
        if i < 2:
            gl.bottom_labels = False

    cbar_ax = fig.add_axes([0.92, 0.18, 0.015, 0.64])
    cb = fig.colorbar(pcm_ref, cax=cbar_ax)
    cb.set_label("Z500′ anomaly (m)")

    # Titles
    fig.suptitle(f"{label}: Seasonal Z500′ composites during p90 event days (2012–2024)",
                 fontsize=18, weight="bold", y=0.96)
    # The caption reports the threshold actually used for the stippling.
    fig.text(0.5, 0.92,
             f"Shading/contours: Z500′ | Vectors: mean 500 hPa winds | Stippling: p<{p_thresh:g} (t-test)",
             ha="center", fontsize=14, style="italic")

    # Legend elements (outside map area, top-left)
    legend_elements = [
        mpatches.Rectangle((0, 0), 1, 1, fill=False, edgecolor="k", linewidth=1.5,
                           label="Valley of Mexico"),
        plt.Line2D([0], [0], marker="*", color="w",
                   markerfacecolor="gold", markeredgecolor="k", markersize=12,
                   label="Mexico City (CDMX)"),
    ]
    fig.legend(handles=legend_elements, loc="upper left", fontsize=11,
               frameon=False, bbox_to_anchor=(0.01, 0.92))

    # Operate on `fig` explicitly rather than on pyplot's "current figure".
    fig.tight_layout(rect=[0, 0, 0.9, 0.93])
    fig.savefig(outpath, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print("Saved:", outpath)

In [192]:
# Seasonal summary (4 rows: DJF/MAM/JJA/SON)
# Uses MONTHLY p90 thresholds (event definition stays consistent with Section 5).
# p90 thr shown is the mean of the thr_p90_<pollutant> column over all rows of the season,
# so each month contributes in proportion to its number of rows (it is not the plain
# average of three monthly values).
# The same three statistics are recomputed inside plot_seasonal_multipanel() for the
# panel titles; this cell only tabulates them and writes them to CSV.

season_stats = pd.DataFrame(index=SEASONS)

for pol in ["PM10", "PM2.5"]:
    # Per-season values for this pollutant, collected in SEASONS order.
    n_vals = []
    sev_vals = []
    thr_vals = []

    for s in SEASONS:
        months = SEASON_MONTHS[s]
        sel_season = df_evt.index.month.isin(months)
        sel_event = sel_season & (df_evt[f"evt_{pol}_p90"] == 1)

        # N: number of flagged event days in the season.
        n_ev = int(df_evt.loc[sel_event, f"evt_{pol}_p90"].sum())
        # Severity: mean of the exceedance column over event days only (NaNs ignored).
        sev_s = float(np.nanmean(df_evt.loc[sel_event, f"exc_{pol}"]))
        # Threshold: mean of the threshold column over every row of the season.
        thr_s = float(np.nanmean(df_evt.loc[sel_season, f"thr_p90_{pol}"]))

        n_vals.append(n_ev)
        sev_vals.append(sev_s)
        thr_vals.append(thr_s)

    season_stats[f"N_{pol}"] = n_vals
    season_stats[f"sev_{pol}"] = sev_vals
    season_stats[f"p90_{pol}"] = thr_vals

season_stats.index.name = "season"
# Fix the column order explicitly so the CSV layout does not depend on the loop order.
season_stats = season_stats[["N_PM10", "sev_PM10", "p90_PM10", "N_PM2.5", "sev_PM2.5", "p90_PM2.5"]]

out = OUT_TAB / "seasonal_summary_p90_events.csv"
season_stats.to_csv(out)
print(f"Saved: {out}")

# Last expression of the cell: renders the table as the cell output.
season_stats

Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\tables\seasonal_summary_p90_events.csv


Unnamed: 0_level_0,N_PM10,sev_PM10,p90_PM10,N_PM2.5,sev_PM2.5,p90_PM2.5
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DJF,115,9.74,80.04,115,8.01,40.26
MAM,118,11.34,70.21,118,6.34,36.62
JJA,117,5.23,47.19,116,3.39,25.78
SON,114,7.56,54.55,115,4.3,30.06


In [191]:
# Seasonal composites for all seasons
# Each call returns four dicts keyed by season (Z500′, U, V, p-values) plus the largest
# |Z500′| across the four seasons, used as the shared colour range of that pollutant's figure.
Hp25_s, U25_s, V25_s, P25_s, max25_s = compute_seasonal_composites(
    flag_col="evt_PM2.5_p90",
    label="PM2.5"
)
Hp10_s, U10_s, V10_s, P10_s, max10_s = compute_seasonal_composites(
    flag_col="evt_PM10_p90",
    label="PM10"
)

PM2.5 | DJF | N_event=115 | N_nonevent=1011 | max|Hp|=21.01
PM2.5 | MAM | N_event=118 | N_nonevent=1036 | max|Hp|=23.87
PM2.5 | JJA | N_event=116 | N_nonevent=1038 | max|Hp|=13.27
PM2.5 | SON | N_event=115 | N_nonevent=1006 | max|Hp|=26.49
PM10 | DJF | N_event=115 | N_nonevent=1011 | max|Hp|=15.01
PM10 | MAM | N_event=118 | N_nonevent=1036 | max|Hp|=25.48
PM10 | JJA | N_event=117 | N_nonevent=1037 | max|Hp|=9.10
PM10 | SON | N_event=114 | N_nonevent=1007 | max|Hp|=12.45


In [201]:
# Seasonal multipanels for all seasons
# One 2x2 figure per pollutant, written to OUT_FIG. Each figure uses its own colour
# range (max25_s / max10_s), so PM2.5 and PM10 figures are not on a common scale.
plot_seasonal_multipanel(
    label="PM2.5",
    Hp_s=Hp25_s, U_s=U25_s, V_s=V25_s, P_s=P25_s,
    max_abs=max25_s,
    outpath=OUT_FIG / "Z500_seasonal_PM25_p90_stage2.png"
)

plot_seasonal_multipanel(
    label="PM10",
    Hp_s=Hp10_s, U_s=U10_s, V_s=V10_s, P_s=P10_s,
    max_abs=max10_s,
    outpath=OUT_FIG / "Z500_seasonal_PM10_p90_stage2.png"
)

Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\figures\Z500_seasonal_PM25_p90_stage2.png
Saved: c:\Users\DELL\OneDrive - TUNI.fi\Documents\Finlandia\Tampere Uni\Tesis\Schedule\Stage 2\outputs\figures\Z500_seasonal_PM10_p90_stage2.png
