<a href="https://colab.research.google.com/github/drfperez/openair/blob/main/Limits.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:

# ============================================================
# FULL AIR QUALITY COMPLIANCE ENGINE (ONE COLAB CELL)
# EU 2026 | EU 2030 | WHO 2021 | WHO 2005
# Stations • QA • Compliance • Plots • ZIP
# ============================================================

import io, os, zipfile, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

warnings.filterwarnings("ignore")

# ----------------------------
# Upload CSV
# ----------------------------
from google.colab import files

# Prompt for a CSV; only the first uploaded file is used.
uploaded = files.upload()
if not uploaded:
    # An empty result would otherwise surface as an opaque IndexError.
    raise ValueError("No file uploaded.")
fname = list(uploaded.keys())[0]
df = pd.read_csv(io.BytesIO(uploaded[fname]))

# ----------------------------
# Detect datetime column
# ----------------------------
def _parses_as_datetime(column):
    """Return True when more than 10 values of df[column] parse as timestamps."""
    try:
        return pd.to_datetime(df[column], errors="coerce").notna().sum() > 10
    except (TypeError, ValueError, OverflowError):
        return False

# First pass skips numeric columns: pd.to_datetime() reads bare numbers as
# epoch offsets, so a concentration or row-number column would "parse" and be
# mistaken for the time axis.
dt_col = next(
    (c for c in df.columns
     if not pd.api.types.is_numeric_dtype(df[c]) and _parses_as_datetime(c)),
    None,
)
if dt_col is None:
    # Second pass keeps the previous any-column behaviour as a last resort.
    dt_col = next((c for c in df.columns if _parses_as_datetime(c)), None)
if dt_col is None:
    raise ValueError("No datetime column detected.")

# Rows whose timestamp cannot be parsed are dropped; the rest become a sorted
# DatetimeIndex so the frame can be resampled later.
df[dt_col] = pd.to_datetime(df[dt_col], errors="coerce")
df = df.dropna(subset=[dt_col]).set_index(dt_col).sort_index()

# ----------------------------
# Detect station column (optional)
# ----------------------------
# Use the first column whose lower-cased name is a known station identifier;
# when there is none, tag every row with one synthetic station.
station_col = next(
    (name for name in df.columns
     if name.lower() in ["station","station_id","site","site_id","code","monitor"]),
    None,
)

if station_col is None:
    station_col = "__station__"
    df[station_col] = "SINGLE_STATION"

# ----------------------------
# Detect pollutants
# ----------------------------
import re

# Canonical pollutant name -> lower-case spellings accepted in column headers.
ALIASES = {
    "PM2.5": ["pm2.5","pm25"],
    "PM10": ["pm10"],
    "NO2": ["no2"],
    "O3": ["o3"],
    "SO2": ["so2"],
    "CO": ["co"],
    "Benzene": ["benzene"]
}

# One compiled pattern per pollutant. An alias only counts when it is not
# glued to other letters or digits, so "co" matches "CO" or "co_mg_m3" but
# not "code", "comment" or "pm10_concentration".
_ALIAS_PATTERNS = {
    p: re.compile("|".join(
        rf"(?<![a-z0-9]){re.escape(k)}(?![a-z0-9])" for k in keys
    ))
    for p, keys in ALIASES.items()
}

pollutant_cols = {}
for col in df.columns:
    if col == station_col:
        continue  # the station identifier is never a measurement
    name = str(col).lower()
    for p, pattern in _ALIAS_PATTERNS.items():
        if pattern.search(name):
            pollutant_cols[p] = col  # last matching column wins, as before

# ----------------------------
# Frameworks & limits
# ----------------------------
# framework -> pollutant -> list of rules.
# Each rule is (averaging period, limit value, allowed exceedances per year
# or None[, extra flag]). Periods understood by the main loop are "hour",
# "day", "8h" and "year"; a daily (24 h) mean is therefore spelled "day".
FRAMEWORKS = {
    "EU_2026": {
        "PM2.5":[("year",25,None)],
        "PM10":[("day",50,35),("year",40,None)],
        "NO2":[("hour",200,18),("year",40,None)],
        "SO2":[("hour",350,24),("day",125,3)],
        "O3":[("8h",120,25,"3yr")],
        "CO":[("8h",10,None)],
        "Benzene":[("year",5,None)]
    },
    "EU_2030": {
        "PM2.5":[("day",25,18),("year",10,None)],
        "PM10":[("day",45,18),("year",20,None)],
        "NO2":[("hour",200,3),("day",50,18),("year",20,None)],
        "SO2":[("hour",350,3),("day",50,18)],
        "O3":[("8h",120,18,"3yr")],
        "CO":[("8h",10,None)],
        "Benzene":[("year",3.4,None)]
    },
    "WHO_2021": {
        "PM2.5":[("day",15,None),("year",5,None)],
        "PM10":[("day",45,None),("year",15,None)],
        "NO2":[("day",25,None),("year",10,None)],
        "O3":[("8h",100,None)],
        "SO2":[("day",40,None)],
        # Same 24-hour CO entry as the single-target calculator's WHO_2021 table.
        "CO":[("day",4,None)]
    },
    "WHO_2005": {
        "PM2.5":[("day",25,None),("year",10,None)],
        "PM10":[("day",50,None),("year",20,None)],
        "NO2":[("hour",200,None),("year",40,None)],
        "O3":[("8h",100,None)],
        # Was labelled "24h", a period the main loop has no branch for.
        "SO2":[("day",20,None)]
    }
}

# ----------------------------
# Results containers
# ----------------------------
summary, events, station_summary = [], [], []

os.makedirs("plots", exist_ok=True)

# ----------------------------
# Main loop
# ----------------------------
# Rule tuples are (period, limit, allowed_exceedances[, flag]). Only the first
# three fields are evaluated; the "3yr" flag on O3 is NOT applied, so O3 is
# judged year by year rather than as a three-year average.
PERIOD_ALIASES = {"24h": "day"}

# De-duplicate while keeping order: two pollutants may map to the same column.
value_cols = list(dict.fromkeys(pollutant_cols.values()))

# Hourly means per station do not depend on the framework, so build them once.
hourly_by_station = {
    station: sdf[value_cols].apply(pd.to_numeric, errors="coerce").resample("1h").mean()
    for station, sdf in df.groupby(station_col)
} if value_cols else {}

for fw, rules in FRAMEWORKS.items():
    for station, hourly in hourly_by_station.items():
        for pol, col in pollutant_cols.items():
            if pol not in rules:
                continue

            s = hourly[col]
            daily = s.resample("1D").mean()
            # Running 8-hour mean (at least 6 valid hours), then its daily maximum.
            roll8 = s.rolling(8, min_periods=6).mean()
            daily8 = roll8.resample("1D").max()
            exceedance_basis = {"hour": s, "day": daily, "8h": daily8}

            for rule in rules[pol]:
                limit, allowed = rule[1], rule[2]
                period = PERIOD_ALIASES.get(rule[0], rule[0])

                if period == "year":
                    # 1 when the calendar-year mean is above the limit, else 0.
                    ann = s.groupby(s.index.year).mean()
                    grp = (ann > limit).astype(int)
                elif period in exceedance_basis:
                    exc = exceedance_basis[period] > limit
                    grp = exc.groupby(exc.index.year).sum()
                else:
                    # Never fall through with the previous rule's counts.
                    print(f"Skipping {fw}/{pol}: unsupported averaging period {rule[0]!r}")
                    continue

                # A rule without an exceedance allowance tolerates none at all.
                budget = 0 if allowed is None else allowed
                for y, cnt in grp.items():
                    cnt = int(cnt)
                    compliant = "NON_COMPLIANT" if cnt > budget else "COMPLIANT"
                    summary.append([fw,station,pol,period,y,cnt,allowed,compliant])

# ----------------------------
# DataFrames
# ----------------------------
# One row per framework / station / pollutant / period / year.
summary_columns = [
    "framework","station","pollutant","period","year",
    "exceedances","allowed","compliance"
]
df_summary = pd.DataFrame(summary, columns=summary_columns)

df_summary.to_csv("summary_exceedances_ALL.csv", index=False)

# ----------------------------
# Plots
# ----------------------------
# One bar chart per (framework, pollutant): exceedances summed per year over
# every station and averaging period present in the summary.
for (fw, pol), rows in df_summary.groupby(["framework","pollutant"]):
    yearly_total = rows.groupby("year")["exceedances"].sum()
    fig, ax = plt.subplots()
    yearly_total.plot(kind="bar", ax=ax)
    ax.set_title(f"{pol} – {fw}")
    ax.set_ylabel("Exceedances")
    fig.tight_layout()
    fig.savefig(f"plots/{pol}_{fw}.png")
    plt.close(fig)

# ----------------------------
# ZIP
# ----------------------------
# Write a one-line README, then bundle it with the summary CSV and every
# file found in plots/ into a single archive offered for download.
with open("README.txt","w") as readme:
    readme.write("Full EU + WHO air quality compliance package.\n")

with zipfile.ZipFile("air_quality_FULL_PACKAGE.zip","w") as archive:
    for member in ("summary_exceedances_ALL.csv", "README.txt"):
        archive.write(member)
    for plot_name in os.listdir("plots"):
        archive.write(os.path.join("plots", plot_name))

files.download("air_quality_FULL_PACKAGE.zip")

print("✅ ALL DONE — full compliance engine executed")

Saving processed_data_wide.csv to processed_data_wide (2).csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ ALL DONE — full compliance engine executed


In [4]:
# ============================================================
# UNIVERSAL AIR QUALITY EXCEEDANCE CALCULATOR (ONE CELL)
# EU LAW (2026 / 2030) + WHO (2021 / 2005)
# ============================================================

LEGAL_TARGET = "EU_2026"  # <<< CHANGE HERE ONLY

import io, os, zipfile, warnings
import numpy as np
import pandas as pd
from datetime import datetime

warnings.filterwarnings("ignore")

# Detect Colab by attempting the import. A bare `except:` would also swallow
# KeyboardInterrupt/SystemExit, so only ordinary exceptions are caught.
try:
    from google.colab import files
    IN_COLAB = True
except Exception:
    IN_COLAB = False

# ----------------------------
# Upload CSV
# ----------------------------
if IN_COLAB:
    uploaded = files.upload()
    if not uploaded:
        # An empty result would otherwise surface as an opaque IndexError.
        raise ValueError("No file uploaded.")
    fname = list(uploaded.keys())[0]  # only the first uploaded file is used
    df = pd.read_csv(io.BytesIO(uploaded[fname]))
else:
    df = pd.read_csv("/mnt/data/processed_data_wide.csv")

# ----------------------------
# Detect datetime column
# ----------------------------
def _datetime_hits(column):
    """Count the values of df[column] that parse as timestamps (0 on failure)."""
    try:
        return int(pd.to_datetime(df[column], errors="coerce").notna().sum())
    except (TypeError, ValueError, OverflowError):
        return 0

# Non-numeric columns are tried first: pd.to_datetime() reads bare numbers as
# epoch offsets, so a measurement column would otherwise "parse" and be taken
# for the time axis. Numeric columns remain a last resort, as before.
_numeric_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
_scan_order = [c for c in df.columns if c not in _numeric_cols] + _numeric_cols
dt_col = next((c for c in _scan_order if _datetime_hits(c) > 10), None)

if dt_col is None:
    raise ValueError("No datetime column detected.")

# Drop rows with unparseable timestamps and index the frame by time.
df[dt_col] = pd.to_datetime(df[dt_col], errors="coerce")
df = df.dropna(subset=[dt_col])
df = df.set_index(dt_col).sort_index()

# ----------------------------
# Detect pollutants
# ----------------------------
import re

# Canonical pollutant name -> lower-case spellings accepted in column headers.
ALIASES = {
    "PM2.5": ["pm2.5", "pm25"],
    "PM10": ["pm10"],
    "NO2": ["no2"],
    "O3": ["o3", "ozone"],
    "SO2": ["so2"],
    "CO": ["co"],
    "Benzene": ["benzene"]
}

# An alias only counts when it is not glued to other letters or digits, so
# "co" matches "CO" or "co_mg_m3" but not "code" or "pm10_concentration".
_ALIAS_PATTERNS = {
    p: re.compile("|".join(
        rf"(?<![a-z0-9]){re.escape(k)}(?![a-z0-9])" for k in keys
    ))
    for p, keys in ALIASES.items()
}

pollutants = {}
for col in df.columns:
    c = str(col).lower()
    for p, pattern in _ALIAS_PATTERNS.items():
        if pattern.search(c):
            # Non-numeric cells become NaN; the last matching column wins.
            pollutants[p] = pd.to_numeric(df[col], errors="coerce")

if not pollutants:
    raise ValueError("No pollutant columns detected.")

# One column per detected pollutant, averaged onto an hourly grid.
data = pd.DataFrame(pollutants)
hourly = data.resample("1h").mean()

# ----------------------------
# LIMIT TABLES
# ----------------------------
# legal target -> pollutant -> list of rules.
# Each rule is (averaging period, limit value, allowed exceedances per year
# or None[, extra flag]). The calculation loop only has branches for "hour",
# "day", "8h" and "year", so 24-hour means are spelled "day" here.
LIMITS = {
    "EU_2026": {
        "PM2.5": [("year", 25, None)],
        "PM10": [("day", 50, 35), ("year", 40, None)],
        "NO2":  [("hour", 200, 18), ("year", 40, None)],
        "SO2":  [("hour", 350, 24), ("day", 125, 3)],
        "O3":   [("8h", 120, 25, "3yr")],
        "CO":   [("8h", 10, None)],
        "Benzene": [("year", 5, None)]
    },
    "EU_2030": {
        "PM2.5": [("day", 25, 18), ("year", 10, None)],
        "PM10":  [("day", 45, 18), ("year", 20, None)],
        "NO2":   [("hour", 200, 3), ("day", 50, 18), ("year", 20, None)],
        "SO2":   [("hour", 350, 3), ("day", 50, 18)],
        "O3":    [("8h", 120, 18, "3yr")],
        "CO":    [("8h", 10, None)],
        "Benzene": [("year", 3.4, None)]
    },
    "WHO_2021": {
        "PM2.5": [("day", 15, None), ("year", 5, None)],
        "PM10":  [("day", 45, None), ("year", 15, None)],
        "NO2":   [("day", 25, None), ("year", 10, None)],
        "O3":    [("8h", 100, None)],
        "SO2":   [("day", 40, None)],
        # Was "24h": no branch handles that label, so the rule was skipped.
        "CO":    [("day", 4, None)]
    },
    "WHO_2005": {
        "PM2.5": [("day", 25, None), ("year", 10, None)],
        "PM10":  [("day", 50, None), ("year", 20, None)],
        "NO2":   [("hour", 200, None), ("year", 40, None)],
        "O3":    [("8h", 100, None)],
        # Was "24h": no branch handles that label, so the rule was skipped.
        "SO2":   [("day", 20, None)]
    }
}

rules = LIMITS[LEGAL_TARGET]

# ----------------------------
# Calculations
# ----------------------------
summary = []   # rows: pollutant, legal target, period, year, exceedance count
events = []    # rows: pollutant, period, timestamp, value, limit

# Accept "24h" as another spelling of the daily mean.
PERIOD_ALIASES = {"24h": "day"}

for pol, series in hourly.items():
    if pol not in rules:
        continue

    daily = series.resample("1D").mean()
    # Running 8-hour mean (at least 6 valid hours), then its daily maximum.
    roll8 = series.rolling(8, min_periods=6).mean()
    daily8max = roll8.resample("1D").max()

    # period -> (series to test, event timestamp format, label in the summary)
    windows = {
        "hour": (series, "%Y-%m-%d %H", "hour"),
        "day": (daily, "%Y-%m-%d", "day"),
        "8h": (daily8max, "%Y-%m-%d", "8h_days"),
    }

    for rule in rules[pol]:
        period = PERIOD_ALIASES.get(rule[0], rule[0])
        limit = rule[1]

        if period in windows:
            values, stamp_format, label = windows[period]
            exc = values > limit
            for ts, v in values[exc].items():
                events.append([pol, period, ts.strftime(stamp_format), v, limit])
            # Grouping on the calendar year needs no frequency alias
            # (the "1Y" alias is deprecated in recent pandas).
            for y, n in exc.groupby(exc.index.year).sum().items():
                summary.append([pol, LEGAL_TARGET, label, str(y), int(n)])

        elif period == "year":
            annual = series.groupby(series.index.year).mean()
            for y, v in annual.items():
                if v > limit:
                    events.append([pol, "year", str(y), v, limit])
                summary.append([pol, LEGAL_TARGET, "year", str(y), int(v > limit)])

        else:
            # Report the rule instead of dropping it without a trace.
            print(f"Skipping {pol}: unsupported averaging period {rule[0]!r}")

# ----------------------------
# Save outputs
# ----------------------------
# Tabulate both result lists, write one CSV each, zip the pair and, when
# running in Colab, hand the archive to the browser.
summary_csv = "summary_exceedances.csv"
events_csv = "detailed_exceedance_events.csv"

df_summary = pd.DataFrame(
    summary,
    columns=["pollutant","legal_target","period","year","exceedances"],
)
df_events = pd.DataFrame(
    events,
    columns=["pollutant","period","timestamp","value","limit"],
)

for frame, target in ((df_summary, summary_csv), (df_events, events_csv)):
    frame.to_csv(target, index=False)

with zipfile.ZipFile("air_quality_results.zip","w") as archive:
    for member in (summary_csv, events_csv):
        archive.write(member)

if IN_COLAB:
    files.download("air_quality_results.zip")

print("✅ DONE — Legal target:", LEGAL_TARGET)

Saving processed_data_wide.csv to processed_data_wide (1).csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ DONE — Legal target: EU_2026


In [3]:

# Colab one-cell: EU air pollutant exceedance calculator (didactic)
# - Upload a CSV (or it will try /mnt/data/processed_data_wide.csv if present)
# - Produces two CSVs and zips them for download:
#     1) summary_exceedances.csv
#     2) detailed_exceedance_events.csv
# - Implements limit tables for two legal milestones from Directive (EU) 2024/2881:
#     * "2026" transitional table (to be attained by 11 Dec 2026)
#     * "2030" table (to be attained by 1 Jan 2030)
#
# How to use:
# 1) Run this cell in Google Colab.
# 2) Upload a CSV when prompted (or Colab will read /mnt/data/processed_data_wide.csv if available).
# 3) Wait until it finishes — it will create and download eu_exceedance_results.zip containing the two CSVs.
#
# Important assumptions & robustness choices:
# - The script heuristically finds a datetime column. If there's a separate date and time column, it will combine them.
# - Common pollutant column name variants are recognized (e.g. PM2.5, PM25, pm25, NO2, no2, PM10, O3, CO, SO2, benzene).
# - If input resolution is coarser than hourly (e.g. daily), the code adapts and warns where hourly-derived metrics cannot be computed.
# - Rolling 8-hour means are computed as "running 8-hour averages" and each 8-hour average is assigned to the day on which it ends (per directive).
# - Data coverage rules are implemented in a simple way (minimum hours required: 75% rule => e.g. for daily means min 18 hourly values).
#
# Legal source (used to code thresholds): Directive (EU) 2024/2881 (Annex I tables for 2026 & 2030).
# For reference: European Commission / EEA materials.
# (The code below is fully self-contained for Colab: pandas + numpy + zipfile + google.colab.files)

# ---- Begin cell code ----
import io, os, zipfile, sys, warnings
from datetime import datetime, timedelta
import numpy as np
import pandas as pd

# Colab-specific uploads/downloads
# Colab is detected by whether its upload/download helper can be imported.
try:
    from google.colab import files
except Exception:
    _in_colab = False
else:
    _in_colab = True

# Silence library warnings for the rest of the cell.
warnings.filterwarnings("ignore")

print("EU exceedance calculator (single-cell). Will ask you to upload a CSV (Colab).")

# ---------- Helper: get input CSV ----------
def upload_or_open_default():
    """Obtain the input CSV as an in-memory binary stream.

    In Colab the user is prompted to upload a file and the first upload is
    used. If nothing is uploaded, or when running outside Colab, the default
    file /mnt/data/processed_data_wide.csv is read instead.

    Returns:
        (name, stream): the file's base name and an ``io.BytesIO`` holding
        its bytes. The default file is read and closed here, so no OS file
        handle is left open for the caller to clean up.

    Raises:
        FileNotFoundError: in Colab, nothing uploaded and no default file.
        EnvironmentError: outside Colab and no default file.
    """
    default_path = "/mnt/data/processed_data_wide.csv"

    if _in_colab:
        print("\nPlease upload your CSV file when the file-picker appears (it may take a few seconds).")
        uploaded = files.upload()  # user uploads file(s)
        if uploaded:
            # take the first uploaded file
            name = next(iter(uploaded))
            print(f"Loaded uploaded file: {name}")
            return name, io.BytesIO(uploaded[name])
        if not os.path.exists(default_path):
            raise FileNotFoundError("No file uploaded and default file not present.")
        print(f"No upload detected — using default path: {default_path}")
    else:
        if not os.path.exists(default_path):
            raise EnvironmentError("Not in Colab and default CSV not present. Run this in Colab or provide a CSV.")
        print(f"Running outside Colab, using default path: {default_path}")

    with open(default_path, "rb") as handle:
        payload = handle.read()
    return os.path.basename(default_path), io.BytesIO(payload)

fname, fh = upload_or_open_default()

# ---------- Read CSV with robust parsing ----------
# Try multiple separators, encoding; try to infer datetime columns
try:
    text = fh.read()
finally:
    fh.close()  # the helper may hand back a real file handle
if isinstance(text, bytes):
    # "utf-8-sig" also strips a leading BOM, which would otherwise stay glued
    # to the first column name; latin1 is the catch-all fallback.
    try:
        s = text.decode("utf-8-sig")
    except UnicodeDecodeError:
        s = text.decode("latin1")
else:
    s = str(text)

# pandas read with common separators: the first separator that yields at
# least 2 columns wins; otherwise keep the first parse that succeeded at all.
sep_candidates = [",", ";", "\t", "|"]
df = None
single_column_parse = None
for sep in sep_candidates:
    try:
        candidate = pd.read_csv(io.StringIO(s), sep=sep, engine="python")
    except Exception:
        continue  # best effort: try the next separator
    if candidate.shape[1] >= 2:
        df = candidate
        break
    if single_column_parse is None:
        single_column_parse = candidate
if df is None:
    df = single_column_parse
if df is None:
    raise ValueError("Unable to parse CSV with common separators. Please re-export as comma-delimited or upload a simpler CSV.")

print(f"CSV parsed: {df.shape[0]} rows, {df.shape[1]} columns.")

# ---------- Find datetime / timestamp column ----------
def find_datetime_column(df):
    """Locate the timestamp column(s) of *df*.

    Search order:
      1. a column named like a complete timestamp ("datetime", "timestamp", ...);
      2. a separate date column plus a time column whose combination parses;
      3. a column named exactly "date" or "time";
      4. the first non-numeric column whose values parse as datetimes.

    Returns:
        A single column label, a ``(date_column, time_column)`` tuple when
        date and time are stored separately, or ``None`` if nothing is found.
    """
    names = [(c, str(c).lower()) for c in df.columns]
    min_valid = max(5, 0.01 * len(df))  # at least a few rows must parse

    def count_valid(values):
        # Number of entries that parse as datetimes; 0 if parsing blows up.
        try:
            return int(pd.to_datetime(values, errors="coerce").notna().sum())
        except Exception:
            return 0

    # 1) names that can only mean a complete timestamp
    full_names = ("datetime", "timestamp", "date_time", "date/time", "measurement_time")
    for c, name in names:
        if name in full_names:
            return c

    # 2) separate date + time columns. This runs before the single-column
    #    checks: a date-only column parses fine by itself, which would
    #    silently drop the time of day.
    date_cols = [c for c, name in names if "date" in name and "time" not in name]
    time_cols = [
        c for c, name in names
        if "time" in name and "date" not in name
        and not pd.api.types.is_numeric_dtype(df[c])
    ]
    if date_cols and time_cols:
        date_col, time_col = date_cols[0], time_cols[0]
        combined = df[date_col].astype(str) + " " + df[time_col].astype(str)
        n_combined = count_valid(combined)
        # Accept the pair only if joining loses nothing versus the date alone.
        if n_combined > min_valid and n_combined >= count_valid(df[date_col]):
            return (date_col, time_col)

    # 3) a lone column called exactly "date" or "time"
    for c, name in names:
        if name in ("time", "date"):
            return c

    # 4) any column whose content parses. Numeric columns are skipped because
    #    pd.to_datetime() turns bare numbers into epoch timestamps, so every
    #    measurement column would qualify.
    for c, _ in names:
        if pd.api.types.is_numeric_dtype(df[c]):
            continue
        if count_valid(df[c]) > min_valid:
            return c
    return None

dt_col = find_datetime_column(df)
if dt_col is None:
    raise ValueError("Could not detect a datetime or date & time column. Please ensure your CSV has a datetime/timestamp column.")

if not isinstance(dt_col, tuple):
    # a single column already holds the timestamp: parse it in place
    print(f"Detected datetime column: {dt_col}")
    df[dt_col] = pd.to_datetime(df[dt_col], errors="coerce")
else:
    # date and time live in two columns: join them as text, parse, then
    # replace the two source columns with the combined one
    date_col, time_col = dt_col
    print(f"Combining date + time columns: {date_col} + {time_col}")
    stamp_text = df[date_col].astype(str) + " " + df[time_col].astype(str)
    df['__datetime__'] = pd.to_datetime(stamp_text, errors="coerce")
    df = df.drop(columns=[date_col, time_col])
    dt_col = '__datetime__'

# keep only rows with a valid timestamp, in chronological order
has_stamp = df[dt_col].notna()
df = df.loc[has_stamp].copy()
df = df.sort_values(dt_col).reset_index(drop=True)

# index by time (the timestamp column itself is kept as well)
df.index = pd.DatetimeIndex(df[dt_col])

# ---------- Detect pollutant columns (common names) ----------
# Mapping of canonical pollutant keys to name patterns
import re

POLLUTANT_ALIASES = {
    'PM2.5': ['pm2.5','pm2_5','pm25','pm_2_5','pm2p5'],
    'PM10' : ['pm10','pm_10'],
    'NO2'  : ['no2','nitrogen_dioxide','nitrogen dioxide'],
    'SO2'  : ['so2','sulphur_dioxide','sulfur_dioxide'],
    'O3'   : ['o3','ozone'],
    'CO'   : ['co','carbon_monoxide','carbon monoxide'],
    'Benzene': ['benzene','c6h6', 'ben']
}

# One compiled pattern per pollutant. Aliases get the same space -> underscore
# normalisation as the column names (otherwise 'nitrogen dioxide' could never
# match) and must not be glued to other letters or digits, so 'co' no longer
# hits "pm10_concentration" or "code" and 'ben' no longer hits "benzo(a)pyrene".
_ALIAS_PATTERNS = {
    pollutant: re.compile("|".join(
        rf"(?<![a-z0-9]){re.escape(a.replace(' ', '_'))}(?![a-z0-9])" for a in aliases
    ))
    for pollutant, aliases in POLLUTANT_ALIASES.items()
}

# auto-detect: pollutant -> source column name, and pollutant -> numeric series
# (units assumed μg/m3 except CO in mg/m3)
available = {}
series_dict = {}
for col in df.columns:
    if col == dt_col:
        continue  # the timestamp column is not a measurement
    cname = str(col).lower().replace(" ","_")
    for pollutant, pattern in _ALIAS_PATTERNS.items():
        if not pattern.search(cname):
            continue
        numeric = pd.to_numeric(df[col], errors='coerce')
        # keep only columns holding at least one number; last match wins
        if numeric.notna().sum() > 0:
            available[pollutant] = col
            series_dict[pollutant] = numeric

if not available:
    raise ValueError("No pollutant columns detected. Column names should include e.g. PM2.5, PM10, NO2, SO2, O3, CO, Benzene (case-insensitive). Found columns: " + ", ".join(map(str, df.columns)))

print("Detected pollutant columns (mapped):")
for k,v in available.items():
    print(f"  {k}  <-- column '{v}'")

data = pd.DataFrame(series_dict, index=df.index)

# Resample to hourly (mean) so we have consistent hourly timeseries for rolling 8-hour windows
# If input is coarser than hourly (e.g., daily), resampling will upsample (NaNs) and we will adjust later
data_hourly = data.resample('1h').mean()

# ---------- EU limit tables (from Directive (EU) 2024/2881, Annex I) ----------
# We implement two legal milestones: '2026' (earlier limits, to be attained by 11 Dec 2026)
# and '2030' (stricter limits, to be attained by 1 Jan 2030).
# The script uses the 2030 table by default; change 'legal_target' to '2026' if you want transitional limits.

legal_target = '2030'   # <-- change to '2026' if you want the earlier transitional table

# Limits structure:
# For each pollutant we store limit entries with keys: 'period' (hour, 8h, day, year), 'limit', 'max_exceedances_per_year' (None if not limited)
# Units: numeric values assumed μg/m3 except for CO (mg/m3), for benzene (μg/m3)
# NOTE(review): the 8-hour entries use two different labels below
# ('8h_daily_max' for O3, '8h_max_daily' for CO) and neither is the '8h' named
# above — confirm the code that consumes 'period' handles both spellings.
# NOTE(review): SO2 has a 'year' entry only in the '2030' table; confirm the
# values against Annex I of the directive.
LIMIT_TABLES = {
    '2030': {
        'PM2.5': [
            {'period':'day', 'limit':25.0, 'max_exceedances_per_year':18},
            {'period':'year','limit':10.0, 'max_exceedances_per_year':None},
        ],
        'PM10': [
            {'period':'day', 'limit':45.0, 'max_exceedances_per_year':18},
            {'period':'year','limit':20.0, 'max_exceedances_per_year':None},
        ],
        'NO2': [
            {'period':'hour','limit':200.0, 'max_exceedances_per_year':3},
            {'period':'day','limit':50.0, 'max_exceedances_per_year':18},
            {'period':'year','limit':20.0, 'max_exceedances_per_year':None},
        ],
        'SO2': [
            {'period':'hour','limit':350.0, 'max_exceedances_per_year':3},
            {'period':'day','limit':50.0, 'max_exceedances_per_year':18},
            {'period':'year','limit':20.0, 'max_exceedances_per_year':None},
        ],
        'O3': [
            # O3 is a target value: max daily 8-hour mean 120 μg/m3; days count averaged over 3 years
            {'period':'8h_daily_max','limit':120.0, 'max_exceedances_per_year':18, 'three_year_rule':True},
        ],
        'CO': [
            {'period':'8h_max_daily','limit':10.0, 'max_exceedances_per_year':None},  # mg/m3 (note: input suspected μg/m3 — user must confirm units)
            {'period':'day','limit':4.0, 'max_exceedances_per_year':18},
        ],
        'Benzene': [
            {'period':'year','limit':3.4, 'max_exceedances_per_year':None},  # μg/m3
        ]
    },
    '2026': {
        'PM2.5': [
            {'period':'year','limit':25.0, 'max_exceedances_per_year':None},
        ],
        'PM10': [
            {'period':'day','limit':50.0, 'max_exceedances_per_year':35},
            {'period':'year','limit':40.0, 'max_exceedances_per_year':None},
        ],
        'NO2': [
            {'period':'hour','limit':200.0, 'max_exceedances_per_year':18},
            {'period':'year','limit':40.0, 'max_exceedances_per_year':None},
        ],
        'SO2': [
            {'period':'hour','limit':350.0, 'max_exceedances_per_year':24},
            {'period':'day','limit':125.0, 'max_exceedances_per_year':3},
        ],
        'O3': [
            {'period':'8h_daily_max','limit':120.0, 'max_exceedances_per_year':25, 'three_year_rule':True},
        ],
        'CO': [
            {'period':'8h_max_daily','limit':10.0, 'max_exceedances_per_year':None},  # mg/m3
            {'period':'day','limit':4.0, 'max_exceedances_per_year':18},
        ],
        'Benzene': [
            {'period':'year','limit':5.0, 'max_exceedances_per_year':None},
        ]
    }
}

# Table for the selected milestone; an unknown legal_target raises KeyError here.
LIMITS = LIMIT_TABLES[legal_target]
print(f"\nUsing legal target: {legal_target} (Directive (EU) 2024/2881 tables).")

# ---------- Utility to compute daily & rolling 8-hour means ----------
# apply data coverage minima per directive simply: daily mean requires at least 18 valid hourly values (75% of 24),
# 8-hour mean requires at least 6 valid hours in the 8-hour window (75%).
# Minimum number of valid hourly values an aggregate needs to be kept:
# 18 of the 24 hours in a day, 6 of the 8 hours in a running window (75%).
MIN_HOURS_PER_DAY = 18
MIN_HOURS_IN_8H = 6

# hourly series is data_hourly
results_summary = []     # will collect summary per pollutant/period/year
events_rows = []         # detailed events table rows

# Helper to calculate 8-hour rolling and assign to day on which it ends
def compute_8h_running(series_hourly, window=8, min_valid=None):
    """Return the running mean of an hourly series.

    Args:
        series_hourly: pandas Series on an hourly index.
        window: number of consecutive hourly values per average (default 8).
        min_valid: minimum non-NaN values a window needs to produce a result;
            defaults to the module-level MIN_HOURS_IN_8H.

    Returns:
        A Series on the same index. Each value is stamped with the last hour
        of its window, so grouping by date assigns an average to the day on
        which it ends. Windows with too few valid values are NaN.
    """
    if min_valid is None:
        min_valid = MIN_HOURS_IN_8H
    return series_hourly.rolling(window=window, min_periods=min_valid).mean()

# Helper to get daily mean with min coverage
def compute_daily_mean(series_hourly, min_hours=None):
    """Return calendar-day means of an hourly Series, blanking sparse days.

    Parameters
    ----------
    series_hourly : pandas.Series
        Hourly values indexed by timestamp.
    min_hours : int, optional
        Minimum number of non-NaN hourly values a day must contain for its
        mean to be kept. Defaults to the module-level ``MIN_HOURS_PER_DAY``.

    Returns
    -------
    pandas.Series
        One value per calendar day; NaN for days with fewer than
        ``min_hours`` valid hourly values.
    """
    if min_hours is None:
        # Resolved at call time so the module-level setting stays the default.
        min_hours = MIN_HOURS_PER_DAY
    by_day = series_hourly.resample('1D')
    valid_counts = by_day.count()
    daily_mean = by_day.mean()
    # Discard days whose coverage is below the required minimum.
    daily_mean[valid_counts < min_hours] = np.nan
    return daily_mean

# ---------- Exceedance evaluation for every pollutant in the data ----------
# Minimum share of a year's hours that must carry a valid value before an
# annual mean is reported. Deliberately relaxed: the original note on this
# threshold cites 85% as the directive's requirement.
MIN_ANNUAL_COVERAGE = 0.7


def _hours_in_year(year):
    """Return the number of hours in Gregorian calendar year *year*."""
    is_leap = year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
    return 8784 if is_leap else 8760


def _count_exceedances(values, limit):
    """Return ``(counts, events)`` for the entries of *values* above *limit*.

    ``counts`` is a Series keyed by calendar year holding the number of
    entries strictly greater than *limit*; ``events`` is the sub-series of
    those entries. NaN entries never count as exceedances.
    """
    exceeded = values > limit
    counts = exceeded.groupby(exceeded.index.year).sum().astype(int)
    return counts, values[exceeded].dropna()


def _annual_means(series_hourly):
    """Yield ``(year, mean)`` per calendar year of an hourly Series.

    The mean is NaN when fewer than ``MIN_ANNUAL_COVERAGE`` of the year's
    hours hold a valid value.
    """
    for year, group in series_hourly.groupby(series_hourly.index.year):
        if group.count() < MIN_ANNUAL_COVERAGE * _hours_in_year(year):
            yield year, np.nan
        else:
            yield year, float(group.mean())


def _three_year_means(value_by_year):
    """Yield ``(first_year, last_year, mean)`` for each run of 3 consecutive years.

    *value_by_year* maps an integer year to a number. Windows that would
    bridge a missing year are skipped, so a label such as "2001-2003" always
    covers exactly those three calendar years.
    """
    years = sorted(value_by_year)
    for first, middle, last in zip(years, years[1:], years[2:]):
        if last - first != 2:
            continue  # a year is missing inside this window
        yield first, last, np.mean([value_by_year[y] for y in (first, middle, last)])


for pollutant, series in data_hourly.items():
    if pollutant not in LIMITS:
        # No legal limit for this pollutant in the selected table.
        print(f"Warning: pollutant {pollutant} not present in legal table for {legal_target}; skipping.")
        continue

    limits_for_pollutant = LIMITS[pollutant]
    s_hour = series.copy()
    s_day = compute_daily_mean(s_hour)
    s_8h = compute_8h_running(s_hour)
    # Daily maximum of the running 8-hour means whose window ends on that day.
    s_8h_daily_max = s_8h.resample('1D').max()

    # period key -> (series to test, period_type written to the outputs,
    #                strftime pattern used for the event stamp).
    # Both spellings of the 8-hour rule are reported as '8h_daily_max'.
    threshold_inputs = {
        'hour': (s_hour, 'hour', "%Y-%m-%d %H"),
        'day': (s_day, 'day', "%Y-%m-%d"),
        '8h_daily_max': (s_8h_daily_max, '8h_daily_max', "%Y-%m-%d"),
        '8h_max_daily': (s_8h_daily_max, '8h_daily_max', "%Y-%m-%d"),
    }

    # One pass per limit entry: yearly exceedance counts plus event details.
    for ent in limits_for_pollutant:
        period = ent['period']
        limit = ent['limit']
        max_exc = ent.get('max_exceedances_per_year')
        three_year_rule = ent.get('three_year_rule', False)

        if period in threshold_inputs:
            values, period_type, stamp_format = threshold_inputs[period]
            counts, events = _count_exceedances(values, limit)
            for yr, cnt in counts.items():
                row = {
                    'pollutant': pollutant,
                    'legal_target': legal_target,
                    'period_type': period_type,
                    'period_label': str(yr),
                    'limit': limit,
                    'exceedances': int(cnt),
                    'allowed_exceedances_per_year': max_exc
                }
                if period_type == '8h_daily_max':
                    row['three_year_rule'] = bool(three_year_rule)
                results_summary.append(row)
            for ts, val in events.items():
                events_rows.append({
                    'pollutant': pollutant,
                    'period_type': period_type,
                    'period_stamp': ts.strftime(stamp_format),
                    'value': float(val),
                    'limit': limit,
                    'excess_amount': float(val - limit)
                })

        elif period == 'year':
            # Annual mean of the hourly series per calendar year.
            for yr, annual_mean in _annual_means(s_hour):
                exceeded_flag = bool(pd.notna(annual_mean) and annual_mean > limit)
                results_summary.append({
                    'pollutant': pollutant,
                    'legal_target': legal_target,
                    'period_type': 'year',
                    'period_label': str(yr),
                    'limit': limit,
                    'exceedances': int(exceeded_flag),   # 1 if annual mean > limit else 0
                    'allowed_exceedances_per_year': max_exc
                })
                if exceeded_flag:
                    events_rows.append({
                        'pollutant': pollutant,
                        'period_type': 'year',
                        'period_stamp': str(yr),
                        'value': float(annual_mean),
                        'limit': limit,
                        'excess_amount': float(annual_mean - limit)
                    })
        else:
            print(f"Skipping unknown period type '{period}' for pollutant {pollutant}")

    # Entries flagged with three_year_rule: 3-year running average of the
    # yearly number of days whose daily-max 8-hour mean exceeds the limit.
    for ent in limits_for_pollutant:
        if not ent.get('three_year_rule', False):
            continue
        limit = ent['limit']
        yearly_days, _ = _count_exceedances(s_8h_daily_max, limit)
        days_by_year = {int(yr): int(cnt) for yr, cnt in yearly_days.items()}
        for first, last, mean_days in _three_year_means(days_by_year):
            results_summary.append({
                'pollutant': pollutant,
                'legal_target': legal_target,
                'period_type': '3year_average_of_daily_8h_exceedance_days',
                'period_label': f"{first}-{last}",
                'limit': limit,
                # Keep the fraction: truncating 25.67 to 25 would hide a breach.
                'exceedances': float(round(mean_days, 3)),
                'allowed_exceedances_per_year': ent.get('max_exceedances_per_year')
            })

    # PM2.5 only: simplified per-station average exposure indicator (AEI),
    # i.e. the 3-year running mean of the annual means. The value is stored
    # in the 'exceedances' field and is a concentration, not a count.
    if pollutant == 'PM2.5':
        annual = s_hour.groupby(s_hour.index.year).mean()
        mean_by_year = {int(yr): float(v) for yr, v in annual.items() if pd.notna(v)}
        annual_limit = next((e['limit'] for e in limits_for_pollutant if e['period'] == 'year'), None)
        for first, last, aei in _three_year_means(mean_by_year):
            results_summary.append({
                'pollutant': pollutant,
                'legal_target': legal_target,
                'period_type': 'PM2.5_AEI_3yr',
                'period_label': f"{first}-{last}",
                'limit': annual_limit,
                'exceedances': float(aei),
                'allowed_exceedances_per_year': None
            })

# ---------- Build output DataFrames ----------
# Fixed column layouts. Passing them to the DataFrame constructor selects and
# orders the columns and, importantly, keeps them when there are no rows, so
# the CSV files written afterwards always carry a header line.
SUMMARY_COLUMNS = ['pollutant', 'legal_target', 'period_type', 'period_label',
                   'limit', 'exceedances', 'allowed_exceedances_per_year']
EVENT_COLUMNS = ['pollutant', 'period_type', 'period_stamp', 'value', 'limit', 'excess_amount']

if not results_summary:
    # Processing still continues so that (header-only) output files exist.
    print("No summary results were produced (no pollutant limits matched). Exiting.")

# Flat summary table: one row per pollutant / period type / period label.
df_summary = (
    pd.DataFrame(results_summary, columns=SUMMARY_COLUMNS)
    .sort_values(['pollutant', 'period_type', 'period_label'])
    .reset_index(drop=True)
)

# Detailed table: one row per individual exceedance event.
df_events = (
    pd.DataFrame(events_rows, columns=EVENT_COLUMNS)
    .sort_values(['pollutant', 'period_type', 'period_stamp'])
    .reset_index(drop=True)
)

# ---------- Write both tables to CSV and bundle them into one archive ----------
out_summary = "summary_exceedances.csv"
out_events = "detailed_exceedance_events.csv"
zip_name = "eu_exceedance_results.zip"

for frame, csv_path in ((df_summary, out_summary), (df_events, out_events)):
    frame.to_csv(csv_path, index=False)

# Pack the two CSV files into a deflate-compressed zip.
with zipfile.ZipFile(zip_name, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
    for csv_path in (out_summary, out_events):
        archive.write(csv_path)

print(f"\nCreated files: {out_summary}, {out_events}, zipped into {zip_name}.")
print("First rows of summary:")
print(df_summary.head(20).to_string(index=False))

# Offer the archive as a browser download when running in Colab; otherwise
# just report where the files were written.
if not _in_colab:
    print(f"You are not in Colab. The files are saved in the working directory: {os.getcwd()}")
else:
    print("\nStarting download of the zip file...")
    files.download(zip_name)

# ---- End cell code ----

EU exceedance calculator (single-cell). Will ask you to upload a CSV (Colab).

Please upload your CSV file when the file-picker appears (it may take a few seconds).


Saving processed_data_wide.csv to processed_data_wide.csv
Loaded uploaded file: processed_data_wide.csv
CSV parsed: 290832 rows, 12 columns.
Detected datetime column: date
Detected pollutant columns (mapped):
  CO  <-- column 'co'
  NO2  <-- column 'no2'
  O3  <-- column 'o3'
  PM10  <-- column 'pm10'
  PM2.5  <-- column 'pm2.5'
  SO2  <-- column 'so2'

Using legal target: 2030 (Directive (EU) 2024/2881 tables).

Created files: summary_exceedances.csv, detailed_exceedance_events.csv, zipped into eu_exceedance_results.zip.
First rows of summary:
pollutant legal_target  period_type period_label  limit  exceedances  allowed_exceedances_per_year
       CO         2030 8h_daily_max         1991   10.0            0                           NaN
       CO         2030 8h_daily_max         1992   10.0            0                           NaN
       CO         2030 8h_daily_max         1993   10.0            0                           NaN
       CO         2030 8h_daily_max         1994   10

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>