In [1]:
import pandas as pd
from pathlib import Path

# Input: the raw Agrimonia CSV export. Output: the daily table this script writes.
RAW_PATH = Path("../data/Agrimonia_Dataset_v_2_0_2.csv")
OUTPUT_PATH = Path("../data/agrimonia_daily.csv")

# Load the raw CSV. low_memory=False makes pandas process the file in one pass
# instead of in internal chunks, which avoids mixed-dtype inference per column.
raw = pd.read_csv(RAW_PATH, low_memory=False)

# Mapping from raw column names to the standardized names used by the rest of
# the script. Every raw column must appear exactly once: Python silently
# collapses duplicate keys in a dict literal, which would hide a typo.
RAW_TO_STD = {
    "Time":                   "date",
    "AQ_pm25":                "pm25",
    "AQ_pm10":                "pm10",
    "AQ_no2":                 "no2",
    "AQ_co":                  "co",
    "AQ_nh3":                 "nh3",
    "WE_temp_2m":             "temp_c",
    "WE_rh_mean":             "humidity",
    "WE_wind_speed_10m_mean": "wind_m_s",
    "WE_tot_precipitation":   "precip_mm",
    "WE_surface_pressure":    "press_hPa",
}


# Fail fast, before any transformation, if the CSV lacks a column we map.
missing = set(RAW_TO_STD) - set(raw.columns)
if missing:
    raise KeyError(f"Agrimonia CSV is missing columns: {missing}")

# Keep only the mapped columns and switch to the standardized names.
df = raw[list(RAW_TO_STD)].rename(columns=RAW_TO_STD)

# Parse the date column into pandas datetimes.
# NOTE(review): the recorded notebook output shows a warning pointing at this
# line; if it is a format-inference warning, pass an explicit format= -- confirm.
df["date"] = pd.to_datetime(df["date"])

# Every mapped column except "date" is a measurement. Deriving the list from the
# mapping keeps the two in sync and guarantees no column is converted twice.
numeric_cols = [std_name for std_name in RAW_TO_STD.values() if std_name != "date"]

# errors="coerce" turns unparseable entries into NaN instead of raising.
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors="coerce")

# Daily aggregation: mean of each numeric column per calendar day.
# Timestamps are floored to midnight first so rows that carry a time-of-day
# still land in their day's group (a no-op when every value is already midnight).
# groupby sorts its keys, so the result is already in chronological order.
daily = (
    df
    .assign(date=df["date"].dt.normalize())
    .groupby("date", as_index=False)
    .mean(numeric_only=True)
)

# Without at least one valid date the min/max below are NaT and date_range
# raises an opaque ValueError; raise the same exception type with a clear message.
if daily.empty:
    raise ValueError("Agrimonia data contains no rows with a valid date")

# Build a gap-free daily calendar spanning the observed range and left-join the
# aggregates onto it; days absent from the data become rows of NaN measurements.
first_day = daily["date"].min()
last_day = daily["date"].max()
all_days = pd.DataFrame({"date": pd.date_range(first_day, last_day, freq="D")})
daily = all_days.merge(daily, on="date", how="left")

# Write the result without the positional index column.
daily.to_csv(OUTPUT_PATH, index=False)
print(f"Saved clean daily Agrimonia data to {OUTPUT_PATH}")


  df["date"] = pd.to_datetime(df["date"])


Saved clean daily Agrimonia data to ../data/agrimonia_daily.csv
