In [1]:
import pandas as pd
import datetime as dt
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import logging
import re
from pathlib import Path
from typing import Optional

# ====================== Config ======================
# Fallback plotting window, used when the x-axis is not clipped to the
# discovered date span.
START_DATE = dt.date(year=2000, month=1, day=1)
END_DATE = dt.date(year=2025, month=11, day=5)

# When True, the x-axis is clipped to the min..max dates actually discovered.
LIMIT_TO_FOUND_SPAN = True

# Provider folder name -> (display label, short cadence/resolution note).
# Only used for nicer panel titles; unknown folders get a prettified name.
KNOWN_PROVIDER_META = {
    "tomorrow_io": (
        "Tomorrow.io",
        "No cached data yet (expected 5m/1h timeline)",
    ),
    "open_meteo": ("Open-Meteo", "Hourly cadence (1h)"),
    "visual_crossing": ("Visual Crossing", "Hourly cadence (1h)"),
    "noaa_isd": ("NOAA ISD", "Sub-hourly METAR (median ~53 min)"),
    "noaa_lcd": ("NOAA LCD", "Sub-hourly LCD (median ~53 min)"),
    "meteostat": ("Meteostat", "Hourly multi-source blend (1h)"),
    "nasa_power": ("NASA POWER", "Hourly NASA POWER (satellite/model)"),
    "iem_asos": ("IEM ASOS", "1-min ASOS observations"),
    "copernicus_era5_single": (
        "Copernicus ERA5 (single)",
        "Hourly ERA5 single levels",
    ),
    "copernicus_era5_land": (
        "Copernicus ERA5-Land",
        "Hourly ERA5-Land (0.1 deg grid)",
    ),
    "copernicus_era5_pressure": (
        "Copernicus ERA5 (pressure)",
        "Hourly ERA5 pressure levels (0.25 deg grid)",
    ),
    "copernicus_era5_land_timeseries": (
        "Copernicus ERA5-Land TS",
        "Hourly ERA5-Land point series",
    ),
    "openweather": ("OpenWeather", "Hourly observations (1h)"),
    "weatherbit": ("Weatherbit", "No cached data yet"),
    "weatherapi_com": ("WeatherAPI.com", "Hourly forecast/history (1h)"),
    "copernicus_cds": ("Copernicus CDS", "Daily CSV exports"),
}

# ====================== Locate data/ ======================
def find_data_root(start: Optional[Path] = None, max_up: int = 6) -> Path:
    """
    Locate the project's 'data' folder by walking up from a starting directory.

    The search checks ``start`` itself and then each parent in turn, giving up
    after ``max_up`` levels above ``start``. A 'data' directory only counts if
    it contains at least one subdirectory or at least one nested entry whose
    name contains a dot; empty 'data' folders are skipped.

    Args:
        start: Directory to begin the search from. Defaults to the current
            working directory.
        max_up: Highest number of parent levels to climb (0 = only ``start``).

    Returns:
        The nearest qualifying 'data' directory, or ``start / "data"`` (which
        may not exist) when none qualifies.
    """
    start = start or Path.cwd()
    for depth, base in enumerate([start, *start.parents]):
        if depth > max_up:
            break
        cand = base / "data"
        if not cand.is_dir():
            continue
        # Return on the first hit: the nearest match is the only one ever
        # used, so probing further ancestors would just walk unrelated trees
        # (and could fail on an unreadable 'data' folder higher up).
        # `or` also skips the recursive glob once a subdirectory is seen.
        if any(p.is_dir() for p in cand.iterdir()) or any(cand.rglob("*.*")):
            return cand
    return start / "data"

# All output locations hang off the discovered data root: logs live in a
# sibling 'logs' folder, the coverage image inside the data folder itself.
DATA_ROOT = find_data_root()
LOG_DIR = DATA_ROOT.parent.joinpath("logs")
LOG_DIR.mkdir(parents=True, exist_ok=True)  # created eagerly at import/run time
# NOTE(review): nothing in this section attaches LOG_FILE to the logging
# module -- confirm whether a handler is configured elsewhere.
LOG_FILE = LOG_DIR.joinpath("weather_export.log")
CACHE_IMAGE_PATH = DATA_ROOT.joinpath("cache_coverage.png")

# ====================== Discovery & parsing ======================
def pretty_label(key: str) -> str:
    """Turn a folder key such as 'open_meteo' into a title-cased display label."""
    # Underscores and hyphens both become spaces before title-casing.
    separators_to_spaces = str.maketrans("_-", "  ")
    spaced = key.translate(separators_to_spaces)
    return spaced.title()

def discover_providers(root: Path):
    """
    Return the immediate subdirectories of ``root``, sorted by path.

    Each subdirectory is treated as one provider folder. Plain files directly
    under ``root`` are ignored. An empty list is returned when ``root`` is
    missing or is not a directory (a regular file would otherwise make
    ``iterdir()`` raise).
    """
    if not root.is_dir():
        return []
    return sorted(p for p in root.iterdir() if p.is_dir())

def find_locations(provider_dir: Path):
    """
    Return the immediate subdirectories of ``provider_dir``, sorted by path.

    Each subdirectory is treated as one location folder. Plain files directly
    under ``provider_dir`` are ignored. An empty list is returned when
    ``provider_dir`` is missing or is not a directory (a regular file would
    otherwise make ``iterdir()`` raise).
    """
    if not provider_dir.is_dir():
        return []
    return sorted(p for p in provider_dir.iterdir() if p.is_dir())

# Matches an ISO-style YYYY-MM-DD token anywhere in a string.
DATE_RE = re.compile(r"(\d{4}-\d{2}-\d{2})")

def extract_date_from_name(path: Path) -> Optional[dt.date]:
    """
    Return the first valid calendar date embedded in the file's stem.

    Only the stem (final path component without its last suffix) is examined.
    Every YYYY-MM-DD-shaped token is tried in order, so a digit run that is
    not a real date (e.g. '0000-00-00' or '2024-13-40') no longer hides a
    valid date that appears later in the same name.

    Returns:
        The parsed date, or None when the stem holds no valid date.
    """
    for match in DATE_RE.finditer(path.stem):
        try:
            return dt.date.fromisoformat(match.group(1))
        except ValueError:
            # Shaped like a date but not a real one; try the next token.
            continue
    return None

def collect_dates(location_dir: Path):
    """
    Gather the distinct dates encoded in data-file names under a location.

    The directory is searched recursively for .csv, .json, .parquet and
    .feather entries; each name is passed to extract_date_from_name and
    names without a recognisable date (e.g. not like '2024-07-01_12h.csv')
    are ignored. Returns a set of dates, so several files for the same day
    count once.
    """
    found = set()
    for pattern in ("*.csv", "*.json", "*.parquet", "*.feather"):
        parsed = (extract_date_from_name(hit) for hit in location_dir.rglob(pattern))
        found.update(day for day in parsed if day)
    return found

# ====================== Scan & preflight ======================
providers = discover_providers(DATA_ROOT)

# One (folder key, display label, resolution note, directory) tuple per
# provider folder; folders missing from KNOWN_PROVIDER_META fall back to a
# prettified folder name and an empty note.
provider_meta = [
    (
        pdir.name,
        *KNOWN_PROVIDER_META.get(pdir.name, (pretty_label(pdir.name), "")),
        pdir,
    )
    for pdir in providers
]

summary_lines = []
global_min: Optional[dt.date] = None
global_max: Optional[dt.date] = None

# provider_payload: (key, label, resolution, [(location_name, dates_set)])
provider_payload = []

for key, label, resolution, pdir in provider_meta:
    loc_payload = [
        (loc_dir.name, collect_dates(loc_dir)) for loc_dir in find_locations(pdir)
    ]
    provider_payload.append((key, label, resolution, loc_payload))

    date_sets = [dates for _, dates in loc_payload]

    # Stretch the overall span with every non-empty location.
    for dates in filter(None, date_sets):
        dmin, dmax = min(dates), max(dates)
        global_min = dmin if global_min is None else min(global_min, dmin)
        global_max = dmax if global_max is None else max(global_max, dmax)

    # The "dated files" figure is the sum of set sizes, i.e. distinct dates
    # per location rather than a raw file count.
    total_locs = len(loc_payload)
    total_files = sum(map(len, date_sets))
    summary_lines.append(f"- {label}: {total_locs} locations, {total_files} dated files")

# The span bounds are only ever set when some location produced a date.
any_dates_found = global_min is not None

print("== Cache preflight ==")
print(f"DATA_ROOT: {DATA_ROOT.resolve()}")
if summary_lines:
    print("\n".join(summary_lines))
else:
    print("(no providers discovered)")

if not any_dates_found:
    print("No dated files found. If this is unexpected, confirm the data root above matches your project tree.")
else:
    print(f"Found date span: {global_min} -> {global_max}")

# ====================== Build index ======================
# Daily x-axis: the discovered span when clipping is on and dates exist,
# otherwise the configured START_DATE..END_DATE window.
clip_to_found = any_dates_found and LIMIT_TO_FOUND_SPAN
span_start, span_end = (global_min, global_max) if clip_to_found else (START_DATE, END_DATE)
date_index = pd.date_range(span_start, span_end, freq="D")

# Plain date objects for set-membership tests against each location's dates.
index_days = [ts.date() for ts in date_index]

# One panel per provider: (label, resolution, location names, 0/1 matrix with
# one row per location and one column per day). Locations without any dates
# are left out, and so are providers with no cached day inside the index.
panels = []
for key, label, resolution, loc_payload in provider_payload:
    populated = [(loc_name, dates) for loc_name, dates in loc_payload if dates]
    loc_names = [loc_name for loc_name, _ in populated]
    rows = [[int(day in dates) for day in index_days] for _, dates in populated]
    if any(any(r) for r in rows):
        panels.append((label, resolution, loc_names, np.asarray(rows, dtype=int)))

# ====================== Plot ======================
def _save_coverage_figure(fig) -> None:
    """Write ``fig`` to CACHE_IMAGE_PATH at 150 dpi, close it, and log the path."""
    CACHE_IMAGE_PATH.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(CACHE_IMAGE_PATH, dpi=150)
    plt.close(fig)
    # NOTE(review): no logging handler is configured in the visible code, and
    # the root logger drops INFO records by default -- confirm logging is set
    # up elsewhere if this message is expected to appear.
    logging.info("Saved cached coverage chart to %s", CACHE_IMAGE_PATH)


if not panels:
    # Nothing to draw: still emit a small placeholder image so the output
    # file exists and states why it is empty.
    fig, ax = plt.subplots(figsize=(10, 2), constrained_layout=True)
    ax.text(0.5, 0.5, "No cached files found in the discovered date span",
            ha="center", va="center")
    ax.axis("off")
    _save_coverage_figure(fig)
else:
    cmap = matplotlib.colors.ListedColormap(["#f0f0f0", "#2ca02c"])  # 0=missing, 1=cached
    norm = matplotlib.colors.Normalize(vmin=0, vmax=1)
    # Stand-alone mappable so the colorbar can be built before any image exists.
    mappable = matplotlib.cm.ScalarMappable(norm=norm, cmap=cmap)
    mappable.set_array([])

    # One stacked panel per provider, all sharing the same daily x-axis.
    nrows = len(panels)
    fig, axes = plt.subplots(
        nrows, 1,
        figsize=(16, 2.2 * nrows),
        sharex=True,
        constrained_layout=True
    )
    if nrows == 1:
        # plt.subplots returns a bare Axes for a 1x1 grid; normalise to a list.
        axes = [axes]

    # Slim, top colorbar (fallback if this Matplotlib rejects 'location')
    try:
        cbar = fig.colorbar(
            mappable, ax=axes, orientation="horizontal",
            fraction=0.02, pad=0.02, location="top"
        )
    except TypeError:
        cbar = fig.colorbar(
            mappable, ax=axes, orientation="horizontal",
            fraction=0.02, pad=0.02
        )
    cbar.set_ticks([0, 1])
    cbar.set_ticklabels(["Missing", "Cached"])
    cbar.ax.tick_params(labelsize=9)

    # Every panel uses the same date axis, so work out the (at most 8) evenly
    # spaced tick positions and their labels once instead of per panel.
    tick_count = min(len(date_index), 8)
    tick_positions = np.linspace(0, len(date_index) - 1, tick_count, dtype=int)
    tick_labels = [date_index[i].date().isoformat() for i in tick_positions]

    for ax, (label, resolution, loc_names, data) in zip(axes, panels):
        # Rows are locations, columns are days; values are 0 (missing) / 1 (cached).
        ax.imshow(data, aspect="auto", interpolation="nearest", cmap=cmap, vmin=0, vmax=1)
        ax.set_yticks(range(len(loc_names)))
        ax.set_yticklabels(loc_names, fontsize=9)

        if tick_count > 0:
            ax.set_xticks(tick_positions)
            ax.set_xticklabels(tick_labels, rotation=40, ha="right", fontsize=9)

        title = f"{label}" + (f" ({resolution})" if resolution else "")
        ax.set_title(title, fontsize=11)
        ax.set_ylabel("Location", fontsize=9)

    axes[-1].set_xlabel("Date", fontsize=10)
    # The dash is written as an escape so it cannot be mangled by a wrong
    # file encoding again (the title previously rendered as mojibake).
    fig.suptitle(
        f"Cached coverage {date_index[0].date().isoformat()} \u2014 {date_index[-1].date().isoformat()}",
        fontsize=12
    )

    _save_coverage_figure(fig)

== Cache preflight ==
DATA_ROOT: /Volumes/Untitled/DL Project/data
- Copernicus CDS: 1 locations, 60 dated files
- Copernicus ERA5-Land: 1 locations, 329 dated files
- Copernicus ERA5-Land TS: 1 locations, 0 dated files
- Copernicus ERA5 (pressure): 1 locations, 316 dated files
- Copernicus ERA5 (single): 1 locations, 3513 dated files
- IEM ASOS: 5 locations, 41974 dated files
- Meteostat: 5 locations, 27846 dated files
- NASA POWER: 5 locations, 45375 dated files
- NOAA ISD: 5 locations, 44666 dated files
- NOAA LCD: 5 locations, 44654 dated files
- Open-Meteo: 5 locations, 47205 dated files
- OpenWeather: 5 locations, 1820 dated files
- Tomorrow.io: 5 locations, 0 dated files
- Visual Crossing: 5 locations, 45 dated files
- WeatherAPI.com: 5 locations, 1830 dated files
- Weatherbit: 5 locations, 0 dated files
Found date span: 2000-01-01 -> 2025-11-05
