### Match hindcast and observed events with IBTRACS named storms

In [9]:
import os
import numpy as np
import pandas as pd
import requests

# CO-OPS 8654467 (USCG Station Hatteras) location (deg)
STATION_LAT = 35.208637
STATION_LON = -75.70417

RADIUS_KM = 100.0              # search radius for storm tracks
TIME_WINDOW_DAYS = 2.0         # search +/- this many days around each event time
MAX_MATCHES_PER_EVENT = 1      # return top N candidates per event (set 1 for only best)

# IBTrACS source: North Atlantic v04r01 (CSV)
IBTRACS_URL = "https://www.ncei.noaa.gov/data/international-best-track-archive-for-climate-stewardship-ibtracs/v04r01/access/csv/ibtracs.NA.list.v04r01.csv"
IBTRACS_LOCAL = "ibtracs.NA.list.v04r01.csv"


def haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle distance (km) between points."""
    R = 6371.0
    lat1 = np.deg2rad(lat1); lon1 = np.deg2rad(lon1)
    lat2 = np.deg2rad(lat2); lon2 = np.deg2rad(lon2)
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2.0)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2.0)**2
    return 2.0 * R * np.arcsin(np.sqrt(a))

def ensure_ibtracs_csv(url, local_path):
    if os.path.exists(local_path) and os.path.getsize(local_path) > 0:
        return local_path
    print(f"Downloading IBTrACS CSV -> {local_path} ...")
    r = requests.get(url, stream=True, timeout=300)
    r.raise_for_status()
    with open(local_path, "wb") as f:
        for chunk in r.iter_content(chunk_size=1024*1024):
            if chunk:
                f.write(chunk)
    return local_path

def load_ibtracs_points(csv_path):
    """
    Load a minimal set of IBTrACS columns needed for matching.
    Column names come from IBTrACS v04r01 CSV documentation.
    """
    usecols = [
        "SID", "NAME", "SEASON", "BASIN", "ISO_TIME",
        "USA_LAT", "USA_LON", "USA_WIND", "USA_PRES", "NATURE"
    ]
    df = pd.read_csv(csv_path, usecols=lambda c: c in usecols, low_memory=False)

    # Parse and clean
    df["ISO_TIME"] = pd.to_datetime(df["ISO_TIME"], utc=True, errors="coerce")
    df["USA_LAT"] = pd.to_numeric(df["USA_LAT"], errors="coerce")
    df["USA_LON"] = pd.to_numeric(df["USA_LON"], errors="coerce")
    df["USA_WIND"] = pd.to_numeric(df.get("USA_WIND"), errors="coerce")
    df["USA_PRES"] = pd.to_numeric(df.get("USA_PRES"), errors="coerce")

    df = df.dropna(subset=["ISO_TIME", "USA_LAT", "USA_LON"])
    # IBTrACS sometimes uses "NOT NAMED" or blanks; keep but label later
    df["NAME"] = df["NAME"].fillna("").astype(str).str.strip()
    return df

def match_events_to_ibtracs(event_times_utc, ib, lat0, lon0, radius_km=100, window_days=2, topn=1):
    out_rows = []
    dt = pd.Timedelta(days=float(window_days))

    for t in pd.to_datetime(event_times_utc, utc=True):
        t1, t2 = t - dt, t + dt
        sub = ib[(ib["ISO_TIME"] >= t1) & (ib["ISO_TIME"] <= t2)].copy()
        if len(sub) == 0:
            out_rows.append(dict(event_time=t, n_candidates=0))
            continue

        # Distance to station for each best-track point
        sub["dist_km"] = haversine_km(lat0, lon0, sub["USA_LAT"].values, sub["USA_LON"].values)

        # keep within radius
        sub = sub[sub["dist_km"] <= radius_km].copy()
        if len(sub) == 0:
            out_rows.append(dict(event_time=t, n_candidates=0))
            continue

        # Score: prioritize smallest distance, then smallest time offset
        sub["dt_hours"] = (sub["ISO_TIME"] - t).abs() / pd.Timedelta(hours=1)
        sub["score"] = sub["dist_km"] + 5.0 * sub["dt_hours"]  # weight time offsets (tunable)

        sub = sub.sort_values(["score", "dist_km", "dt_hours"]).head(int(topn))

        for _, r in sub.iterrows():
            name = r["NAME"] if r["NAME"] else "UNNAMED"
            out_rows.append(dict(
                event_time=t,
                storm_sid=r.get("SID", ""),
                storm_name=name,
                season=r.get("SEASON", np.nan),
                basin=r.get("BASIN", ""),
                nature=r.get("NATURE", ""),
                track_time=r["ISO_TIME"],
                storm_lat=r["USA_LAT"],
                storm_lon=r["USA_LON"],
                dist_km=float(r["dist_km"]),
                dt_hours=float(r["dt_hours"]),
                usa_wind_kt=r.get("USA_WIND", np.nan),
                usa_pres_mb=r.get("USA_PRES", np.nan),
                n_candidates=len(sub),
            ))

    return pd.DataFrame(out_rows)


# Input event timews from hindcast peaks table:
events = pd.read_csv( 'COOPS8654467_top20_hindcast_events_summary.csv' )
event_times = pd.to_datetime(events["peak_time_utc"], utc=True)

csv_path = ensure_ibtracs_csv(IBTRACS_URL, IBTRACS_LOCAL)
ib = load_ibtracs_points(csv_path)

matches = match_events_to_ibtracs(
    event_times_utc=event_times,
    ib=ib,
    lat0=STATION_LAT,
    lon0=STATION_LON,
    radius_km=RADIUS_KM,
    window_days=TIME_WINDOW_DAYS,
    topn=MAX_MATCHES_PER_EVENT
)
matches["peak_m"] = events["wl_pred_peak_m"]

print(matches.head(20).to_string(index=False))

# Optional: save results
matches.to_csv("hcast_events_to_ibtracs_matches.csv", index=False)
print("\nWrote: hcast_event_to_ibtracs_matches.csv")


               event_time     storm_sid storm_name season  basin nature                track_time  storm_lat  storm_lon   dist_km  dt_hours  usa_wind_kt  usa_pres_mb  n_candidates   peak_m
1958-09-28 04:00:00+00:00 1958264N17308     HELENE   1958    NaN     TS 1958-09-28 03:00:00+00:00       35.1      -74.9 74.100894       1.0        113.0        944.0             1 4.895494
1980-03-25 02:00:00+00:00           NaN        NaN    NaN    NaN    NaN                       NaT        NaN        NaN       NaN       NaN          NaN          NaN             0 4.121277
1980-07-04 08:00:00+00:00           NaN        NaN    NaN    NaN    NaN                       NaT        NaN        NaN       NaN       NaN          NaN          NaN             0 4.121277
1979-12-02 04:00:00+00:00           NaN        NaN    NaN    NaN    NaN                       NaT        NaN        NaN       NaN       NaN          NaN          NaN             0 4.121277
1981-03-27 07:00:00+00:00           NaN        NaN    N

  df["ISO_TIME"] = pd.to_datetime(df["ISO_TIME"], utc=True, errors="coerce")


In [10]:
# Top observed event times:

event_times = ["2016-10-09 10:00:00+00:00",
                "2019-09-06 15:00:00+00:00",
                "2016-09-03 17:00:00+00:00",
                "2010-09-03 09:00:00+00:00",
                "2012-10-29 03:00:00+00:00"]
obs_peak_m = [1.788, 1.658, 1.394, 1.267, 1.205]


matches = match_events_to_ibtracs(
    event_times_utc=event_times,
    ib=ib,
    lat0=STATION_LAT,
    lon0=STATION_LON,
    radius_km=RADIUS_KM,
    window_days=TIME_WINDOW_DAYS,
    topn=MAX_MATCHES_PER_EVENT
)

matches["peak_m"] = pd.Series(obs_peak_m, dtype=float)

print(matches.head(20).to_string(index=False))

# Optional: save results
matches.to_csv("obs_events_to_ibtracs_matches.csv", index=False)
print("\nWrote: obs_events_to_ibtracs_matches.csv")

               event_time     storm_sid storm_name season  basin nature                track_time  storm_lat  storm_lon   dist_km  dt_hours  usa_wind_kt  usa_pres_mb  n_candidates  peak_m
2016-10-09 10:00:00+00:00 2016273N13300    MATTHEW   2016    NaN     TS 2016-10-09 09:00:00+00:00       34.9      -75.3 50.311519       1.0         68.0        984.0             1   1.788
2019-09-06 15:00:00+00:00 2019236N10314     DORIAN   2019    NaN     TS 2019-09-06 12:30:00+00:00       35.2      -75.6  9.513230       2.5         85.0        956.0             1   1.658
2016-09-03 17:00:00+00:00 2016242N24279    HERMINE   2016    NaN     ET 2016-09-03 12:00:00+00:00       35.8      -75.5 68.304375       5.0         60.0        995.0             1   1.394
2010-09-03 09:00:00+00:00           NaN        NaN    NaN    NaN    NaN                       NaT        NaN        NaN       NaN       NaN          NaN          NaN             0   1.267
2012-10-29 03:00:00+00:00           NaN        NaN    NaN   