### Download inventory of data from the NCEI Integrated Surface Database
https://www.ncei.noaa.gov/products/land-based-station/integrated-surface-database  

Then parse the inventory .csv file to find the stations with the longest records in the vicinity of Pamlico Sound.

In [2]:
import pandas as pd
import numpy as np
import gzip
import requests
from pathlib import Path

# ----------------------------
# Files + URLs
# ----------------------------
# Local cache directory for the ISD metadata files (created on first run).
DATA_DIR = Path("isd_meta")
DATA_DIR.mkdir(exist_ok=True)

# Where the two metadata files are stored locally.
HISTORY = DATA_DIR.joinpath("isd-history.csv")
INVENTORY_Z = DATA_DIR.joinpath("isd-inventory.csv.z")

# Remote locations of the same two files.
URL_HISTORY = "https://www.ncei.noaa.gov/pub/data/noaa/isd-history.csv"
URL_INVENTORY = "https://www.ncei.noaa.gov/pub/data/noaa/isd-inventory.csv.z"

def download_if_missing(url, outfile):
    """Download ``url`` to ``outfile`` unless ``outfile`` already exists.

    The response body is streamed into a sibling ``<name>.part`` file and is
    renamed to ``outfile`` only after the whole transfer has succeeded.  An
    interrupted or failed download therefore never leaves a truncated file
    at ``outfile`` that a later run would mistake for a complete copy.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    outfile : pathlib.Path
        Destination path; nothing is done if it already exists.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems or a non-success HTTP status.
    """
    if outfile.exists():
        return

    print(f"Downloading {outfile.name}")
    partial = outfile.with_name(outfile.name + ".part")
    try:
        # The context manager releases the connection even if writing fails.
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Only a fully written file is moved into place.
        partial.replace(outfile)
    finally:
        # After a successful rename the partial file no longer exists; on any
        # failure this removes the incomplete download.
        if partial.exists():
            partial.unlink()

# Fetch both metadata files; files already on disk are left untouched.
for _url, _target in ((URL_HISTORY, HISTORY), (URL_INVENTORY, INVENTORY_Z)):
    download_if_missing(_url, _target)

# ----------------------------
# Pamlico Sound-ish bounding box
# ----------------------------
# Latitude limits (north, south) and longitude limits (west, east), degrees.
N, S = 36.8, 34.7
W, E = -77.8, -74.8

# ----------------------------
# Helper functions
# ----------------------------
def pad_usaf(x):
    """Return the USAF id as a stripped string, zero-padded to 6 characters.

    Values that are not purely numeric after stripping are returned
    unpadded.
    """
    code = str(x).strip()
    if not code.isdigit():
        return code
    return code.zfill(6)

def pad_wban(x):
    """Return the WBAN id as a stripped string, zero-padded to 5 characters.

    Values that are not purely numeric after stripping are returned
    unpadded.
    """
    code = str(x).strip()
    if code.isdigit():
        # For an all-digit string, left-justifying with "0" equals zfill.
        return code.rjust(5, "0")
    return code


# ----------------------------
# Load station history
# ----------------------------
hist = pd.read_csv(HISTORY)

# Keep US stations that fall inside the bounding box.
in_us = hist["CTRY"].eq("US")
in_lat_band = hist["LAT"].between(S, N)
in_lon_band = hist["LON"].between(W, E)
cand = hist.loc[in_us & in_lat_band & in_lon_band].copy()

# Station key = zero-padded USAF id followed by zero-padded WBAN id.
cand["USAF_s"] = cand["USAF"].map(pad_usaf)
cand["WBAN_s"] = cand["WBAN"].map(pad_wban)
cand["key"] = cand["USAF_s"] + cand["WBAN_s"]

# ----------------------------
# Load inventory (gzip)
# ----------------------------
with gzip.open(INVENTORY_Z, "rt") as f:
    inv = pd.read_csv(f, low_memory=False)

inv["USAF_s"] = inv["USAF"].map(pad_usaf)
inv["WBAN_s"] = inv["WBAN"].map(pad_wban)
inv["key"] = inv["USAF_s"] + inv["WBAN_s"]

# Monthly observation counts: coerce to numbers, treating bad/missing as 0.
month_cols = ["JAN","FEB","MAR","APR","MAY","JUN","JUL","AUG","SEP","OCT","NOV","DEC"]
inv[month_cols] = inv[month_cols].apply(pd.to_numeric, errors="coerce").fillna(0)

# Restrict the inventory to the candidate stations and total each year.
is_candidate = inv["key"].isin(cand["key"])
inv = inv.loc[is_candidate].copy()
inv["year_total"] = inv[month_cols].sum(axis=1)

# One summary row per station key.
per_station = inv.groupby("key")
rank = per_station.agg(
    total_obs=("year_total", "sum"),
    first_year=("YEAR", "min"),
    last_year=("YEAR", "max"),
    years_with_data=("year_total", lambda s: int((s > 0).sum())),
).reset_index()

# Attach descriptive station metadata (one row per key).
station_info = cand.drop_duplicates("key")[
    ["key","STATION NAME","ICAO","LAT","LON","USAF_s","WBAN_s"]
]
rank = rank.merge(station_info, on="key", how="left")

# Earliest start first; ties broken by more years with data, then more obs.
rank = rank.sort_values(
    by=["first_year", "years_with_data", "total_obs"],
    ascending=[True, False, False],
)

summary_cols = [
    "STATION NAME","ICAO","LAT","LON",
    "first_year","last_year","years_with_data",
]
print(rank[summary_cols].head(10))

# ----------------------------
# Example: build download URLs
# ----------------------------
# One global-hourly URL per year of the top-ranked station's record.
top = rank.iloc[0]
usafwban = top["USAF_s"] + top["WBAN_s"]

years = range(int(top["first_year"]), int(top["last_year"]) + 1)
urls = []
for y in years:
    urls.append(f"https://www.ncei.noaa.gov/data/global-hourly/access/{y}/{usafwban}.csv")

print("Example URLs:", urls[:3], "...", urls[-3:])


                   STATION NAME  ICAO     LAT     LON  first_year  last_year  \
37            CHERRY POINT MCAS  KNKT  34.900 -76.883        1945       1972   
38                FENTRESS NAAS  KNFE  36.695 -76.136        1945       2004   
40                  MANTEO NAAS  KMQI  35.917 -75.700        1945       2004   
39                FRANKLIN NAAS  KFKN  36.698 -76.903        1945       1945   
29                    MONROE CO  KECG  36.261 -76.175        1949       1990   
42   NEW BERN CRAVEN CO REGL AP  KEWN  35.068 -77.048        1949       1955   
43        KINSTON STALLINGS AFB  KISO  35.317 -77.633        1954       1957   
44               NEW RIVER MCAS  KNCA  34.700 -77.383        1955       1972   
45   HATTERAS BILLY MITCHELL AP  KHSE  35.233 -75.622        1957       1972   
41  ELIZABETH CITY MUNICIPAL AP  KECG  36.261 -76.175        1971       1998   

    years_with_data  
37               28  
38                7  
40                3  
39                1  
29       

#### Check to see if there is older hourly data from KHSE

In [3]:
import pandas as pd
from pathlib import Path
import requests

def ensure_dir(p):
    """Create directory ``p`` (and any missing parents) and return it as a Path."""
    directory = Path(p)
    directory.mkdir(parents=True, exist_ok=True)
    return directory

def pad_id(x, width):
    """Return ``x`` as a stripped string, zero-padded to ``width`` if all digits.

    Non-numeric identifiers are returned stripped but otherwise unchanged.
    """
    token = str(x).strip()
    if token.isdigit():
        return token.zfill(width)
    return token

# Make sure a local copy of the station history file is available.
meta_dir = ensure_dir("isd_meta")
hist_path = meta_dir / "isd-history.csv"
if not hist_path.exists():
    url = "https://www.ncei.noaa.gov/pub/data/noaa/isd-history.csv"
    r = requests.get(url, timeout=(10,60))
    r.raise_for_status()
    hist_path.write_bytes(r.content)

hist = pd.read_csv(hist_path)

# KHSE/Billy Mitchell approximate coords
lat0, lon0 = 35.233, -75.622

# Combined station id: zero-padded USAF (6) followed by zero-padded WBAN (5).
hist["USAF_s"] = hist["USAF"].map(lambda v: pad_id(v, 6))
hist["WBAN_s"] = hist["WBAN"].map(lambda v: pad_id(v, 5))
hist["usafwban"] = hist["USAF_s"] + hist["WBAN_s"]

# crude bbox around KHSE (tweak if needed): +/- 0.25 degrees on each axis
near_lat = hist["LAT"].between(lat0-0.25, lat0+0.25)
near_lon = hist["LON"].between(lon0-0.25, lon0+0.25)
cand = hist.loc[near_lat & near_lon].copy()

# Normalise the text columns used for matching.
cand["ICAO"] = cand["ICAO"].astype(str).str.strip()
cand["STATION NAME"] = cand["STATION NAME"].astype(str)

# Keep rows whose name looks like Hatteras/Billy Mitchell or whose ICAO is KHSE.
name_hit = cand["STATION NAME"].str.contains("HATTERAS|BILLY|MITCHELL", case=False, na=False)
icao_hit = cand["ICAO"].eq("KHSE")
mask = name_hit | icao_hit

# Trim to the columns of interest, ordered by period of record.
report_cols = ["STATION NAME","ICAO","LAT","LON","usafwban","BEGIN","END"]
cand = cand.loc[mask, report_cols].sort_values(["BEGIN","END"])
print(cand.to_string(index=False))


              STATION NAME ICAO    LAT     LON    usafwban    BEGIN      END
                  HATTERAS  nan 35.217 -75.683 99999913745 19480101 19570301
HATTERAS BILLY MITCHELL AP KHSE 35.233 -75.622 99999993729 19570301 19721231
             CAPE HATTERAS KHSE 35.233 -75.622 72304093729 19730101 19951231
    BILLY MITCHELL AIRPORT KHSE 35.232 -75.622 72313993729 19960101 20250827
        CAPE HATTERAS  NC. KHAT 35.267 -75.550 72304099999 19960322 19971231
                  HATTERAS  nan 35.209 -75.704 99842699999 20100820 20250824


#### It looks like there is, so run this to try and get all of the data

In [6]:
import pandas as pd
import numpy as np
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from pathlib import Path
import time

# ----------------------------
# Helpers
# ----------------------------
def ensure_dir(p):
    """Return ``p`` as a Path, creating the directory (with parents) if needed."""
    target = Path(p)
    if not target.is_dir():
        target.mkdir(parents=True, exist_ok=True)
    return target

def year_from_begin_end(x):
    """
    Extract the year from an isd-history BEGIN/END value.

    Values that start with four digits (e.g. YYYYMMDD or YYYY-MM-DD) yield
    those digits as the year.  Anything else is handed to pandas' date
    parser; None is returned when that fails too.
    """
    text = str(x).strip()
    leading = text[:4]
    if len(leading) == 4 and leading.isdigit():
        return int(leading)

    # Fallback for values that do not begin with a four-digit year.
    parsed = pd.to_datetime(text, errors="coerce")
    return None if pd.isna(parsed) else int(parsed.year)

def make_retry_session(retries=5, backoff=0.6, status_forcelist=(429,500,502,503,504)):
    """Build a requests Session whose GET/HEAD calls are retried automatically.

    ``retries`` is used for the total, connect, read and status retry
    budgets; ``backoff`` is the urllib3 backoff factor; responses whose
    status is in ``status_forcelist`` are retried.  ``raise_on_status`` is
    off, so the retry layer does not raise on those statuses itself.
    """
    policy = Retry(
        total=retries,
        connect=retries,
        read=retries,
        status=retries,
        backoff_factor=backoff,
        status_forcelist=status_forcelist,
        allowed_methods=frozenset(["GET", "HEAD"]),
        raise_on_status=False,
    )
    # A single adapter instance is mounted for both URL schemes.
    shared_adapter = HTTPAdapter(max_retries=policy, pool_connections=10, pool_maxsize=10)

    session = requests.Session()
    for scheme in ("https://", "http://"):
        session.mount(scheme, shared_adapter)
    return session

def download_global_hourly_csv(usafwban, year, out_dir, session,
                               connect_timeout=10, read_timeout=120,
                               use_head=True, verbose=True):
    """
    Download https://www.ncei.noaa.gov/data/global-hourly/access/{YEAR}/{USAFWBAN}.csv

    The file is cached as ``{out_dir}/{usafwban}_{year}.csv``; an existing
    non-empty file is reused without any network access.  New data is first
    written to a sibling ``.part`` file and then renamed, so an interrupted
    write can never leave a truncated file that a later run would accept as
    a valid cache entry.

    Parameters
    ----------
    usafwban : str
        Concatenated USAF + WBAN station identifier.
    year : int
        Year of the station-year file.
    out_dir : str or pathlib.Path
        Cache directory (created if missing).
    session : requests.Session-like
        Object providing ``head`` and ``get``.
    connect_timeout, read_timeout : float
        Passed to requests as the ``(connect, read)`` timeout tuple.
    use_head : bool
        If true, issue a HEAD request first and skip the GET on a 404.
    verbose : bool
        If true, print progress lines.

    Returns
    -------
    pathlib.Path or None
        Local path, or None if the server returned 404 or the request failed.
    """
    out_dir = ensure_dir(out_dir)
    f = out_dir / f"{usafwban}_{year}.csv"
    url = f"https://www.ncei.noaa.gov/data/global-hourly/access/{year}/{usafwban}.csv"
    timeout = (connect_timeout, read_timeout)

    if f.exists() and f.stat().st_size > 0:
        if verbose:
            print(f"[ISD] {usafwban} {year}: cached ({f.stat().st_size/1e6:.1f} MB)")
        return f

    # Temporary target; only renamed to ``f`` once fully written.
    partial = f.with_name(f.name + ".part")

    try:
        if verbose:
            print(f"[ISD] {usafwban} {year}: {url}")

        if use_head:
            # Cheap existence probe; any status other than 404 falls through
            # to the GET below.
            h = session.head(url, timeout=timeout)
            if h.status_code == 404:
                if verbose:
                    print(f"[ISD] {usafwban} {year}: 404")
                return None

        t0 = time.time()
        r = session.get(url, timeout=timeout)
        if r.status_code == 404:
            if verbose:
                print(f"[ISD] {usafwban} {year}: 404")
            return None
        r.raise_for_status()

        partial.write_bytes(r.content)
        partial.replace(f)
        if verbose:
            print(f"[ISD] {usafwban} {year}: downloaded ({f.stat().st_size/1e6:.1f} MB) in {time.time()-t0:.1f}s")
        return f

    except requests.exceptions.RequestException as e:
        # Best effort: a failed station-year is reported and skipped.
        if verbose:
            print(f"[ISD] {usafwban} {year}: ERROR {e}")
        return None

    finally:
        # Present only if the write or rename did not complete.
        if partial.exists():
            partial.unlink()

def parse_isd_wind_from_global_hourly(csv_path):
    """
    Reduce a global-hourly station-year CSV to a minimal wind record.

    The WND column is split on commas; field 0 is used as direction (999
    becomes NaN) and field 3 as speed (9999 becomes NaN, then divided by 10).
    Rows whose DATE cannot be parsed are dropped and the result is sorted
    by time.

    Returns a DataFrame with columns ``time`` (UTC), ``wind_dir_from_deg``
    and ``wind_speed_mps``.  Raises RuntimeError if WND splits into fewer
    than four fields.
    """
    table = pd.read_csv(csv_path, low_memory=False)

    # Timestamps: unparseable values become NaT and are removed below.
    table["DATE"] = pd.to_datetime(table["DATE"], errors="coerce", utc=True)

    # Break the wind group into its comma-separated fields.
    fields = table["WND"].astype(str).str.split(",", expand=True)
    if fields.shape[1] < 4:
        raise RuntimeError(f"WND parsing failed for {csv_path}")

    direction = pd.to_numeric(fields[0], errors="coerce")
    direction = direction.replace(999, np.nan)          # deg FROM

    speed = pd.to_numeric(fields[3], errors="coerce")
    speed = speed.replace(9999, np.nan) / 10.0          # m/s

    record = pd.DataFrame({
        "time": table["DATE"],
        "wind_dir_from_deg": direction,
        "wind_speed_mps": speed,
    })
    record = record.dropna(subset=["time"])
    return record.sort_values("time").reset_index(drop=True)

# ----------------------------
# Main: download + combine multiple station IDs
# ----------------------------
def download_and_merge_isd_winds_from_candidates(
    cand,
    out_dir="isd_multi_khse",
    connect_timeout=10,
    read_timeout=120,
    use_head=True,
    verbose=True,
    prefer_order=None,
):
    """
    Download every station-year file for the candidate station IDs, parse the
    wind fields, and merge them into one time series.

    cand: DataFrame with columns: usafwban, BEGIN, END, (optional ICAO, STATION NAME)
    out_dir: output directory; raw files go to out_dir/"raw", the merged and
             audit CSVs are written directly into out_dir.
    connect_timeout, read_timeout, use_head, verbose: forwarded to
             download_global_hourly_csv (verbose also controls the prints here).
    prefer_order: list of usafwban to prefer if timestamps overlap.
                  If None, candidates are sorted by descending END year (then
                  descending BEGIN year) and that order is used; note that
                  this also changes the order in which stations are downloaded.

    Returns a 4-tuple: (merged DataFrame, audit DataFrame, merged_path, audit_path).

    Raises ValueError if cand lacks usafwban/BEGIN/END, and RuntimeError if
    no file could be parsed for any candidate.
    """
    cand = cand.copy()

    # Ensure required columns
    if "usafwban" not in cand.columns:
        raise ValueError("cand must include a 'usafwban' column (USAF+WBAN concatenated).")
    if "BEGIN" not in cand.columns or "END" not in cand.columns:
        raise ValueError("cand must include BEGIN and END columns.")

    # First/last year of each record; rows whose BEGIN or END year could not
    # be extracted (year_from_begin_end returned None) are dropped.
    cand["y0"] = cand["BEGIN"].apply(year_from_begin_end)
    cand["y1"] = cand["END"].apply(year_from_begin_end)
    cand = cand.dropna(subset=["y0","y1"])
    cand["y0"] = cand["y0"].astype(int)
    cand["y1"] = cand["y1"].astype(int)

    # Preference order for overlaps
    if prefer_order is None:
        # default: prefer records with the latest END (often the modern station)
        cand = cand.sort_values(["y1","y0"], ascending=[False, False])
        prefer_order = cand["usafwban"].tolist()
    # Lower rank number = higher preference when timestamps collide.
    prefer_rank = {k:i for i,k in enumerate(prefer_order)}

    session = make_retry_session()
    out_dir = ensure_dir(out_dir)

    audit_rows = []   # one dict per station-year attempted (plus parse failures)
    all_parts = []    # one concatenated DataFrame per station with parsed data

    for _, row in cand.iterrows():
        usafwban = str(row["usafwban"]).strip()
        y0, y1 = int(row["y0"]), int(row["y1"])

        if verbose:
            name = row["STATION NAME"] if "STATION NAME" in row else ""
            icao = row["ICAO"] if "ICAO" in row else ""
            print(f"\n[ISD] Candidate: {usafwban}  {icao}  {name}  years {y0}-{y1}")

        # Attempt every year in the station's period of record (inclusive).
        files = []
        for year in range(y0, y1 + 1):
            f = download_global_hourly_csv(
                usafwban, year, out_dir=out_dir / "raw",
                session=session,
                connect_timeout=connect_timeout, read_timeout=read_timeout,
                use_head=use_head, verbose=verbose
            )
            audit_rows.append({
                "usafwban": usafwban,
                "year": year,
                "downloaded": f is not None,
                "path": str(f) if f is not None else ""
            })
            if f is not None:
                files.append(f)

        if not files:
            continue

        # Parse files
        parts = []
        for f in files:
            try:
                d = parse_isd_wind_from_global_hourly(f)
                d["usafwban"] = usafwban
                parts.append(d)
            except Exception as e:
                # A file that fails to parse is skipped; it gets a second
                # audit row carrying the error text. The year is recovered
                # from the "<usafwban>_<year>.csv" file name.
                audit_rows.append({
                    "usafwban": usafwban,
                    "year": int(Path(f).stem.split("_")[-1]),
                    "downloaded": True,
                    "path": str(f),
                    "parse_error": str(e)
                })

        if parts:
            all_parts.append(pd.concat(parts, ignore_index=True))

    audit = pd.DataFrame(audit_rows)

    if not all_parts:
        raise RuntimeError("No data parsed from any candidate station IDs.")

    raw = pd.concat(all_parts, ignore_index=True)
    raw = raw.dropna(subset=["time"]).sort_values(["time"])

    # De-duplicate: keep preferred station record when multiple IDs report same timestamp
    # Station IDs missing from prefer_order get rank 9999, i.e. lowest preference.
    raw["prefer_rank"] = raw["usafwban"].map(lambda x: prefer_rank.get(x, 9999)).astype(int)
    raw = raw.sort_values(["time", "prefer_rank"])

    # After the sort, the first row of each timestamp is the most preferred one.
    merged = raw.drop_duplicates(subset=["time"], keep="first").copy()
    merged = merged.drop(columns=["prefer_rank"]).reset_index(drop=True)

    # Save outputs
    # NOTE(review): the output file names are fixed ("KHSE_area_...") whatever
    # stations are passed in cand.
    merged_path = out_dir / "KHSE_area_ISD_wind_merged.csv"
    audit_path  = out_dir / "KHSE_area_ISD_download_audit.csv"
    merged.to_csv(merged_path, index=False)
    audit.to_csv(audit_path, index=False)

    if verbose:
        print(f"\n[ISD] Merged rows: {len(merged):,}")
        print(f"[ISD] Saved merged: {merged_path}")
        print(f"[ISD] Saved audit : {audit_path}")

    return merged, audit, merged_path, audit_path


# Keep only the candidate rows whose (whitespace-stripped) ICAO is KHSE.
cand_khse = cand.assign(ICAO=cand["ICAO"].astype(str).str.strip())
cand_khse = cand_khse.loc[cand_khse["ICAO"].eq("KHSE")].copy()

# Sanity check: show the station IDs that are about to be downloaded.
khse_cols = ["STATION NAME","ICAO","LAT","LON","usafwban","BEGIN","END"]
print(cand_khse[khse_cols].to_string(index=False))

# Download and merge into a NEW directory so the run starts clean.
# prefer_order lists the station IDs from newest to oldest record, so the
# newest station wins wherever timestamps overlap.
merged_wind, audit, merged_path, audit_path = download_and_merge_isd_winds_from_candidates(
    cand_khse,
    out_dir="isd_KHSE_only",
    prefer_order=["72313993729","72304093729","99999993729"],
    connect_timeout=10,
    read_timeout=120,
    use_head=True,
    verbose=True,
)


              STATION NAME ICAO    LAT     LON    usafwban    BEGIN      END
HATTERAS BILLY MITCHELL AP KHSE 35.233 -75.622 99999993729 19570301 19721231
             CAPE HATTERAS KHSE 35.233 -75.622 72304093729 19730101 19951231
    BILLY MITCHELL AIRPORT KHSE 35.232 -75.622 72313993729 19960101 20250827

[ISD] Candidate: 99999993729  KHSE  HATTERAS BILLY MITCHELL AP  years 1957-1972
[ISD] 99999993729 1957: https://www.ncei.noaa.gov/data/global-hourly/access/1957/99999993729.csv
[ISD] 99999993729 1957: downloaded (3.1 MB) in 2.5s
[ISD] 99999993729 1958: https://www.ncei.noaa.gov/data/global-hourly/access/1958/99999993729.csv
[ISD] 99999993729 1958: downloaded (3.6 MB) in 0.5s
[ISD] 99999993729 1959: https://www.ncei.noaa.gov/data/global-hourly/access/1959/99999993729.csv
[ISD] 99999993729 1959: downloaded (3.8 MB) in 1.9s
[ISD] 99999993729 1960: https://www.ncei.noaa.gov/data/global-hourly/access/1960/99999993729.csv
[ISD] 99999993729 1960: downloaded (3.7 MB) in 0.7s
[ISD] 999999937