In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Inputs for this cell: the merged wind table and the directory of raw station-year files.
MERGED_CSV = "isd_KHSE_only/KHSE_area_ISD_wind_merged.csv"   # your merged winds
RAW_DIR    = Path("isd_KHSE_only/raw")                      # where station-year CSVs were saved
SAMPLE_N_FILES_PER_ID = 3                                   # metadata sampling from raw files

# -----------------------------
# 1) Read merged file and confirm time dtype
# -----------------------------
# NOTE(review): "usafwban" is read with its inferred dtype; IDs with leading zeros would lose
# them if the column is parsed as integers — confirm this cannot occur for the stations used.
wind = pd.read_csv(MERGED_CSV, parse_dates=["time"])
wind = wind.sort_values("time").reset_index(drop=True)

print("Merged file:", MERGED_CSV)
print("Rows:", len(wind))
print("Span:", wind["time"].min(), "to", wind["time"].max())
print("time dtype:", wind["time"].dtype)

# If time is timezone-naive, make it explicit UTC for consistency.
# Test tz-awareness directly rather than comparing the dtype string: a naive column may be
# stored at a resolution other than nanoseconds (datetime64[s]/[ms]/[us]), which the literal
# "datetime64[ns]" comparison would silently skip.
if pd.api.types.is_datetime64_any_dtype(wind["time"]) and wind["time"].dt.tz is None:
    wind["time"] = wind["time"].dt.tz_localize("UTC")
    print("-> localized merged time to UTC")
print("time dtype (after):", wind["time"].dtype)

print("\nIDs present (usafwban) and record counts:")
print(wind["usafwban"].value_counts().to_string())

# -----------------------------
# 2) Inspect station metadata from raw global-hourly station-year CSVs
#    Global-hourly access CSVs often include columns like:
#    STATION, NAME, LATITUDE, LONGITUDE, ELEVATION, DATE, WND, ...
#    We'll sample a few files per usafwban and summarize unique values.
# -----------------------------
def find_station_year_files(raw_dir, usafwban):
    """Return the sorted list of `<usafwban>_*.csv` paths directly under raw_dir.

    raw_dir must provide a `glob` method (e.g. a pathlib.Path); an empty list
    is returned when nothing matches.
    """
    pattern = f"{usafwban}_*.csv"
    matches = list(raw_dir.glob(pattern))
    matches.sort()
    return matches

def read_station_metadata_one_file(csv_path):
    """Return station metadata sampled from the first data row of one raw CSV.

    Column names are matched case-insensitively against a few known aliases
    (e.g. LATITUDE/LAT). A field whose column is absent is reported as NaN,
    and so is every sampled field when the file holds a header but no rows.

    Parameters
    ----------
    csv_path : pathlib.Path
        Path to the CSV; its `.name` is reported under the "file" key.

    Returns
    -------
    dict
        Keys: file, STATION, NAME, LATITUDE, LONGITUDE, ELEVATION, DATE_example.
    """
    # Read only header + first row to minimize I/O
    df0 = pd.read_csv(csv_path, nrows=1, low_memory=False)
    # Map upper-cased header -> actual header so lookups ignore case
    cols = {c.upper(): c for c in df0.columns}
    # A header-only file yields an empty frame; .iloc[0] would raise IndexError
    has_row = not df0.empty

    def get_col(*names):
        # First alias present in the header wins; NaN if none (or no data row)
        if not has_row:
            return np.nan
        for n in names:
            if n in cols:
                return df0[cols[n]].iloc[0]
        return np.nan

    return {
        "file": csv_path.name,
        "STATION":   get_col("STATION"),
        "NAME":      get_col("NAME", "STATION NAME"),
        "LATITUDE":  get_col("LATITUDE", "LAT"),
        "LONGITUDE": get_col("LONGITUDE", "LON"),
        "ELEVATION": get_col("ELEVATION", "ELEV", "ELEV(M)"),
        "DATE_example": get_col("DATE", "DATE TIME", "DATETIME"),
    }

# Fixed column layout for the sampled-metadata table. Passing it to the DataFrame
# constructor keeps these columns present even when no raw file was found at all,
# so the column-based printing and grouping on `meta` do not fail with KeyError.
meta_columns = ["usafwban", "file", "STATION", "NAME", "LATITUDE", "LONGITUDE", "ELEVATION", "DATE_example"]

station_ids = wind["usafwban"].dropna()
if pd.api.types.is_float_dtype(station_ids):
    # A numeric ID column containing NaN is stored as float; str() would then give
    # "72304093729.0", which never matches the "<id>_<year>.csv" file names.
    station_ids = station_ids.astype("int64")

meta_rows = []
for usafwban in sorted(station_ids.astype(str).unique()):
    files = find_station_year_files(RAW_DIR, usafwban)
    if not files:
        print(f"\n[WARN] No raw files found for {usafwban} under {RAW_DIR}")
        continue

    # sample a few files across the span (first/middle/last); np.unique drops
    # duplicate positions when there are fewer files than requested samples
    idx = np.unique(np.linspace(0, len(files)-1, min(SAMPLE_N_FILES_PER_ID, len(files))).astype(int))
    meta_rows.extend(
        {"usafwban": usafwban, **read_station_metadata_one_file(files[i])} for i in idx
    )

meta = pd.DataFrame(meta_rows, columns=meta_columns)

print("\nSampled station metadata from raw files:")
print(meta[meta_columns].to_string(index=False))

# summarize uniqueness per usafwban
def nunique_s(x):
    """Count distinct non-null values of x after converting each to its string form."""
    as_text = pd.Series(x).dropna().astype(str)
    return as_text.nunique()

# Per-usafwban overview: number of sampled files, plus how many distinct values
# each metadata field took across those files.
uniqueness_aggs = {
    "n_files": ("file", "count"),
    "STATION_unique": ("STATION", nunique_s),
    "NAME_unique": ("NAME", nunique_s),
    "LAT_unique": ("LATITUDE", nunique_s),
    "LON_unique": ("LONGITUDE", nunique_s),
    "ELEV_unique": ("ELEVATION", nunique_s),
}
grouped_meta = meta.groupby("usafwban")
summary = grouped_meta.agg(**uniqueness_aggs).reset_index()

print("\nUniqueness summary (1 = stable across sampled files):")
print(summary.to_string(index=False))

# -----------------------------
# 3) Quick note on wind height
# -----------------------------
# Emitted one line per print call, in order, as a reminder for whoever reads the output.
wind_height_note = (
    "\nNOTE on wind height:",
    "Global-hourly access CSV does not reliably provide per-record anemometer height.",
    "For airport ASOS/AWOS winds, treating them as standard near-surface (often ~10 m) is common,",
    "but you should not assume an explicit, invariant sensor height without external station documentation.",
)
for note_line in wind_height_note:
    print(note_line)


Merged file: isd_KHSE_only/KHSE_area_ISD_wind_merged.csv
Rows: 623597
Span: 1957-03-01 05:00:00+00:00 to 2025-08-27 04:59:00+00:00
time dtype: datetime64[ns, UTC]
time dtype (after): datetime64[ns, UTC]

IDs present (usafwban) and record counts:
usafwban
72313993729    333084
72304093729    198426
99999993729     92087

Sampled station metadata from raw files:
   usafwban                 file     STATION                                      NAME  LATITUDE  LONGITUDE  ELEVATION        DATE_example
72304093729 72304093729_1973.csv 72304093729 CAPE HATTERAS BILLY MITCHELL FIELD, NC US  35.23260  -75.62190        3.4 1973-01-01T00:00:00
72304093729 72304093729_1984.csv 72304093729 CAPE HATTERAS BILLY MITCHELL FIELD, NC US  35.23260  -75.62190        3.4 1984-01-01T00:00:00
72304093729 72304093729_1995.csv 72304093729 CAPE HATTERAS BILLY MITCHELL FIELD, NC US  35.23260  -75.62190        3.4 1995-01-01T00:00:00
72313993729 72313993729_1996.csv 72313993729 CAPE HATTERAS BILLY MITCHELL FIELD, 