In [2]:
import pandas as pd

hist_url = "https://www.ncei.noaa.gov/pub/data/noaa/isd-history.csv"
df = pd.read_csv(hist_url)
# Columns worth knowing in the station history table:
# USAF, WBAN, STATION NAME, CTRY, ICAO, LAT, LON, ELEV(M), BEGIN, END

# Two row filters: country code 'KG', and a case-insensitive match of the
# station name against any of the listed fragments (missing names never match).
mask_country = df['CTRY'] == 'KG'
mask_name = df['STATION NAME'].str.contains('BISH|MANAS|FRUNZE', case=False, na=False)

# Independent copy of the matching rows, then list the distinct USAF ids.
kg_stations = df.loc[mask_country & mask_name].copy()
print(kg_stations['USAF'].unique())


['382200' '383530' '383531']


In [3]:
inv_url = "https://www.ncei.noaa.gov/pub/data/noaa/isd-inventory.csv"
# Read the station identifiers as text. Without an explicit dtype pandas has to
# guess per chunk, which yields a mixed-type USAF column (and a warning on this
# call) and turns WBAN into an integer that loses its leading zeros.
inv = pd.read_csv(inv_url, dtype={"USAF": str, "WBAN": str})

# Station to check: use a USAF/WBAN pair found in the station-history cell.
USAF = "383531"    # 6-digit USAF id
WBAN = "99999"     # 5-digit WBAN id

# Zero-pad both sides so the comparison also works for ids typed without padding.
have = inv[(inv['USAF'].str.zfill(6) == str(USAF).zfill(6)) &
           (inv['WBAN'].str.zfill(5) == str(WBAN).zfill(5))]
print(have.head())      # one row per YEAR, with the record count for each month (JAN..DEC)


         USAF   WBAN  YEAR   JAN   FEB   MAR   APR   MAY   JUN   JUL   AUG  \
25225  383531  99999  2020  1478  1387  1463  1408  1483  1433  1482  1483   
25226  383531  99999  2021  1488  1093  1485  1438  1487  1440  1488  1488   
25227  383531  99999  2022  1488  1343  1485  1432  1482  1435  1487  1470   
25228  383531  99999  2023  1484  1328  1464  1436  1470  1402  1472  1457   
25229  383531  99999  2024  1486  1388  1253  1367  1428  1311  1439  1422   

        SEP   OCT   NOV   DEC  
25225  1439  1486  1431  1488  
25226  1439  1488  1440  1482  
25227  1431  1452  1436  1452  
25228  1391  1469  1427  1486  
25229  1411  1486  1238  1423  


  inv = pd.read_csv(inv_url)


In [4]:
import urllib.request
import gzip
import io
import pandas as pd

# ---- CONFIG ----
# Station identifiers and year range for the ISD-Lite download in this cell.
# NOTE(review): USAF here (383530) differs from the id used in the inventory
# check cell (383531) and rebinds the same global names — confirm intended.
USAF = "383530"      # <-- put your USAF here (6 digits, zero-padded if needed)
WBAN = "99999"       # <-- put your WBAN here (5 digits, often 99999 outside US)
START_YEAR = 2021    # first year to fetch
END_YEAR   = 2025    # inclusive
# Output file name encodes the station ids and the year range.
SAVE_CSV   = f"isd_lite_{USAF}_{WBAN}_{START_YEAR}_{END_YEAR}.csv"

# ISD-Lite column layout (fixed-width), per NOAA docs
WIDTHS = [4, 3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6]
NAMES  = [
    "year", "month", "day", "hour",
    "air_temp_tenths_c", "dewpoint_tenths_c", "slp_tenths_hpa",
    "wind_dir_deg", "wind_speed_tenths_ms",
    "sky_cover_code", "precip_1h_tenths_mm", "precip_6h_tenths_mm"
]

def fetch_isd_lite_year(usaf: str, wban: str, year: int) -> pd.DataFrame | None:
    """
    Download and parse one ISD-Lite file for a station-year.
    Returns a DataFrame or None if file is missing/unavailable.
    """
    url = f"https://www.ncei.noaa.gov/pub/data/noaa/isd-lite/{year}/{usaf}-{wban}-{year}.gz"
    try:
        with urllib.request.urlopen(url) as resp:
            # Decompress to text stream
            with gzip.GzipFile(fileobj=io.BytesIO(resp.read())) as gz:
                # Read fixed-width into DataFrame
                df = pd.read_fwf(gz, widths=WIDTHS, names=NAMES)
    except urllib.error.HTTPError as e:
        # Commonly 404 if that station-year doesn't exist
        print(f"[WARN] {year}: HTTP {e.code} for {url} — skipping.")
        return None
    except Exception as e:
        print(f"[WARN] {year}: Failed to fetch/parse — {e}")
        return None

    # Build datetime (assume UTC in ISD/ISD-Lite)
    try:
        dt = pd.to_datetime(df[['year','month','day','hour']], errors='coerce', utc=True)
        df.insert(0, 'datetime_utc', dt)
    except Exception as e:
        print(f"[WARN] {year}: datetime parse issue — {e}")

    # Replace missing sentinels and scale numeric fields
    # -9999 / -999 / etc. appear as missing; safest is to set < -9000 to NA for tenths fields
    for col in ["air_temp_tenths_c","dewpoint_tenths_c","slp_tenths_hpa",
                "wind_speed_tenths_ms","precip_1h_tenths_mm","precip_6h_tenths_mm"]:
        if col in df.columns:
            df[col] = df[col].where(df[col] > -9000, pd.NA)

    # Scaled columns
    df["air_temp_c"]     = df["air_temp_tenths_c"] / 10
    df["dewpoint_c"]     = df["dewpoint_tenths_c"] / 10
    df["slp_hpa"]        = df["slp_tenths_hpa"] / 10
    df["wind_speed_ms"]  = df["wind_speed_tenths_ms"] / 10
    df["precip_1h_mm"]   = df["precip_1h_tenths_mm"] / 10
    df["precip_6h_mm"]   = df["precip_6h_tenths_mm"] / 10

    # Local time (Asia/Bishkek, UTC+6)
    try:
        df["datetime_local"] = df["datetime_utc"].dt.tz_convert("Asia/Bishkek")
    except Exception:
        # If datetime_utc isn't tz-aware for any reason, localize first then convert
        df["datetime_local"] = (pd.to_datetime(df["datetime_utc"], utc=True)
                                .dt.tz_convert("Asia/Bishkek"))

    # Keep a tidy subset plus raw fields if you want them for QA/QC
    cols_order = [
        "datetime_utc", "datetime_local",
        "air_temp_c", "dewpoint_c", "slp_hpa", "wind_dir_deg", "wind_speed_ms",
        "sky_cover_code", "precip_1h_mm", "precip_6h_mm",
        "year","month","day","hour",  # originals for audit
    ]
    existing = [c for c in cols_order if c in df.columns]
    return df[existing].copy()

# ---- Fetch loop and combine ----
# Collect one frame per year; years that failed (None) or came back empty are skipped.
frames = []
for yr in range(START_YEAR, END_YEAR + 1):
    print(f"[INFO] Fetching {USAF}-{WBAN}-{yr} ...")
    df_year = fetch_isd_lite_year(USAF, WBAN, yr)
    if df_year is None or df_year.empty:
        continue
    frames.append(df_year)

# Nothing usable came back: stop the cell with an explanatory message.
if len(frames) == 0:
    raise SystemExit("No data retrieved — check USAF/WBAN IDs or availability for those years.")

# Stack the years, keep the first row per UTC timestamp, order chronologically.
combined = (
    pd.concat(frames, ignore_index=True)
    .drop_duplicates(subset=["datetime_utc"])
    .sort_values("datetime_utc")
    .reset_index(drop=True)
)

print(combined.head())
print(combined.tail())
print(f"[INFO] Combined shape: {combined.shape}")

# Save to CSV
combined.to_csv(SAVE_CSV, index=False)
print(f"[OK] Saved to {SAVE_CSV}")


[INFO] Fetching 383530-99999-2021 ...
[INFO] Fetching 383530-99999-2022 ...
[INFO] Fetching 383530-99999-2023 ...
[INFO] Fetching 383530-99999-2024 ...
[INFO] Fetching 383530-99999-2025 ...
               datetime_utc            datetime_local  air_temp_c  dewpoint_c  \
0 2021-01-01 00:00:00+00:00 2021-01-01 06:00:00+06:00        -8.2        -8.4   
1 2021-01-01 03:00:00+00:00 2021-01-01 09:00:00+06:00        -8.9        -9.2   
2 2021-01-01 06:00:00+00:00 2021-01-01 12:00:00+06:00        -8.7        -9.0   
3 2021-01-01 09:00:00+00:00 2021-01-01 15:00:00+06:00        -7.6        -8.5   
4 2021-01-01 12:00:00+00:00 2021-01-01 18:00:00+06:00        -8.2        -8.8   

   slp_hpa  wind_dir_deg  wind_speed_ms  sky_cover_code  precip_1h_mm  \
0   1036.5             0            NaN               9           NaN   
1      NaN           350            1.0               8           NaN   
2   1040.1           320            1.0               9           NaN   
3   1039.7           320       

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of `combined` — a quick sanity check of value ranges and coverage.
combined.describe()

Unnamed: 0,air_temp_c,dewpoint_c,slp_hpa,wind_dir_deg,wind_speed_ms,sky_cover_code,precip_1h_mm,precip_6h_mm,year,month,day,hour
count,12821.0,12812.0,12662.0,12842.0,11957.0,12842.0,0.0,1.0,12842.0,12842.0,12842.0,12842.0
mean,13.513603,2.77806,1017.396178,162.065021,1.44476,-2337.73244,,4.0,2022.866454,6.253154,15.634948,10.495561
std,11.821119,6.838468,10.518966,294.492348,2.48725,4237.321399,,,1.353435,3.376326,8.759623,6.8833
min,-23.2,-25.7,994.5,-9999.0,0.0,-9999.0,,4.0,2021.0,1.0,1.0,0.0
25%,4.2,-1.9,1009.0,50.0,1.0,2.0,,4.0,2022.0,3.0,8.0,6.0
50%,14.2,3.8,1016.5,180.0,1.0,5.0,,4.0,2023.0,6.0,16.0,9.0
75%,22.9,8.2,1024.7,270.0,2.0,8.0,,4.0,2024.0,9.0,23.0,18.0
max,39.6,18.7,1058.9,360.0,53.0,9.0,,4.0,2025.0,12.0,31.0,21.0


In [6]:
# How often each time step between consecutive observations occurs,
# most frequent step first.
gaps = (
    combined["datetime_utc"]
    .sort_values()
    .diff()
    .value_counts()
)
print(gaps.head())  # you'll likely see 03:00:00 as dominant

datetime_utc
0 days 03:00:00    12294
0 days 06:00:00      503
0 days 09:00:00       25
0 days 12:00:00        7
1 days 03:00:00        3
Name: count, dtype: int64


In [12]:
# Install if needed (safe to re-run)
try:
    import meteostat  # noqa: F401
except Exception:
    %pip -q install meteostat

from datetime import datetime
from zoneinfo import ZoneInfo
import pandas as pd
import numpy as np
from meteostat import Stations, Hourly

# --- CONFIG ---
LAT, LON = 42.8746, 74.5698          # Bishkek
START    = datetime(2021, 1, 1)
END      = datetime(2025, 8, 27, 23, 59)
TZ_LOCAL = "Asia/Bishkek"
OUT_CSV  = "meteostat_bishkek_hourly.csv"

# --- Pick the best nearby station (e.g., Manas/UCFM vicinity) ---
stns = Stations()
stn  = stns.nearby(LAT, LON).fetch(1)         # closest station
assert not stn.empty, "No nearby station found"
print("Using station:", stn.index[0], stn.iloc[0].name if hasattr(stn.iloc[0], "name") else "")

# --- Fetch hourly data (tz='UTC' keeps the index consistent) ---
hourly = Hourly(stn, START, END, timezone="UTC")  # Hourly class is the right entrypoint
df = hourly.fetch()                                # columns: temp, dwpt, rhum, prcp, snow, wdir, wspd, wpgt, pres, tsun, coco

# --- Rename & tidy units (per Meteostat docs) ---
rename = {
    "temp": "air_temp_c",
    "dwpt": "dewpoint_c",
    "rhum": "rel_humidity_pct",
    "prcp": "precip_mm",
    "snow": "snow_depth_cm",
    "wdir": "wind_dir_deg",
    "wspd": "wind_speed_kmh",
    "wpgt": "wind_gust_kmh",
    "pres": "slp_hpa",
    "tsun": "sunshine_min",
    "coco": "weather_code",
}
df = df.rename(columns=rename)

# Convert wind km/h -> m/s
for src, dst in [("wind_speed_kmh", "wind_speed_ms"), ("wind_gust_kmh", "wind_gust_ms")]:
    if src in df.columns:
        df[dst] = df[src] / 3.6

# Ensure strict hourly grid (insert missing hours as NaN)
df = df.sort_index()
full_idx = pd.date_range(df.index.min(), df.index.max(), freq="H", tz="UTC")
df = df.reindex(full_idx)

# Add local time column for plots
df["datetime_local"] = df.index.tz_convert(TZ_LOCAL)

# (Optional) very light interpolation for smooth variables ONLY (not precip)
for c in ["air_temp_c","dewpoint_c","slp_hpa","rel_humidity_pct","wind_speed_ms","wind_gust_ms"]:
    if c in df.columns:
        df[c] = df[c].interpolate("time", limit=2)

# Circular interpolation for wind direction (optional)
if "wind_dir_deg" in df.columns:
    rad = np.deg2rad(df["wind_dir_deg"])
    sin = np.sin(rad).interpolate("time", limit=2)
    cos = np.cos(rad).interpolate("time", limit=2)
    ang = np.degrees(np.arctan2(sin, cos))
    df["wind_dir_deg"] = (ang + 360) % 360

# Save
df.to_csv(OUT_CSV)
print("Saved:", OUT_CSV)
print(df.head(3))
print(df.tail(3))


Using station: 38353 38353




Saved: meteostat_bishkek_hourly.csv
                           air_temp_c  dewpoint_c  rel_humidity_pct  \
2021-01-01 00:00:00+00:00        -8.2        -8.5              98.0   
2021-01-01 01:00:00+00:00        -8.7        -9.9              91.0   
2021-01-01 02:00:00+00:00        -9.0       -10.1              92.0   

                           precip_mm  snow_depth_cm  wind_dir_deg  \
2021-01-01 00:00:00+00:00       <NA>           <NA>           0.0   
2021-01-01 01:00:00+00:00       <NA>           <NA>         179.0   
2021-01-01 02:00:00+00:00       <NA>           <NA>         183.0   

                           wind_speed_kmh  wind_gust_kmh  slp_hpa  \
2021-01-01 00:00:00+00:00             1.8           <NA>   1036.5   
2021-01-01 01:00:00+00:00             1.8           <NA>   1038.6   
2021-01-01 02:00:00+00:00             1.8           <NA>   1038.9   

                           sunshine_min  weather_code  wind_speed_ms  \
2021-01-01 00:00:00+00:00          <NA>           5.0

# Combining PurpleAir data


In [None]:
# df_4 = pd.read_csv('44919 2016-08-26 2025-08-26 60-Minute Average.csv')
# df_9 = pd.read_csv('92387 2016-08-26 2025-08-26 60-Minute Average.csv')
# df_2 = pd.read_csv('217883 2016-08-26 2025-08-26 60-Minute Average.csv')

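# NOTE(review): de-duplicating on time_stamp alone keeps only one sensor's row per hour — confirm that is intended.
# combined = pd.concat([df_4, df_9, df_2], ignore_index=True).drop_duplicates(subset=["time_stamp"])
# combined = combined.sort_values("time_stamp").reset_index(drop=True)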




Unnamed: 0,time_stamp,humidity,temperature,pressure,voc,analog_input,pm2.5_alt|pm2.5_alt = C * (0.00030418*N1 0.0018512*N2 0.02069706*N3),deciviews,visual_range,0.3_um_count,...,1.0_um_count,2.5_um_count,5.0_um_count,10.0_um_count,pm1.0_cf_1,pm1.0_atm,pm2.5_atm,pm2.5_cf_1,pm10.0_atm,pm10.0_cf_1
0,2025-01-15T07:00:00+06:00,16,72,935.98,,0.04,10.6,16.9,71.6,2470.2,...,112.26,17.3,4.834,2.622,12.0,12.0,18.4,18.4,22.2,22.2
1,2025-01-15T08:00:00+06:00,18,56,936.21,,0.04,15.7,20.3,51.4,3657.7,...,164.92,26.63,6.509,3.412,20.3,19.2,29.3,30.9,36.0,36.5
2,2025-01-15T09:00:00+06:00,22,46,936.45,,0.05,16.4,20.0,52.8,3550.6,...,188.31,33.56,7.445,3.638,20.6,20.3,32.9,33.9,40.7,40.7
3,2025-01-15T10:00:00+06:00,20,50,936.39,,0.05,22.9,22.4,41.6,4652.3,...,275.75,48.06,10.694,5.239,26.8,23.9,39.5,46.1,51.4,55.6
4,2025-01-15T11:00:00+06:00,17,53,936.27,,0.05,14.2,19.3,56.4,3283.3,...,162.32,27.93,5.728,2.579,15.7,15.5,26.4,27.0,31.8,31.9
