# Historical Weather Data by ZIP Code (Meteostat)

This notebook retrieves daily historical weather data for US ZIP codes using the Meteostat API. It supports bulk fetching for the past 2 years and incremental updates for recent days. Data is stored at the ZIP code level with basic weather columns.

## Configuration
Set your parameters here. You can control the ZIP codes, date ranges, and whether to refetch bulk data.

In [124]:
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import pgeocode
from meteostat import Point, Daily

# --- CONFIGURATION FLAGS ---
ZIP_CODES = ["94105", "10001", "60601", "90210", "77002"]  # Example ZIP codes
COUNTRY = "US"
BULK_YEARS = 2  # How many years back for bulk fetch
REFETCH_BULK = False  # Set True to force refetch of bulk data
BULK_DATA_FILE = "bulk_weather_by_zip.csv"
N_DAYS = 7  # How many recent days to fetch with incremental API

## Convert ZIP Codes to Coordinates

In [125]:
def zip_to_point(zip_codes, country=COUNTRY):
    nomi = pgeocode.Nominatim(country)
    df = nomi.query_postal_code(zip_codes)
    df = df[['postal_code', 'latitude', 'longitude']].rename(columns={'postal_code': 'zip_code'})
    df = df.dropna(subset=['latitude', 'longitude'])
    df['zip_code'] = df['zip_code'].astype(str)
    return df

zip_df = zip_to_point(ZIP_CODES)
zip_df

Unnamed: 0,zip_code,latitude,longitude
0,94105,37.7864,-122.3892
1,10001,40.7484,-73.9967
2,60601,41.8858,-87.6181
3,90210,34.0901,-118.4065
4,77002,29.7594,-95.3594


## Bulk Fetch: Past 2 Years Daily Weather by ZIP Code

In [126]:
def fetch_bulk_weather(zip_df, years=2):
    end = datetime.now().date() - timedelta(days=1)
    start = end - timedelta(days=365*years)
    # Ensure both start and end are datetime.date objects
    records = []
    for _, row in zip_df.iterrows():
        point = Point(row['latitude'], row['longitude'])
        # Convert start and end to datetime.datetime for Meteostat compatibility
        start_dt = datetime.combine(start, datetime.min.time())
        end_dt = datetime.combine(end, datetime.min.time())
        data = Daily(point, start_dt, end_dt).fetch()
        data = data.reset_index()
        data['zip_code'] = row['zip_code']
        # Keep only basic columns
        data = data[['zip_code', 'time', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wspd']]
        records.append(data)
    if records:
        return pd.concat(records, ignore_index=True)
    else:
        return pd.DataFrame()

if REFETCH_BULK or not os.path.exists(BULK_DATA_FILE):
    print("Fetching bulk weather data for all ZIP codes...")
    bulk_df = fetch_bulk_weather(zip_df, years=BULK_YEARS)
    bulk_df.to_csv(BULK_DATA_FILE, index=False)
    print(f"Saved bulk data to {BULK_DATA_FILE}")
else:
    print(f"Loading cached bulk data from {BULK_DATA_FILE}")
    bulk_df = pd.read_csv(BULK_DATA_FILE, parse_dates=['time'])
bulk_df.head()

Loading cached bulk data from bulk_weather_by_zip.csv


Unnamed: 0,zip_code,time,tavg,tmin,tmax,prcp,snow,wspd
0,94105,2023-05-22,15.0,11.7,21.1,0.0,,15.1
1,94105,2023-05-23,14.7,12.2,19.4,0.0,,22.3
2,94105,2023-05-24,13.9,11.7,18.3,0.0,,18.4
3,94105,2023-05-25,14.1,12.2,18.3,0.0,,20.2
4,94105,2023-05-26,14.4,11.7,18.3,0.0,,19.4


## Incremental Fetch: Most Recent N Days

In [127]:
def fetch_recent_weather(zip_df, n_days=7):
    end = datetime.now().date() - timedelta(days=1)
    start = end - timedelta(days=n_days-1)
    records = []
    for _, row in zip_df.iterrows():
        point = Point(row['latitude'], row['longitude'])
        # Convert start and end to datetime.datetime for Meteostat compatibility
        start_dt = datetime.combine(start, datetime.min.time())
        end_dt = datetime.combine(end, datetime.min.time())
        data = Daily(point, start_dt, end_dt).fetch()
        data = data.reset_index()
        data['zip_code'] = row['zip_code']
        data = data[['zip_code', 'time', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wspd']]
        records.append(data)
    if records:
        return pd.concat(records, ignore_index=True)
    else:
        return pd.DataFrame()

recent_df = fetch_recent_weather(zip_df, n_days=N_DAYS)
recent_df.head()

Unnamed: 0,zip_code,time,tavg,tmin,tmax,prcp,snow,wspd
0,94105,2025-05-15,15.0,10.0,21.1,0.0,,15.0
1,94105,2025-05-16,14.5,11.0,19.0,0.0,,17.6
2,94105,2025-05-17,14.6,11.7,18.3,0.0,,28.6
3,94105,2025-05-18,15.7,8.9,21.1,0.0,,13.2
4,94105,2025-05-19,16.1,10.6,21.7,0.0,,14.4


## Append New Data to Bulk Data

In [128]:
def append_new_data(bulk_df, new_df):
    # Remove any overlap (same zip_code and date)
    if bulk_df.empty:
        return new_df.copy()
    merged = pd.concat([bulk_df, new_df], ignore_index=True)
    merged = merged.drop_duplicates(subset=['zip_code', 'time'], keep='last')
    merged = merged.sort_values(['zip_code', 'time'])
    return merged

full_df = append_new_data(bulk_df, recent_df)
full_df.to_csv(BULK_DATA_FILE, index=False)
print(f"Appended new data. Full dataset now has {len(full_df)} rows.")
full_df.tail()

Appended new data. Full dataset now has 3690 rows.


Unnamed: 0,zip_code,time,tavg,tmin,tmax,prcp,snow,wspd
3657,94105,2025-05-17,14.6,11.7,18.3,0.0,,28.6
3658,94105,2025-05-18,15.7,8.9,21.1,0.0,,13.2
3659,94105,2025-05-19,16.1,10.6,21.7,0.0,,14.4
3660,94105,2025-05-20,15.3,11.0,20.0,0.0,,12.1
3661,94105,2025-05-21,15.6,9.0,23.0,0.0,,14.1


## Data Quality Diagnostics

In [129]:
# Simple diagnostics
print("Missing values by column:")
print(full_df.isnull().sum())
print("\nSample data:")
print(full_df.head())

Missing values by column:
zip_code       0
time           0
tavg           0
tmin           0
tmax           0
prcp           0
snow        1535
wspd           0
dtype: int64

Sample data:
    zip_code       time  tavg  tmin  tmax  prcp  snow  wspd
731    10001 2023-05-22  17.8  13.9  23.3   0.0   0.0   7.4
732    10001 2023-05-23  15.8  12.8  21.0   0.0   0.0   7.7
733    10001 2023-05-24  16.8  11.7  24.4   0.0   0.0   6.3
734    10001 2023-05-25  15.1  10.6  19.4   0.0   0.0  12.5
735    10001 2023-05-26  17.4  11.7  22.2   0.0   0.0   6.9


## Save Results

In [130]:
# Save the final results to CSV
full_df.to_csv("weather_by_zip_final.csv", index=False)
print("Saved final results to weather_by_zip_final.csv")

Saved final results to weather_by_zip_final.csv


## Example: Analyze or Visualize Data

In [None]:
# Example: Average temperature by ZIP code over the last 30 days
cutoff = pd.Timestamp(datetime.now().date() - timedelta(days=30))
recent_30 = full_df[full_df['time'] >= cutoff]
grouped = recent_30.groupby('zip_code')['tavg'].mean().reset_index()
print(grouped)

TypeError: Invalid comparison between dtype=datetime64[ns] and date