In [2]:
import boto3
import pandas as pd
import os
import matplotlib.pyplot as plt
from io import BytesIO
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [3]:
import botocore

# Anonymous (unsigned) requests are enough for public datasets, so no
# credentials are configured on the client.
unsigned_config = boto3.session.Config(signature_version=botocore.UNSIGNED)
s3 = boto3.client('s3', region_name='us-east-1', config=unsigned_config)

In [4]:
# Name of the public S3 bucket that holds the station data
bucket_name = 'noaa-isd-pds'

# Fetch the station history catalogue and parse it from an in-memory buffer
print("Downloading station history file...")
response = s3.get_object(Bucket=bucket_name, Key='isd-history.csv')
with BytesIO(response['Body'].read()) as history_buffer:
    stations_df = pd.read_csv(history_buffer)

Downloading station history file...


In [5]:
# Keep an on-disk copy of the raw catalogue (the folder is created on demand)
os.makedirs('../data/raw', exist_ok=True)
stations_df.to_csv('../data/raw/isd-history.csv', index=False)

# Report how many station rows were downloaded
station_count = stations_df.shape[0]
print(f"Downloaded information for {station_count} weather stations")

Downloaded information for 29659 weather stations


In [7]:
# List the catalogue's column names, then show its first rows
column_names = stations_df.columns.tolist()
print(f"Columns: {column_names}")
print("Sample data:")
display(stations_df.head())

Columns: ['USAF', 'WBAN', 'STATION NAME', 'CTRY', 'STATE', 'ICAO', 'LAT', 'LON', 'ELEV(M)', 'BEGIN', 'END']
Sample data:


Unnamed: 0,USAF,WBAN,STATION NAME,CTRY,STATE,ICAO,LAT,LON,ELEV(M),BEGIN,END
0,7018,99999,WXPOD 7018,,,,0.0,0.0,7018.0,20110309,20130730
1,7026,99999,WXPOD 7026,AF,,,0.0,0.0,7026.0,20120713,20170822
2,7070,99999,WXPOD 7070,AF,,,0.0,0.0,7070.0,20140923,20150926
3,8260,99999,WXPOD8270,,,,0.0,0.0,0.0,20050101,20120731
4,8268,99999,WXPOD8278,AF,,,32.95,65.567,1156.7,20100519,20120323


In [48]:
# City -> text to look for in the catalogue's 'STATION NAME' column.
# Some values are cut short of the full catalogue name; they are matched
# as substrings, not as complete names.
target_stations = {
    'New York City': 'JOHN F KENNEDY INTERNATIONAL',
    'Los Angeles': 'LOS ANGELES INTERNATIONAL AIR',
    'Chicago': "CHICAGO O'HARE INTERNATIONAL",
    'Miami': 'MIAMI INTERNATIONAL AIRPORT',
    'Houston': 'G BUSH INTERCONTINENTAL AP/HO',
    'Boston': 'GEN E L LOGAN INTERNATIONAL A',
    'Minneapolis': 'MINNEAPOLIS-ST PAUL INTERNATI',
    'Seattle': 'SEATTLE-TACOMA INTERNATIONAL',
}

In [49]:
def find_exact_station(station_name, stations=None):
    """Return the best active station row whose name contains ``station_name``.

    The name is matched as a case-insensitive, literal substring of the
    'STATION NAME' column; rows with a missing name never match. Among the
    matches, only rows with BEGIN <= 20180101 and END >= 20241231 are kept,
    and the one with the largest END is returned.

    If nothing qualifies, a looser search is attempted using only the first
    two words of ``station_name`` (skipped for single-word names).

    Parameters
    ----------
    station_name : str
        Text to look for inside the station names.
    stations : pandas.DataFrame, optional
        Station catalogue with 'STATION NAME', 'BEGIN' and 'END' columns.
        Defaults to the notebook-level ``stations_df``.

    Returns
    -------
    pandas.Series or None
        The selected catalogue row, or None when no station qualifies.
    """
    if stations is None:
        stations = stations_df

    def _name_contains(text):
        # na=False: rows with a missing name count as non-matches instead of
        # putting NaN into the boolean mask (which would make indexing raise).
        # regex=False: characters such as "(" or "+" in a name stay literal.
        return stations['STATION NAME'].str.contains(
            text, case=False, na=False, regex=False
        )

    def _most_recent(candidates):
        # Row with the latest END date
        return candidates.sort_values('END', ascending=False).iloc[0]

    # Primary search: full name, active over the whole 2018-2024 window
    matching_stations = stations[_name_contains(station_name)]
    active_stations = matching_stations[
        (matching_stations['BEGIN'] <= 20180101)
        & (matching_stations['END'] >= 20241231)
    ]
    if not active_stations.empty:
        return _most_recent(active_stations)

    print(f"⚠️ No matching active station found for '{station_name}'")
    print("Trying alternative search...")

    # Fallback search: first two words of the name only
    words = station_name.split()
    if len(words) > 1:
        alt_name = " ".join(words[:2])
        alt_stations = stations[_name_contains(alt_name)]
        # NOTE(review): the fallback accepts END >= 20231231, a year earlier
        # than the primary search; kept as-is - confirm this is intentional.
        alt_active = alt_stations[
            (alt_stations['BEGIN'] <= 20180101)
            & (alt_stations['END'] >= 20231231)
        ]
        if not alt_active.empty:
            best_station = _most_recent(alt_active)
            print(f"Found alternative station: {best_station['STATION NAME']}")
            return best_station
    return None

In [50]:
# Resolve every target city to one catalogue row, collecting the hits
selected_stations = []
for city, station_name in target_stations.items():
    print(f"Finding station for {city}...")
    station = find_exact_station(station_name)

    # Nothing usable for this city: report it and move on
    if station is None:
        print(f"❌ Could not find a suitable station for {city}")
        continue

    # Keep the row as a plain dict, tagged with the city it was chosen for
    station_dict = {**station.to_dict(), 'CITY': city}
    selected_stations.append(station_dict)
    print(f"Selected: {station['STATION NAME']} (USAF: {station['USAF']}, WBAN: {station['WBAN']})")

Finding station for New York City...
Selected: JOHN F KENNEDY INTERNATIONAL AIRPORT (USAF: 744860, WBAN: 94789)
Finding station for Los Angeles...
Selected: LOS ANGELES INTERNATIONAL AIRPORT (USAF: 722950, WBAN: 23174)
Finding station for Chicago...
Selected: CHICAGO O'HARE INTERNATIONAL AIRPORT (USAF: 725300, WBAN: 94846)
Finding station for Miami...
Selected: MIAMI INTERNATIONAL AIRPORT (USAF: 722020, WBAN: 12839)
Finding station for Houston...
Selected: G BUSH INTERCONTINENTAL AP/HOUSTON AP (USAF: 722430, WBAN: 12960)
Finding station for Boston...
Selected: GEN E L LOGAN INTERNATIONAL AIRPORT (USAF: 725090, WBAN: 14739)
Finding station for Minneapolis...
Selected: MINNEAPOLIS-ST PAUL INTERNATIONAL AP (USAF: 726580, WBAN: 14922)
Finding station for Seattle...
Selected: SEATTLE-TACOMA INTERNATIONAL AIRPORT (USAF: 727930, WBAN: 24233)


In [51]:
# Turn the collected station dicts into a table, one row per city
selected_stations_df = pd.DataFrame(selected_stations)

# Show only the identifying, location and date-range columns
overview_columns = ['USAF', 'WBAN', 'STATION NAME', 'LAT', 'LON', 'BEGIN', 'END', 'CITY']
print("\nSelected stations:")
display(selected_stations_df[overview_columns])



Selected stations:


Unnamed: 0,USAF,WBAN,STATION NAME,LAT,LON,BEGIN,END,CITY
0,744860,94789,JOHN F KENNEDY INTERNATIONAL AIRPORT,40.639,-73.764,19730101,20250318,New York City
1,722950,23174,LOS ANGELES INTERNATIONAL AIRPORT,33.938,-118.387,19440101,20250318,Los Angeles
2,725300,94846,CHICAGO O'HARE INTERNATIONAL AIRPORT,41.96,-87.932,19461001,20250317,Chicago
3,722020,12839,MIAMI INTERNATIONAL AIRPORT,25.788,-80.317,19730101,20250317,Miami
4,722430,12960,G BUSH INTERCONTINENTAL AP/HOUSTON AP,29.984,-95.361,19730101,20250317,Houston
5,725090,14739,GEN E L LOGAN INTERNATIONAL AIRPORT,42.361,-71.01,19431121,20250318,Boston
6,726580,14922,MINNEAPOLIS-ST PAUL INTERNATIONAL AP,44.885,-93.231,19450101,20250317,Minneapolis
7,727930,24233,SEATTLE-TACOMA INTERNATIONAL AIRPORT,47.445,-122.314,19480101,20250317,Seattle


In [52]:
# Create a formatted station ID for AWS S3 path: USAF zero-padded to 6
# characters, a dash, then WBAN zero-padded to 5 characters
usaf_part = selected_stations_df['USAF'].astype(str).str.zfill(6)
wban_part = selected_stations_df['WBAN'].astype(str).str.zfill(5)
selected_stations_df['STATION_ID'] = usaf_part + '-' + wban_part

# Make sure the output folder exists before writing; without this the save
# fails on a fresh checkout where ../data/processed has not been created yet
os.makedirs('../data/processed', exist_ok=True)

# Save the selected stations to CSV
selected_stations_df.to_csv('../data/processed/selected_stations.csv', index=False)
print(f"\nSaved {len(selected_stations_df)} stations to ../data/processed/selected_stations.csv")


Saved 8 stations to ../data/processed/selected_stations.csv


In [53]:
# Show station details as a formatted table with reader-friendly headers
print("\nDetailed Station Information:")
detail_columns = ['CITY', 'STATION NAME', 'STATION_ID', 'LAT', 'LON', 'ELEV(M)', 'BEGIN', 'END']
station_details = selected_stations_df[detail_columns].rename(columns={
    # The column is named 'CITY'; rename() silently skips keys that are not
    # present, so the previous 'STANDARDIZED_CITY' key left it unrenamed.
    'CITY': 'City',
    'STATION NAME': 'Station Name',
    'STATION_ID': 'Station ID',
    'LAT': 'Latitude',
    'LON': 'Longitude',
    'ELEV(M)': 'Elevation (m)',
    'BEGIN': 'Data Begin Date',
    'END': 'Data End Date'
})
display(station_details)


Detailed Station Information:


Unnamed: 0,CITY,Station Name,Station ID,Latitude,Longitude,Elevation (m),Data Begin Date,Data End Date
0,New York City,JOHN F KENNEDY INTERNATIONAL AIRPORT,744860-94789,40.639,-73.764,2.7,19730101,20250318
1,Los Angeles,LOS ANGELES INTERNATIONAL AIRPORT,722950-23174,33.938,-118.387,29.7,19440101,20250318
2,Chicago,CHICAGO O'HARE INTERNATIONAL AIRPORT,725300-94846,41.96,-87.932,204.8,19461001,20250317
3,Miami,MIAMI INTERNATIONAL AIRPORT,722020-12839,25.788,-80.317,1.4,19730101,20250317
4,Houston,G BUSH INTERCONTINENTAL AP/HOUSTON AP,722430-12960,29.984,-95.361,27.5,19730101,20250317
5,Boston,GEN E L LOGAN INTERNATIONAL AIRPORT,725090-14739,42.361,-71.01,3.3,19431121,20250318
6,Minneapolis,MINNEAPOLIS-ST PAUL INTERNATIONAL AP,726580-14922,44.885,-93.231,254.5,19450101,20250317
7,Seattle,SEATTLE-TACOMA INTERNATIONAL AIRPORT,727930-24233,47.445,-122.314,112.5,19480101,20250317
