In [34]:
import boto3
import pandas as pd
import os
import matplotlib.pyplot as plt
from io import BytesIO
from tqdm import tqdm
import time
import gzip
%matplotlib inline

import warnings
# Silence every warning category so cell outputs stay uncluttered.
# Note: this also hides deprecation and runtime warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [56]:
import botocore

# Build an S3 client whose requests are sent unsigned, so no AWS credentials
# are needed when reading public datasets.
s3 = boto3.client(
    's3',
    region_name='us-east-1',
    config=boto3.session.Config(signature_version=botocore.UNSIGNED),
)

In [68]:
# S3 bucket that holds the station data files
bucket_name = 'noaa-isd-pds'

# Read the table of stations chosen earlier and show the identifying columns
selected_stations_df = pd.read_csv('../data/processed/selected_stations.csv')
print(f"Loaded {len(selected_stations_df)} selected stations")
display(selected_stations_df.loc[:, ['STATION_ID', 'STATION NAME', 'CITY']])

Loaded 8 selected stations


Unnamed: 0,STATION_ID,STATION NAME,CITY
0,744860-94789,JOHN F KENNEDY INTERNATIONAL AIRPORT,New York City
1,722950-23174,LOS ANGELES INTERNATIONAL AIRPORT,Los Angeles
2,725300-94846,CHICAGO O'HARE INTERNATIONAL AIRPORT,Chicago
3,722020-12839,MIAMI INTERNATIONAL AIRPORT,Miami
4,722430-12960,G BUSH INTERCONTINENTAL AP/HOUSTON AP,Houston
5,725090-14739,GEN E L LOGAN INTERNATIONAL AIRPORT,Boston
6,726580-14922,MINNEAPOLIS-ST PAUL INTERNATIONAL AP,Minneapolis
7,727930-24233,SEATTLE-TACOMA INTERNATIONAL AIRPORT,Seattle


In [69]:
# Years to fetch: 2019 through 2024 (the range end is exclusive)
years = [*range(2019, 2025)]
print(f"Will download data for years: {years}")

# Make sure the target directory for the raw archives exists
download_dir = '../data/raw/isd_data'
os.makedirs(download_dir, exist_ok=True)

Will download data for years: [2019, 2020, 2021, 2022, 2023, 2024]


In [70]:
# Function to check if a file exists in S3
def check_file_exists(s3_key):
    """Report whether `s3_key` is present in the bucket `bucket_name`.

    Calls `head_object` for the key. Returns True when the call succeeds.
    A client error with code '404' means the key is absent and yields False;
    any other client error is printed and also yields False.
    """
    try:
        s3.head_object(Bucket=bucket_name, Key=s3_key)
    except botocore.exceptions.ClientError as err:
        # Only a non-404 failure is worth reporting; either way the key is
        # treated as unavailable.
        if err.response['Error']['Code'] != '404':
            print(f"Error checking {s3_key}: {str(err)}")
        return False
    return True

In [71]:
# Function to download a specific station and year
def download_station_year(station_id, year):
    """Download weather data for a specific station and year from AWS S3.

    Args:
        station_id: Station identifier as used in the file name
            (e.g. '010010-99999').
        year: Year of the archive; used in both the S3 prefix and the file name.

    Returns:
        A status string whose prefix identifies the outcome:
        "Skipped existing file", "File not found in S3", "Downloaded:",
        "Downloaded corrupted file", "Downloaded empty file" or
        "Error downloading".
    """
    # Format: data/2023/010010-99999-2023.gz
    filename = f"{station_id}-{year}.gz"
    s3_key = f"data/{year}/{filename}"
    local_path = os.path.join(download_dir, filename)

    # Skip if file already exists locally
    if os.path.exists(local_path):
        return f"Skipped existing file: {filename}"

    # Check if the file exists in S3
    if not check_file_exists(s3_key):
        return f"File not found in S3: {s3_key}"

    try:
        s3.download_file(Bucket=bucket_name, Key=s3_key, Filename=local_path)

        # Verify the download
        size_bytes = os.path.getsize(local_path) if os.path.exists(local_path) else 0
        if size_bytes == 0:
            # Remove the empty file; otherwise the "already exists" check above
            # would skip this station-year on every later run.
            if os.path.exists(local_path):
                os.remove(local_path)
            return f"Downloaded empty file: {filename}"

        try:
            with gzip.open(local_path, 'rb') as f:
                # Reading a small sample forces the gzip header and the start
                # of the stream to be decoded, which fails for invalid files.
                f.read(100)
        except Exception as e:
            # If we can't open the file, it might be corrupted; delete it so a
            # later run downloads it again.
            os.remove(local_path)
            return f"Downloaded corrupted file {filename}: {str(e)}"

        return f"Downloaded: {filename} ({size_bytes/1024:.1f} KB)"
    except Exception as e:
        return f"Error downloading {filename}: {str(e)}"

In [72]:
# Probe S3 for every (year, station) combination before downloading anything
print("Checking which files exist in S3...")
available_files = {}

for year in tqdm(years, desc="Checking years"):
    # Keep only the stations whose archive for this year is present in the bucket
    available_files[year] = [
        sid
        for sid in selected_stations_df['STATION_ID']
        if check_file_exists(f"data/{year}/{sid}-{year}.gz")
    ]

# Display available files by year
total_stations = len(selected_stations_df)
for year in years:
    found = len(available_files[year])
    print(f"Year {year}: {found}/{total_stations} stations available")

Checking which files exist in S3...


Checking years: 100%|██████████| 6/6 [00:00<00:00,  6.01it/s]

Year 2019: 8/8 stations available
Year 2020: 8/8 stations available
Year 2021: 8/8 stations available
Year 2022: 8/8 stations available
Year 2023: 8/8 stations available
Year 2024: 8/8 stations available





In [73]:
# Build the (station, year) work list, station-major, restricted to files found in S3
download_tasks = [
    (sid, yr)
    for sid in selected_stations_df['STATION_ID']
    for yr in years
    if sid in available_files[yr]
]

print(f"Starting download of {len(download_tasks)} files...")
results = []

# Process the tasks one at a time, collecting one status string per task
with tqdm(total=len(download_tasks)) as pbar:
    for station_id, year in download_tasks:
        results.append(download_station_year(station_id, year))
        pbar.update(1)
        # Small delay to avoid hammering the S3 bucket
        time.sleep(0.1)

Starting download of 48 files...


100%|██████████| 48/48 [00:05<00:00,  9.45it/s]


In [74]:
# Count outcomes by the prefix of each status message.
# "Downloaded:" (with the colon) marks a verified download. The messages
# "Downloaded corrupted file ..." and "Downloaded empty file: ..." share the
# "Downloaded" prefix but describe failures, so they are counted separately
# instead of being reported as successes.
downloads = sum(1 for r in results if r.startswith("Downloaded:"))
invalid = sum(
    1 for r in results
    if r.startswith(("Downloaded corrupted file", "Downloaded empty file"))
)
skipped = sum(1 for r in results if r.startswith("Skipped"))
not_found = sum(1 for r in results if r.startswith("File not found"))
errors = sum(1 for r in results if r.startswith("Error"))

print("\nDownload summary:")
print(f"- Successfully downloaded: {downloads} files")
print(f"- Skipped existing files: {skipped} files")
print(f"- Files not found in S3: {not_found} files")
print(f"- Download errors: {errors} files")
print(f"- Corrupted or empty downloads: {invalid} files")


Download summary:
- Successfully downloaded: 0 files
- Skipped existing files: 48 files
- Files not found in S3: 0 files
- Download errors: 0 files


In [75]:
import os
import glob

# Collect every .gz archive currently present in the download directory
gz_pattern = os.path.join(download_dir, "*.gz")
downloaded_files = glob.glob(gz_pattern)
print(f"\nTotal files in download directory: {len(downloaded_files)}")


Total files in download directory: 48


In [76]:
# Show the first few archives (by sorted name) with their sizes, then summary statistics
if downloaded_files:
    print("\nSample of downloaded files:")
    for path in sorted(downloaded_files)[:5]:
        size_kb = os.path.getsize(path) / 1024
        print(f"{os.path.basename(path)} - {size_kb:.1f} KB")

    # Size of every archive in KB
    file_sizes = [os.path.getsize(path) / 1024 for path in downloaded_files]
    total_kb = sum(file_sizes)

    print(f"\nTotal size of downloaded files: {total_kb/1024:.2f} MB")
    print(f"Average file size: {total_kb/len(file_sizes):.2f} KB")
    print(f"Largest file: {max(file_sizes):.2f} KB")
    print(f"Smallest file: {min(file_sizes):.2f} KB")


Sample of downloaded files:
722020-12839-2019.gz - 861.7 KB
722020-12839-2020.gz - 900.4 KB
722020-12839-2021.gz - 850.1 KB
722020-12839-2022.gz - 861.8 KB
722020-12839-2023.gz - 830.7 KB

Total size of downloaded files: 39.86 MB
Average file size: 850.42 KB
Largest file: 1056.97 KB
Smallest file: 665.31 KB


In [77]:
# Verify all cities have at least some data
# Map each station ID to its city once, instead of rescanning the station
# table for every file.
station_to_city = dict(
    zip(selected_stations_df['STATION_ID'], selected_stations_df['CITY'])
)

# Start every city with an empty list so that a city without any file is
# still listed (with 0 files) rather than silently missing from the report.
city_data = {city: [] for city in station_to_city.values()}

for file in downloaded_files:
    # File names follow "<station_id>-<year>.gz"; strip the trailing
    # "-<year>.gz" to recover the exact station ID (no substring matching).
    file_station_id = os.path.basename(file).rsplit('-', 1)[0]
    city = station_to_city.get(file_station_id)
    if city is not None:
        city_data[city].append(file)

print("\nData availability by city:")
for city, files in city_data.items():
    print(f"{city}: {len(files)} files")


Data availability by city:
Chicago: 6 files
Houston: 6 files
Seattle: 6 files
Minneapolis: 6 files
Los Angeles: 6 files
New York City: 6 files
Miami: 6 files
Boston: 6 files


In [78]:
# Persist the per-file status messages as a one-column CSV report
download_report = pd.DataFrame({'Status': results})
download_report.to_csv('../data/processed/download_report.csv', index=False)
print("Saved download report to ../data/processed/download_report.csv")

Saved download report to ../data/processed/download_report.csv
