In [2]:
import boto3
import pandas as pd
import matplotlib.pyplot as plt
import os
from io import StringIO, BytesIO
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [None]:
import botocore

# Set up the S3 client without credentials (for public datasets)
s3 = boto3.client('s3', region_name='us-east-1',
                 config=boto3.session.Config(signature_version=botocore.UNSIGNED))

In [5]:
# Define the bucket name
bucket_name = 'noaa-isd-pds'

# Function to    list files in a directory
def list_s3_files(prefix=''):
    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix)
    
    files = []
    for page in pages:
        if 'Contents' in page:
            for obj in page['Contents']:
                files.append(obj['Key'])
    
    return files

# List top-level directories/files
top_level = list_s3_files()
print("Top-level directories and files:")
for item in top_level[:20]:  # Show just the first 20 items
    print(item)

Top-level directories and files:
/data/2022/010060-99999-2022.gz
/data/2022/010150-99999-2022.gz
/data/2022/010160-99999-2022.gz
/data/2022/010170-99999-2022.gz
/data/2022/010231-99999-2022.gz
/data/2022/010240-99999-2022.gz
/data/2022/010250-99999-2022.gz
/data/2022/010280-99999-2022.gz
/data/2022/010440-99999-2022.gz
/data/2022/010470-99999-2022.gz
/data/2022/010490-99999-2022.gz
/data/2022/010620-99999-2022.gz
/data/2022/010740-99999-2022.gz
/data/2022/010860-99999-2022.gz
/data/2022/010877-99999-2022.gz
/data/2022/010882-99999-2022.gz
/data/2022/010886-99999-2022.gz
/data/2022/010890-99999-2022.gz
/data/2022/010980-99999-2022.gz
/data/2022/010990-99999-2022.gz


In [6]:
# Download the station history file (contains information about all weather stations)
print("\nDownloading station history file...")
response = s3.get_object(Bucket=bucket_name, Key='isd-history.csv')
stations_df = pd.read_csv(BytesIO(response['Body'].read()))


Downloading station history file...


In [7]:
# Display basic info about the stations dataset
print(f"\nStation history file has {stations_df.shape[0]} stations and {stations_df.shape[1]} columns")
print("\nColumn names:")
print(stations_df.columns.tolist())


Station history file has 29659 stations and 11 columns

Column names:
['USAF', 'WBAN', 'STATION NAME', 'CTRY', 'STATE', 'ICAO', 'LAT', 'LON', 'ELEV(M)', 'BEGIN', 'END']


In [8]:
# Show a sample of the data
print("\nSample data:")
stations_df.head()


Sample data:


Unnamed: 0,USAF,WBAN,STATION NAME,CTRY,STATE,ICAO,LAT,LON,ELEV(M),BEGIN,END
0,7018,99999,WXPOD 7018,,,,0.0,0.0,7018.0,20110309,20130730
1,7026,99999,WXPOD 7026,AF,,,0.0,0.0,7026.0,20120713,20170822
2,7070,99999,WXPOD 7070,AF,,,0.0,0.0,7070.0,20140923,20150926
3,8260,99999,WXPOD8270,,,,0.0,0.0,0.0,20050101,20120731
4,8268,99999,WXPOD8278,AF,,,32.95,65.567,1156.7,20100519,20120323


In [9]:
# Explore the data structure for recent years
print("\nExploring data structure for recent years...")
recent_files = list_s3_files('data/2023/')
print(f"Found {len(recent_files)} files for 2023")
print("Sample filenames:")
for file in recent_files[:5]:
    print(file)


Exploring data structure for recent years...
Found 13435 files for 2023
Sample filenames:
data/2023/010010-99999-2023.gz
data/2023/010014-99999-2023.gz
data/2023/010020-99999-2023.gz
data/2023/010030-99999-2023.gz
data/2023/010060-99999-2023.gz


In [11]:
import gzip
import io

# Get the object
print(f"\nChecking format of sample file: {sample_file}")
response = s3.get_object(Bucket=bucket_name, Key=sample_file)

# Check if content is gzipped by looking at file extension or magic bytes
content = response['Body'].read()
if sample_file.endswith('.gz') or (len(content) > 1 and content[0] == 0x1f and content[1] == 0x8b):
    # It's gzipped, decompress it
    with gzip.GzipFile(fileobj=io.BytesIO(content)) as gzipped_content:
        sample_content = gzipped_content.read().decode('utf-8')
else:
    # Try different encodings or handle as binary
    try:
        sample_content = content.decode('utf-8')
    except UnicodeDecodeError:
        # If it's not UTF-8, you might want to handle it as binary or try another encoding
        sample_content = f"Binary content, first 100 bytes: {content[:100]}"

print(f"First 500 characters of the file:")
print(sample_content[:500])


Checking format of sample file: data/2023/010010-99999-2023.gz
First 500 characters of the file:
0104010010999992023010100004+70939-008669FM-12+001099999V0202671N0142199999999999999999-01001-01241097251ADDAA199999999KA1120M-00991KA2120N-01081MA1999999097131MD1310301+9999OC102641OD199902361999REMSYN004BUFR
0093010010999992023010101004+70939-008669FM-12+001099999V0202541N0141199999999999999999-01021-01271097341ADDKA1120M-00981KA2120N-01031MA1999999097221MD1310351+9999OC102091OD199901741999REMSYN004BUFR
0093010010999992023010102004+70939-008669FM-12+001099999V0202581N0123199999999999999999-009


In [14]:
# Look at the data directory structure to understand how files are organized
years = []
for year in range(2018, 2025):
    year_files = list_s3_files(f'data/{year}/')
    years.append((year, len(year_files)))

print("\nNumber of files per year (2018-2024):")
for year, count in years:
    print(f"{year}: {count} files")


Number of files per year (2018-2024):
2018: 14203 files
2019: 13505 files
2020: 13570 files
2021: 13546 files
2022: 13475 files
2023: 13435 files
2024: 13345 files
