In [None]:
import pandas as pd

# Load the table of home locations from disk
home_locations_df = pd.read_csv('./data/home_locations.csv')

# Collect each distinct value of the 'COUNTRY' column once, in order of first
# appearance, as a plain Python list
country_column = home_locations_df['COUNTRY']
country_codes = country_column.unique().tolist()

print(country_codes)

In [None]:
# Load the station list. With on_bad_lines='skip', rows the parser flags as
# malformed are dropped silently instead of aborting the read.
stations_df = pd.read_csv(
    'data/ghcnd-stations.csv',
    engine='python',
    on_bad_lines='skip',
)

# Keep only the stations whose ID begins with one of the home country codes.
# NOTE(review): assumes the 'COUNTRY' values use the same two-letter scheme as
# the station ID prefix - confirm against the data.
station_prefixes = stations_df['ID'].str[:2]
in_home_country = station_prefixes.isin(country_codes)
home_stations_df = stations_df.loc[in_home_country]

# Persist the filtered stations (without the index column)
home_stations_df.to_csv('data/home_stations.csv', index=False)

In [None]:
from geopy.distance import geodesic

# Search radius in kilometres around a home location. Cities listed in
# CITY_RADIUS_KM override the default radius.
DEFAULT_RADIUS_KM = 20
CITY_RADIUS_KM = {'London': 30}

# Two-character prefix of every station ID, computed once instead of once per
# (home, station) pair. It is index-aligned with home_stations_df.
station_prefixes = home_stations_df['ID'].str[:2]

# One record per (station, city) match; a station close to several home
# locations therefore appears once for each of them.
nearby_stations = []

for _, home_row in home_locations_df.iterrows():
    city = home_row['CITY']
    home_coords = (home_row['LATITUDE'], home_row['LONGITUDE'])

    # The radius depends only on the city, so resolve it once per home location
    distance_threshold = CITY_RADIUS_KM.get(city, DEFAULT_RADIUS_KM)

    # Only stations whose ID prefix equals this home's country code are candidates
    same_country_stations = home_stations_df[station_prefixes == home_row['COUNTRY']]

    for _, station_row in same_country_stations.iterrows():
        station_coords = (station_row['LATITUDE'], station_row['LONGITUDE'])

        # Geodesic distance between home and station, in kilometres
        distance = geodesic(home_coords, station_coords).kilometers

        if distance <= distance_threshold:
            # Copy the station's fields and tag the record with the matching city
            station_data = station_row.to_dict()
            station_data['CITY'] = city
            nearby_stations.append(station_data)

# Pass the column list explicitly so that an empty result still yields a CSV
# with a header row; a header-less file cannot be read back with pd.read_csv.
result_columns = list(home_stations_df.columns)
if 'CITY' not in result_columns:
    result_columns.append('CITY')
nearby_stations_df = pd.DataFrame(nearby_stations, columns=result_columns)

# Report the radii that were actually applied rather than a fixed "20km"
radius_overrides = ", ".join(f"{name}: {radius}km" for name, radius in CITY_RADIUS_KM.items())
print(
    f"Found {len(nearby_stations_df)} stations within range of home locations "
    f"(default radius {DEFAULT_RADIUS_KM}km; overrides: {radius_overrides})"
)
print(nearby_stations_df.head())
nearby_stations_df.to_csv('data/nearby_stations.csv', index=False)


In [None]:
import ftplib
import gzip
import os
import shutil

# Directory that receives one extracted CSV per station
output_dir = 'data/by_station'
os.makedirs(output_dir, exist_ok=True)

# Read the nearby stations CSV file
nearby_stations_df = pd.read_csv('data/nearby_stations.csv')

# The same station ID can occur on several rows (one per matching city);
# process every ID only once, keeping the order of first appearance.
station_ids = nearby_stations_df['ID'].drop_duplicates().tolist()

downloaded = 0
skipped = 0
failed = 0

# Using the FTP object as a context manager closes the connection even when
# something outside the per-station try block raises.
with ftplib.FTP('ftp.ncei.noaa.gov') as ftp:
    ftp.login()  # Anonymous login

    # Change to the target directory
    ftp.cwd('pub/data/ghcn/daily/by_station')

    # Download and extract the file of each station
    for station_id in station_ids:
        print(f"Processing station: {station_id}")
        filename = f"{station_id}.csv.gz"
        local_gz_path = f"{output_dir}/{filename}"
        local_csv_path = f"{output_dir}/{station_id}.csv"
        # Extraction target; renamed to the final name only once it is complete
        partial_csv_path = f"{local_csv_path}.part"

        # Skip if file already exists
        if os.path.exists(local_csv_path):
            print(f"Skipping {station_id}.csv (already exists)")
            skipped += 1
            continue

        try:
            # Download the .csv.gz file
            with open(local_gz_path, 'wb') as f:
                ftp.retrbinary(f'RETR {filename}', f.write)

            # Decompress in chunks into the temporary name, so an interrupted
            # extraction never leaves a truncated .csv that later runs would
            # mistake for a finished download.
            with gzip.open(local_gz_path, 'rb') as f_in, open(partial_csv_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
            os.replace(partial_csv_path, local_csv_path)

            downloaded += 1
            print(f"Downloaded and extracted: {station_id}.csv")

        except Exception as e:
            # Best effort: report the failure and continue with the next station
            failed += 1
            print(f"Error downloading {filename}: {e}")

        finally:
            # Remove the archive (always) and any partial extraction (on failure)
            for leftover_path in (local_gz_path, partial_csv_path):
                if os.path.exists(leftover_path):
                    os.remove(leftover_path)

print(
    f"\nCompleted {len(station_ids)} stations: "
    f"{downloaded} downloaded, {skipped} skipped (already present), {failed} failed"
)