In [2]:
import pandas as pd
import numpy as np
import codecs
import csv
import os
import urllib.request
from pathlib import Path

In [3]:
# Locate the Visual Crossing API key file, expected one directory above the working directory
path = Path()
vc_key_file = 'visualcrossing_apikey.txt'
vc_key_filepath = path.joinpath('..', vc_key_file)

# Stop immediately with a clear message when the key file is missing
key_file_found = os.path.exists(vc_key_filepath)
if not key_file_found:
    raise FileNotFoundError('Visual Crossing API key file not found! Please check directory.')

# Publish the key (first line of the file, surrounding whitespace removed)
# as an environment variable so later cells never need to embed it in code
with open(vc_key_filepath, 'r') as key_file:
    os.environ['vc_api_key'] = key_file.readline().strip()

In [4]:
# Folder (one level above the working directory) that receives the raw downloaded weather files
weather_dir = 'weather_data'
weather_dir_path = path.joinpath('..', weather_dir)

# Create the folder on first use only; an already existing path is left untouched
if not os.path.exists(weather_dir_path):
    weather_dir_path.mkdir()

In [5]:
# Read the centroid coordinates of every New York state county from the mapping file.
# NOTE(review): judging by the parsed values (e.g. BRONX -> '40.8448', '-73.8648'), the
# 'County Centroid' column appears to be formatted as "(latitude, longitude)", so each
# entry of `counties` is a (county name, latitude, longitude) tuple of strings. The
# tuple order is unchanged, so code that unpacks it positionally keeps working.
mapping_file = 'mapping.csv'
mapping_file_path = path/'..'/mapping_file
counties = []
# newline='' lets the csv module do its own line-ending handling, as its documentation requires
with open(mapping_file_path, 'r', newline='') as csv_file:
    mapping_reader = csv.DictReader(csv_file)
    for row in mapping_reader:
        centroid = row['County Centroid'].replace('(', '').replace(')', '')
        # Split on the bare comma and strip each part so that "(a,b)" and "(a, b)"
        # both parse and no stray whitespace ends up in the request URL later on
        latitude, longitude = (part.strip() for part in centroid.split(','))
        counties.append((row['County'], latitude, longitude))

In [6]:
# Sanity check: report how many counties were parsed and show the first few entries
print(f"County list length = {len(counties)}")
preview_rows = counties[:5]
for county in preview_rows:
    print(county)

County list length = 62
('BRONX', '40.8448', '-73.8648')
('KINGS', '40.6782', '-73.9442')
('RICHMOND', '40.5795', '-74.1502')
('ALBANY', '42.5882713', '-73.9740136')
('SCHENECTADY', '42.8175421', '-74.0435834')


In [29]:
def get_daily_weather_data(county, start_date, end_date, long, lat, content_type='csv'):
    """Download daily weather data for one county and save it as a CSV file.

    Queries the Visual Crossing timeline API for the given location and date
    range and writes the returned rows to ``<county>.csv`` (spaces in the
    county name replaced by underscores) inside ``weather_dir_path``. The API
    key is read from the ``vc_api_key`` environment variable.

    Args:
        county: County name; used in the log messages and to build the file name.
        start_date: First day to request, formatted as YYYY-MM-DD.
        end_date: Last day to request (inclusive), formatted as YYYY-MM-DD.
        long: First coordinate placed in the request URL.
        lat: Second coordinate placed in the request URL.
            NOTE(review): the values this notebook passes (e.g. '40.8448',
            '-73.8648' for BRONX) look like latitude followed by longitude,
            i.e. the parameter names appear swapped relative to their
            contents. The URL order is kept as-is; confirm before renaming.
        content_type: Value of the ``contentType`` query parameter. The
            response is always parsed as CSV, regardless of this value.

    Returns:
        None. HTTP and URL errors are printed to stdout instead of being raised.

    Raises:
        FileExistsError: If the county's CSV file already exists; the file is
            opened in exclusive-creation mode so existing data is never overwritten.
    """
    try:
        print(f"Downloading weather data for {county} county...")
        url = f'https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/{long},{lat}/{start_date}/{end_date}?unitGroup=us&include=days&key={os.environ.get("vc_api_key")}&contentType={content_type}'
        # Use the response as a context manager so the connection is closed
        # even when parsing or writing fails
        with urllib.request.urlopen(url) as response:
            # Decode the byte stream lazily and parse it as CSV rows
            csv_rows = csv.reader(codecs.iterdecode(response, 'utf-8'))
            county_filename = county.replace(' ', '_') + '.csv'
            csv_filepath = weather_dir_path/county_filename
            # Create new CSV file and write to it
            with open(csv_filepath, 'x', newline='') as csv_file:
                csv.writer(csv_file).writerows(csv_rows)
        print(f"Weather data for {county} county successfully downloaded!")
    except urllib.error.HTTPError as e: # Server answered with an error status
        error_info = e.read().decode()
        print('Error code: ', e.code, error_info)
        return
    except urllib.error.URLError as e: # No usable response (DNS failure, refused connection, ...)
        # A plain URLError has neither .code nor .read(); only .reason is available
        print('Error reason: ', e.reason)
        return

In [25]:
# Test for BRONX county in past 7 days
start_date = '2022-11-01' # Dates are in YYYY-MM-DD format
end_date = '2022-11-07' # End date is inclusive
# get_daily_weather_data creates its file in exclusive mode ('x'), so a 7-day
# BRONX.csv left behind by this smoke test would make the full download cell
# further down fail with FileExistsError. Remember whether the file was already
# present so that only a file created by this test is removed afterwards.
test_csv_path = weather_dir_path/'BRONX.csv'
test_csv_preexisting = os.path.exists(test_csv_path)
get_daily_weather_data('BRONX', start_date, end_date, '40.8448', '-73.8648')
if not test_csv_preexisting and os.path.exists(test_csv_path):
    os.remove(test_csv_path)

Weather data for BRONX county successfully downloaded!


In [30]:
# Fetch the raw daily weather file of every NY county for the whole study period
start_date = '2020-01-01' # Dates are in YYYY-MM-DD format
end_date = '2022-09-30' # End date is inclusive
for county_name, long, lat in counties:
    get_daily_weather_data(
        county=county_name,
        start_date=start_date,
        end_date=end_date,
        long=long,
        lat=lat,
    )

Downloading weather data for BRONX county...
Weather data for BRONX county successfully downloaded!
Downloading weather data for KINGS county...
Weather data for KINGS county successfully downloaded!
Downloading weather data for RICHMOND county...
Weather data for RICHMOND county successfully downloaded!
Downloading weather data for ALBANY county...
Weather data for ALBANY county successfully downloaded!
Downloading weather data for SCHENECTADY county...
Weather data for SCHENECTADY county successfully downloaded!
Downloading weather data for CHAUTAUQUA county...
Weather data for CHAUTAUQUA county successfully downloaded!
Downloading weather data for DUTCHESS county...
Weather data for DUTCHESS county successfully downloaded!
Downloading weather data for CORTLAND county...
Weather data for CORTLAND county successfully downloaded!
Downloading weather data for RENSSELAER county...
Weather data for RENSSELAER county successfully downloaded!
Downloading weather data for CAYUGA county...
We

In [7]:
# Make directory for storing processed/cleaned weather data file
cleaned_weather_dir_path = path/'..'/'data'/'processed'/'weather_data'
cleaned_weather_csv = 'weather_data.csv'
cleaned_weather_csv_path = cleaned_weather_dir_path/cleaned_weather_csv

# The target is nested three levels deep. os.mkdir only creates the last
# component and raises FileNotFoundError when '../data' or '../data/processed'
# is missing, so create the whole chain of directories instead.
if not os.path.exists(cleaned_weather_dir_path):
    os.makedirs(cleaned_weather_dir_path)

# Create new, empty csv file for storing cleaned and aggregated weather data.
# Mode 'x' deliberately raises FileExistsError instead of overwriting a
# previous result; later cells append to this file.
with open(cleaned_weather_csv_path, 'x', newline='') as f:
    print(f"Created new CSV file at: {cleaned_weather_csv_path}")

Created new CSV file at: ../data/processed/weather_data/weather_data.csv


In [8]:
def process_weather_data(county, dest_csv_file_path, first=False):
    """Aggregate one county's raw daily weather CSV into 7-day rows and append them to a CSV.

    Args:
        county: County name; spaces are replaced by underscores to locate
            ``<county>.csv`` inside ``weather_dir_path``. It also becomes the
            value of the ``county`` column in the output.
        dest_csv_file_path: CSV file the aggregated rows are appended to.
        first: When truthy, the column header is written ahead of the rows.
    """
    print(f"Processing weather data for {county} county...")
    raw_csv_name = county.replace(' ', '_') + '.csv'

    def most_frequent(values):
        # Most common value within the group, or '' when the group has no mode
        modes = pd.Series.mode(values)
        return '' if len(modes) == 0 else modes[0]

    # Row labels cut from the start and the end of the daily series so that the
    # remaining days line up with whole weeks
    trimmed_rows = [0, 1, 2, 3, 998, 999, 1000, 1001, 1002, 1003]

    daily = (
        pd.read_csv(weather_dir_path/raw_csv_name)
        # Redundant text columns
        .drop(columns=['description', 'icon', 'stations'])
        # Overwrite the location label with the county name, then rename the column
        .assign(name=county)
        .rename(columns={'name': 'county'})
        .drop(index=trimmed_rows)
        .reset_index(drop=True)
    )

    # Default to the mean for every column ...
    weekly_plan = dict.fromkeys(daily.columns, 'mean')
    # ... leave these columns out of the result altogether ...
    for unused in ('sunrise', 'sunset'):
        del weekly_plan[unused]
    # ... and keep the last value / the most frequent value where a mean makes no sense
    weekly_plan.update({
        'county': 'last',
        'datetime': 'last',
        'preciptype': most_frequent,
        'conditions': most_frequent,
    })

    # Consecutive blocks of 7 rows share the same group key
    week_number = daily.index // 7
    weekly = daily.groupby(week_number).agg(weekly_plan)

    # Append to the destination file; the header is emitted only when requested
    weekly.to_csv(dest_csv_file_path, mode='a', index=False, header=bool(first))
    print(f"Appended cleaned weather data for {county} county to CSV!")

In [9]:
# Aggregate every county in turn; only the very first call writes the CSV header
for i, county_record in enumerate(counties):
    county_name = county_record[0]
    process_weather_data(county_name, cleaned_weather_csv_path, i == 0)

Processing weather data for BRONX county...
Appended cleaned weather data for BRONX county to CSV!
Processing weather data for KINGS county...
Appended cleaned weather data for KINGS county to CSV!
Processing weather data for RICHMOND county...
Appended cleaned weather data for RICHMOND county to CSV!
Processing weather data for ALBANY county...
Appended cleaned weather data for ALBANY county to CSV!
Processing weather data for SCHENECTADY county...
Appended cleaned weather data for SCHENECTADY county to CSV!
Processing weather data for CHAUTAUQUA county...
Appended cleaned weather data for CHAUTAUQUA county to CSV!
Processing weather data for DUTCHESS county...
Appended cleaned weather data for DUTCHESS county to CSV!
Processing weather data for CORTLAND county...
Appended cleaned weather data for CORTLAND county to CSV!
Processing weather data for RENSSELAER county...
Appended cleaned weather data for RENSSELAER county to CSV!
Processing weather data for CAYUGA county...
Appended cle