This notebook scrapes all of the data I plan to visualize from the NOAA API. Doing it in one place lets me consolidate the output instead of keeping separate CSVs and scraping scripts for each type of measurement.

In [None]:
import requests 
import numpy as np
import pandas as pd

from my_api_info import get_noaa_token
from time import sleep
from MI_info import stationids_to_FIPS, FIPS_to_county

In [None]:
def get_month_data(YYYY_MM: str,
                   timeout: float = 60) -> requests.models.Response:
    """
    Given an input string representing a month (in the format
    YYYY-MM, e.g. '2024-01' for January of 2024), returns Response
    from NOAA API with desired monthly data for each MI station.

    timeout is the number of seconds to wait on the server before
    giving up. Without it, requests waits indefinitely and a stalled
    connection would hang the whole scrape.

    The Response is returned as-is (whatever its status code); the
    caller is responsible for checking r.status_code.

    Raises TimeoutError if the server does not answer within
    `timeout` seconds. Other requests exceptions (e.g. connection
    errors) propagate unchanged.
    """
    # Desired date range: start and end are both the first of the month.
    # NOTE(review): presumably GSOM records are stamped with the first
    # day of their month, so this selects exactly one month -- confirm.
    start_date = YYYY_MM + '-01'

    # API specifications
    base_url = 'https://www.ncei.noaa.gov/cdo-web/api/v2'
    extension = '/data'
    url = base_url + extension

    headers = {'token': get_noaa_token()}
    # NOTE(review): the CDO v2 docs appear to name the station filter
    # 'stationid', not 'station' -- confirm this key has any effect.
    # NOTE(review): there is no paging here, so a month with more than
    # `limit` records would presumably come back truncated -- confirm.
    parameters = {'datasetid': 'GSOM',
                  'startdate': start_date,
                  'enddate': start_date,
                  'units': 'metric',
                  'datatypeid': 'PRCP,SNOW,TMIN,TMAX',
                  'station': 'GHCND:US',
                  'locationid': 'FIPS:' + '26',
                  'limit': 1000,
                  'includemetadata': 'false'}
    try:
        r = requests.get(url,
                         headers=headers,
                         params=parameters,
                         timeout=timeout)
    except requests.exceptions.Timeout as err:
        # requests' Timeout is not a subclass of the builtin
        # TimeoutError; translate it so `except TimeoutError` works.
        raise TimeoutError(
            f'NOAA API request for {YYYY_MM} timed out after {timeout} s'
        ) from err

    return r

In [None]:
# Empty master table that add_data() fills in, one row per record
master_df = pd.DataFrame(
    data=[],
    columns=[
        'station',
        'FIPS',
        'county_name',
        'date',
        'data_type',
        'value',
    ],
)

In [None]:
# Code for adding requested JSON data to DataFrame
def add_data(r: "requests.models.Response", start_idx: int) -> None:
    """
    Adds json data from r to master_df. start_idx offsets the row
    labels so that we don't overwrite previous data.

    Does nothing if the JSON body has no 'results' entry (or it is
    empty), so a month without data is skipped instead of raising
    KeyError.

    Stations whose id is not found in stationids_to_FIPS (or whose
    FIPS is not found in FIPS_to_county) are still recorded, with
    FIPS set to NaN and county_name set to 'NO FIPS'.
    """
    results = r.json().get('results')
    if not results:
        return

    # Create DataFrame
    df = pd.DataFrame.from_dict(results)
    # errors='ignore': tolerate payloads that lack an 'attributes' field
    df = df.drop(columns=['attributes'], errors='ignore')

    # Add DataFrame to master_df
    for i in df.index:
        i_ = start_idx + i
        station = df.loc[i, 'station']
        date = df.loc[i, 'date']
        data_type = df.loc[i, 'datatype']
        value = df.loc[i, 'value']

        # Lookup key is the text after the last ':' (the whole id if
        # there is none). Unlike tuple-unpacking split(':'), this never
        # raises on an unexpected number of colons.
        station_code = station.rpartition(':')[2]
        try:
            fips = stationids_to_FIPS[station_code]
            # NOTE(review): FIPS_to_county keys are presumably
            # zero-padded strings, hence the leading '0' -- verify.
            county_name = FIPS_to_county['0' + str(fips)]
        except KeyError:
            fips = np.nan
            county_name = 'NO FIPS'

        # The full, prefixed station id is what gets stored
        master_df.loc[i_, 'station'] = station
        master_df.loc[i_, 'FIPS'] = fips
        master_df.loc[i_, 'data_type'] = data_type
        master_df.loc[i_, 'county_name'] = county_name
        master_df.loc[i_, 'date'] = date
        master_df.loc[i_, 'value'] = value

In [None]:
# Main scraping loop. Scrapes a block of years at a time (about 10
# years seems like a reasonable range). Modify the starting segment
# of the YYYY and the range.

# I learned the hard way that this code is really inefficient;
# using loc to insert is a very inefficient way of adding to a
# pandas DataFrame. Better would be to use concat, and better than
# that would be to insert data into a dict and then at the end
# create a DataFrame from this dict.

# Seconds to pause before the 1st, 2nd and 3rd attempt for a month.
# NOAA API requests limited to 5 per second; the first pause is just
# being cautious, the later ones back off after a failure.
RETRY_PAUSES = (1.5, 5, 10)

for j in range(20, 24): # year loop
    YYYY = '20' + str(j)
    for i in range(1, 13): # month loop
        print(f'Working on Year {YYYY}, Month {i}')
        MM = f'{i:02d}' # zero-padded month
        YYYY_MM = YYYY + '-' + MM

        for attempt, pause in enumerate(RETRY_PAUSES, start=1):
            if attempt == 2:
                print(f'Retrying Year {YYYY} Month {MM}')
            elif attempt > 2:
                print(f'Retrying again...')
            sleep(pause)

            try:
                r = get_month_data(YYYY_MM)
            # requests raises its own exception classes, which the
            # builtin TimeoutError alone would not catch
            except (TimeoutError,
                    requests.exceptions.RequestException) as err:
                print(f'Year {YYYY} Month {i} request failed: {err}')
                continue # counts as a failed attempt

            status_code = r.status_code
            if attempt > 1:
                print(f'Year {YYYY} Month {i} status: {status_code}')
            if status_code == 200: # Make sure request succeeded
                add_data(r, len(master_df))
                break
        else: # Every attempt failed, give up on this month
            print(f'**************** Failed for Year {YYYY} Month {MM} :( ****************')

In [None]:
# Display up to 5 random rows of the final DataFrame. The size is
# clamped because sample() raises ValueError when asked for more
# rows than the DataFrame holds (e.g. if the scrape returned little
# or nothing).
master_df.sample(min(5, len(master_df)))

In [None]:
# Uncomment to save as CSV
#master_df.to_csv('../data/bulk_data/2020-2023_bulk.csv')