# Data Collection

## Install Dependencies

In [None]:
%%capture
%pip install bs4

## Import Modules

In [None]:
import os
from zipfile import ZipFile

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Mount Drive

In [None]:
import os

team_name = 'capstone-power-grid-protagonists'
colab_path = f'/content/drive/Shareddrives/{team_name}/project'
studiolab_path = f'/home/studio-lab-user/sagemaker-studiolab-notebooks/{team_name}'

# Resolve the project root by trying each supported environment in turn:
# Google Colab -> AWS SageMaker Studio Lab -> the current working directory.
try:
    # Try to mount Google Drive and set project path
    from google.colab import drive
    drive.flush_and_unmount()
    drive.mount('/content/drive')
    print('')

    root_path = colab_path
    os.chdir(root_path)

except Exception:
    # Not running on Colab (import failed) or the shared drive is unavailable
    try:
        # Try to set AWS SageMaker Studio Lab project path
        root_path = studiolab_path
        os.chdir(root_path)

    except OSError:
        # Fall back to the current working directory as root path
        root_path = os.getcwd()

        # If the current folder is 'notebooks', move up one level
        if root_path.endswith('/notebooks'):
            root_path = os.path.dirname(root_path)
            os.chdir(root_path)

print('Current working directory is:')
print(os.getcwd())

Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive

Current working directory is:
/content/drive/Shareddrives/capstone-power-grid-protagonists/project


## Download Data

In [None]:
def set_download_folder(folder):
    """Create ``data/raw/<folder>`` under the project root and switch into it.

    Args:
        folder: Name of the sub-folder inside ``<root_path>/data/raw``.

    Side effects:
        Creates the directory (including missing parents) when it does not
        exist yet and changes the process working directory to it. Relies on
        the module-level ``root_path`` being set beforehand.
    """
    path = os.path.join(root_path, 'data', 'raw', folder)

    # exist_ok avoids the separate "does it exist?" check before creating
    os.makedirs(path, exist_ok=True)

    # Change working directory
    os.chdir(path)

### EIA

In [None]:
# Create data/raw/eia under the project root (if missing) and make it the
# working directory, so the EIA files downloaded next are saved there
set_download_folder('eia')

#### EIA930 Interchange

Form EIA-930 data collection provides a centralized and comprehensive source for hourly operating data about the high-voltage bulk electric power grid in the Lower 48 states. We collect the data from the electricity balancing authorities (BAs) that operate the grid.

We publish hourly operating data for individual BAs exactly as we receive these reported data. However, hourly U.S. and regional aggregations and all daily data aggregations follow procedures to manage anomalous values of some data elements. We advise caution when using these data.

In [None]:
# Download file
url = 'https://www.eia.gov/electricity/gridmonitor/sixMonthFiles/EIA930_INTERCHANGE_2022_Jan_Jun.csv'
req = requests.get(url, allow_redirects=True)
# Fail loudly instead of saving an HTTP error page as if it were the data
req.raise_for_status()
# Use the lower-cased last URL segment as the local file name
name = url.split('/')[-1].lower()
with open(name, 'wb') as file:
    file.write(req.content)

103418235

#### EIA930 Reference Tables

In [None]:
# Download file
url = 'https://www.eia.gov/electricity/930-content/EIA930_Reference_Tables.xlsx'
req = requests.get(url, allow_redirects=True)
# Fail loudly instead of saving an HTTP error page as if it were the data
req.raise_for_status()
# Use the lower-cased last URL segment as the local file name
name = url.split('/')[-1].lower()
with open(name, 'wb') as file:
    file.write(req.content)

38343

In [None]:
# Get the most recent list of balancing authorities as recognized on the EIA-930 survey form

url = 'https://www.eia.gov/electricity/930-content/EIA930_Reference_Tables.xlsx'

def get_eia930_ba_list(eia930_location_url):
    """Return the balancing authority codes listed in the EIA-930 reference tables.

    From the EIA website (https://www.eia.gov/electricity/gridmonitor/about):
    BA acronyms are based on official NERC BA codes, so codes may not match
    the full name of BAs because of changes in corporate ownership, mergers,
    or other business transactions.

    Args:
        eia930_location_url: URL or path of the EIA-930 reference tables
            Excel workbook.

    Returns:
        The values of the workbook's ``'BA Code'`` column, as read by
        ``pd.read_excel`` with its default sheet selection.
    """
    # Read from the location the caller asked for rather than a fixed URL
    eia930_ref_table = pd.read_excel(eia930_location_url)

    return eia930_ref_table['BA Code'].values

#### EIA's API
Through EIA's API, the most recent time series data can be obtained directly. This is the data that was used for the forecasting tasks in the project. The code below queries the API for the most up-to-date daily balancing authority data, specifically: Demand (D), Net Generation (NG), Demand Forecast (DF), and Energy Interchange between neighboring BAs (EI).

**NOTE: To access data from the API, you must register for a free access key from the EIA here: [https://www.eia.gov/opendata/](https://www.eia.gov/opendata/)**

In [None]:
# Sample input parameters

# Monthly and hourly to come soon
frequency = 'daily'
# Can also select NG, DF, or EI (though not all balancing authorities will have data for each category)
data_type = 'D'
# Sample BA used for illustrations
ba_code = 'FPL'
# API key: the EIA_API_KEY environment variable takes precedence, so personal
# keys can stay out of the notebook.
# NOTE(review): the fallback below is a credential committed to source. Paul
# provided it for reproducing results in this SIADS 697 class and intends to
# delete it afterwards; it should be revoked and removed once no longer needed.
api_key = os.environ.get('EIA_API_KEY') or '6e1b1996731d78aa3d7803e2de270e75'

In [None]:
def generate_query_url(frequency='daily', data_type='D', ba_code=None, api_key=None):
    """Build the EIA API v2 query URL for one balancing authority.

    Args:
        frequency: Data frequency; only ``'daily'`` is supported.
        data_type: Value for the ``type`` facet (e.g. ``'D'``).
        ba_code: Value for the ``respondent`` facet (e.g. ``'FPL'``).
        api_key: EIA API key placed in the query string.

    Returns:
        The query URL as a string, or ``None`` (after printing a notice) when
        ``frequency`` is not ``'daily'``.
    """
    if frequency != 'daily':
        print('This function currently only supports frequency="daily".')
        return None

    desired_dataset = 'daily-region-data'
    api_route = f'electricity/rto/{desired_dataset}'
    return (
        f'https://api.eia.gov/v2/{api_route}/data'
        f'?api_key={api_key}'
        f'&data[]=value'
        f'&facets[respondent][]={ba_code}'
        f'&facets[type][]={data_type}'
        f'&facets[timezone][]=Eastern'
        f'&frequency={frequency}'
        f'&length=500000'
    )

In [None]:
def api_query_df_generation(query_url):
    """Query the EIA API and load the returned records into a DataFrame.

    Args:
        query_url: Full API request URL, e.g. as built by
            ``generate_query_url``.

    Returns:
        A DataFrame built from ``response.data`` of the JSON payload, with the
        record keys as columns, or ``None`` (after printing a notice) when the
        request fails or the payload does not have that structure.
    """
    try:
        # Request the URL passed in, not whatever global `url` happens to hold
        r = requests.get(query_url)
        r.raise_for_status()

        # Decode the JSON payload
        json_data_ba = r.json()

        # Put response-level data into a pandas df with the keys as columns
        return pd.DataFrame.from_dict(json_data_ba['response']['data'], orient='columns')

    except (requests.RequestException, ValueError, KeyError, TypeError) as err:
        # Only the error type is printed: the URL (and some request errors)
        # embed the API key, which should not end up in notebook output
        print(f'Issue with generating df from API request: {type(err).__name__}')
        return None

In [None]:
def clean_df_generated_from_api(df_ba):
    """Reduce an API result to the period column plus one descriptively named value column.

    The value column is renamed to ``<respondent>_<type>(MW-h)`` (taken from
    the first respondent and type found in the data) so that the information
    about what the series is and its unit is not lost.

    Args:
        df_ba: DataFrame with at least the columns ``period``, ``respondent``,
            ``type`` and ``value``. It is not modified.

    Returns:
        A new DataFrame with the columns ``period`` and the renamed value
        column, or ``None`` (after printing a notice) when ``df_ba`` does not
        have the expected structure.
    """
    try:
        # Get ba_code and data_type from the df itself
        respondent_ba = df_ba['respondent'].unique()[0]
        data_type = df_ba['type'].unique()[0]

        # Selecting columns and renaming both return new objects, so the
        # caller's frame is left untouched and no in-place edits are needed
        value_column = f'{respondent_ba}_{data_type}(MW-h)'
        return df_ba[['period', 'value']].rename(columns={'value': value_column})

    except (AttributeError, KeyError, IndexError, TypeError) as err:
        print(f'Issue with cleaning df: {type(err).__name__}: {err}')
        return None

In [None]:
def export_df_as_csv_to_folder(df_ba_copy, ba_code, data_path):
    """Write a cleaned balancing authority DataFrame to a CSV file.

    The file is named ``<second column name>_(<first period> to <last period>).csv``
    and placed in ``data_path``.

    Args:
        df_ba_copy: DataFrame whose ``period`` column supplies the dates and
            whose second column name describes the data set.
        ba_code: Balancing authority code, used in the printed messages.
        data_path: Destination folder.

    Returns:
        None. A success or failure message is printed.
    """
    try:
        # Label used in the success message (kept as a local instead of being
        # attached to the caller's DataFrame)
        df_name = f'{ba_code}_df'

        # First/last period by position, so any index (not only 0..n-1) works
        periods = df_ba_copy['period']
        data_start_date = periods.iloc[0]
        data_end_date = periods.iloc[-1]

        # Getting the data name for dataset naming purposes
        dataset_info = df_ba_copy.columns[1]

        # Exporting the data as a CSV
        file_name = f'{dataset_info}_({data_start_date} to {data_end_date}).csv'
        csv_path = os.path.join(data_path, file_name)
        df_ba_copy.to_csv(csv_path)

    except (AttributeError, KeyError, IndexError, TypeError, OSError):
        print(f'Issue with exporting the df for {ba_code}')
        return None

    print(f'{df_name} successfully exported to location {csv_path}')
    return None

##### Get Data for All Available Balancing Authorities

In [None]:
# def run_pipeline_for_all_ba(ba_list_eia930,frequency, data_type):
  # for ba_code in ba_list_eia930:
  # #running the pipeline for ba
  #   url_output = generate_query_url(frequency, data_type, ba_code, api_key)
  #   df_generated = api_query_df_generation(url_output)
  #   df_cleaned = clean_df_generated_from_api(df_generated)
  #   export_df_as_csv_to_folder(df_cleaned, ba_code, data_path)

### EPA

In [None]:
# Create data/raw/epa under the project root (if missing) and make it the
# working directory, so the EPA files downloaded next are saved there
set_download_folder('epa')

#### eGRID
The Emissions & Generation Resource Integrated Database (eGRID) is a comprehensive source of data from EPA's Clean Air Markets Division on the environmental characteristics of almost all electric power generated in the United States.

The data includes emissions, emission rates, generation, heat input, resource mix, and many other attributes. eGRID is typically used for greenhouse gas registries and inventories, carbon footprints, consumer information disclosure, emission inventories and standards, power market changes, and avoided emission estimates.

In [None]:
# Download data from 2018 - 2020
urls = ['https://www.epa.gov/system/files/documents/2022-01/egrid2020_data.xlsx',
        'https://www.epa.gov/sites/default/files/2021-02/egrid2019_data.xlsx',
        'https://www.epa.gov/sites/default/files/2020-03/egrid2018_data_v2.xlsx']

for url in urls:
    req = requests.get(url, allow_redirects=True)
    # Fail loudly instead of saving an HTTP error page as if it were the data
    req.raise_for_status()
    # Use the lower-cased last URL segment as the local file name
    name = url.split('/')[-1].lower()
    with open(name, 'wb') as file:
        file.write(req.content)

In [None]:
# Download data from 1996 - 2016
url = 'https://www.epa.gov/sites/default/files/2020-01/egrid2018_historical_files_since_1996.zip'
req = requests.get(url, allow_redirects=True)
# Fail loudly instead of saving an HTTP error page as if it were the archive
req.raise_for_status()
name = url.split('/')[-1].lower()
with open(name, 'wb') as file:
    file.write(req.content)

# Extract the eGRID data workbooks (.xls/.xlsx) from the ZIP file and
# lower-case their names
with ZipFile(name, 'r') as zipObj:
    for fileName in zipObj.namelist():
        lowerName = fileName.lower()
        if 'data.xls' in lowerName or 'data_v2.xls' in lowerName:
            zipObj.extract(fileName)

            # If the member sits in a sub-folder, make sure the lower-cased
            # folder exists so the rename below has somewhere to land
            targetDir = os.path.dirname(lowerName)
            if targetDir:
                os.makedirs(targetDir, exist_ok=True)
            os.rename(fileName, lowerName)

# Remove ZIP file
os.remove(name)

### HIFLD

In [None]:
# Create data/raw/hifld under the project root (if missing) and make it the
# working directory, so the HIFLD files downloaded next are saved there
set_download_folder('hifld')

#### Electric Substations
This feature class/shapefile represents electric power substations primarily associated with electric power transmission. In this layer, substations are considered facilities and equipment that switch, transform, or regulate electric power at voltages equal to, or greater than, 69 kilovolts. Substations with a maximum operating voltage less than 69 kilovolts may be included, depending on the availability of authoritative sources, but coverage of these features should not be considered complete. The Substations feature class/shapefile includes taps, a location where power on a transmission line is tapped by another transmission line.

In [None]:
# Download file
url = 'https://opendata.arcgis.com/api/v3/datasets/39567c09b9d1491b892b3bbb065e77ef_0/downloads/data?format=csv&spatialRefId=4326&where=1%3D1'
req = requests.get(url, allow_redirects=True)
# Fail loudly instead of saving an HTTP error page as if it were the data
req.raise_for_status()
name = 'electric_substations.csv'
with open(name, 'wb') as file:
    file.write(req.content)

22214648

#### Power Plants
This feature class/shapefile represents electric power plants. A power plant comprises all the land and land rights, structures and improvements, boiler or reactor vessel equipment, engines and engine-driven generators, turbo generator units, accessory electric equipment, and miscellaneous power plant equipment, grouped together for each individual facility. Included are the following plant types: hydroelectric dams, fossil fuel (coal, natural gas, or oil), nuclear, solar, wind, geothermal, and biomass.

In [None]:
# Download file
url = 'https://opendata.arcgis.com/api/v3/datasets/9dd630378fcf439999094a56c352670d_0/downloads/data?format=csv&spatialRefId=3857&where=1%3D1'
req = requests.get(url, allow_redirects=True)
# Fail loudly instead of saving an HTTP error page as if it were the data
req.raise_for_status()
name = 'power_plants.csv'
with open(name, 'wb') as file:
    file.write(req.content)

6055494

#### Electric Power Transmission Lines
This feature class/shapefile represents electric power transmission lines. Transmission Lines are the system of structures, wires, insulators and associated hardware that carry electric energy from one point to another in an electric power system. Lines are operated at relatively high voltages varying from 69 kV up to 765 kV, and are capable of transmitting large quantities of electricity over long distances. Underground transmission lines are included where sources were available.

In [None]:
# Download file (CSV)
url = 'https://opendata.arcgis.com/api/v3/datasets/468e9601b9b7407396e5c4f59772f1ff_0/downloads/data?format=csv&spatialRefId=3857&where=1%3D1'
req = requests.get(url, allow_redirects=True)
# Fail loudly instead of saving an HTTP error page as if it were the data
req.raise_for_status()
name = 'electric_power_transmission_lines.csv'
with open(name, 'wb') as file:
    file.write(req.content)

# Download file (GeoJSON)
url = 'https://opendata.arcgis.com/api/v3/datasets/468e9601b9b7407396e5c4f59772f1ff_0/downloads/data?format=geojson&spatialRefId=4326&where=1%3D1'
req = requests.get(url, allow_redirects=True)
req.raise_for_status()
name = 'electric_power_transmission_lines.geojson'
with open(name, 'wb') as file:
    file.write(req.content)

155515969

### DOE

In [None]:
# Create data/raw/doe under the project root (if missing) and make it the
# working directory, so the DOE files downloaded next are saved there
set_download_folder('doe')

#### Electric Disturbance Events
The Electric Emergency Incident and Disturbance Report (Form DOE-417) collects information on electric incidents and emergencies. The Department of Energy uses the information to fulfill its overall national security and other energy emergency management responsibilities, as well as for analytical purposes.

In [None]:
# Parse webpage
url = 'https://www.oe.netl.doe.gov/'
page = 'OE417_annual_summary.aspx'
req = requests.get(url + page)
req.raise_for_status()
soup = BeautifulSoup(req.text, 'html.parser')

# Extract download links from table
table = soup.find('table', id='MainContent_GridView1')
if table is None:
    # Without this guard a changed page layout surfaces as an opaque
    # AttributeError on None further down
    raise RuntimeError(f'Annual summary table not found on {url + page}')

links = {}
for row in table.find_all('tr')[1:]:  # first row is skipped (presumably the header)
    row_data = row.find_all('td')
    if len(row_data) < 3:
        # Rows without a year cell and a link cell carry nothing to download
        continue
    year = row_data[0].text.strip()
    link = row_data[2].a
    if link and link.get('href'):
        links[year] = url + link.get('href')

# Download files
for year, link_url in links.items():
    req = requests.get(link_url, allow_redirects=True)
    # Fail loudly instead of saving an HTTP error page as if it were the data
    req.raise_for_status()
    name = year + '_annual_summary.xls'
    with open(name, 'wb') as file:
        file.write(req.content)

### State & County Data

In [None]:
# Download states and abbreviations to assist with extraction in data set
# NOTE(review): no set_download_folder call precedes this section, so the file
# is written to whatever working directory the previous cell left active
# (data/raw/doe when the notebook is run top to bottom) - confirm this is intended.
url = 'https://worldpopulationreview.com/static/states/name-abbr.csv'
req = requests.get(url, allow_redirects=True)
# Fail loudly instead of saving an HTTP error page as if it were the data
req.raise_for_status()
name = 'states.csv'
with open(name, 'wb') as file:
    file.write(req.content)

850

#### US Census

In [None]:
# Download State and County FIPS data for linking events
url = 'https://www2.census.gov/programs-surveys/popest/geographies/2017/all-geocodes-v2017.xlsx'
req = requests.get(url, allow_redirects=True)
# Fail loudly instead of saving an HTTP error page as if it were the data
req.raise_for_status()
name = 'counties.xlsx'
with open(name, 'wb') as file:
    file.write(req.content)

1785556

In [None]:
# Download Census Data
url = 'https://www2.census.gov/programs-surveys/popest/datasets/2020-2021/counties/totals/co-est2021-alldata.csv'
req = requests.get(url, allow_redirects=True)
# Fail loudly instead of saving an HTTP error page as if it were the data
req.raise_for_status()
name = 'census.csv'
with open(name, 'wb') as file:
    file.write(req.content)

645814

### Local Temperatures

The code below downloads temperature data specifically for Miami, since that was the only sample we had time to test.

In [None]:
# Download Florida temperature data
url = 'https://kilthub.cmu.edu/ndownloader/files/32874272'

# NOTE(review): this definition reuses the name of the EIA API helper defined
# in the "EIA's API" section, so running this cell replaces that function.
# The name is kept here to avoid breaking existing callers - consider renaming.
def api_query_df_generation(url):
    """Load the daily temperature columns from a CSV and keep rows after 2015-01-01.

    Args:
        url: URL or path of a CSV file with ``Date``, ``tmax`` and ``tmin``
            columns.

    Returns:
        A DataFrame with the columns ``Date`` (parsed as dates), ``tmax`` and
        ``tmin`` restricted to rows whose date is later than 2015-01-01, or
        ``None`` (after printing a notice) when the file cannot be read.
    """
    try:
        # Read in only desired columns to reduce the size of the data
        fpl_temp = pd.read_csv(url, usecols=['Date', 'tmax', 'tmin'], parse_dates=['Date'])
    except (OSError, ValueError) as err:
        # OSError covers network/file failures; ValueError covers parsing
        # problems such as missing columns
        print('There was an error reading the data from the URL to a pandas dataframe.')
        print(f'{type(err).__name__}: {err}')
        return None

    # Remove dates that span prior to the available EIA API data
    return fpl_temp[fpl_temp['Date'] > '2015-01-01']