In [None]:
%reload_ext autoreload
%autoreload 2

# import packages
import pandas as pd
import numpy as np
import geopandas as gpd
import plotly.express as px
import requests
from shapely.ops import nearest_points
from os import path

# Overview
In order to identify which buildings in the NREL EULP dataset are located in each climate zone in each balancing authority (to select our stratified random sample), we need to identify all of the census tract IDs and TMY3 locations that are located in each BA-climate zone pair.

The NREL commercial building metadata identifies an NHGIS census tract identifier for each commercial building, and the residential building metadata identifies a "weather file location" (which is a TMY3 location) for each residential building.

Before creating these crosswalks, we check that Carbonara carbon intensity (CI) data is available for each BA of interest. If data is not available for a BA, we remove that BA.

### Downloaded data sources
United States Environmental Protection Agency (EPA). 2022. “Emissions & Generation Resource Integrated Database (eGRID), 2020” Washington, DC: Office of Atmospheric Programs, Clean Air Markets Division. Available from EPA’s eGRID web site: https://www.epa.gov/egrid.
- eGRID2019 data: https://www.epa.gov/sites/default/files/2021-02/egrid2019_data.xlsx

United States Energy Information Administration (EIA). "Hourly Electric Grid Monitor (EIA Form-930)." Available from: https://www.eia.gov/electricity/gridmonitor/about
- 2019 generation balance data from EIA-930: https://www.eia.gov/electricity/gridmonitor/sixMonthFiles/EIA930_BALANCE_2019_Jul_Dec.csv and https://www.eia.gov/electricity/gridmonitor/sixMonthFiles/EIA930_BALANCE_2019_Jan_Jun.csv


GIS shapefiles downloaded from the U.S. Department of Homeland Security's "Homeland Infrastructure Foundation-Level Data (HIFLD)"

- Electric Planning Areas: https://hifld-geoplatform.opendata.arcgis.com/datasets/geoplatform::electric-planning-areas-1/explore (NOTE: In this dataset, the PACW and PACE BAs are combined as a single multipolygon, as are the CPLW and CPLE BAs. Prior to loading this shapefile into this notebook, the authors manually separated these multipolygons into their component parts using QGIS software)

GIS Shapefiles downloaded from the U.S. Energy Information Administration's "U.S. Energy Atlas"
- Climate Zones - DOE Building America Program: https://atlas.eia.gov/datasets/eia::climate-zones-doe-building-america-program/explore?location=35.902577%2C-95.221420%2C4.90 (NOTE: this shapefile was missing climate zone data for the southern tip of Florida (including Miami), so the authors manually corrected this in QGIS)

GIS Shapefiles from Steven Manson, Jonathan Schroeder, David Van Riper, Tracy Kugler, and Steven Ruggles. IPUMS National Historical Geographic Information System: Version 16.0 [dataset]. Minneapolis, MN: IPUMS. 2021. http://doi.org/10.18128/D050.V16.0
- 2019 U.S. Census Tracts

Wilson et al. 2021. End-Use Load Profiles for the U.S. Building Stock: Methodology and Results of Model Calibration, Validation, and Uncertainty Quantification. NREL/TP-5500-80889 (forthcoming report). 
- Commercial and Residential metadata files

### Data inputs that were manually created
- `ba_tz.csv` identifies the UTC timezone offset for each balancing area in the US



# 1) Determine which Balancing Areas to Use

### Load BA metadata from eGRID

In [None]:
# Pull balancing-authority (BA) metadata out of the eGRID workbook.
# Keys are the column headers read from the sheet; values are the names used in this notebook.
column_names = {
    'BANAME': 'ba_name',
    'BACODE': 'ba_code',
    'BANGENAN': 'net_generation',
    'BACO2AN': 'emissions',
}

ba_meta = (
    pd.read_excel(
        '../data/downloaded/egrid/egrid2019_data.xlsx',
        sheet_name='BA19',
        header=1,  # the column headers are taken from the second row of the sheet
        usecols=list(column_names),  # read exactly the columns that get renamed
    )
    .rename(columns=column_names)
)

# preview the first few BAs
ba_meta.head(3)

### Load information about the timezone offset for each BA

In [None]:
# get the tz offset for each BA for use when downloading singularity data
# The offsets file is only (re)generated when it does not already exist on disk.
if not path.exists('../data/manual/ba_tz.csv'):
    tz_offsets = pd.read_csv(
        '../data/downloaded/eia/EIA930_BALANCE_2019_Jan_Jun.csv',
        usecols=['Balancing Authority','Local Time at End of Hour','UTC Time at End of Hour'],
        parse_dates=['Local Time at End of Hour','UTC Time at End of Hour'],
    )

    # a single (first) row per BA is used to derive that BA's offset
    tz_offsets = tz_offsets.drop_duplicates(subset='Balancing Authority', keep='first')

    # whole hours between UTC and local time.
    # Floor-dividing by a one-hour Timedelta yields integers directly; the previous
    # `.astype('timedelta64[h]')` is rejected by pandas >= 2.0, which only supports
    # s/ms/us/ns timedelta resolutions.
    utc_minus_local = tz_offsets['UTC Time at End of Hour'] - tz_offsets['Local Time at End of Hour']
    tz_offsets['offset'] = (utc_minus_local // pd.Timedelta(hours=1)).astype(int)

    print(dict(zip(tz_offsets['Balancing Authority'], tz_offsets['offset'])))

    tz_offsets.to_csv('../data/manual/ba_tz.csv')

### Load EIA-930 data for 2019 and get sums for each BA

In [None]:
# columns read from each EIA-930 balance file
columns_to_use = ['Balancing Authority', 'Data Date', 'Hour Number',
     'Demand (MW) (Adjusted)', 'Net Generation (MW) (Adjusted)', 'Net Generation (MW)',
       'Net Generation (MW) from Coal', 'Net Generation (MW) from Natural Gas',
       'Net Generation (MW) from Nuclear',
       'Net Generation (MW) from All Petroleum Products',
       'Net Generation (MW) from Hydropower and Pumped Storage',
       'Net Generation (MW) from Solar', 'Net Generation (MW) from Wind',
       'Net Generation (MW) from Other Fuel Sources',
       'Net Generation (MW) from Unknown Fuel Sources']

# the two half-year files that together cover 2019
eia_930_files = ['../data/downloaded/eia/EIA930_BALANCE_2019_Jan_Jun.csv',
                 '../data/downloaded/eia/EIA930_BALANCE_2019_Jul_Dec.csv']

# load the data from EIA-930 for 2019
eia_930 = pd.concat([pd.read_csv(csv_file, usecols=columns_to_use, thousands=',') for csv_file in eia_930_files])

# sum by balancing authority
# numeric_only=True keeps the text 'Data Date' column out of the sum; on pandas >= 2.0 the
# default would otherwise concatenate every date string within each group
eia_930 = eia_930.groupby('Balancing Authority').sum(numeric_only=True)
eia_930

### Identify relevant BAs and view size of BA by total demand

In [None]:
# drop any generation-only balancing authorities (according to https://www.eia.gov/electricity/gridmonitor/about)
gen_only_bas = ['AVRN','DEAA','EEI','GRID','GRIF','GWA','HGMA','SEPA','WWA','YAD'] #NOTE: GRMA also generation-only, but retired in 2018
# errors='ignore' keeps this cell re-runnable: once the rows are gone (or if a code is absent
# from the data) a plain drop() would raise a KeyError
eia_930 = eia_930.drop(gen_only_bas, errors='ignore')

# add the remaining BAs to the list that we should consider
bas_to_consider = list(eia_930.index)

# each BA's share (in percent) of the summed adjusted demand across the remaining BAs
eia_930['Percent Demand'] = eia_930['Demand (MW) (Adjusted)'] / eia_930['Demand (MW) (Adjusted)'].sum() * 100

# view the BAs from largest to smallest demand
eia_930.sort_values(by='Demand (MW) (Adjusted)', ascending=False)

## Check that Carbonara data is available for all of our relevant BAs
NOTE: This only needs to be run once. We ran it and identified that all BAs are available

In [None]:
# determine for which of these BAs data is available in the singularity API
# NOTE: this check only needs to be run once, so it is switched off by default.
# Set the flag to True to re-run it (it reads the API key from ../data/api_credentials.csv).
# Previously the code was disabled by wrapping it in a bare string, which Jupyter echoed
# back as the cell's output.
CHECK_SINGULARITY_AVAILABILITY = False


def singularity_region_has_data(region, start, end, event_type, header):
    """Return True if the region_events search returns at least one record for `region`.

    Parameters
    ----------
    region : str
        Region code placed in the `region` query parameter.
    start, end : str
        Already URL-encoded timestamps placed in the `start` / `end` query parameters.
    event_type : str
        Value of the `event_type` query parameter.
    header : dict
        HTTP headers sent with the request (carries the API key).
    """
    output = requests.get(f'https://api.singularity.energy/v1/region_events/search?region={region}&start={start}&end={end}&event_type={event_type}&per_page=1000&page=1', headers=header)
    return not pd.json_normalize(output.json(), 'data').empty


if CHECK_SINGULARITY_AVAILABILITY:
    # LOAD API CREDENTIALS from CSV file
    api_credentials = pd.read_csv('../data/api_credentials.csv', index_col='API').to_dict()
    api_key = api_credentials['PASSWORD']['Singularity']

    # define parameters for API call
    event_type = 'carbon_intensity'
    header = {'X-Api-Key': api_key}

    start = '2019-02-02T00:00:00%2B00:00'
    end = '2019-02-02T16:45:00%2B00:00'

    data_exists = []
    data_dne = []
    for ba in bas_to_consider:
        if singularity_region_has_data(ba, start, end, event_type, header):
            data_exists.append(ba)
            print(f'{ba}: Data Exists')
            continue

        # see if the EIA-appended version exists
        eia_ba = f'EIA.{ba}'
        if singularity_region_has_data(eia_ba, start, end, event_type, header):
            data_exists.append(ba)
            print(f'{ba}: Data Exists ({eia_ba})')
        else:
            data_dne.append(ba)
            print(f'{ba}: No Data')



# 2) Identify the set of all climate zones in each BA

### Identify the BAs that are not included in the GIS shapefile and will have to be manually inputted later

In [None]:
# Sanity check: every BA kept from EIA-930 should have a polygon in the planning-area shapefile

# read the planning-area polygons, keeping only name, code and geometry under notebook-style column names
ba_shp = (
    gpd.read_file('../data/downloaded/gis_shapefiles/Electric_Planning_Areas/Planning_Areas.shp')
    [['NAME','ABBRV','geometry']]
    .rename(columns={'NAME':'ba_name','ABBRV':'ba_code'})
)

# every distinct BA code that has a polygon
bas_in_shp = list(ba_shp['ba_code'].unique())

# BAs of interest that have no polygon in the shapefile
missing_bas = [ba for ba in bas_to_consider if ba not in bas_in_shp]

# show the eGRID metadata rows for those BAs
ba_meta[ba_meta['ba_code'].isin(missing_bas)]

### Create a GeoDataFrame of unique BA-Climate Zones

In [None]:
# get a list of all climate zones in each BA

# a climate zone covering less than this share of its BA's area is dropped
MIN_CZ_SHARE_OF_BA_AREA = 0.01
# NAD83 / Conus Albers, an equal-area projection, used only for measuring areas.
# NOTE(review): assumes every BA of interest lies in or near the contiguous US — confirm.
AREA_CRS = 'EPSG:5070'

# load the balancing area data
ba_shp = gpd.read_file('../data/downloaded/gis_shapefiles/Electric_Planning_Areas/Planning_Areas.shp')[['NAME','ABBRV','geometry']]
# rename some columns
ba_shp = ba_shp.rename(columns={'NAME':'ba_name','ABBRV':'ba_code'})

# only keep the BAs in our list
ba_shp = ba_shp[ba_shp['ba_code'].isin(bas_to_consider)]

# import climate zone data and create a new zone column that combines the climate and moisture codes
# ('N/A' / 'None' placeholders are blanked so they do not end up in the combined code)
climate_zones = gpd.read_file('../data/manual/gis/climate_zones_edited.shp').replace('N/A', '').replace('None', '')
climate_zones['climate_zone'] = climate_zones['IECC_Clima'].astype(str) + climate_zones['IECC_Moist'].astype(str).replace('None', '')

# create a new gdf that intersects balancing areas and climate zones
# NOTE(review): this yields one polygon per (BA polygon, climate-zone polygon) pair, so each
# BA-climate zone is only a single row if the climate zone shapefile has one feature per zone — confirm.
ba_cz = gpd.overlay(ba_shp.to_crs('EPSG:4326'), climate_zones, how='intersection')

# only keep certain columns
ba_cz = ba_cz[['ba_name','ba_code','climate_zone','geometry']]

# calculate the area of each ba_cz
# areas are measured in an equal-area projection; measuring directly in EPSG:4326 would give
# square degrees, which under-weights polygons at higher latitudes
ba_cz['area'] = ba_cz.to_crs(AREA_CRS).area

# set the index to be ba_code_cz
ba_cz = ba_cz.set_index(['ba_code','climate_zone'])

# calculate the fraction of its BA's area that each cz covers
# transform('sum') on the area column alone returns the BA total aligned to every row, and
# avoids summing the geometry column (which fails on pandas >= 2.0)
ba_cz['cz_pct_of_ba_area'] = ba_cz['area'] / ba_cz.groupby(level='ba_code')['area'].transform('sum')

# drop any climate zones that are less than 1% of the BA area
ba_cz = ba_cz[ba_cz['cz_pct_of_ba_area'] >= MIN_CZ_SHARE_OF_BA_AREA]

# drop the area and pct columns
ba_cz = ba_cz.drop(columns=['area','cz_pct_of_ba_area'])

ba_cz

# 3) Identify all of the census tracts in each BA-CZ

In [None]:
# Read the census tract polygons, keeping only the GISJOIN identifier and the geometry,
# and reproject them to EPSG:4326 so they share a CRS with the BA-climate zone polygons.
tracts = (
    gpd.read_file('../data/downloaded/gis_shapefiles/nhgis0001_shapefile_tl2019_us_tract_2019/US_tract_2019.shp')
    [['GISJOIN','geometry']]
    .to_crs('EPSG:4326')
)

# preview the first few tracts
tracts.head(3)

In [None]:
# determine which tracts are located in each BA_CZ
# a tract is attached to every BA-CZ polygon it intersects, so a tract that straddles a
# boundary appears under each polygon it touches
# (`predicate=` replaces the `op=` keyword, which geopandas deprecated in 0.10 and removed in 1.0)
ba_cz_tract = gpd.sjoin(ba_cz, tracts, how='left', predicate='intersects')

# drop columns and reset index
# after this only the BA / climate zone identifiers, ba_name and the tract GISJOIN remain
ba_cz_tract = ba_cz_tract.drop(columns=['geometry','index_right'])
ba_cz_tract = ba_cz_tract.reset_index()

ba_cz_tract.head(3)

In [None]:
# export the BA-climate zone to census tract crosswalk (without the row index)
ba_cz_tract.to_csv('../data/processed/ba_tract_crosswalk_2019.csv', index=False)

# 4) Identify all of the weather file locations in each BA-CZ

In [None]:
# Residential metadata file; only the weather-file city and its coordinates are read from it.
res_meta_url = 'https://oedi-data-lake.s3.amazonaws.com/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2021/resstock_tmy3_release_1/metadata/metadata.parquet'

# keep one row per distinct (city, latitude, longitude) combination
weather_file_locations = pd.read_parquet(
    res_meta_url,
    columns=['in.weather_file_city','in.weather_file_latitude','in.weather_file_longitude'],
).drop_duplicates()

# turn each location into a point (x = longitude, y = latitude) tagged as EPSG:4326
weather_file_locations = gpd.GeoDataFrame(
    weather_file_locations,
    geometry=gpd.points_from_xy(
        weather_file_locations['in.weather_file_longitude'],
        weather_file_locations['in.weather_file_latitude'],
    ),
).set_crs('EPSG:4326')

# (a spatial join that tagged each location with its climate zone is intentionally not applied here)
weather_file_locations

In [None]:
# determine which weather files are located in each BA_CZ
# left join: a BA-CZ polygon that contains no weather file location is kept with missing values
# (`predicate=` replaces the `op=` keyword, which geopandas deprecated in 0.10 and removed in 1.0)
ba_cz_wf_location = gpd.sjoin(ba_cz, weather_file_locations, how='left', predicate='contains')
ba_cz_wf_location

In [None]:
# For every BA-CZ polygon that contains no weather file location, fall back to the
# weather file location that is nearest to the polygon.

# NAD83 / Conus Albers: a projected CRS (metres), used only for measuring distances.
# Distances measured directly in EPSG:4326 are in degrees and can rank the candidates incorrectly.
# NOTE(review): assumes the polygons and weather stations lie in or near the contiguous US — confirm.
DISTANCE_CRS = 'EPSG:5070'

# weather file points indexed by city name, so the nearest point's label is the city itself
weather_file_points = weather_file_locations.set_index('in.weather_file_city')

# rows of the join that did not pick up any weather file
missing_wf_mask = ba_cz_wf_location['in.weather_file_city'].isna()

if missing_wf_mask.any():
    # project once, outside the loop
    projected_points = weather_file_points.geometry.to_crs(DISTANCE_CRS)
    # the polygons come from the joined frame itself (the name `missing_wf` used previously
    # was never defined, so this cell raised a NameError on a fresh kernel)
    missing_polygons = ba_cz_wf_location[missing_wf_mask].geometry.to_crs(DISTANCE_CRS)

    # city whose point is closest to each unmatched polygon, in row order
    closest_cities = [projected_points.distance(missing_polygon).idxmin() for missing_polygon in missing_polygons]

    # add the closest location to the dataframe (only the city column is filled in)
    ba_cz_wf_location.loc[missing_wf_mask, 'in.weather_file_city'] = closest_cities

ba_cz_wf_location

In [None]:
# Build the final BA / weather file crosswalk and write it to disk.
# NOTE(review): duplicates are identified by (ba_code, weather file city) only, so a city that
# is listed under two climate zones of the same BA is kept for the first one only — confirm intended.
ba_cz_wf_location = (
    ba_cz_wf_location
    .reset_index()  # bring ba_code / climate_zone back as ordinary columns
    .drop_duplicates(subset=['ba_code','in.weather_file_city'])
    .rename(columns={'in.weather_file_city':'weather_file_city'})
    [['ba_code','climate_zone','ba_name','weather_file_city']]
    .reset_index(drop=True)  # renumber the surviving rows from zero
)

# export the crosswalk
ba_cz_wf_location.to_csv('../data/processed/ba_weather_file_crosswalk.csv', index=False)