In [24]:
from pathlib import Path
import pandas as pd
import numpy as np

# Import EPA-EIA Crosswalk spreadsheet

In [25]:
# Read the EPA-provided crosswalk from the current working directory.
# header=0 discards the header row in the file so that the snake_case names
# below are used instead; index_col=False keeps the first column as data.
crosswalk_file = Path.cwd().joinpath('epa_eia_crosswalk_from_epa.csv')
df = pd.read_csv(
    crosswalk_file,
    header=0,
    index_col=False,
    names=[
        'plant_name',
        'plant_id_epa',
        'plant_id_eia',
        'unitid',
        'generator_id',
        'boiler_id',
        'fuel_type_primary',
        'prime_mover_code',
        'edat_capacity_mw',
        'heat_input_mmbtu',
        'generator_id_match_method',
        'op_status',
        'op_status_date',
    ],
)
df.head(5)

Unnamed: 0,plant_name,plant_id_epa,plant_id_eia,unitid,generator_id,boiler_id,fuel_type_primary,prime_mover_code,edat_capacity_mw,heat_input_mmbtu,generator_id_match_method,op_status,op_status_date
0,Barry,3,,1,1,1,Pipeline Natural Gas,ST,180.0,184780.5,X-walk from Travis,OPR,Feb/11/1954
1,Barry,3,,2,2,2,Pipeline Natural Gas,ST,180.0,188881.7,X-walk from Travis,OPR,Jun/18/1954
2,Barry,3,,4,4,4,Coal,ST,400.0,12242520.0,X-walk from Travis,OPR,May/28/1969
3,Barry,3,,5,5,5,Coal,ST,800.0,33224780.0,X-walk from Travis,OPR,Jul/24/1971
4,Barry,3,,6A,A1ST,6A,Pipeline Natural Gas,CT,312.0,15147520.0,X-walk from Travis,OPR,Feb/22/2000


# Fill in plant_id_eia
Some plant ids have been matched across EPA and EIA, but many of the associations are missing. We will take a layered approach to matching the missing plant ids:

1. Attempt to match based on plant name strings from EIA and EPA
2. In most cases, the EPA has already associated an EIA generator_id with each EPA unitid. In those cases, if we can find an EIA (plant_id, generator_id) pair that matches an EPA (plant_id, generator_id) pair, we will assume that the EPA and EIA plant_ids are identical.
3. For whatever is left, we will simply look for instances where an EIA plant_id matches the EPA plant_id. By comparing the plant names, we should be able to tell whether these plants are the same.


In [26]:
# Keep each row's original position in an explicit 'index' column so the
# filled-in ids can be aligned back onto df once matching is done.
df = df.reset_index()

# Extract rows where plant_id_eia is missing.
# Use isna() rather than comparing against the string 'NaN': NaN never compares
# equal to anything, so an equality test is not a reliable missing-value check.
missing_plant_eia = df[df['plant_id_eia'].isna()]
missing_plant_eia.head(5)

Unnamed: 0,index,plant_name,plant_id_epa,plant_id_eia,unitid,generator_id,boiler_id,fuel_type_primary,prime_mover_code,edat_capacity_mw,heat_input_mmbtu,generator_id_match_method,op_status,op_status_date
0,0,Barry,3,,1,1,1,Pipeline Natural Gas,ST,180.0,184780.5,X-walk from Travis,OPR,Feb/11/1954
1,1,Barry,3,,2,2,2,Pipeline Natural Gas,ST,180.0,188881.7,X-walk from Travis,OPR,Jun/18/1954
2,2,Barry,3,,4,4,4,Coal,ST,400.0,12242520.0,X-walk from Travis,OPR,May/28/1969
3,3,Barry,3,,5,5,5,Coal,ST,800.0,33224780.0,X-walk from Travis,OPR,Jul/24/1971
4,4,Barry,3,,6A,A1ST,6A,Pipeline Natural Gas,CT,312.0,15147520.0,X-walk from Travis,OPR,Feb/22/2000


In [27]:
# Count the rows that have no EIA plant id yet
missing_plant_eia['plant_id_eia'].isna().sum()

6747

### Attempt to match by plant name

In [28]:
# Location of the PUDL data package tables (under the user's home directory)
datapkg = Path.home().joinpath('Box/PUDL/datapkg/pudl-data-release/pudl-eia860-eia923-epacems/data')

# Load the EIA plant entity table and keep only the plant id and plant name.
# Disabled step, kept for reference: drop rows with mostly null values, as
# these seem to be from non-EIA-860 sources
#   plants_eia.dropna(axis='index',thresh=10,inplace=True)
plants_eia = (
    pd.read_csv(datapkg / 'plants_entity_eia.csv')
    .filter(items=['plant_id_eia', 'plant_name_eia'])
)

plants_eia.head(5)

Unnamed: 0,plant_id_eia,plant_name_eia
0,62940,Starratt Solar
1,62939,South Peak Wind
2,62938,Glen Ullin Energy Center
3,62937,Athens BESS
4,62936,Rattlesnake


In [29]:
# Two EIA plant ids carry a plant name that also appears under another id;
# remove them so that matching on plant name cannot fan out to several rows.
plants_eia = plants_eia.loc[
    ~plants_eia['plant_id_eia'].isin([
        880081,  # duplicate value for "DTE Pontiac North LLC"
        14013,   # duplicate value for "The Ohio State University"
    ])
]

In [30]:
# Left-join the EIA plant table onto the crosswalk rows using the plant name,
# so every crosswalk row is kept whether or not a name match exists.
missing_plant_eia = pd.merge(
    missing_plant_eia,
    plants_eia,
    how='left',
    left_on='plant_name',
    right_on='plant_name_eia',
)
missing_plant_eia.head(5)

Unnamed: 0,index,plant_name,plant_id_epa,plant_id_eia_x,unitid,generator_id,boiler_id,fuel_type_primary,prime_mover_code,edat_capacity_mw,heat_input_mmbtu,generator_id_match_method,op_status,op_status_date,plant_id_eia_y,plant_name_eia
0,0,Barry,3,,1,1,1,Pipeline Natural Gas,ST,180.0,184780.5,X-walk from Travis,OPR,Feb/11/1954,3.0,Barry
1,1,Barry,3,,2,2,2,Pipeline Natural Gas,ST,180.0,188881.7,X-walk from Travis,OPR,Jun/18/1954,3.0,Barry
2,2,Barry,3,,4,4,4,Coal,ST,400.0,12242520.0,X-walk from Travis,OPR,May/28/1969,3.0,Barry
3,3,Barry,3,,5,5,5,Coal,ST,800.0,33224780.0,X-walk from Travis,OPR,Jul/24/1971,3.0,Barry
4,4,Barry,3,,6A,A1ST,6A,Pipeline Natural Gas,CT,312.0,15147520.0,X-walk from Travis,OPR,Feb/22/2000,3.0,Barry


In [31]:
# Show any crosswalk rows that the merge repeated (same original 'index' value);
# an empty result means there is nothing to drop.
missing_plant_eia.loc[missing_plant_eia.duplicated(subset=["index"])]

Unnamed: 0,index,plant_name,plant_id_epa,plant_id_eia_x,unitid,generator_id,boiler_id,fuel_type_primary,prime_mover_code,edat_capacity_mw,heat_input_mmbtu,generator_id_match_method,op_status,op_status_date,plant_id_eia_y,plant_name_eia


In [32]:
# Adopt the plant ids found by the name match, discard the helper columns that
# came from the EIA table, and restore the original column name.
missing_plant_eia = (
    missing_plant_eia
    .assign(plant_id_eia_x=missing_plant_eia['plant_id_eia_y'])
    .drop(columns=['plant_id_eia_y', 'plant_name_eia'])
    .rename(columns={"plant_id_eia_x": "plant_id_eia"})
)
missing_plant_eia.head(5)

Unnamed: 0,index,plant_name,plant_id_epa,plant_id_eia,unitid,generator_id,boiler_id,fuel_type_primary,prime_mover_code,edat_capacity_mw,heat_input_mmbtu,generator_id_match_method,op_status,op_status_date
0,0,Barry,3,3.0,1,1,1,Pipeline Natural Gas,ST,180.0,184780.5,X-walk from Travis,OPR,Feb/11/1954
1,1,Barry,3,3.0,2,2,2,Pipeline Natural Gas,ST,180.0,188881.7,X-walk from Travis,OPR,Jun/18/1954
2,2,Barry,3,3.0,4,4,4,Coal,ST,400.0,12242520.0,X-walk from Travis,OPR,May/28/1969
3,3,Barry,3,3.0,5,5,5,Coal,ST,800.0,33224780.0,X-walk from Travis,OPR,Jul/24/1971
4,4,Barry,3,3.0,6A,A1ST,6A,Pipeline Natural Gas,CT,312.0,15147520.0,X-walk from Travis,OPR,Feb/22/2000


In [33]:
# Rows still lacking an EIA plant id after the name match
missing_plant_eia['plant_id_eia'].isna().sum()

3845

### Attempt to match by plant_id and generator_id

In [34]:
# Load only the plant id / generator id pairs from the EIA generator entity table
generators_eia = pd.read_csv(
    datapkg.joinpath('generators_entity_eia.csv'),
    usecols=['plant_id_eia', 'generator_id'],
)
generators_eia.head(5)

Unnamed: 0,plant_id_eia,generator_id
0,62940,PV
1,62939,41001
2,62938,39001
3,62937,BA
4,62936,RAT


In [35]:
# Where the EPA (plant id, generator id) pair also exists as an EIA
# (plant id, generator id) pair, pull in the EIA plant id for that row.
missing_plant_eia = pd.merge(
    missing_plant_eia,
    generators_eia,
    how='left',
    left_on=['plant_id_epa', 'generator_id'],
    right_on=['plant_id_eia', 'generator_id'],
)
missing_plant_eia.head(5)

Unnamed: 0,index,plant_name,plant_id_epa,plant_id_eia_x,unitid,generator_id,boiler_id,fuel_type_primary,prime_mover_code,edat_capacity_mw,heat_input_mmbtu,generator_id_match_method,op_status,op_status_date,plant_id_eia_y
0,0,Barry,3,3.0,1,1,1,Pipeline Natural Gas,ST,180.0,184780.5,X-walk from Travis,OPR,Feb/11/1954,3.0
1,1,Barry,3,3.0,2,2,2,Pipeline Natural Gas,ST,180.0,188881.7,X-walk from Travis,OPR,Jun/18/1954,3.0
2,2,Barry,3,3.0,4,4,4,Coal,ST,400.0,12242520.0,X-walk from Travis,OPR,May/28/1969,3.0
3,3,Barry,3,3.0,5,5,5,Coal,ST,800.0,33224780.0,X-walk from Travis,OPR,Jul/24/1971,3.0
4,4,Barry,3,3.0,6A,A1ST,6A,Pipeline Natural Gas,CT,312.0,15147520.0,X-walk from Travis,OPR,Feb/22/2000,3.0


In [36]:
# If a generator-level match was found, use it for rows still missing an id.
# Assign the result back to the column instead of calling fillna(inplace=True)
# on the column accessor: that chained in-place form does not update the
# parent DataFrame when pandas Copy-on-Write is active.
missing_plant_eia['plant_id_eia_x'] = missing_plant_eia['plant_id_eia_x'].fillna(
    missing_plant_eia['plant_id_eia_y']
)

#let's do some cleanup
missing_plant_eia = (
    missing_plant_eia
    .drop(columns=['plant_id_eia_y'])
    .rename(columns={'plant_id_eia_x': 'plant_id_eia'})
)

In [37]:
# Rows still lacking an EIA plant id after the generator-level match
missing_plant_eia['plant_id_eia'].isna().sum()

52

## See if there is a matching EIA plant_id

In [38]:
# Look up an EIA plant whose id equals the EPA plant id, bringing in its id
# and name alongside each crosswalk row.
missing_plant_eia = pd.merge(
    missing_plant_eia,
    plants_eia,
    how='left',
    left_on='plant_id_epa',
    right_on='plant_id_eia',
)

In [39]:
# What values are still missing?
# Raise the display limit so every remaining row is shown rather than truncated.
pd.set_option('display.max_rows', 200)
# Select with isna() rather than comparing against the string "NaN": NaN never
# compares equal to anything, so an equality test is not a reliable check.
missing_plant_eia[missing_plant_eia['plant_id_eia_x'].isna()]

Unnamed: 0,index,plant_name,plant_id_epa,plant_id_eia_x,unitid,generator_id,boiler_id,fuel_type_primary,prime_mover_code,edat_capacity_mw,heat_input_mmbtu,generator_id_match_method,op_status,op_status_date,plant_id_eia_y,plant_name_eia
2196,2234,Cumberland,3399,,A1,,,Diesel Oil,ST,349.0,33879.5,,Operating,,3399,Cumberland (TN)
2197,2235,Cumberland,3399,,B1,,,Diesel Oil,ST,349.0,33216.1,,Operating,,3399,Cumberland (TN)
3175,3233,"KapStone Charleston Kraft, LLC",7737,,B002,,,Natural Gas,ST,500.0,25583.3,,Operating,,7737,Kapstone
3176,3234,"KapStone Charleston Kraft, LLC",7737,,B003,,,Natural Gas,ST,500.0,40862.6,,Operating,,7737,Kapstone
3177,3235,"KapStone Charleston Kraft, LLC",7737,,B004,,,Natural Gas,ST,500.0,89275.8,,Operating,,7737,Kapstone
3773,3845,"Gilroy Energy Center, LLC for King City",10294,,2,,,Pipeline Natural Gas,GT,47.0,52704.141,,Operating,,10294,King City Power Plant
3812,3884,Coastal Carolina Clean Power LLC,10381,,BLR01A,Plant not in EIA,,Wood,ST,160.0,,Plant not in EIA,Operating,,10381,Coastal Carolina Clean Power
3813,3885,Coastal Carolina Clean Power LLC,10381,,BLR01B,Plant not in EIA,,Wood,ST,160.0,,Plant not in EIA,Operating,,10381,Coastal Carolina Clean Power
3847,3919,ArcelorMittal USA - Indiana Harbor East,10474,,211,,,Process Gas,ST,200.0,,,Operating,,10474,4 AC Station
3848,3920,ArcelorMittal USA - Indiana Harbor East,10474,,212,,,Process Gas,ST,200.0,,,Operating,,10474,4 AC Station


In [40]:
# If a plant-id match was found, use it for rows still missing an id.
# Assign the result back to the column instead of calling fillna(inplace=True)
# on the column accessor: that chained in-place form does not update the
# parent DataFrame when pandas Copy-on-Write is active.
missing_plant_eia['plant_id_eia_x'] = missing_plant_eia['plant_id_eia_x'].fillna(
    missing_plant_eia['plant_id_eia_y']
)

#let's do some cleanup
missing_plant_eia = (
    missing_plant_eia
    .drop(columns=['plant_id_eia_y', 'plant_name_eia'])
    .rename(columns={'plant_id_eia_x': 'plant_id_eia'})
)

#how many are missing now?
missing_plant_eia['plant_id_eia'].isna().sum()

0

## Replace missing values in the original dataframe

In [41]:
# Use the saved original row labels as the index again (the 'index' column is
# consumed in the process) so rows line up with df.
missing_plant_eia = missing_plant_eia.set_index('index')
missing_plant_eia.head(5)

Unnamed: 0_level_0,plant_name,plant_id_epa,plant_id_eia,unitid,generator_id,boiler_id,fuel_type_primary,prime_mover_code,edat_capacity_mw,heat_input_mmbtu,generator_id_match_method,op_status,op_status_date
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,Barry,3,3.0,1,1,1,Pipeline Natural Gas,ST,180.0,184780.5,X-walk from Travis,OPR,Feb/11/1954
1,Barry,3,3.0,2,2,2,Pipeline Natural Gas,ST,180.0,188881.7,X-walk from Travis,OPR,Jun/18/1954
2,Barry,3,3.0,4,4,4,Coal,ST,400.0,12242520.0,X-walk from Travis,OPR,May/28/1969
3,Barry,3,3.0,5,5,5,Coal,ST,800.0,33224780.0,X-walk from Travis,OPR,Jul/24/1971
4,Barry,3,3.0,6A,A1ST,6A,Pipeline Natural Gas,CT,312.0,15147520.0,X-walk from Travis,OPR,Feb/22/2000


In [42]:
# Overlay the filled-in rows onto the full crosswalk; update() aligns on the
# index and modifies df in place.
df.update(missing_plant_eia)
# The helper 'index' column is no longer needed once the rows are aligned.
df = df.drop(columns=["index"])
# Write the completed crosswalk without the row index.
df.to_csv('eia_epa_id_crosswalk.csv', index=False)

# Standardize fuel_type_primary column (TODO)
Change fuel type description to the standard EIA fuel type code

## 1) Pull in data from EIA

### Replace unambiguous fuel type descriptions with EIA code

In [10]:
# Distinct fuel descriptions present in the crosswalk
df['fuel_type_primary'].unique()

array(['Pipeline Natural Gas', 'Coal', 'Natural Gas', nan, 'Other Oil',
       'Residual Oil', 'Diesel Oil', 'Wood', 'Other Gas', 'Process Gas',
       'Petroleum Coke', 'Coal Refuse', 'Other Solid Fuel',
       'Tire Derived Fuel'], dtype=object)

In [11]:
# Map EPA fuel descriptions to fuel type codes. Only the unambiguous
# descriptions are mapped; values absent from the dict are left unchanged.
fuel_type_dict = {'Pipeline Natural Gas':'NG', 
                  #'Coal' is ambiguous
                  'Natural Gas':'NG', 
                  #'Other Oil' is ambiguous
                  'Residual Oil':'RFO', 
                  'Diesel Oil':'DFO', 
                  'Wood':'WDS', 
                  #'Other Gas' is ambiguous 
                  #'Process Gas' not sure what this refers to
                  'Petroleum Coke':'PC', 
                  'Coal Refuse':'WC', 
                  #'Other Solid Fuel' is ambiguous
                  'Tire Derived Fuel':'TDF'}

# Create the new fuel type code column directly from the replaced values.
# Calling replace(inplace=True) on df.fuel_type_code is a chained in-place
# operation that does not update df when pandas Copy-on-Write is active.
df['fuel_type_code'] = df['fuel_type_primary'].replace(fuel_type_dict)
df.head(5)

Unnamed: 0,plant_name,plant_id_epa,plant_id_eia,unitid,generator_id,boiler_id,fuel_type_primary,prime_mover_code,edat_capacity_mw,heat_input_mmbtu,generator_id_match_method,op_status,op_status_date
0,Barry,3,,1,1,1,NG,ST,180.0,184780.5,X-walk from Travis,OPR,Feb/11/1954
1,Barry,3,,2,2,2,NG,ST,180.0,188881.7,X-walk from Travis,OPR,Jun/18/1954
2,Barry,3,,4,4,4,Coal,ST,400.0,12242520.0,X-walk from Travis,OPR,May/28/1969
3,Barry,3,,5,5,5,Coal,ST,800.0,33224780.0,X-walk from Travis,OPR,Jul/24/1971
4,Barry,3,,6A,A1ST,6A,NG,CT,312.0,15147520.0,X-walk from Travis,OPR,Feb/22/2000


# Fill Missing Boiler-Generator Associations (TODO)
- There are currently 71 EPA units that are not matched to EIA generators
- Most boiler associations are missing
