# eGRID Annual Database

NOTE: Query numbers mentioned throughout the code refer to SQL queries in the MS Access Database version of eGRID on which this code is based

In [1]:
# Standard libraries
import logging
import sys
import os
import pathlib

# 3rd party libraries
import numpy as np
import pandas as pd
import sqlalchemy as sa
import importlib
import calendar

# Local libraries
import pudl

## Notebook Parameters

In [2]:
EGRID_YEAR = 2018

# Establish connection to pudl database
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_engine = sa.create_engine(pudl_settings['pudl_db'])

# set up access to output tables at different frequencies
# list of frequency aliases: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
pudl_out_annual = pudl.output.pudltabl.PudlTabl(pudl_engine, freq='AS', start_date=f'{EGRID_YEAR}-01-01', end_date=f'{EGRID_YEAR}-12-31') #annual frequency
pudl_out_monthly = pudl.output.pudltabl.PudlTabl(pudl_engine, freq='MS', start_date=f'{EGRID_YEAR}-01-01', end_date=f'{EGRID_YEAR}-12-31') #monthly frequency

datapkg_dir = pudl_settings['datapkg_dir'] + '/to_parquet/epacems-eia/data/'

In [3]:
# test accessing db
eia_860 = pudl_out_annual.bf_eia923()
eia_860.columns

Index(['report_date', 'plant_id_eia', 'plant_id_pudl', 'plant_name_eia',
       'utility_id_eia', 'utility_id_pudl', 'utility_name_eia', 'boiler_id',
       'ash_content_pct', 'fuel_consumed_units', 'fuel_mmbtu_per_unit',
       'fuel_type_code_pudl', 'sulfur_content_pct',
       'total_heat_content_mmbtu'],
      dtype='object')

In [4]:
# examples
# access datapkg
#pd.read_csv(datapkg_dir + 'generators_eia860.csv', parse_dates=['report_date']).query('report_date.dt.year == @EGRID_YEAR')

#access db
pudl_out_monthly.gen_eia923()

Unnamed: 0,report_date,plant_id_eia,plant_id_pudl,plant_name_eia,utility_id_eia,utility_id_pudl,utility_name_eia,generator_id,net_generation_mwh
0,2018-01-01,3,32,Barry,195,18,Alabama Power Co,1,10738.0
1,2018-02-01,3,32,Barry,195,18,Alabama Power Co,1,-348.0
2,2018-03-01,3,32,Barry,195,18,Alabama Power Co,1,-414.0
3,2018-04-01,3,32,Barry,195,18,Alabama Power Co,1,-411.0
4,2018-05-01,3,32,Barry,195,18,Alabama Power Co,1,
...,...,...,...,...,...,...,...,...,...
47875,2018-08-01,62319,13272,Western Sugar Cooperative - Billings,61819,,Western Sugar Cooperative - Billings,1,
47876,2018-09-01,62319,13272,Western Sugar Cooperative - Billings,61819,,Western Sugar Cooperative - Billings,1,
47877,2018-10-01,62319,13272,Western Sugar Cooperative - Billings,61819,,Western Sugar Cooperative - Billings,1,
47878,2018-11-01,62319,13272,Western Sugar Cooperative - Billings,61819,,Western Sugar Cooperative - Billings,1,


# Generator (GEN) File
This file includes generation from steam boilers and nuclear units in the EIA-923, plant prime movers in the EIA-923 that have only one generator in the EIA-860, and the EIA-923 plant prime movers where generation is distributed to the generator level based on nameplate capacity.


### 1. Create EIA-860 Generator Combined table
This table includes operable, proposed, and retired units  
(Queries 1g01, 1g02, 1g03, 1g04, 1g05)

In [5]:
# Load data from EIA-860 Generator Data Package
eia_860_gen_columns = ['plant_id_eia', 'generator_id','plant_name_eia','state', 'operational_status_code','prime_mover_code', 'energy_source_code_1', 'capacity_mw', 'planned_retirement_date', 'retirement_date', 'report_date']
#gen_file = pd.read_csv(datapkg_dir + 'generators_eia860.csv', parse_dates=['report_date', 'retirement_date', 'planned_retirement_date'], usecols=eia_860_gen_columns).query('report_date.dt.year == @EGRID_YEAR')
gen_file = pudl_out_annual.gens_eia860()[eia_860_gen_columns]

# parse datetime columns
gen_file[['planned_retirement_date', 'retirement_date']] = gen_file[['planned_retirement_date', 'retirement_date']].apply(pd.to_datetime)

# Merge Prime Mover data
#gen_file = gen_file.merge(pd.read_csv(datapkg_dir + 'generators_entity_eia.csv', usecols=['plant_id_eia','generator_id','prime_mover_code']), how='left', on=['plant_id_eia','generator_id'])

# Merge State and Plant Name
#gen_file = gen_file.merge(pd.read_csv(datapkg_dir + 'plants_entity_eia.csv', usecols=['plant_id_eia','plant_name_eia', 'state']), how='left', on=['plant_id_eia'])

# Add new columns
gen_file['sequence_number'] = np.NaN
gen_file['CFACT'] = np.NaN
gen_file['NUMBLR'] = 0
gen_file['NETGEN'] = np.NaN
gen_file['NETGENOZ'] = np.NaN
gen_file['data_source'] = ''

# combine planned_retirement_year and retirement_year columns
gen_file['retirement_date'] = gen_file['retirement_date'].fillna(gen_file['planned_retirement_date'])
#convert this column to a year instead of a date
gen_file['retirement_year'] = gen_file['retirement_date'].dt.year
#drop the old columns
gen_file = gen_file.drop(columns=['planned_retirement_date','retirement_date'])

# drop plants not connected to grid
# NOTE: in 1g04, only Plant 10788 is dropped
non_grid_connected_plant_ids = list(pd.read_csv(importlib.resources.open_text(
        'pudl.package_data.epa.egrid', 'table_4-2_plants_not_connected_to_grid.csv'),
        usecols=['Plant ID'])['Plant ID'])
gen_file = gen_file[~gen_file['plant_id_eia'].isin(non_grid_connected_plant_ids)]

gen_file = gen_file.set_index(['plant_id_eia', 'generator_id'])

gen_file.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,plant_name_eia,state,operational_status_code,prime_mover_code,energy_source_code_1,capacity_mw,report_date,sequence_number,CFACT,NUMBLR,NETGEN,NETGENOZ,data_source,retirement_year
plant_id_eia,generator_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2,1,Bankhead Dam,AL,OP,HY,WAT,53.9,2018-01-01,,,0,,,,
3,1,Barry,AL,OP,ST,NG,153.1,2018-01-01,,,0,,,,
3,2,Barry,AL,OP,ST,NG,153.1,2018-01-01,,,0,,,,


### 2. Count number of boilers per generator
(Queries 1g07, 1g08, 1g09)

In [6]:
# number_of_boilers = pd.read_csv(datapkg_dir + 'boiler_generator_assn_eia860.csv', parse_dates=['report_date'], usecols=['plant_id_eia','report_date','generator_id','boiler_id']).query('report_date.dt.year == @EGRID_YEAR').drop(columns='report_date')
number_of_boilers = pudl_out_annual.bga_eia860()[['plant_id_eia','generator_id','boiler_id']]

# count the number of boilers per generator
number_of_boilers = number_of_boilers.groupby(['plant_id_eia','generator_id']).count().rename(columns={'boiler_id':'NUMBLR'})

# merge this data into gen_file
gen_file.update(number_of_boilers)

gen_file.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,plant_name_eia,state,operational_status_code,prime_mover_code,energy_source_code_1,capacity_mw,report_date,sequence_number,CFACT,NUMBLR,NETGEN,NETGENOZ,data_source,retirement_year
plant_id_eia,generator_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2,1,Bankhead Dam,AL,OP,HY,WAT,53.9,2018-01-01,,,0.0,,,,
3,1,Barry,AL,OP,ST,NG,153.1,2018-01-01,,,1.0,,,,


### 3. Update Net Generation Data
(Queries 1g10, 1g11)  
NOTE: 1g11 not necessary because generator ids between EIA-860 and EIA-923 have already been standardized in the PUDL data package

In [7]:
#eia_923_generator = pd.read_csv(datapkg_dir + 'generation_eia923.csv', parse_dates=['report_date']).query('report_date.dt.year == @EGRID_YEAR')
eia_923_generator = pudl_out_monthly.gen_eia923()[['report_date','plant_id_eia','generator_id','net_generation_mwh']]

# sum annual net generation
eia_923_generator_NETGEN = eia_923_generator.drop(columns='report_date').groupby(['plant_id_eia','generator_id']).sum().rename(columns={'net_generation_mwh':'NETGEN'})
eia_923_generator_NETGEN['data_source'] = 'EIA-923 Generator File'

# calculate ozone season net generation, which includes months May - September
eia_923_generator_NETGENOZ = eia_923_generator[(eia_923_generator['report_date'].dt.month >= 5) & (eia_923_generator['report_date'].dt.month <= 9)].groupby(['plant_id_eia','generator_id']).sum().rename(columns={'net_generation_mwh':'NETGENOZ'})
eia_923_generator_NETGENOZ['data_source'] = 'EIA-923 Generator File'

# merge this data into gen_file
gen_file.update(eia_923_generator_NETGEN)
gen_file.update(eia_923_generator_NETGENOZ)

gen_file.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,plant_name_eia,state,operational_status_code,prime_mover_code,energy_source_code_1,capacity_mw,report_date,sequence_number,CFACT,NUMBLR,NETGEN,NETGENOZ,data_source,retirement_year
plant_id_eia,generator_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2,1,Bankhead Dam,AL,OP,HY,WAT,53.9,2018-01-01,,,0.0,,,,
3,1,Barry,AL,OP,ST,NG,153.1,2018-01-01,,,1.0,23392.0,10444.0,EIA-923 Generator File,


### 4. Distribute generation
(Queries 1g12, 1g13, 1g14, 1g15, 1g16, 1g18, 1g19)

In [8]:
# Sum net generation by prime mover (1g12)
net_gen_by_PM = gen_file.reset_index()[['plant_id_eia','prime_mover_code', 'NETGEN', 'NETGENOZ']].groupby(['plant_id_eia','prime_mover_code']).sum()

# Sum EIA-923 Generation and Fuel by prime mover (1g13)
# Load EIA-923 Generation and Fuel data
#eia_923_gen_fuel = pd.read_csv(datapkg_dir + 'generation_fuel_eia923.csv', parse_dates=['report_date'], usecols=['plant_id_eia','report_date','prime_mover_code','net_generation_mwh']).query('report_date.dt.year == @EGRID_YEAR')
eia_923_gen_fuel = pudl_out_monthly.gf_eia923()[['report_date','plant_id_eia','prime_mover_code','net_generation_mwh']]

# Sum annual and ozone season net generation by plant and prime mover
eia_923_gen_fuel_NETGEN = eia_923_gen_fuel.drop(columns='report_date').groupby(['plant_id_eia','prime_mover_code']).sum().rename(columns={'net_generation_mwh':'NETGEN'}).reset_index()
eia_923_gen_fuel_NETGENOZ = eia_923_gen_fuel[(eia_923_gen_fuel['report_date'].dt.month >= 5) & (eia_923_gen_fuel['report_date'].dt.month <= 9)].groupby(['plant_id_eia','prime_mover_code']).sum().rename(columns={'net_generation_mwh':'NETGENOZ'}).reset_index()

# Calculate the difference between PM net generation from gen_file and EIA-923 Generation and Fuel (1g14)
#merge the net gen by PM data from the gen file and the EIA-923 generation and fuel file together
netgen_diff_by_PM = net_gen_by_PM.merge(eia_923_gen_fuel_NETGEN, how='left', on=['plant_id_eia','prime_mover_code'], suffixes=('_gen','_923gf'))
netgen_diff_by_PM = netgen_diff_by_PM.merge(eia_923_gen_fuel_NETGENOZ, how='left', on=['plant_id_eia','prime_mover_code'], suffixes=('_gen','_923gf'))

# find the netgen_diff_by_PM between the net generation from the two sources
netgen_diff_by_PM['netgen_diff_by_PM'] = netgen_diff_by_PM['NETGEN_923gf'].round(decimals=0) - netgen_diff_by_PM['NETGEN_gen'].fillna(0).round(decimals=0)
netgen_diff_by_PM['netgen_diff_by_PM_oz'] = netgen_diff_by_PM['NETGENOZ_923gf'].round(decimals=0) - netgen_diff_by_PM['NETGENOZ_gen'].fillna(0).round(decimals=0)

netgen_diff_by_PM.head(3)

Unnamed: 0,plant_id_eia,prime_mover_code,NETGEN_gen,NETGENOZ_gen,NETGEN_923gf,NETGENOZ_923gf,netgen_diff_by_PM,netgen_diff_by_PM_oz
0,2,HY,0.0,0.0,-392.0,-166.577,-392.0,-167.0
1,3,CA,2345699.0,862291.0,2345699.0,862291.0,0.0,0.0
2,3,CT,4341821.0,1594449.0,4341821.0,1594449.0,0.0,0.0


In [9]:
# get units in gen_file without generation (1g15)
np_capacity = gen_file[['prime_mover_code', 'NETGEN', 'NETGENOZ', 'operational_status_code', 'data_source', 'capacity_mw', 'retirement_year']]
# only keep generators that are missing data and are either in operation or are retired in 2018
np_capacity = np_capacity[
    (np_capacity['data_source'] == '') & 
    ((np_capacity['operational_status_code'].isin(["OP","SB","OS","OA","IP","TS","U","V"])) | 
    ((np_capacity['operational_status_code'] == 'RE') & (np_capacity['retirement_year'] == EGRID_YEAR)))]

"""
# group these generators by Prime Mover (1g16)
np_capacity_by_PM = np_capacity.reset_index().drop(columns=['NETGEN','NETGENOZ']).groupby(['plant_id_eia','prime_mover_code','operational_status_code','retirement_year'], dropna=False).sum()
"""

# group these generators by plant (1g17)
np_capacity_by_plant = np_capacity.reset_index().drop(columns=['NETGEN','NETGENOZ','retirement_year']).groupby(['plant_id_eia','prime_mover_code']).sum().rename(columns={'capacity_mw':'capacity_mw_by_plant'})

# calculate nameplate capacity ratio (1g18)
np_capacity = np_capacity.reset_index().merge(np_capacity_by_plant.reset_index(), how='left', on=['plant_id_eia','prime_mover_code'])
np_capacity['capacity_ratio'] = np_capacity['capacity_mw'] / np_capacity['capacity_mw_by_plant']
np_capacity = np_capacity.sort_values(by=['plant_id_eia','generator_id'])

np_capacity.head(5)

Unnamed: 0,plant_id_eia,generator_id,prime_mover_code,NETGEN,NETGENOZ,operational_status_code,data_source,capacity_mw,retirement_year,capacity_mw_by_plant,capacity_ratio
0,2,1,HY,,,OP,,53.9,,53.9,1.0
1,4,1,HY,,,OP,,75.0,,225.0,0.333333
2,4,2,HY,,,OP,,75.0,,225.0,0.333333
3,4,3,HY,,,OP,,75.0,,225.0,0.333333
4,9,1,GT,,,OP,,86.9,,86.9,1.0


In [10]:
# Distribute generation by prime mover where missing (1g19)
generation_and_fuel_to_distribute = np_capacity.merge(netgen_diff_by_PM[['plant_id_eia','prime_mover_code','netgen_diff_by_PM','netgen_diff_by_PM_oz']], how='inner', on=['plant_id_eia','prime_mover_code']).drop(columns=['operational_status_code','capacity_mw','retirement_year','capacity_mw_by_plant'])

generation_and_fuel_to_distribute['NETGEN_to_use'] = generation_and_fuel_to_distribute['netgen_diff_by_PM'] * generation_and_fuel_to_distribute['capacity_ratio']
generation_and_fuel_to_distribute['NETGENOZ_to_use'] = generation_and_fuel_to_distribute['netgen_diff_by_PM_oz'] * generation_and_fuel_to_distribute['capacity_ratio']

generation_and_fuel_to_distribute['data_source'] = 'Distributed from 923 Generation And Fuel'

generation_and_fuel_to_distribute.head(2)

Unnamed: 0,plant_id_eia,generator_id,prime_mover_code,NETGEN,NETGENOZ,data_source,capacity_ratio,netgen_diff_by_PM,netgen_diff_by_PM_oz,NETGEN_to_use,NETGENOZ_to_use
0,2,1,HY,,,Distributed from 923 Generation And Fuel,1.0,-392.0,-167.0,-392.0,-167.0
1,4,1,HY,,,Distributed from 923 Generation And Fuel,0.333333,687921.0,292328.0,229307.0,97442.666667


In [11]:
# Update gen file (1g20 & 1g21)
gen_file = gen_file.reset_index().set_index(['plant_id_eia','generator_id','prime_mover_code'])
gen_file.update(generation_and_fuel_to_distribute.set_index(['plant_id_eia','generator_id','prime_mover_code'])[['NETGEN_to_use','NETGENOZ_to_use','data_source']].rename(columns={'NETGEN_to_use':'NETGEN','NETGENOZ_to_use':'NETGENOZ'}), overwrite=False)
gen_file

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,plant_name_eia,state,operational_status_code,energy_source_code_1,capacity_mw,report_date,sequence_number,CFACT,NUMBLR,NETGEN,NETGENOZ,data_source,retirement_year
plant_id_eia,generator_id,prime_mover_code,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2,1,HY,Bankhead Dam,AL,OP,WAT,53.9,2018-01-01,,,0.0,-392.0,-167.0,,
3,1,ST,Barry,AL,OP,NG,153.1,2018-01-01,,,1.0,23392.0,10444.0,EIA-923 Generator File,
3,2,ST,Barry,AL,OP,NG,153.1,2018-01-01,,,1.0,23814.0,10362.0,EIA-923 Generator File,
3,3,ST,Barry,AL,RE,BIT,272.0,2018-01-01,,,0.0,,,,2015.0
3,4,ST,Barry,AL,OP,BIT,403.7,2018-01-01,,,1.0,827718.0,385060.0,EIA-923 Generator File,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62936,RAT,WT,Rattlesnake,WA,L,WND,144.0,2018-01-01,,,0.0,,,,
62937,BA,BA,Athens BESS,MN,OP,MWH,6.0,2018-01-01,,,0.0,,,,
62938,39001,WT,Glen Ullin Energy Center,ND,T,WND,106.7,2018-01-01,,,0.0,,,,
62939,41001,WT,South Peak Wind,MT,T,WND,80.0,2018-01-01,,,0.0,,,,


### 5. Add Capacity Factors
(Query 1g21)

In [12]:
# get the number of hours in the year, accounting for leap years
if calendar.isleap(EGRID_YEAR) == True:
    hours_in_year = 8784
else:
    hours_in_year = 8760

gen_file['CFACT'] = gen_file['NETGEN'] / (gen_file['capacity_mw'] * hours_in_year)

### 6. Update Fuel type of "Other Gas" (OG)
(Query 1g22)  
NOTE: This loads data in a static table - not sure what the source of these data are

In [13]:
updated_fuel_type_codes = pd.read_csv(importlib.resources.open_text(
        'pudl.package_data.epa.egrid', 'updated_fuel_type_codes.csv'),
        usecols=['plant_id_eia','generator_id','updated_fuel_type_code'], index_col=['plant_id_eia','generator_id']).rename(columns={'updated_fuel_type_code':'energy_source_code_1'})

gen_file.update(updated_fuel_type_codes)

### 7. Update net generation data from Ventyx data
(Query 1g23)  
NOTE: The Ventyx data may be confidential so for now I am not including it in this code. This step only updates the net generation total for a single generator (plant_id 58478, generator_id CC01), which does not seem to exist in the gen_file

In [14]:
gen_file[gen_file.index.get_level_values(0) == 58478]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,plant_name_eia,state,operational_status_code,energy_source_code_1,capacity_mw,report_date,sequence_number,CFACT,NUMBLR,NETGEN,NETGENOZ,data_source,retirement_year
plant_id_eia,generator_id,prime_mover_code,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
58478,LEPA1,CT,LEPA Unit No. 1,LA,OP,NG,57.0,2018-01-01,,0.465189,0.0,232278.0,98245.0,EIA-923 Generator File,
58478,LEPA2,CA,LEPA Unit No. 1,LA,OP,NG,17.1,2018-01-01,,0.462462,0.0,69275.0,29300.0,EIA-923 Generator File,


### 8. Distribute Generation, Part 2 (can this be part of step 4?)
(Queries 1g24, 1g25, 1g26, 1g27, 1g28, 1g29)

In [15]:
# sum generation by prime mover (1g24)
netgen_by_PM_from_gen_file = gen_file.reset_index()[['plant_id_eia','prime_mover_code','NETGEN','NETGENOZ']].groupby(['plant_id_eia','prime_mover_code']).sum()

# sum EIA-923 gen and fuel data by prime mover (1g25)
# NOTE: we performed this step already, so we will reuse these dfs
netgen_by_PM_from_gf_eia923 = eia_923_gen_fuel_NETGEN.merge(eia_923_gen_fuel_NETGENOZ, how='outer', on=['plant_id_eia','prime_mover_code'])

# compare net generation from the gen_file and eia_923 (1g26)
# only keep values if the percent difference is greater than +/-0.001
compare_net_generation = netgen_by_PM_from_gen_file.merge(netgen_by_PM_from_gf_eia923, how='inner', on=['plant_id_eia','prime_mover_code'], suffixes=("_GenFile","_923"))
compare_net_generation['AbsDiff'] = compare_net_generation['NETGEN_923'] - compare_net_generation['NETGEN_GenFile'].fillna(0)
compare_net_generation['PctDiff'] = (compare_net_generation['AbsDiff'] / compare_net_generation['NETGEN_923'])
compare_net_generation = compare_net_generation[abs(compare_net_generation['PctDiff']) > 0.001].sort_values(by='AbsDiff')
compare_net_generation

Unnamed: 0,plant_id_eia,prime_mover_code,NETGEN_GenFile,NETGENOZ_GenFile,NETGEN_923,NETGENOZ_923,AbsDiff,PctDiff
2650,7378,IC,0.0,0.0,-44.000,12.000,-44.000,1.000000
4073,52149,IC,23.0,8.0,22.540,7.692,-0.460,-0.020408
3595,50240,IC,72.0,27.0,71.540,27.440,-0.460,-0.006430
1081,1875,IC,23.0,9.0,22.540,9.109,-0.460,-0.020408
7057,58265,IC,71.0,40.0,70.560,40.180,-0.440,-0.006236
...,...,...,...,...,...,...,...,...
6027,57074,ST,11334.0,8241.0,242476.000,132637.000,231142.000,0.953257
1619,2951,ST,392500.0,301576.0,632798.000,484127.000,240298.000,0.379739
6026,57073,ST,13045.0,9421.0,277054.000,150030.000,264009.000,0.952915
6028,57075,ST,9988.0,5439.0,276326.000,142985.000,266338.000,0.963854


In [16]:
# calculate the nameplate capacity by prime mover (1g27)

np_capacity = gen_file.reset_index()[['plant_id_eia','prime_mover_code', 'operational_status_code', 'capacity_mw', 'retirement_year']]
# only keep generators that are missing data and are either in operation or are retired in 2018
np_capacity = np_capacity[
    ((np_capacity['operational_status_code'].isin(["OP","SB","OS","OA","IP","TS","U","V"])) | 
    ((np_capacity['operational_status_code'] == 'RE') & (np_capacity['retirement_year'] == EGRID_YEAR)))]
np_capacity_by_PM = np_capacity.reset_index().drop(columns=['index','operational_status_code','retirement_year']).groupby(['plant_id_eia','prime_mover_code'], dropna=False).sum()

# calculate nameplate capacity ratio (1g28)
np_capacity_ratio = gen_file.reset_index()[['plant_id_eia','prime_mover_code','generator_id','capacity_mw']].merge(np_capacity_by_PM.reset_index(), how='inner', on=['plant_id_eia','prime_mover_code'], suffixes=("","_sum_by_PM"))
np_capacity_ratio['ratio'] = np_capacity_ratio['capacity_mw'] / np_capacity_ratio['capacity_mw_sum_by_PM']
np_capacity_ratio = np_capacity_ratio.sort_values(by=['plant_id_eia','generator_id'])

np_capacity_ratio

Unnamed: 0,plant_id_eia,prime_mover_code,generator_id,capacity_mw,capacity_mw_sum_by_PM,ratio
0,2,HY,1,53.9,53.9,1.000000
1,3,ST,1,153.1,1498.7,0.102155
2,3,ST,2,153.1,1498.7,0.102155
3,3,ST,3,272.0,1498.7,0.181491
4,3,ST,4,403.7,1498.7,0.269367
...,...,...,...,...,...,...
24853,62920,IC,CG-4,0.3,3.6,0.083333
24854,62931,PV,PRATT,6.0,6.0,1.000000
24855,62934,PV,GEN1,190.0,190.0,1.000000
24856,62937,BA,BA,6.0,6.0,1.000000


In [17]:
# redistribute generation (1g29)
# merge data from previous steps together
gf_to_distribute_2 = np_capacity_ratio.merge(netgen_by_PM_from_gf_eia923, how='inner', on=['plant_id_eia','prime_mover_code'])
gf_to_distribute_2 = gf_to_distribute_2.merge(compare_net_generation, how='inner', on=['plant_id_eia','prime_mover_code'])
gf_to_distribute_2 = gf_to_distribute_2[['plant_id_eia','generator_id','prime_mover_code','ratio','NETGEN','NETGENOZ']]
#calculate how much generation to distribute based on ratio
gf_to_distribute_2['NETGEN'] = (gf_to_distribute_2['ratio'] * gf_to_distribute_2['NETGEN']).round(3)
gf_to_distribute_2['NETGENOZ'] = (gf_to_distribute_2['ratio'] * gf_to_distribute_2['NETGENOZ']).round(3) 

#update data_source
gf_to_distribute_2['data_source'] = 'Data from EIA-923 Generator File overwritten with distributed data from EIA-923 Generation and Fuel'

#prepare for updating
gf_to_distribute_2 = gf_to_distribute_2.drop(columns=['prime_mover_code','ratio'])
gf_to_distribute_2 = gf_to_distribute_2.set_index(['plant_id_eia','generator_id'])

# Update gen_file (1g30)
gen_file.update(gf_to_distribute_2)
gen_file

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,plant_name_eia,state,operational_status_code,energy_source_code_1,capacity_mw,report_date,sequence_number,CFACT,NUMBLR,NETGEN,NETGENOZ,data_source,retirement_year
plant_id_eia,generator_id,prime_mover_code,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2,1,HY,Bankhead Dam,AL,OP,WAT,53.9,2018-01-01,,-0.000830,0.0,-392.0,-167.0,,
3,1,ST,Barry,AL,OP,NG,153.1,2018-01-01,,0.017442,1.0,23392.0,10444.0,EIA-923 Generator File,
3,2,ST,Barry,AL,OP,NG,153.1,2018-01-01,,0.017756,1.0,23814.0,10362.0,EIA-923 Generator File,
3,3,ST,Barry,AL,RE,BIT,272.0,2018-01-01,,,0.0,,,,2015.0
3,4,ST,Barry,AL,OP,BIT,403.7,2018-01-01,,0.234056,1.0,827718.0,385060.0,EIA-923 Generator File,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62936,RAT,WT,Rattlesnake,WA,L,WND,144.0,2018-01-01,,,0.0,,,,
62937,BA,BA,Athens BESS,MN,OP,MWH,6.0,2018-01-01,,,0.0,,,,
62938,39001,WT,Glen Ullin Energy Center,ND,T,WND,106.7,2018-01-01,,,0.0,,,,
62939,41001,WT,South Peak Wind,MT,T,WND,80.0,2018-01-01,,,0.0,,,,


### 9. Distribute Generation, Part 3
It seems like some generators reported all of their generation for the year in December. This step does what?

In [32]:
# 1g31 find where the generation from december equals the annual generation
gens_where_dec_eq_annual = eia_923_generator_NETGEN.merge(eia_923_generator[eia_923_generator.report_date.dt.month == 12].set_index(['plant_id_eia','generator_id']), how='left', left_index=True, right_index=True)
# NETGEN is the annual total, and net_generation_mwh is the monthly data from december
gens_where_dec_eq_annual = gens_where_dec_eq_annual[(gens_where_dec_eq_annual.NETGEN == gens_where_dec_eq_annual.net_generation_mwh) & (gens_where_dec_eq_annual.NETGEN != 0)]
gens_where_dec_eq_annual = gens_where_dec_eq_annual.rename(columns={'NETGEN':'net_generation_annual','net_generation_mwh':'net_generation_december'})
gens_where_dec_eq_annual = gens_where_dec_eq_annual[['net_generation_annual','net_generation_december']]
gens_where_dec_eq_annual

Unnamed: 0_level_0,Unnamed: 1_level_0,net_generation_annual,net_generation_december
plant_id_eia,generator_id,Unnamed: 2_level_1,Unnamed: 3_level_1
1271,6,126.00,126.00
1702,1B,100457.00,100457.00
10061,GEN1,59001.00,59001.00
10149,GEN1,45142.00,45142.00
10301,GEN2,445.00,445.00
...,...,...,...
60464,STG1,41650.00,41650.00
61761,GEN-1,27929.00,27929.00
61838,ST1,769.21,769.21
61838,ST2,2128.18,2128.18


In [52]:
# distribute the ozone season net generation for generators when all reported net generation was in december 1g32
gen_to_distribute_3 = gens_where_dec_eq_annual.merge(np_capacity_ratio.set_index(['plant_id_eia','generator_id'])[['prime_mover_code','ratio']], how='inner', left_index=True, right_index=True).reset_index().set_index(['plant_id_eia','generator_id','prime_mover_code'])

gen_to_distribute_3 = gen_to_distribute_3.merge(netgen_by_PM_from_gf_eia923.set_index(['plant_id_eia','prime_mover_code']), how='inner', left_index=True, right_index=True)

gen_to_distribute_3['NETGENOZ_update'] = gen_to_distribute_3['ratio'] * gen_to_distribute_3['NETGENOZ']

gen_to_distribute_3

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,net_generation_annual,net_generation_december,ratio,NETGEN,NETGENOZ,NETGENOZ_update
plant_id_eia,prime_mover_code,generator_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1271,ST,6,126.00,126.00,0.318569,19014.000,13964.000,4448.497445
1702,ST,1B,100457.00,100457.00,0.069876,2399523.021,1400129.007,97835.659946
10061,CT,GEN1,59001.00,59001.00,1.000000,59001.000,29530.805,29530.805000
10149,ST,GEN1,45142.00,45142.00,1.000000,45142.000,19323.334,19323.334000
10301,ST,GEN2,445.00,445.00,0.373134,16094.000,7555.714,2819.296269
...,...,...,...,...,...,...,...,...
60464,CT,CTG2,72698.00,72698.00,0.500000,104255.000,0.000,0.000000
61761,ST,GEN-1,27929.00,27929.00,1.000000,27929.000,12181.451,12181.451000
61838,ST,ST1,769.21,769.21,0.500000,2897.390,1238.663,619.331500
61838,ST,ST2,2128.18,2128.18,0.500000,2897.390,1238.663,619.331500


In [42]:
gen_file[gen_file['NETGENOZ'] == 0]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,plant_name_eia,state,operational_status_code,energy_source_code_1,capacity_mw,report_date,sequence_number,CFACT,NUMBLR,NETGEN,NETGENOZ,data_source,retirement_year
plant_id_eia,generator_id,prime_mover_code,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
63,IC1,IC,Gold Creek,AK,SB,DFO,1.2,2018-01-01,,0.000099,0.0,1.037037,0.0,,
63,IC2,IC,Gold Creek,AK,SB,DFO,1.2,2018-01-01,,0.000099,0.0,1.037037,0.0,,
63,IC3,IC,Gold Creek,AK,SB,DFO,1.1,2018-01-01,,0.000099,0.0,0.950617,0.0,,
63,IC4,IC,Gold Creek,AK,SB,DFO,3.5,2018-01-01,,0.000099,0.0,3.024691,0.0,,
63,IC5,IC,Gold Creek,AK,SB,DFO,1.1,2018-01-01,,0.000099,0.0,0.950617,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62713,DSM,PV,DSM Nutritional Products Solar,NJ,OP,SUN,10.0,2018-01-01,,0.000000,0.0,0.000000,0.0,,
62784,FRANK,PV,Franklin Solar Site,NY,OP,SUN,1.6,2018-01-01,,0.000928,0.0,13.000000,0.0,,
62785,MALON,PV,Malone Solar Site,NY,OP,SUN,1.1,2018-01-01,,0.000934,0.0,9.000000,0.0,,
62805,BLOOM,PV,Bloomington Solar I,UT,OP,SUN,2.0,2018-01-01,,0.005651,0.0,99.000000,0.0,,


In [53]:
# Update gen file with new netgenoz data (1g33)
gen_to_distribute_3 = gen_to_distribute_3[['NETGENOZ_update']].droplevel(1)

gen_file = gen_file.merge(gen_to_distribute_3, how='left', left_index=True, right_index=True)

gen_file[gen_file['NETGENOZ'] == 0]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,plant_name_eia,state,operational_status_code,energy_source_code_1,capacity_mw,report_date,sequence_number,CFACT,NUMBLR,NETGEN,NETGENOZ,data_source,retirement_year,NETGENOZ_update
plant_id_eia,generator_id,prime_mover_code,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
63,IC1,IC,Gold Creek,AK,SB,DFO,1.2,2018-01-01,,0.000099,0.0,1.037037,0.0,,,
63,IC2,IC,Gold Creek,AK,SB,DFO,1.2,2018-01-01,,0.000099,0.0,1.037037,0.0,,,
63,IC3,IC,Gold Creek,AK,SB,DFO,1.1,2018-01-01,,0.000099,0.0,0.950617,0.0,,,
63,IC4,IC,Gold Creek,AK,SB,DFO,3.5,2018-01-01,,0.000099,0.0,3.024691,0.0,,,
63,IC5,IC,Gold Creek,AK,SB,DFO,1.1,2018-01-01,,0.000099,0.0,0.950617,0.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62713,DSM,PV,DSM Nutritional Products Solar,NJ,OP,SUN,10.0,2018-01-01,,0.000000,0.0,0.000000,0.0,,,
62784,FRANK,PV,Franklin Solar Site,NY,OP,SUN,1.6,2018-01-01,,0.000928,0.0,13.000000,0.0,,,
62785,MALON,PV,Malone Solar Site,NY,OP,SUN,1.1,2018-01-01,,0.000934,0.0,9.000000,0.0,,,
62805,BLOOM,PV,Bloomington Solar I,UT,OP,SUN,2.0,2018-01-01,,0.005651,0.0,99.000000,0.0,,,


In [51]:
gen_file[gen_file['NETGENOZ'] == 0]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,plant_name_eia,state,operational_status_code,energy_source_code_1,capacity_mw,report_date,sequence_number,CFACT,NUMBLR,NETGEN,NETGENOZ,data_source,retirement_year
plant_id_eia,generator_id,prime_mover_code,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
63,IC1,IC,Gold Creek,AK,SB,DFO,1.2,2018-01-01,,0.000099,0.0,1.037037,0.0,,
63,IC2,IC,Gold Creek,AK,SB,DFO,1.2,2018-01-01,,0.000099,0.0,1.037037,0.0,,
63,IC3,IC,Gold Creek,AK,SB,DFO,1.1,2018-01-01,,0.000099,0.0,0.950617,0.0,,
63,IC4,IC,Gold Creek,AK,SB,DFO,3.5,2018-01-01,,0.000099,0.0,3.024691,0.0,,
63,IC5,IC,Gold Creek,AK,SB,DFO,1.1,2018-01-01,,0.000099,0.0,0.950617,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62713,DSM,PV,DSM Nutritional Products Solar,NJ,OP,SUN,10.0,2018-01-01,,0.000000,0.0,0.000000,0.0,,
62784,FRANK,PV,Franklin Solar Site,NY,OP,SUN,1.6,2018-01-01,,0.000928,0.0,13.000000,0.0,,
62785,MALON,PV,Malone Solar Site,NY,OP,SUN,1.1,2018-01-01,,0.000934,0.0,9.000000,0.0,,
62805,BLOOM,PV,Bloomington Solar I,UT,OP,SUN,2.0,2018-01-01,,0.005651,0.0,99.000000,0.0,,


In [48]:
gen_to_distribute_3 = gen_to_distribute_3[['NETGENOZ_update']].rename(columns={'NETGENOZ_update':'NETGENOZ'}).droplevel(1)
gen_to_distribute_3

Unnamed: 0_level_0,Unnamed: 1_level_0,NETGENOZ
plant_id_eia,generator_id,Unnamed: 2_level_1
1271,6,4448.497445
1702,1B,97835.659946
10061,GEN1,29530.805000
10149,GEN1,19323.334000
10301,GEN2,2819.296269
...,...,...
60464,CTG2,0.000000
61761,GEN-1,12181.451000
61838,ST1,619.331500
61838,ST2,619.331500
