In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

In [3]:
# Local cache of the PUDL v2024.5.0 EIA-860M generator changelog table.
path = Path('../data/data_cache/pudl/v2024.5.0/core_eia860m__changelog_generators.parquet')
# Fail fast and name the file, so a missing cache is obvious before the read below.
assert path.exists(), f"PUDL changelog parquet not found: {path.resolve()}"

In [4]:
# Read the cached table, requesting pandas nullable extension dtypes.
# NOTE(review): `use_nullable_dtypes` is deprecated as of pandas 2.0 in favour of
# `dtype_backend` — confirm the pinned pandas version before changing it.
ch = pd.read_parquet(path, use_nullable_dtypes=True)
# (rows, columns) of the loaded frame
ch.shape

(195978, 32)

In [5]:
# Column names, non-null counts and dtypes of the loaded frame
ch.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195978 entries, 0 to 195977
Data columns (total 32 columns):
 #   Column                                    Non-Null Count   Dtype   
---  ------                                    --------------   -----   
 0   report_date                               195978 non-null  object  
 1   valid_until_date                          194777 non-null  object  
 2   plant_id_eia                              195978 non-null  Int32   
 3   plant_name_eia                            195978 non-null  string  
 4   utility_id_eia                            195978 non-null  Int32   
 5   utility_name_eia                          195907 non-null  string  
 6   generator_id                              195978 non-null  string  
 7   balancing_authority_code_eia              109223 non-null  string  
 8   capacity_mw                               142176 non-null  float32 
 9   county                                    169757 non-null  string  
 10  current_

In [6]:
# Fraction of records carrying a balancing authority code, per operational_status.
# Grouping the boolean "has a code" mask and averaging it yields the share directly.
(
    ch['balancing_authority_code_eia']
    .notna()
    .groupby(ch['operational_status'])
    .mean()
)

operational_status
existing    0.519633
proposed    0.801738
retired     0.385126
Name: balancing_authority_code_eia, dtype: float64

In [18]:
# coverage by operational_status, looking only at current projects
# `valid_until_date` is loaded as object dtype, so comparing it directly against a
# Timestamp mixes date objects with a Timestamp (deprecated in pandas 1.x and
# never equal in pandas 2.x). Convert the column once and compare datetime64 values.
valid_until = pd.to_datetime(ch['valid_until_date'])
max_date = valid_until.max()
ch.loc[valid_until.eq(max_date), :].groupby('operational_status')['balancing_authority_code_eia'].agg(lambda x: x.notna().mean())

  result = libops.scalar_compare(x.ravel(), y, op)


operational_status
existing    0.970888
proposed    0.992342
retired     0.605713
Name: balancing_authority_code_eia, dtype: float64

In [7]:
# how many BAs?
# Tally every balancing authority code, keeping missing values as their own row.
ba_code_counts = ch['balancing_authority_code_eia'].value_counts(dropna=False)
# Raise the row limit only for this display so the tally is less likely to be truncated.
with pd.option_context('display.max_rows', 80):
    display(ba_code_counts)

<NA>    86755
MISO    20147
PJM     16767
CISO    12338
ISNE     8854
ERCO     8113
NYIS     7075
SWPP     5659
DUK      3225
CPLE     3113
SOCO     2945
PACE     1352
TVA      1337
BPAT     1204
FPL      1154
NEVP     1126
PSCO     1090
FPC       969
WACM      950
IPCO      901
LDWP      808
PGE       670
PNM       669
SCEG      649
NWMT      588
AZPS      556
PACW      527
AECI      507
BANC      481
IID       461
TEC       442
SRP       408
TEPC      346
WALC      295
PSEI      261
HECO      250
FMPP      246
SC        241
EPE       229
AVA       227
WAUW      197
SPA       180
JEA       164
LGEE      155
SCL       131
SEC        86
AEC        84
TAL        83
TIDC       82
AVRN       81
NBSO       72
SEPA       71
CHPD       64
GCPD       64
CPLW       61
EEI        57
TPWR       55
GVL        55
YAD        54
GRMA       51
DOPD       36
HST        27
GLHB       22
HGMA       18
NSB        14
MPS        14
GRIF       13
SECI       12
OVEC       11
NPPD        8
CSTO        6
GRID  

In [8]:
# what places are most frequently missing BAs?
# Keep only the records without a BA code, then count them per (state, county).
missing_ba = ch['balancing_authority_code_eia'].isna()
ch.loc[missing_ba, ['state', 'county']].value_counts(dropna=True)

state  county         
CA     Los Angeles        955
       Kern               593
       San Bernardino     462
HI     Honolulu           449
CA     Riverside          416
                         ... 
OH     Columbiana           1
AR     Jackson              1
TX     Zapata               1
IL     Fayette              1
VA     Petersburg City      1
Length: 2488, dtype: int64

In [9]:
# Most counties that have records with a missing BA code contain exactly one
# distinct BA code, so imputing from the county is low risk there.
ba_by_county = ch.groupby(['state', 'county'])['balancing_authority_code_eia']
# size = all records, count = records with a code, nunique = distinct codes
cty = ba_by_county.agg(['size', 'count', 'nunique'])
# Counties where at least one record lacks a code
has_missing = cty['count'] < cty['size']
cty.loc[has_missing, 'nunique'].value_counts()

1    1932
2     370
0      96
3      77
4      11
8       1
5       1
Name: nunique, dtype: int64

In [10]:
# Set of column names to compare against the columns of `ch`.
# NOTE(review): presumably these are the columns already known/handled elsewhere
# ("meta" columns) — confirm the intended meaning of the name.
meta_cols = {
    "report_date",
    "generator_id",
    "plant_id_eia",
    "valid_until_date",
    "plant_name_eia",
    "utility_id_eia",
    "utility_name_eia",
    "capacity_mw",
    "county",
    "current_planned_generator_operating_date",
    "balancing_authority_code_eia",
    "data_maturity",
    "energy_source_code_1",
    "energy_storage_capacity_mwh",
    "fuel_type_code_pudl",
    "generator_retirement_date",
    "latitude",
    "longitude",
    "net_capacity_mwdc",
    "operational_status",
    "raw_operational_status_code",
    "operational_status_code",
    "planned_derate_date",
    "planned_generator_retirement_date",
    "planned_net_summer_capacity_derate_mw",
    "planned_net_summer_capacity_uprate_mw",
    "planned_uprate_date",
    "prime_mover_code",
    "state",
    "summer_capacity_mw",
    "technology_description",
    "winter_capacity_mw",
    "state_id_fips",
    "county_id_fips",
}
# Columns present in `ch` that are not listed in `meta_cols`
ch.columns.difference(meta_cols)

Index(['sector_id_eia'], dtype='object')

In [11]:
# Value distribution of `sector_id_eia`, with missing values counted as their own row
ch['sector_id_eia'].value_counts(dropna=False)

2       93932
1       74206
7        9671
4        4951
5        4740
3        4608
6        3843
<NA>       27
Name: sector_id_eia, dtype: Int64