# GridStatus <-> LBNL Status Mapping

GridStatus reads raw ISO queues. The LBNL queue statuses are standardized versions of ISO-specific categories. This notebook attempts to reverse engineer the mappings between the two.

In [1]:
import pandas as pd
from pathlib import Path
import numpy as np

from dbcp.extract.lbnl_iso_queue import extract



import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


## Get the data
### LBNL-Compiled Queues

In [2]:
# partial implementation of transform. I don't want to include deduplication.
def partial_transform(active_projects: pd.DataFrame) -> pd.DataFrame:
    """Rename location columns and harmonize LBNL interconnection statuses.

    Steps, in order:
      1. rename ``state`` -> ``raw_state_name`` and ``county`` -> ``raw_county_name``;
      2. collapse spelling/wording variants in ``interconnection_status_lbnl``
         onto a single label each;
      3. drop the ``date_withdrawn`` and ``date_operational`` columns.

    No deduplication is performed. The input frame is left untouched; a new
    frame is returned.

    Raises:
        KeyError: if ``interconnection_status_lbnl``, ``date_withdrawn`` or
            ``date_operational`` is missing from the input.
    """
    status_col = "interconnection_status_lbnl"
    column_renames = {"state": "raw_state_name", "county": "raw_county_name"}
    # Variant label -> harmonized label.
    harmonized_status = {
        "Feasability Study": "Feasibility Study",
        "Feasibility": "Feasibility Study",
        "Facilities Study": "Facility Study",
        "IA in Progress": "In Progress (unknown study)",
        "Unknown": "In Progress (unknown study)",
        "Withdrawn, Feasibility Study": "Withdrawn",
    }
    renamed = active_projects.rename(columns=column_renames)  # new frame, input untouched
    relabeled = renamed.assign(
        **{status_col: renamed[status_col].replace(harmonized_status)}
    )
    return relabeled.drop(columns=["date_withdrawn", "date_operational"])


# Read the LBNL workbook with the project's extractor and keep only the
# "lbnl_iso_queue" entry of what it returns.
# NOTE(review): hard-coded absolute path — presumably only valid inside the
# project's container; confirm before running elsewhere.
source_path = Path("/app/data/raw/queues_2022_clean_data.xlsx")
raw_lbnl = extract(source_path)["lbnl_iso_queue"]
# Rename/harmonize only; partial_transform deliberately skips deduplication.
lbnl = partial_transform(raw_lbnl)


In [3]:
lbnl.shape, lbnl.columns


((29033, 27),
 Index(['queue_id', 'queue_status', 'queue_date', 'queue_year', 'interconnection_date', 'entity', 'project_name', 'developer', 'utility', 'county_1', 'county_2', 'county_3', 'raw_state_name', 'region', 'interconnection_service_type', 'point_of_interconnection', 'date_proposed', 'year_proposed', 'interconnection_status_raw', 'interconnection_status_lbnl', 'resource_type_lbnl', 'resource_type_1', 'resource_type_2', 'resource_type_3', 'capacity_mw_resource_1', 'capacity_mw_resource_2', 'capacity_mw_resource_3'], dtype='object'))

In [4]:
lbnl.head(2)


Unnamed: 0,queue_id,queue_status,queue_date,queue_year,interconnection_date,entity,project_name,developer,utility,county_1,county_2,county_3,raw_state_name,region,interconnection_service_type,point_of_interconnection,date_proposed,year_proposed,interconnection_status_raw,interconnection_status_lbnl,resource_type_lbnl,resource_type_1,resource_type_2,resource_type_3,capacity_mw_resource_1,capacity_mw_resource_2,capacity_mw_resource_3
0,GIA-97,withdrawn,1/7/2022,2022.0,,AEC,,,AEC,new madrid,,,MO,Southeast (non-ISO),Network,New Madrid - Essex 345kV,10/31/2024,2024.0,Withdrawn,Withdrawn,Solar,Solar,,,350.0,,
1,GIA-40,active,10/24/2009,2009.0,,AEC,,,AEC,new madrid,,,MO,Southeast (non-ISO),Network Resource,NM Switchyard (345 kV Bus),11/1/2011,2011.0,Upgrade Approved,IA Executed,Coal,Coal,,,20.0,,


### GridStatus Queues

In [5]:
import dbcp

# These are the revision numbers of the oldest archives we have
# Keys are lowercase ISO names; values are the revision identifiers, kept as strings.
iso_queue_versions: dict[str, str] = {
    "miso": "1681775160487863",
    "caiso": "1681775162586588",
    "pjm": "1681775160979859",
    "ercot": "1681775161342766",
    "spp": "1681775162935809",
    "nyiso": "1681775159356063",
    "isone": "1681775162111351",
}

# Fetch one GridStatus queue per ISO at the pinned revision. The result is used
# below as a mapping of ISO name -> DataFrame.
# NOTE(review): the saved output shows this call prompting for Google OAuth
# authorization, so it presumably cannot run unattended — confirm.
gs_dfs = dbcp.extract.gridstatus_isoqueues.extract(iso_queue_versions)

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=262006177488-ka1m0ue4fptfmt9siejdd5lom7p39upa.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fpydata-google-auth.readthedocs.io%2Fen%2Flatest%2Foauth.html&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform&state=pHtZTUFyuug8sHFsYzeqo4BFCN3zBR&access_type=offline


Enter the authorization code:  [REDACTED]


In [6]:
{k: v.shape for k, v in gs_dfs.items()}

{'miso': (4253, 31),
 'caiso': (2278, 36),
 'pjm': (9030, 37),
 'ercot': (1203, 35),
 'spp': (950, 24),
 'nyiso': (1535, 24),
 'isone': (1533, 31)}

In [7]:
# # These are manually downloaded from our archives. I went back as far as I could,
# # which is April 17 2023.
# # There is a 3.5 month gap between the LBNL queues and our oldest GridStatus archive.
# root_path = Path("/app/data/raw/gridstatus/interconnection_queues")
# assert root_path.exists()
# # filenames are like "interconnection_queues_caiso_4-17-2023.parquet"
# gs_dfs = {
#     path.name.split("_")[2]: pd.read_parquet(path)
#     for path in root_path.glob("*.parquet")
# }
# {k: v.shape for k, v in gs_dfs.items()}


In [8]:
# GridStatus has far fewer rows in total than the LBNL compilation
# (a negative result means LBNL is larger).
sum(len(frame) for frame in gs_dfs.values()) - len(lbnl)


-8251

In [9]:
lbnl.region.value_counts()

PJM                    7738
West (non-ISO)         6610
MISO                   4071
Southeast (non-ISO)    3070
CAISO                  2274
ERCOT                  1952
NYISO                  1260
ISO-NE                 1255
SPP                     802
Name: region, dtype: int64

In [10]:
lbnl_iso = lbnl[~lbnl["region"].str.contains("non-ISO", na=False)].copy()

In [11]:
# Same comparison, restricted to LBNL rows in ISO regions.
sum(len(frame) for frame in gs_dfs.values()) - len(lbnl_iso)

1429

GridStatus has more ISO projects than LBNL (1,429 more rows).

In [17]:
# globals().update(gs_dfs)  # this works fine but the static type checker/linter can't introspect it.
raw_ercot = gs_dfs["ercot"]
raw_nyiso = gs_dfs["nyiso"]
raw_isone = gs_dfs["isone"]
raw_miso = gs_dfs["miso"]
raw_pjm = gs_dfs["pjm"]
raw_spp = gs_dfs["spp"]
raw_caiso = gs_dfs["caiso"]


In [18]:
# LBNL columns carried into every GridStatus <-> LBNL comparison frame.
LBNL_JOIN_COLS: list[str] = [
    "queue_id",  # join key
    "project_name",  # for manually checking the joins
    "queue_date",  # for manually checking the joins
    "queue_status",  # for manually checking the joins
    "interconnection_status_raw",  # see what LBNL interpreted
    "interconnection_status_lbnl",  # final mapping value
    "capacity_mw_resource_1",
    "resource_type_1",
]


def join_lbnl(
    iso_df: pd.DataFrame, lbnl: pd.DataFrame, iso_name: str, iso_id_col: str = "Queue ID"
) -> pd.DataFrame:
    """Outer-join one ISO's GridStatus queue to the LBNL rows for that ISO.

    Args:
        iso_df: GridStatus queue for a single ISO.
        lbnl: LBNL queue data; must contain ``entity`` plus every column in
            ``LBNL_JOIN_COLS``.
        iso_name: value of ``lbnl["entity"]`` that selects this ISO's rows.
        iso_id_col: name of the ID column in ``iso_df`` to join on.

    Returns:
        All columns of ``iso_df`` followed by the ``LBNL_JOIN_COLS`` columns,
        plus two boolean columns, ``in_lbnl`` and ``in_gs``, telling which
        source(s) each row came from.

    Raises:
        AssertionError: if either ID column contains duplicates, or if no LBNL
            rows have ``entity == iso_name``.
    """
    assert iso_df[iso_id_col].is_unique, "ID column not unique"
    lbnl_iso = lbnl.loc[lbnl["entity"].eq(iso_name), LBNL_JOIN_COLS]
    assert not lbnl_iso.empty, f"Empty LBNL queue for {iso_name}"
    assert lbnl_iso["queue_id"].is_unique, "LBNL queue ID not unique"
    out = iso_df.merge(
        lbnl_iso,
        how="outer",
        left_on=iso_id_col,
        right_on="queue_id",
        indicator=True,
    )
    # Derive membership from the merge indicator instead of key null-ness, so a
    # row whose own ID is missing is still attributed to the frame it came from.
    # Caveat: pandas matches null keys to each other, so a null ID on *both*
    # sides would be reported as one row present in both sources.
    merge_source = out.pop("_merge")
    out["in_lbnl"] = merge_source.ne("left_only")
    out["in_gs"] = merge_source.ne("right_only")
    return out


## Comparisons
### ERCOT
* ERCOT's queue document has separate excel sheets for large and small (< 20MW) projects, and does not list withdrawn projects. GridStatus only includes large projects, whereas LBNL data includes the other categories. I'm not sure where LBNL gets the withdrawn projects from.
  * We might be able to maintain a withdrawn list by 1) getting all the current withdrawn projects from the LBNL data, then 2) loading all the archived GridStatus ERCOT queues and checking for any projects that enter then exit the list. But I don't think we actually need a withdrawn list?
* GridStatus defines "status" as "IA Signed".isna(). LBNL calls the entire "large active" dataset "active".

In [19]:
raw_ercot.columns


Index(['Queue ID', 'Project Name', 'Interconnecting Entity', 'County', 'State', 'Interconnection Location', 'Transmission Owner', 'Generation Type', 'Capacity (MW)', 'Summer Capacity (MW)', 'Winter Capacity (MW)', 'Queue Date', 'Status', 'Proposed Completion Date', 'Withdrawn Date', 'Withdrawal Comment', 'Actual Completion Date', 'Fuel', 'Technology', 'GIM Study Phase', 'Screening Study Started', 'Screening Study Complete', 'FIS Requested', 'FIS Approved', 'Economic Study Required', 'IA Signed', 'Air Permit', 'GHG Permit', 'Water Availability', 'Meets Planning', 'Meets All Planning', 'CDR Reporting Zone', 'Approved for Energization', 'Approved for Synchronization', 'Comment'], dtype='object')

In [20]:
ercot = join_lbnl(raw_ercot, lbnl, "ERCOT")
ercot.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2273 entries, 0 to 2272
Data columns (total 45 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   Queue ID                      1203 non-null   object        
 1   Project Name                  1203 non-null   object        
 2   Interconnecting Entity        1203 non-null   object        
 3   County                        1203 non-null   object        
 4   State                         1203 non-null   object        
 5   Interconnection Location      1203 non-null   object        
 6   Transmission Owner            0 non-null      object        
 7   Generation Type               1203 non-null   object        
 8   Capacity (MW)                 1203 non-null   float64       
 9   Summer Capacity (MW)          0 non-null      object        
 10  Winter Capacity (MW)          0 non-null      object        
 11  Queue Date                    

In [21]:
ercot[["in_gs", "in_lbnl"]].value_counts(dropna=False)


in_gs  in_lbnl
False  True       1070
True   True        882
       False       321
dtype: int64

In [25]:
ercot["resource_type_1"].value_counts()

Solar            772
Wind             580
Battery          352
Gas              236
Other              4
Other Storage      4
Unknown            4
Name: resource_type_1, dtype: int64

In [28]:
ercot["Fuel"].value_counts()

Other      556
Solar      507
Wind       101
Gas         38
Biomass      1
Name: Fuel, dtype: int64

In [29]:
ercot["Technology"].value_counts()

Battery Energy Storage                                        550
Photovoltaic Solar                                            507
Wind Turbine                                                  101
Combustion (gas) Turbine, but not part of a Combined-Cycle     18
Combined-Cycle                                                 14
Internal Combustion Engine, eg. Reciprocating                   4
Other                                                           4
Steam Turbine other than Combined-Cycle                         3
Energy Storage                                                  2
Name: Technology, dtype: int64

Great! Between GS.Fuel and GS.Technology, it should be pretty easy to map to the values used in LBNL.

### NYISO

* LBNL appears to have used the column labeled `S`, which is an ordinal status number corresponding to NYISO's LFIP. Key:
  * 0=Withdrawn
  * 1=Scoping Meeting Pending
  * 2=FES Pending
  * 3=FES in Progress
  * 4=SRIS/SIS Pending
  * 5=SRIS/SIS in Progress
  * 6=SRIS/SIS Approved
  * 7=FS Pending
  * 8=Rejected Cost Allocation/Next FS Pending
  * 9=FS in Progress
  * 10=Accepted Cost Allocation/IA in Progress
  * 11=IA Completed
  * 12=Under Construction
  * 13=In Service for Test
  * 14=In Service Commercial
  * 15=Partial In-Service
* Availability of Studies  Key: None=Not Available, FES=Feasibility Study Available, SRIS=System Reliability Impact Study Available, FS=Facilities Study and/or ATRA Available
* CY Completion/SGIA Tender refers to the Attachment X milestone used to apply the 4-year COD limitation.


In [30]:
raw_nyiso.columns


Index(['Queue ID', 'Project Name', 'Interconnecting Entity', 'County', 'State', 'Interconnection Location', 'Transmission Owner', 'Generation Type', 'Capacity (MW)', 'Summer Capacity (MW)', 'Winter Capacity (MW)', 'Queue Date', 'Status', 'Proposed Completion Date', 'Withdrawn Date', 'Withdrawal Comment', 'Actual Completion Date', 'Proposed  In-Service', 'Proposed Initial-Sync Date', 'Last Updated Date', 'Z', 'S', 'Availability of Studies', 'SGIA Tender Date'], dtype='object')

In [31]:
raw_nyiso[raw_nyiso["Queue ID"].duplicated(keep=False)]

Unnamed: 0,Queue ID,Project Name,Interconnecting Entity,County,State,Interconnection Location,Transmission Owner,Generation Type,Capacity (MW),Summer Capacity (MW),Winter Capacity (MW),Queue Date,Status,Proposed Completion Date,Withdrawn Date,Withdrawal Comment,Actual Completion Date,Proposed In-Service,Proposed Initial-Sync Date,Last Updated Date,Z,S,Availability of Studies,SGIA Tender Date
5,0430,Cedar Rapids Transmission,,St. Lawrence,NY,,NM-NG,AC Transmission,0.0,,,2014-03-05,Active,NaT,,,,NaT,NaT,12/31/2019,D,12.0,"SIS, FS",
626,0127A,Munnsville,"Airtricity Munnsville Wind Farm, LLC",Madison,NY,46kV line,NYSEG,Wind,6.0,6.0,,2002-10-09,Withdrawn,NaT,2014-01-31 00:00:00,,,NaT,NaT,,E,0.0,,
1442,0127A,Munnsville,,Madison,NY,,NYSEG,Wind,40.0,40.0,,2002-10-09,Completed,NaT,,,3/31/13,NaT,NaT,3/31/13,E,1114.0,"SRIS, CY06",
1517,0430,Cedar Rapids Transmission,,St. Lawrence,NY,,NM-NG,AC Transmission,0.0,,,2014-03-05,Completed,NaT,,,2022-03-31 00:00:00,NaT,2021-10-01,2022-03-31 00:00:00,D,14.0,"SIS, FS",


In [32]:
# I don't know which is correct (if either), but I'll guess that the later one is. So keep='last'
raw_nyiso.loc[raw_nyiso.duplicated(subset='Queue ID', keep=False), :].sort_values(by='Queue ID')


Unnamed: 0,Queue ID,Project Name,Interconnecting Entity,County,State,Interconnection Location,Transmission Owner,Generation Type,Capacity (MW),Summer Capacity (MW),Winter Capacity (MW),Queue Date,Status,Proposed Completion Date,Withdrawn Date,Withdrawal Comment,Actual Completion Date,Proposed In-Service,Proposed Initial-Sync Date,Last Updated Date,Z,S,Availability of Studies,SGIA Tender Date
626,0127A,Munnsville,"Airtricity Munnsville Wind Farm, LLC",Madison,NY,46kV line,NYSEG,Wind,6.0,6.0,,2002-10-09,Withdrawn,NaT,2014-01-31 00:00:00,,,NaT,NaT,,E,0.0,,
1442,0127A,Munnsville,,Madison,NY,,NYSEG,Wind,40.0,40.0,,2002-10-09,Completed,NaT,,,3/31/13,NaT,NaT,3/31/13,E,1114.0,"SRIS, CY06",
5,0430,Cedar Rapids Transmission,,St. Lawrence,NY,,NM-NG,AC Transmission,0.0,,,2014-03-05,Active,NaT,,,,NaT,NaT,12/31/2019,D,12.0,"SIS, FS",
1517,0430,Cedar Rapids Transmission,,St. Lawrence,NY,,NM-NG,AC Transmission,0.0,,,2014-03-05,Completed,NaT,,,2022-03-31 00:00:00,NaT,2021-10-01,2022-03-31 00:00:00,D,14.0,"SIS, FS",


In [33]:
nyiso = join_lbnl(raw_nyiso.drop_duplicates(subset='Queue ID', keep='last'), lbnl, "NYISO")
nyiso.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1607 entries, 0 to 1606
Data columns (total 34 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Queue ID                     1533 non-null   object        
 1   Project Name                 1533 non-null   object        
 2   Interconnecting Entity       887 non-null    object        
 3   County                       1450 non-null   object        
 4   State                        1451 non-null   object        
 5   Interconnection Location     877 non-null    object        
 6   Transmission Owner           1527 non-null   object        
 7   Generation Type              1392 non-null   object        
 8   Capacity (MW)                1533 non-null   float64       
 9   Summer Capacity (MW)         1433 non-null   string        
 10  Winter Capacity (MW)         1280 non-null   string        
 11  Queue Date                   1533 non-null 

In [34]:
nyiso.head()


Unnamed: 0,Queue ID,Project Name,Interconnecting Entity,County,State,Interconnection Location,Transmission Owner,Generation Type,Capacity (MW),Summer Capacity (MW),Winter Capacity (MW),Queue Date,Status,Proposed Completion Date,Withdrawn Date,Withdrawal Comment,Actual Completion Date,Proposed In-Service,Proposed Initial-Sync Date,Last Updated Date,Z,S,Availability of Studies,SGIA Tender Date,queue_id,project_name,queue_date,queue_status,interconnection_status_raw,interconnection_status_lbnl,capacity_mw_resource_1,resource_type_1,in_lbnl,in_gs
0,0276,Homer Solar Energy Center,,Cortland,NY,,NM-NG,Solar,90.0,90.0,90.0,2008-01-30,Active,2023-09-01,,,,NaT,2023-09-01,9/30/2022,C,10.0,"FES, SRIS, FS",,276.0,Homer Solar Energy Center,1/30/2008,active,"FES, SRIS, FS",Facility Study,90.0,Solar,True,True
1,0396,Baron Winds,,Steuben,NY,,NYSEG,Wind,235.0,235.0,235.0,2012-11-30,Active,2023-12-01,,,,NaT,NaT,12/31/2022,C,15.0,"FES, SRIS, FS",,396.0,Baron Winds,11/30/2012,active,"FES, SRIS, FS",Facility Study,238.0,Wind,True,True
2,0396A,Wood Street Transformer,,Putnam,NY,,NYSEG,AC Transmission,0.0,,,2012-12-14,Active,NaT,,,,NaT,NaT,4/30/2021,G,12.0,SIS,,,,,,,,,,False,True
3,0422,Eight Point Wind Energy Center,,Steuben-Allegany,NY,,NYSEG,Wind,101.8,101.8,101.8,2013-11-07,Active,NaT,,,,NaT,NaT,3/31/2021,C,14.0,"FES, SRIS, FS",,422.0,Eight Point Wind Energy Center,11/7/2013,active,"FES, SRIS, FS",Facility Study,102.0,Wind,True,True
4,0429,North Rockland Station,,Steuben,NY,,ConEd,AC Transmission,0.0,,,2014-02-12,Active,NaT,,,,NaT,NaT,3/31/2022,G,12.0,SIS,,,,,,,,,,False,True


In [35]:
nyiso[["in_gs", "in_lbnl"]].value_counts(dropna=False)


in_gs  in_lbnl
True   True       1186
       False       347
False  True         74
dtype: int64

In [36]:
# Marginal improvement from restricting to projects queued before 2023.
# Rows with no GridStatus "Queue Date" get a placeholder date inside the window,
# so they are kept by the filter.
nyiso.loc[
    nyiso["Queue Date"].fillna(pd.Timestamp("2020-01-01")) < pd.Timestamp("2023-01-01"),
    ["in_gs", "in_lbnl"],
].value_counts(dropna=False)


in_gs  in_lbnl
True   True       1186
       False       324
False  True         74
dtype: int64

### Compare fuel types

In [38]:
nyiso["resource_type_1"].value_counts()

Solar             392
Battery           333
Wind              184
Gas               119
Unknown            86
Offshore Wind      77
Methane            24
Hydro              17
Pumped Storage      8
Fuel Cell           8
Nuclear             6
Biomass             3
Flywheel            2
Landfill            1
Name: resource_type_1, dtype: int64

In [41]:
nyiso["Generation Type"].value_counts()

Energy Storage                                      369
Solar                                               366
Wind                                                266
AC Transmission                                     143
DC Transmission                                      46
Combustion Turbine                                   46
Combined Cycle                                       35
Methane                                              24
Hydro                                                18
Load                                                 16
Natural Gas                                          12
Steam Turbine                                        11
Pumped Storage                                        8
Fuel Cell                                             8
Nuclear                                               6
Dual Fuel                                             5
Wood                                                  3
Flywheel                                        

In [47]:
nyiso_transmission = nyiso[nyiso["Generation Type"].str.contains("Transmission",na=False)]
nyiso_transmission.queue_id.isna().value_counts()

True    190
Name: queue_id, dtype: int64

Looks like LBNL removes transmission projects.

GS has more generation types, but it shouldn't be so hard to consolidate them into LBNL's types.

### MISO
Very good project coverage, but the IA status categories are a mess.

In [50]:
raw_miso.columns


Index(['Queue ID', 'Project Name', 'Interconnecting Entity', 'County', 'State', 'Interconnection Location', 'Transmission Owner', 'Generation Type', 'Capacity (MW)', 'Summer Capacity (MW)', 'Winter Capacity (MW)', 'Queue Date', 'Status', 'Proposed Completion Date', 'Withdrawn Date', 'Withdrawal Comment', 'Actual Completion Date', 'facilityType', 'Post Generator Interconnection Agreement Status', 'Interconnection Approval Date', 'inService', 'giaToExec', 'studyCycle', 'studyGroup', 'studyPhase', 'svcType', 'dp1ErisMw', 'dp1NrisMw', 'dp2ErisMw', 'dp2NrisMw', 'sisPhase1'], dtype='object')

In [51]:
# I don't know which is correct (if either), but I'll guess that the later one is. So keep='last'
# Nearly whole-row duplicate, except for "studyPhase"
raw_miso.loc[raw_miso.duplicated(subset='Queue ID', keep=False), :].sort_values(by='Queue ID')


Unnamed: 0,Queue ID,Project Name,Interconnecting Entity,County,State,Interconnection Location,Transmission Owner,Generation Type,Capacity (MW),Summer Capacity (MW),Winter Capacity (MW),Queue Date,Status,Proposed Completion Date,Withdrawn Date,Withdrawal Comment,Actual Completion Date,facilityType,Post Generator Interconnection Agreement Status,Interconnection Approval Date,inService,giaToExec,studyCycle,studyGroup,studyPhase,svcType,dp1ErisMw,dp1NrisMw,dp2ErisMw,dp2NrisMw,sisPhase1
2213,J392,,,Otsego County,MI,ITC/METC 138 kV Livingston to Stover line,"Michigan Electric Transmission Company, LLC",Gas,420.7,383.1,420.7,2014-09-15T04:00:00Z,Done,2015-09-01T04:00:00Z,,,,CT Combustion Turbine (Simple Cycle),In Service,2016-03-23T04:00:00Z,2016-05-16T04:00:00Z,2016-03-23T04:00:00Z,DPP-2015-FEB,East (ITC),Network Upgrade,NRIS,0.0,0.0,0.0,0.0,
2214,J392,,,Otsego County,MI,ITC/METC 138 kV Livingston to Stover line,"Michigan Electric Transmission Company, LLC",Gas,420.7,383.1,420.7,2014-09-15T04:00:00Z,Done,2015-09-01T04:00:00Z,,,,CT Combustion Turbine (Simple Cycle),In Service,2016-03-23T04:00:00Z,2016-05-16T04:00:00Z,2016-03-23T04:00:00Z,DPP-2015-FEB,East (ITC),Int FaS,NRIS,0.0,0.0,0.0,0.0,


In [52]:
# LBNL's MISO rows also contain a repeated queue ID; list every row that shares one.
# I don't know which copy is correct (if either), so the join below keeps the last.
(
    lbnl.query('entity == "MISO"')
    .loc[lambda miso_rows: miso_rows.duplicated(subset="queue_id", keep=False), :]
    .sort_values(by="queue_id")
)


Unnamed: 0,queue_id,queue_status,queue_date,queue_year,interconnection_date,entity,project_name,developer,utility,county_1,county_2,county_3,raw_state_name,region,interconnection_service_type,point_of_interconnection,date_proposed,year_proposed,interconnection_status_raw,interconnection_status_lbnl,resource_type_lbnl,resource_type_1,resource_type_2,resource_type_3,capacity_mw_resource_1,capacity_mw_resource_2,capacity_mw_resource_3
10519,J392,operational,9/15/2014,2014.0,3/23/2016,MISO,,,"Michigan Electric Transmission Company, LLC",otsego,,,MI,MISO,NRIS,ITC/METC 138 kV Livingston to Stover line,5/16/2016,2016.0,In Service,Operational,Gas,Gas,,,421.0,,
10842,J392,operational,9/15/2014,2014.0,3/23/2016,MISO,,,"Michigan Electric Transmission Company, LLC",otsego,,,MI,MISO,NRIS,ITC/METC 138 kV Livingston to Stover line,5/16/2016,2016.0,In Service,Operational,Gas,Gas,,,421.0,,


In [53]:
# join_lbnl asserts that both ID columns are unique, so drop the repeated
# queue IDs on each side first (keeping the last occurrence).
miso = join_lbnl(
    raw_miso.drop_duplicates(subset="Queue ID", keep="last"),
    lbnl.loc[lbnl["entity"].eq("MISO")].drop_duplicates(subset="queue_id", keep="last"),
    "MISO",
)
miso.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 4252 entries, 0 to 4251
Data columns (total 41 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   Queue ID                                         4252 non-null   object 
 1   Project Name                                     0 non-null      object 
 2   Interconnecting Entity                           0 non-null      object 
 3   County                                           4252 non-null   object 
 4   State                                            4252 non-null   object 
 5   Interconnection Location                         4252 non-null   object 
 6   Transmission Owner                               4252 non-null   object 
 7   Generation Type                                  4252 non-null   object 
 8   Capacity (MW)                                    4252 non-null   float64
 9   Summer Capacity (MW)          

In [54]:
miso[["in_gs", "in_lbnl"]].value_counts(dropna=False)


in_gs  in_lbnl
True   True       4070
       False       182
dtype: int64

### Compare generation types

In [55]:
miso["resource_type_1"].value_counts()

Solar             1724
Wind              1363
Battery            293
Gas                292
Unknown            140
Coal               100
Diesel              36
Biomass             32
Nuclear             26
Hydro               22
Hybrid              19
Waste Heat           8
Landfill             8
Pumped Storage       6
Steam                1
Name: resource_type_1, dtype: int64

In [57]:
miso["Generation Type"].value_counts()

Solar                  1479
Wind                   1364
Battery Storage         456
Hybrid                  272
Gas                     232
                        143
Coal                    100
Combined Cycle           57
Diesel                   36
Hydro                    28
Nuclear                  26
Biomass                  22
Wood                     10
Waste Heat Recovery       8
Landfill Gas              8
High Voltage DC           8
Co-Gen                    2
Steam                     1
Name: Generation Type, dtype: int64

Generation type mapping looks pretty straightforward.

### SPP

* Neither LBNL nor GridStatus has withdrawn projects
* GridStatus destroys the detailed status information during their processing, so we lose the ability to distinguish between "IA pending" and "System Integration Study". But I don't think that is a problem because both are included in the "actionable" criteria in Synapse's model.

In [58]:
raw_spp.columns


Index(['Queue ID', 'Project Name', 'Interconnecting Entity', 'County', 'State', 'Interconnection Location', 'Transmission Owner', 'Generation Type', 'Capacity (MW)', 'Summer Capacity (MW)', 'Winter Capacity (MW)', 'Queue Date', 'Status', 'Proposed Completion Date', 'Withdrawn Date', 'Withdrawal Comment', 'Actual Completion Date', 'In-Service Date', 'Commercial Operation Date', 'Cessation Date', 'Current Cluster', 'Cluster Group', 'Replacement Generator Commercial Op Date', 'Service Type'], dtype='object')

In [59]:
spp = join_lbnl(raw_spp, lbnl, "SPP")
spp.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 965 entries, 0 to 964
Data columns (total 34 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Queue ID                                  950 non-null    object 
 1   Project Name                              0 non-null      object 
 2   Interconnecting Entity                    0 non-null      object 
 3   County                                    945 non-null    object 
 4   State                                     946 non-null    object 
 5   Interconnection Location                  933 non-null    object 
 6   Transmission Owner                        942 non-null    object 
 7   Generation Type                           950 non-null    object 
 8   Capacity (MW)                             950 non-null    float64
 9   Summer Capacity (MW)                      950 non-null    float64
 10  Winter Capacity (MW)                  

In [60]:
spp.sample(4)


Unnamed: 0,Queue ID,Project Name,Interconnecting Entity,County,State,Interconnection Location,Transmission Owner,Generation Type,Capacity (MW),Summer Capacity (MW),Winter Capacity (MW),Queue Date,Status,Proposed Completion Date,Withdrawn Date,Withdrawal Comment,Actual Completion Date,In-Service Date,Commercial Operation Date,Cessation Date,Current Cluster,Cluster Group,Replacement Generator Commercial Op Date,Service Type,queue_id,project_name,queue_date,queue_status,interconnection_status_raw,interconnection_status_lbnl,capacity_mw_resource_1,resource_type_1,in_lbnl,in_gs
578,GEN-2017-195,,,Johnson,KS,West Gardner 345kV,KCPL,Solar,500.4,500.4,500.4,11/30/2017,,12/30/2026,,,,12/1/2020,12/30/2026,,DISIS-2017-002,03 CENTRAL,,ER/NR,GEN-2017-195,,11/30/2017,active,DISIS STAGE,System Impact Study,500.0,Solar,True,True
962,,,,,,,,,,,,,,,,,,,,,,,,,GEN-2015-055,,10/6/2015,active,IA FULLY EXECUTED/ON SCHEDULE,IA Executed,40.0,Solar,True,False
351,GEN-2021-012,,,Beckham,OK,Border 345 kV interconnection substation,OGE,Battery/Storage,225.0,227.0,227.0,4/30/2021,Active,12/31/2026,,,,11/15/2023,12/31/2026,,DISIS-2021-001,04 SOUTHEAST,,ER/NR,GEN-2021-012,,4/30/2021,active,DISIS STAGE,System Impact Study,225.0,Battery,True,True
438,GEN-2019-066,,,Beckham,OK,Sweetwater 230kV substation,AEP,Battery/Storage,50.0,50.0,50.0,5/24/2019,Active,5/1/2026,,,,6/1/2023,5/1/2026,,DISIS-2019-001,04 SOUTHEAST,,ER/NR,GEN-2019-066,,5/24/2019,active,DISIS STAGE,System Impact Study,50.0,Battery,True,True


In [61]:
spp[["in_gs", "in_lbnl"]].value_counts(dropna=False)


in_gs  in_lbnl
True   True       787
       False      163
False  True        15
dtype: int64

### Compare generation types

In [63]:
spp["resource_type_1"].value_counts()

Wind       335
Solar      272
Battery    108
Gas         65
Coal         8
Hydro        6
Nuclear      4
Unknown      3
Other        1
Name: resource_type_1, dtype: int64

In [65]:
spp["Generation Type"].value_counts()

Wind                              364
Solar                             253
Battery/Storage                   166
Hybrid - Solar/Storage             58
Thermal - Gas                      24
Thermal - CT                       24
Thermal - Coal                      8
                                    6
Hydro                               6
Hybrid - Solar                      5
Thermal - Nuclear                   4
Hybrid - Solar/Battery              4
Thermal - Gas Turbine               4
Thermal                             4
Hybrid - Wind/Storage               4
Thermal - CTG                       3
Thermal - Reciprocating Engine      3
WIND                                2
Thermal - CC                        1
Thermal - NG/CT                     1
Thermal - Diesel/Gas                1
Thermal - Steam Turbine             1
Battery/Storage - WERE              1
Thermal - Combined Cycle            1
Hybrid                              1
Thermal - CT/ST                     1
Name: Genera

Generation type mapping looks tractable

### PJM
Like MISO, good project coverage, but the IA status categories are a mess.

"Active" applies up to IA execution, "Engineering and Procurement" applies from IA execution through COD, and "In Service" applies after that.

In [66]:
raw_pjm.columns


Index(['Queue ID', 'Project Name', 'Interconnecting Entity', 'County', 'State', 'Interconnection Location', 'Transmission Owner', 'Generation Type', 'Capacity (MW)', 'Summer Capacity (MW)', 'Winter Capacity (MW)', 'Queue Date', 'Status', 'Proposed Completion Date', 'Withdrawn Date', 'Withdrawal Comment', 'Actual Completion Date', 'MW In Service', 'Commercial Name', 'Initial Study', 'Feasibility Study', 'Feasibility Study Status', 'System Impact Study', 'System Impact Study Status', 'Facilities Study', 'Facilities Study Status', 'Interim Interconnection Service Agreement', 'Interim/Interconnection Service Agreement Status', 'Wholesale Market Participation Agreement', 'Construction Service Agreement', 'Construction Service Agreement Status', 'Upgrade Construction Service Agreement', 'Upgrade Construction Service Agreement Status', 'Backfeed Date', 'Long-Term Firm Service Start Date', 'Long-Term Firm Service End Date', 'Test Energy Date'], dtype='object')

In [67]:
# "Active" stops at IA execution
raw_pjm[['Interim/Interconnection Service Agreement Status', 'Status']].value_counts(dropna=False).sort_index()


Interim/Interconnection Service Agreement Status  Status                                   
Document Posted                                   Active                                          3
                                                  Deactivated                                    46
                                                  Engineering and Procurement                   240
                                                  In Service                                    839
                                                  Partially in Service - Under Construction      47
                                                  Suspended                                      47
                                                  Under Construction                             38
                                                  Withdrawn                                     206
Interim Study                                     Active                                         20
        

In [68]:
# Combine the raw PJM queue with the LBNL records for "PJM" via join_lbnl (defined earlier
# in the notebook), then summarize the columns of the result.
# NOTE(review): join_lbnl presumably performs an outer join on queue ID and adds the
# in_gs / in_lbnl membership flags -- confirm against its definition.
pjm = join_lbnl(raw_pjm, lbnl, "PJM")
pjm.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 9030 entries, 0 to 9029
Data columns (total 47 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   Queue ID                                          9030 non-null   object 
 1   Project Name                                      8806 non-null   object 
 2   Interconnecting Entity                            0 non-null      object 
 3   County                                            8168 non-null   object 
 4   State                                             8225 non-null   object 
 5   Interconnection Location                          0 non-null      object 
 6   Transmission Owner                                8312 non-null   object 
 7   Generation Type                                   8045 non-null   object 
 8   Capacity (MW)                                     8806 non-null   float64
 9   Summer Capacity (MW

### Compare generation types

In [72]:
# Frequency of each resource_type_1 value among the joined PJM rows
# (value_counts omits missing values by default).
pjm["resource_type_1"].value_counts()

Solar            4211
Gas              1106
Battery           986
Wind              668
Methane           191
Coal              137
Hydro              94
Offshore Wind      74
Nuclear            72
Oil                51
Biomass            51
Unknown            36
Other              29
Diesel             26
Wood                6
Name: resource_type_1, dtype: int64

In [73]:
# Frequency of each "Generation Type" value in the same frame, for comparison with
# resource_type_1 (missing values omitted).
pjm["Generation Type"].value_counts()

Solar                                  3882
Natural Gas                            1099
Storage                                1017
Wind                                    724
Solar; Storage                          552
Methane                                 191
Coal                                    137
Hydro                                    95
Offshore Wind                            79
Nuclear                                  73
Biomass                                  51
Oil                                      51
Other                                    27
Diesel                                   24
Storage; Solar                           12
Wood                                      6
Natural Gas; Other                        6
Wind; Storage                             3
Natural Gas; Other; Storage; Solar        2
Diesel; Methane                           2
Solar; Wind                               2
Storage; Wind                             1
Diesel; Solar                   

Generation type mapping looks tractable

### CAISO
Straightforward!

In [76]:
# List the column names of the raw CAISO queue frame (raw_caiso is created earlier in the notebook).
raw_caiso.columns


Index(['Queue ID', 'Project Name', 'Interconnecting Entity', 'County', 'State', 'Interconnection Location', 'Transmission Owner', 'Generation Type', 'Capacity (MW)', 'Summer Capacity (MW)', 'Winter Capacity (MW)', 'Queue Date', 'Status', 'Proposed Completion Date', 'Withdrawn Date', 'Withdrawal Comment', 'Actual Completion Date', 'Type-1', 'Type-2', 'Type-3', 'Fuel-1', 'Fuel-2', 'Fuel-3', 'MW-1', 'MW-2', 'MW-3', 'Interconnection Request Receive Date', 'Interconnection Agreement Status', 'Study Process', 'Proposed On-line Date (as filed with IR)', 'System Impact Study or Phase I Cluster Study', 'Facilities Study (FAS) or Phase II Cluster Study', 'Optional Study (OS)', 'Full Capacity, Partial or Energy Only (FC/P/EO)', 'Off-Peak Deliverability and Economic Only', 'Feasibility Study or Supplemental Review'], dtype='object')

In [77]:
# Combine the raw CAISO queue with the LBNL records for "CAISO" via join_lbnl (defined
# earlier in the notebook), then summarize the columns of the result.
# NOTE(review): join_lbnl presumably performs an outer join on queue ID and adds the
# in_gs / in_lbnl membership flags -- confirm against its definition.
caiso = join_lbnl(raw_caiso, lbnl, "CAISO")
caiso.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2278 entries, 0 to 2277
Data columns (total 46 columns):
 #   Column                                            Non-Null Count  Dtype         
---  ------                                            --------------  -----         
 0   Queue ID                                          2278 non-null   object        
 1   Project Name                                      2278 non-null   object        
 2   Interconnecting Entity                            0 non-null      object        
 3   County                                            2273 non-null   object        
 4   State                                             2274 non-null   object        
 5   Interconnection Location                          2278 non-null   object        
 6   Transmission Owner                                2278 non-null   object        
 7   Generation Type                                   2278 non-null   object        
 8   Capacity (MW)               

In [78]:
# Spot-check how the study-status columns line up with interconnection_status_lbnl:
# draw a reproducible sample (fixed seed) of 8 ACTIVE projects, ordered by LBNL status.
caiso_active = caiso.query("Status == 'ACTIVE'")
caiso_active.sample(8, random_state=42).sort_values('interconnection_status_lbnl')


Unnamed: 0,Queue ID,Project Name,Interconnecting Entity,County,State,Interconnection Location,Transmission Owner,Generation Type,Capacity (MW),Summer Capacity (MW),Winter Capacity (MW),Queue Date,Status,Proposed Completion Date,Withdrawn Date,Withdrawal Comment,Actual Completion Date,Type-1,Type-2,Type-3,Fuel-1,Fuel-2,Fuel-3,MW-1,MW-2,MW-3,Interconnection Request Receive Date,Interconnection Agreement Status,Study Process,Proposed On-line Date (as filed with IR),System Impact Study or Phase I Cluster Study,Facilities Study (FAS) or Phase II Cluster Study,Optional Study (OS),"Full Capacity, Partial or Energy Only (FC/P/EO)",Off-Peak Deliverability and Economic Only,Feasibility Study or Supplemental Review,queue_id,project_name,queue_date,queue_status,interconnection_status_raw,interconnection_status_lbnl,capacity_mw_resource_1,resource_type_1,in_lbnl,in_gs
30,1048,ESCONDIDO ENERGY CENTER 2,,SAN DIEGO,CA,Escondido Substation 69 kV,SDGE,Storage,50.0,,,2014-04-30 07:00:00,ACTIVE,2025-07-17 07:00:00,NaT,,NaT,Storage,,,Battery,,,50.0,,,2014-04-30,In Progress,C07,2019-11-01 07:00:00,Complete,Complete,,Energy Only,,,1048,ESCONDIDO ENERGY CENTER 2,4/30/2014,active,Facilities,Facility Study,50.0,Battery,True,True
204,1736,HAWKINS SOLAR HYBRID,,KERN,CA,ARCO Substation 230kV,PGAE,Photovoltaic + Storage,250.0,,,2020-04-15 07:00:00,ACTIVE,2024-04-01 07:00:00,NaT,,NaT,Photovoltaic,Storage,,Solar,Battery,,250.0,250.0,,2020-04-01,,C13,2024-04-01 07:00:00,Complete,Complete,,Full Capacity,Off-Peak Deliverability,,1736,HAWKINS SOLAR HYBRID,4/1/2020,active,Facilities,Facility Study,250.0,Solar,True,True
266,1852,BULL RUN,,BUTTE,CA,Table Mountain Substation 115 kV,PGAE,Storage + Photovoltaic,225.0,,,2021-04-15 07:00:00,ACTIVE,2024-12-31 08:00:00,NaT,,NaT,Storage,Photovoltaic,,Battery,Solar,,225.0,232.1,,2021-04-06,,C14,2024-12-31 08:00:00,Complete,,,Full Capacity,,,1852,BULL RUN,4/6/2021,active,Feasibility,Feasibility Study,232.0,Solar,True,True
429,2142,SILVER STAR SOLAR,,NYE,NV,Lathrop Wells Substation 138 kV,VEA,Storage + Photovoltaic,500.0,,,2021-04-15 07:00:00,ACTIVE,2028-03-31 07:00:00,NaT,,NaT,Storage,Photovoltaic,,Battery,Solar,,516.7638,516.7638,,2021-03-31,,C14,2028-03-31 07:00:00,,,,Full Capacity,Off-Peak Deliverability,,2142,SILVER STAR SOLAR,3/31/2021,active,Feasibility,Feasibility Study,517.0,Solar,True,True
253,1832,GOAL LINE RELIABILITY,,SAN DIEGO,CA,Esco Substation 69 kV,SDGE,Storage,50.0,,,2021-04-15 07:00:00,ACTIVE,2023-05-30 07:00:00,NaT,,NaT,Storage,,,Battery,,,154.12,,,2021-02-08,,C14,2023-05-30 07:00:00,,,,Full Capacity,,,1832,GOAL LINE RELIABILITY,2/8/2021,active,Feasibility,Feasibility Study,154.0,Battery,True,True
9,297,ALTA VISTA SUNTOWER GENERATING STATION,,LOS ANGELES,CA,Neenach-Bailey 66kV line,SCE,Steam Turbine + Storage,66.0,,,2008-01-18 08:00:00,ACTIVE,2025-04-15 07:00:00,NaT,,NaT,Steam Turbine,Storage,,Solar,Battery,,66.0,66.0,,2008-01-18,Executed,TC,2009-12-01 08:00:00,Complete,Complete,,Full Capacity,Off-Peak Deliverability,,297,ALTA VISTA SUNTOWER GENERATING STATION,1/18/2008,active,Executed,IA Executed,66.0,Solar,True,True
39,1116,ULTRAPOWER CHINESE STATION BESS,,TUOLUMNE,CA,Melones-Curtis 115kV line,PGAE,Storage,10.0,,,2015-04-30 07:00:00,ACTIVE,2022-05-20 07:00:00,NaT,,NaT,Storage,,,Battery,,,10.0,,,2015-04-30,Executed,C08,2017-03-31 07:00:00,Complete,Complete,,Full Capacity,,,1116,ULTRAPOWER CHINESE STATION BESS,4/30/2015,active,Executed,IA Executed,10.0,Battery,True,True
443,2166,UMBRIEL,,IMPERIAL,CA,North Gila - Imperial Valley 500 kV Line,SDGE,Storage + Photovoltaic,1150.0,,,2021-04-15 07:00:00,ACTIVE,2026-12-31 08:00:00,NaT,,NaT,Storage,Photovoltaic,,Battery,Solar,,1182.38,1182.38,,2021-04-08,,C14,2026-12-31 08:00:00,Complete,,,Full Capacity,Off-Peak Deliverability,,2166,UMBRIEL,4/8/2021,active,System Impact,System Impact Study,1182.0,Solar,True,True


In [79]:
# Count joined rows by membership flags (in_gs / in_lbnl) to see how well the two sources overlap.
caiso[["in_gs", "in_lbnl"]].value_counts(dropna=False)


in_gs  in_lbnl
True   True       2274
       False         4
dtype: int64

In [80]:
# Repeat the overlap count for projects queued before 2023 only.
# Rows with no "Queue Date" are given a 2020-01-01 placeholder, so they always pass the filter.
# NOTE(review): the 2023 cutoff presumably reflects the LBNL compilation ending in 2022 -- confirm.
caiso_queue_date = pd.to_datetime(caiso["Queue Date"]).fillna(pd.to_datetime("2020-01-01"))
caiso_queued_before_2023 = caiso_queue_date < pd.to_datetime("2023-01-01")
caiso.loc[caiso_queued_before_2023, ["in_gs", "in_lbnl"]].value_counts(dropna=False)


in_gs  in_lbnl
True   True       2274
dtype: int64

In [81]:
# Count every combination of "Status" and queue_status (missing values included).
# remarkably easy to match status values
caiso[["Status", "queue_status"]].value_counts(dropna=False).sort_index()


Status     queue_status
ACTIVE     active           457
           operational        1
           withdrawn          2
           NaN                4
COMPLETED  active             3
           operational      198
WITHDRAWN  active            35
           withdrawn       1578
dtype: int64

### Compare generation types

In [82]:
# Frequency of each resource_type_1 value among the joined CAISO rows (missing values omitted).
caiso["resource_type_1"].value_counts()

Solar             1255
Battery            454
Gas                249
Wind               201
Geothermal          36
Biofuel             21
Offshore Wind       16
Hydro               15
Other                8
Pumped Storage       6
Unknown              5
Nuclear              3
Flywheel             3
Gravity Rail         1
Other Storage        1
Name: resource_type_1, dtype: int64

In [83]:
# Frequency of each "Generation Type" value in the same frame, for comparison with
# resource_type_1 (missing values omitted).
caiso["Generation Type"].value_counts()

Photovoltaic                                    707
Storage                                         475
Photovoltaic + Storage                          231
Wind Turbine                                    194
Storage + Photovoltaic                          182
Steam Turbine                                   161
Combined Cycle                                   94
Gas Turbine                                      63
Combustion Turbine                               57
Reciprocating Engine                             15
Wind Turbine + Storage                           14
Hydro                                            12
Storage + Wind Turbine                            9
Other                                             8
Photovoltaic + Combustion Turbine                 4
Steam Turbine + Storage                           4
Combustion Turbine + Storage                      4
Storage + Wind Turbine + Photovoltaic             3
Storage + Gas Turbine                             3
Storage + Ph

A bit messier than others but still looks tractable

### ISO-NE

* what GridStatus calls "Queue ID" was actually "Queue Position" in the original data. It is unique amongst active projects but not for withdrawn projects. There is no natural key for withdrawn projects.

In [84]:
# List the column names of the raw ISO-NE queue frame (raw_isone is created earlier in the notebook).
raw_isone.columns


Index(['Queue ID', 'Project Name', 'Interconnecting Entity', 'County', 'State', 'Interconnection Location', 'Transmission Owner', 'Generation Type', 'Capacity (MW)', 'Summer Capacity (MW)', 'Winter Capacity (MW)', 'Queue Date', 'Status', 'Proposed Completion Date', 'Withdrawn Date', 'Withdrawal Comment', 'Actual Completion Date', 'Updated', 'Unit', 'Op Date', 'Sync Date', 'Serv', 'I39', 'Dev', 'Zone', 'FS', 'SIS', 'OS', 'FAC', 'IA', 'Project Status'], dtype='object')

In [85]:
# Number of rows whose Queue ID repeats an earlier row's
# (duplicated() leaves the first occurrence of each ID unmarked).
raw_isone.duplicated(subset=['Queue ID']).sum()


150

In [86]:
# Status breakdown of every row that shares its Queue ID with another row
# (keep=False marks all members of each duplicate group, not just the repeats).
raw_isone.loc[raw_isone.duplicated(subset=['Queue ID'], keep=False), 'Status'].value_counts(dropna=False)


Withdrawn    159
Completed     83
Name: Status, dtype: int64

In [87]:
# Overall Status breakdown of the raw ISO-NE queue (missing values included), for reference
# against the duplicate-only breakdown.
raw_isone['Status'].value_counts(dropna=False)


Withdrawn    804
Active       408
Completed    321
Name: Status, dtype: int64

In [88]:
# Candidate key 1: (Queue ID, Status). Count rows that repeat an earlier row on this pair.
# This trial key gets its own variable name: compound_key is defined in a later cell as
# (Queue ID, Project Name) and is used for the LBNL join, so re-running this cell must
# not be able to overwrite it.
queue_id_status_key = ['Queue ID', 'Status']
raw_isone.duplicated(subset=queue_id_status_key).sum()


133

In [89]:
# There is no natural key for this data, but (Queue ID, Project Name) looks like the best
# tradeoff between key complexity and uniqueness.
# compound_key is reused by later cells (duplicate inspection and the manual LBNL join),
# so this definition must be the one in effect when they run.
compound_key = ['Queue ID', 'Project Name']
raw_isone.duplicated(subset=compound_key).sum()


27

In [90]:
# Check that Queue ID alone identifies a row among projects whose Status is "Active".
# Queue ID is unique within the Active projects. Not sure how change over time will impact the join to LBNL.
raw_isone.query('Status == "Active"')['Queue ID'].is_unique


True

In [91]:
# Show the last 10 rows (ordered by compound_key) that still collide on compound_key.
# The duplicates (none of them Active) vary in all kinds of values. Sorting by 'Updated'
# and keeping the latest row may be the best approach, although it does not always
# differentiate them.
shares_compound_key = raw_isone.duplicated(subset=compound_key, keep=False)
raw_isone.loc[shares_compound_key].sort_values(by=compound_key).tail(10)


Unnamed: 0,Queue ID,Project Name,Interconnecting Entity,County,State,Interconnection Location,Transmission Owner,Generation Type,Capacity (MW),Summer Capacity (MW),Winter Capacity (MW),Queue Date,Status,Proposed Completion Date,Withdrawn Date,Withdrawal Comment,Actual Completion Date,Updated,Unit,Op Date,Sync Date,Serv,I39,Dev,Zone,FS,SIS,OS,FAC,IA,Project Status
1054,350,Wind,,Somerset,ME,CMP Wyman substation,ISO-NE,WND,96.9,96.9,96.9,11/10/2010,Withdrawn,10/1/2016,10/5/2015,,,10/5/2015,WT,12/31/2016,10/1/2016,NR,N,PD,ME,,Y,,,,
1055,350,Wind,,Washington,ME,BHE Washington County 115 kV substation,,WND,92.25,92.25,92.25,11/10/2010,Withdrawn,10/1/2015,11/15/2013,,,10/1/2015,WT,12/31/2015,10/1/2015,NR,N,PD,BHE,,N,,,,
1050,353,Wind,,Plymouth,MA,NSTAR 115 kV Valley substation,ISO-NE,WND,9.9,9.9,9.9,12/21/2010,Withdrawn,11/30/2013,8/8/2012,,,8/8/2012,WT,12/31/2013,11/30/2013,CNR,N,BL,SEMA,,Y,,,,
1051,353,Wind,,Barnstable,MA,NSTAR 115 kV Valley substation,,WND,14.9,14.9,14.9,12/21/2010,Withdrawn,10/31/2012,6/7/2011,,,6/9/2011,WT,11/30/2012,10/31/2012,CNR,N,,SEMA,,N,,,,
1042,358,Hydro,,Penobscot,ME,BHE Stillwater 12.5 kV substation,,WAT,2.274,2.274,2.274,3/4/2011,Withdrawn,8/1/2012,10/5/2011,,,10/6/2011,HD,8/15/2012,8/1/2012,CNR,N,,BHE,,N,,,,
1043,358,Hydro,,Penobscot,ME,BHE Orono 12.5 kV substation,,WAT,3.858,3.858,3.858,3/4/2011,Withdrawn,8/1/2012,4/11/2011,,,4/12/2011,HD,8/15/2012,8/1/2012,CNR,N,,BHE,,N,,,,
1026,368,Wind,,Hillsborough,NH,PSNH 7.2 kV to Milford Substation,,WND,10.0,10.0,10.0,5/27/2011,Withdrawn,9/1/2012,8/23/2011,,,6/10/2014,WT,9/30/2012,9/1/2012,CNR,N,,NH,,N,,,,
1027,368,Wind,,Hillsborough,NH,PSNH 34.5 kV to Monadnock substation,ISO-NE,WND,16.1,16.1,16.1,5/27/2011,Withdrawn,10/1/2014,7/25/2013,,,7/25/2013,WT,10/31/2014,10/1/2014,CNR,Y,CD,NH,,Y,,,,
1010,380,1000 MW HVDC Line,,,MA,NSTAR Carver 345 kV,,,,,,11/10/2011,Withdrawn,1/1/2020,2/16/2015,,,2/17/2015,,6/1/2020,1/1/2020,,N,,SEMA,,N,,,,
1011,380,1000 MW HVDC Line,,,MA,NSTAR Barnstable 345 kV,,,,,,11/10/2011,Withdrawn,1/1/2020,2/16/2015,,,2/17/2015,,6/1/2020,1/1/2020,,N,,SEMA,,N,,,,


In [92]:
# Parse 'Updated' into datetimes so that the later sort_values('Updated') orders rows
# chronologically rather than as text. This overwrites the column on raw_isone in place.
raw_isone['Updated'] = pd.to_datetime(raw_isone['Updated'])


In [93]:
# join manually rather than refactoring the func to take compound key
lbnl_iso = lbnl.loc[lbnl["entity"].eq("ISO-NE"), LBNL_JOIN_COLS].astype({'queue_id': int})
isone = raw_isone.sort_values('Updated').drop_duplicates(subset=compound_key, keep='last')
isone = isone.merge(lbnl_iso, how="outer", left_on=compound_key, right_on=[c.lower().replace(' ', '_') for c in compound_key])
isone["in_lbnl"] = ~isone["queue_id"].isna()
isone["in_gs"] = ~isone["Queue ID"].isna()
del lbnl_iso
isone.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1572 entries, 0 to 1571
Data columns (total 41 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Queue ID                     1526 non-null   float64       
 1   Project Name                 1526 non-null   object        
 2   Interconnecting Entity       0 non-null      object        
 3   County                       1423 non-null   object        
 4   State                        1499 non-null   object        
 5   Interconnection Location     1503 non-null   object        
 6   Transmission Owner           590 non-null    object        
 7   Generation Type              1272 non-null   object        
 8   Capacity (MW)                1254 non-null   float64       
 9   Summer Capacity (MW)         1271 non-null   float64       
 10  Winter Capacity (MW)         1268 non-null   float64       
 11  Queue Date                   1526 non-null 

In [94]:
# Peek at the first two joined rows to sanity-check the merged columns.
isone.head(2)


Unnamed: 0,Queue ID,Project Name,Interconnecting Entity,County,State,Interconnection Location,Transmission Owner,Generation Type,Capacity (MW),Summer Capacity (MW),Winter Capacity (MW),Queue Date,Status,Proposed Completion Date,Withdrawn Date,Withdrawal Comment,Actual Completion Date,Updated,Unit,Op Date,Sync Date,Serv,I39,Dev,Zone,FS,SIS,OS,FAC,IA,Project Status,queue_id,project_name,queue_date,queue_status,interconnection_status_raw,interconnection_status_lbnl,capacity_mw_resource_1,resource_type_1,in_lbnl,in_gs
0,188.0,Gas Turbine Capacity Increase( #178),,Plymouth,MA,115 kV F19 and/or E20 lines,,DFO NG,,69.0,0.0,12/7/2006,Withdrawn,4/1/2010,2/1/2007,,,2007-02-01,CC,6/1/2010,4/1/2010,MIS,N,,SEMA,,N,,,,,188.0,Gas Turbine Capacity Increase( #178),07dec2006,withdrawn,Unknown/Not Started,In Progress (unknown study),,Oil,True,True
1,141.0,Gas Turbine,,Middlesex,MA,115 kV O 167 line between Everett and Mystic S...,ISO-NE,DFO NG,,200.0,200.0,12/14/2005,Withdrawn,8/31/2009,2/7/2007,,,2007-02-07,GT,9/30/2009,8/31/2009,MIS,N,,BOST,,N,,,,,141.0,Gas Turbine,14dec2005,withdrawn,Unknown/Not Started,In Progress (unknown study),,Oil,True,True


In [95]:
# Count joined rows by membership flags (in_gs / in_lbnl) to see how well the two sources overlap.
isone[["in_gs", "in_lbnl"]].value_counts(dropna=False)


in_gs  in_lbnl
True   True       1209
       False       317
False  True         46
dtype: int64

In [96]:
# small improvement from date filter
isone.loc[
    pd.to_datetime(isone["Queue Date"])
    .fillna(pd.to_datetime("2020-01-01"))
    .lt(pd.to_datetime("2023-01-01")),
    ["in_gs", "in_lbnl"],
].value_counts(dropna=False)


in_gs  in_lbnl
True   True       1209
       False       278
False  True         46
dtype: int64

In [97]:
# Count every combination of "Status" and queue_status (missing values included).
# status values are decently aligned
isone[["Status", "queue_status"]].value_counts(dropna=False).sort_index()


Status     queue_status
Active     active          303
           NaN             105
Completed  active            2
           operational     270
           withdrawn         1
           NaN              48
Withdrawn  active           11
           operational      29
           withdrawn       593
           NaN             164
NaN        active           34
           operational       6
           withdrawn         6
dtype: int64

In [98]:
# Frequency of each interconnection_status_raw value among the joined ISO-NE rows (missing values included).
isone['interconnection_status_raw'].value_counts(dropna=False)


Unknown/Not Started    778
NaN                    317
operational            305
System Impact Study     68
IA Executed             49
Feasibility Study       31
Not Started             18
Facility Study           6
Name: interconnection_status_raw, dtype: int64

In [99]:
# Pair each standardized interconnection_status_lbnl value with the raw value it accompanies,
# to see how the standardized categories relate to the raw ones.
isone[['interconnection_status_lbnl', 'interconnection_status_raw']].value_counts(dropna=False).sort_index()


interconnection_status_lbnl  interconnection_status_raw
Facility Study               Facility Study                  6
Feasibility Study            Feasibility Study              31
IA Executed                  IA Executed                    49
In Progress (unknown study)  Unknown/Not Started           778
Not Started                  Not Started                    18
Operational                  operational                   305
System Impact Study          System Impact Study            68
NaN                          NaN                           317
dtype: int64

In [100]:
# Study/agreement status columns from the ISO-NE queue.
status_cols_isone = ["FS", "SIS", "OS", "FAC", "IA", "Project Status"]

# Count each combination of values across those columns, treating the literal string
# 'None' as missing, and show only the combinations that occur more than twice.
isone_status_combo_counts = (
    isone[status_cols_isone]
    .replace(['None'], np.nan)
    .value_counts(dropna=False)
)
with pd.option_context('display.max_rows', None):
    display(isone_status_combo_counts.loc[lambda counts: counts > 2].sort_index())


FS   SIS  OS   FAC  IA   Project Status                        
NaN  N    NaN  NaN  NaN  CSIS                                        3
                         Third Maine Resource Integration Study     11
                         NaN                                       936
     Y    NaN  NaN  NaN  NaN                                       576
     NaN  NaN  NaN  NaN  NaN                                        46
dtype: int64

In [101]:
# GridStatus doesn't parse most of the ISO-NE study-status values, so these columns are
# almost entirely null (in the rows shown, only SIS is populated). In their defense,
# ISO-NE encodes the statuses as alt text behind icons, which makes them awkward to scrape.
isone[status_cols_isone]


Unnamed: 0,FS,SIS,OS,FAC,IA,Project Status
0,,N,,,,
1,,N,,,,
2,,N,,,,
3,,N,,,,
4,,N,,,,
...,...,...,...,...,...,...
1567,,,,,,
1568,,,,,,
1569,,,,,,
1570,,,,,,


In [102]:
# Frequency of each "Generation Type" code combination in the raw ISO-NE queue (missing values included).
raw_isone['Generation Type'].value_counts(dropna=False)


SUN                336
None               260
WND                232
BAT                144
NG                 132
SUN BAT            118
DFO NG             105
WAT                 66
WDS                 40
DFO                 14
LFG                 12
BIT                 11
NUC                 11
WND BAT             10
FC                   5
WAT BAT              5
KER NG               4
DFO KER NG           4
NG WO                3
NG SUN BAT           2
KER                  2
JF KER NG            2
BLQ WDS              2
MSW                  2
NG RFO               1
DFO KER              1
JF KER               1
JF                   1
BLQ DFO KER WDS      1
NG WDS               1
BIT RFO              1
DFO SUB              1
DFO WDS              1
NG OTH               1
JF NG                1
Name: Generation Type, dtype: int64

### Compare generation types

In [103]:
# Frequency of each resource_type_1 value among the joined ISO-NE rows (missing values omitted).
isone["resource_type_1"].value_counts()

Solar            434
Wind             178
Oil              135
Gas              131
Battery          117
Unknown           82
Hydro             65
Biomass           38
Offshore Wind     36
Landfill          14
Coal              12
Nuclear           11
Fuel Cell          2
Name: resource_type_1, dtype: int64

In [104]:
# Frequency of each "Generation Type" code combination after de-duplication and the join,
# for comparison with resource_type_1 (missing values omitted).
isone["Generation Type"].value_counts()

SUN                336
WND                232
BAT                144
NG                 132
SUN BAT            118
DFO NG             105
WAT                 66
WDS                 39
DFO                 14
LFG                 12
BIT                 11
NUC                 11
WND BAT             10
FC                   5
WAT BAT              5
KER NG               4
DFO KER NG           4
NG WO                3
BLQ WDS              2
NG SUN BAT           2
MSW                  2
JF KER NG            2
KER                  2
BLQ DFO KER WDS      1
NG OTH               1
DFO WDS              1
DFO SUB              1
BIT RFO              1
NG WDS               1
JF                   1
DFO KER              1
JF KER               1
NG RFO               1
JF NG                1
Name: Generation Type, dtype: int64

Generation type mapping looks tractable. We will have to look up the codes, which are listed in the Excel sheets on [this ISONE site](https://www.iso-ne.com/isoexpress/web/reports/operations/-/tree/seasonal-claimed-capability).

In [107]:
# Number of LBNL queue records per region, including the non-ISO regions.
lbnl["region"].value_counts()

PJM                    7738
West (non-ISO)         6610
MISO                   4071
Southeast (non-ISO)    3070
CAISO                  2274
ERCOT                  1952
NYISO                  1260
ISO-NE                 1255
SPP                     802
Name: region, dtype: int64

## Some fun bonus capacity analysis

In [138]:
# Share of active LBNL capacity (capacity_mw_resource_1) inside vs. outside ISO regions.
active_lbnl = lbnl.query("queue_status == 'active'")
# Build the grouping key from active_lbnl itself rather than from the unfiltered lbnl
# frame, so the grouping does not rely on implicit index alignment between two frames.
# Rows with a missing region count as ISO (na=False).
is_non_iso = active_lbnl["region"].str.contains("non-ISO", na=False)
active_lbnl.groupby(is_non_iso)["capacity_mw_resource_1"].sum() / active_lbnl["capacity_mw_resource_1"].sum()

region
False    0.694277
True     0.305723
Name: capacity_mw_resource_1, dtype: float64

#### What are the top non-ISO utilities?

In [139]:
# Each utility's share of active non-ISO capacity, sorted ascending (largest shares last).
# NOTE(review): groupby drops rows with a missing utility by default, while the denominator
# sums every non-ISO row, so the shares need not add up to 1 -- confirm whether rows with
# no utility should get their own bucket.
in_non_iso_region = active_lbnl["region"].str.contains("non-ISO", na=False)
non_iso_lbnl = active_lbnl[in_non_iso_region]
non_iso_total_mw = non_iso_lbnl["capacity_mw_resource_1"].sum()
non_iso_mw_by_utility = non_iso_lbnl.groupby("utility")["capacity_mw_resource_1"].sum()
non_iso_lbnl_mw = non_iso_mw_by_utility.sort_values() / non_iso_total_mw


In [140]:
# Cumulative capacity share held by the N largest utilities, for N = 5, 10, 15, ...
# (N stays strictly below the number of utilities). non_iso_lbnl_mw is sorted ascending,
# so tail(N) selects the N largest shares.
top_mw_non_iso = [
    (top_n, non_iso_lbnl_mw.tail(top_n).sum())
    for top_n in range(5, len(non_iso_lbnl_mw), 5)
]

In [141]:
# Tabulate the cumulative shares. Despite the "pct_" prefix, the values are fractions
# between 0 and 1, not percentages.
pd.DataFrame(top_mw_non_iso, columns=("top_n_utilities", "pct_total_non_iso_capacity"))

Unnamed: 0,top_n_utilities,pct_total_non_iso_capacity
0,5,0.414365
1,10,0.585736
2,15,0.697677
3,20,0.767261
4,25,0.806972
5,30,0.83254
6,35,0.848873
7,40,0.855334
8,45,0.855334
9,50,0.855334


In [132]:
# The five utilities with the largest capacity shares (the series is sorted ascending,
# so the largest share is the last row).
non_iso_lbnl_mw.tail(5)

utility
PSCo          0.047498
NVE           0.048835
SOCO          0.083345
PacifiCorp    0.112419
TVA           0.138038
Name: capacity_mw_resource_1, dtype: float64