# GridStatus <-> LBNL Status Mapping

GridStatus reads raw ISO queues. The LBNL queue statuses are standardized versions of ISO-specific categories. This notebook attempts to reverse engineer the mappings between the two.

In [None]:
import pandas as pd
from pathlib import Path
import numpy as np

from dbcp.extract.lbnl_iso_queue import extract


## Get the data
### LBNL-Compiled Queues

In [None]:
# partial implementation of transform. I don't want to include deduplication.
def partial_transform(active_projects: pd.DataFrame) -> pd.DataFrame:
    """Apply the non-deduplicating steps of the LBNL ISO queue transform.

    Renames the ``state`` / ``county`` columns, collapses variant spellings in
    ``interconnection_status_lbnl`` onto one canonical label each, and removes
    the ``date_withdrawn`` and ``date_operational`` columns.

    Args:
        active_projects: LBNL queue table. It must contain the columns
            ``interconnection_status_lbnl``, ``date_withdrawn`` and
            ``date_operational``; otherwise pandas raises ``KeyError``.

    Returns:
        A new dataframe with the changes applied. The input frame is left
        untouched because ``rename`` returns a copy.
    """
    status_col = "interconnection_status_lbnl"
    # Variant spelling / near-synonym -> canonical status label.
    status_synonyms = {
        "Feasability Study": "Feasibility Study",
        "Feasibility": "Feasibility Study",
        "Facilities Study": "Facility Study",
        "IA in Progress": "In Progress (unknown study)",
        "Unknown": "In Progress (unknown study)",
        "Withdrawn, Feasibility Study": "Withdrawn",
    }
    # rename() hands back a fresh frame, so everything below edits the copy.
    projects = active_projects.rename(
        columns={"state": "raw_state_name", "county": "raw_county_name"}
    )
    harmonized = projects.loc[:, status_col].replace(status_synonyms)
    projects.loc[:, status_col] = harmonized
    # Dropped as irrelevant. NOTE(review): the original comment says these are
    # structurally all-NaN because of an 'active' filter, but no such filter is
    # applied inside this function -- confirm that still holds for the input.
    return projects.drop(columns=["date_withdrawn", "date_operational"])


source_path = Path("/app/data/raw/queues_2022_clean_data.xlsx")
raw_lbnl = extract(source_path)["lbnl_iso_queue"]
lbnl = partial_transform(raw_lbnl)


In [None]:
lbnl.shape, lbnl.columns


In [None]:
lbnl.head(2)


### GridStatus Queues

In [None]:
import dbcp

# These are the revision numbers of the oldest archives we have
iso_queue_versions: dict[str, str] = {
    "miso": "1681775160487863",
    "caiso": "1681775162586588",
    "pjm": "1681775160979859",
    "ercot": "1681775161342766",
    "spp": "1681775162935809",
    "nyiso": "1681775159356063",
    "isone": "1681775162111351",
}

gs_dfs = dbcp.extract.gridstatus_isoqueues.extract(iso_queue_versions)

In [None]:
{k: v.shape for k, v in gs_dfs.items()}

In [None]:
for iso, df in gs_dfs.items():
    print(iso)
    print(df["County"].isna().value_counts(normalize=True))
    print(df["State"].isna().value_counts(normalize=True))
    print()

In [None]:
# `df` is not assigned in this cell: it is the loop variable left over from the
# `for iso, df in gs_dfs.items()` cell above, so this shows the schema of
# whichever GridStatus frame that loop visited last.
df.info()

In [None]:
# # These are manually downloaded from our archives. I went back as far as I could,
# # which is April 17 2023.
# # There is a 3.5 month gap between the LBNL queues and our oldest GridStatus archive.
# root_path = Path("/app/data/raw/gridstatus/interconnection_queues")
# assert root_path.exists()
# # filenames are like "interconnection_queues_caiso_4-17-2023.parquet"
# gs_dfs = {
#     path.name.split("_")[2]: pd.read_parquet(path)
#     for path in root_path.glob("*.parquet")
# }
# {k: v.shape for k, v in gs_dfs.items()}


In [None]:
# Total GridStatus rows across all ISOs minus the total LBNL rows.
# Observation from the original run: wayyy fewer items in GridStatus than LBNL.
sum(len(frame) for frame in gs_dfs.values()) - len(lbnl)


In [None]:
lbnl.region.value_counts()

In [None]:
lbnl_iso = lbnl[~lbnl["region"].str.contains("non-ISO", na=False)].copy()

In [None]:
# Same row-count gap, but against LBNL rows whose region is not "non-ISO".
sum(len(frame) for frame in gs_dfs.values()) - len(lbnl_iso)

GridStatus has more ISO projects than LBNL.

In [None]:
# globals().update(gs_dfs)  # this works fine but the static type checker/linter can't introspect it.
raw_ercot = gs_dfs["ercot"]
raw_nyiso = gs_dfs["nyiso"]
raw_isone = gs_dfs["isone"]
raw_miso = gs_dfs["miso"]
raw_pjm = gs_dfs["pjm"]
raw_spp = gs_dfs["spp"]
raw_caiso = gs_dfs["caiso"]


In [None]:
# LBNL columns carried into every GridStatus <-> LBNL comparison join.
LBNL_JOIN_COLS = [
    "queue_id",  # join key
    "project_name",  # for manually checking the joins
    "queue_date",  # for manually checking the joins
    "queue_status",  # for manually checking the joins
    "interconnection_status_raw",  # see what LBNL interpreted
    "interconnection_status_lbnl",  # final mapping value
    "capacity_mw_resource_1",
    "resource_type_1",
]


def join_lbnl(
    iso_df: pd.DataFrame, lbnl: pd.DataFrame, iso_name: str, iso_id_col: str = "Queue ID"
) -> pd.DataFrame:
    """Outer-join one ISO's GridStatus queue to the matching LBNL rows.

    Args:
        iso_df: GridStatus queue for a single ISO.
        lbnl: LBNL queue table; rows are selected where ``entity == iso_name``
            and only ``LBNL_JOIN_COLS`` are kept.
        iso_name: Value of the LBNL ``entity`` column to select.
        iso_id_col: Column of ``iso_df`` holding the queue ID to join on.

    Returns:
        The outer merge of both tables on the queue ID, plus two boolean
        columns: ``in_lbnl`` (row exists in the LBNL subset) and ``in_gs``
        (row exists in ``iso_df``).

    Raises:
        AssertionError: if either ID column contains duplicates, or if no
            LBNL rows match ``iso_name``.
    """
    assert iso_df[iso_id_col].is_unique, "ID column not unique"
    lbnl_iso = lbnl.loc[lbnl["entity"].eq(iso_name), LBNL_JOIN_COLS]
    assert not lbnl_iso.empty, f"Empty LBNL queue for {iso_name}"
    assert lbnl_iso["queue_id"].is_unique, "LBNL queue ID not unique"
    # Let the merge itself report where each row came from. Inferring this from
    # whether the key is null would mislabel a row whose own queue ID is
    # missing as absent from its source table.
    # NOTE: pandas matches a null key on one side against a null key on the
    # other, so such a pair is reported as present in both.
    source_col = "_join_lbnl_source"
    out = iso_df.merge(
        lbnl_iso,
        how="outer",
        left_on=iso_id_col,
        right_on="queue_id",
        indicator=source_col,
    )
    out["in_lbnl"] = out[source_col].isin(["both", "right_only"])
    out["in_gs"] = out[source_col].isin(["both", "left_only"])
    # The indicator is only scaffolding; keep the original output columns.
    return out.drop(columns=source_col)


## Comparisons
### ERCOT
* ERCOT's queue document has separate excel sheets for large and small (< 20MW) projects, and does not list withdrawn projects. GridStatus only includes large projects, whereas LBNL data includes the other categories. I'm not sure where LBNL gets the withdrawn projects from.
  * We might be able to maintain a withdrawn list by 1) getting all the current withdrawn projects from the LBNL data, then 2) loading all the archived GridStatus ERCOT queues and checking for any projects that enter then exit the list. But I don't think we actually need a withdrawn list?
* GridStatus defines "status" as "IA Signed".isna(). LBNL calls the entire "large active" dataset "active".

In [None]:
raw_ercot.columns


In [None]:
ercot = join_lbnl(raw_ercot, lbnl, "ERCOT")
ercot.info()


In [None]:
ercot[["in_gs", "in_lbnl"]].value_counts(dropna=False)


In [None]:
ercot["resource_type_1"].value_counts()

In [None]:
ercot["Fuel"].value_counts()

In [None]:
ercot["Technology"].value_counts()

Great! Between GS.Fuel and GS.Technology it should be pretty easy to map to the values used in LBNL.

### NYISO

* LBNL appears to have used the column labeled `S`, which is an ordinal status number corresponding to NYISO's LFIP. Key:
  * 0=Withdrawn
  * 1=Scoping Meeting Pending
  * 2=FES Pending
  * 3=FES in Progress
  * 4=SRIS/SIS Pending
  * 5=SRIS/SIS in Progress
  * 6=SRIS/SIS Approved
  * 7=FS Pending
  * 8=Rejected Cost Allocation/Next FS Pending
  * 9=FS in Progress
  * 10=Accepted Cost Allocation/IA in Progress
  * 11=IA Completed
  * 12=Under Construction
  * 13=In Service for Test
  * 14=In Service Commercial
  * 15=Partial In-Service
* Availability of Studies  Key: None=Not Available, FES=Feasibility Study Available, SRIS=System Reliability Impact Study Available, FS=Facilities Study and/or ATRA Available
* CY Completion/SGIA Tender refers to the Attachment X milestone used to apply the 4-year COD limitation.


In [None]:
raw_nyiso.columns


In [None]:
raw_nyiso[raw_nyiso["Queue ID"].duplicated(keep=False)]

In [None]:
# I don't know which is correct (if either), but I'll guess that the later one is. So keep='last'
raw_nyiso.loc[raw_nyiso.duplicated(subset='Queue ID', keep=False), :].sort_values(by='Queue ID')


In [None]:
nyiso = join_lbnl(raw_nyiso.drop_duplicates(subset='Queue ID', keep='last'), lbnl, "NYISO")
nyiso.info()


In [None]:
nyiso.head()


In [None]:
nyiso[["in_gs", "in_lbnl"]].value_counts(dropna=False)


In [None]:
# marginal improvement from date filter
# Keep only projects queued before 2023. Rows without a queue date get a 2020
# placeholder so they pass the filter instead of being dropped. The column is
# coerced with pd.to_datetime, as the CAISO and ISO-NE versions of this filter
# do, so the comparison does not depend on the column already being datetime.
nyiso.loc[
    pd.to_datetime(nyiso["Queue Date"])
    .fillna(pd.to_datetime("2020-01-01"))
    .lt(pd.to_datetime("2023-01-01")),
    ["in_gs", "in_lbnl"],
].value_counts(dropna=False)


### Compare fuel types

In [None]:
nyiso["resource_type_1"].value_counts()

In [None]:
nyiso["Generation Type"].value_counts()

In [None]:
nyiso_transmission = nyiso[nyiso["Generation Type"].str.contains("Transmission",na=False)]
nyiso_transmission.queue_id.isna().value_counts()

Looks like LBNL removes transmission projects.

GS has more generation types, but it shouldn't be too hard to consolidate them into LBNL's types.

### MISO
Very good project coverage, but the IA status categories are a mess.

In [None]:
raw_miso.columns


In [None]:
# I don't know which is correct (if either), but I'll guess that the later one is. So keep='last'
# Nearly whole-row duplicate, except for "studyPhase"
raw_miso.loc[raw_miso.duplicated(subset='Queue ID', keep=False), :].sort_values(by='Queue ID')


In [None]:
# LBNL-side duplicate check: MISO rows whose queue_id occurs more than once,
# sorted so that repeated IDs sit next to each other.
# NOTE(review): the comments originally on this cell were identical to those on
# the GridStatus MISO duplicate cell ("I don't know which is correct (if
# either), but I'll guess that the later one is. So keep='last'"; "Nearly
# whole-row duplicate, except for "studyPhase"") -- confirm they also describe
# the LBNL rows.
(
    lbnl.query('entity == "MISO"')
    .loc[lambda miso_rows: miso_rows.duplicated(subset='queue_id', keep=False), :]
    .sort_values(by='queue_id')
)


In [None]:
# De-duplicate both sides on their queue ID (keeping the last occurrence)
# before joining, because join_lbnl asserts that both ID columns are unique.
miso = join_lbnl(
    iso_df=raw_miso.drop_duplicates(subset='Queue ID', keep='last'),
    lbnl=lbnl.query('entity == "MISO"').drop_duplicates(subset='queue_id', keep='last'),
    iso_name="MISO",
)
miso.info()


In [None]:
miso[["in_gs", "in_lbnl"]].value_counts(dropna=False)


### Compare generation types

In [None]:
miso["resource_type_1"].value_counts()

In [None]:
miso["Generation Type"].value_counts()

Generation type mapping looks pretty straight forward.

### SPP

* neither LBNL nor GridStatus have withdrawn projects
* GridStatus destroys the detailed status information during their processing, so we lose the ability to distinguish between "IA pending" and "System Integration Study". But I don't think that is a problem because both are included in the "actionable" criteria in Synapse's model.

In [None]:
raw_spp.columns


In [None]:
spp = join_lbnl(raw_spp, lbnl, "SPP")
spp.info()


In [None]:
spp.sample(4)


In [None]:
spp[["in_gs", "in_lbnl"]].value_counts(dropna=False)


### Compare generation types

In [None]:
spp["resource_type_1"].value_counts()

In [None]:
spp["Generation Type"].value_counts()

Generation type mapping looks tractable

### PJM
Like MISO, good project coverage, but the IA status categories are a mess.

"Active" applied up to IA execution. Then "Engineering and Procurement" applied to IA execution through COD. Then "In Service".

In [None]:
raw_pjm.columns


In [None]:
# "Active" stops at IA execution
raw_pjm[['Interim/Interconnection Service Agreement Status', 'Status']].value_counts(dropna=False).sort_index()


In [None]:
pjm = join_lbnl(raw_pjm, lbnl, "PJM")
pjm.info()


### Compare generation types

In [None]:
pjm["resource_type_1"].value_counts()

In [None]:
pjm["Generation Type"].value_counts()

Generation type mapping looks tractable

### CAISO
Straightforward!

In [None]:
raw_caiso.columns


In [None]:
caiso = join_lbnl(raw_caiso, lbnl, "CAISO")
caiso.info()


In [None]:
caiso.query("Status == 'ACTIVE'").sample(8, random_state=42).sort_values('interconnection_status_lbnl')


In [None]:
caiso[["in_gs", "in_lbnl"]].value_counts(dropna=False)


In [None]:
caiso.loc[
    pd.to_datetime(caiso["Queue Date"])
    .fillna(pd.to_datetime("2020-01-01"))
    .lt(pd.to_datetime("2023-01-01")),
    ["in_gs", "in_lbnl"],
].value_counts(dropna=False)


In [None]:
# remarkably easy to match status values
caiso[["Status", "queue_status"]].value_counts(dropna=False).sort_index()


### Compare generation types

In [None]:
caiso["resource_type_1"].value_counts()

In [None]:
caiso["Generation Type"].value_counts()

A bit messier than others but still looks tractable

### ISO-NE

* what GridStatus calls "Queue ID" was actually "Queue Position" in the original data. It is unique amongst active projects but not for withdrawn projects. There is no natural key for withdrawn projects.

In [None]:
raw_isone.columns


In [None]:
raw_isone.duplicated(subset=['Queue ID']).sum()


In [None]:
raw_isone.loc[raw_isone.duplicated(subset=['Queue ID'], keep=False), 'Status'].value_counts(dropna=False)


In [None]:
raw_isone['Status'].value_counts(dropna=False)


In [None]:
compound_key = ['Queue ID', 'Status']
raw_isone.duplicated(subset=compound_key).sum()


In [None]:
# there is no natural key for this data. But this looks like the best tradeoff between key complexity and uniqueness.
compound_key = ['Queue ID', 'Project Name']
raw_isone.duplicated(subset=compound_key).sum()


In [None]:
# Queue ID is unique within the Active projects. Not sure how change over time will impact the join to LBNL.
raw_isone.query('Status == "Active"')['Queue ID'].is_unique


In [None]:
# duplicates (none Active) have all kinds of variation in values. Maybe best approach is to sort by date updated and take the latest one. It doesn't always differentiate them though.
raw_isone.loc[raw_isone.duplicated(subset=compound_key, keep=False),:].sort_values(by=compound_key).tail(10)


In [None]:
raw_isone['Updated'] = pd.to_datetime(raw_isone['Updated'])


In [None]:
# join manually rather than refactoring the func to take compound key
# The temporary LBNL subset gets its own name: calling it `lbnl_iso` would
# overwrite, and then delete, the `lbnl_iso` frame built earlier from the
# region filter.
lbnl_isone = lbnl.loc[lbnl["entity"].eq("ISO-NE"), LBNL_JOIN_COLS].astype({'queue_id': int})
# Keep the most recently updated row for each compound key.
isone = raw_isone.sort_values('Updated').drop_duplicates(subset=compound_key, keep='last')
# LBNL column names are the snake_case form of the GridStatus key columns.
lbnl_key = [c.lower().replace(' ', '_') for c in compound_key]
isone = isone.merge(lbnl_isone, how="outer", left_on=compound_key, right_on=lbnl_key)
isone["in_lbnl"] = isone["queue_id"].notna()
isone["in_gs"] = isone["Queue ID"].notna()
del lbnl_isone
isone.info()


In [None]:
isone.head(2)


In [None]:
isone[["in_gs", "in_lbnl"]].value_counts(dropna=False)


In [None]:
# small improvement from date filter
isone.loc[
    pd.to_datetime(isone["Queue Date"])
    .fillna(pd.to_datetime("2020-01-01"))
    .lt(pd.to_datetime("2023-01-01")),
    ["in_gs", "in_lbnl"],
].value_counts(dropna=False)


In [None]:
# status values are decently aligned
isone[["Status", "queue_status"]].value_counts(dropna=False).sort_index()


In [None]:
isone['interconnection_status_raw'].value_counts(dropna=False)


In [None]:
isone[['interconnection_status_lbnl', 'interconnection_status_raw']].value_counts(dropna=False).sort_index()


In [None]:
# Columns of the ISO-NE queue inspected for status information
# (presumably per-study / agreement status fields -- confirm against the raw data).
status_cols_isone = [
    "FS",
    "SIS",
    "OS",
    "FAC",
    "IA",
    "Project Status",
]
# Count every distinct combination of values across those columns, treating the
# literal string 'None' as missing, keep only combinations that occur more than
# twice, and print all rows (row limit lifted just for this display).
# `display` is expected to come from the notebook (IPython) environment.
with pd.option_context('display.max_rows', None):
    display(isone[status_cols_isone].replace(['None'], np.nan).value_counts(dropna=False).where(lambda x: x>2).dropna().astype(int).sort_index())


In [None]:
# gridstatus doesn't parse the status values, so they are all null. In their defense, the ISONE encodes them as alt text behind icons, which is stupid. But still.
isone[status_cols_isone]


In [None]:
raw_isone['Generation Type'].value_counts(dropna=False)


### Compare generation types

In [None]:
isone["resource_type_1"].value_counts()

In [None]:
isone["Generation Type"].value_counts()

Generation type mapping looks tractable. Will have to look up the codes listed in the Excel sheets on [this ISONE site](https://www.iso-ne.com/isoexpress/web/reports/operations/-/tree/seasonal-claimed-capability).

In [None]:
lbnl["region"].value_counts()

## Some fun bonus capacity analysis

In [None]:
active_lbnl = lbnl.query("queue_status == 'active'")
# Share of active capacity inside vs. outside "non-ISO" regions. The group key
# is built from active_lbnl itself, so it lines up row-for-row with the frame
# being grouped instead of relying on index alignment with the unfiltered table.
(
    active_lbnl.groupby(active_lbnl["region"].str.contains("non-ISO", na=False))["capacity_mw_resource_1"].sum()
    / active_lbnl["capacity_mw_resource_1"].sum()
)

#### What are the top non-ISO utilities?

In [None]:
non_iso_lbnl = active_lbnl[active_lbnl["region"].str.contains("non-ISO", na=False)]
non_iso_lbnl_mw = non_iso_lbnl.groupby("utility")["capacity_mw_resource_1"].sum().sort_values() / non_iso_lbnl["capacity_mw_resource_1"].sum()


In [None]:
# Cumulative capacity share of the n largest utilities for n = 5, 10, 15, ...
# non_iso_lbnl_mw is sorted ascending, so tail(n) selects the n largest shares.
top_mw_non_iso = [
    (n_top, non_iso_lbnl_mw.tail(n_top).sum())
    for n_top in range(5, len(non_iso_lbnl_mw), 5)
]

In [None]:
pd.DataFrame(top_mw_non_iso, columns=("top_n_utilities", "pct_total_non_iso_capacity"))

In [None]:
non_iso_lbnl_mw.tail(5)