# GridStatus <-> LBNL Status Mapping

GridStatus reads raw ISO queues. The LBNL queue statuses are standardized versions of ISO-specific categories. This notebook attempts to reverse engineer the mappings between the two.

In [None]:
import pandas as pd
from pathlib import Path
import numpy as np

from dbcp.extract.lbnl_iso_queue import extract


## Get the data
### LBNL-Compiled Queues

In [None]:
# partial implementation of transform. I don't want to include deduplication.
def partial_transform(active_projects: pd.DataFrame) -> pd.DataFrame:
    """Apply the non-deduplicating part of the LBNL queue transform.

    Renames the location columns, collapses variant spellings of the LBNL
    interconnection status onto one label each, and drops the two date
    columns that carry no information here.

    Args:
        active_projects: LBNL queue table. Must contain the columns
            ``interconnection_status_lbnl``, ``date_withdrawn`` and
            ``date_operational`` (a missing one raises ``KeyError``).

    Returns:
        A new dataframe; the input frame is not modified.
    """
    status_col = "interconnection_status_lbnl"
    # Variant / misspelled labels -> the single label used downstream.
    canonical_status = {
        "Feasability Study": "Feasibility Study",
        "Feasibility": "Feasibility Study",
        "Facilities Study": "Facility Study",
        "IA in Progress": "In Progress (unknown study)",
        "Unknown": "In Progress (unknown study)",
        "Withdrawn, Feasibility Study": "Withdrawn",
    }
    # rename() hands back a copy, so nothing below touches the caller's frame.
    renamed = active_projects.rename(
        columns={"state": "raw_state_name", "county": "raw_county_name"}
    )
    harmonized = renamed.assign(
        **{status_col: renamed[status_col].replace(canonical_status)}
    )
    # drop irrelevant columns (structurally all nan due to 'active' filter)
    return harmonized.drop(columns=["date_withdrawn", "date_operational"])


# Location of the LBNL queue workbook (absolute path; presumably the project container's data dir -- adjust if run elsewhere).
source_path = Path("/app/data/raw/queues_2022_clean_data.xlsx")
# extract() output is indexed by table name; only the "lbnl_iso_queue" table is used in this notebook.
raw_lbnl = extract(source_path)["lbnl_iso_queue"]
# Light cleaning only: columns renamed and statuses harmonized, duplicates deliberately kept.
lbnl = partial_transform(raw_lbnl)


In [None]:
lbnl.shape, lbnl.columns


In [None]:
lbnl.head(2)


### GridStatus Queues

In [None]:
# Import the submodule explicitly. A bare `import dbcp` only makes
# `dbcp.extract.gridstatus_isoqueues` reachable if some other import happened
# to load that submodule first; this form also binds the name `dbcp`.
import dbcp.extract.gridstatus_isoqueues

# These are the revision numbers of the oldest archives we have
iso_queue_versions: dict[str, str] = {
    "miso": "1681775160487863",
    "caiso": "1681775162586588",
    "pjm": "1681775160979859",
    "ercot": "1681775161342766",
    "spp": "1681775162935809",
    "nyiso": "1681775159356063",
    "isone": "1681775162111351",
}

# One raw GridStatus queue dataframe per ISO, keyed by the lowercase ISO name.
gs_dfs = dbcp.extract.gridstatus_isoqueues.extract(iso_queue_versions)

In [None]:
{k: v.shape for k, v in gs_dfs.items()}

In [None]:
# # These are manually downloaded from our archives. I went back as far as I could,
# # which is April 17 2023.
# # There is a 3.5 month gap between the LBNL queues and our oldest GridStatus archive.
# root_path = Path("/app/data/raw/gridstatus/interconnection_queues")
# assert root_path.exists()
# # filenames are like "interconnection_queues_caiso_4-17-2023.parquet"
# gs_dfs = {
#     path.name.split("_")[2]: pd.read_parquet(path)
#     for path in root_path.glob("*.parquet")
# }
# {k: v.shape for k, v in gs_dfs.items()}


In [None]:
# wayyy fewer items in GridStatus than LBNL.
sum([v.shape[0] for v in gs_dfs.values()]) - lbnl.shape[0]


In [None]:
lbnl.region.value_counts()

In [None]:
lbnl_iso = lbnl[~lbnl["region"].str.contains("non-ISO", na=False)].copy()

In [None]:
sum([v.shape[0] for v in gs_dfs.values()]) - lbnl_iso.shape[0]

GridStatus has more ISO projects than LBNL.

In [None]:
# globals().update(gs_dfs)  # this works fine but the static type checker/linter can't introspect it.
raw_ercot = gs_dfs["ercot"]
raw_nyiso = gs_dfs["nyiso"]
raw_isone = gs_dfs["isone"]
raw_miso = gs_dfs["miso"]
raw_pjm = gs_dfs["pjm"]
raw_spp = gs_dfs["spp"]
raw_caiso = gs_dfs["caiso"]


In [None]:
LBNL_JOIN_COLS = [
    "queue_id",  # join key
    "project_name",  # for manually checking the joins
    "queue_date",  # for manually checking the joins
    "queue_status",  # for manually checking the joins
    "interconnection_status_raw",  # see what LBNL interpreted
    "interconnection_status_lbnl",  # final mapping value
    "capacity_mw_resource_1",  # for capacity comparisons
    "resource_type_1",
]


def join_lbnl(
    iso_df: pd.DataFrame, lbnl: pd.DataFrame, iso_name: str, iso_id_col="Queue ID"
) -> pd.DataFrame:
    """Outer-join one ISO's GridStatus queue to the matching LBNL rows.

    Args:
        iso_df: GridStatus queue for a single ISO; ``iso_id_col`` must be unique.
        lbnl: LBNL queue table; rows are selected where ``entity == iso_name``
            and reduced to ``LBNL_JOIN_COLS``.
        iso_name: value of the LBNL ``entity`` column to select.
        iso_id_col: name of the GridStatus ID column to join on ``queue_id``.

    Returns:
        The outer merge of both tables plus two boolean columns: ``in_lbnl``
        (row has an LBNL ``queue_id``) and ``in_gs`` (row has a GridStatus ID).

    Raises:
        AssertionError: if either ID column has duplicates or no LBNL rows
            exist for ``iso_name``.
    """
    assert iso_df[iso_id_col].is_unique, "ID column not unique"
    lbnl_iso = lbnl.loc[lbnl["entity"].eq(iso_name), LBNL_JOIN_COLS]
    assert not lbnl_iso.empty, f"Empty LBNL queue for {iso_name}"
    assert lbnl_iso["queue_id"].is_unique, "LBNL queue ID not unique"
    out = iso_df.merge(lbnl_iso, how="outer", left_on=iso_id_col, right_on="queue_id")
    # After an outer merge, a missing key on one side means the row came only from the other side.
    out["in_lbnl"] = out["queue_id"].notna()
    out["in_gs"] = out[iso_id_col].notna()
    return out


def compare_capacity(
    comb_iso: pd.DataFrame,
    gs_cap_col="Capacity (MW)",
    lbnl_cap_col="capacity_mw_resource_1",
) -> None:
    """Print total GS capacity, total LBNL capacity and their LBNL/GS ratio.

    Defined in this helper cell so that every ISO section, starting with
    ERCOT, can call it when the notebook is run top to bottom on a fresh kernel.

    Args:
        comb_iso: joined GS/LBNL table (e.g. the output of ``join_lbnl``,
            usually filtered to rows present in both sources).
        gs_cap_col: GridStatus capacity column to total.
        lbnl_cap_col: LBNL capacity column to total.
    """
    gs_iso_cap = comb_iso[gs_cap_col].sum()
    lbnl_iso_cap = comb_iso[lbnl_cap_col].sum()

    print(f"GS Capacity for projects in GS and LBNL: {gs_iso_cap}")
    print(f"LBNL Capacity for project in GS and LBNL: {lbnl_iso_cap}")
    print(f"Ratio of LBNL to GS Capacity for project in GS and LBNL: {lbnl_iso_cap / gs_iso_cap}")


## Comparisons
### ERCOT
* ERCOT's queue document has separate excel sheets for large and small (< 20MW) projects, and does not list withdrawn projects. GridStatus only includes large projects, whereas LBNL data includes the other categories. I'm not sure where LBNL gets the withdrawn projects from.
  * We might be able to maintain a withdrawn list by 1) getting all the current withdrawn projects from the LBNL data, then 2) loading all the archived GridStatus ERCOT queues and checking for any projects that enter then exit the list. But I don't think we actually need a withdrawn list?
* GridStatus defines "status" as "IA Signed".isna(). LBNL calls the entire "large active" dataset "active".

In [None]:
raw_ercot.columns


In [None]:
ercot = join_lbnl(raw_ercot, lbnl, "ERCOT")
ercot.info()


In [None]:
ercot[["in_gs", "in_lbnl"]].value_counts(dropna=False)


#### Compare total capacity for projects in both

In [None]:
both_ercot = ercot[ercot.in_lbnl & ercot.in_gs]

compare_capacity(both_ercot)

#### Compare total capacity for **active** projects in both

In [None]:
ercot[["Status", "queue_status"]].value_counts(dropna=False)

In [None]:
ercot_active_in_both = ercot[ercot["queue_status"].eq("active") & ercot["Status"].eq("Active")]
compare_capacity(ercot_active_in_both)

Very little mismatching in status columns, and capacities look good for projects that are active in both. There are a few dozen LBNL active projects that are withdrawn or completed in GS.

#### Compare total capacity for all projects

In [None]:
gs_capacities = ercot[ercot.in_gs]["Capacity (MW)"]
lbnl_capacities = ercot[ercot.in_lbnl]["capacity_mw_resource_1"]

print(f"Ratio of total LBNL capacity to total GS capacity {lbnl_capacities.sum() / gs_capacities.sum()}")
print(f"Ratio of total LBNL project to total GS projects {len(lbnl_capacities) / len(gs_capacities)}")

Where is LBNL getting historic queue data from?

In [None]:
ercot["queue_status"].value_counts()

In [None]:
pd.to_datetime(ercot[ercot["queue_status"].eq("withdrawn")]["queue_date"]).dt.year.plot.hist()

The [ISO Queue sheet](https://www.ercot.com/misdownload/servlets/mirDownload?doclookupId=955158734) that GridStatus uses has an Inactive sheet that only goes back to about 2019.

### NYISO

* LBNL appears to have used the column labeled `S`, which is an ordinal status number corresponding to NYISO's LFIP. Key:
  * 0=Withdrawn
  * 1=Scoping Meeting Pending
  * 2=FES Pending
  * 3=FES in Progress
  * 4=SRIS/SIS Pending
  * 5=SRIS/SIS in Progress
  * 6=SRIS/SIS Approved
  * 7=FS Pending
  * 8=Rejected Cost Allocation/Next FS Pending
  * 9=FS in Progress
  * 10=Accepted Cost Allocation/IA in Progress
  * 11=IA Completed
  * 12=Under Construction
  * 13=In Service for Test
  * 14=In Service Commercial
  * 15=Partial In-Service
* Availability of Studies  Key: None=Not Available, FES=Feasibility Study Available, SRIS=System Reliability Impact Study Available, FS=Facilities Study and/or ATRA Available
* CY Completion/SGIA Tender refers to the Attachment X milestone used to apply the 4-year COD limitation.


In [None]:
raw_nyiso.columns


In [None]:
raw_nyiso[raw_nyiso["Queue ID"].duplicated(keep=False)]

In [None]:
# I don't know which is correct (if either), but I'll guess that the later one is. So keep='last'
raw_nyiso.loc[raw_nyiso.duplicated(subset='Queue ID', keep=False), :].sort_values(by='Queue ID')


In [None]:
nyiso = join_lbnl(raw_nyiso.drop_duplicates(subset='Queue ID', keep='last'), lbnl, "NYISO")
nyiso.info()


In [None]:
nyiso.head()


In [None]:
nyiso[["in_gs", "in_lbnl"]].value_counts(dropna=False)


In [None]:
# marginal improvement from date filter
# Keep rows queued before 2023-01-01. Rows without a GS "Queue Date" (e.g. rows
# that only exist in LBNL after the outer join) are filled with a date that is
# earlier than the cutoff, so they are kept rather than dropped.
nyiso.loc[
    nyiso["Queue Date"]
    .fillna(pd.to_datetime("2020-01-01"))
    .lt(pd.to_datetime("2023-01-01")),
    ["in_gs", "in_lbnl"],
].value_counts(dropna=False)


### NYISO Capacity Comparison
- Compare total capacity for projects in lbnl and gs
- Compare total capacity for active projects in lbnl and gs
- Compare total capacity for active projects
- Compare capacity by fuel type? Might be challenging because the categories are all over the place

#### Compare total capacity for projects in both

In [None]:
def compare_capacity(
    comb_iso: pd.DataFrame,
    gs_cap_col="Capacity (MW)",
    lbnl_cap_col="capacity_mw_resource_1",
):
    """Print the summed GS and LBNL capacity of ``comb_iso`` and the LBNL/GS ratio.

    Args:
        comb_iso: joined GS/LBNL table holding both capacity columns.
        gs_cap_col: GridStatus capacity column to total.
        lbnl_cap_col: LBNL capacity column to total.

    Returns:
        None; the three figures are written to stdout.
    """
    total_gs = comb_iso[gs_cap_col].sum()
    total_lbnl = comb_iso[lbnl_cap_col].sum()

    print(f"GS Capacity for projects in GS and LBNL: {total_gs}")
    print(f"LBNL Capacity for project in GS and LBNL: {total_lbnl}")
    # Ratio > 1 means LBNL reports more capacity than GS for these rows.
    print(f"Ratio of LBNL to GS Capacity for project in GS and LBNL: {total_lbnl / total_gs}")
    
    

In [None]:
both_nyiso = nyiso[nyiso.in_lbnl & nyiso.in_gs]

compare_capacity(both_nyiso)

Ok! Projects that exist in both have very similar total capacities! That's a good start. I could look into which projects have different capacity values but the amount is so minor that I'm going to skip for now.

#### Compare total capacity for **active** projects in both

In [None]:
both_nyiso["queue_status"].value_counts()

In [None]:
both_nyiso[["Status", "queue_status"]].value_counts(dropna=False)

There are some active LBNL projects that have been withdrawn in GS, maybe because the GS data is a bit fresher? If so, the withdrawal dates should be after 2022-12-31.

In [None]:
pd.to_datetime(both_nyiso[(both_nyiso["Status"] == "Withdrawn") & (both_nyiso["queue_status"] == "active")]["Withdrawn Date"]).dt.year.value_counts()

Great! That explains it. This means we can just compare projects marked active in lbnl

In [None]:
# Rows marked active in LBNL, whatever their GS status (the check above shows the
# GS "Withdrawn" ones by withdrawal year). Use the shared helper so the output is
# labelled the same way as in the other ISO sections.
active_both_nyiso = both_nyiso[both_nyiso["queue_status"].eq("active")]
compare_capacity(active_both_nyiso)

Similar capacity totals I'm comfortable with.

#### Compare total capacity for all projects

In [None]:
nyiso[["in_gs", "in_lbnl"]].value_counts(dropna=False)


In [None]:
nyiso[["Status", "queue_status"]].value_counts(dropna=False)

In [None]:
# Share of total GS capacity that belongs to projects missing from LBNL.
gs_only_share = (
    nyiso.loc[nyiso.in_gs & ~nyiso.in_lbnl, "Capacity (MW)"].sum()
    / nyiso.loc[nyiso.in_gs, "Capacity (MW)"].sum()
)
# Share of total LBNL capacity that belongs to projects missing from GS.
# Numerator and denominator must both use the LBNL capacity column: the GS
# column "Capacity (MW)" is empty for LBNL-only rows and would distort the share.
lbnl_only_share = (
    nyiso.loc[~nyiso.in_gs & nyiso.in_lbnl, "capacity_mw_resource_1"].sum()
    / nyiso.loc[nyiso.in_lbnl, "capacity_mw_resource_1"].sum()
)
print(gs_only_share)
print(lbnl_only_share)

Projects in GS but not in lbnl account for 27% of total capacity in GS nyiso.
Projects in LBNL but not in GS account for 4% of total capacity in LBNL NYISO.

Why does GS have so much more capacity than LBNL here? Do we care if GS has more capacity than LBNL given GS more closely resembles the source data? Is LBNL doing deduplication work behind the scenes?

In [None]:
gs_capacities = nyiso[nyiso.in_gs]["Winter Capacity (MW)"]
lbnl_capacities = nyiso[nyiso.in_lbnl]["capacity_mw_resource_1"]

print(f"Ratio of total LBNL capacity to total GS capacity {lbnl_capacities.sum() / pd.to_numeric(gs_capacities, errors='coerce').sum()}")
print(f"Ratio of total LBNL project to total GS projects {len(lbnl_capacities) / len(gs_capacities)}")

There are 190 transmission projects in GS that aren't in LBNL. See generation type analysis notebook.

### MISO
Very good project coverage, but the IA status categories are a mess.

In [None]:
raw_miso.columns


In [None]:
# I don't know which is correct (if either), but I'll guess that the later one is. So keep='last'
# Nearly whole-row duplicate, except for "studyPhase"
raw_miso.loc[raw_miso.duplicated(subset='Queue ID', keep=False), :].sort_values(by='Queue ID')


In [None]:
# LBNL side of the duplicate check: list every MISO row whose queue_id occurs more than once.
# I don't know which copy is correct (if either), so the join below keeps the later one (keep='last').
(
    lbnl.query('entity == "MISO"')
    .loc[lambda miso_rows: miso_rows.duplicated(subset='queue_id', keep=False), :]
    .sort_values(by='queue_id')
)


In [None]:
miso = join_lbnl(raw_miso.drop_duplicates(subset='Queue ID', keep='last'), lbnl.query('entity == "MISO"').drop_duplicates(subset='queue_id', keep='last'), "MISO")
miso.info()


In [None]:
miso[["in_gs", "in_lbnl"]].value_counts(dropna=False)


#### Compare total capacity for projects in both

In [None]:
both_miso = miso[miso.in_lbnl & miso.in_gs]

compare_capacity(both_miso)

#### Compare total capacity for **active** projects in both

In [None]:
both_miso = miso[miso.in_lbnl & miso.in_gs]

both_miso[["Status", "queue_status"]].value_counts(dropna=False)

In [None]:
active_in_lbnl_not_active_in_gs = both_miso[both_miso["queue_status"].eq("active") & both_miso["Status"].ne("Active")]
active_in_lbnl_not_active_in_gs["Status"].value_counts(dropna=False)

In [None]:
pd.to_datetime(active_in_lbnl_not_active_in_gs.query("Status == 'Done'")["inService"]).dt.year.value_counts(dropna=False)

Seems like a bulk of projects that are active in lbnl but considered done in GS were completed in the last 5ish years. Only 43 projects went in service in 2023. Shouldn't LBNL have caught the other projects in service?

In [None]:
pd.to_datetime(active_in_lbnl_not_active_in_gs.query("Status == 'Withdrawn'")["Withdrawn Date"]).dt.year.value_counts(dropna=False)

Great! The active LBNL projects that are Withdrawn in GS were mostly withdrawn in 2023.

In [None]:
active_both_miso = both_miso[both_miso["queue_status"].eq("active") & both_miso["Status"].eq("Active")]

compare_capacity(active_both_miso)

Projects that are marked active in both LBNL and GS have very similar total capacities.

In [None]:
gs_capacities = miso[miso.in_gs]["Winter Capacity (MW)"]
lbnl_capacities = miso[miso.in_lbnl]["capacity_mw_resource_1"]

print(f"Ratio of total LBNL capacity to total GS capacity {lbnl_capacities.sum() / gs_capacities.sum()}")
print(f"Ratio of total LBNL project to total GS projects {len(lbnl_capacities) / len(gs_capacities)}")

Total MISO Capacity is pretty similar in LBNL and GS. There are some differences in queue status but nothing alarming.

### SPP

* neither LBNL nor GridStatus have withdrawn projects
* GridStatus destroys the detailed status information during their processing, so we lose the ability to distinguish between "IA pending" and "System Integration Study". But I don't think that is a problem because both are included in the "actionable" criteria in Synapse's model.

In [None]:
raw_spp.columns


In [None]:
spp = join_lbnl(raw_spp, lbnl, "SPP")
spp.info()


In [None]:
spp.sample(4)


In [None]:
spp[["in_gs", "in_lbnl"]].value_counts(dropna=False)


#### Compare total capacity for projects in both

In [None]:
both_spp = spp[spp.in_lbnl & spp.in_gs]

compare_capacity(both_spp)

#### Compare total capacity for **active** projects in both

In [None]:
spp[["Status", "queue_status"]].value_counts(dropna=False)

In [None]:
# Rows that are active in BOTH sources (LBNL "active" and GS "Active"); named like
# the equivalent frames in the other ISO sections.
spp_active_in_both = spp[spp["queue_status"].eq("active") & spp["Status"].eq("Active")]
compare_capacity(spp_active_in_both)

Some mismatching in the status column, but for projects that are active in both datasets the capacities look good.

#### Compare total capacity for all projects

In [None]:
gs_capacities = spp[spp.in_gs]["Capacity (MW)"]
lbnl_capacities = spp[spp.in_lbnl]["capacity_mw_resource_1"]

print(f"Ratio of total LBNL capacity to total GS capacity {lbnl_capacities.sum() / gs_capacities.sum()}")
print(f"Ratio of total LBNL project to total GS projects {len(lbnl_capacities) / len(gs_capacities)}")

### PJM
Like MISO, good project coverage, but the IA status categories are a mess.

"Active" applied up to IA execution. Then "Engineering and Procurement" applied to IA execution through COD. Then "In Service".

In [None]:
raw_pjm.columns


In [None]:
# "Active" stops at IA execution
raw_pjm[['Interim/Interconnection Service Agreement Status', 'Status']].value_counts(dropna=False).sort_index()


In [None]:
pjm = join_lbnl(raw_pjm, lbnl, "PJM")
pjm.info()


#### Compare total capacity for projects in both

In [None]:
both_pjm = pjm[pjm.in_lbnl & pjm.in_gs].copy()

compare_capacity(both_pjm)

GS has about 25% more capacity for projects in both. Which projects have super different values? Is it a units issue?

In [None]:
both_pjm["capacity_diff"] = both_pjm["Capacity (MW)"] - both_pjm["capacity_mw_resource_1"]

both_pjm["capacity_diff"].describe()

In [None]:
# How many matched projects disagree on capacity. A missing (NaN) difference
# also counts as "not equal to 0" here.
print(both_pjm["capacity_diff"].ne(0).value_counts())

# Capacity columns to show side by side: three GS columns and the LBNL one.
cap_fields = [
    "Capacity (MW)",
    "Summer Capacity (MW)",
    "Winter Capacity (MW)",
    "capacity_mw_resource_1"
]

# Drop rows with no GS capacity, then sort ascending by (GS - LBNL) so the rows
# where LBNL exceeds GS by the most come first.
both_pjm_diff_caps = both_pjm[both_pjm["capacity_diff"].ne(0) & ~both_pjm["Capacity (MW)"].isna()].sort_values("capacity_diff").copy()
both_pjm_diff_caps[cap_fields].head(20)

What is the difference between "MW Capacity", "MW Energy", "MFO", "MW In Service"? It seems like LBNL used "MW Capacity"

In [None]:
# GS renames "MW Energy" to "Winter Capacity (MW)"
compare_capacity(both_pjm, gs_cap_col="Winter Capacity (MW)")

Looks like lbnl used "MW Energy" to measure capacity. Which columns should we be using?!

#### Compare total capacity for **active** projects in both

In [None]:
pjm[["Status", "queue_status"]].value_counts(dropna=False)

In [None]:
pjm_active_in_both = pjm[pjm["queue_status"].eq("active") & pjm["Status"].eq("Active")]
compare_capacity(pjm_active_in_both, gs_cap_col="Winter Capacity (MW)")

Some mismatching in the status column, but for projects that are active in both datasets the capacities look good.

#### Compare total capacity for all projects

In [None]:
gs_capacities = pjm[pjm.in_gs]["Winter Capacity (MW)"]
lbnl_capacities = pjm[pjm.in_lbnl]["capacity_mw_resource_1"]

print(f"Ratio of total LBNL capacity to total GS capacity {lbnl_capacities.sum() / gs_capacities.sum()}")
print(f"Ratio of total LBNL project to total GS projects {len(lbnl_capacities) / len(gs_capacities)}")

GS has a little more capacity and projects than LBNL.

### CAISO
Straightforward!

In [None]:
raw_caiso.columns


In [None]:
caiso = join_lbnl(raw_caiso, lbnl, "CAISO")
caiso.info()


In [None]:
caiso.query("Status == 'ACTIVE'").sample(8, random_state=42).sort_values('interconnection_status_lbnl')


In [None]:
caiso[["in_gs", "in_lbnl"]].value_counts(dropna=False)


In [None]:
caiso.loc[
    pd.to_datetime(caiso["Queue Date"])
    .fillna(pd.to_datetime("2020-01-01"))
    .lt(pd.to_datetime("2023-01-01")),
    ["in_gs", "in_lbnl"],
].value_counts(dropna=False)


In [None]:
# remarkably easy to match status values
caiso[["Status", "queue_status"]].value_counts(dropna=False).sort_index()


#### Compare total capacity for projects in both

In [None]:
both_caiso = caiso[caiso.in_lbnl & caiso.in_gs]

compare_capacity(both_caiso)

#### Compare total capacity for **active** projects in both

In [None]:
caiso[["Status", "queue_status"]].value_counts(dropna=False)

In [None]:
caiso_active_in_both = caiso[caiso["queue_status"].eq("active") & caiso["Status"].eq("ACTIVE")]
compare_capacity(caiso_active_in_both)

Very little mismatching in status columns, and capacities look good for projects that are active in both. There are a few dozen LBNL active projects that are withdrawn or completed in GS.

#### Compare total capacity for all projects

In [None]:
gs_capacities = caiso[caiso.in_gs]["Capacity (MW)"]
lbnl_capacities = caiso[caiso.in_lbnl]["capacity_mw_resource_1"]

print(f"Ratio of total LBNL capacity to total GS capacity {lbnl_capacities.sum() / gs_capacities.sum()}")
print(f"Ratio of total LBNL project to total GS projects {len(lbnl_capacities) / len(gs_capacities)}")

Basically the same amount of capacity and number of projects. love it

### ISO-NE

* what GridStatus calls "Queue ID" was actually "Queue Position" in the original data. It is unique amongst active projects but not for withdrawn projects. There is no natural key for withdrawn projects.

In [None]:
raw_isone.columns


In [None]:
raw_isone.duplicated(subset=['Queue ID']).sum()


In [None]:
raw_isone.loc[raw_isone.duplicated(subset=['Queue ID'], keep=False), 'Status'].value_counts(dropna=False)


In [None]:
raw_isone['Status'].value_counts(dropna=False)


In [None]:
compound_key = ['Queue ID', 'Status']
raw_isone.duplicated(subset=compound_key).sum()


In [None]:
# there is no natural key for this data. But this looks like the best tradeoff between key complexity and uniqueness.
compound_key = ['Queue ID', 'Project Name']
raw_isone.duplicated(subset=compound_key).sum()


In [None]:
# Queue ID is unique within the Active projects. Not sure how change over time will impact the join to LBNL.
raw_isone.query('Status == "Active"')['Queue ID'].is_unique


In [None]:
# duplicates (none Active) have all kinds of variation in values. Maybe best approach is to sort by date updated and take the latest one. It doesn't always differentiate them though.
raw_isone.loc[raw_isone.duplicated(subset=compound_key, keep=False),:].sort_values(by=compound_key).tail(10)


In [None]:
raw_isone['Updated'] = pd.to_datetime(raw_isone['Updated'])


In [None]:
# join manually rather than refactoring the func to take compound key
# The ISO-NE slice gets its own name: reusing (and then deleting) `lbnl_iso` would
# destroy the notebook-level `lbnl_iso` frame that earlier cells depend on.
lbnl_key = [c.lower().replace(' ', '_') for c in compound_key]  # LBNL spelling of the GS key columns
lbnl_isone = lbnl.loc[lbnl["entity"].eq("ISO-NE"), LBNL_JOIN_COLS].astype({'queue_id': int})
# Keep the most recently updated GS row per compound key, then outer-join to LBNL.
isone = (
    raw_isone.sort_values('Updated')
    .drop_duplicates(subset=compound_key, keep='last')
    .merge(lbnl_isone, how="outer", left_on=compound_key, right_on=lbnl_key)
)
isone["in_lbnl"] = isone["queue_id"].notna()
isone["in_gs"] = isone["Queue ID"].notna()
del lbnl_isone
isone.info()


In [None]:
isone.head(2)


In [None]:
isone[["in_gs", "in_lbnl"]].value_counts(dropna=False)


In [None]:
# small improvement from date filter
isone.loc[
    pd.to_datetime(isone["Queue Date"])
    .fillna(pd.to_datetime("2020-01-01"))
    .lt(pd.to_datetime("2023-01-01")),
    ["in_gs", "in_lbnl"],
].value_counts(dropna=False)


In [None]:
# status values are decently aligned
isone[["Status", "queue_status"]].value_counts(dropna=False).sort_index()


In [None]:
isone['interconnection_status_raw'].value_counts(dropna=False)


In [None]:
isone[['interconnection_status_lbnl', 'interconnection_status_raw']].value_counts(dropna=False).sort_index()


In [None]:
# ISO-NE columns inspected for status information (presumably study / agreement
# milestones -- confirm against the ISO-NE queue documentation).
status_cols_isone = [
    "FS",
    "SIS",
    "OS",
    "FAC",
    "IA",
    "Project Status",
]
# Count every combination of values across these columns (the string 'None' is
# treated as missing), keep only combinations that occur more than twice, and
# display the full result without row truncation.
with pd.option_context('display.max_rows', None):
    display(isone[status_cols_isone].replace(['None'], np.nan).value_counts(dropna=False).where(lambda x: x>2).dropna().astype(int).sort_index())


In [None]:
# gridstatus doesn't parse the status values, so they are all null. In their defense, the ISONE encodes them as alt text behind icons, which is stupid. But still.
isone[status_cols_isone]


In [None]:
raw_isone['Generation Type'].value_counts(dropna=False)


#### Compare total capacity for projects in both

In [None]:
both_isone = isone[isone.in_lbnl & isone.in_gs]

compare_capacity(both_isone)

#### Compare total capacity for **active** projects in both

In [None]:
isone[["Status", "queue_status"]].value_counts(dropna=False)

In [None]:
isone_active_in_both = isone[isone["queue_status"].eq("active") & isone["Status"].eq("Active")]
compare_capacity(isone_active_in_both)

Decent alignment between project statuses. GS has a few more projects. Did their status change in 2023?

In [None]:
pd.to_datetime(isone[isone["queue_status"].isna() & isone["Status"].eq("Active")]["Queue Date"]).dt.year.plot.hist()

#### Compare total capacity for all projects

In [None]:
gs_capacities = isone[isone.in_gs]["Capacity (MW)"]
lbnl_capacities = isone[isone.in_lbnl]["capacity_mw_resource_1"]

# Print the LBNL/GS ratio the label promises (the same quantity as in the other
# ISO sections); a value below 1 means GS holds more capacity than LBNL.
print(f"Ratio of total LBNL capacity to total GS capacity {lbnl_capacities.sum() / gs_capacities.sum()}")
print(f"Ratio of total LBNL project to total GS projects {len(lbnl_capacities) / len(gs_capacities)}")

Hmmm, substantially more capacity in GS.