# GridStatus <-> LBNL Status Mapping

GridStatus reads raw ISO queues. The LBNL queue statuses are standardized versions of ISO-specific categories. This notebook attempts to reverse engineer the mappings between the two.

In [None]:
import pandas as pd
from pathlib import Path
import numpy as np

from dbcp.extract.lbnl_iso_queue import extract


## Get the data
### LBNL-Compiled Queues

In [None]:
# partial implementation of transform. I don't want to include deduplication.
def partial_transform(active_projects: pd.DataFrame) -> pd.DataFrame:
    """Apply the transform steps that do not involve deduplication.

    Three things happen, in order: the location columns are renamed with a
    ``raw_`` prefix, variant labels in ``interconnection_status_lbnl`` are
    collapsed onto one spelling each, and the withdrawn/operational date
    columns are removed.

    Args:
        active_projects: LBNL ISO queue table. It must contain the
            ``interconnection_status_lbnl``, ``date_withdrawn`` and
            ``date_operational`` columns.

    Returns:
        A new dataframe; the work is done on the frame returned by
        ``rename`` rather than on the caller's object.
    """
    # Variant label -> harmonized interconnection_status_lbnl label.
    status_fixes = {
        "Feasability Study": "Feasibility Study",
        "Feasibility": "Feasibility Study",
        "Facilities Study": "Facility Study",
        "IA in Progress": "In Progress (unknown study)",
        "Unknown": "In Progress (unknown study)",
        "Withdrawn, Feasibility Study": "Withdrawn",
    }
    transformed = active_projects.rename(
        columns={"state": "raw_state_name", "county": "raw_county_name"}
    )
    transformed["interconnection_status_lbnl"] = transformed[
        "interconnection_status_lbnl"
    ].replace(status_fixes)
    # These date columns are irrelevant here (structurally all NaN due to the
    # 'active' filter, per the original note), so they are dropped.
    return transformed.drop(columns=["date_withdrawn", "date_operational"])


source_path = Path("/app/data/raw/queues_2022_clean_data.xlsx")
raw_lbnl = extract(source_path)["lbnl_iso_queue"]
lbnl = partial_transform(raw_lbnl)


In [None]:
lbnl.shape, lbnl.columns


In [None]:
lbnl.head(2)


### GridStatus Queues

In [None]:
import dbcp

# These are the revision numbers of the oldest archives we have
iso_queue_versions: dict[str, str] = {
    "miso": "1681775160487863",
    "caiso": "1681775162586588",
    "pjm": "1681775160979859",
    "ercot": "1681775161342766",
    "spp": "1681775162935809",
    "nyiso": "1681775159356063",
    "isone": "1681775162111351",
}

gs_dfs = dbcp.extract.gridstatus_isoqueues.extract(iso_queue_versions)

In [None]:
{k: v.shape for k, v in gs_dfs.items()}

In [None]:
# # These are manually downloaded from our archives. I went back as far as I could,
# # which is April 17 2023.
# # There is a 3.5 month gap between the LBNL queues and our oldest GridStatus archive.
# root_path = Path("/app/data/raw/gridstatus/interconnection_queues")
# assert root_path.exists()
# # filenames are like "interconnection_queues_caiso_4-17-2023.parquet"
# gs_dfs = {
#     path.name.split("_")[2]: pd.read_parquet(path)
#     for path in root_path.glob("*.parquet")
# }
# {k: v.shape for k, v in gs_dfs.items()}


In [None]:
# wayyy fewer items in GridStatus than LBNL.
# total GridStatus rows across all ISOs minus the LBNL row count
sum(len(gs_df) for gs_df in gs_dfs.values()) - len(lbnl)


In [None]:
lbnl.region.value_counts()

In [None]:
lbnl_iso = lbnl[~lbnl["region"].str.contains("non-ISO", na=False)].copy()

In [None]:
# total GridStatus rows across all ISOs minus the ISO-only LBNL row count
sum(len(gs_df) for gs_df in gs_dfs.values()) - len(lbnl_iso)

GridStatus has more ISO projects than LBNL.

In [None]:
# globals().update(gs_dfs)  # this works fine but the static type checker/linter can't introspect it.
raw_ercot = gs_dfs["ercot"]
raw_nyiso = gs_dfs["nyiso"]
raw_isone = gs_dfs["isone"]
raw_miso = gs_dfs["miso"]
raw_pjm = gs_dfs["pjm"]
raw_spp = gs_dfs["spp"]
raw_caiso = gs_dfs["caiso"]


In [None]:
# LBNL columns carried into every per-ISO comparison table.
LBNL_JOIN_COLS = [
    "queue_id",  # join key
    "project_name",  # for manually checking the joins
    "queue_date",  # for manually checking the joins
    "queue_status",  # for manually checking the joins
    "interconnection_status_raw",  # see what LBNL interpreted
    "interconnection_status_lbnl",  # final mapping value
    "capacity_mw_resource_1",
    "resource_type_1",
]


def join_lbnl(
    iso_df: pd.DataFrame, lbnl: pd.DataFrame, iso_name: str, iso_id_col="Queue ID"
) -> pd.DataFrame:
    """Outer-join one ISO's GridStatus queue onto the matching LBNL rows.

    Args:
        iso_df: GridStatus queue for a single ISO.
        lbnl: LBNL queue table; only rows whose ``entity`` equals
            ``iso_name`` are used, restricted to ``LBNL_JOIN_COLS``.
        iso_name: value of the LBNL ``entity`` column to select.
        iso_id_col: name of the ID column in ``iso_df`` to join on.

    Returns:
        The outer merge of both tables plus two boolean columns: ``in_lbnl``
        (LBNL ``queue_id`` is non-null) and ``in_gs`` (``iso_id_col`` is
        non-null).

    Raises:
        AssertionError: if either join key contains duplicates, or if no
            LBNL rows exist for ``iso_name``.
    """
    assert iso_df[iso_id_col].is_unique, "ID column not unique"

    entity_mask = lbnl["entity"] == iso_name
    lbnl_rows = lbnl.loc[entity_mask, LBNL_JOIN_COLS]
    assert not lbnl_rows.empty, f"Empty LBNL queue for {iso_name}"
    assert lbnl_rows["queue_id"].is_unique, "LBNL queue ID not unique"

    combined = pd.merge(
        iso_df,
        lbnl_rows,
        how="outer",
        left_on=iso_id_col,
        right_on="queue_id",
    )
    # A null key on one side of the outer join means that side had no row.
    combined["in_lbnl"] = combined["queue_id"].notna()
    combined["in_gs"] = combined[iso_id_col].notna()
    return combined


## Comparisons
### ERCOT
* ERCOT's queue document has separate excel sheets for large and small (< 20MW) projects, and does not list withdrawn projects. GridStatus only includes large projects, whereas LBNL data includes the other categories. I'm not sure where LBNL gets the withdrawn projects from.
  * We might be able to maintain a withdrawn list by 1) getting all the current withdrawn projects from the LBNL data, then 2) loading all the archived GridStatus ERCOT queues and checking for any projects that enter then exit the list. But I don't think we actually need a withdrawn list?
* GridStatus derives "Status" from the "IA Signed" column: a project is "Active" when "IA Signed" is null and "Completed" otherwise. LBNL calls the entire "large active" dataset "active".

In [None]:
raw_ercot.columns


In [None]:
ercot = join_lbnl(raw_ercot, lbnl, "ERCOT")
ercot.info()


In [None]:
ercot[["in_gs", "in_lbnl"]].value_counts(dropna=False)


In [None]:
ercot.loc[
    ercot["Queue Date"]
    .fillna(pd.to_datetime("2020-01-01"))
    .lt(pd.to_datetime("2023-01-01")),
    ["in_gs", "in_lbnl"],
].value_counts(dropna=False)


In [None]:
ercot[["Status", "queue_status"]].value_counts(dropna=False)


In [None]:
ercot.loc[
    ercot["Queue Date"]
    .fillna(pd.to_datetime("2020-01-01"))
    .lt(pd.to_datetime("2023-01-01")),
    ["Status", "queue_status"],
].value_counts(dropna=False)


In [None]:
ercot[["GIM Study Phase", "queue_status"]].value_counts(dropna=False)


In [None]:
ercot["Queue Date"].eq(pd.to_datetime(ercot["queue_date"])).where(
    (ercot["Queue Date"].notna() & ercot["queue_date"].notnull())
).value_counts(dropna=False)


In [None]:
# get old version of ercot data from December 2022
old_ercot_path = Path("/app/data/raw/gridstatus/GIS_Report_December_2022.xlsx")
assert old_ercot_path.exists()

# copy and paste some GridStatus ETL code
def extract_ercot(path: Path) -> pd.DataFrame:
    """Read an ERCOT GIS report and reshape it like the GridStatus ERCOT queue.

    The logic is copied from the GridStatus ETL so that an archived report
    can be compared on the same footing as the current GridStatus extract.

    Args:
        path: location of the ERCOT GIS report workbook. Only the
            "Project Details - Large Gen" sheet is read.

    Returns:
        The sheet contents with GridStatus-style column names plus the
        derived columns ``State``, ``Queue Date``, ``Generation Type``,
        ``Status`` and ``Actual Completion Date``. Fuel and technology codes
        that are missing from the lookup tables become NaN.
    """
    queue = (
        pd.read_excel(
            path,
            sheet_name="Project Details - Large Gen",
            skiprows=30,
        )
        .iloc[4:]
        # Take ownership of the sliced rows: the column assignments below
        # would otherwise write into a slice of the frame read from disk
        # (chained assignment / SettingWithCopyWarning).
        .copy()
    )

    queue["State"] = "Texas"
    queue["Queue Date"] = queue["Screening Study Started"]

    fuel_type_map = {
        "BIO": "Biomass",
        "COA": "Coal",
        "GAS": "Gas",
        "GEO": "Geothermal",
        "HYD": "Hydrogen",
        "NUC": "Nuclear",
        "OIL": "Fuel Oil",
        "OTH": "Other",
        "PET": "Petcoke",
        "SOL": "Solar",
        "WAT": "Water",
        "WIN": "Wind",
    }

    technology_type_map = {
        "BA": "Battery Energy Storage",
        "CC": "Combined-Cycle",
        "CE": "Compressed Air Energy Storage",
        "CP": "Concentrated Solar Power",
        "EN": "Energy Storage",
        "FC": "Fuel Cell",
        "GT": "Combustion (gas) Turbine, but not part of a Combined-Cycle",
        "HY": "Hydroelectric Turbine",
        "IC": "Internal Combustion Engine, eg. Reciprocating",
        "OT": "Other",
        "PV": "Photovoltaic Solar",
        "ST": "Steam Turbine other than Combined-Cycle",
        "WT": "Wind Turbine",
    }

    # Translate the short codes into readable labels.
    queue["Fuel"] = queue["Fuel"].map(fuel_type_map)
    queue["Technology"] = queue["Technology"].map(technology_type_map)

    queue["Generation Type"] = queue["Fuel"] + " - " + queue["Technology"]

    # A project counts as "Active" until it has an "IA Signed" value.
    queue["Status"] = (
        queue["IA Signed"]
        .isna()
        .map(
            {
                True: "Active",
                False: "Completed",
            },
        )
    )

    queue["Actual Completion Date"] = queue["Approved for Synchronization"]

    # Identity entries are kept so the mapping lists every output column
    # that GridStatus treats as standard.
    rename = {
        "INR": "Queue ID",
        "Project Name": "Project Name",
        "Interconnecting Entity": "Interconnecting Entity",
        "Projected COD": "Proposed Completion Date",
        "POI Location": "Interconnection Location",
        "County": "County",
        "State": "State",
        "Capacity (MW)": "Capacity (MW)",
        "Queue Date": "Queue Date",
        "Generation Type": "Generation Type",
        "Actual Completion Date": "Actual Completion Date",
        "Status": "Status",
    }
    return queue.rename(columns=rename)
old_ercot = extract_ercot(old_ercot_path)
old_ercot.info()


In [None]:
old_ercot = join_lbnl(old_ercot, lbnl, "ERCOT", iso_id_col="Queue ID")


In [None]:
old_ercot[["in_gs", "in_lbnl"]].value_counts(dropna=False)


In [None]:
old_ercot[["Status", "queue_status"]].value_counts(dropna=False)


In [None]:
pd.concat(
    (
        ercot[["Status", "queue_status"]].value_counts(dropna=False).rename("gs"),
        old_ercot[["Status", "queue_status"]].value_counts(dropna=False).rename("old"),
    ),
    axis=1,
    join="outer",
).sort_index().drop_duplicates()


In [None]:
old_ercot["Queue Date"].eq(pd.to_datetime(old_ercot["queue_date"])).where(
    (old_ercot["Queue Date"].notna() & old_ercot["queue_date"].notnull())
).value_counts(dropna=False)


In [None]:
old_ercot["Queue Date"].sub(pd.to_datetime(old_ercot["queue_date"])).lt(pd.Timedelta(days=2)).where(
    (old_ercot["Queue Date"].notna() & old_ercot["queue_date"].notnull())
).value_counts(dropna=False)


In [None]:
old_ercot["Queue Date"].sub(pd.to_datetime(old_ercot["queue_date"])).dt.total_seconds().div(60 * 60 * 24).replace(0, 1e-1).transform(np.log10).hist(bins=20, log=True)


In [None]:
# Rows where both sources report a queue date and the two dates differ.
# AND-ing with the presence mask keeps this a plain boolean Series and avoids
# the NaN/object-dtype round trip that `.where(...).fillna(False)` introduces.
both_dates_present = old_ercot["Queue Date"].notna() & old_ercot["queue_date"].notna()
mismatched_dates = both_dates_present & old_ercot["Queue Date"].ne(
    pd.to_datetime(old_ercot["queue_date"])
)


In [None]:
# none of the dates match. No idea where LBNL queue date comes from. Not from the ISO data!
old_ercot.loc[mismatched_dates, [c for c in old_ercot.columns if 'date' in c.lower()]]


In [None]:
old_ercot[['GIM Study Phase', 'interconnection_status_raw']].value_counts(dropna=False).sort_index()


In [None]:
old_ercot['GIM Study Phase'].eq(old_ercot['interconnection_status_raw']).value_counts(dropna=False)


In [None]:
# LBNL status looks consistently applied (1:1) but the ordering doesn't really make sense to me.
raw_status_match = old_ercot['GIM Study Phase'].eq(old_ercot['interconnection_status_raw'])
old_ercot.loc[raw_status_match, ['GIM Study Phase', 'interconnection_status_lbnl']].value_counts(dropna=False).sort_index()


In [None]:
old_ercot.loc[raw_status_match, ['GIM Study Phase', 'queue_status']].value_counts(dropna=False).sort_index()


In [None]:
# also includes technology type, which is nice.
raw_ercot['Generation Type'].value_counts(dropna=False)


### NYISO

* LBNL appears to have used the column labeled `S`, which is an ordinal status number corresponding to NYISO's LFIP. Key:
  * 0=Withdrawn
  * 1=Scoping Meeting Pending
  * 2=FES Pending
  * 3=FES in Progress
  * 4=SRIS/SIS Pending
  * 5=SRIS/SIS in Progress
  * 6=SRIS/SIS Approved
  * 7=FS Pending
  * 8=Rejected Cost Allocation/Next FS Pending
  * 9=FS in Progress
  * 10=Accepted Cost Allocation/IA in Progress
  * 11=IA Completed
  * 12=Under Construction
  * 13=In Service for Test
  * 14=In Service Commercial
  * 15=Partial In-Service
* Availability of Studies  Key: None=Not Available, FES=Feasibility Study Available, SRIS=System Reliability Impact Study Available, FS=Facilities Study and/or ATRA Available
* CY Completion/SGIA Tender refers to the Attachment X milestone used to apply the 4-year COD limitation.


In [None]:
raw_nyiso.columns


In [None]:
raw_nyiso[raw_nyiso["Queue ID"].duplicated(keep=False)]

In [None]:
# I don't know which is correct (if either), but I'll guess that the later one is. So keep='last'
raw_nyiso.loc[raw_nyiso.duplicated(subset='Queue ID', keep=False), :].sort_values(by='Queue ID')


In [None]:
nyiso = join_lbnl(raw_nyiso.drop_duplicates(subset='Queue ID', keep='last'), lbnl, "NYISO")
nyiso.info()


In [None]:
nyiso.head()


In [None]:
nyiso[["in_gs", "in_lbnl"]].value_counts(dropna=False)


In [None]:
# marginal improvement from date filter
nyiso.loc[
    nyiso["Queue Date"]
    .fillna(pd.to_datetime("2020-01-01"))
    .lt(pd.to_datetime("2023-01-01")),
    ["in_gs", "in_lbnl"],
].value_counts(dropna=False)


In [None]:

nyiso[["Status", "queue_status"]].value_counts(dropna=False).sort_index()


Why are there so many projects that have a status in GridStatus but no status in LBNL? NYISO has separate sheets for withdrawn, in service and active. Why is LBNL missing so much information?

In [None]:
nyiso.loc[
    nyiso["Queue Date"]
    .fillna(pd.to_datetime("2020-01-01"))
    .lt(pd.to_datetime("2023-01-01")),
    ["Status", "queue_status"],
].value_counts(dropna=False).sort_index()


In [None]:
nyiso['interconnection_status_raw'].str.replace(' ', '').replace([np.nan, None, 'None'], '').str.split(',').apply(lambda x: ','.join(sorted(set(x)))).value_counts(dropna=False)


In [None]:
nyiso['Availability of Studies'].str.replace(r'[ \d]', '', regex=True).replace([np.nan, None, 'None'], '').str.split(',').apply(lambda x: ','.join(sorted(set(x).difference({'CY'})))).value_counts(dropna=False)


In [None]:
def _sorted_unique_codes(codes: pd.Series, exclude: frozenset = frozenset()) -> pd.Series:
    """Turn comma-separated code strings into sorted, de-duplicated ones.

    Nulls and the literal string "None" are treated as empty; entries listed
    in ``exclude`` are dropped; results that end up empty become NaN.
    """
    return (
        codes.replace([np.nan, None, "None"], "")
        .str.split(",")
        .apply(lambda parts: ",".join(sorted(set(parts) - exclude)))
        .replace('', np.nan)
    )


# GridStatus side: strip spaces and digits first, and ignore the "CY" entry.
nyiso["simplified_studies"] = _sorted_unique_codes(
    nyiso["Availability of Studies"].str.replace(r"[ \d]", "", regex=True),
    exclude=frozenset({"CY"}),
)
# LBNL side: only spaces need stripping.
nyiso["simplified_lbnl_raw"] = _sorted_unique_codes(
    nyiso["interconnection_status_raw"].str.replace(" ", "")
)


In [None]:
nyiso['S'].value_counts(dropna=False)


In [None]:
pd.to_numeric(nyiso['S'].astype('string').str.split(',').str[-1], errors='raise').astype(pd.UInt8Dtype()).value_counts(dropna=False)


In [None]:
nyiso["s_simplified"] = pd.to_numeric(nyiso['S'].astype('string').str.split(',').str[-1], errors='raise').astype(pd.UInt8Dtype())


In [None]:
nyiso[["queue_status", "simplified_studies"]].value_counts(dropna=False).sort_index()


In [None]:
nyiso[["simplified_lbnl_raw", "simplified_studies"]].value_counts(dropna=False).sort_index()


In [None]:
nyiso["simplified_lbnl_raw"].astype('string').eq(nyiso["simplified_studies"].astype('string')).value_counts(dropna=False)


In [None]:
nyiso[["queue_status", "s_simplified"]].value_counts(dropna=False).sort_index()


In [None]:
nyiso[["s_simplified", "simplified_studies"]].value_counts(dropna=False).sort_index()


In [None]:
nyiso[["s_simplified", "simplified_lbnl_raw"]].value_counts(dropna=False).sort_index()


In [None]:
nyiso[["interconnection_status_lbnl", "s_simplified"]].value_counts(dropna=False).sort_index()


In [None]:
nyiso["Queue Date"].eq(pd.to_datetime(nyiso["queue_date"])).where(
    (nyiso["Queue Date"].notna() & nyiso["queue_date"].notnull())
).value_counts(dropna=False)


In [None]:
# needs standardization
raw_nyiso['Generation Type'].value_counts(dropna=False)


### NYISO Capacity Comparison
- Compare total capacity for projects in lbnl and gs
- Compare total capacity for active projects in lbnl and gs
- Compare total capacity for all projects
- Compare capacity by fuel type? Might be challenging because the categories are all over the place

#### Compare total capacity for projects in both

In [None]:
both_nyiso = nyiso[nyiso.in_lbnl & nyiso.in_gs].copy()
print(nyiso.shape)
print(both_nyiso.shape)

In [None]:
both_nyiso.info()

In [None]:
gs_nyiso_cap = both_nyiso["Capacity (MW)"].sum()
lbnl_nyiso_cap = both_nyiso["capacity_mw_resource_1"].sum()

print(gs_nyiso_cap)
print(lbnl_nyiso_cap)
print(lbnl_nyiso_cap / gs_nyiso_cap)

Ok! Projects that exist in both have very similar total capacities! That's a good start. I could look into which projects have different capacity values but the amount is so minor that I'm going to skip for now.

#### Compare total capacity for **active** projects in both

In [None]:
both_nyiso["queue_status"].value_counts()

In [None]:
both_nyiso[["Status", "queue_status"]].value_counts(dropna=False)

Some projects that are active in LBNL are marked withdrawn in GridStatus, maybe because the GS data is a bit fresher? If so, the withdrawal dates should be after 2022-12-31.

In [None]:
pd.to_datetime(both_nyiso[(both_nyiso["Status"] == "Withdrawn") & (both_nyiso["queue_status"] == "active")]["Withdrawn Date"]).dt.year.value_counts()

Great! That explains it. This means we can just compare projects marked active in lbnl

In [None]:
active_both_nyiso = both_nyiso[(both_nyiso["queue_status"] == "active")]
gs_nyiso_cap = active_both_nyiso["Capacity (MW)"].sum()
lbnl_nyiso_cap = active_both_nyiso["capacity_mw_resource_1"].sum()

print(gs_nyiso_cap)
print(lbnl_nyiso_cap)
print(lbnl_nyiso_cap / gs_nyiso_cap)

The capacity totals are similar enough that I'm comfortable with them.

#### Compare total capacity for all projects

In [None]:
nyiso[["in_gs", "in_lbnl"]].value_counts(dropna=False)


In [None]:
nyiso[["Status", "queue_status"]].value_counts(dropna=False)

In [None]:
# Share of each source's own NYISO capacity that belongs to projects missing
# from the other source.
gs_only = nyiso["in_gs"] & ~nyiso["in_lbnl"]
lbnl_only = nyiso["in_lbnl"] & ~nyiso["in_gs"]
print(
    nyiso.loc[gs_only, "Capacity (MW)"].sum()
    / nyiso.loc[nyiso["in_gs"], "Capacity (MW)"].sum()
)
# The LBNL share must use LBNL's capacity column for the denominator as well;
# dividing by the GridStatus "Capacity (MW)" column mixes the two sources.
print(
    nyiso.loc[lbnl_only, "capacity_mw_resource_1"].sum()
    / nyiso.loc[nyiso["in_lbnl"], "capacity_mw_resource_1"].sum()
)

Projects in GS but not in LBNL account for 27% of total capacity in GS NYISO.
Projects in LBNL but not in GS account for 4% of total capacity in LBNL NYISO.

Why does GS have so much more capacity than LBNL here? Do we care if GS has more capacity than LBNL given GS more closely resembles the source data? Is LBNL doing deduplication work behind the scenes?

### MISO
Very good project coverage, but the IA status categories are a mess.

In [None]:
raw_miso.columns


In [None]:
# I don't know which is correct (if either), but I'll guess that the later one is. So keep='last'
# Nearly whole-row duplicate, except for "studyPhase"
raw_miso.loc[raw_miso.duplicated(subset='Queue ID', keep=False), :].sort_values(by='Queue ID')


In [None]:
# I don't know which is correct (if either), but I'll guess that the later one is. So keep='last'
# Nearly whole-row duplicate, except for "studyPhase"
lbnl_miso = lbnl.query('entity == "MISO"')
lbnl_miso[lbnl_miso.duplicated(subset="queue_id", keep=False)].sort_values(by="queue_id")


In [None]:
miso = join_lbnl(raw_miso.drop_duplicates(subset='Queue ID', keep='last'), lbnl.query('entity == "MISO"').drop_duplicates(subset='queue_id', keep='last'), "MISO")
miso.info()


In [None]:
miso.sample(4)


In [None]:
miso[["in_gs", "in_lbnl"]].value_counts(dropna=False)


In [None]:
# somehow makes things worse?
miso.loc[
    pd.to_datetime(miso["Queue Date"].str.replace(r'\d{2}:\d{2}:\d{2}Z$', '', regex=True))  # remove time
    .fillna(pd.to_datetime("2020-01-01"))
    .lt(pd.to_datetime("2023-01-01")),
    ["in_gs", "in_lbnl"],
].value_counts(dropna=False)


In [None]:
pd.to_datetime(miso["Queue Date"].str.replace(r'\d{2}:\d{2}:\d{2}Z$', '', regex=True)).describe()


In [None]:
pd.to_datetime(miso['queue_date']).describe()


In [None]:
miso[["Status", "queue_status"]].value_counts(dropna=False).sort_index()


In [None]:
miso[['queue_status', 'interconnection_status_raw']].value_counts(dropna=False).sort_index()


In [None]:
miso['interconnection_status_lbnl'].value_counts(dropna=False)


In [None]:
miso.loc[:, ['studyPhase', 'interconnection_status_raw']].value_counts(dropna=False).sort_index()


In [None]:
miso.loc[:, ['studyPhase', 'interconnection_status_lbnl']].value_counts(dropna=False).sort_index()


In [None]:
# inconsistent mix of fuel/tech type
raw_miso['Generation Type'].value_counts(dropna=False)


### SPP

* Neither LBNL nor GridStatus has withdrawn projects
* GridStatus destroys the detailed status information during their processing, so we lose the ability to distinguish between "IA pending" and "System Integration Study". But I don't think that is a problem because both are included in the "actionable" criteria in Synapse's model.

In [None]:
raw_spp.columns


In [None]:
spp = join_lbnl(raw_spp, lbnl, "SPP")
spp.info()


In [None]:
spp.sample(4)


In [None]:
spp[["in_gs", "in_lbnl"]].value_counts(dropna=False)


In [None]:
spp.loc[
    pd.to_datetime(spp["Queue Date"])
    .fillna(pd.to_datetime("2020-01-01"))
    .lt(pd.to_datetime("2023-01-01")),
    ["in_gs", "in_lbnl"],
].value_counts(dropna=False)


In [None]:
spp[["Status", "queue_status"]].value_counts(dropna=False)


In [None]:
spp.loc[
    pd.to_datetime(spp["Queue Date"])
    .fillna(pd.to_datetime("2020-01-01"))
    .lt(pd.to_datetime("2023-01-01")),
    ["Status", "queue_status"],
].value_counts(dropna=False)


In [None]:
# it turns out these values come from the raw "status" values, which GridStatus overwrites 😡
spp['interconnection_status_raw'].value_counts(dropna=False)


In [None]:
spp[['interconnection_status_lbnl', 'interconnection_status_raw']].value_counts(dropna=False).sort_index()


In [None]:
spp['Current Cluster'].str.replace(r'-\d{4}-\d{3}(?:-\d)?', '', regex=True).value_counts(dropna=False)  # remove date/ID


In [None]:
spp['cluster_simplified'] = spp['Current Cluster'].str.replace(r'-\d{4}-\d{3}(?:-\d)?', '', regex=True) # remove date/ID


In [None]:
spp[['interconnection_status_raw', 'cluster_simplified']].value_counts(dropna=False).sort_index()


In [None]:
spp[['interconnection_status_lbnl', 'cluster_simplified']].value_counts(dropna=False).sort_index()


In [None]:
# needs standardization
raw_spp['Generation Type'].value_counts(dropna=False)


### PJM
Like MISO, good project coverage, but the IA status categories are a mess.

"Active" applied up to IA execution. Then "Engineering and Procurement" applied to IA execution through COD. Then "In Service".

In [None]:
raw_pjm.columns


In [None]:
# "Active" stops at IA execution
raw_pjm[['Interim/Interconnection Service Agreement Status', 'Status']].value_counts(dropna=False).sort_index()


In [None]:
pjm = join_lbnl(raw_pjm, lbnl, "PJM")
pjm.info()


In [None]:
pjm.sample(4)


In [None]:
pjm['Wholesale Market Participation Agreement'].value_counts(dropna=False)


In [None]:
# no status column for this one for some reason
pjm['wholesale_not_none'] = pjm['Wholesale Market Participation Agreement'].notna()


In [None]:
pjm[["in_gs", "in_lbnl"]].value_counts(dropna=False)


In [None]:
pjm.loc[
    pd.to_datetime(pjm["Queue Date"])
    .fillna(pd.to_datetime("2020-01-01"))
    .lt(pd.to_datetime("2023-01-01")),
    ["in_gs", "in_lbnl"],
].value_counts(dropna=False)


In [None]:
pjm[["queue_status", "Status"]].value_counts(dropna=False).sort_index()


In [None]:
pjm['interconnection_status_raw'].value_counts(dropna=False)


In [None]:
# lots of IA status info encoded in various status columns. Have to figure out how to reconstruct the LBNL definitions
status_cols = [
    'Feasibility Study Status',
    'System Impact Study Status',
    'Facilities Study Status',
    'Interim/Interconnection Service Agreement Status',
    # 'wholesale_not_none',  # redundant with IA status "Wholesale Market Participation Agreement"
    'Construction Service Agreement Status',
    'Upgrade Construction Service Agreement Status'
]
with pd.option_context('display.max_rows', None):
    display(pjm.loc[pjm['queue_status'].eq('active'), status_cols[:4] + ['interconnection_status_lbnl']].value_counts(dropna=False).sort_index())


In [None]:
# needs standardization. Also has a long tail of multivalued entries.
raw_pjm['Generation Type'].value_counts(dropna=False)


### CAISO
Straightforward!

In [None]:
raw_caiso.columns


In [None]:
caiso = join_lbnl(raw_caiso, lbnl, "CAISO")
caiso.info()


In [None]:
caiso.query("Status == 'ACTIVE'").sample(8, random_state=42).sort_values('interconnection_status_lbnl')


In [None]:
caiso[["in_gs", "in_lbnl"]].value_counts(dropna=False)


In [None]:
caiso.loc[
    pd.to_datetime(caiso["Queue Date"])
    .fillna(pd.to_datetime("2020-01-01"))
    .lt(pd.to_datetime("2023-01-01")),
    ["in_gs", "in_lbnl"],
].value_counts(dropna=False)


In [None]:
# remarkably easy to match status values
caiso[["Status", "queue_status"]].value_counts(dropna=False).sort_index()


In [None]:
caiso['interconnection_status_raw'].value_counts(dropna=False)


In [None]:
caiso[['interconnection_status_lbnl', 'interconnection_status_raw']].value_counts(dropna=False).sort_index()


In [None]:
caiso['Study Process'].value_counts(dropna=False).sort_index()


In [None]:
# I don't see any pattern to the Study Process values.
status_cols_caiso = [
    'Interconnection Agreement Status',
    'Facilities Study (FAS) or Phase II Cluster Study',
    'System Impact Study or Phase I Cluster Study',
    'Study Process',
]
with pd.option_context('display.max_rows', None):
    display(caiso[status_cols_caiso[:-1]].replace(['None'], np.nan).value_counts(dropna=False).where(lambda x: x>2).dropna().astype(int).sort_index())


In [None]:
with pd.option_context('display.max_rows', None):
    display(caiso.query("queue_status == 'active'")[status_cols_caiso[:-1] + ['interconnection_status_lbnl']].replace(['None'], np.nan).value_counts(dropna=False).sort_index())


In [None]:
# categories look standardized, but need to handle multivalued-ness
raw_caiso['Generation Type'].value_counts(dropna=False)


### ISO-NE

* what GridStatus calls "Queue ID" was actually "Queue Position" in the original data. It is unique amongst active projects but not for withdrawn projects. There is no natural key for withdrawn projects.

In [None]:
raw_isone.columns


In [None]:
raw_isone.duplicated(subset=['Queue ID']).sum()


In [None]:
raw_isone.loc[raw_isone.duplicated(subset=['Queue ID'], keep=False), 'Status'].value_counts(dropna=False)


In [None]:
raw_isone['Status'].value_counts(dropna=False)


In [None]:
compound_key = ['Queue ID', 'Status']
raw_isone.duplicated(subset=compound_key).sum()


In [None]:
# there is no natural key for this data. But this looks like the best tradeoff between key complexity and uniqueness.
compound_key = ['Queue ID', 'Project Name']
raw_isone.duplicated(subset=compound_key).sum()


In [None]:
# Queue ID is unique within the Active projects. Not sure how change over time will impact the join to LBNL.
raw_isone.query('Status == "Active"')['Queue ID'].is_unique


In [None]:
# duplicates (none Active) have all kinds of variation in values. Maybe best approach is to sort by date updated and take the latest one. It doesn't always differentiate them though.
raw_isone.loc[raw_isone.duplicated(subset=compound_key, keep=False),:].sort_values(by=compound_key).tail(10)


In [None]:
raw_isone['Updated'] = pd.to_datetime(raw_isone['Updated'])


In [None]:
# join manually rather than refactoring the func to take compound key
# Use a dedicated temporary name: `lbnl_iso` already holds the LBNL table with
# non-ISO regions removed, and rebinding/deleting it here would break a rerun
# of the cells that use it.
lbnl_isone = lbnl.loc[lbnl["entity"].eq("ISO-NE"), LBNL_JOIN_COLS].astype({"queue_id": int})
# LBNL spells the key columns in snake_case ("Queue ID" -> "queue_id").
lbnl_key = [c.lower().replace(" ", "_") for c in compound_key]
# keep the most recently updated row for each compound key
isone = raw_isone.sort_values("Updated").drop_duplicates(subset=compound_key, keep="last")
isone = isone.merge(lbnl_isone, how="outer", left_on=compound_key, right_on=lbnl_key)
isone["in_lbnl"] = isone["queue_id"].notna()
isone["in_gs"] = isone["Queue ID"].notna()
del lbnl_isone
isone.info()


In [None]:
isone.head(2)


In [None]:
isone[["in_gs", "in_lbnl"]].value_counts(dropna=False)


In [None]:
# small improvement from date filter
isone.loc[
    pd.to_datetime(isone["Queue Date"])
    .fillna(pd.to_datetime("2020-01-01"))
    .lt(pd.to_datetime("2023-01-01")),
    ["in_gs", "in_lbnl"],
].value_counts(dropna=False)


In [None]:
# status values are decently aligned
isone[["Status", "queue_status"]].value_counts(dropna=False).sort_index()


In [None]:
isone['interconnection_status_raw'].value_counts(dropna=False)


In [None]:
isone[['interconnection_status_lbnl', 'interconnection_status_raw']].value_counts(dropna=False).sort_index()


In [None]:
status_cols_isone = [
    "FS",
    "SIS",
    "OS",
    "FAC",
    "IA",
    "Project Status",
]
with pd.option_context('display.max_rows', None):
    display(isone[status_cols_isone].replace(['None'], np.nan).value_counts(dropna=False).where(lambda x: x>2).dropna().astype(int).sort_index())


In [None]:
# gridstatus doesn't parse the status values, so they are all null. In their defense, the ISONE encodes them as alt text behind icons, which is stupid. But still.
isone[status_cols_isone]


In [None]:
raw_isone['Generation Type'].value_counts(dropna=False)
