# Update interconnection FYI data and validate against LBNL + GridStatus data

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import dbcp
from dbcp.extract.helpers import cache_gcs_archive_file_locally
from dbcp.helpers import get_sql_engine

In [None]:
pd.set_option('display.max_columns', None)

# Raw Data

In [None]:
# Extract the previous FYI archive and keep only its "fyi_queue" entry.
old_fyi_uri = "gs://dgm-archive/interconnection.fyi/interconnection_fyi_dataset_2025-11-04.csv"
old_fyi = dbcp.extract.fyi_queue.extract(old_fyi_uri)["fyi_queue"]

In [None]:
# Extract the latest FYI archive and keep only its "fyi_queue" entry.
new_fyi_uri = "gs://dgm-archive/interconnection.fyi/interconnection_fyi_dataset_2025-12-01.csv"
new_fyi = dbcp.extract.fyi_queue.extract(new_fyi_uri)["fyi_queue"]

## Compare max dates of raw data
Print out the latest date a project entered a queue for each ISO in the old and new data. We should expect the latest project date in the new data to be later than that of the old data. Notable exceptions:
* PJM: PJM [is working through a backlog of projects](https://www.utilitydive.com/news/pjm-fast-track-reliability-projects-interconnection-queue-invenergy/729311/) and isn't accepting new projects until mid 2026.

In [None]:
# Compare the most recent queue-entry date per power market in the old and new
# raw extracts. The dates are parsed into standalone Series instead of being
# assigned back onto a boolean-filtered slice, which avoids pandas'
# SettingWithCopyWarning and leaves old_fyi / new_fyi untouched.
for power_market in old_fyi.power_market.unique():
    print(power_market)
    old_dates = pd.to_datetime(old_fyi.loc[old_fyi.power_market == power_market, "queue_date"])
    new_dates = pd.to_datetime(new_fyi.loc[new_fyi.power_market == power_market, "queue_date"])

    print(f" - Old max date {old_dates.max()}")
    print(f" - New max date {new_dates.max()}")
    print()

# Create data warehouse and data mart tables

Update the URI of the archived FYI data in `dbcp.etl.etl_fyi_queue`. Then run `make all` to generate the data warehouse and data mart tables. Debug any errors that arise.

## Compare data warehouse tables to raw data

In [None]:
# Read the three FYI data warehouse tables over a single connection.
engine = get_sql_engine()
warehouse_schema = "private_data_warehouse"
with engine.connect() as con:
    fyi_locations, fyi_projects, fyi_res_cap = (
        pd.read_sql_table(warehouse_table, con, schema=warehouse_schema)
        for warehouse_table in ("fyi_locations", "fyi_projects", "fyi_resource_capacity")
    )

We deduplicate the data so there are project IDs in the raw data that aren't in the data warehouse tables, but ensure that we're not losing an unexpectedly high number. During the creation of the data warehouse tables we log how many projects are dropped because they are found to be duplicates. Make sure that no table is missing many more than that number of IDs. The location table will have more missing IDs because there is more nullness in the location columns than in the capacity columns.

In [None]:
# Count raw project IDs that are absent from each data warehouse table.
# The raw ID set is built once, and each count is labelled so it can be compared
# against the number of duplicates logged while the warehouse tables were built.
raw_project_ids = set(new_fyi.unique_id)
for label, warehouse_df in (
    ("fyi_projects", fyi_projects),
    ("fyi_locations", fyi_locations),
    ("fyi_resource_capacity", fyi_res_cap),
):
    n_missing = len(raw_project_ids - set(warehouse_df.project_id))
    print(f"{label}: {n_missing} raw project IDs missing")

## Compare new data mart tables to the old
Compare the old and new total active capacity in regions.

### How to grab the new data
To get the new data, replace the URI in `dbcp.etl.etl_fyi_queue` with the updated GCS URI. Then run `make all`. There might be some data validation errors due to small changes in the expected number of projects. If the changes seem reasonable, just update the expected value in the assertion. If they don't seem reasonable, do some digging!

Once the ETL successfully finishes, the new data is available in the database.

<!-- - download the `dev` data to compare to
- load the relevant tables

data warehouse
- check the old and new iso have a similar n and capacity
- plot total capacity


data mart:
- total capacity, n_projects and max date have all the same: caiso, ercot, pjm
- total capacity, n_projects and max date have all increased: miso, pjm, spp, nyiso, isone
- withdrawn and in service capacity have increased: miso, pjm, spp, nyiso, isone

- active capacity has changed for isos in GS_REGIONS
- how much has the active capacity changed by? -->

In [None]:
# Read the fyi_projects_long_format table from the private_data_mart schema;
# this is the new data produced by the ETL run described above.
engine = get_sql_engine()
with engine.connect() as con:
    fyi_projects_long_format = pd.read_sql_table("fyi_projects_long_format", con, schema="private_data_mart")

### How to grab the old data
The following code grabs the latest version number for data in the development datasets then downloads the parquet file.

In [None]:
from google.cloud import bigquery

def get_bigquery_table_version(dataset_id: str, table_name: str, project_id: str = "dbcp-dev-350818") -> str:
    """
    Get the data version of a BigQuery table.

    The dbcp.commands.publish script generates a version number for each data release
    and adds it as a label to the BQ tables.

    Args:
        dataset_id: the BQ dataset ID
        table_name: the name of the table
        project_id: the GCP project id

    Returns:
        the value of the ``version`` label on the requested table

    Raises:
        KeyError: if the table has no ``version`` label
    """
    client = bigquery.Client()

    table_ref = f"{project_id}.{dataset_id}.{table_name}"
    table = client.get_table(table_ref)  # Fetch table metadata

    labels = table.labels  # Get the labels dictionary
    try:
        return labels["version"]
    except KeyError as err:
        # Same exception type as before, but say which table is unlabelled.
        raise KeyError(
            f"BigQuery table {table_ref} has no 'version' label; found labels: {dict(labels)}"
        ) from err

In [None]:
from dbcp.extract.helpers import cache_gcs_archive_file_locally  # also imported in the first import cell

table_name = "fyi_projects_long_format"
# Version label currently attached to the table in the private_data_mart_dev dataset.
version = get_bigquery_table_version("private_data_mart_dev", table_name)
# GCS location of the parquet output for that version.
uri = f"gs://dgm-outputs/{version}/private_data_mart/{table_name}.parquet"
data_cache = "/app/data/gcp_outputs"

# presumably downloads the file into data_cache when needed and returns the local path — confirm against the helper
fyi_projects_long_format_path = cache_gcs_archive_file_locally(uri, data_cache)
old_fyi_projects_long_format = pd.read_parquet(fyi_projects_long_format_path)

In [None]:
def agg_iso_projects_long_format(df, iso_col, id_col):
    """Summarise the projects of each ISO.

    Args:
        df: long-format projects dataframe containing ``iso_col``, ``id_col``,
            ``capacity_mw`` and ``date_entered_queue`` columns
        iso_col: name of the column to group by
        id_col: name of the column whose non-null values are counted

    Returns:
        A dataframe indexed by ``iso_col`` with the columns ``n_projects``,
        ``total_capacity_mw`` and ``max_date_entered_queue``.
    """
    return df.groupby(iso_col).agg(
        n_projects=(id_col, "count"),
        total_capacity_mw=("capacity_mw", "sum"),
        max_date_entered_queue=("date_entered_queue", "max"),
    )

In [None]:
new_fyi_project_agg = agg_iso_projects_long_format(fyi_projects_long_format, "power_market", "project_id")
old_fyi_project_agg = agg_iso_projects_long_format(old_fyi_projects_long_format, "power_market", "project_id")

In [None]:
new_fyi_project_agg.max_date_entered_queue

In [None]:
both_project_aggs = old_fyi_project_agg.merge(new_fyi_project_agg, left_index=True, right_index=True, validate="1:1", suffixes=("_old", "_new"))
both_project_aggs

In [None]:
# Add a fractional-change column (new relative to old) for every non-datetime metric
non_datetime_metrics = [
    metric
    for metric in old_fyi_project_agg.columns
    if not pd.api.types.is_datetime64_any_dtype(old_fyi_project_agg[metric])
]
for metric in non_datetime_metrics:
    old_values = both_project_aggs[f"{metric}_old"]
    new_values = both_project_aggs[f"{metric}_new"]
    both_project_aggs[f"{metric}_pct_diff"] = (new_values - old_values) / old_values

In [None]:
# NOTE(review): a bare expression that is not the last statement of a notebook
# cell is evaluated but not rendered, so this line produces no output — confirm
# whether the table was meant to be displayed.
old_fyi_project_agg

# Percent change (new vs. old) in project count and total capacity, ordered by old total capacity.
both_project_aggs.sort_values(by="total_capacity_mw_old", ascending=False)[["n_projects_pct_diff", "total_capacity_mw_pct_diff"]] * 100

Make sure there isn't a surprising change in total capacity between the old and new data. We currently don't expect the active capacity to change that much in the span of a month. The `max_change` value is an arbitrary number, so dig into the data if something looks fishy to you.

It's challenging to validate total capacity changes in ISOs. If there is an unexpected change, I would check the ISO's website to see if they changed their study process. For example, there was a surprising drop in active capacity in NYISO during the 2024 Q4 update. It turns out they [changed their study process](https://www.utilitydive.com/news/new-york-iso-reforms-interconnection-queue-launches-cluster-study/724054/) and the layout of the spreadsheet Gridstatus pulls in. Sites like S&P and Utility Dive might have relevant information.

In [None]:
# Fail if any ISO's total active capacity moved by max_change or more between
# the old and new data. max_change is a fraction (0.2 == 20%). NaN differences
# compare as False in .lt(), so they are reported as offenders as well.
mw_pct_diff = both_project_aggs["total_capacity_mw_pct_diff"].abs()
max_change = 0.2
within_tolerance = mw_pct_diff.lt(max_change)
assert within_tolerance.all(), (
    f"Substantial change (>= {max_change:.0%}) in an ISO's interconnection queue active capacity:\n"
    f"{mw_pct_diff[~within_tolerance]}"
)

In [None]:
both_project_aggs["total_capacity_mw_pct_diff"] * 100

## Compare to LBNL + GridStatus ISO queue data

In [None]:
engine = get_sql_engine()
with engine.connect() as con:
    iso_projects_long_format = pd.read_sql_table("iso_projects_long_format", con, schema="data_mart")

In [None]:
iso_projects_long_format.queue_status.value_counts()

In [None]:
from dbcp.data_mart.projects import create_long_format, create_fyi_long_format

# The dataframe this function returns includes all projects, active, withdrawn and operational. ERCOT only tracks active projects.
iso_all_projects_long_format = create_long_format(engine, active_projects_only=False)
fyi_all_projects_long_format = create_fyi_long_format(engine, active_projects_only=False)

In [None]:
iso_projects_long_format.resource_clean.value_counts()

In [None]:
fyi_projects_long_format.resource_clean.value_counts()

Compare the county coverage of the datasets

In [None]:
len(fyi_projects_long_format.county_id_fips.unique()), len(iso_projects_long_format.county_id_fips.unique())

Investigate the projects in counties that are in GS + LBNL but not FYI 

In [None]:
missing_counties = set(iso_projects_long_format.county_id_fips.unique()) - set(fyi_projects_long_format.county_id_fips.unique())
len(missing_counties)

In [None]:
# counties in FYI but not LBNL/GS
len(set(fyi_projects_long_format.county_id_fips.unique()) - set(iso_projects_long_format.county_id_fips.unique()))

In [None]:
# active FYI counties missing from all ISO projects
len(set(fyi_projects_long_format.county_id_fips.unique()) - set(iso_all_projects_long_format.county_id_fips.unique()))

In [None]:
# see if these active counties are missing from all projects
missing_counties_all_projects = set(iso_projects_long_format.county_id_fips.unique()) - set(fyi_all_projects_long_format.county_id_fips.unique())

In [None]:
len(missing_counties_all_projects)

In [None]:
iso_projects_long_format[iso_projects_long_format.county_id_fips.isin(missing_counties_all_projects)].state.value_counts()

In [None]:
iso_projects_long_format[iso_projects_long_format.county_id_fips.isin(missing_counties_all_projects)].iso_region.value_counts()

In [None]:
iso_projects_long_format[iso_projects_long_format.county_id_fips.isin(missing_counties_all_projects)].county_id_fips.value_counts()

In [None]:
iso_projects_long_format[iso_projects_long_format.county_id_fips == "51760"]

Investigate FYI projects that are missing counties. See if we can fill any in from LBNL/GS

In [None]:
fyi_projects_long_format.county_id_fips.isnull().value_counts()

In [None]:
null_fips = fyi_projects_long_format[fyi_projects_long_format.county_id_fips.isnull()]

In [None]:
null_fips.source.value_counts()

In [None]:
null_fips = null_fips[null_fips.source != "proprietary"]

In [None]:
# Derive the queue-entry year with assign() rather than writing a new column
# into a filtered slice of fyi_projects_long_format, which would raise
# pandas' SettingWithCopyWarning.
null_fips = null_fips.assign(year_entered_queue=null_fips["date_entered_queue"].dt.year)

Check if there are projects that have location information in GS/LBNL that we can fill in
in the FYI data. If there are, add these projects to the manual fill in data in
`dbcp.transform.interconnection_queue_helpers.fyi_manual_county_state_name_fill_ins`.

In [None]:
# Left-join the GS/LBNL location columns onto the FYI projects that have no county FIPS,
# to see whether any counties can be filled in.
join_keys = ["queue_id", "power_market", "resource_clean", "point_of_interconnection"]
iso_location_cols = ["county", "state", "county_id_fips", "power_market", "queue_id", "resource_clean", "point_of_interconnection"]
iso_locations = iso_projects_long_format.rename(columns={"iso_region": "power_market"})[iso_location_cols]
merged = null_fips.merge(iso_locations, how="left", on=join_keys, suffixes=("_fyi", "_iso"))

In [None]:
merged[~merged.county_iso.isnull()]

In [None]:
from dbcp.constants import FYI_RESOURCE_DICT
clean_resources = [resource for resource, codes_dict in FYI_RESOURCE_DICT.items() if codes_dict["type"] == "Renewable"]

In [None]:
len(fyi_projects_long_format[fyi_projects_long_format.resource_clean.isin(clean_resources)].county_id_fips.unique())

In [None]:
len(iso_projects_long_format[iso_projects_long_format.resource_clean.isin(clean_resources)].county_id_fips.unique())

Compare metrics between datasets for each ISO.

In [None]:
def agg_iso_projects_long_format(df, iso_col, id_col):
    """Compute per-ISO project count, total capacity and latest queue-entry date.

    Args:
        df: long-format projects dataframe containing ``iso_col``, ``id_col``,
            ``capacity_mw`` and ``date_entered_queue`` columns
        iso_col: name of the column to group by
        id_col: name of the column whose non-null values are counted

    Returns:
        A dataframe indexed by ``iso_col`` with the columns ``n_projects``,
        ``total_capacity_mw`` and ``max_date_entered_queue``.
    """
    grouped = df.groupby(iso_col)
    return grouped.agg(
        n_projects=(id_col, "count"),
        total_capacity_mw=("capacity_mw", "sum"),
        max_date_entered_queue=("date_entered_queue", "max"),
    )

fyi_project_agg = agg_iso_projects_long_format(fyi_projects_long_format, "power_market", "project_id")
iso_project_agg = agg_iso_projects_long_format(iso_projects_long_format, "iso_region", "surrogate_id")

In [None]:
fyi_project_agg.max_date_entered_queue

In [None]:
both_project_aggs = fyi_project_agg.merge(iso_project_agg, how="outer", left_index=True, right_index=True, validate="1:1", suffixes=("_fyi", "_iso"))
both_project_aggs

In [None]:
# Calculate the fractional difference of FYI relative to GS + LBNL for each non-datetime metric
for col in iso_project_agg.columns:
    if pd.api.types.is_datetime64_any_dtype(iso_project_agg[col]):
        continue
    else:
        both_project_aggs[f"{col}_pct_diff"] = (both_project_aggs[f"{col}_fyi"] - both_project_aggs[f"{col}_iso"]) / both_project_aggs[f"{col}_iso"]

Ideally there is less than a 20% difference in capacity for each region. It's expected that there will be more capacity in FYI than in GS + LBNL because data from more utilities are included in the FYI data. It's not too worrying if the differences in this chart are positive; it's more worrying if they're negative.

CAISO is updated by LBNL annually, not by quarterly GS updates, so this difference in update frequency can likely account for much of the difference in CAISO numbers.

In [None]:
# NOTE(review): a bare expression that is not the last statement of a notebook
# cell is evaluated but not rendered, so this line produces no output — confirm
# whether the table was meant to be displayed.
iso_project_agg

# Percent difference (FYI vs. GS + LBNL) in project count and total capacity, ordered by GS + LBNL total capacity.
both_project_aggs.sort_values(by="total_capacity_mw_iso", ascending=False)[["n_projects_pct_diff", "total_capacity_mw_pct_diff"]] * 100

## Dig deeper into project level changes for regions with big differences in capacity

### Start with ISOs where the FYI capacity is less than the GS capacity.

* Were projects that are not active in FYI withdrawn recently? Vice versa?

In [None]:
iso_region = "NYISO"

fyi_iso = fyi_all_projects_long_format.query("power_market == @iso_region")
gs_lbnl_iso = iso_all_projects_long_format.query("iso_region == @iso_region")

In [None]:
fyi_iso.queue_status.value_counts()

In [None]:
gs_lbnl_iso.queue_status.value_counts()

In [None]:
fyi_iso.queue_id.is_unique

In [None]:
fyi_iso[fyi_iso.queue_id.duplicated(keep=False)].head(5)

In [None]:
len(gs_lbnl_iso[gs_lbnl_iso.queue_id.duplicated()])

In [None]:
active_gs = gs_lbnl_iso[gs_lbnl_iso.queue_status == "active"]

In [None]:
not_active_fyi = fyi_iso[fyi_iso.queue_status != "active"]

In [None]:
# look at projects active in GS which are not active in FYI
not_active_fyi[not_active_fyi.queue_id.isin(active_gs.queue_id)].queue_status.value_counts()

In [None]:
# make sure projects were withdrawn recently
not_active_fyi[not_active_fyi.queue_id.isin(active_gs.queue_id)].withdrawn_date.value_counts()

In [None]:
# does this missing capacity make up the difference in total capacity?
not_active_fyi[not_active_fyi.queue_id.isin(active_gs.queue_id)].capacity_mw.sum()/active_gs.capacity_mw.sum()

In [None]:
# look at projects in GS which aren't in FYI
# it is likely that these projects were dropped during the deduplication cleaning
# step in the transform. You can spot check to make sure that a different project ID with the
# same interconnection point, capacity, resource etc. is in the data
active_gs[~active_gs.queue_id.isin(fyi_iso.queue_id)].sort_values(by="capacity_mw", ascending=False).head(10)

In [None]:
active_gs[~active_gs.queue_id.isin(fyi_iso.queue_id)].capacity_mw.sum()/active_gs.capacity_mw.sum()

In [None]:
fyi_all_projects_long_format[fyi_all_projects_long_format.queue_id == "C24-325"]

In [None]:
fyi_projects[fyi_projects.queue_id == "C24-325"]

In [None]:
fyi_projects_long_format = pd.read_parquet("/app/data/output/private_data_mart/fyi_projects_long_format.parquet")

In [None]:
new_fyi[new_fyi.queue_id == "C24-325"]

### Now look at ISOs where there is more FYI capacity than in GS/LBNL

In [None]:
iso_region = "West"

fyi_iso = fyi_all_projects_long_format.query("power_market == @iso_region")
gs_lbnl_iso = iso_all_projects_long_format.query("iso_region == @iso_region")

In [None]:
active_fyi = fyi_iso[fyi_iso.queue_status == "active"]

In [None]:
non_active_gs = gs_lbnl_iso[gs_lbnl_iso.queue_status != "active"]

In [None]:
active_gs = gs_lbnl_iso[gs_lbnl_iso.queue_status == "active"]

In [None]:
# look at projects in FYI which are not active in GS
active_in_fyi_inactive_in_gs = active_fyi[active_fyi.queue_id.isin(non_active_gs.queue_id.unique())]

In [None]:
active_in_fyi_inactive_in_gs

In [None]:
# check the status of these projects in GS/LBNL
non_active_gs[non_active_gs.queue_id.isin(active_in_fyi_inactive_in_gs.queue_id)].queue_status.value_counts()

In [None]:
# Active FYI projects with a non-null capacity whose queue ID does not appear in GS/LBNL
absent_from_gs = ~active_fyi.queue_id.isin(gs_lbnl_iso.queue_id)
has_capacity = active_fyi.capacity_mw.notnull()
active_in_fyi_not_in_gs = active_fyi[absent_from_gs & has_capacity]

In [None]:
active_in_fyi_not_in_gs.sort_values(by="capacity_mw", ascending=False)

In [None]:
active_in_fyi_not_in_gs[active_in_fyi_not_in_gs.resource_clean != "Offshore Wind"].sort_values(by="capacity_mw", ascending=False).head(10)

In [None]:
# it's worth checking the most recent raw data to see if these projects get dropped during deduplication
raw_gs = pd.read_parquet("/app/data/data_cache/gridstatus/interconnection_queues/parquet/pjm.parquet#1761671630863094")

In [None]:
# if they appear in the raw data, check if the location or resource / generation type
# would exclude the project from the data warehouse tables
raw_gs[raw_gs["Queue ID"] == "AH1-695"]

In [None]:
# check the GS data warehouse tables
gs_proj = pd.read_parquet("/app/data/output/data_warehouse/gridstatus_projects.parquet")

In [None]:
active_in_fyi_not_in_gs.resource_clean.value_counts()

In [None]:
# see if the capacity of these projects makes up the difference in GS
active_in_fyi_not_in_gs.capacity_mw.sum()/active_gs.capacity_mw.sum()

In [None]:
active_fyi.sort_values(by="capacity_mw", ascending=False).head(10)[["queue_id", "capacity_mw"]]