# Update interconnection FYI data and validate against LBNL + GridStatus data

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import dbcp
from dbcp.extract.helpers import cache_gcs_archive_file_locally
from dbcp.helpers import get_sql_engine



In [3]:
pd.set_option('display.max_columns', None)

# Raw Data

In [18]:
# Baseline snapshot: run the FYI extract on the older archive file and keep
# only the "fyi_queue" entry of the result.
old_fyi = dbcp.extract.fyi_queue.extract(
    "gs://dgm-archive/interconnection.fyi/interconnection_fyi_dataset_2025-10-01.csv"
)["fyi_queue"]

In [4]:
# Updated snapshot: run the FYI extract on the newer archive file and keep
# only the "fyi_queue" entry of the result.
new_fyi = dbcp.extract.fyi_queue.extract(
    "gs://dgm-archive/interconnection.fyi/interconnection_fyi_dataset_2025-11-04.csv"
)["fyi_queue"]

## Compare max dates of raw data
Print out the latest date a project entered a queue for each ISO in the old and new data. We should expect the latest project date in the new data to be later than that of the old data. Notable exceptions:
* PJM: PJM [is working through a backlog of projects](https://www.utilitydive.com/news/pjm-fast-track-reliability-projects-interconnection-queue-invenergy/729311/) and isn't accepting new projects until mid 2026.

In [None]:
# Report the latest queue-entry date per power market in the old and new snapshots.
# The dates are parsed once into new Series instead of being assigned back into
# filtered slices of the raw frames: assigning into a slice raises
# SettingWithCopyWarning and is not guaranteed to change the column's dtype.
old_queue_dates = pd.to_datetime(old_fyi["queue_date"])
new_queue_dates = pd.to_datetime(new_fyi["queue_date"])

for power_market in old_fyi.power_market.unique():
    print(power_market)
    # The boolean masks share the raw frames' index, so they align with the parsed dates.
    old_max_date = old_queue_dates[old_fyi.power_market == power_market].max()
    new_max_date = new_queue_dates[new_fyi.power_market == power_market].max()

    print(f" - Old max date {old_max_date}")
    print(f" - New max date {new_max_date}")
    print()

## Compare data warehouse tables to raw data

In [4]:
engine = get_sql_engine()
# Read the three FYI tables from the private data warehouse schema over one connection.
with engine.connect() as con:
    fyi_locations, fyi_projects, fyi_res_cap = (
        pd.read_sql_table(warehouse_table, con, schema="private_data_warehouse")
        for warehouse_table in ("fyi_locations", "fyi_projects", "fyi_resource_capacity")
    )

# Offshore Wind

In [8]:
# Read both offshore wind tables from the data warehouse schema over one connection.
with engine.connect() as con:
    wind_cable, wind_proj = (
        pd.read_sql_table(warehouse_table, con, schema="data_warehouse")
        for warehouse_table in ("offshore_wind_cable_landing_association", "offshore_wind_projects")
    )

In [10]:
wind_cable

Unnamed: 0,location_id,project_id
0,8,32
1,84,32
2,51,20
3,52,20
4,41,11
...,...,...
56,3,3
57,62,3
58,45,7
59,43,47


We deduplicate the data, so some project IDs in the raw data won't appear in the data warehouse tables. The goal here is to ensure that we're not losing an unexpectedly high number of them. During the creation of the data warehouse tables we log how many projects are dropped because they are found to be duplicates. Make sure that no table is missing many more IDs than that number. The location table will have more missing IDs because there is more nullness in the location columns than in the capacity columns.

In [9]:
# For each warehouse table, count the raw FYI unique_ids that have no matching project_id.
raw_fyi_ids = set(new_fyi.unique_id)
for warehouse_table in (fyi_projects, fyi_locations, fyi_res_cap):
    print(len(raw_fyi_ids - set(warehouse_table.project_id)))

3358
4078
3358


## Compare data mart tables
Compare the old and new total active capacity in regions.

### How to grab the new data
To get the new data, replace the URI in `dbcp.etl.etl_fyi_queue` with the updated GCS URI. Then run `make all`. There might be some data validation errors due to small changes in the expected number of projects. If the changes seem reasonable, just update the expected value in the assertion. If they don't seem reasonable, do some digging!

Once the ETL successfully finishes, the new data is available in the database.

<!-- - download the `dev` data to compare to
- load the relevant tables

data warehouse
- check the old and new iso have a similar n and capacity
- plot total capacity


data mart:
- total capacity, n_projects and max date have all the same: caiso, ercot, pjm
- total capacity, n_projects and max date have all increased: miso, pjm, spp, nyiso, isone
- withdrawn and in service capacity have increased: miso, pjm, spp, nyiso, isone

- active capacity has changed for isos in GS_REGIONS
- how much has the active capacity changed by? -->

In [17]:
engine = get_sql_engine()
# Newly generated FYI data mart table, read from the local database.
with engine.connect() as con:
    fyi_projects_long_format = pd.read_sql_table(
        table_name="fyi_projects_long_format",
        con=con,
        schema="private_data_mart",
    )

### How to grab the old data
The following code grabs the latest version number for data in the development datasets then downloads the parquet file.

In [4]:
from google.cloud import bigquery

def get_bigquery_table_version(dataset_id, table_name, project_id="dbcp-dev-350818"):
    """
    Look up the data version recorded on a BigQuery table.

    The dbcp.commands.publish script generates a version number for each data
    release and stores it as the "version" label on the BQ tables; this reads
    that label back.

    Args:
        dataset_id: the BQ dataset ID that contains the table
        table_name: the name of the table within the dataset
        project_id: the GCP project id that owns the dataset

    Returns:
        the value of the table's "version" label

    Raises:
        KeyError: if the table has no "version" label
    """
    bq_client = bigquery.Client()
    # Fetch the table metadata using its fully qualified "project.dataset.table" ID.
    table_metadata = bq_client.get_table(f"{project_id}.{dataset_id}.{table_name}")
    return table_metadata.labels["version"]

In [4]:
from dbcp.extract.helpers import cache_gcs_archive_file_locally

# Build the GCS URI of the published parquet file from the version label on the dev table.
table_name = "fyi_projects_long_format"
version = get_bigquery_table_version("private_data_mart_dev", table_name)
# NOTE(review): the BigQuery dataset is "private_data_mart_dev" but the URI folder is
# "data_mart" — confirm this is where the private table is published.
uri = f"gs://dgm-outputs/{version}/data_mart/{table_name}.parquet"
data_cache = "/app/data/gcp_outputs"

# Cache the file locally, then read from the path that was just returned. The original
# cell read `iso_projects_long_format_path`, which is never defined in this notebook.
fyi_projects_long_format_path = cache_gcs_archive_file_locally(uri, data_cache)
old_fyi_projects_long_format = pd.read_parquet(fyi_projects_long_format_path)

## Compare to LBNL + GridStatus ISO queue data

In [20]:
engine = get_sql_engine()
# LBNL + GridStatus data mart table, read from the local database.
with engine.connect() as con:
    iso_projects_long_format = pd.read_sql_table(
        table_name="iso_projects_long_format",
        con=con,
        schema="data_mart",
    )

In [5]:
iso_projects_long_format.queue_status.value_counts()

active    10350
Name: queue_status, dtype: int64

In [26]:
iso_projects_long_format.resource_clean.value_counts()

Solar                    4753
Battery Storage          3593
Onshore Wind              936
Natural Gas               469
Unknown                   274
Offshore Wind              68
Other                      52
Hydro                      46
Geothermal                 34
Oil                        30
Nuclear                    28
Coal                       22
Other Storage              17
Biofuel                    11
Pumped Storage              8
Municipal Solid Waste       4
Biomass                     4
Hydrogen                    1
Name: resource_clean, dtype: int64

In [27]:
fyi_projects_long_format.resource_clean.value_counts()

Solar              4736
Battery Storage    3024
Onshore Wind        954
Natural Gas         642
Other               558
Offshore Wind        68
Hydro                47
Nuclear              44
Geothermal           42
Coal                 23
Biofuel              20
Oil                  18
Pumped Storage       15
Biomass              12
Other Storage        12
Waste Heat            4
Name: resource_clean, dtype: int64

Compare the county coverage of the datasets

In [28]:
len(fyi_projects_long_format.county_id_fips.unique()), len(iso_projects_long_format.county_id_fips.unique())

(1886, 1912)

In [30]:
from dbcp.constants import FYI_RESOURCE_DICT

# Resource names whose FYI_RESOURCE_DICT entry has type "Renewable".
clean_resources = [
    resource_name
    for resource_name in FYI_RESOURCE_DICT
    if FYI_RESOURCE_DICT[resource_name]["type"] == "Renewable"
]

In [31]:
len(fyi_projects_long_format[fyi_projects_long_format.resource_clean.isin(clean_resources)].county_id_fips.unique())

1812

In [32]:
len(iso_projects_long_format[iso_projects_long_format.resource_clean.isin(clean_resources)].county_id_fips.unique())

1850

Compare metrics between datasets for each ISO.

In [33]:
def agg_iso_projects_long_format(df, iso_col, id_col):
    """Summarize a long-format projects table per region.

    Args:
        df: long-format projects dataframe with "capacity_mw" and
            "date_entered_queue" columns.
        iso_col: name of the column to group by (the region / market).
        id_col: name of the identifier column whose non-null values are counted.

    Returns:
        A dataframe indexed by ``iso_col`` with the columns "n_projects",
        "total_capacity_mw" and "max_date_entered_queue".
    """
    # Named aggregation produces the final column names directly, in this order.
    return df.groupby(iso_col).agg(
        n_projects=(id_col, "count"),
        total_capacity_mw=("capacity_mw", "sum"),
        max_date_entered_queue=("date_entered_queue", "max"),
    )

# The two sources name their region and project-identifier columns differently,
# so each call passes its own column names to the shared aggregation helper.
fyi_project_agg = agg_iso_projects_long_format(fyi_projects_long_format, "power_market", "project_id")
iso_project_agg = agg_iso_projects_long_format(iso_projects_long_format, "iso_region", "surrogate_id")

In [34]:
fyi_project_agg.max_date_entered_queue

power_market
AESO        2025-08-07
CAISO       2025-02-12
ERCOT       2025-10-21
ISONE       2024-12-12
MISO        2025-10-27
NYISO       2025-09-02
PJM         2023-06-30
SPP         2025-10-02
Southeast   2025-09-30
West        2025-10-23
Name: max_date_entered_queue, dtype: datetime64[ns]

In [35]:
# Put the per-region FYI and GS + LBNL aggregates side by side. The outer join keeps
# regions that appear in only one source; validate="1:1" raises if a region repeats.
both_project_aggs = fyi_project_agg.merge(
    iso_project_agg,
    how="outer",
    left_index=True,
    right_index=True,
    validate="1:1",
    suffixes=("_fyi", "_iso"),
)
both_project_aggs

Unnamed: 0,n_projects_fyi,total_capacity_mw_fyi,max_date_entered_queue_fyi,n_projects_iso,total_capacity_mw_iso,max_date_entered_queue_iso
AESO,226,48848.974,2025-08-07,,,NaT
CAISO,654,196155.478732,2025-02-12,900.0,269052.636325,2023-04-17 00:00:00
ERCOT,1807,384783.44,2025-10-21,1793.0,386328.6,2025-09-23 00:00:00
ISONE,96,19253.40209,2024-12-12,95.0,19560.3305,2024-12-12 00:00:00
MISO,1776,352382.73,2025-10-27,1813.0,337477.89,2025-10-07 04:00:00
NYISO,363,50489.58,2025-09-02,354.0,54695.38,2025-09-02 00:00:00
PJM,1715,165004.5228,2023-06-30,1608.0,131084.1638,2023-06-30 00:00:00
SPP,729,161200.679,2025-10-02,763.0,168384.879,2025-10-02 00:00:00
Southeast,857,126735.081,2025-09-30,930.0,128113.102,2024-12-19 00:00:00
West,1928,473389.4629,2025-10-23,2026.0,383993.53,2024-12-30 00:00:00


In [36]:
# For every non-datetime metric, compute the relative difference of the FYI value
# from the GS + LBNL value: (fyi - iso) / iso. The result is a fraction, not a percentage.
for col in iso_project_agg.columns:
    if not pd.api.types.is_datetime64_any_dtype(iso_project_agg[col]):
        fyi_values = both_project_aggs[f"{col}_fyi"]
        iso_values = both_project_aggs[f"{col}_iso"]
        both_project_aggs[f"{col}_pct_diff"] = (fyi_values - iso_values) / iso_values

Ideally there is less than a 20% change in capacity for each region. It's expected that there will be more capacity in FYI than in GS + LBNL because data from more utilities are included in the FYI data. It's not too worrying if the differences in this chart are positive; it's more worrying if they're negative.

CAISO is updated by LBNL annually, not by quarterly GS updates, so this difference in update frequency can likely account for much of the difference in CAISO numbers.

In [37]:
# Show the relative differences as percentages, ordered by GS + LBNL capacity.
# The bare `iso_project_agg` expression that used to open this cell was removed:
# only the last expression of a cell is displayed, so it had no effect.
both_project_aggs.sort_values(by="total_capacity_mw_iso", ascending=False)[["n_projects_pct_diff", "total_capacity_mw_pct_diff"]] * 100

Unnamed: 0,n_projects_pct_diff,total_capacity_mw_pct_diff
ERCOT,0.780814,-0.39996
West,-4.837117,23.280583
MISO,-2.040816,4.416538
CAISO,-27.333333,-27.094013
SPP,-4.456094,-4.266535
PJM,6.654229,25.876779
Southeast,-7.849462,-1.075628
NYISO,2.542373,-7.689498
ISONE,1.052632,-1.569137
AESO,,


## Dig deeper into project level changes for regions with big differences in capacity

### Start with ISOs where the FYI capacity is less than the GS capacity.

* Were projects that are not active in FYI withdrawn recently? Vice versa?

In [26]:
from dbcp.data_mart.projects import create_long_format, create_fyi_long_format

# The dataframes these functions return include all projects: active, withdrawn and operational.
# ERCOT only tracks active projects.
iso_all_projects_long_format = create_long_format(engine, active_projects_only=False)
# Build the FYI frame with the FYI builder. The original called create_long_format here too,
# so both variables held ISO data and the later `power_market` queries could not work.
# NOTE(review): assumes create_fyi_long_format takes the same arguments as create_long_format — confirm.
fyi_all_projects_long_format = create_fyi_long_format(engine, active_projects_only=False)

In [42]:
iso_region = "SPP"

# Restrict both all-status frames to the region under investigation.
fyi_iso = fyi_all_projects_long_format[fyi_all_projects_long_format.power_market == iso_region]
gs_lbnl_iso = iso_all_projects_long_format[iso_all_projects_long_format.iso_region == iso_region]

In [43]:
fyi_iso.queue_status.value_counts()

withdrawn      1610
active          729
operational     255
suspended         7
Name: queue_status, dtype: int64

In [44]:
gs_lbnl_iso.queue_status.value_counts()

withdrawn      1716
active          763
operational     277
suspended         8
Name: queue_status, dtype: int64

In [45]:
fyi_iso.queue_id.is_unique

True

In [46]:
fyi_iso[fyi_iso.queue_id.duplicated(keep=False)].head(5)

Unnamed: 0,state,county,project_id,queue_id,date_proposed_online,developer,power_market,interconnection_status,point_of_interconnection,project_name,date_entered_queue,queue_status,iso,utility,is_actionable,is_nearly_certain,actual_completion_date,withdrawn_date,capacity_mw,resource_clean,state_id_fips,county_id_fips,frac_locations_in_county,source,state_permitting_type,co2e_tonnes_per_year,ordinance_earliest_year_mentioned,ordinance_jurisdiction_name,ordinance_jurisdiction_type,ordinance_text,ordinance_via_reldi,ordinance_via_solar_nrel,ordinance_via_wind_nrel,ordinance_via_nrel_is_de_facto,ordinance_via_self_maintained,ordinance_is_restrictive,is_hybrid,resource_class


In [47]:
len(gs_lbnl_iso[gs_lbnl_iso.queue_id.duplicated()])

138

In [48]:
active_gs = gs_lbnl_iso[gs_lbnl_iso.queue_status == "active"]

In [49]:
not_active_fyi = fyi_iso[fyi_iso.queue_status != "active"]

In [50]:
# look at projects active in GS which are not active in FYI
not_active_fyi[not_active_fyi.queue_id.isin(active_gs.queue_id)].queue_status.value_counts()

Series([], Name: queue_status, dtype: int64)

In [51]:
# make sure projects were withdrawn recently
not_active_fyi[not_active_fyi.queue_id.isin(active_gs.queue_id)].withdrawn_date.value_counts()

Series([], Name: withdrawn_date, dtype: int64)

In [52]:
# does this missing capacity make up the difference in total capacity?
not_active_fyi[not_active_fyi.queue_id.isin(active_gs.queue_id)].capacity_mw.sum()/active_gs.capacity_mw.sum()

0.0

In [53]:
# look at projects in GS which aren't in FYI
# it is likely that these projects were dropped during the deduplication cleaning
# step in the transform. You can spot check to make sure that a different project ID with the
# same interconnection point, capacity, resource etc. is in the data
active_gs[~active_gs.queue_id.isin(fyi_iso.queue_id)].sort_values(by="capacity_mw", ascending=False).head(5)

Unnamed: 0,state,county,queue_id,is_nearly_certain,project_id,project_name,capacity_mw,developer,entity,iso_region,utility,date_proposed_online,point_of_interconnection,is_actionable,resource_clean,queue_status,date_entered_queue,actual_completion_date,withdrawn_date,interconnection_status,state_id_fips,county_id_fips,frac_locations_in_county,source,state_permitting_type,co2e_tonnes_per_year,ordinance_earliest_year_mentioned,ordinance_jurisdiction_name,ordinance_jurisdiction_type,ordinance_text,ordinance_via_reldi,ordinance_via_solar_nrel,ordinance_via_wind_nrel,ordinance_via_nrel_is_de_facto,ordinance_via_self_maintained,ordinance_is_restrictive,is_hybrid,resource_class,surrogate_id
6454,New Mexico,Roosevelt,GEN-2024-304,False,42891,,500.0,,SPP,SPP,SPS,2029-12-31 00:00:00,Crossroads 345 kV Substation,True,Solar,active,2025-03-01,NaT,NaT,DISIS STAGE,35,35041,1.0,gridstatus,Local,,,,,,False,,,,False,False,False,renewable,6454
6644,New Mexico,Chaves,GEN-2024-199,False,43081,,500.0,,SPP,SPP,SPS,2028-12-04 00:00:00,Eddy County - Crossroads 345 kV Line,True,Onshore Wind,active,2025-03-01,NaT,NaT,DISIS STAGE,35,35005,1.0,gridstatus,Local,,,,,,False,,,,False,False,False,renewable,6644
10130,Texas,Hansford,GEN-2024-046,False,46622,,345.0,,SPP,SPP,SPS,2029-12-01 00:00:00,Hitchland-Moore 230 kV line,True,Solar,active,2024-10-30,NaT,NaT,DISIS STAGE,48,48195,1.0,gridstatus,Local,,,,,,False,,,,,False,False,renewable,10130
7808,Nebraska,Cass,GEN-2024-212,False,44273,,303.0,,SPP,SPP,OPPD,2035-02-01 00:00:00,Substation 3740 345 kV,True,Natural Gas,active,2025-03-01,NaT,NaT,DISIS STAGE,31,31025,1.0,gridstatus,Hybrid,166918.128068,,,,,False,False,False,False,True,True,False,fossil,7808
9644,Texas,Hutchinson,GEN-2024-078,False,46113,,300.0,,SPP,SPP,SPS,2029-12-01 00:00:00,Pringle 230 kV Substation,True,Solar,active,2024-10-30,NaT,NaT,DISIS STAGE,48,48233,1.0,gridstatus,Local,,,,,,False,,,,,False,False,renewable,9644


### Now look at ISOs where there is more FYI capacity than in GS/LBNL

In [90]:
iso_region = "West"

# Restrict both all-status frames to the region under investigation.
fyi_iso = fyi_all_projects_long_format[fyi_all_projects_long_format.power_market == iso_region]
gs_lbnl_iso = iso_all_projects_long_format[iso_all_projects_long_format.iso_region == iso_region]

In [91]:
active_fyi = fyi_iso[fyi_iso.queue_status == "active"]

In [92]:
non_active_gs = gs_lbnl_iso[gs_lbnl_iso.queue_status != "active"]

In [93]:
active_gs = gs_lbnl_iso[gs_lbnl_iso.queue_status == "active"]

In [94]:
# look at projects in FYI which are not active in GS
active_in_fyi_inactive_in_gs = active_fyi[active_fyi.queue_id.isin(non_active_gs.queue_id.unique())]

In [95]:
active_in_fyi_inactive_in_gs

Unnamed: 0,state,county,project_id,queue_id,date_proposed_online,developer,power_market,interconnection_status,point_of_interconnection,project_name,date_entered_queue,queue_status,iso,utility,is_actionable,is_nearly_certain,actual_completion_date,withdrawn_date,capacity_mw,resource_clean,state_id_fips,county_id_fips,frac_locations_in_county,source,state_permitting_type,co2e_tonnes_per_year,ordinance_earliest_year_mentioned,ordinance_jurisdiction_name,ordinance_jurisdiction_type,ordinance_text,ordinance_via_reldi,ordinance_via_solar_nrel,ordinance_via_wind_nrel,ordinance_via_nrel_is_de_facto,ordinance_via_self_maintained,ordinance_is_restrictive,is_hybrid,resource_class
482,Arizona,Maricopa,arizona-public-service-q288,Q288,2021-12-31,"RWE Solar Development, LLC",West,In Progress (unknown study),Panda 230kV,,2018-09-28,active,,Arizona Public Service,False,False,NaT,NaT,200.0,Solar,04,04013,1.0,fyi,Hybrid,,,,,,False,False,False,False,False,False,False,renewable
1146,Wyoming,Laramie,black-hills-clpt-clpt-g11,CLPT G11,2023-12-31,NextEra Energy,West,IA Executed,Cheyenne West,,2017-06-19,active,,Black Hills Cheyenne Light Fuel and Power Tran...,False,True,NaT,NaT,150.0,Onshore Wind,56,56021,1.0,fyi,Hybrid,,,,,,False,False,False,False,False,False,False,renewable
1151,Wyoming,Platte,black-hills-clpt-clpt-g16,CLPT G16,2025-12-31,NextEra Energy,West,IA Executed,Cheyenne - Windstar 230kV Line,,2022-04-01,active,,Black Hills Cheyenne Light Fuel and Power Tran...,False,True,NaT,NaT,300.0,Onshore Wind,56,56031,1.0,fyi,Hybrid,,,,,,False,False,False,False,,False,False,renewable
1153,Wyoming,Converse,black-hills-clpt-clpt-g18,CLPT G18,2024-12-15,,West,IA Executed,Windstar - West Cheyenne 230 kV,,2022-06-24,active,,Black Hills Cheyenne Light Fuel and Power Tran...,False,True,NaT,NaT,255.0,Solar,56,56009,1.0,fyi,Hybrid,,,,,,False,False,False,False,,False,False,renewable
1173,Colorado,Huerfano,black-hills-colorado-electric-bhct-g11,BHCT-G11,2020-06-01,Black Hills Energy,West,IA Executed,Rattlesnake Butte 115kV,,2012-06-26,active,,Black Hills Colorado Electric,False,True,NaT,NaT,29.0,Onshore Wind,08,08055,1.0,fyi,Hybrid,,,,,,False,False,False,False,,False,False,renewable
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37026,Arizona,Mohave,wapa-desert-southwest-region-2023-g6,2023-G6,2027-01-15,,West,,White Hills – Peacock 345kV,,2023-05-02,active,,WAPA Desert Southwest Region,False,False,NaT,NaT,150.0,Solar,04,04015,1.0,fyi,Hybrid,,,,,,False,,,,True,True,False,renewable
37027,Arizona,La Paz,wapa-desert-southwest-region-2023-g8,2023-G8,2027-04-01,,West,,Bouse 230kV Substation,,2023-07-10,active,,WAPA Desert Southwest Region,False,False,NaT,NaT,250.0,Solar,04,04012,1.0,fyi,Hybrid,,,,,,False,False,False,False,False,False,False,renewable
37244,North Dakota,Burke,wapa-gi-1103,GI-1103,NaT,,West,IA Executed,Kenmare 115kV,,2011-09-09,active,,WAPA/BEPC/HCPD Integrated System,False,True,NaT,NaT,0.8,Natural Gas,38,38013,1.0,fyi,Hybrid,2504.20954,,,,,False,,,,,False,False,fossil
37338,,,wapa-rocky-mountain-region-2017-g2,2017-G2,2020-09-15,,West,In Progress (unknown study),Roundtop - Stegall 115-kV,,2017-01-17,active,,WAPA Rocky Mountain Region,False,False,NaT,NaT,130.0,Onshore Wind,,,1.0,fyi,,,,,,,False,False,True,True,,True,False,renewable


In [96]:
# check the status of these projects in GS/LBNL
non_active_gs[non_active_gs.queue_id.isin(active_in_fyi_inactive_in_gs.queue_id)].queue_status.value_counts()

withdrawn      115
operational     54
suspended       14
Name: queue_status, dtype: int64

In [98]:
# Projects that are active in FYI, whose queue_id does not appear anywhere in the
# GS/LBNL data for this region, and that have a non-null capacity.
queue_id_in_gs = active_fyi.queue_id.isin(gs_lbnl_iso.queue_id)
capacity_missing = active_fyi.capacity_mw.isnull()
active_in_fyi_not_in_gs = active_fyi[~queue_id_in_gs & ~capacity_missing]

In [104]:
active_in_fyi_not_in_gs.sort_values(by="capacity_mw", ascending=False)

Unnamed: 0,state,county,project_id,queue_id,date_proposed_online,developer,power_market,interconnection_status,point_of_interconnection,project_name,date_entered_queue,queue_status,iso,utility,is_actionable,is_nearly_certain,actual_completion_date,withdrawn_date,capacity_mw,resource_clean,state_id_fips,county_id_fips,frac_locations_in_county,source,state_permitting_type,co2e_tonnes_per_year,ordinance_earliest_year_mentioned,ordinance_jurisdiction_name,ordinance_jurisdiction_type,ordinance_text,ordinance_via_reldi,ordinance_via_solar_nrel,ordinance_via_wind_nrel,ordinance_via_nrel_is_de_facto,ordinance_via_self_maintained,ordinance_is_restrictive,is_hybrid,resource_class
31910,Arizona,Navajo,salt-river-project-cluster-24-q2409,Q2409,2028-12-31,,West,,Sugarloaf 500 kV,,2024-02-05,active,,Salt River Project,False,False,NaT,NaT,2626.235,Solar,04,04017,1.0,fyi,Hybrid,,,,,,False,False,False,False,False,False,False,renewable
31925,Arizona,Pinal,salt-river-project-cluster-25-q2507,Q2507,2027-12-31,,West,,Duke-Pinal Central 500 kV Line,,2025-02-05,active,,Salt River Project,False,False,NaT,NaT,2538.000,Natural Gas,04,04021,1.0,fyi,Hybrid,5.583324e+06,,,,,False,True,False,False,False,False,False,fossil
2787,Oregon,Umatilla,bpa-l0653,L0653,2028-06-03,,West,In Progress (unknown study),"Longhorn Substation, McNary Substation",Longhorn-McNary 230 kV Loop,2025-06-02,active,,Umatilla Electric Cooperative,False,False,NaT,NaT,2225.000,Other,41,41059,1.0,fyi,Hybrid,,,Umatilla County,county,Wind turbines must be set back 2 miles from th...,True,False,True,True,,True,False,fossil
2695,Oregon,Washington,bpa-l0561,L0561,2032-12-31,,West,In Progress (unknown study),Keeler-Forest Grove No. 1 & No. 2 115kV Lines,NW Hillsboro-North Plains Semiconductor Plants...,2023-05-12,active,,Portland General Electric Company,False,False,NaT,NaT,1800.000,Other,41,41067,1.0,fyi,Hybrid,,,,,,False,False,True,True,True,True,False,fossil
2694,Oregon,Washington,bpa-l0560,L0560,2028-12-31,,West,In Progress (unknown study),Keeler-Forest Grove No. 1 & No. 2 115kV Lines,West Hillsboro Data Center Project (Phase 1),2023-05-12,active,,Portland General Electric Company,False,False,NaT,NaT,1400.000,Other,41,41067,1.0,fyi,Hybrid,,,,,,False,False,True,True,True,True,False,fossil
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2635,Oregon,Marion,bpa-l0501,L0501,2023-08-23,,West,In Progress (unknown study),Chemawa Substation,Chemawa-Waconda 115 kV Line Project,2021-08-25,active,,Portland General Electric Company,False,False,NaT,NaT,0.000,Other,41,41047,1.0,fyi,Hybrid,,,,,,False,,,,True,True,False,fossil
2730,Oregon,Lincoln,bpa-l0595,L0595,2027-03-19,,West,In Progress (unknown study),Toledo Substation,CLPUD-CPI Loop Tie Project,2024-03-20,active,,Central Lincoln People's Utility District,False,False,NaT,NaT,0.000,Other,41,41041,1.0,fyi,Hybrid,,,,,,False,,,,,False,False,fossil
2696,Oregon,Morrow,bpa-l0562,L0562,2026-05-22,,West,In Progress (unknown study),Keeler-Forest Grove No. 1 & No. 2 115kV Lines,Keeler Horizon #1 & #2 Terminal Upgrades,2023-05-12,active,,Portland General Electric Company,False,False,NaT,NaT,0.000,Other,41,41049,1.0,fyi,Hybrid,,,,,,False,,,,,False,False,fossil
2629,Oregon,Lane,bpa-l0495,L0495,2023-07-10,,West,In Progress (unknown study),BPA-owned 2.92 miles section of Alvey-Martin C...,Latham Substation Reliability Project,2021-07-12,active,,Emerald People's Utility District,False,False,NaT,NaT,0.000,Other,41,41039,1.0,fyi,Hybrid,,,,,,False,,,,True,True,False,fossil


In [85]:
active_in_fyi_not_in_gs[active_in_fyi_not_in_gs.resource_clean != "Offshore Wind"].sort_values(by="capacity_mw", ascending=False).head(10)

Unnamed: 0,state,county,project_id,queue_id,date_proposed_online,developer,power_market,interconnection_status,point_of_interconnection,project_name,date_entered_queue,queue_status,iso,utility,is_actionable,is_nearly_certain,actual_completion_date,withdrawn_date,capacity_mw,resource_clean,state_id_fips,county_id_fips,frac_locations_in_county,source,state_permitting_type,co2e_tonnes_per_year,ordinance_earliest_year_mentioned,ordinance_jurisdiction_name,ordinance_jurisdiction_type,ordinance_text,ordinance_via_reldi,ordinance_via_solar_nrel,ordinance_via_wind_nrel,ordinance_via_nrel_is_de_facto,ordinance_via_self_maintained,ordinance_is_restrictive,is_hybrid,resource_class
26148,West Virginia,Doddridge,pjm-ag2-582,AG2-582,2026-07-01,Competitive Power Ventures,PJM,System Impact Study,Flint Run 500 kV,,2021-03-31,active,pjm,APS,True,False,NaT,NaT,2100.0,Natural Gas,54.0,54017.0,1.0,fyi,State,4619772.0,,,,,False,,,,,False,False,fossil
26922,Ohio,Lucas,pjm-ah1-690,AH1-690,2031-03-01,Competitive Power Ventures,PJM,System Impact Study,Bay Shore - Lallendorf 345 kV,,NaT,active,pjm,ATSI,True,False,NaT,NaT,1475.0,Natural Gas,39.0,39095.0,1.0,fyi,Hybrid,3244840.0,,,,,False,,,,False,False,False,fossil
26912,Ohio,Carroll,pjm-ah1-680,AH1-680,2030-05-01,Advanced Power,PJM,System Impact Study,Sammis - South Canton 345 kV,,NaT,active,pjm,ATSI,True,False,NaT,NaT,1300.0,Natural Gas,39.0,39019.0,1.0,fyi,Hybrid,2859859.0,,,,,False,,,,False,False,False,fossil
26908,Virginia,Orange,pjm-ah1-676,AH1-676,2031-05-31,,PJM,System Impact Study,Gordonsville 230 kV,,NaT,active,pjm,Dominion,True,False,NaT,NaT,1294.6,Natural Gas,51.0,51137.0,1.0,fyi,Hybrid,2847979.0,,,,,False,,,,False,False,False,fossil
29067,,,pjm-u1-052-889837,U1-052 889837,NaT,,PJM,,Duquesne-PJM,,2008-03-26,active,pjm,,False,False,NaT,NaT,1188.0,Other,,,1.0,fyi,,,,,,,False,False,True,True,,True,False,fossil
27299,Kentucky,,pjm-ah2-353,AH2-353,2028-12-31,,PJM,Feasibility Study,Jefferson - Hanging Rock 765 kV,,2022-03-10,active,pjm,AEP,False,False,NaT,NaT,1000.0,Solar,21.0,,1.0,fyi,Hybrid,,,,,,False,False,True,True,,True,False,renewable
26926,Pennsylvania,Dauphin,pjm-ah1-695,AH1-695,2027-10-01,Constellation,PJM,System Impact Study,Three Mile Island 230 kV,,NaT,active,pjm,ME,True,False,NaT,NaT,859.0,Nuclear,42.0,42043.0,1.0,fyi,Local,,,,,,False,,,,False,False,False,other
26954,Kentucky,,pjm-ah1-721,AH1-721,2030-12-31,"East Kentucky Power Cooperative, Inc.",PJM,System Impact Study,Laurel-Cooper 161 kV,,NaT,active,pjm,EKPC,True,False,NaT,NaT,786.0,Natural Gas,21.0,,1.0,fyi,Hybrid,1729114.0,,,,,False,False,True,True,,True,False,fossil
26944,Virginia,Fluvanna,pjm-ah1-712,AH1-712,2031-05-30,Tenaska,PJM,System Impact Study,Cunningham 500 kV,,NaT,active,pjm,Dominion,True,False,NaT,NaT,776.0,Natural Gas,51.0,51065.0,1.0,fyi,Hybrid,1707116.0,,,,,False,,,,True,True,False,fossil
30307,,,pjm-y1-004-3673272,Y1-004 3673272,NaT,,PJM,System Impact Study,TVA-PJM,,2012-02-07,active,pjm,,True,False,NaT,NaT,720.0,Other,,,1.0,fyi,,,,,,,False,False,True,True,,True,False,fossil


In [67]:
# it's worth checking the most recent raw data to see if these projects get dropped during deduplication
raw_gs = pd.read_parquet("/app/data/data_cache/gridstatus/interconnection_queues/parquet/pjm.parquet#1761671630863094")

In [89]:
# if they appear in the raw data, check if the location or resource / generation type
# would exclude the project from the data warehouse tables
raw_gs[raw_gs["Queue ID"] == "AH1-695"]

Unnamed: 0,Queue ID,Project Name,Interconnecting Entity,County,State,Interconnection Location,Transmission Owner,Generation Type,Capacity (MW),Summer Capacity (MW),Winter Capacity (MW),Queue Date,Status,Proposed Completion Date,Withdrawn Date,Withdrawal Comment,Actual Completion Date,MW In Service,Commercial Name,Initial Study,Feasibility Study,Feasibility Study Status,System Impact Study,System Impact Study Status,Facilities Study,Facilities Study Status,Interim/Interconnection Service/Generation Interconnection Agreement,Interim/Interconnection Service/Generation Interconnection Agreement Status,Wholesale Market Participation Agreement,Construction Service Agreement,Construction Service Agreement Status,Upgrade Construction Service Agreement,Upgrade Construction Service Agreement Status,Backfeed Date,Long-Term Firm Service Start Date,Long-Term Firm Service End Date,Test Energy Date


In [71]:
# check the GS data warehouse tables
gs_proj = pd.read_parquet("/app/data/output/data_warehouse/gridstatus_projects.parquet")

In [31]:
active_in_fyi_not_in_gs.resource_clean.value_counts()

Other              3
Solar              1
Battery Storage    1
Name: resource_clean, dtype: int64

In [32]:
# see if the capacity of these projects makes up the difference in GS
active_in_fyi_not_in_gs.capacity_mw.sum()/active_gs.capacity_mw.sum()

0.06072956450154523

In [39]:
active_fyi.sort_values(by="capacity_mw", ascending=False).head(10)[["queue_id", "capacity_mw"]]

Unnamed: 0,queue_id,capacity_mw
20138,C24-152,4000.0
19243,0971,2383.6
19229,0956,2380.0
20066,C24-089,2064.15
20259,C24-266,1990.0
20117,C24-131,1800.0
20304,C24-309,1600.0
20224,C24-233,1600.0
20164,C24-178,1500.0
20213,C24-223,1400.0
