# Validate the interconnection.fyi data county coverage against LBNL data

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import dbcp



In [3]:
# Passing None removes the column limit, so wide DataFrames are displayed
# with every column instead of being truncated.
pd.set_option('display.max_columns', None)

# Run Extract

In [4]:
# Archived interconnection.fyi CSV; the file name carries the date 2025-10-01.
fyi_uri = (
    "gs://dgm-archive/interconnection.fyi/interconnection_fyi_dataset_2025-10-01.csv"
)
# The result is subscripted by table name ("fyi_queue") in the next cell.
fyi_raw_dfs = dbcp.extract.fyi_queue.extract(fyi_uri)

In [5]:
# The "fyi_queue" entry of the extract output, used later for raw fips_codes
# and for looking up individual records by unique_id.
raw_fyi = fyi_raw_dfs["fyi_queue"]

# Load data warehouse tables
Run `make all` first to create the data warehouse and data mart tables that the cells below read.

In [6]:
# FYI locations table from the private data warehouse output directory.
fyi_locs = pd.read_parquet("/app/data/output/private_data_warehouse/fyi_locations.parquet")

In [7]:
# ISO locations table from the data warehouse output directory; used below as
# the LBNL side of the county comparison.
lbnl_locs = pd.read_parquet("/app/data/output/data_warehouse/iso_locations.parquet")

# Load data mart tables

In [8]:
# FYI projects in long format, from the data mart output directory.
fyi_long = pd.read_parquet("/app/data/output/data_mart/fyi_projects_long_format.parquet")

In [9]:
# Distinct county FIPS values in the FYI long-format table (a missing value,
# if present, counts as one distinct entry).
fyi_long["county_id_fips"].nunique(dropna=False)

2642

In [45]:
# Keep only the rows whose queue_status is exactly "active".
is_active = fyi_long["queue_status"] == "active"
fyi_long_active = fyi_long[is_active]

In [46]:
# Distinct county FIPS values among the active FYI rows.
fyi_long_active["county_id_fips"].nunique(dropna=False)

1879

In [10]:
# ISO projects in long format, from the data mart output directory.
iso_long = pd.read_parquet("/app/data/output/data_mart/iso_projects_long_format.parquet")

In [11]:
# Distinct county FIPS values in the ISO long-format table.
iso_long["county_id_fips"].nunique(dropna=False)

1992

In [12]:
# County-level table of proposed clean FYI projects, from the data mart output
# directory.
fyi_clean = pd.read_parquet("/app/data/output/data_mart/fyi_counties_proposed_clean_projects.parquet")

In [13]:
# Distinct county FIPS values in the FYI clean-projects county table.
fyi_clean["county_id_fips"].nunique(dropna=False)

1801

In [55]:
def get_iso_clean_active(df_long):
    """Aggregate clean-resource projects to one row per county and resource.

    Rows whose ``resource_clean`` is not in the clean-resource list are
    discarded. Each remaining row's ``capacity_mw`` is scaled by its
    ``frac_locations_in_county`` before being summed, and non-null
    ``project_id`` values are counted.

    Args:
        df_long: long-format projects frame. It must contain the columns
            ``resource_clean``, ``co2e_tonnes_per_year``, ``capacity_mw``,
            ``frac_locations_in_county``, ``county_id_fips`` and
            ``project_id``. The input frame is not modified.

    Returns:
        DataFrame with columns ``county_id_fips``, ``resource_clean``,
        ``renewable_and_battery_proposed_capacity_mw`` and ``facility_count``.
    """
    clean_resources = [
        "Solar",
        "Battery Storage",
        "Wind",
        "Onshore Wind",
        "Offshore Wind",
        "Hydro",
        "Geothermal",
        "Pumped Storage",
        "Nuclear",
    ]
    is_clean = df_long["resource_clean"].isin(clean_resources)
    # The drop is kept so a frame lacking this column still raises KeyError.
    clean_projects = df_long[is_clean].drop(columns=["co2e_tonnes_per_year"])

    # Weight each row's capacity by the share of its locations in the county.
    weighted_capacity = clean_projects["capacity_mw"].mul(
        clean_projects["frac_locations_in_county"]
    )
    clean_projects = clean_projects.assign(capacity_mw=weighted_capacity)

    per_county_resource = clean_projects.groupby(
        ["county_id_fips", "resource_clean"]
    ).agg(
        renewable_and_battery_proposed_capacity_mw=("capacity_mw", "sum"),
        facility_count=("project_id", "count"),
    )
    return per_county_resource.reset_index()

In [56]:
# County x resource aggregate of the clean-resource rows in iso_long.
iso_clean = get_iso_clean_active(iso_long)

In [57]:
# Distinct county FIPS values in the ISO clean aggregate.
iso_clean["county_id_fips"].nunique(dropna=False)

1931

# Look at the counties in LBNL + GS but not FYI

In [58]:
# Rows of the ISO aggregate whose county never appears in fyi_clean.
county_in_fyi = iso_clean["county_id_fips"].isin(fyi_clean["county_id_fips"])
iso_clean[~county_in_fyi]

Unnamed: 0,county_id_fips,resource_clean,renewable_and_battery_proposed_capacity_mw,facility_count
0,01001,Battery Storage,80.0,1
1,01001,Solar,80.0,1
5,01017,Battery Storage,80.0,1
6,01017,Solar,80.0,1
7,01031,Battery Storage,80.0,1
...,...,...,...,...
3165,55047,Onshore Wind,160.0,1
3168,55053,Solar,46.2,1
3180,55075,Solar,65.0,1
3195,55115,Battery Storage,100.0,1


In [32]:
# County FIPS values present in iso_clean but absent from fyi_clean.
missing_fips = set(iso_clean["county_id_fips"]).difference(fyi_clean["county_id_fips"])

In [37]:
# From each long-format table, keep the rows located in one of the counties
# collected in missing_fips.
iso_missing, fyi_missing = (
    projects[projects["county_id_fips"].isin(missing_fips)]
    for projects in (iso_long, fyi_long)
)

In [38]:
# Row counts per queue_status for FYI rows in the missing counties.
fyi_missing["queue_status"].value_counts()

withdrawn      899
operational    103
suspended       41
active          31
Name: queue_status, dtype: int64

In [44]:
# Row counts per interconnection_status for FYI rows in the missing counties.
fyi_missing["interconnection_status"].value_counts()

Withdrawn                      358
IA Executed                    153
Cluster Study                   95
Suspended                       41
System Impact Study             36
In Progress (unknown study)     32
Not Started                     30
Feasibility Study               24
Operational                     22
Facility Study                  12
Construction                     1
IA Pending                       1
Name: interconnection_status, dtype: int64

In [36]:
# Row counts per interconnection_status for ISO rows in the missing counties.
iso_missing["interconnection_status"].value_counts()

Suspended                                  50
Study Not Started                          38
DISIS STAGE                                34
SRIS/SIS Pending                           31
Phase 1                                    28
In Progress (unknown study)                25
IA Executed                                19
Cluster Study                              15
Facility Study                             13
Network Upgrade                             8
IA FULLY EXECUTED/ON SCHEDULE               7
GIA                                         4
IA Pending                                  3
IA PENDING                                  3
FACILITY STUDY STAGE                        3
Accepted Cost Allocation/IA in Progress     3
Phase 3                                     3
Construction                                2
SRIS/SIS in Progress                        2
System Impact Study                         2
Not Started                                 2
SRIS/SIS Approved                 

In [54]:
# Distinct missing counties that still have at least one active FYI row.
fyi_missing_active = fyi_missing[fyi_missing["queue_status"].isin(["active"])]
fyi_missing_active["county_id_fips"].nunique(dropna=False)

18

In [53]:
# Inspect the raw FYI record(s) with unique_id "isone-924".
raw_fyi.loc[raw_fyi["unique_id"] == "isone-924"]

Unnamed: 0,unique_id,project_type,power_market,transmission_owner,canonical_transmission_owners,queue_id,project_name,actual_completion_date,proposed_completion_date,withdrawn_date,queue_date,county_state_pairs,point_of_interconnection,county,state,canonical_generation_types,interconnection_service_type,capacity_mw,summer_capacity_mw,winter_capacity_mw,queue_status,current_phase_or_stage_raw,interconnection_status_raw,interconnection_status_fyi,interconnection_date,developer,raw_developer,project_spv,utility,iso,cluster,general_comments,latitude,longitude,capacity_by_generation_type_breakdown,interconnection_voltage_kv,fips_codes,schedule_next_event_date,schedule_next_event_name,most_recent_study_date,most_recent_allocated_network_upgrade_cost,most_recent_study_url
12347,isone-924,Transmission,ISO-NE,,,924,NY QP543 AC,,2023-12-31,,2019-09-06,"Albany County, NY; Dutchess County, NY",NY Greenbush PV 345 kV,Albany County,NY,Other,,0.0,0.0,0.0,Active,,,,,,,,,isone,,,,,,345.0,3600136027,,,,,


# County coverage compared to LBNL

In [19]:
# Archived LBNL ISO queue workbook; the file name indicates the 2024 release.
lbnl_uri = "gs://dgm-archive/lbnl_iso_queue/queues_2024_clean_data.xlsx"
# Keep only the "lbnl_iso_queue" entry of the extract output.
raw_lbnl = dbcp.extract.lbnl_iso_queue.extract(lbnl_uri)["lbnl_iso_queue"]

In [27]:
def clean_raw_fips(raw_fips: pd.Series, source_name: str = "FYI") -> pd.Series:
    """Reduce a raw ``fips_codes`` column to one 5-digit FIPS code per row.

    Null entries are dropped and values shorter than five characters are
    left-padded with zeros. Any value that is not a comma-separated list of
    5-digit codes is reported with ``print`` and dropped. For the remaining
    rows only the first code in the list is kept.

    Args:
        raw_fips: Series of raw FIPS strings. The ``.str`` accessor is used,
            so a non-string dtype raises.
        source_name: dataset label used in the printed report. It defaults to
            ``"FYI"`` so existing calls print the same label as before; pass
            e.g. ``"LBNL"`` when cleaning another source.

    Returns:
        Series of 5-digit FIPS strings, indexed by the surviving input rows.
        It is column ``0`` of ``str.extract``'s result, so its name is ``0``.
    """
    padded_fips = raw_fips.dropna().str.zfill(5)

    comma_separated_fips_regex = r"^\d{5}(?:,\d{5})*$"
    is_valid = padded_fips.str.contains(comma_separated_fips_regex)
    bad_values = padded_fips[~is_valid]
    print(
        f"{len(bad_values)} values found in the {source_name} fips_codes "
        "column which are not 5-digit comma separated strings. "
        f"They are: {bad_values}"
    )

    # Select by mask rather than dropping by label, so a valid row that shares
    # an index label with a bad row is not discarded along with it.
    clean_fips = padded_fips[is_valid]

    # only use the first FIPS code in the list
    five_digit_fips_regex = r"^(\d{5})"
    return clean_fips.str.extract(five_digit_fips_regex)[0]

In [29]:
# First 5-digit FIPS code per LBNL row; malformed values are reported and dropped.
# NOTE: the helper's printed report is labelled "FYI" by default, even though
# this call cleans the LBNL column.
lbnl_clean_fips = clean_raw_fips(raw_lbnl["fips_codes"])

271 values found in the FYI fips_codes column which are not 5-digit comma separated strings.They are: 190           5500155141
696           2000120011
783      290032907529063
784      290032907529063
785           2900329147
              ...       
35131         3010730059
35504         4508945027
35770         4015340059
35987         5307753005
36168    230312301723005
Name: fips_codes, Length: 271, dtype: string


In [31]:
# First 5-digit FIPS code per raw FYI row; malformed values are reported and dropped.
fyi_clean_fips = clean_raw_fips(raw_fyi["fips_codes"])

0 values found in the FYI fips_codes column which are not 5-digit comma separated strings.They are: Series([], Name: fips_codes, dtype: object)


In [39]:
# Build each set once, then report the overlap and the two one-sided differences.
lbnl_raw_fips_set = set(lbnl_clean_fips)
fyi_raw_fips_set = set(fyi_clean_fips)
print(f"n shared FIPS: {len(lbnl_raw_fips_set & fyi_raw_fips_set)}")
print(f"n FIPS in LBNL not FYI: {len(lbnl_raw_fips_set - fyi_raw_fips_set)}")
print(f"n FIPS in FYI not LBNL: {len(fyi_raw_fips_set - lbnl_raw_fips_set)}")

n shared FIPS: 2551
n FIPS in LBNL not FYI: 7
n FIPS in FYI not LBNL: 93


In [40]:
# (distinct LBNL FIPS, distinct FYI FIPS) from the raw fips_codes columns.
tuple(len(set(fips)) for fips in (lbnl_clean_fips, fyi_clean_fips))

(2558, 2644)

In [48]:
# Distinct county FIPS values in the FYI locations warehouse table.
fyi_locs["county_id_fips"].nunique(dropna=False)

2643

In [50]:
# Distinct county FIPS values in the ISO locations warehouse table.
lbnl_locs["county_id_fips"].nunique(dropna=False)

2572

In [51]:
# Same comparison as for the raw columns, using the warehouse location tables.
lbnl_loc_fips_set = set(lbnl_locs.county_id_fips)
fyi_loc_fips_set = set(fyi_locs.county_id_fips)
print(f"n shared FIPS: {len(lbnl_loc_fips_set & fyi_loc_fips_set)}")
print(f"n FIPS in LBNL not FYI: {len(lbnl_loc_fips_set - fyi_loc_fips_set)}")
print(f"n FIPS in FYI not LBNL: {len(fyi_loc_fips_set - lbnl_loc_fips_set)}")

n shared FIPS: 2558
n FIPS in LBNL not FYI: 14
n FIPS in FYI not LBNL: 85
