In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import dbcp

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
# Dated snapshot of the FYI interconnection queue CSV (2025-09-01 per the file name).
# NOTE(review): the path segment reads "inconnection.fyi" -- confirm it matches the bucket layout.
fyi_uri = "gs://dgm-archive/inconnection.fyi/interconnection_fyi_dataset_2025-09-01.csv"
fyi_raw_dfs = dbcp.extract.fyi_queue.extract(fyi_uri)

In [5]:
raw_df = fyi_raw_dfs["fyi_queue"]

Check whether any projects are missing both a queue date and a queue status.

In [6]:
raw_df[(raw_df.queue_date.isnull()) & (raw_df.queue_status.isnull())]

Unnamed: 0,unique_id,project_type,power_market,transmission_owner,canonical_transmission_owners,queue_id,project_name,actual_completion_date,proposed_completion_date,withdrawn_date,queue_date,county_state_pairs,point_of_interconnection,county,state,canonical_generation_types,interconnection_service_type,capacity_mw,summer_capacity_mw,winter_capacity_mw,queue_status,current_phase_or_stage_raw,interconnection_status_raw,interconnection_status_fyi,interconnection_date,developer,raw_developer,project_spv,utility,iso,cluster,general_comments,latitude,longitude,capacity_by_generation_type_breakdown,interconnection_voltage_kv,fips_codes,schedule_next_event_date,schedule_next_event_name,most_recent_study_date,most_recent_allocated_network_upgrade_cost,most_recent_study_url


Check that `unique_id` is really unique. We will rename this column to `project_id`.

In [7]:
raw_df.unique_id.is_unique

True

Harmonize interconnection statuses.

In [10]:
# Names of date-like columns (prefix "date_" or suffix "_date") whose dtype
# is not already a datetime64 type.
date_cols = [
    name
    for name in raw_df.columns
    if (name.startswith("date_") or name.endswith("_date"))
    and not pd.api.types.is_datetime64_any_dtype(raw_df.loc[:, name])
]

In [8]:
[col for col in raw_df.columns if "status" in col]

['queue_status', 'interconnection_status_raw', 'interconnection_status_fyi']

In [9]:
raw_df["interconnection_status_fyi"].value_counts()

IA Executed                    7276
Withdrawn                      7249
System Impact Study            5029
Feasibility Study              4400
Facility Study                 2270
Cluster Study                  2103
In Progress (unknown study)    1943
Operational                    1104
Not Started                     805
Suspended                       339
IA Pending                      273
Construction                    180
Name: interconnection_status_fyi, dtype: int64

In [10]:
# Status labels we expect to see in `interconnection_status_fyi`; the values
# actually present in the raw data are compared against this set below.
allowed_statuses = {
    "Cluster Study",
    "Combined",
    "Construction",
    "Facility Study",
    "Feasibility Study",
    "IA Executed",
    "IA Pending",
    "In Progress (unknown study)",
    "Not Started",
    "Operational",
    "Phase 4 Study",
    "Suspended",
    "System Impact Study",
    "Withdrawn",
}

In [11]:
fyi_status_values = set(raw_df["interconnection_status_fyi"].unique())

In [12]:
fyi_status_values - allowed_statuses

{nan}

Look at the `capacity_by_generation_type_breakdown` column

In [42]:
[col for col in raw_df if "capacity" in col]

['capacity_mw',
 'summer_capacity_mw',
 'winter_capacity_mw',
 'capacity_by_generation_type_breakdown']

In [21]:
raw_df[~raw_df["capacity_by_generation_type_breakdown"].isnull()][["capacity_mw", "capacity_by_generation_type_breakdown"]]

Unnamed: 0,capacity_mw,capacity_by_generation_type_breakdown
1065,375.0,- canonical_gen_type: Solar\n mw: 100\n- cano...
1066,375.0,- canonical_gen_type: Battery\n mw: 185\n- ca...
1067,125.0,- canonical_gen_type: Battery\n mw: 65\n- can...
1134,80.0,- canonical_gen_type: Battery\n mw: 30\n- can...
1341,50.0,- canonical_gen_type: Wind\n mw: 50\n
...,...,...
39238,200.0,- canonical_gen_type: Solar\n mw: 200\n- cano...
39239,500.0,- canonical_gen_type: Solar\n mw: 500\n- cano...
39244,600.0,- canonical_gen_type: Battery\n mw: 200\n- ca...
39247,350.0,- canonical_gen_type: Solar\n mw: 350\n- cano...


In [25]:
raw_df[~raw_df["capacity_by_generation_type_breakdown"].isnull()]["capacity_by_generation_type_breakdown"].iloc[0]

'- canonical_gen_type: Solar\n  mw: 100\n- canonical_gen_type: Battery\n  mw: 185\n- canonical_gen_type: Wind\n  mw: 275\n'

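In most of the rows shown, `capacity_mw` equals the sum of the capacities in `capacity_by_generation_type_breakdown` excluding battery (e.g. row 1065: Solar 100 + Wind 275 = 375). This does not hold everywhere: for row 39244 the total includes the battery (Battery 200 + Solar 400 = 600).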

In [51]:
# NOTE: this overwrites the column on raw_df in place. astype(str) turns
# missing values into the literal string "nan", so .isnull() on this column
# no longer flags them once this cell has run; later cells compare against
# "nan" instead.
raw_df["capacity_by_generation_type_breakdown"] = raw_df["capacity_by_generation_type_breakdown"].astype(str)

In [53]:
# Count the non-null breakdown values that do not start with the expected
# YAML list prefix.
breakdown = raw_df["capacity_by_generation_type_breakdown"]
has_value = ~breakdown.isnull()
has_expected_prefix = breakdown.str.startswith("- canonical_gen_type:")
raw_df[has_value & ~has_expected_prefix]["capacity_by_generation_type_breakdown"].value_counts()

nan    35600
Name: capacity_by_generation_type_breakdown, dtype: int64

In [56]:
raw_df.head(1)

Unnamed: 0,unique_id,project_type,power_market,transmission_owner,canonical_transmission_owners,queue_id,project_name,actual_completion_date,proposed_completion_date,withdrawn_date,queue_date,county_state_pairs,point_of_interconnection,county,state,canonical_generation_types,interconnection_service_type,capacity_mw,summer_capacity_mw,winter_capacity_mw,queue_status,current_phase_or_stage_raw,interconnection_status_raw,interconnection_status_fyi,interconnection_date,developer,raw_developer,project_spv,utility,iso,cluster,general_comments,latitude,longitude,capacity_by_generation_type_breakdown,interconnection_voltage_kv,fips_codes,schedule_next_event_date,schedule_next_event_name,most_recent_study_date,most_recent_allocated_network_upgrade_cost,most_recent_study_url
0,aeso-p1756,Generation,AESO,Alberta Electric System Operator,Alberta Electric System Operator,P1756,P1756 TPG Canyon Creek PHES Storage Project,,2027-05-01,,2016-03-02,"Yellowhead County, AB",29-Hinton/Edson,Yellowhead County,AB,Pumped Storage,,139.0,,,Active,3,,In Progress (unknown study),,WindRiver Power Corporation,,,,aeso,,,,,,,,,,,,


In [6]:
import yaml

In [7]:
def parse_capacity(s, n=3):
    """Flatten a YAML capacity breakdown into numbered resource/capacity keys.

    The expected input is a YAML list of mappings, for example::

        - canonical_gen_type: Solar
          mw: 100
        - canonical_gen_type: Battery
          mw: 185

    Args:
        s: Raw cell value from ``capacity_by_generation_type_breakdown``.
            Anything that is not text (NaN, None, numbers) is treated as
            missing.
        n: Maximum number of breakdown entries to keep.

    Returns:
        A dict with ``resource_type_{i}`` and ``capacity_mw_resource_{i}``
        keys for the first ``n`` entries (``i`` starts at 1). An empty dict
        is returned when the value is missing, is not valid YAML, or does
        not parse to a list of mappings.
    """
    # Missing values arrive as float NaN / None; there is nothing to parse.
    if not isinstance(s, (str, bytes)):
        return {}
    try:
        data = yaml.safe_load(s)
    except yaml.YAMLError:
        # Best effort: a malformed breakdown is treated like a missing one.
        return {}
    # Scalars such as "nan" (left behind by astype(str)) or "" parse to a
    # str / None rather than a list, so they carry no breakdown.
    if not isinstance(data, list):
        return {}
    entries = [item for item in data if isinstance(item, dict)]
    out = {}
    for i, item in enumerate(entries[:n], start=1):
        out[f"resource_type_{i}"] = item.get("canonical_gen_type")
        out[f"capacity_mw_resource_{i}"] = item.get("mw")
    return out

# Parse every breakdown value into a dict of numbered resource/capacity keys
# (a Series of dicts, one per row of raw_df).
parsed = raw_df["capacity_by_generation_type_breakdown"].apply(parse_capacity)

# Expand the per-row dicts into a DataFrame with one column per dict key.
parsed_df = pd.json_normalize(parsed)

In [10]:
raw_df[~raw_df.capacity_by_generation_type_breakdown.isnull()]

Unnamed: 0,unique_id,project_type,power_market,transmission_owner,canonical_transmission_owners,queue_id,project_name,actual_completion_date,proposed_completion_date,withdrawn_date,queue_date,county_state_pairs,point_of_interconnection,county,state,canonical_generation_types,interconnection_service_type,capacity_mw,summer_capacity_mw,winter_capacity_mw,queue_status,current_phase_or_stage_raw,interconnection_status_raw,interconnection_status_fyi,interconnection_date,developer,raw_developer,project_spv,utility,iso,cluster,general_comments,latitude,longitude,capacity_by_generation_type_breakdown,interconnection_voltage_kv,fips_codes,schedule_next_event_date,schedule_next_event_name,most_recent_study_date,most_recent_allocated_network_upgrade_cost,most_recent_study_url
1065,avista-110,Generation,West,Avista Utilities,Avista Utilities,110,,,2025-12-01,,2021-01-07,"Garfield County, WA",DryCreek 230 kV or Walla Walla to Dry Creek 23...,Garfield County,WA,Battery + Solar + Wind,NRIS,375.0,375.0,375.0,Active,,,,,,,,Avista Utilities,,,,,,- canonical_gen_type: Solar\n mw: 100\n- cano...,230.0,53023,,,,,
1066,avista-111,Generation,West,Avista Utilities,Avista Utilities,111,,,2024-12-01,,2021-01-07,"Walla Walla County, WA; Franklin County, WA",Walla Walla-Wanapum 230 kV,Walla Walla County,WA,Battery + Solar,NRIS,375.0,375.0,375.0,Withdrawn,,,,,,,,Avista Utilities,,,,,,- canonical_gen_type: Battery\n mw: 185\n- ca...,230.0,5307153021,,,,,
1067,avista-112,Generation,West,Avista Utilities,Avista Utilities,112,,,2024-12-01,,2021-01-07,"Lincoln County, WA",Irby 115kV substation or Irby to Wilson Creek ...,Lincoln County,WA,Battery + Solar,NRIS,125.0,125.0,125.0,Withdrawn,,,,,,,,Avista Utilities,,,,,,- canonical_gen_type: Battery\n mw: 65\n- can...,115.0,53043,,,,,
1134,avista-64,Generation,West,Avista Utilities,Avista Utilities,64,,,2021-12-15,,2018-08-16,"Adams County, WA",Marengo 115 kV Station,Adams County,WA,Battery + Solar,NRIS and ERIS,80.0,80.0,80.0,Withdrawn,,,,,,,,Avista Utilities,,,,,,- canonical_gen_type: Battery\n mw: 30\n- can...,115.0,53001,,,,,
1341,bpa-g0203,Generation,West,Bonneville Power Administration,Bonneville Power Administration,G0203,Juniper Canyon 1,,2007-12-01,,2005-05-26,"Klickitat County, WA",Rock Creek,Klickitat County,WA,Wind,NRIS,50.0,50.0,50.0,Operational,Raw Status: ENERGIZED,,Operational,,,,,"Avangrid Power, LLC",,,on-line February 2011,45.783352,-120.531594,- canonical_gen_type: Wind\n mw: 50\n,,53039,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39238,tucson-electric-power-83,Generation,West,Tucson Electric Power Company,Tucson Electric Power,83,,,2023-06-01,,2020-09-17,"Pima County, AZ",South Loop 138kV,Pima County,AZ,Battery + Solar,NRIS,200.0,200.0,200.0,Withdrawn,WITHDRAWN,,Withdrawn,,,,,Tucson Electric Power Company,,,,32.401059,-111.126335,- canonical_gen_type: Solar\n mw: 200\n- cano...,138.0,04019,,,,,
39239,tucson-electric-power-84,Generation,West,Tucson Electric Power Company,Tucson Electric Power,84,,,2023-12-31,,2020-09-30,"Pima County, AZ",Pinal West - South Loop 345kV,Pima County,AZ,Other,NRIS,500.0,500.0,,Active,IA EXECUTION IN PROGRESS,IA EXECUTION IN PROGRESS,IA Pending,,,,,Tucson Electric Power Company,,,,,,- canonical_gen_type: Solar\n mw: 500\n- cano...,345.0,04019,,,,,
39244,tucson-electric-power-90,Generation,West,Tucson Electric Power Company,Tucson Electric Power,90,,,2025-12-01,,2021-01-27,"Pima County, AZ",Pinal West - South Loop 345kV,Pima County,AZ,Other,NRIS,600.0,600.0,,Active,IA EXECUTION IN PROGRESS,IA EXECUTION IN PROGRESS,IA Pending,,,,,Tucson Electric Power Company,,,,,,- canonical_gen_type: Battery\n mw: 200\n- ca...,345.0,04019,,,,,
39247,tucson-electric-power-93,Generation,West,Tucson Electric Power Company,Tucson Electric Power,93,,,2021-12-31,,2021-04-26,"Apache County, AZ",Springerville-Mckinley 345kV Transmission Line,Apache County,AZ,Other,Energy,350.0,350.0,,Active,IA EXECUTION IN PROGRESS,IA EXECUTION IN PROGRESS,IA Pending,,,,,Tucson Electric Power Company,,,,,,- canonical_gen_type: Solar\n mw: 350\n- cano...,345.0,04001,,,,,


In [14]:
parsed.iloc[1065]

{'resource_type_1': 'Solar',
 'capacity_mw_resource_1': 100,
 'resource_type_2': 'Battery',
 'capacity_mw_resource_2': 185,
 'resource_type_3': 'Wind',
 'capacity_mw_resource_3': 275}

In [12]:
parsed_df[~parsed_df.resource_type_1.isnull()]

Unnamed: 0,resource_type_1,capacity_mw_resource_1,resource_type_2,capacity_mw_resource_2,resource_type_3,capacity_mw_resource_3
1065,Solar,100.0,Battery,185.0,Wind,275.0
1066,Battery,185.0,Solar,375.0,,
1067,Battery,65.0,Solar,125.0,,
1134,Battery,30.0,Solar,80.0,,
1341,Wind,50.0,,,,
...,...,...,...,...,...,...
39238,Solar,200.0,Battery,200.0,,
39239,Solar,500.0,Battery,,,
39244,Battery,200.0,Solar,400.0,,
39247,Solar,350.0,Battery,350.0,,


In [59]:
raw_df[raw_df["capacity_by_generation_type_breakdown"]!="nan"][["capacity_by_generation_type_breakdown", "capacity_mw", "canonical_generation_types"]]

Unnamed: 0,capacity_by_generation_type_breakdown,capacity_mw,canonical_generation_types
1065,- canonical_gen_type: Solar\n mw: 100\n- cano...,375.0,Battery + Solar + Wind
1066,- canonical_gen_type: Battery\n mw: 185\n- ca...,375.0,Battery + Solar
1067,- canonical_gen_type: Battery\n mw: 65\n- can...,125.0,Battery + Solar
1134,- canonical_gen_type: Battery\n mw: 30\n- can...,80.0,Battery + Solar
1341,- canonical_gen_type: Wind\n mw: 50\n,50.0,Wind
...,...,...,...
39238,- canonical_gen_type: Solar\n mw: 200\n- cano...,200.0,Battery + Solar
39239,- canonical_gen_type: Solar\n mw: 500\n- cano...,500.0,Other
39244,- canonical_gen_type: Battery\n mw: 200\n- ca...,600.0,Other
39247,- canonical_gen_type: Solar\n mw: 350\n- cano...,350.0,Other


In [33]:
[col for col in raw_df if "date" in col]

['actual_completion_date',
 'proposed_completion_date',
 'withdrawn_date',
 'queue_date',
 'interconnection_date',
 'schedule_next_event_date',
 'most_recent_study_date']

In [30]:
raw_df["project_type"].value_counts()

Generation      37531
Transmission     1369
Load              576
Upgrade           102
Surplus           102
Replacement        27
Name: project_type, dtype: int64

Try running transform

In [7]:
transformed = dbcp.transform.fyi_queue.transform(fyi_raw_dfs)

In [19]:
transformed["fyi_projects"].dtypes

project_type                                          object
power_market                                          object
transmission_owner                                    object
canonical_transmission_owners                         object
queue_id                                              object
project_name                                          object
actual_completion_date                        datetime64[ns]
proposed_completion_date                      datetime64[ns]
withdrawn_date                                datetime64[ns]
queue_date                                    datetime64[ns]
county_state_pairs                                    object
point_of_interconnection                              object
raw_county_name                                       object
raw_state_name                                        object
canonical_generation_types                            object
interconnection_service_type                          object
capacity_mw             

In [87]:
transformed["fyi_projects"].to_parquet("fyi_projects.parquet")

In [13]:
lbnl_uri = "gs://dgm-archive/lbnl_iso_queue/queues_2023_clean_data.xlsx"
lbnl_raw_dfs = dbcp.extract.lbnl_iso_queue.extract(lbnl_uri)

In [14]:
lbnl_raw_df = lbnl_raw_dfs['lbnl_iso_queue']

In [28]:
raw_df.columns

Index(['unique_id', 'project_type', 'power_market', 'transmission_owner',
       'canonical_transmission_owners', 'queue_id', 'project_name',
       'actual_completion_date', 'proposed_completion_date', 'withdrawn_date',
       'queue_date', 'county_state_pairs', 'point_of_interconnection',
       'county', 'state', 'canonical_generation_types',
       'interconnection_service_type', 'capacity_mw', 'summer_capacity_mw',
       'winter_capacity_mw', 'queue_status', 'current_phase_or_stage_raw',
       'interconnection_status_raw', 'interconnection_status_fyi',
       'interconnection_date', 'developer', 'raw_developer', 'project_spv',
       'utility', 'iso', 'cluster', 'general_comments', 'latitude',
       'longitude', 'capacity_by_generation_type_breakdown',
       'interconnection_voltage_kv', 'fips_codes', 'schedule_next_event_date',
       'schedule_next_event_name', 'most_recent_study_date',
       'most_recent_allocated_network_upgrade_cost', 'most_recent_study_url'],
      dtyp

In [22]:
[col for col in raw_df.columns if 'poi' in col]

['point_of_interconnection']

In [30]:
raw_df['interconnection_status_fyi'].value_counts().head(20)

IA Executed                    7276
Withdrawn                      7249
System Impact Study            5029
Feasibility Study              4400
Facility Study                 2270
Cluster Study                  2103
In Progress (unknown study)    1943
Operational                    1104
Not Started                     805
Suspended                       339
IA Pending                      273
Construction                    180
Name: interconnection_status_fyi, dtype: int64