In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import dbcp

In [3]:
pd.set_option('display.max_columns', None)

# Run Extract

In [4]:
fyi_uri = (
    "gs://dgm-archive/inconnection.fyi/interconnection_fyi_dataset_2025-09-01.csv"
)
fyi_raw_dfs = dbcp.extract.fyi_queue.extract(fyi_uri)

In [5]:
raw_df = fyi_raw_dfs["fyi_queue"]

# Look at raw data

Check if there are any projects that don't have queue date or status.

In [6]:
raw_df[(raw_df.queue_date.isnull()) & (raw_df.queue_status.isnull())]

Unnamed: 0,unique_id,project_type,power_market,transmission_owner,canonical_transmission_owners,queue_id,project_name,actual_completion_date,proposed_completion_date,withdrawn_date,queue_date,county_state_pairs,point_of_interconnection,county,state,canonical_generation_types,interconnection_service_type,capacity_mw,summer_capacity_mw,winter_capacity_mw,queue_status,current_phase_or_stage_raw,interconnection_status_raw,interconnection_status_fyi,interconnection_date,developer,raw_developer,project_spv,utility,iso,cluster,general_comments,latitude,longitude,capacity_by_generation_type_breakdown,interconnection_voltage_kv,fips_codes,schedule_next_event_date,schedule_next_event_name,most_recent_study_date,most_recent_allocated_network_upgrade_cost,most_recent_study_url


Check if `unique_id` is really unique. We will rename this column to `project_id`

In [7]:
raw_df.unique_id.is_unique

True

Harmonize interconnection statuses.

In [8]:
date_cols = [
        col
        for col in raw_df.columns
        if (
            (col.startswith("date_") or col.endswith("_date"))
            and not pd.api.types.is_datetime64_any_dtype(raw_df.loc[:, col])
        )
    ]

In [9]:
[col for col in raw_df.columns if "status" in col]

['queue_status', 'interconnection_status_raw', 'interconnection_status_fyi']

In [10]:
raw_df["interconnection_status_fyi"].value_counts()

IA Executed                    7276
Withdrawn                      7249
System Impact Study            5029
Feasibility Study              4400
Facility Study                 2270
Cluster Study                  2103
In Progress (unknown study)    1943
Operational                    1104
Not Started                     805
Suspended                       339
IA Pending                      273
Construction                    180
Name: interconnection_status_fyi, dtype: int64

In [11]:
allowed_statuses = {
        "Cluster Study",
        "Combined",
        "Construction",
        "Facility Study",
        "Feasibility Study",
        "IA Executed",
        "IA Pending",
        "In Progress (unknown study)",
        "Not Started",
        "Operational",
        "Phase 4 Study",
        "Suspended",
        "System Impact Study",
        "Withdrawn",
    }

In [12]:
fyi_status_values = set(raw_df["interconnection_status_fyi"].unique())

In [13]:
fyi_status_values - allowed_statuses

{nan}

Look at the `capacity_by_generation_type_breakdown` column

In [14]:
[col for col in raw_df if "capacity" in col]

['capacity_mw',
 'summer_capacity_mw',
 'winter_capacity_mw',
 'capacity_by_generation_type_breakdown']

In [15]:
raw_df["capacity_by_generation_type_breakdown"].isnull().value_counts()

True     35600
False     4107
Name: capacity_by_generation_type_breakdown, dtype: int64

In [16]:
raw_df["capacity_mw"].isnull().value_counts()

False    38961
True       746
Name: capacity_mw, dtype: int64

In [21]:
cap_by_gen_df = raw_df[~raw_df["capacity_by_generation_type_breakdown"].isnull()]
cap_by_gen_df[["capacity_mw", "capacity_by_generation_type_breakdown"]]

Unnamed: 0,capacity_mw,capacity_by_generation_type_breakdown
1065,375.0,- canonical_gen_type: Solar\n mw: 100\n- cano...
1066,375.0,- canonical_gen_type: Battery\n mw: 185\n- ca...
1067,125.0,- canonical_gen_type: Battery\n mw: 65\n- can...
1134,80.0,- canonical_gen_type: Battery\n mw: 30\n- can...
1341,50.0,- canonical_gen_type: Wind\n mw: 50\n
...,...,...
39238,200.0,- canonical_gen_type: Solar\n mw: 200\n- cano...
39239,500.0,- canonical_gen_type: Solar\n mw: 500\n- cano...
39244,600.0,- canonical_gen_type: Battery\n mw: 200\n- ca...
39247,350.0,- canonical_gen_type: Solar\n mw: 350\n- cano...


In [25]:
raw_df[~raw_df["capacity_by_generation_type_breakdown"].isnull()]["capacity_by_generation_type_breakdown"].iloc[0]

'- canonical_gen_type: Solar\n  mw: 100\n- canonical_gen_type: Battery\n  mw: 185\n- canonical_gen_type: Wind\n  mw: 275\n'

Test the capacity parsing function in the transform step.

In [17]:
from dbcp.transform.fyi_queue import parse_capacity

In [22]:
parsed = cap_by_gen_df.apply(parse_capacity, result_type="expand", axis=1)

In [27]:
pd.concat([cap_by_gen_df["capacity_by_generation_type_breakdown"], parsed], axis=1)

Unnamed: 0,capacity_by_generation_type_breakdown,resource,capacity_mw
1065,- canonical_gen_type: Solar\n mw: 100\n- cano...,"[Solar, Battery, Wind]","[100, 185, 275]"
1066,- canonical_gen_type: Battery\n mw: 185\n- ca...,"[Battery, Solar]","[185, 375]"
1067,- canonical_gen_type: Battery\n mw: 65\n- can...,"[Battery, Solar]","[65, 125]"
1134,- canonical_gen_type: Battery\n mw: 30\n- can...,"[Battery, Solar]","[30, 80]"
1341,- canonical_gen_type: Wind\n mw: 50\n,[Wind],[50]
...,...,...,...
39238,- canonical_gen_type: Solar\n mw: 200\n- cano...,"[Solar, Battery]","[200, 200]"
39239,- canonical_gen_type: Solar\n mw: 500\n- cano...,"[Solar, Battery]","[500, None]"
39244,- canonical_gen_type: Battery\n mw: 200\n- ca...,"[Battery, Solar]","[200, 400]"
39247,- canonical_gen_type: Solar\n mw: 350\n- cano...,"[Solar, Battery]","[350, 350]"


Look at the capacity by generation type breakdown. Capacity is a sum of the capacities in capacity_by_generation_type_breakdown (excluding battery)

In [51]:
raw_df["capacity_by_generation_type_breakdown"] = raw_df["capacity_by_generation_type_breakdown"].astype(str)

In [53]:
raw_df[(~raw_df["capacity_by_generation_type_breakdown"].isnull()) &
~(raw_df["capacity_by_generation_type_breakdown"].str.startswith("- canonical_gen_type:"))
]["capacity_by_generation_type_breakdown"].value_counts()

nan    35600
Name: capacity_by_generation_type_breakdown, dtype: int64

In [59]:
raw_df[raw_df["capacity_by_generation_type_breakdown"]!="nan"][["capacity_by_generation_type_breakdown", "capacity_mw", "canonical_generation_types"]]

Unnamed: 0,capacity_by_generation_type_breakdown,capacity_mw,canonical_generation_types
1065,- canonical_gen_type: Solar\n mw: 100\n- cano...,375.0,Battery + Solar + Wind
1066,- canonical_gen_type: Battery\n mw: 185\n- ca...,375.0,Battery + Solar
1067,- canonical_gen_type: Battery\n mw: 65\n- can...,125.0,Battery + Solar
1134,- canonical_gen_type: Battery\n mw: 30\n- can...,80.0,Battery + Solar
1341,- canonical_gen_type: Wind\n mw: 50\n,50.0,Wind
...,...,...,...
39238,- canonical_gen_type: Solar\n mw: 200\n- cano...,200.0,Battery + Solar
39239,- canonical_gen_type: Solar\n mw: 500\n- cano...,500.0,Other
39244,- canonical_gen_type: Battery\n mw: 200\n- ca...,600.0,Other
39247,- canonical_gen_type: Solar\n mw: 350\n- cano...,350.0,Other


When is capacity reported but not capacity_by_generation_type_breakdown, and vice versa?

In [31]:
cap_no_cap_by_gen_df = raw_df[(~raw_df["capacity_mw"].isnull()) & (raw_df["capacity_by_generation_type_breakdown"].isnull())]

In [47]:
len(raw_df)

39707

In [32]:
cap_no_cap_by_gen_df.canonical_generation_types.value_counts()

Solar                            12902
Wind                              5821
Battery                           5282
Gas                               3142
Battery + Solar                   3056
Other                             2171
Hydro                              476
Coal                               407
Biomass                            231
Methane                            216
Nuclear                            179
Geothermal                         120
Gas + Oil                          114
Diesel                             109
Offshore Wind                       88
Battery + Wind                      87
Oil                                 79
Pumped Storage                      75
Battery + Solar + Wind              50
Biogas                              46
Landfill                            36
Waste Heat                          34
Solar + Wind                        24
Battery + Gas + Solar               18
Battery + Gas                       15
Fuel Cell                

In [49]:
cap_no_cap_by_gen_df.canonical_generation_types.str.replace(r'^Battery\s\+\s|\s\+\sBattery', "", regex=True)

0        Pumped Storage
1                 Solar
2                  Wind
3                 Solar
4                 Solar
              ...      
39695             Solar
39696             Solar
39697             Solar
39698             Solar
39699             Solar
Name: canonical_generation_types, Length: 34854, dtype: object

In [55]:
res_cap_df = pd.read_parquet("/app/data/output/private_data_warehouse/fyi_resource_capacity.parquet")

In [56]:
res_cap_df

Unnamed: 0,project_id,resource,resource_clean,capacity_mw
0,avista-110,Solar,Solar,100.0
1,avista-110,Battery,Battery Storage,185.0
2,avista-110,Wind,Onshore Wind,275.0
3,avista-111,Battery,Battery Storage,185.0
4,avista-111,Solar,Solar,375.0
...,...,...,...,...
36230,wapa-rocky-mountain-region-2024-g3,Solar,Solar,170.0
36231,wapa-rocky-mountain-region-2024-g4,Solar,Solar,170.0
36232,wapa-rocky-mountain-region-2025-g1,Solar,Solar,250.0
36233,wapa-rocky-mountain-region-2025-g2,Solar,Solar,200.0


In [None]:
# ids to look into
# caiso-1085, caiso-1088, caiso-472, caiso-54873, caiso-908, caiso-955, tucson-electric-power-94 (solar and battery)

# Run Transform

In [11]:
transformed = dbcp.transform.fyi_queue.transform(fyi_raw_dfs)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  location_df.dropna(


In [12]:
transformed.keys()

dict_keys(['fyi_projects', 'fyi_locations', 'fyi_resource_capacity'])

In [13]:
transformed["fyi_resource_capacity"]

Unnamed: 0,project_id,resource,capacity_mw,resource_clean
0,avista-110,Solar,100.0,Solar
1,avista-110,Wind,275.0,Onshore Wind
2,avista-110,Battery,185.0,Battery Storage
3,avista-111,Solar,375.0,Solar
4,avista-111,Battery,185.0,Battery Storage
...,...,...,...,...
4682,tucson-electric-power-93,Solar,350.0,Solar
4683,tucson-electric-power-93,Battery,350.0,Battery Storage
4684,tucson-electric-power-94,Solar,255.0,Solar
4685,tucson-electric-power-94,Battery,255.0,Battery Storage


In [14]:
transformed["fyi_locations"]

Unnamed: 0,raw_county_name,raw_state_name,state_id_fips,county_id_fips,geocoded_locality_name,geocoded_locality_type,geocoded_containing_county
0,,,,,,,
1,,,,,Cypress,city,Orange County
2,,,,,,,
3,,,,,Taber,city,Bingham County
4,,,,,Newell,city,Hancock County
...,...,...,...,...,...,...,...
35640,,,19,19193,woodbury,county,woodbury
35641,,,31,31089,holt,county,holt
35642,,,31,31157,scotts bluff,county,scotts bluff
35643,,,38,38017,cass,county,cass


In [16]:
transformed["fyi_projects"]

Unnamed: 0,project_id,project_type,power_market,transmission_owner,canonical_transmission_owners,queue_id,project_name,actual_completion_date,proposed_completion_date,withdrawn_date,queue_date,county_state_pairs,point_of_interconnection,canonical_generation_types,interconnection_service_type,capacity_mw,summer_capacity_mw,winter_capacity_mw,queue_status,current_phase_or_stage_raw,interconnection_status_raw,interconnection_status_fyi,interconnection_date,developer,raw_developer,project_spv,utility,iso,cluster,general_comments,latitude,longitude,capacity_by_generation_type_breakdown,interconnection_voltage_kv,fips_codes,schedule_next_event_date,schedule_next_event_name,most_recent_study_date,most_recent_allocated_network_upgrade_cost,most_recent_study_url,is_actionable,is_nearly_certain
0,aeso-p1756,Generation,AESO,Alberta Electric System Operator,Alberta Electric System Operator,P1756,P1756 TPG Canyon Creek PHES Storage Project,NaT,2027-05-01,NaT,2016-03-02,"Yellowhead County, AB",29-Hinton/Edson,Pumped Storage,,139.000,,,Active,3,,In Progress (unknown study),NaT,WindRiver Power Corporation,,,,aeso,,,,,,,,NaT,,NaT,,,,
1,aeso-p1828,Generation,AESO,Alberta Electric System Operator,Alberta Electric System Operator,P1828,P1828 hep Energy Alderson MPC Solar,NaT,2026-03-05,NaT,2016-07-29,"Cypress County, AB",04-Medicine Hat,Solar,,100.500,,,Active,5,,In Progress (unknown study),NaT,,,,,aeso,,,,,,,,NaT,,NaT,,,,
2,aeso-p1885,Generation,AESO,Alberta Electric System Operator,Alberta Electric System Operator,P1885,P1885 Northern Lights WAGF,NaT,NaT,NaT,2016-11-01,"Big Lakes County, AB",26-Swan Hills,Wind,,403.000,,,Withdrawn,3,,In Progress (unknown study),NaT,,,,,aeso,,,,,,,,NaT,,NaT,,,,
3,aeso-p1926,Generation,AESO,Alberta Electric System Operator,Alberta Electric System Operator,P1926,P1926 Solar Krafte Vauxhall,NaT,2025-12-15,NaT,2017-03-23,"Taber, AB",52-Vauxhall,Solar,,60.375,,,Active,5,,In Progress (unknown study),NaT,,,,,aeso,,,,,,,,NaT,,NaT,,,,
4,aeso-p1927,Generation,AESO,Alberta Electric System Operator,Alberta Electric System Operator,P1927,P1927 Beargrass Solar,NaT,2026-10-01,NaT,2017-03-23,"Newell County, AB",47-Brooks,Solar,,360.990,,,Active,5,,In Progress (unknown study),NaT,,,,,aeso,,,,,,,,NaT,,NaT,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36359,wapa-ti-1001,Transmission,West,WAPA/BEPC/HCPD Integrated System,WAPA/BEPC/HCPD Integrated System,TI-1001,,NaT,NaT,NaT,2010-09-30,"Woodbury, IA",Sioux City 161kV,Other,,,,,Withdrawn,,IAP,In Progress (unknown study),NaT,,,,WAPA/BEPC/HCPD Integrated System,,,,42.602926,-96.306326,,161.0,19193,NaT,,NaT,,,,
36360,wapa-ti-1301,Transmission,West,WAPA/BEPC/HCPD Integrated System,WAPA/BEPC/HCPD Integrated System,TI-1301,,NaT,NaT,NaT,2013-02-05,"Holt, NE",Ft Thomp-Grand Island 345kV,Other,,,,,Active,,IAP/CP,In Progress (unknown study),NaT,,,,WAPA/BEPC/HCPD Integrated System,,,,,,,345.0,31089,NaT,,NaT,,,,
36361,wapa-ti-1302,Transmission,West,WAPA/BEPC/HCPD Integrated System,WAPA/BEPC/HCPD Integrated System,TI-1302,,NaT,NaT,NaT,2013-12-16,"Scotts Bluff, NE",Stegall 345kV,Other,,,,,Active,,IAE,IA Executed,NaT,,,,WAPA/BEPC/HCPD Integrated System,,,,41.819513,-103.942528,,345.0,31157,NaT,,NaT,,,,
36362,wapa-ti-1401,Transmission,West,WAPA/BEPC/HCPD Integrated System,WAPA/BEPC/HCPD Integrated System,TI-1401,,NaT,NaT,NaT,2014-04-30,"Cass, ND",Fargo 115kV,Other,,,,,Withdrawn,,IAP,In Progress (unknown study),NaT,,,,WAPA/BEPC/HCPD Integrated System,,,,46.834563,-96.948690,,115.0,38017,NaT,,NaT,,,,


In [87]:
transformed["fyi_projects"].to_parquet("fyi_projects.parquet")

# Look at LBNL data

In [13]:
lbnl_uri = "gs://dgm-archive/lbnl_iso_queue/queues_2023_clean_data.xlsx"
lbnl_raw_dfs = dbcp.extract.lbnl_iso_queue.extract(lbnl_uri)

In [14]:
lbnl_raw_df = lbnl_raw_dfs['lbnl_iso_queue']

In [28]:
raw_df.columns

Index(['unique_id', 'project_type', 'power_market', 'transmission_owner',
       'canonical_transmission_owners', 'queue_id', 'project_name',
       'actual_completion_date', 'proposed_completion_date', 'withdrawn_date',
       'queue_date', 'county_state_pairs', 'point_of_interconnection',
       'county', 'state', 'canonical_generation_types',
       'interconnection_service_type', 'capacity_mw', 'summer_capacity_mw',
       'winter_capacity_mw', 'queue_status', 'current_phase_or_stage_raw',
       'interconnection_status_raw', 'interconnection_status_fyi',
       'interconnection_date', 'developer', 'raw_developer', 'project_spv',
       'utility', 'iso', 'cluster', 'general_comments', 'latitude',
       'longitude', 'capacity_by_generation_type_breakdown',
       'interconnection_voltage_kv', 'fips_codes', 'schedule_next_event_date',
       'schedule_next_event_name', 'most_recent_study_date',
       'most_recent_allocated_network_upgrade_cost', 'most_recent_study_url'],
      dtyp