In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. dbcp) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import dbcp

In [3]:
# Show every column when a wide DataFrame is displayed (None = no limit).
pd.options.display.max_columns = None

# Run Extract

In [36]:
# Location of the archived interconnection.fyi CSV snapshot (dated 2025-09-01).
fyi_uri = "gs://dgm-archive/inconnection.fyi/interconnection_fyi_dataset_2025-09-01.csv"
# Run the project's extract step for that snapshot.
fyi_raw_dfs = dbcp.extract.fyi_queue.extract(fyi_uri)

In [37]:
# Shorthand for the raw queue table. This is the same object that is stored in
# `fyi_raw_dfs`, not a copy, so in-place edits to `raw_df` also change the dict entry.
raw_df = fyi_raw_dfs["fyi_queue"]

# Look at raw data

Check whether any projects are missing both a queue date and a queue status.

In [38]:
# Rows where queue_date and queue_status are both null.
raw_df[raw_df[["queue_date", "queue_status"]].isnull().all(axis=1)]

Unnamed: 0,unique_id,project_type,power_market,transmission_owner,canonical_transmission_owners,queue_id,project_name,actual_completion_date,proposed_completion_date,withdrawn_date,queue_date,county_state_pairs,point_of_interconnection,county,state,canonical_generation_types,interconnection_service_type,capacity_mw,summer_capacity_mw,winter_capacity_mw,queue_status,current_phase_or_stage_raw,interconnection_status_raw,interconnection_status_fyi,interconnection_date,developer,raw_developer,project_spv,utility,iso,cluster,general_comments,latitude,longitude,capacity_by_generation_type_breakdown,interconnection_voltage_kv,fips_codes,schedule_next_event_date,schedule_next_event_name,most_recent_study_date,most_recent_allocated_network_upgrade_cost,most_recent_study_url


Check whether `unique_id` is really unique. We will rename this column to `project_id`.

In [39]:
# True when no unique_id value is repeated.
raw_df["unique_id"].is_unique

True

Harmonize interconnection statuses.

In [8]:
# Columns whose name marks them as dates ("date_" prefix or "_date" suffix)
# but which are not yet stored with a datetime dtype.
date_cols = [
    name
    for name in raw_df.columns
    if (name.startswith("date_") or name.endswith("_date"))
    and not pd.api.types.is_datetime64_any_dtype(raw_df[name])
]

In [9]:
# Names of all columns that mention "status".
[name for name in raw_df if "status" in name]

['queue_status', 'interconnection_status_raw', 'interconnection_status_fyi']

In [10]:
# Frequency of each harmonized status value (value_counts skips nulls by default).
raw_df.interconnection_status_fyi.value_counts()

IA Executed                    7276
Withdrawn                      7249
System Impact Study            5029
Feasibility Study              4400
Facility Study                 2270
Cluster Study                  2103
In Progress (unknown study)    1943
Operational                    1104
Not Started                     805
Suspended                       339
IA Pending                      273
Construction                    180
Name: interconnection_status_fyi, dtype: int64

In [10]:
# Status values we expect to see in `interconnection_status_fyi`.
allowed_statuses = {
    # study stages
    "Not Started",
    "Feasibility Study",
    "System Impact Study",
    "Facility Study",
    "Cluster Study",
    "Phase 4 Study",
    "Combined",
    "In Progress (unknown study)",
    # interconnection agreement stages
    "IA Pending",
    "IA Executed",
    # later-stage / terminal states
    "Construction",
    "Operational",
    "Suspended",
    "Withdrawn",
}

In [11]:
# Distinct status values present in the raw data; unique() keeps the null entry.
fyi_status_values = set(raw_df.interconnection_status_fyi.unique())

In [12]:
# Values found in the data that are not in the allowed list.
fyi_status_values.difference(allowed_statuses)

{nan}

Look at the `capacity_by_generation_type_breakdown` column

In [42]:
# Names of all columns that mention "capacity".
[name for name in raw_df.columns if "capacity" in name]

['capacity_mw',
 'summer_capacity_mw',
 'winter_capacity_mw',
 'capacity_by_generation_type_breakdown']

In [40]:
# How many rows have (True) / lack (False) a null breakdown.
raw_df["capacity_by_generation_type_breakdown"].isna().value_counts()

True     35600
False     4107
Name: capacity_by_generation_type_breakdown, dtype: int64

In [41]:
# How many rows have (True) / lack (False) a null capacity_mw.
raw_df.capacity_mw.isna().value_counts()

False    38961
True       746
Name: capacity_mw, dtype: int64

In [21]:
# Total capacity next to the per-type breakdown, for rows that have a breakdown.
raw_df.loc[
    raw_df["capacity_by_generation_type_breakdown"].notnull(),
    ["capacity_mw", "capacity_by_generation_type_breakdown"],
]

Unnamed: 0,capacity_mw,capacity_by_generation_type_breakdown
1065,375.0,- canonical_gen_type: Solar\n mw: 100\n- cano...
1066,375.0,- canonical_gen_type: Battery\n mw: 185\n- ca...
1067,125.0,- canonical_gen_type: Battery\n mw: 65\n- can...
1134,80.0,- canonical_gen_type: Battery\n mw: 30\n- can...
1341,50.0,- canonical_gen_type: Wind\n mw: 50\n
...,...,...
39238,200.0,- canonical_gen_type: Solar\n mw: 200\n- cano...
39239,500.0,- canonical_gen_type: Solar\n mw: 500\n- cano...
39244,600.0,- canonical_gen_type: Battery\n mw: 200\n- ca...
39247,350.0,- canonical_gen_type: Solar\n mw: 350\n- cano...


In [25]:
# Full text of the first non-null breakdown, to see its format.
raw_df["capacity_by_generation_type_breakdown"].dropna().iloc[0]

'- canonical_gen_type: Solar\n  mw: 100\n- canonical_gen_type: Battery\n  mw: 185\n- canonical_gen_type: Wind\n  mw: 275\n'

Look at the capacity by generation type breakdown. `capacity_mw` is the sum of the capacities listed in `capacity_by_generation_type_breakdown`, excluding battery.

In [51]:
# Cast the breakdown to str so the `.str` checks in the following cells work on
# every row; nulls become the literal string "nan".
# Rebind `raw_df` to a new frame instead of assigning the column in place:
# `raw_df` is the same object as fyi_raw_dfs["fyi_queue"], so an in-place
# assignment would also alter the raw frame that is passed to transform later
# in this notebook.
raw_df = raw_df.assign(
    capacity_by_generation_type_breakdown=raw_df[
        "capacity_by_generation_type_breakdown"
    ].astype(str)
)

In [53]:
# Breakdown values that do not start with the expected "- canonical_gen_type:" prefix.
# The column holds strings at this point, so former nulls appear as "nan".
raw_df.loc[
    raw_df["capacity_by_generation_type_breakdown"].notnull()
    & ~raw_df["capacity_by_generation_type_breakdown"].str.startswith(
        "- canonical_gen_type:"
    ),
    "capacity_by_generation_type_breakdown",
].value_counts()

nan    35600
Name: capacity_by_generation_type_breakdown, dtype: int64

In [59]:
# Rows with a real breakdown (anything but the string "nan"), shown alongside
# the total capacity and the canonical generation types.
raw_df.loc[
    raw_df["capacity_by_generation_type_breakdown"] != "nan",
    [
        "capacity_by_generation_type_breakdown",
        "capacity_mw",
        "canonical_generation_types",
    ],
]

Unnamed: 0,capacity_by_generation_type_breakdown,capacity_mw,canonical_generation_types
1065,- canonical_gen_type: Solar\n mw: 100\n- cano...,375.0,Battery + Solar + Wind
1066,- canonical_gen_type: Battery\n mw: 185\n- ca...,375.0,Battery + Solar
1067,- canonical_gen_type: Battery\n mw: 65\n- can...,125.0,Battery + Solar
1134,- canonical_gen_type: Battery\n mw: 30\n- can...,80.0,Battery + Solar
1341,- canonical_gen_type: Wind\n mw: 50\n,50.0,Wind
...,...,...,...
39238,- canonical_gen_type: Solar\n mw: 200\n- cano...,200.0,Battery + Solar
39239,- canonical_gen_type: Solar\n mw: 500\n- cano...,500.0,Other
39244,- canonical_gen_type: Battery\n mw: 200\n- ca...,600.0,Other
39247,- canonical_gen_type: Solar\n mw: 350\n- cano...,350.0,Other


# Run Transform

In [20]:
# Run the project's FYI transform on the extracted raw frames. The result is
# indexed by table name in the cells that follow.
transformed = dbcp.transform.fyi_queue.transform(fyi_raw_dfs)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # reset project_id index


In [21]:
# List the tables produced by the transform.
transformed.keys()

dict_keys(['fyi_projects', 'fyi_locations', 'fyi_resource_capacity'])

In [34]:
# Number of resource-capacity rows per resource type.
transformed["fyi_resource_capacity"]["resource"].value_counts()

Solar             1619
Battery           1545
Wind               539
Other              415
Gas                373
Biofuel             82
Geothermal          43
Hydro               35
Pumped Storage      22
Nuclear              6
Flywheel             3
Wave                 2
Other Storage        1
Biomass              1
Compressed Air       1
Name: resource, dtype: int64

In [33]:
# Display the transformed locations table.
transformed["fyi_locations"]

Unnamed: 0,project_id,raw_county_name,raw_state_name,state_id_fips,county_id_fips,geocoded_locality_name,geocoded_locality_type,geocoded_containing_county
0,aeso-p1756,Yellowhead County,AB,,,,,
1,aeso-p1828,Cypress County,AB,,,Cypress,city,Orange County
2,aeso-p1885,Big Lakes County,AB,,,,,
3,aeso-p1926,Taber,AB,,,Taber,city,Bingham County
4,aeso-p1927,Newell County,AB,,,Newell,city,Hancock County
...,...,...,...,...,...,...,...,...
35640,wapa-ti-1001,Woodbury,IA,19,19193,woodbury,county,woodbury
35641,wapa-ti-1301,Holt,NE,31,31089,holt,county,holt
35642,wapa-ti-1302,Scotts Bluff,NE,31,31157,scotts bluff,county,scotts bluff
35643,wapa-ti-1401,Cass,ND,38,38017,cass,county,cass


In [29]:
# Number of transformed projects per queue status.
transformed["fyi_projects"]["queue_status"].value_counts()

Withdrawn      21415
Active         10130
Operational     4162
Suspended        632
Unknown           25
Name: queue_status, dtype: int64

In [30]:
# Save the projects table as parquet; the relative path resolves against the
# current working directory.
transformed["fyi_projects"].to_parquet("fyi_projects.parquet")

In [31]:
# Save the locations table as parquet; the relative path resolves against the
# current working directory.
transformed["fyi_locations"].to_parquet("fyi_locations.parquet")

In [32]:
# Save the resource-capacity table as parquet; the relative path resolves
# against the current working directory.
transformed["fyi_resource_capacity"].to_parquet("fyi_resource_capacity.parquet")

In [35]:
# Shorthand for the transformed projects table (same object, not a copy).
proj = transformed["fyi_projects"]

In [39]:
# Preview the first ten transformed projects.
proj.head(10)

Unnamed: 0,project_id,project_type,power_market,transmission_owner,canonical_transmission_owners,queue_id,project_name,actual_completion_date,proposed_completion_date,withdrawn_date,queue_date,county_state_pairs,point_of_interconnection,canonical_generation_types,interconnection_service_type,capacity_mw,summer_capacity_mw,winter_capacity_mw,queue_status,current_phase_or_stage_raw,interconnection_status_raw,interconnection_status_fyi,interconnection_date,developer,raw_developer,project_spv,utility,iso,cluster,general_comments,latitude,longitude,capacity_by_generation_type_breakdown,interconnection_voltage_kv,fips_codes,schedule_next_event_date,schedule_next_event_name,most_recent_study_date,most_recent_allocated_network_upgrade_cost,most_recent_study_url,is_actionable,is_nearly_certain
0,aeso-p1756,Generation,AESO,Alberta Electric System Operator,Alberta Electric System Operator,P1756,P1756 TPG Canyon Creek PHES Storage Project,NaT,2027-05-01,NaT,2016-03-02,"Yellowhead County, AB",29-Hinton/Edson,Pumped Storage,,139.0,,,Active,3,,In Progress (unknown study),NaT,WindRiver Power Corporation,,,,aeso,,,,,,,,NaT,,NaT,,,,
1,aeso-p1828,Generation,AESO,Alberta Electric System Operator,Alberta Electric System Operator,P1828,P1828 hep Energy Alderson MPC Solar,NaT,2026-03-05,NaT,2016-07-29,"Cypress County, AB",04-Medicine Hat,Solar,,100.5,,,Active,5,,In Progress (unknown study),NaT,,,,,aeso,,,,,,,,NaT,,NaT,,,,
2,aeso-p1885,Generation,AESO,Alberta Electric System Operator,Alberta Electric System Operator,P1885,P1885 Northern Lights WAGF,NaT,NaT,NaT,2016-11-01,"Big Lakes County, AB",26-Swan Hills,Wind,,403.0,,,Withdrawn,3,,In Progress (unknown study),NaT,,,,,aeso,,,,,,,,NaT,,NaT,,,,
3,aeso-p1926,Generation,AESO,Alberta Electric System Operator,Alberta Electric System Operator,P1926,P1926 Solar Krafte Vauxhall,NaT,2025-12-15,NaT,2017-03-23,"Taber, AB",52-Vauxhall,Solar,,60.375,,,Active,5,,In Progress (unknown study),NaT,,,,,aeso,,,,,,,,NaT,,NaT,,,,
4,aeso-p1927,Generation,AESO,Alberta Electric System Operator,Alberta Electric System Operator,P1927,P1927 Beargrass Solar,NaT,2026-10-01,NaT,2017-03-23,"Newell County, AB",47-Brooks,Solar,,360.99,,,Active,5,,In Progress (unknown study),NaT,,,,,aeso,,,,,,,,NaT,,NaT,,,,
5,aeso-p1984,Generation,AESO,Alberta Electric System Operator,Alberta Electric System Operator,P1984,P1984 FortisAlberta Gleichen DG Solar,2025-04-25,2025-04-25,NaT,2017-07-17,"Wheatland County, AB",45-Strathmore/Blackie,Solar,,11.9,,,Operational,6,,In Progress (unknown study),NaT,,,,,aeso,,,,,,,,NaT,,NaT,,,,
6,aeso-p2034,Generation,AESO,Alberta Electric System Operator,Alberta Electric System Operator,P2034,P2034 Cold Lake Energy (Lindbergh Cogeneration),NaT,2027-10-01,NaT,2018-01-02,"Bonnyville No. 87, AB",28-Cold Lake,Gas,,75.0,,,Active,5,,In Progress (unknown study),NaT,,,,,aeso,,,,,,,,NaT,,NaT,,,,
7,aeso-p2091,Generation,AESO,Alberta Electric System Operator,Alberta Electric System Operator,P2091,P2091 Arenosum Solar,NaT,2028-05-26,NaT,2018-05-22,"Special Area No. 2, AB",48-Empress,Solar,,151.0,,,Active,4,,In Progress (unknown study),NaT,,,,,aeso,,,,,,,,NaT,,NaT,,,,
8,aeso-p2102,Load,AESO,Alberta Electric System Operator,Alberta Electric System Operator,P2102,P2102 EPC East Calgary Area Reliability,2025-06-25,2025-06-25,NaT,2018-06-07,"Calgary, AB",06-Calgary,Other,,26.0,,,Operational,6,,In Progress (unknown study),NaT,,,,,aeso,,,,,,,,NaT,,NaT,,,,
9,aeso-p2195,Generation,AESO,Alberta Electric System Operator,Alberta Electric System Operator,P2195,P2195 FortisAlberta Bassano 435S DER Solar,NaT,NaT,NaT,2019-01-07,"Newell County, AB",47-Brooks,Solar,,4.65,,,Active,5,,In Progress (unknown study),NaT,,,,,aeso,,,,,,,,NaT,,NaT,,,,


In [47]:
# Names of the project-table columns that mention "county".
[name for name in proj if "county" in name]

['county_state_pairs']

# Look at LBNL data

In [13]:
# Location of the archived LBNL ISO queue workbook, then run the project's
# LBNL extract step on it.
lbnl_uri = "gs://dgm-archive/lbnl_iso_queue/queues_2023_clean_data.xlsx"
lbnl_raw_dfs = dbcp.extract.lbnl_iso_queue.extract(lbnl_uri)

In [14]:
# Shorthand for the raw LBNL queue table.
lbnl_raw_df = lbnl_raw_dfs["lbnl_iso_queue"]

In [28]:
# Column names of the raw FYI table.
# NOTE(review): this sits under the LBNL heading but inspects the FYI frame
# (`raw_df`), not `lbnl_raw_df` — confirm which table was intended.
raw_df.columns

Index(['unique_id', 'project_type', 'power_market', 'transmission_owner',
       'canonical_transmission_owners', 'queue_id', 'project_name',
       'actual_completion_date', 'proposed_completion_date', 'withdrawn_date',
       'queue_date', 'county_state_pairs', 'point_of_interconnection',
       'county', 'state', 'canonical_generation_types',
       'interconnection_service_type', 'capacity_mw', 'summer_capacity_mw',
       'winter_capacity_mw', 'queue_status', 'current_phase_or_stage_raw',
       'interconnection_status_raw', 'interconnection_status_fyi',
       'interconnection_date', 'developer', 'raw_developer', 'project_spv',
       'utility', 'iso', 'cluster', 'general_comments', 'latitude',
       'longitude', 'capacity_by_generation_type_breakdown',
       'interconnection_voltage_kv', 'fips_codes', 'schedule_next_event_date',
       'schedule_next_event_name', 'most_recent_study_date',
       'most_recent_allocated_network_upgrade_cost', 'most_recent_study_url'],
      dtype='object')

# Create a data mart table

In [55]:
from google.cloud import bigquery

# Initialize a BigQuery client for the dbcp-dev-350818 project.
# No credentials are passed explicitly here.
client = bigquery.Client(project="dbcp-dev-350818")

In [58]:
# SQL for the long-format FYI data mart table:
#   1. fyi_proj_res: inner join of FYI projects to their per-resource capacities,
#      renaming a few columns along the way.
#   2. loc: one row per project location plus frac_locations_in_county, i.e.
#      1 / (number of location rows for that project).
#   3. iso: left join of (1) to (2); projects without a location get fraction 1.0.
#   4. Final select: adds state and county names, the NCSL permitting type and a
#      constant 'fyi' source label.
query = """
WITH
    fyi_proj_res as (
        SELECT
            proj.project_id,
            proj.queue_id,
            proj.proposed_completion_date as date_proposed_online,
            proj.developer,
            proj.power_market, -- does entity in LBNL map to power_market?
            proj.interconnection_status_fyi as interconnection_status,
            proj.point_of_interconnection,
            proj.project_name,
            proj.queue_date as date_entered_queue,
            proj.queue_status,
            proj.iso as iso_region, -- does region in LBNL map to iso?
            proj.utility,
            proj.is_actionable,
            proj.is_nearly_certain,
            proj.actual_completion_date,
            proj.withdrawn_date,
            res.capacity_mw,
            res.resource_clean
        FROM `dbcp-dev-350818.private_data_warehouse_dev.fyi_projects` as proj
        INNER JOIN `dbcp-dev-350818.private_data_warehouse_dev.fyi_resource_capacity` as res
        ON proj.project_id = res.project_id
    ),
    loc as (
        -- Remember that projects can have multiple locations, though 99 percent have only one.
        -- Can optionally multiply capacity by frac_locations_in_county to allocate it equally.
        -- Note that there are some duplicates of (project_id, county_id_fips) as well.
        -- This happens when the original data lists multiple city names that are in the
        -- same county. This does not cause double counting because of frac_locations_in_county.
        SELECT
            project_id,
            state_id_fips,
            county_id_fips,
            raw_county_name, -- for validation only
            CAST(1.0 / COUNT(*) OVER (PARTITION BY project_id) AS FLOAT64) AS frac_locations_in_county
        FROM `dbcp-dev-350818.private_data_warehouse_dev.fyi_locations`
    ),
    iso as (
        SELECT
            fyi_proj_res.*,
            loc.state_id_fips,
            loc.county_id_fips,
            loc.raw_county_name, -- for validation only
            -- projects with missing location info get full capacity allocation
            coalesce(loc.frac_locations_in_county, 1.0) as frac_locations_in_county
        from fyi_proj_res
        LEFT JOIN loc
        ON fyi_proj_res.project_id = loc.project_id
    )
    SELECT
        sfip.state_name as state,
        cfip.county_name as county,
        iso.*,
        'fyi' as source,
        ncsl.permitting_type as state_permitting_type
    from iso
    left join `dbcp-dev-350818.data_warehouse.state_fips` as sfip
        on iso.state_id_fips = sfip.state_id_fips
    left join `dbcp-dev-350818.data_warehouse.county_fips` as cfip
        on iso.county_id_fips = cfip.county_id_fips
    left join `dbcp-dev-350818.data_warehouse.ncsl_state_permitting` as ncsl
        on iso.state_id_fips = ncsl.state_id_fips
    ;
"""

In [59]:
# Run the data mart query and load the result into a DataFrame.
df = client.query(query).to_dataframe()



In [63]:
# Save the query result as parquet in the current working directory.
df.to_parquet("fyi_projects_long_format.parquet")

In [73]:
from dbcp.data_mart.helpers import _estimate_proposed_power_co2e
import numpy as np

In [68]:
# NOTE(review): the return value is discarded, and the aggregation cell below
# reads a `co2e_tonnes_per_year` column from `df`; presumably this helper adds
# that column to `df` in place — confirm against dbcp.data_mart.helpers.
_estimate_proposed_power_co2e(df)

In [74]:
# Distribute project-level quantities across locations, when there are multiple.
# A handful of ISO projects are in multiple counties and the proprietary offshore
# wind projects have an entry for each cable landing.
# This approximation assumes an equal distribution between sites: capacity and
# CO2e are scaled by frac_locations_in_county, so they are split rather than
# duplicated. The facility count, however, is a plain row count per county, so a
# multi-county project is intentionally counted once in every relevant county.
#
# NOTE: the scaling below rewrites `df` in place, so re-running this cell applies
# the fraction a second time. Re-run the query cell first to start from fresh data.
allocated_cols = ["capacity_mw", "co2e_tonnes_per_year"]
df.loc[:, allocated_cols] = df.loc[:, allocated_cols].mul(
    df["frac_locations_in_county"], axis=0
)

# Rows with a null grouping key are dropped by groupby's default behaviour.
grp = df.groupby(["county_id_fips", "resource_clean"])
aggs = grp.agg(
    {
        "co2e_tonnes_per_year": "sum",  # type: ignore
        "capacity_mw": "sum",
        "project_id": "count",
    }
)
# Sums of 0 are simply unmodeled, so report them as missing. The result is
# assigned back to the column: calling replace(..., inplace=True) on the
# `aggs.loc[:, col]` selection is a chained in-place update, which does not
# modify `aggs` under pandas Copy-on-Write and raises a warning in pandas 2.2.
aggs["co2e_tonnes_per_year"] = aggs["co2e_tonnes_per_year"].replace(0, np.nan)
aggs["facility_type"] = "power plant"
aggs["status"] = "proposed"
aggs = aggs.reset_index().rename(
    columns={
        "project_id": "facility_count",
        "resource_clean": "resource_or_sector",
    }
)

In [76]:
# Display the county-by-resource aggregates.
aggs

Unnamed: 0,county_id_fips,resource_or_sector,co2e_tonnes_per_year,capacity_mw,facility_count,facility_type,status
0,04001,Battery Storage,,350.0000,1,power plant,proposed
1,04001,Solar,,350.0000,1,power plant,proposed
2,04003,Battery Storage,,0.0000,1,power plant,proposed
3,04003,Solar,,80.0000,1,power plant,proposed
4,04012,Battery Storage,,8099.8103,8,power plant,proposed
...,...,...,...,...,...,...,...
620,53077,Pumped Storage,,600.0000,1,power plant,proposed
621,53077,Solar,,1224.0000,9,power plant,proposed
622,56009,Battery Storage,,199.0000,1,power plant,proposed
623,56009,Solar,,199.0000,1,power plant,proposed


In [77]:
# Save the county aggregates as parquet in the current working directory.
aggs.to_parquet("fyi_counties_agg.parquet")

# Look at data mart tables

In [7]:
# Load the fyi_locations table from the private data warehouse output directory.
loc_df = pd.read_parquet("/app/data/output/private_data_warehouse/fyi_locations.parquet")

In [8]:
# Number of distinct county FIPS values among the locations.
# unique() keeps a missing value as its own entry, so nulls add one to this count.
len(loc_df["county_id_fips"].unique())

2641

In [13]:
# Load the fyi_resource_capacity table from the private data warehouse output directory.
res_df = pd.read_parquet("/app/data/output/private_data_warehouse/fyi_resource_capacity.parquet")

In [35]:
# In this notebook `proj_df` is otherwise first assigned in a later cell, so load
# it here as well; without this the cell raises NameError when the notebook is
# run top to bottom on a fresh kernel.
proj_df = pd.read_parquet("/app/data/output/private_data_warehouse/fyi_projects.parquet")
# Number of rows in the projects table.
len(proj_df)

36364

In [19]:
# Display the resource-capacity table (one row per project and resource).
res_df

Unnamed: 0,project_id,resource,resource_clean,capacity_mw
0,avista-110,Solar,Solar,100.0
1,avista-110,Wind,Onshore Wind,275.0
2,avista-110,Battery,Battery Storage,185.0
3,avista-111,Solar,Solar,375.0
4,avista-111,Battery,Battery Storage,185.0
...,...,...,...,...
4682,tucson-electric-power-93,Solar,Solar,350.0
4683,tucson-electric-power-93,Battery,Battery Storage,350.0
4684,tucson-electric-power-94,Solar,Solar,255.0
4685,tucson-electric-power-94,Battery,Battery Storage,255.0


In [15]:
# Load the fyi_projects table from the private data warehouse output directory.
proj_df = pd.read_parquet("/app/data/output/private_data_warehouse/fyi_projects.parquet")

In [31]:
# Keep only projects that have at least one resource-capacity row (inner join),
# giving one row per project and resource.
df = pd.merge(proj_df, res_df, how="inner", on="project_id")

In [32]:
# Row count after joining projects to resource capacities.
len(df)

4687

In [33]:
# Attach location columns; the left join keeps projects that have no location row.
# NOTE: `df` is overwritten with its own merge result, so re-running this cell
# merges the locations onto the already-merged frame again.
df = df.merge(loc_df, on="project_id", how="left")

In [34]:
# Number of distinct county FIPS values in the merged frame (a null counts as one).
len(df["county_id_fips"].unique())

240

In [26]:
# Rows per queue status in `df`.
# NOTE(review): the recorded counts add up to far more than the 4,687 rows
# reported for the projects/resources join, so this output presumably comes from
# an earlier state of `df` — re-run the notebook in order to confirm.
df.queue_status.value_counts()

withdrawn      21817
active         10485
operational     4209
suspended        641
unknown           25
Name: queue_status, dtype: Int64

In [28]:
# Number of distinct county FIPS values among rows whose queue status is "active"
# (a null county counts as one value).
len(df.loc[df["queue_status"] == "active", "county_id_fips"].unique())

1912

In [10]:
# Load the long-format FYI projects table from the data mart output directory.
long_df = pd.read_parquet("/app/data/output/data_mart/fyi_projects_long_format.parquet")

In [12]:
# Number of distinct county FIPS values in the long-format table (a null counts as one).
len(long_df["county_id_fips"].unique())

240

In [4]:
from dbcp.constants import OUTPUT_DIR

In [5]:
# Load the long-format FYI projects table, this time via the project's OUTPUT_DIR constant.
fyi_proj_df = pd.read_parquet(OUTPUT_DIR / "data_mart/fyi_projects_long_format.parquet")

In [20]:
# Load the counties_proposed_clean_projects data mart table.
prop_proj_df = pd.read_parquet(OUTPUT_DIR / "data_mart/counties_proposed_clean_projects.parquet")

In [21]:
# Display the per-county, per-resource proposed clean projects table.
prop_proj_df

Unnamed: 0,county_id_fips,resource_clean,capacity_mw,facility_count
0,04001,Battery Storage,350.0000,1
1,04001,Solar,350.0000,1
2,04012,Battery Storage,6570.3303,6
3,04012,Solar,6900.8605,6
4,04013,Battery Storage,2326.2781,6
...,...,...,...,...
259,53071,Hydro,210.0000,1
260,53071,Onshore Wind,300.0000,2
261,53071,Solar,325.0000,2
262,53077,Battery Storage,588.5000,4
