## Plan
- Compare the raw data, capacity change by iso, number of projects
- Run the data through the etl
- Compare the old and the new data mart tables

In [1]:
import pandas as pd

import dbcp

## Extract and compare the raw data

In [2]:
old_acp_uri = "gs://dgm-archive/acp/projects_Q4_2024.csv"
old_raw_dfs = dbcp.extract.acp_projects.extract(old_acp_uri)["raw_acp_projects"]

new_acp_uri = "gs://dgm-archive/acp/projects_Q1_2025.csv"
new_raw_dfs = dbcp.extract.acp_projects.extract(new_acp_uri)["raw_acp_projects"]

In [3]:
new_raw_dfs["ISORTOs"].value_counts(dropna=False)

NaN                                                4371
MISO                                               1296
PJM                                                1169
CAISO                                              1071
ISO-NE                                              903
ERCOT                                               483
NYISO                                               417
Southwest Power Pool                                257
SPP                                                 106
ISO New England                                      25
Southwest Power Pool | MISO                          12
PJM | MISO                                           12
WECC-RMRG                                             5
PJM Interconnection                                   3
NYISO | ISO-NE                                        3
NYISO | PJM                                           3
Southwest Power Pool | ERCOT                          2
Southwest Power Pool | PJM                      

In [4]:
def aggregate_acp_df(df, grp_col, agg_col):
    return df.groupby(grp_col, dropna=False).sum()[agg_col]

new_iso_agg = aggregate_acp_df(new_raw_dfs, "ISORTOs", "MW_Total_Capacity")
old_iso_agg = aggregate_acp_df(old_raw_dfs, "ISORTOs", "MW_Total_Capacity")

((new_iso_agg - old_iso_agg) / old_iso_agg).sort_values()

ISORTOs
PJM Interconnection                               -0.924679
Southwest Power Pool                              -0.012929
ERCOT                                             -0.005072
PJM                                               -0.003150
MISO                                              -0.002697
ISO New England                                    0.000000
ISO-NE                                             0.000000
Midcontinent Independent System Operator (MISO)    0.000000
Southwest Power Pool | PJM                         0.000000
NYISO | ISO-NE                                     0.000000
NYISO | PJM                                        0.000000
WECC-RMRG                                          0.000000
PJM | MISO                                         0.000000
Southwest Power Pool | ERCOT                       0.000000
Southwest Power Pool | MISO                        0.000000
SPP                                                0.000245
CAISO                           

In [5]:
new_state_agg = aggregate_acp_df(new_raw_dfs, "States", "MW_Total_Capacity")
old_state_agg = aggregate_acp_df(old_raw_dfs, "States", "MW_Total_Capacity")

((new_state_agg - old_state_agg) / old_state_agg).sort_values(ascending=False).head(10)

States
 PR    0.748659
 OR    0.344240
 DC    0.232824
 MO    0.195773
 WV    0.179949
 WI    0.178144
 CO    0.173865
 UT    0.132983
 NY    0.117532
 AL    0.109178
Name: MW_Total_Capacity, dtype: float64

In [6]:
((new_state_agg - old_state_agg) / old_state_agg).sort_values(ascending=False).tail(10)

States
 OR | WA   -0.837905
NaN              NaN
 CO | NE         NaN
 CO | WY         NaN
 IA | MO         NaN
 IL | IN         NaN
 KS | NE         NaN
 KS | OK         NaN
 MT | WY         NaN
 PA | WV         NaN
Name: MW_Total_Capacity, dtype: float64

In [7]:
new_state_agg = new_raw_dfs["States"].value_counts()
old_state_agg = old_raw_dfs["States"].value_counts()

((new_state_agg - old_state_agg) / old_state_agg).abs().sort_values(ascending=False).head(10)

 OR | WA    0.909091
 OK | TX    0.909091
 MN | SD    0.875000
 IA | MN    0.666667
 NM | TX    0.500000
 PR         0.173913
 GA         0.115108
 MO         0.109091
 DC         0.100000
 OR         0.082051
Name: States, dtype: float64

## Compare the old and new data warehouse tables
First, you'll need to update the `acp_uri` in `dbcp.etl` to point at the new data in the `dgm-archive` bucket, run the ETL and debug any failures. Then we'll compare the data mart tables.

In [8]:
from google.cloud import bigquery

def get_bigquery_table_version(dataset_id, table_name, project_id="dbcp-dev-350818"):
    """
    Get the data version of a BigQuery table.

    The dbcp.commands.publish script generates a version number for each data release
    and adds it as a label to the BQ tables.

    Args:
        dataset_id: the BQ dataset ID
        table_name: the name of the table
        project_id: the GCP project id

    Return:
        the current DBCP version number of the requested table
    """
    client = bigquery.Client()

    table_ref = f"{project_id}.{dataset_id}.{table_name}"
    table = client.get_table(table_ref)  # Fetch table metadata

    labels = table.labels  # Get the labels dictionary
    return labels["version"]


from dbcp.extract.helpers import cache_gcs_archive_file_locally

table_name = "acp_projects"
schema = "private_data_warehouse"
bq_dataset = schema + "_dev"
version = get_bigquery_table_version(bq_dataset, table_name)
uri = f"gs://dgm-outputs/{version}/{schema}/{table_name}.parquet"
data_cache = "/app/data/gcp_outputs"

acp_projects = cache_gcs_archive_file_locally(uri, data_cache)
old_acp_projects = pd.read_parquet(acp_projects)

In [39]:
# OR can use:
"""
import pandas as pd

from dbcp.helpers import get_sql_engine

engine = get_sql_engine()

with engine.connect() as con:
    new_acp_projects_2 = pd.read_sql_table("acp_projects", con, schema="private_data_warehouse")
"""

acp_project_path = "/app/data/output/private_data_warehouse/acp_projects.parquet"

new_acp_projects = pd.read_parquet(acp_project_path)

In [29]:
new_acp_projects.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10140 entries, 0 to 10139
Data columns (total 72 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   proj_id                             10140 non-null  Int64  
 1   status                              10140 non-null  string 
 2   plant_id_eia                        8844 non-null   Int64  
 3   project_name                        10140 non-null  string 
 4   phase_name                          10140 non-null  string 
 5   resource                            10140 non-null  string 
 6   developers                          5836 non-null   string 
 7   owners                              9895 non-null   string 
 8   iso_region                          5769 non-null   string 
 9   owner_types                         9860 non-null   string 
 10  capacity_mw                         10140 non-null  float64
 11  state_id_fips                       10139

In [30]:
new_iso_agg = aggregate_acp_df(new_acp_projects, "iso_region", "capacity_mw")
old_iso_agg = aggregate_acp_df(old_acp_projects, "iso_region", "capacity_mw")

((new_iso_agg - old_iso_agg) / old_iso_agg).sort_values()

  casted = values.astype(dtype, copy=copy)
  return df.groupby(grp_col, dropna=False).sum()[agg_col]
  casted = values.astype(dtype, copy=copy)
  return df.groupby(grp_col, dropna=False).sum()[agg_col]


iso_region
PJM         -0.045338
SPP         -0.010307
ERCOT       -0.003255
ISONE        0.000000
WECC-RMRG    0.000000
MISO         0.002261
CAISO        0.003721
NYISO        0.030529
<NA>         0.076721
Name: capacity_mw, dtype: float64

In [38]:
new_state_agg = aggregate_acp_df(new_acp_projects, "raw_states", "capacity_mw")
old_state_agg = aggregate_acp_df(old_acp_projects, "raw_states", "capacity_mw")

((new_state_agg - old_state_agg) / old_state_agg).sort_values(ascending=False).head(10)

  casted = values.astype(dtype, copy=copy)
  return df.groupby(grp_col, dropna=False).sum()[agg_col]
  casted = values.astype(dtype, copy=copy)
  return df.groupby(grp_col, dropna=False).sum()[agg_col]


raw_states
PR    0.722559
OR    0.344240
DC    0.232824
MO    0.195773
WI    0.177787
CO    0.174491
WV    0.171444
UT    0.132983
NY    0.117532
AL    0.109178
Name: capacity_mw, dtype: float64

In [32]:
states_with_capacity_increases = ((new_state_agg - old_state_agg) / old_state_agg).sort_values(ascending=False).head(10).index.to_list()

In [33]:
((new_state_agg - old_state_agg) / old_state_agg).sort_values(ascending=False).tail(10)

raw_states
 OR | WA   -0.837905
 CO | NE         NaN
 CO | WY         NaN
 IA | MO         NaN
 IL | IN         NaN
 KS | NE         NaN
 KS | OK         NaN
 MT | WY         NaN
 PA | WV         NaN
<NA>             NaN
Name: capacity_mw, dtype: float64

- compare capacity change by state by status? (can an acp project be withdrawn?)
- What's the difference between advanced development and under construction
- For states that have large increases in capacity, we'd expect that show up in the first phase (advanced development)
- Do a little research, ACP 
- utility dive, S&P

In [24]:
old_acp_projects["status"].value_counts()

Online                     8364
Advanced Development        774
Under Construction          574
Decommissioned              127
Online | Decommissioned      41
Name: status, dtype: Int64

In [25]:
old_acp_projects.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9881 entries, 0 to 9880
Data columns (total 72 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   proj_id                             9881 non-null   Int64  
 1   status                              9880 non-null   string 
 2   plant_id_eia                        8650 non-null   Int64  
 3   project_name                        9881 non-null   string 
 4   phase_name                          9881 non-null   string 
 5   resource                            9881 non-null   string 
 6   developers                          5565 non-null   string 
 7   owners                              9637 non-null   string 
 8   iso_region                          5770 non-null   string 
 9   owner_types                         9606 non-null   string 
 10  capacity_mw                         9881 non-null   float64
 11  state_id_fips                       9881 no

In [21]:
def pct_change_mw_by_status(new, old):

    new_agg = new.groupby("status").capacity_mw.sum()
    old_agg = old.groupby("status").capacity_mw.sum()
    return (new_agg - old_agg) / old_agg * 100

In [22]:
pct_change_mw_by_status(new_acp_projects, old_acp_projects)

status
Advanced Development       -1.995066
Decommissioned             11.965563
Online                      3.193611
Online | Decommissioned     0.000000
Under Construction          8.500073
Name: capacity_mw, dtype: float64

## % capacity change by status and ISO

In [23]:
new_acp_projects.iso_region.value_counts(dropna=False)

<NA>         4371
MISO         1297
PJM          1184
CAISO        1071
ISONE         928
ERCOT         483
NYISO         423
SPP           378
WECC-RMRG       5
Name: iso_region, dtype: Int64

In [24]:
for iso_region in new_acp_projects.iso_region.unique():
    new_prj_iso = new_acp_projects[new_acp_projects.iso_region == iso_region]
    old_prj_iso = old_acp_projects[old_acp_projects.iso_region == iso_region]

    pct_change = pct_change_mw_by_status(new_prj_iso, old_prj_iso)

    print(iso_region)
    print(pct_change)
    print()

CAISO
status
Advanced Development      -1.020903
Decommissioned             0.000000
Online                     1.233308
Online | Decommissioned    0.000000
Under Construction        -8.916254
Name: capacity_mw, dtype: float64

<NA>
Series([], Name: capacity_mw, dtype: float64)

PJM
status
Advanced Development      -30.836083
Decommissioned              0.000000
Online                      2.634876
Online | Decommissioned     0.000000
Under Construction        -10.868479
Name: capacity_mw, dtype: float64

ISONE
status
Advanced Development    0.000000
Decommissioned          0.000000
Online                  2.949164
Under Construction     -6.858421
Name: capacity_mw, dtype: float64

NYISO
status
Advanced Development    -48.257130
Online                    3.112100
Under Construction      601.199913
Name: capacity_mw, dtype: float64

MISO
status
Advanced Development       -1.572384
Decommissioned             79.666003
Online                      1.003694
Online | Decommissioned     0.000

In [27]:
for state in states_with_capacity_increases:
    new_prj_iso = new_acp_projects[new_acp_projects.raw_states == state]
    old_prj_iso = old_acp_projects[old_acp_projects.raw_states == state]

    pct_change = pct_change_mw_by_status(new_prj_iso, old_prj_iso)

    print(state)
    print(pct_change)
    print()

 PR
status
Advanced Development      0.000000
Decommissioned            0.000000
Online                    0.000000
Under Construction      253.263708
Name: capacity_mw, dtype: float64

 OR
status
Advanced Development       56.375195
Online                     28.532250
Online | Decommissioned          NaN
Under Construction        -11.704120
Name: capacity_mw, dtype: float64

 DC
status
Online    23.282443
Name: capacity_mw, dtype: float64

 MO
status
Advanced Development    26.182432
Online                  20.064258
Under Construction      15.152171
Name: capacity_mw, dtype: float64

 WI
status
Advanced Development    70.913830
Decommissioned           0.000000
Online                  12.844362
Under Construction     -39.227960
Name: capacity_mw, dtype: float64

 CO
status
Advanced Development       75.706801
Decommissioned             50.769231
Online                      4.215098
Online | Decommissioned     0.000000
Under Construction         21.147087
Name: capacity_mw, dtype: fl