## Plan
- Compare the raw data, capacity change by iso, number of projects
- Run the data through the etl
- Compare the old and the new data mart tables

In [29]:
import pandas as pd

import dbcp

## Extract and compare the raw data

In [30]:
old_acp_uri = "gs://dgm-archive/acp/projects_Q3_2024.csv"
old_raw_dfs = dbcp.extract.acp_projects.extract(old_acp_uri)["raw_acp_projects"]

new_acp_uri = "gs://dgm-archive/acp/projects_Q4_2024.csv"
new_raw_dfs = dbcp.extract.acp_projects.extract(new_acp_uri)["raw_acp_projects"]

In [31]:
new_raw_dfs["ISORTOs"].value_counts(dropna=False)

NaN                                                4111
MISO                                               1296
PJM                                                1170
CAISO                                              1071
ISO-NE                                              903
ERCOT                                               484
NYISO                                               414
Southwest Power Pool                                258
SPP                                                 106
ISO New England                                      25
Southwest Power Pool | MISO                          12
PJM | MISO                                           12
WECC-RMRG                                             5
PJM Interconnection                                   4
NYISO | ISO-NE                                        3
NYISO | PJM                                           3
Southwest Power Pool | ERCOT                          2
Southwest Power Pool | PJM                      

In [32]:
def aggregate_acp_df(df, grp_col, agg_col):
    return df.groupby(grp_col, dropna=False).sum()[agg_col]

new_iso_agg = aggregate_acp_df(new_raw_dfs, "ISORTOs", "MW_Total_Capacity")
old_iso_agg = aggregate_acp_df(old_raw_dfs, "ISORTOs", "MW_Total_Capacity")

((new_iso_agg - old_iso_agg) / old_iso_agg).sort_values()

ISORTOs
SPP                                               -0.032631
ISO-NE                                            -0.008834
ERCOT                                             -0.008030
PJM                                               -0.007363
ISO New England                                   -0.001879
CAISO                                             -0.000090
NYISO | ISO-NE                                     0.000000
NYISO | PJM                                        0.000000
WECC-RMRG                                          0.000000
PJM Interconnection                                0.000000
PJM | MISO                                         0.000000
Southwest Power Pool | ERCOT                       0.000000
Southwest Power Pool | MISO                        0.000000
Southwest Power Pool | PJM                         0.000000
Midcontinent Independent System Operator (MISO)    0.000000
MISO                                               0.003593
Southwest Power Pool            

In [33]:
new_state_agg = aggregate_acp_df(new_raw_dfs, "States", "MW_Total_Capacity")
old_state_agg = aggregate_acp_df(old_raw_dfs, "States", "MW_Total_Capacity")

((new_state_agg - old_state_agg) / old_state_agg).sort_values(ascending=False).head(10)

States
 OR | WA    5.169203
 OK | TX    4.567508
 MN | SD    4.435000
 IA | MN    1.857798
 NM | TX    0.884645
 AZ         0.555936
 PR         0.471577
 MD         0.250855
 AR         0.196688
 ID         0.164094
Name: MW_Total_Capacity, dtype: float64

In [34]:
((new_state_agg - old_state_agg) / old_state_agg).sort_values(ascending=False).tail(10)

States
 MT        -0.118903
 OR        -0.242912
 CO | NE         NaN
 CO | WY         NaN
 IA | MO         NaN
 IL | IN         NaN
 KS | NE         NaN
 KS | OK         NaN
 MT | WY         NaN
 PA | WV         NaN
Name: MW_Total_Capacity, dtype: float64

In [35]:
new_state_agg = new_raw_dfs["States"].value_counts()
old_state_agg = old_raw_dfs["States"].value_counts()

((new_state_agg - old_state_agg) / old_state_agg).abs().sort_values(ascending=False).head(10)

 OK | TX    10.000000
 OR | WA    10.000000
 MN | SD     7.000000
 IA | MN     2.000000
 NM | TX     1.000000
 ME          0.522523
 IL          0.253788
 AZ          0.193069
 NY          0.177215
 PR          0.150000
Name: States, dtype: float64

## Compare the old and new data warehouse tables
First, you'll need to update the `acp_uri` in `dbcp.etl` to point at the new data in the `dgm-archive` bucket, run the ETL and debug any failures. Then we'll compare the data mart tables.

In [39]:
from google.cloud import bigquery

def get_bigquery_table_version(dataset_id, table_name, project_id="dbcp-dev-350818"):
    """
    Get the data version of a BigQuery table.

    The dbcp.commands.publish script generates a version number for each data release
    and adds it as a label to the BQ tables.

    Args:
        dataset_id: the BQ dataset ID
        table_name: the name of the table
        project_id: the GCP project id

    Return:
        the current DBCP version number of the requested table
    """
    client = bigquery.Client()

    table_ref = f"{project_id}.{dataset_id}.{table_name}"
    table = client.get_table(table_ref)  # Fetch table metadata

    labels = table.labels  # Get the labels dictionary
    return labels["version"]


from dbcp.extract.helpers import cache_gcs_archive_file_locally

table_name = "acp_projects"
schema = "private_data_warehouse"
bq_dataset = schema + "_dev"
version = get_bigquery_table_version(bq_dataset, table_name)
uri = f"gs://dgm-outputs/{version}/{schema}/{table_name}.parquet"
data_cache = "/app/data/gcp_outputs"

acp_projects = cache_gcs_archive_file_locally(uri, data_cache)
old_acp_projects = pd.read_parquet(acp_projects)

In [42]:
acp_project_path = "/app/data/output/private_data_warehouse/acp_projects.parquet"

new_acp_projects = pd.read_parquet(acp_project_path)

In [44]:
new_acp_projects.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9881 entries, 0 to 9880
Data columns (total 72 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   proj_id                             9881 non-null   Int64  
 1   status                              9880 non-null   string 
 2   plant_id_eia                        8650 non-null   Int64  
 3   project_name                        9881 non-null   string 
 4   phase_name                          9881 non-null   string 
 5   resource                            9881 non-null   string 
 6   developers                          5565 non-null   string 
 7   owners                              9637 non-null   string 
 8   iso_region                          5770 non-null   string 
 9   owner_types                         9606 non-null   string 
 10  capacity_mw                         9881 non-null   float64
 11  state_id_fips                       9881 no

In [45]:
new_iso_agg = aggregate_acp_df(new_acp_projects, "iso_region", "capacity_mw")
old_iso_agg = aggregate_acp_df(old_acp_projects, "iso_region", "capacity_mw")

((new_iso_agg - old_iso_agg) / old_iso_agg).sort_values()

  casted = values.astype(dtype, copy=copy)
  return df.groupby(grp_col, dropna=False).sum()[agg_col]
  casted = values.astype(dtype, copy=copy)
  return df.groupby(grp_col, dropna=False).sum()[agg_col]


iso_region
ERCOT       -0.010053
ISONE       -0.008549
PJM         -0.006723
CAISO       -0.000087
WECC-RMRG    0.000000
SPP          0.000409
MISO         0.003563
NYISO        0.035088
<NA>         0.145762
Name: capacity_mw, dtype: float64

In [51]:
new_state_agg = aggregate_acp_df(new_acp_projects, "raw_states", "capacity_mw")
old_state_agg = aggregate_acp_df(old_acp_projects, "raw_states", "capacity_mw")

((new_state_agg - old_state_agg) / old_state_agg).sort_values(ascending=False).head(10)

  casted = values.astype(dtype, copy=copy)
  return df.groupby(grp_col, dropna=False).sum()[agg_col]
  casted = values.astype(dtype, copy=copy)
  return df.groupby(grp_col, dropna=False).sum()[agg_col]


raw_states
 OR | WA    5.169203
 OK | TX    4.567508
 MN | SD    4.435000
 IA | MN    1.857798
 NM | TX    0.884645
 AZ         0.555936
 PR         0.524732
 MD         0.250855
 AR         0.196688
 ID         0.164079
Name: capacity_mw, dtype: float64

In [52]:
((new_state_agg - old_state_agg) / old_state_agg).sort_values(ascending=False).tail(10)

raw_states
 MT        -0.118903
 OR        -0.242912
 CO | NE         NaN
 CO | WY         NaN
 IA | MO         NaN
 IL | IN         NaN
 KS | NE         NaN
 KS | OK         NaN
 MT | WY         NaN
 PA | WV         NaN
Name: capacity_mw, dtype: float64

- compare capacity change by state by status? (can an acp project be withdrawn?)
- What's the difference between advanced development and under construction
- For states that have large increases in capacity, we'd expect that show up in the first phase (advanced development)
- Do a little research, ACP 
- utility dive, S&P

In [55]:
old_acp_projects["status"].value_counts()

Online                     7788
Advanced Development        815
Under Construction          603
Decommissioned              126
Online | Decommissioned      40
Name: status, dtype: Int64

In [56]:
old_acp_projects.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9372 entries, 0 to 9371
Data columns (total 72 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   proj_id                             9372 non-null   Int64  
 1   status                              9372 non-null   string 
 2   plant_id_eia                        8096 non-null   Int64  
 3   project_name                        9372 non-null   string 
 4   phase_name                          9372 non-null   string 
 5   resource                            9372 non-null   string 
 6   developers                          5018 non-null   string 
 7   owners                              9137 non-null   string 
 8   iso_region                          5784 non-null   string 
 9   owner_types                         9107 non-null   string 
 10  capacity_mw                         9372 non-null   float64
 11  state_id_fips                       9372 no

In [18]:
import pandas as pd

from dbcp.helpers import get_sql_engine

engine = get_sql_engine()

with engine.connect() as con:
    new_acp_projects = pd.read_sql_table("acp_projects", con, schema="private_data_warehouse")

In [19]:
githash = "a0fc9e0"
old_acp_projects = pd.read_parquet(f"../outputs/{githash}/dev/private_data_warehouse/acp_projects.parquet")

In [24]:
def pct_change_mw_by_status(new, old):

    new_agg = new.groupby("status").capacity_mw.sum()
    old_agg = old.groupby("status").capacity_mw.sum()
    return (new_agg - old_agg) / old_agg * 100

In [25]:
pct_change_mw_by_status(new_acp_projects, old_acp_projects)

status
Advanced Development      -11.403581
Decommissioned              2.578608
Online                      5.934032
Online | Decommissioned     0.416892
Under Construction          5.628867
Name: capacity_mw, dtype: float64

## % capacity change by status and ISO

In [30]:
new_acp_projects.iso_region.value_counts(dropna=False)

None         3588
MISO         1297
PJM          1193
CAISO        1072
ISONE         931
ERCOT         489
NYISO         418
SPP           379
WECC-RMRG       5
Name: iso_region, dtype: int64

In [31]:
for iso_region in new_acp_projects.iso_region.unique():
    new_prj_iso = new_acp_projects[new_acp_projects.iso_region == iso_region]
    old_prj_iso = old_acp_projects[old_acp_projects.iso_region == iso_region]

    pct_change = pct_change_mw_by_status(new_prj_iso, old_prj_iso)

    print(iso_region)
    print(pct_change)
    print()

CAISO
status
Advanced Development      -24.509428
Decommissioned              0.000000
Online                      4.616339
Online | Decommissioned     0.000000
Under Construction        -21.645777
Name: capacity_mw, dtype: float64

None
Series([], Name: capacity_mw, dtype: float64)

PJM
status
Advanced Development      -25.538129
Decommissioned            -22.468304
Online                      4.177580
Online | Decommissioned     0.000000
Under Construction         -3.123384
Name: capacity_mw, dtype: float64

ISONE
status
Advanced Development    0.859603
Decommissioned          0.000000
Online                  3.317459
Under Construction     -6.784373
Name: capacity_mw, dtype: float64

NYISO
status
Advanced Development   -7.567308
Online                  3.476287
Under Construction     -5.726114
Name: capacity_mw, dtype: float64

MISO
status
Advanced Development      -43.586332
Decommissioned              0.000000
Online                      3.340689
Online | Decommissioned     2.5840