In [2]:
import dbcp
from dbcp.extract.gridstatus_isoqueues import ISO_QUEUE_VERSIONS
import pandas as pd

## Get latest generation number for archives
Each time the [gridstatus archiver](https://github.com/deployment-gap-model-education-fund/deployment-gap-model-archiver) is run, GCS creates a new generation number for the new version of the data. The follow code grabs the latest generation number of the interconneciton queue data for each ISO.

In [3]:
from google.cloud import storage
from datetime import datetime

def get_generation_number_closest_to_date(bucket_name, blob_name, target_date):
    """Find the first GCS blob that was modified after the target_date."""
    client = storage.Client()
    bucket = client.bucket(bucket_name)

    # Enable listing versions of the blob
    blobs = bucket.list_blobs(prefix=blob_name, versions=True)

    # Filter and sort blobs by time difference
    target_timestamp = target_date.timestamp()
    
    blobs = sorted(blobs, key=lambda blob: blob.updated.timestamp())

    for blob in blobs:
        last_modified = blob.updated.timestamp()

        if target_timestamp < last_modified:
            print(blob.updated)
            return str(blob.generation)

# Example usage
bucket_name = "dgm-archive"
target_date = datetime(2024, 12, 31)  # Specify the date you want to find closest to

updated_iso_queue_version = ISO_QUEUE_VERSIONS.copy()

for iso_region in ISO_QUEUE_VERSIONS.keys():
    if iso_region == "miso-pre-2017":
        continue
    blob_name = f"gridstatus/interconnection_queues/parquet/{iso_region}.parquet"
    updated_iso_queue_version[iso_region] = get_generation_number_closest_to_date(bucket_name, blob_name, target_date)


2025-01-29 13:11:50.903000+00:00
2025-01-29 13:11:51.194000+00:00
2025-01-29 13:11:51.524000+00:00
2025-01-29 13:11:51.823000+00:00
2025-01-29 13:11:52.130000+00:00
2025-01-29 13:11:52.425000+00:00
2025-01-29 13:11:52.727000+00:00


Copy and past the old version numbers from `dbcp.extract.gridstatus_isoqueues.ISO_QUEUE_VERSIONS`.

In [6]:
old_queue_version = {
    "miso": "1728242350923420",
    "miso-pre-2017": "1709776311574737",
    "caiso": "1728242351254356",
    "pjm": "1728242351606642",
    "ercot": "1728242351929200",
    "spp": "1728242352244156",
    "nyiso": "1731568799445816",
    "isone": "1728242352913470",
}

In [7]:
old_iso_queues = dbcp.extract.gridstatus_isoqueues.extract(old_queue_version)
new_iso_queues = dbcp.extract.gridstatus_isoqueues.extract(updated_iso_queue_version)

## Compare max dates of raw data
The follow code prints out the latest date a project entered a queue for each ISO in the old and new data. We should expect the latest project date in the new data to be larger than the that of the old data. There are currently two exceptions to this:

1. CAISO: We haven't been able to figure out how CAISO publishes data about active projects in the interconneciton queue. The CAISO data from Gridstatus rarely updates so we rely on the LBNL data.
2. PJM: PJM [is working through a backlog of projects](https://www.utilitydive.com/news/pjm-fast-track-reliability-projects-interconnection-queue-invenergy/729311/) and isn't accepting new projects until mid 2026.

In [8]:
for iso_region in old_iso_queues.keys():
    if iso_region == "miso-pre-2017":
        continue
    print(iso_region)
    old_df = old_iso_queues[iso_region]
    new_df = new_iso_queues[iso_region]
    
    old_df['Queue Date'] = pd.to_datetime(old_df['Queue Date'])
    new_df['Queue Date'] = pd.to_datetime(new_df['Queue Date'])
    
    print(f" - Old max date {old_df['Queue Date'].max()}")
    print(f" - New max date {new_df['Queue Date'].max()}")
    print()

miso
 - Old max date 2024-09-27 04:00:00+00:00
 - New max date 2025-02-19 05:00:00+00:00

caiso
 - Old max date 2023-03-02 08:00:00
 - New max date 2023-03-02 08:00:00

pjm
 - Old max date 2023-07-08 00:00:00
 - New max date 2023-07-08 00:00:00

ercot
 - Old max date 2024-09-10 00:00:00
 - New max date 2024-12-26 00:00:00

spp
 - Old max date 2024-08-02 00:00:00
 - New max date 2024-12-03 00:00:00

nyiso
 - Old max date 2024-10-29 00:00:00
 - New max date 2024-12-09 00:00:00

isone
 - Old max date 2024-08-21 00:00:00
 - New max date 2025-01-14 00:00:00



## Compare data mart tables
The follow code compares the old and new total active capacity in regions.

### How to grab the new data
To get the new data, replace `dbcp.extract.gridstatus_isoqueues.ISO_QUEUE_VERSIONS` with the updated generation numbers. Then run `make all`. There might be some data validation errors due to small changes in the expected number of projects. If the changes seem reasonable, just update the expected value in the assertion. If they don't seem reason, do some digging!

Once the ETL succesfully finishes the new data is available in the databse.

<!-- - download the `dev` data to compare to
- load the relevent tables

data warehouse
- check the old and new iso have a similar n and capacity
- plot total capacity


data mart:
- total capacity, n_projects and max date have all the same: caiso, ercot, pjm
- total capacity, n_projects and max date have all increased: miso, pjm, spp, nyiso, isone
- withdrawn and in service capacity have increased: miso, pjm, spp, nyiso, isone

- active capacity has changed for isos in GS_REGIONS
- how much has the active capacity changed by? -->

In [9]:
from dbcp.helpers import get_sql_engine

engine = get_sql_engine()
with engine.connect() as con:
    new_iso_projects_long_format = pd.read_sql_table("iso_projects_long_format", con, schema="data_mart")

### How to grab the old data
The following code grabs the latest version number for data in the development datasets then downloads the parquet file.

In [10]:
from google.cloud import bigquery

def get_bigquery_table_version(dataset_id, table_name, project_id="dbcp-dev-350818"):
    """
    Get the data version of a BigQuery table.

    The dbcp.commands.publish script generates a version number for each data release
    and adds it as a label to the BQ tables.

    Args:
        dataset_id: the BQ dataset ID
        table_name: the name of the table
        project_id: the GCP project id

    Return:
        the current DBCP version number of the requested table
    """
    client = bigquery.Client()

    table_ref = f"{project_id}.{dataset_id}.{table_name}"
    table = client.get_table(table_ref)  # Fetch table metadata

    labels = table.labels  # Get the labels dictionary
    return labels["version"]

In [11]:
from dbcp.extract.helpers import cache_gcs_archive_file_locally

table_name = "iso_projects_long_format"
version = get_bigquery_table_version("data_mart_dev", table_name)
uri = f"gs://dgm-outputs/{version}/data_mart/{table_name}.parquet"
data_cache = "/app/data/gcp_outputs"

iso_projects_long_format_path = cache_gcs_archive_file_locally(uri, data_cache)
old_iso_projects_long_format = pd.read_parquet(iso_projects_long_format_path)

In [12]:
def agg_iso_projects_long_format(df):
    """Calculate some aggregate metrics for each ISO"""
    agg = df.groupby("iso_region").agg({"surrogate_id": "count", "capacity_mw": "sum", "date_entered_queue": "max"})
    agg = agg.rename(columns={"surrogate_id": "n_projects", "capacity_mw": "total_capacity_mw", "date_entered_queue": "max_date_entered_queue"})
    return agg

old_project_agg = agg_iso_projects_long_format(old_iso_projects_long_format)
new_project_agg = agg_iso_projects_long_format(new_iso_projects_long_format)

In [13]:
new_project_agg.max_date_entered_queue

iso_region
CAISO                 2023-04-17 00:00:00
ERCOT                 2024-12-26 00:00:00
ISONE                 2025-01-14 00:00:00
MISO                  2025-02-19 05:00:00
NYISO                 2024-12-09 00:00:00
PJM                   2023-07-08 00:00:00
SPP                   2024-12-03 00:00:00
Southeast (non-ISO)   2023-12-15 00:00:00
West (non-ISO)        2023-12-30 00:00:00
Name: max_date_entered_queue, dtype: datetime64[ns]

In [15]:
both_project_aggs = old_project_agg.merge(new_project_agg, left_index=True, right_index=True, validate="1:1", suffixes=("_old", "_new"))
both_project_aggs

Unnamed: 0_level_0,n_projects_old,total_capacity_mw_old,max_date_entered_queue_old,n_projects_new,total_capacity_mw_new,max_date_entered_queue_new
iso_region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CAISO,1410,500444.8,2023-04-17 00:00:00,1410,500444.8,2023-04-17 00:00:00
ERCOT,1605,322303.2,2024-09-10 00:00:00,1661,333642.24,2024-12-26 00:00:00
ISONE,503,60720.3512,2024-08-21 00:00:00,459,56635.314,2025-01-14 00:00:00
MISO,2186,378040.1,2024-09-27 04:00:00,2261,389585.19,2025-02-19 05:00:00
NYISO,450,75817.11,2024-10-15 00:00:00,448,79796.68,2024-12-09 00:00:00
PJM,2569,203615.4318,2023-07-08 00:00:00,1866,148290.9228,2023-07-08 00:00:00
SPP,610,121268.002,2024-08-02 00:00:00,659,134342.702,2024-12-03 00:00:00
Southeast (non-ISO),1072,136024.03,2023-12-15 00:00:00,1072,136024.03,2023-12-15 00:00:00
West (non-ISO),2542,488401.05,2023-12-30 00:00:00,2542,488401.05,2023-12-30 00:00:00


In [16]:
# Calculate the differences between the old and new
for col in old_project_agg.columns:
    if pd.api.types.is_datetime64_any_dtype(old_project_agg[col]):
        continue
    else:
        both_project_aggs[f"{col}_pct_diff"] = (both_project_aggs[f"{col}_new"] - both_project_aggs[f"{col}_old"]) / both_project_aggs[f"{col}_old"]

In [18]:
old_project_agg

both_project_aggs.sort_values(by="total_capacity_mw_old", ascending=False)[["n_projects_pct_diff", "total_capacity_mw_pct_diff"]] * 100

Unnamed: 0_level_0,n_projects_pct_diff,total_capacity_mw_pct_diff
iso_region,Unnamed: 1_level_1,Unnamed: 2_level_1
CAISO,0.0,0.0
West (non-ISO),0.0,0.0
MISO,3.430924,3.053933
ERCOT,3.489097,3.518128
PJM,-27.364733,-27.171079
Southeast (non-ISO),0.0,0.0
SPP,8.032787,10.781657
NYISO,-0.444444,5.248908
ISONE,-8.747515,-6.727624


We don't use Gridstatus for CAISO for the reasons stated above so we filter it out in this analysis.

In [19]:
from dbcp.data_mart.projects import GS_REGIONS

changed_project_aggs = both_project_aggs[both_project_aggs.index.isin(GS_REGIONS)]

changed_project_aggs

Unnamed: 0_level_0,n_projects_old,total_capacity_mw_old,max_date_entered_queue_old,n_projects_new,total_capacity_mw_new,max_date_entered_queue_new,n_projects_pct_diff,total_capacity_mw_pct_diff
iso_region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ERCOT,1605,322303.2,2024-09-10 00:00:00,1661,333642.24,2024-12-26 00:00:00,0.034891,0.035181
ISONE,503,60720.3512,2024-08-21 00:00:00,459,56635.314,2025-01-14 00:00:00,-0.087475,-0.067276
MISO,2186,378040.1,2024-09-27 04:00:00,2261,389585.19,2025-02-19 05:00:00,0.034309,0.030539
NYISO,450,75817.11,2024-10-15 00:00:00,448,79796.68,2024-12-09 00:00:00,-0.004444,0.052489
PJM,2569,203615.4318,2023-07-08 00:00:00,1866,148290.9228,2023-07-08 00:00:00,-0.273647,-0.271711
SPP,610,121268.002,2024-08-02 00:00:00,659,134342.702,2024-12-03 00:00:00,0.080328,0.107817


Make sure there isn't an surprising change in total capacity between the old and new data. We currently don't expect the active capacity to change that much in the span of a quarter. The `max_change` value is an arbitrary number so dig into the data if something looks fishy to you.

It's challenging to validate total capacity changes in ISOs. If there is an unexpected change, I would check the ISO's website to see if they changed their study process. For example, there was a surprising drop in active capacity in NYISO during the 2024 Q4 update. It turns out they [changed their study process](https://www.utilitydive.com/news/new-york-iso-reforms-interconnection-queue-launches-cluster-study/724054/) and the layout of the spreadsheet Gridstatus pulls in. Sites like S&P and Utility Drive might have relevant informaiton.

In [21]:
mw_pct_diff = changed_project_aggs["total_capacity_mw_pct_diff"].abs()
max_change = 0.2
assert mw_pct_diff.lt(max_change).all(), f"{mw_pct_diff} substantial change in an ISO's interconneciton queue active capacity."

AssertionError: iso_region
ERCOT    0.035181
ISONE    0.067276
MISO     0.030539
NYISO    0.052489
PJM      0.271711
SPP      0.107817
Name: total_capacity_mw_pct_diff, dtype: float64 substantial change in an ISO's interconneciton queue active capacity.

In [22]:
changed_project_aggs["total_capacity_mw_pct_diff"] * 100

iso_region
ERCOT     3.518128
ISONE    -6.727624
MISO      3.053933
NYISO     5.248908
PJM     -27.171079
SPP      10.781657
Name: total_capacity_mw_pct_diff, dtype: float64

## ISO Capacity Change
The `iso_regions_active_projects_capacity_mw_change_log` data mart table contains historic snapshots of total active capacity in the ISO queues. Ploting the change over time if helpful for identifying issues with the data update.

In [23]:
with engine.connect() as con:
    iso_regions_active_projects_capacity_mw_change_log = pd.read_sql_table("iso_regions_active_projects_capacity_mw_change_log", con, schema="data_mart")

In [24]:
iso_regions_active_projects_capacity_mw_change_log.groupby("iso_region").report_date.max()

iso_region
CAISO   2025-03-31
ISONE   2025-03-31
MISO    2025-03-31
NYISO   2025-03-31
PJM     2027-09-30
SPP     2025-03-31
Name: report_date, dtype: datetime64[ns]

In [25]:
chnglog = iso_regions_active_projects_capacity_mw_change_log.groupby(["iso_region", "report_date"]).sum().reset_index()

for iso_region in chnglog.iso_region.unique():
    iso_df = chnglog.query("iso_region == @iso_region")
    iso_df = iso_df[iso_df.report_date.dt.year.gt(2017) & (iso_df.report_date < "2025-01-01")]
    iso_df = iso_df.set_index("report_date")
    iso_df.plot.bar(color=["green", "red", "grey"], title=iso_region, stacked=True)

ImportError: matplotlib is required for plotting when the default backend "matplotlib" is selected.


## State Comparison
Compare old and updated total renewable and storage capacity by state.

Grab the old and updated `counties_wide_format` table. Each row in this table contains aggregate metrics about energy for a given county.

In [26]:
table_name = "counties_wide_format"

with engine.connect() as con:
    new_counties_wide_format = pd.read_sql_table(table_name, con, schema="data_mart")


version = get_bigquery_table_version("data_mart_dev", table_name)
uri = f"gs://dgm-outputs/{version}/data_mart/{table_name}.parquet"
data_cache = "/app/data/gcp_outputs"

counties_wide_format_path = cache_gcs_archive_file_locally(uri, data_cache)
old_counties_wide_format = pd.read_parquet(counties_wide_format_path)

Aggregate the county table by state and sum by proposed renewable and battery capcity. Calculate the percent change between the old and updated data.

In [27]:
def group_and_compare(raw_new, raw_old, groupby_col, metric_col):
    grouped_new = raw_new.groupby(groupby_col)[metric_col].sum()
    grouped_old = raw_old.groupby(groupby_col)[metric_col].sum()

    return (grouped_new - grouped_old) / grouped_old

state_pct_change = group_and_compare(new_counties_wide_format, old_counties_wide_format, "state", "renewable_and_battery_proposed_capacity_mw")

state_pct_change.sort_values()

state
New Hampshire              -0.540108
Ohio                       -0.340631
Virginia                   -0.328919
Connecticut                -0.285079
West Virginia              -0.278632
Rhode Island               -0.271935
Kentucky                   -0.258522
Maryland                   -0.191538
Pennsylvania               -0.150430
North Carolina             -0.127393
Indiana                    -0.110271
Massachusetts              -0.068535
Maine                      -0.064783
Illinois                   -0.064306
Delaware                   -0.052479
Nebraska                   -0.037155
New Jersey                 -0.035020
Wisconsin                  -0.011983
Iowa                       -0.007893
Tennessee                  -0.003660
Mississippi                -0.003301
Vermont                     0.000000
South Carolina              0.000000
Oregon                      0.000000
Washington                  0.000000
Utah                        0.000000
Alabama                     0.00

In [28]:
import us

def fips_to_abbreviation(fips_code):
    """Convert fips_code to state abbreviate"""
    state = us.states.lookup(str(fips_code).zfill(2))  # Ensure FIPS is zero-padded
    return state.abbr if state else None

ModuleNotFoundError: No module named 'us'

In [29]:
import plotly.express as px
import pandas as pd

df = state_pct_change.reset_index()
df.columns = ["state", "pct_change"]

df["state"] = df.state.apply(fips_to_abbreviation)

# Create the choropleth map
fig = px.choropleth(
    df,
    locations="state",
    locationmode="USA-states",  # Matches state names to US state locations
    color="pct_change",
    color_continuous_scale="RdYlGn",  # Red for negative, Green for positive changes
    range_color=[-1.0, 1.0],  # Scale based on actual data range
    title="Percentage Change by State",
    scope="usa"
)

fig.show()


NameError: name 'fips_to_abbreviation' is not defined

## Project level changes
Dig into states with unexpected capacity changes by looking as the status changes of projects.

In [30]:
from dbcp.data_mart.projects import create_long_format

# The dataframe this function returns includes all projects, active, withdrawn and operational. ERCOT only tracks active projects.
new_all_projects_long_format = create_long_format(engine, active_projects_only=False)

In [31]:
gs_old_iso_projects_long_format = old_iso_projects_long_format.query("source == 'gridstatus'")
gs_old_iso_projects_long_format = gs_old_iso_projects_long_format[gs_old_iso_projects_long_format.resource_class.isin(("remewable", "storage"))]

gs_new_all_projects_long_format = new_all_projects_long_format.query("source == 'gridstatus'")
gs_new_all_projects_long_format = gs_new_all_projects_long_format[gs_new_all_projects_long_format.resource_class.isin(("remewable", "storage"))]


# combine the queue_id and the iso region to create a project identifier.
gs_new_all_projects_long_format["full_queue_id"] = gs_new_all_projects_long_format.iso_region + "-" + gs_new_all_projects_long_format.queue_id
gs_new_all_projects_long_format["full_queue_id"] = gs_new_all_projects_long_format.iso_region + "-" + gs_new_all_projects_long_format.queue_id

In [32]:
assert gs_old_iso_projects_long_format["queue_status"].eq("active").all()

In [33]:
gs_old_iso_projects_long_format["full_queue_id"] = gs_old_iso_projects_long_format.iso_region + "-" + gs_old_iso_projects_long_format.queue_id
gs_new_all_projects_long_format["full_queue_id"] = gs_new_all_projects_long_format.iso_region + "-" + gs_new_all_projects_long_format.queue_id

What percent of total capacity in each state was in the old data but not in the new data? For all ISO except ERCOT (Texas) which doesn't keep track of withdrawn and in services projects, we'd expect projects that were active in the old data to still be present in the updated data. In other words, projects shouldn't have completely disapeared from the interconneciton queues.

In [34]:
old_projects_not_in_new_projects = ~gs_old_iso_projects_long_format.full_queue_id.isin(gs_new_all_projects_long_format.full_queue_id)
capacity_by_state_old_projects_not_in_new_projects = gs_old_iso_projects_long_format[old_projects_not_in_new_projects].groupby("state").capacity_mw.sum()
capacity_by_state_old_projects = old_iso_projects_long_format.groupby("state").capacity_mw.sum()

(capacity_by_state_old_projects_not_in_new_projects / capacity_by_state_old_projects).dropna().sort_values(ascending=False)

state
Texas           0.018755
New York        0.004685
Pennsylvania    0.003803
Ohio            0.001613
Virginia        0.000414
Arkansas        0.000000
Name: capacity_mw, dtype: float64

Investigate what happened to the active projects in the old data. The projects should still be active, withdrawn or in service.

In [35]:
# Grab all projects in the new data that existed in the old data
new_projects_in_old = gs_new_all_projects_long_format[gs_new_all_projects_long_format.full_queue_id.isin(gs_old_iso_projects_long_format["full_queue_id"])]

# Group by state and queue status and sum the capacity
new_projects_in_old_status_state_agg = new_projects_in_old.groupby(["state", "queue_status"]).capacity_mw.sum()

# Combine with total capacity change by state so we can investigate the most dramatic changes.
state_pct_change_reshape = state_pct_change.reset_index()
state_pct_change_reshape.columns = ["state", "capacity_mw_pct_change"]
new_projects_in_old_status_state_agg = new_projects_in_old_status_state_agg.reset_index().merge(state_pct_change_reshape, on="state", how="left")

In [36]:
new_projects_in_old_status_state_agg.set_index(["state", "queue_status"]).sort_values(by="capacity_mw_pct_change").head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,capacity_mw,capacity_mw_pct_change
state,queue_status,Unnamed: 2_level_1,Unnamed: 3_level_1
New Hampshire,withdrawn,401.68,-0.540108
New Hampshire,active,226.0,-0.540108
Ohio,active,3071.0,-0.340631
Ohio,suspended,50.0,-0.340631
Ohio,withdrawn,2204.0,-0.340631
Virginia,withdrawn,5048.4,-0.328919
Virginia,active,10708.72,-0.328919
Connecticut,active,3608.933,-0.285079
Connecticut,withdrawn,1945.388,-0.285079
West Virginia,active,473.0,-0.278632


In [37]:
# The ISO the state is in might provide withdrawn dates. Make sure these projects were withdrawn recently

state = "Indiana"
new_projects_in_old.query("state == @state & queue_status == 'withdrawn'").groupby(pd.Grouper(key="withdrawn_date", freq="Q")).count().full_queue_id

withdrawn_date
2024-12-31    19
2025-03-31     2
Freq: Q-DEC, Name: full_queue_id, dtype: int64

## PJM Vaidation (Q4 2024)
- how many projects in the old are not in the new data?
- For how many projects in the new and old data, how did their status change? When did the status change?

In [80]:
iso_region = "PJM"

new_pjm = gs_new_all_projects_long_format.query("iso_region == @iso_region")
old_pjm = gs_old_iso_projects_long_format.query("iso_region == @iso_region")

In [81]:
old_pjm.queue_status.value_counts()

active    548
Name: queue_status, dtype: Int64

In [82]:
# how many projects in the old are not in the new data?
len(old_pjm[~old_pjm.full_queue_id.isin(new_pjm.full_queue_id)]) / len(old_pjm)

0.0036496350364963502

In [83]:
pjm_in_both = new_pjm[new_pjm.full_queue_id.isin(old_pjm.full_queue_id)]

pjm_in_both["queue_status"].value_counts()

active         376
withdrawn      168
suspended        1
operational      1
Name: queue_status, dtype: int64

In [84]:
pjm_in_both.query("queue_status == 'withdrawn'")["withdrawn_date"].dt.year.value_counts(dropna=False)

2024    149
2025     19
Name: withdrawn_date, dtype: int64

In [85]:
pjm_in_both.query("queue_status == 'withdrawn'")["withdrawn_date"].value_counts(dropna=False)

2024-12-18    102
2024-12-17     30
2025-01-22     13
2025-01-14      4
2024-12-19      2
2024-12-03      2
2024-12-12      2
2024-12-09      1
2024-11-27      1
2025-01-28      1
2024-10-09      1
2024-11-11      1
2024-12-24      1
2024-12-02      1
2025-01-15      1
2024-10-14      1
2024-10-17      1
2024-11-15      1
2024-11-05      1
2024-12-26      1
Name: withdrawn_date, dtype: int64

In [86]:
pjm_in_both.query("queue_status == 'withdrawn'").date_entered_queue.dt.year.value_counts()

2021    138
2020     25
2022      4
2019      1
Name: date_entered_queue, dtype: int64

In [87]:
pjm_in_both.query("queue_status == 'withdrawn'").state.value_counts()

Virginia          79
Ohio              21
Pennsylvania      18
Indiana           15
Illinois           8
West Virginia      7
Maryland           7
Kentucky           4
New Jersey         4
North Carolina     4
Delaware           1
Name: state, dtype: int64

In [88]:
pjm_in_both.query("queue_status == 'withdrawn'").resource_clean.value_counts()

Battery Storage    168
Name: resource_clean, dtype: int64

## Compare county_wide_coverage
Often when doing an ISO queue update the `test_county_wide_coverage` test will fail. Hopefully by no more than a few dozen counties. Run the following cells to look into counties that had technical data in the old data but none in the update. Hopefully they align with the changes investigated above.

In [50]:
from dbcp.validation.tests import _get_non_county_cols_from_wide_format

cols_to_fetch = _get_non_county_cols_from_wide_format(engine)

In [51]:
old_null = old_counties_wide_format[~old_counties_wide_format[cols_to_fetch].notnull().any(axis=1)]
new_null = new_counties_wide_format[~new_counties_wide_format[cols_to_fetch].notnull().any(axis=1)]

In [52]:
# grab counties that are null in the new data but not null in old
null_in_new = new_null[~new_null.county_id_fips.isin(old_null.county_id_fips)]

In [53]:
null_in_new.state.value_counts()

Kentucky         6
Virginia         6
Ohio             3
New Hampshire    2
Indiana          1
Missouri         1
West Virginia    1
Name: state, dtype: int64

In [54]:
# Grab the old county data for counties that are missing data in the update.
old_counties_without_data_in_update = old_counties_wide_format[old_counties_wide_format.county_id_fips.isin(null_in_new.county_id_fips)]

assert old_counties_without_data_in_update.renewable_and_battery_proposed_capacity_mw.fillna(0).lt(500).all(), "There is a county that saw a lot of capacity disappear!"

In [55]:
old_counties_without_data_in_update[["county", "state", "renewable_and_battery_proposed_capacity_mw"]].sort_values(by="renewable_and_battery_proposed_capacity_mw")

Unnamed: 0,county,state,renewable_and_battery_proposed_capacity_mw
3195,Belknap,New Hampshire,19.9
328,Floyd,Virginia,20.0
2257,Williamsburg,Virginia,20.0
585,Pike,Ohio,21.0
2908,Warren,Ohio,39.87
2804,Brooke,West Virginia,40.0
2745,Lynchburg,Virginia,40.0
840,Cheshire,New Hampshire,49.5
2743,Letcher,Kentucky,50.0
133,Scott,Kentucky,65.0
