In [None]:
# import dbcp
# from dbcp.helpers import get_sql_engine

# import pandas as pd


# engine = get_sql_engine()
# iso_projects = dbcp.data_mart.projects._get_and_join_iso_tables(engine)

# iso_projects.to_parquet("iso_projects_with_normalized_gs.parquet")

## Load projects
I didn't pull `county_wide` because I wanted project-level information to create aggregated columns like "primary ISO" and "has an ISO project" for a given county. To do this I created parquet files of the dataframe returned by `dbcp.data_mart.projects._get_and_join_iso_tables(engine)` on `dev` and `rebase-gridstatus-iso-queues-to-dev`. The projects on `rebase-gridstatus-iso-queues-to-dev` use the old archives we have for GS data except for SPP and ISONE because the certainty and actionable columns weren't available until archives from 12/23. This isn't ideal because it is difficult to recreate, but it felt like the best way to observe differences between the versions.

In [None]:
import pandas as pd

In [None]:
# Load the project-level snapshot saved without the GridStatus data
# (presumably the `dev` version described in the markdown above).
old_projects = pd.read_parquet("iso_projects_without_gs.parquet")
# old_counties_wide["county_id_fips"] = old_counties_wide.county_id_fips.astype("string").str.zfill(5)
old_projects.head()

In [None]:
old_projects.select_dtypes("object").columns

In [None]:
# Load the project-level snapshot that includes the normalized GridStatus data.
new_projects = pd.read_parquet("iso_projects_with_normalized_gs.parquet")

# Object-dtype columns, for comparison with the old snapshot.
new_projects.select_dtypes(include="object").columns

## Explore data variations

In [None]:
new_projects.shape

In [None]:
# Latest queue-entry date in each snapshot (new first, then old).
for projects in (new_projects, old_projects):
    print(projects.date_entered_queue.max())

In [None]:
# Keep only projects that entered the queue during 2023.
entered_in_2023 = new_projects.date_entered_queue.dt.year.eq(2023)
new_projects_2023 = new_projects[entered_in_2023]
new_projects_2023.iso_region.value_counts()

In [None]:
new_projects_2023.date_entered_queue.dt.month.value_counts()

In [None]:
# 2023 projects that entered the queue after April, counted per region.
entered_after_april = new_projects_2023.date_entered_queue.dt.month > 4
new_projects_2023[entered_after_april].iso_region.value_counts()

This makes sense given the earliest snapshot we have for ISOs (excluding SPP and ISONE) is April 2023. Also, everything after April 2023 is from ISONE and SPP.

## Aggregate project dataframes by county

In [None]:
def aggregate_iso_projects_by_count(df):
    """Aggregate a project-level dataframe to one row per county.

    Args:
        df: Project dataframe with ``county_id_fips``, ``iso_region``,
            ``capacity_mw``, ``is_actionable`` and ``is_nearly_certain``
            columns. The two flag columns are used as boolean masks.

    Returns:
        Dataframe with one row per ``county_id_fips`` and the columns
        ``has_iso_project``, ``primary_iso_region``, ``capacity_mw``,
        ``actionable_capacity_mw``, ``nearly_certain_capacity_mw``,
        ``actionable_n_projects`` and ``nearly_certain_n_projects``.
    """

    def contains_iso_project(grp):
        # True when at least one region label in the county does not contain "non-ISO".
        # NOTE(review): a null iso_region raises TypeError here — confirm the column has no nulls.
        return any("non-ISO" not in region for region in grp)

    def get_primary_iso(grp):
        # There are 16 counties that have equal number of projects in multiple regions. Select the first one.
        # Return a scalar rather than a length-1 Series so the aggregation does not
        # rely on pandas unwrapping it; a county with only null regions gets None.
        modes = grp.mode()
        return modes.iloc[0] if len(modes) else None

    by_county = df.groupby("county_id_fips")
    agg_df = by_county.agg(
        has_iso_project=pd.NamedAgg(column="iso_region", aggfunc=contains_iso_project),
        primary_iso_region=pd.NamedAgg(column="iso_region", aggfunc=get_primary_iso),
        capacity_mw=pd.NamedAgg(column="capacity_mw", aggfunc="sum"),
    )

    def flagged_capacity_mw(flag_column):
        # Sum capacity over the flagged projects only. Counties without any flagged
        # project are missing from the grouped sum, so reindex them back in as 0.
        flagged = df[df[flag_column]]
        return (
            flagged.groupby("county_id_fips")["capacity_mw"]
            .sum()
            .reindex(agg_df.index, fill_value=0)
        )

    agg_df["actionable_capacity_mw"] = flagged_capacity_mw("is_actionable")
    agg_df["nearly_certain_capacity_mw"] = flagged_capacity_mw("is_nearly_certain")

    agg_df["actionable_n_projects"] = by_county["is_actionable"].sum()
    # Sum the flag (as for the actionable count); .count() would count every
    # non-null row instead of the projects flagged as nearly certain.
    agg_df["nearly_certain_n_projects"] = by_county["is_nearly_certain"].sum()
    return agg_df.reset_index()

# Build one county-level summary per snapshot so the two versions can be compared.
new_projects_counties = aggregate_iso_projects_by_count(new_projects)
old_projects_counties = aggregate_iso_projects_by_count(old_projects)

## Number of counties with projects

In [None]:
# County FIPS codes present in each snapshot's county-level summary.
new_county_fips = set(new_projects_counties.county_id_fips)
old_county_fips = set(old_projects_counties.county_id_fips)

n_counties_with_projects_in_new_not_in_old = len(new_county_fips.difference(old_county_fips))
n_counties_with_projects_in_old_not_in_new = len(old_county_fips.difference(new_county_fips))

print(n_counties_with_projects_in_new_not_in_old)
print(n_counties_with_projects_in_old_not_in_new)

In [None]:
print(len(old_projects_counties))
print(len(new_projects_counties))

## Make sure counties that don't have any ISO projects capacity remain unchanged

In [None]:
project_counties = old_projects_counties.merge(new_projects_counties, on="county_id_fips", how="outer", validate="1:1", suffixes=("_old", "_new"))

In [None]:
# Cast both flags to the nullable boolean dtype; the outer merge can leave
# missing values for counties present in only one snapshot.
for flag_column in ("has_iso_project_old", "has_iso_project_new"):
    project_counties[flag_column] = project_counties[flag_column].astype("boolean")

In [None]:
print((~project_counties.has_iso_project_old).value_counts())
print()
print((~project_counties.has_iso_project_new).value_counts())

Pretty similar number of counties that don't have any ISO projects. Let's plot them to make sure it makes sense with the ISO boundaries.

In [None]:
# A county has no ISO projects when neither snapshot reports one.
has_iso_project_in_either = project_counties.has_iso_project_new | project_counties.has_iso_project_old
is_county_without_iso_projects = ~has_iso_project_in_either
print(is_county_without_iso_projects.value_counts())

counties_without_iso_projects = project_counties[is_county_without_iso_projects]

In [None]:
# Enable plotly rendering inside the notebook (see the link below).
import plotly.offline as pyo
pyo.init_notebook_mode()

# https://stackoverflow.com/questions/52771328/plotly-chart-not-showing-in-jupyter-notebook
from urllib.request import urlopen
import json
# County boundary GeoJSON from the plotly datasets repository; requires network access.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)


import plotly.express as px

# Map the counties that have no ISO projects in either snapshot.
# NOTE(review): `has_iso_project_new` is a boolean column, so the continuous
# colour scale and range arguments probably have no effect — confirm.
fig = px.choropleth(counties_without_iso_projects, geojson=counties, locations='county_id_fips', color='has_iso_project_new',
                           color_continuous_scale="RdYlGn",
                           range_color=(-4, 4),
                           scope="usa",
                           labels={'has_iso_project_new': "Counties that don't have any ISO projects in new and old data"},
                          )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})


Great! That roughly aligns with [ISO borders](https://hifld-geoplatform.opendata.arcgis.com/datasets/50f80920d36e435d9a34db2bd0fd3ad8/explore?location=32.163459%2C-97.519448%2C5.23).

In [None]:
assert (counties_without_iso_projects.capacity_mw_old.eq(counties_without_iso_projects.capacity_mw_new)).all(), "Capacity in counties without ISO projects has changed!"

## Compare overall capacity between new and old data amongst counties with ISO projects

In [None]:
# A county counts as having ISO projects when either snapshot reports one.
iso_in_new = project_counties.has_iso_project_new
iso_in_old = project_counties.has_iso_project_old
is_county_with_iso_projects = iso_in_new | iso_in_old
print(is_county_with_iso_projects.value_counts())

# Copy so the derived columns added later do not write into a slice of project_counties.
counties_with_iso_projects = project_counties[is_county_with_iso_projects].copy()

In [None]:
# Absolute change (new - old) and relative change (percent of old) in total county capacity.
# NOTE(review): counties whose capacity_mw_old is 0 presumably produce inf/NaN in the
# percent column — confirm whether they should be excluded from the summaries below.
counties_with_iso_projects["capacity_mw_diff"] = (counties_with_iso_projects.capacity_mw_new - counties_with_iso_projects.capacity_mw_old)
counties_with_iso_projects["capacity_mw_pct_change"] = (counties_with_iso_projects["capacity_mw_diff"] / counties_with_iso_projects.capacity_mw_old) * 100

In [None]:
print(counties_with_iso_projects.capacity_mw_diff.describe())
print()
counties_with_iso_projects.capacity_mw_diff.plot.box()

In [None]:
counties_with_iso_projects.groupby("primary_iso_region_new").capacity_mw_diff.describe()

In [None]:
print(counties_with_iso_projects.capacity_mw_pct_change.abs().describe())
print()
counties_with_iso_projects.capacity_mw_pct_change.abs().plot.box()

Pretty good! For 50% of counties, capacity changed by no more than 1%. For 75% of counties, capacity changed by no more than 24%.

In [None]:
counties_with_iso_projects.primary_iso_region_new.apply(type).value_counts()

In [None]:
# Magnitude of the percent change, ignoring its direction.
counties_with_iso_projects["capacity_mw_abs_pct_change"] = counties_with_iso_projects.capacity_mw_pct_change.abs()

# Summary of the signed percent change per primary ISO region.
# NOTE(review): this describes the signed column, not the absolute column created
# above — confirm that is intended.
counties_with_iso_projects.groupby("primary_iso_region_new").capacity_mw_pct_change.describe()

- CAISO's distribution suggests it mostly went unchanged. Variability could be explained by the 4 month delay of the data.
- ERCOT also took a drop. Not sure why. Variability could be explained by the 4 month delay of the data.
- It looks like ISONE generally increased, which kind of makes sense given we have a whole other year of data for this ISO.
- MISO's distribution suggests it mostly went unchanged. Variability could be explained by the 4 month delay of the data.
- NYISO's distribution suggests it mostly went unchanged. Variability could be explained by the 4 month delay of the data.
- PJM's distribution suggests it mostly went unchanged. Variability could be explained by the 4 month delay of the data.
- Capacity change in SPP has a pretty wide distribution, which could be explained by the additional year of data.

In [None]:
# Map the percent change in total capacity per county; the colour range is
# clipped to +/-100 %. The legend label previously read "Capcity".
fig = px.choropleth(counties_with_iso_projects, geojson=counties, locations='county_id_fips', color='capacity_mw_pct_change',
                           color_continuous_scale="RdYlGn",
                           range_color=(-100, 100),
                           scope="usa",
                           labels={'capacity_mw_pct_change': "Capacity MW % change between old and new data"},
                          )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})

## Compare **actionable** capacity between new and old data amongst counties with ISO projects

In [None]:
# Absolute change (new - old) and percent-of-old change in actionable capacity.
actionable_mw_old = counties_with_iso_projects.actionable_capacity_mw_old
actionable_mw_new = counties_with_iso_projects.actionable_capacity_mw_new
counties_with_iso_projects["actionable_capacity_mw_diff"] = actionable_mw_new - actionable_mw_old
counties_with_iso_projects["actionable_capacity_mw_diff_pct_change"] = (
    counties_with_iso_projects["actionable_capacity_mw_diff"] / actionable_mw_old
) * 100

In [None]:
counties_with_iso_projects.groupby("primary_iso_region_new").actionable_capacity_mw_diff_pct_change.describe()

In [None]:
# Harmonize the ISO New England label so both snapshots group under the same region name.
old_projects["iso_region"] = old_projects["iso_region"].replace("ISO-NE", "ISONE")

# Both series are named `is_actionable`; label the columns so the table shows
# which count comes from which snapshot.
n_actionable_by_iso = pd.concat(
    [
        old_projects.groupby("iso_region").is_actionable.sum(),
        new_projects.groupby("iso_region").is_actionable.sum(),
    ],
    axis=1,
    keys=["old", "new"],
)
n_actionable_by_iso

- Same number of ISONE projects are marked actionable in old and new data but the MW distributions don’t line up.
- Significantly more projects marked actionable in new CAISO and MISO data.

In [None]:
fig = px.choropleth(counties_with_iso_projects, geojson=counties, locations='county_id_fips', color='actionable_capacity_mw_diff_pct_change',
                           color_continuous_scale="RdYlGn",
                           range_color=(-100, 100),
                           scope="usa",
                           labels={'actionable_capacity_mw_diff_pct_change': "Actionable Capacity MW % change"},
                          )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})


### Compare nearly certain capacity

In [None]:
# Absolute change (new - old) and percent-of-old change in nearly certain capacity.
nearly_certain_mw_old = counties_with_iso_projects.nearly_certain_capacity_mw_old
nearly_certain_mw_new = counties_with_iso_projects.nearly_certain_capacity_mw_new
counties_with_iso_projects["nearly_certain_capacity_mw_diff"] = nearly_certain_mw_new - nearly_certain_mw_old
counties_with_iso_projects["nearly_certain_capacity_mw_diff_pct_change"] = (
    counties_with_iso_projects["nearly_certain_capacity_mw_diff"] / nearly_certain_mw_old
) * 100

In [None]:
counties_with_iso_projects.groupby("primary_iso_region_new").nearly_certain_capacity_mw_diff_pct_change.describe()

In [None]:
# Number of nearly certain projects per region in each snapshot. Both series are
# named `is_nearly_certain`, so label the columns to tell old from new.
pd.concat(
    [
        old_projects.groupby("iso_region").is_nearly_certain.sum(),
        new_projects.groupby("iso_region").is_nearly_certain.sum(),
    ],
    axis=1,
    keys=["old", "new"],
)

- Significantly fewer projects marked nearly certain in new MISO data.

In [None]:
# Map the percent change in nearly certain capacity per county; the colour range
# is clipped to +/-100 %. The legend label was copy-pasted from the
# "counties without ISO projects" map and did not describe this column.
fig = px.choropleth(counties_with_iso_projects, geojson=counties, locations='county_id_fips', color='nearly_certain_capacity_mw_diff_pct_change',
                           color_continuous_scale="RdYlGn",
                           range_color=(-100, 100),
                           scope="usa",
                           labels={'nearly_certain_capacity_mw_diff_pct_change': "Nearly Certain Capacity MW % change"},
                          )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})


## (WIP) Compare projects
I'm not sure if merging on queue_id and iso_region is the way to go.

In [None]:
print(new_projects[["queue_id", "iso_region"]].duplicated(keep=False).value_counts())
print(old_projects[["queue_id", "iso_region"]].duplicated(keep=False).value_counts())

In [None]:
merged_projects = new_projects.merge(old_projects, how="outer", on=["queue_id", "iso_region"], suffixes=("_new", "_old"))

In [None]:
# A project is in both snapshots when the outer merge found a project id on each side.
has_new_project = merged_projects.project_id_new.notna()
has_old_project = merged_projects.project_id_old.notna()
merged_projects["in_both"] = has_new_project & has_old_project
merged_projects["in_both"].value_counts()

In [None]:
caiso_projects = merged_projects.query("iso_region == 'CAISO'")

In [None]:
caiso_projects.is_actionable_new.value_counts()

In [None]:
caiso_projects.is_actionable_old.value_counts()

#### NYISO

In [None]:
nyiso_projects = merged_projects.query("iso_region == 'NYISO'")

In [None]:
nyiso_projects.is_nearly_certain_new.value_counts()

In [None]:
nyiso_projects.is_nearly_certain_old.value_counts()

#### SPP

In [None]:
spp_projects = merged_projects.query("iso_region == 'SPP'")

In [None]:
spp_projects.is_nearly_certain_new.value_counts()

In [None]:
spp_projects.is_nearly_certain_old.value_counts()

#### MISO

In [None]:
miso_projects = merged_projects.query("iso_region == 'MISO'")

In [None]:
print(miso_projects.is_nearly_certain_new.value_counts())
print(miso_projects.is_nearly_certain_old.value_counts())

In [None]:
miso_projects.in_both.value_counts()

In [None]:
miso_projects[miso_projects.in_both]

In [None]:
miso_projects[miso_projects.project_id_old.isna() & ~miso_projects.in_both].date_entered_queue_new.dt.year.value_counts()

## Actionable

In [None]:
print(new_projects.groupby("iso_region").is_actionable.sum())
print()
print(old_projects.groupby("iso_region").is_actionable.sum())

## Nearly Certain

In [None]:
print(new_projects.groupby("iso_region").is_nearly_certain.sum())
print()
print(old_projects.groupby("iso_region").is_nearly_certain.sum())

In [None]:
# Among MISO projects present in both snapshots: nearly certain in old but not in new.
miso_in_both = miso_projects[miso_projects.in_both]
certain_in_old_not_in_new = miso_in_both.is_nearly_certain_old & ~miso_in_both.is_nearly_certain_new
certain_in_old_not_in_new.value_counts()

In [None]:
(merged_projects.is_nearly_certain_new & merged_projects.is_nearly_certain_old).value_counts()

In [None]:
# Flag projects whose nearly-certain / actionable status is equal in both snapshots.
merged_projects["is_nearly_certain_both"] = merged_projects.is_nearly_certain_new.eq(merged_projects.is_nearly_certain_old)
merged_projects["is_actionable_both"] = merged_projects.is_actionable_new.eq(merged_projects.is_actionable_old)
# `is_nearly_certain_both` exists only as a column, so read it from the dataframe
# (the bare name raised NameError on a fresh kernel).
print(merged_projects.is_nearly_certain_both.value_counts())

In [None]:
merged_projects[["is_actionable_new", "is_actionable_old", "is_actionable_both"]]

In [None]:
ercot = merged_projects.query("iso_region == 'ERCOT'")
print(ercot.is_nearly_certain_both.sum() / len(ercot))
print(ercot.is_actionable_both.sum() / len(ercot))

In [None]:
ercot

In [None]:
# Percent of projects per region whose nearly-certain flag agrees across snapshots.
nearly_certain_match_by_iso = merged_projects.groupby("iso_region").is_nearly_certain_both
((nearly_certain_match_by_iso.sum() / nearly_certain_match_by_iso.count()) * 100).sort_values()

In [None]:
# Flag projects whose actionable status is equal in both snapshots.
actionable_matches = merged_projects.is_actionable_new.eq(merged_projects.is_actionable_old)
merged_projects["is_actionable_both"] = actionable_matches

# Percent of projects per region whose actionable flag agrees across snapshots.
actionable_match_by_iso = merged_projects.groupby("iso_region").is_actionable_both
((actionable_match_by_iso.sum() / actionable_match_by_iso.count()) * 100).sort_values()

In [None]:
merged_projects.query("iso_region == 'SPP'")[["is_nearly_certain_new", "is_nearly_certain_old"]]

- ISONE and ISO-NE aren't merging correctly
- I'm worried queue_id and iso_region isn't the correct way to merge these together.

In [None]:
# Regions of the projects whose nearly-certain flag agrees across snapshots.
# The mask is a column of merged_projects; the bare name is undefined on a fresh kernel.
merged_projects[merged_projects.is_nearly_certain_both].iso_region.value_counts()

In [None]:
# Regions of the projects whose nearly-certain flag differs across snapshots.
# The mask is a column of merged_projects; the bare name is undefined on a fresh kernel.
merged_projects[~merged_projects.is_nearly_certain_both].iso_region.value_counts()

In [None]:
# Left-join the new county-wide table onto the old one by county FIPS.
# NOTE(review): `old_counties_wide` and `new_counties_wide` are not assigned in any
# cell visible in this notebook — this section appears to rely on leftover kernel
# state; confirm where they are loaded before running on a fresh kernel.
counties_wide = old_counties_wide.merge(new_counties_wide, on="county_id_fips", how="left", suffixes=('_old', '_new'))

In [None]:
counties_wide

In [None]:
old_counties_wide.info(verbose=True)

In [None]:
# Count missing vs. present proposed-capacity values for each snapshot (old, then new).
capacity_columns = [f"renewable_and_battery_proposed_capacity_mw_{version}" for version in ("old", "new")]
for col in capacity_columns:
    print(counties_wide[col].isna().value_counts())
    print()
    

In [None]:
# Whether each county has a non-null proposed capacity in the new / old snapshot.
new_has_capacity = counties_wide.renewable_and_battery_proposed_capacity_mw_new.notna()
old_has_capacity = counties_wide.renewable_and_battery_proposed_capacity_mw_old.notna()

# Each flag must combine both masks: present in one snapshot AND absent from the other.
# (Previously each flag was assigned a single, un-negated presence mask.)
counties_wide["capacity_in_old_none_in_new"] = old_has_capacity & ~new_has_capacity
counties_wide["capacity_in_new_none_in_old"] = new_has_capacity & ~old_has_capacity

# print(f"Counties that have renewable capacity in both LBNL and GS data:\n{(old_has_capacity & new_has_capacity).value_counts()}")
# print()
# print(f"Counties that have renewable capacity in LBNL and NOT in GS data:\n{(old_has_capacity & ~new_has_capacity).value_counts()}")
# print()
# print(f"Counties that have renewable capacity NOT in LBNL and in GS data:\n{(~old_has_capacity & new_has_capacity).value_counts()}")

In [None]:
# Scratch check of set-difference semantics (elements of the left set not in the right).
set(("a", "b")) - set(("b", "c"))

In [None]:
# FIPS codes of counties with a non-null proposed capacity in each snapshot.
county_id_fips_with_capacity_in_old = set(counties_wide.loc[old_has_capacity, "county_id_fips"])
county_id_fips_with_capacity_in_new = set(counties_wide.loc[new_has_capacity, "county_id_fips"])

# Removed: had capacity in old only. Added: has capacity in new only.
counties_capacity_removed = county_id_fips_with_capacity_in_old.difference(county_id_fips_with_capacity_in_new)
counties_capacity_added = county_id_fips_with_capacity_in_new.difference(county_id_fips_with_capacity_in_old)

In [None]:
# Mark each county row by membership in the removed / added FIPS sets.
county_fips = counties_wide.county_id_fips
counties_wide["counties_capacity_removed"] = county_fips.isin(counties_capacity_removed)
counties_wide["counties_capacity_added"] = county_fips.isin(counties_capacity_added)

In [None]:
# Enable plotly rendering inside the notebook (see the link below).
import plotly.offline as pyo
pyo.init_notebook_mode()

# https://stackoverflow.com/questions/52771328/plotly-chart-not-showing-in-jupyter-notebook
from urllib.request import urlopen
import json
# County boundary GeoJSON from the plotly datasets repository; requires network access.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)


import plotly.express as px

# Map the counties flagged in `counties_capacity_removed` (legend typo "versoin" fixed).
fig = px.choropleth(counties_wide, geojson=counties, locations='county_id_fips', color='counties_capacity_removed',
                           color_continuous_scale="RdYlGn",
                           range_color=(-4, 4),
                           scope="usa",
                           labels={'counties_capacity_removed': "Counties that had renewable capacity in old version but don't in new version"},
                          )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})


In [None]:
# Map the counties flagged in `counties_capacity_added` (legend typo "versoin" fixed).
fig = px.choropleth(counties_wide, geojson=counties, locations='county_id_fips', color='counties_capacity_added',
                           color_continuous_scale="RdYlGn",
                           range_color=(-4, 4),
                           scope="usa",
                           labels={'counties_capacity_added': "Counties that had renewable capacity in new version but don't in old version"},
                          )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})


In [None]:
# Absolute change (new - old) in proposed renewable and battery capacity per county.
counties_wide["renewable_and_battery_proposed_capacity_mw_diff"] = counties_wide["renewable_and_battery_proposed_capacity_mw_new"] - counties_wide["renewable_and_battery_proposed_capacity_mw_old"]
# Change relative to the old value. This is a fraction: it is not multiplied by 100.
# NOTE(review): the column name and a later legend say "pct"/"%" — confirm the intended unit.
counties_wide["renewable_and_battery_proposed_capacity_mw_pct_change"] = counties_wide["renewable_and_battery_proposed_capacity_mw_diff"] / counties_wide["renewable_and_battery_proposed_capacity_mw_old"]

In [None]:
counties_wide["renewable_and_battery_proposed_capacity_mw_diff"].plot.box()

In [None]:
counties_wide["renewable_and_battery_proposed_capacity_mw_pct_change"].plot.box()

In [None]:
# Map the relative change in proposed renewable and battery capacity per county.
# The legend label was copy-pasted from the "capacity added" map and did not
# describe this column; the plotted value is diff / old (a fraction).
fig = px.choropleth(counties_wide, geojson=counties, locations='county_id_fips', color='renewable_and_battery_proposed_capacity_mw_pct_change',
                           color_continuous_scale="RdYlGn",
                           range_color=(-4, 4),
                           scope="usa",
                           labels={'renewable_and_battery_proposed_capacity_mw_pct_change': "Renewable and battery proposed capacity MW change (fraction of old)"},
                          )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})


In [None]:
counties_wide["renewable_and_battery_proposed_capacity_mw_pct_change"].describe()

In [None]:
counties_wide[~counties_wide["renewable_and_battery_proposed_capacity_mw_pct_change"].isna()].renewable_and_battery_proposed_capacity_mw_pct_change

In [None]:
# Enable plotly rendering inside the notebook (see the link below).
# This repeats the plotly setup and GeoJSON download done in earlier cells.
import plotly.offline as pyo
pyo.init_notebook_mode()

# https://stackoverflow.com/questions/52771328/plotly-chart-not-showing-in-jupyter-notebook
from urllib.request import urlopen
import json
# County boundary GeoJSON from the plotly datasets repository; requires network access.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)


import plotly.express as px

# Map the relative change in proposed renewable and battery capacity per county.
# NOTE(review): the plotted column is computed as diff / old without a factor of 100,
# while the legend says "% change" — confirm the intended unit.
fig = px.choropleth(counties_wide, geojson=counties, locations='county_id_fips', color='renewable_and_battery_proposed_capacity_mw_pct_change',
                           color_continuous_scale="RdYlGn",
                           range_color=(-4, 4),
                           scope="usa",
                           labels={'renewable_and_battery_proposed_capacity_mw_pct_change':'Total proposed renewable MW % change'},
                          )
fig