## Purpose
This notebook compares some high-level metrics between two versions of the `iso_projects_long_format` table. This is helpful for running sanity checks when updating queue data.

In [None]:
from dbcp.helpers import get_sql_engine
import pandas as pd

# Engine used by the (commented-out) snapshot code below.
engine = get_sql_engine()

# Uncomment the lines below to refresh the "gs" snapshot: they read the
# `iso_projects_long_format` table from the `data_mart` schema, print the latest
# `date_entered_queue`, and write the table to `iso_projects_long_format_gs.parquet`.
# with engine.connect() as con:
#     projects_long = pd.read_sql_table("iso_projects_long_format", con, schema="data_mart")
    
# projects_long = projects_long.convert_dtypes()
# print(projects_long.date_entered_queue.max())
# projects_long.to_parquet("iso_projects_long_format_gs.parquet")

## Load projects

In [None]:
import pandas as pd

In [None]:
# Two parquet snapshots of the long-format projects table: "gs" and "lbnl".
gs_projects = pd.read_parquet("iso_projects_long_format_gs.parquet")
lbnl_projects = pd.read_parquet("iso_projects_long_format_lbnl.parquet")

# Latest queue entry date in each snapshot (gs first, then lbnl).
print(
    gs_projects.date_entered_queue.max(),
    lbnl_projects.date_entered_queue.max(),
    sep="\n",
)

## Aggregate project dataframes by county

In [None]:
def aggregate_iso_projects_by_count(df):
    """Aggregate a long-format projects frame to one row per county.

    Parameters
    ----------
    df : pd.DataFrame
        Must provide the columns ``county_id_fips``, ``iso_region``,
        ``capacity_mw``, ``co2e_tonnes_per_year``, ``is_actionable`` and
        ``is_nearly_certain``.

    Returns
    -------
    pd.DataFrame
        One row per ``county_id_fips`` (as a regular column) with:

        * ``has_iso_project`` - True if any non-missing ``iso_region`` in the
          county does not contain the text ``"non-ISO"``.
        * ``primary_iso_region`` - most frequent ``iso_region`` in the county
          (missing if the county has no non-missing region).
        * ``capacity_mw`` / ``co2e_tonnes_per_year`` - per-county sums.
        * ``actionable_capacity_mw`` / ``nearly_certain_capacity_mw`` -
          capacity summed over rows whose flag is True.
        * ``actionable_n_projects`` / ``nearly_certain_n_projects`` - number
          of rows whose flag is True.
    """

    def contains_iso_project(regions):
        # pd.isna also covers None/NaN, not only pd.NA, so the substring test
        # never runs against a non-string missing value.
        return any("non-ISO" not in region for region in regions if not pd.isna(region))

    def get_primary_iso(regions):
        # There are 16 counties that have an equal number of projects in multiple
        # regions. Select the first mode. Return a scalar (not a Series) so every
        # cell of the aggregated column holds a plain value.
        modes = regions.mode()
        return modes.iloc[0] if len(modes) else pd.NA

    def flagged_capacity_by_county(flag_column):
        # Missing flags count as False. Unflagged rows contribute 0 MW, so
        # counties without any flagged rows still get a 0 total.
        is_flagged = df[flag_column].fillna(False).astype(bool)
        return df.capacity_mw.where(is_flagged, 0).groupby(df.county_id_fips).sum()

    by_county = df.groupby("county_id_fips")

    agg_df = by_county.agg(
        has_iso_project=pd.NamedAgg(column="iso_region", aggfunc=contains_iso_project),
        primary_iso_region=pd.NamedAgg(column="iso_region", aggfunc=get_primary_iso),
        capacity_mw=pd.NamedAgg(column="capacity_mw", aggfunc="sum"),
        co2e_tonnes_per_year=pd.NamedAgg(column="co2e_tonnes_per_year", aggfunc="sum"),
    )

    # These Series are indexed by county_id_fips and align with agg_df's index.
    agg_df["actionable_capacity_mw"] = flagged_capacity_by_county("is_actionable")
    agg_df["nearly_certain_capacity_mw"] = flagged_capacity_by_county("is_nearly_certain")

    # Summing a boolean flag counts the True rows. `.count()` would count every
    # non-missing row regardless of the flag's value.
    agg_df["actionable_n_projects"] = by_county.is_actionable.sum()
    agg_df["nearly_certain_n_projects"] = by_county.is_nearly_certain.sum()
    return agg_df.reset_index()

# County-level aggregates for each snapshot: "old" is lbnl, "new" is gs.
old_projects_counties = aggregate_iso_projects_by_count(lbnl_projects)
new_projects_counties = aggregate_iso_projects_by_count(gs_projects)

## Number of counties with projects

In [None]:
# Counties present in one aggregated snapshot but missing from the other.
new_county_ids = set(new_projects_counties.county_id_fips)
old_county_ids = set(old_projects_counties.county_id_fips)

n_counties_with_projects_in_new_not_in_old = len(new_county_ids.difference(old_county_ids))
n_counties_with_projects_in_old_not_in_new = len(old_county_ids.difference(new_county_ids))

print(
    n_counties_with_projects_in_new_not_in_old,
    n_counties_with_projects_in_old_not_in_new,
    sep="\n",
)

In [None]:
# Number of counties in each aggregated snapshot (old first, then new).
print(len(old_projects_counties), len(new_projects_counties), sep="\n")

## Make sure counties that don't have any ISO projects capacity remain unchanged

In [None]:
# The outer join keeps counties that appear in only one snapshot. validate="1:1"
# makes pandas raise if a county id is duplicated on either side.
project_counties = pd.merge(
    old_projects_counties,
    new_projects_counties,
    how="outer",
    on="county_id_fips",
    suffixes=("_old", "_new"),
    validate="1:1",
)

In [None]:
# Cast both flags to the nullable "boolean" dtype.
for flag_column in ("has_iso_project_old", "has_iso_project_new"):
    project_counties[flag_column] = project_counties[flag_column].astype("boolean")

In [None]:
# Value counts of "county has no ISO project" for old, then new, separated by a blank line.
no_iso_old = ~project_counties.has_iso_project_old
no_iso_new = ~project_counties.has_iso_project_new
print(no_iso_old.value_counts(), no_iso_new.value_counts(), sep="\n\n")

Pretty similar number of counties that don't have any ISO projects. Let's plot them to make sure they line up with the ISO boundaries.

In [None]:
# A county qualifies only when neither snapshot has an ISO project in it.
lacks_iso_new = ~project_counties.has_iso_project_new
lacks_iso_old = ~project_counties.has_iso_project_old
is_county_without_iso_projects = lacks_iso_new & lacks_iso_old
print(is_county_without_iso_projects.value_counts())

counties_without_iso_projects = project_counties[is_county_without_iso_projects]

In [None]:
# Initialise plotly's offline notebook mode before drawing any figure.
import plotly.offline as pyo
pyo.init_notebook_mode()

# https://stackoverflow.com/questions/52771328/plotly-chart-not-showing-in-jupyter-notebook
from urllib.request import urlopen
import json
# County GeoJSON downloaded from the plotly datasets repository (needs network access).
# `counties` is reused by the later choropleth cells.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)


import plotly.express as px

# Map of the counties that have no ISO project in either snapshot.
# NOTE(review): `has_iso_project_new` is a boolean column here, so
# color_continuous_scale / range_color presumably have no visible effect - confirm.
fig = px.choropleth(counties_without_iso_projects, geojson=counties, locations='county_id_fips', color='has_iso_project_new',
                           color_continuous_scale="RdYlGn",
                           range_color=(-4, 4),
                           scope="usa",
                           labels={'has_iso_project_new': "Counties that don't have any ISO projects in new and old data"},
                          )
# Remove the figure margins. This is the cell's last expression, so its result is what gets displayed.
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})


Great! That roughly aligns with [ISO borders](https://hifld-geoplatform.opendata.arcgis.com/datasets/50f80920d36e435d9a34db2bd0fd3ad8/explore?location=32.163459%2C-97.519448%2C5.23).

In [None]:
# Total capacity must match exactly between snapshots for counties without ISO projects.
capacity_is_unchanged = counties_without_iso_projects.capacity_mw_old.eq(
    counties_without_iso_projects.capacity_mw_new
)
assert capacity_is_unchanged.all(), "Capacity in counties without ISO projects has changed!"

## Compare overall capacity between new and old data amongst counties with ISO projects

In [None]:
# Keep counties that have an ISO project in at least one of the snapshots.
has_iso_new = project_counties.has_iso_project_new
has_iso_old = project_counties.has_iso_project_old
is_county_with_iso_projects = has_iso_new | has_iso_old
print(is_county_with_iso_projects.value_counts())

# Copy so the comparison columns added in later cells don't touch project_counties.
counties_with_iso_projects = project_counties[is_county_with_iso_projects].copy()

In [None]:
# Absolute (MW) and relative (%) change in total capacity per county, new minus old.
capacity_old = counties_with_iso_projects.capacity_mw_old
capacity_new = counties_with_iso_projects.capacity_mw_new

counties_with_iso_projects["capacity_mw_diff"] = capacity_new - capacity_old
counties_with_iso_projects["capacity_mw_pct_change"] = (
    counties_with_iso_projects["capacity_mw_diff"].div(capacity_old).mul(100)
)

In [None]:
# Summary statistics of the per-county capacity difference (new - old).
print(counties_with_iso_projects.capacity_mw_diff.describe())

In [None]:
# Same capacity difference summary, split by each county's primary ISO region in the new data.
counties_with_iso_projects.groupby("primary_iso_region_new").capacity_mw_diff.describe()

In [None]:
# Magnitude of the % change, ignoring direction: summary statistics and a box plot.
abs_pct_change = counties_with_iso_projects.capacity_mw_pct_change.abs()
print(abs_pct_change.describe())
print()
abs_pct_change.plot.box()

Pretty good! 50% of counties' capacity changed by no more than 1%. 75% of counties' capacity changed by no more than 25%.

In [None]:
# Sanity check: count the Python types stored in `primary_iso_region_new`.
counties_with_iso_projects.primary_iso_region_new.apply(type).value_counts()

In [None]:
# Store the magnitude of the % change as its own column.
counties_with_iso_projects["capacity_mw_abs_pct_change"] = counties_with_iso_projects.capacity_mw_pct_change.abs()

# NOTE(review): the summary below describes the signed `capacity_mw_pct_change`,
# not the `capacity_mw_abs_pct_change` column created above - confirm this is intended.
counties_with_iso_projects.groupby("primary_iso_region_new").capacity_mw_pct_change.describe()

- CAISO's distribution suggests it mostly went unchanged. Variability could be explained by the 4-month delay of the data.
- ERCOT mostly went unchanged. Maybe a slight increase. Variability could be explained by the 4-month delay of the data.
- It looks like ISONE generally increased, which kind of makes sense given we have a whole other year of data for this ISO.
- MISO's distribution suggests it mostly went unchanged, with a slight increase. Variability could be explained by the 4-month delay of the data.
- NYISO's distribution suggests it mostly went unchanged. Variability could be explained by the 4-month delay of the data.
- PJM's distribution suggests it mostly went unchanged. Variability could be explained by the 4-month delay of the data.
- Capacity change in SPP has a pretty wide distribution, which could be explained by the additional year of data.

In [None]:
# County map of the % change in total capacity between the old and new data.
fig = px.choropleth(
    counties_with_iso_projects,
    geojson=counties,
    locations="county_id_fips",
    color="capacity_mw_pct_change",
    color_continuous_scale="RdYlGn",
    # Color scale spans -100 % to +100 %.
    range_color=(-100, 100),
    scope="usa",
    # Fixed the "Capcity" typo in the legend title.
    labels={"capacity_mw_pct_change": "Capacity MW % change between old and new data"},
)
# Remove the figure margins. This is the cell's last expression, so its result is what gets displayed.
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})

## Compare **actionable** capacity between new and old data amongst counties with ISO projects

In [None]:
# Absolute (MW) and relative (%) change in actionable capacity per county, new minus old.
actionable_old = counties_with_iso_projects.actionable_capacity_mw_old
actionable_new = counties_with_iso_projects.actionable_capacity_mw_new

counties_with_iso_projects["actionable_capacity_mw_diff"] = actionable_new - actionable_old
counties_with_iso_projects["actionable_capacity_mw_diff_pct_change"] = (
    counties_with_iso_projects["actionable_capacity_mw_diff"].div(actionable_old).mul(100)
)

In [None]:
# Actionable capacity % change summary, split by each county's primary ISO region in the new data.
counties_with_iso_projects.groupby("primary_iso_region_new").actionable_capacity_mw_diff_pct_change.describe()

In [None]:
# Rename "ISO-NE" to "ISONE" in the lbnl snapshot so its rows line up with the
# gs snapshot (presumably gs already uses "ISONE" - confirm).
lbnl_projects["iso_region"] = lbnl_projects["iso_region"].replace("ISO-NE", "ISONE")

# Number of actionable rows per ISO region in each snapshot. `keys` names the
# two columns after their source; without it both would be called `is_actionable`.
n_actionable_by_iso = pd.concat(
    [
        lbnl_projects.groupby("iso_region").is_actionable.sum(),
        gs_projects.groupby("iso_region").is_actionable.sum(),
    ],
    axis=1,
    keys=["lbnl", "gs"],
)
n_actionable_by_iso

- Significantly more projects marked actionable in new CAISO

In [None]:
# County map of the % change in actionable capacity.
fig = px.choropleth(
    counties_with_iso_projects,
    geojson=counties,
    locations="county_id_fips",
    color="actionable_capacity_mw_diff_pct_change",
    labels={"actionable_capacity_mw_diff_pct_change": "Actionable Capacity MW % change"},
    scope="usa",
    color_continuous_scale="RdYlGn",
    range_color=(-100, 100),
)
# Remove the figure margins. This is the cell's last expression, so its result is what gets displayed.
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))


### Compare nearly certain capacity

In [None]:
# Absolute (MW) and relative (%) change in nearly-certain capacity per county, new minus old.
nearly_certain_old = counties_with_iso_projects.nearly_certain_capacity_mw_old
nearly_certain_new = counties_with_iso_projects.nearly_certain_capacity_mw_new

counties_with_iso_projects["nearly_certain_capacity_mw_diff"] = nearly_certain_new - nearly_certain_old
counties_with_iso_projects["nearly_certain_capacity_mw_diff_pct_change"] = (
    counties_with_iso_projects["nearly_certain_capacity_mw_diff"].div(nearly_certain_old).mul(100)
)

In [None]:
# Nearly-certain capacity % change summary, split by each county's primary ISO region in the new data.
counties_with_iso_projects.groupby("primary_iso_region_new").nearly_certain_capacity_mw_diff_pct_change.describe()

In [None]:
# Number of nearly-certain rows per ISO region in each snapshot. `keys` names the
# two columns after their source; without it both would be called `is_nearly_certain`.
pd.concat(
    [
        lbnl_projects.groupby("iso_region").is_nearly_certain.sum(),
        gs_projects.groupby("iso_region").is_nearly_certain.sum(),
    ],
    axis=1,
    keys=["lbnl", "gs"],
)

- Significantly fewer projects are marked nearly certain in the new MISO data.
- LBNL didn't mark any NYISO projects as nearly certain.

In [None]:
# County map of the % change in nearly-certain capacity.
fig = px.choropleth(
    counties_with_iso_projects,
    geojson=counties,
    locations="county_id_fips",
    color="nearly_certain_capacity_mw_diff_pct_change",
    color_continuous_scale="RdYlGn",
    # Color scale spans -100 % to +100 %.
    range_color=(-100, 100),
    scope="usa",
    # The legend title previously repeated the "no ISO projects" label from an
    # earlier map; it now describes the plotted column.
    labels={"nearly_certain_capacity_mw_diff_pct_change": "Nearly Certain Capacity MW % change"},
)
# Remove the figure margins. This is the cell's last expression, so its result is what gets displayed.
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})


## Compare CO2 estimate

In [None]:
# Display the per-county comparison frame built up in the cells above.
counties_with_iso_projects

In [None]:
# Absolute and relative (%) change in estimated CO2e tonnes per year per county, new minus old.
co2e_old = counties_with_iso_projects.co2e_tonnes_per_year_old
co2e_new = counties_with_iso_projects.co2e_tonnes_per_year_new

counties_with_iso_projects["co2e_tonnes_per_year_diff"] = co2e_new - co2e_old
counties_with_iso_projects["co2e_tonnes_per_year_pct_change"] = (
    counties_with_iso_projects["co2e_tonnes_per_year_diff"].div(co2e_old) * 100
)

In [None]:
# % change summary, first for counties whose old CO2e estimate is non-zero,
# then for counties whose new CO2e estimate is non-zero.
old_is_nonzero = counties_with_iso_projects.co2e_tonnes_per_year_old.ne(0)
new_is_nonzero = counties_with_iso_projects.co2e_tonnes_per_year_new.ne(0)

print(counties_with_iso_projects[old_is_nonzero].co2e_tonnes_per_year_pct_change.describe())
print()
print(counties_with_iso_projects[new_is_nonzero].co2e_tonnes_per_year_pct_change.describe())

Great, it looks like CO2 estimates mostly stayed the same for counties with fossil fuel projects in the queue.

## Compare at project level

In [None]:
# Outer-join the two snapshots on (queue_id, entity); overlapping columns get
# the suffixes "_lbnl" and "_gs".
# NOTE(review): no `validate=` is passed, so keys repeated on both sides would
# multiply rows - confirm (queue_id, entity) is unique enough for this comparison.
merged_projects = lbnl_projects.merge(gs_projects, how="outer", on=("queue_id", "entity"), suffixes=("_lbnl", "_gs"))

In [None]:
# Restrict the project-level comparison to rows whose entity is MISO.
miso = merged_projects.query("entity == 'MISO'")
# (rows, columns) of the MISO subset.
miso.shape

In [None]:
# Treat a missing flag as "not nearly certain" so the Series can be used as masks below.
is_nearly_certain_lbnl = miso.is_nearly_certain_lbnl.fillna(False)
is_nearly_certain_gs = miso.is_nearly_certain_gs.fillna(False)

# Flag counts for lbnl, then gs.
print(
    is_nearly_certain_lbnl.value_counts(),
    is_nearly_certain_gs.value_counts(),
    sep="\n",
)

In [None]:
# Among rows nearly certain in lbnl: how many have no `iso_region_gs` value?
miso[is_nearly_certain_lbnl].iso_region_gs.isna().value_counts()

In [None]:
# Among rows nearly certain in lbnl: distribution of the gs flag, including missing values.
miso[is_nearly_certain_lbnl].is_nearly_certain_gs.value_counts(dropna=False)

In [None]:
# Column overview (names, dtypes, non-null counts) of the MISO subset.
miso.info()

In [None]:
# Among rows nearly certain in lbnl: counts of each (lbnl status, gs status) pair, missing values included.
miso[is_nearly_certain_lbnl][["interconnection_status_lbnl", "interconnection_status_gs"]].value_counts(dropna=False)

In [None]:
# Treat a missing flag as "not actionable" so the Series can be used as masks below.
is_actionable_lbnl = miso.is_actionable_lbnl.fillna(False)
is_actionable_gs = miso.is_actionable_gs.fillna(False)

# Flag counts for lbnl, then gs.
print(
    is_actionable_lbnl.value_counts(),
    is_actionable_gs.value_counts(),
    sep="\n",
)

# Among rows actionable in gs: counts of each (lbnl status, gs status) pair, missing values included.
status_columns = ["interconnection_status_lbnl", "interconnection_status_gs"]
miso[is_actionable_gs][status_columns].value_counts(dropna=False)

In [None]:
# Counts of each (gs status, lbnl status) pair across all MISO rows, missing values included.
miso[["interconnection_status_gs", "interconnection_status_lbnl"]].value_counts(dropna=False)

In [None]:
# Rows whose status is "PHASE 3" in gs but "IA Executed" in lbnl: compare proposed online and queue entry dates.
miso[miso.interconnection_status_gs.eq("PHASE 3") & miso.interconnection_status_lbnl.eq("IA Executed")][["date_proposed_online_lbnl", "date_proposed_online_gs", "date_entered_queue_lbnl", "date_entered_queue_gs"]]

In [None]:
# Preview the first rows of the MISO subset.
miso.head()