## Purpose
This notebook compares some high-level metrics between two versions of the `iso_projects_long_format` table. This is helpful for running sanity checks when updating queue data.

In [None]:
from dbcp.helpers import get_sql_engine
import pandas as pd

# SQL engine obtained from dbcp.helpers. In the cells shown here it is only referenced by
# the commented-out export below.
engine = get_sql_engine()

# One-off export, kept for reference: read data_mart.iso_projects_long_format, normalize
# dtypes with convert_dtypes(), print the latest queue-entry date, and write the table to
# a parquet snapshot.
# NOTE(review): the snapshot loaded later is named "..._gs_jan_24.parquet", so the file
# written here was presumably renamed afterwards — confirm.
# with engine.connect() as con:
#     projects_long = pd.read_sql_table("iso_projects_long_format", con, schema="data_mart")
#
# projects_long = projects_long.convert_dtypes()
# print(projects_long.date_entered_queue.max())
# projects_long.to_parquet("iso_projects_long_format_gs.parquet")

## Load projects

In [None]:
import pandas as pd

In [None]:
# Read the two parquet snapshots that the rest of the notebook compares.
lbnl_projects = pd.read_parquet("iso_projects_long_format_lbnl.parquet")
gs_projects = pd.read_parquet("iso_projects_long_format_gs_jan_24.parquet")

# Latest queue-entry date in each snapshot: GS on the first line, LBNL on the second.
print(
    gs_projects["date_entered_queue"].max(),
    lbnl_projects["date_entered_queue"].max(),
    sep="\n",
)

In [None]:
# Rewrite the LBNL spelling "ISO-NE" as "ISONE" (presumably the spelling used by the GS
# data — confirm) so region-level comparisons line up. regex=False makes this a plain
# literal replacement regardless of the pandas version's default for `regex`.
lbnl_projects["iso_region"] = lbnl_projects["iso_region"].str.replace("ISO-NE", "ISONE", regex=False)

In [None]:
# Latest queue-entry date per ISO region in the GS data, excluding "non-ISO" rows.
# na=False keeps the mask strictly boolean when iso_region has missing values (otherwise
# `~` can fail on an object-dtype mask); rows with a missing region are dropped by the
# groupby anyway because their group key is missing.
gs_is_non_iso = gs_projects["iso_region"].str.contains("non-ISO", regex=False, na=False)
gs_iso_region_max_queue_date = (
    gs_projects[~gs_is_non_iso].groupby("iso_region")["date_entered_queue"].max()
)
gs_iso_region_max_queue_date

In [None]:
# Latest queue-entry date per ISO region in the LBNL data, excluding "non-ISO" rows.
# na=False keeps the mask strictly boolean when iso_region has missing values (otherwise
# `~` can fail on an object-dtype mask); rows with a missing region are dropped by the
# groupby anyway because their group key is missing.
lbnl_is_non_iso = lbnl_projects["iso_region"].str.contains("non-ISO", regex=False, na=False)
lbnl_iso_region_max_queue_date = (
    lbnl_projects[~lbnl_is_non_iso].groupby("iso_region")["date_entered_queue"].max()
)
lbnl_iso_region_max_queue_date

## Aggregate project dataframes by county

In [None]:
def aggregate_iso_projects_by_count(df):
    """Aggregate project-level rows into one summary row per county.

    Despite the name ("count"), rows are grouped by ``county_id_fips``.

    Args:
        df: Project-level DataFrame with the columns ``county_id_fips``,
            ``iso_region``, ``capacity_mw``, ``co2e_tonnes_per_year``,
            ``is_actionable`` and ``is_nearly_certain``.

    Returns:
        DataFrame with one row per ``county_id_fips`` and these columns:

        * ``has_iso_project``: True if any non-missing ``iso_region`` in the
          county does not contain "non-ISO".
        * ``primary_iso_region``: most frequent ``iso_region`` in the county
          (``pd.NA`` when every region is missing).
        * ``capacity_mw`` / ``co2e_tonnes_per_year``: county totals.
        * ``actionable_capacity_mw`` / ``nearly_certain_capacity_mw``: total
          capacity of the rows flagged ``is_actionable`` / ``is_nearly_certain``.
        * ``actionable_n_projects`` / ``nearly_certain_n_projects``: number of
          rows with the respective flag set.
    """

    def contains_iso_project(grp):
        # Skip every kind of missing value (pd.NA, None, NaN); a substring test on those raises.
        return any("non-ISO" not in region for region in grp if not pd.isna(region))

    def get_primary_iso(grp):
        # There are 16 counties that have equal number of projects in multiple regions. Select the first one
        modes = grp.mode()
        # Hand back a scalar, not a Series; a group whose regions are all missing has no mode.
        return modes.iloc[0] if len(modes) else pd.NA

    def flagged_capacity_mw(flag_column):
        # County total of capacity_mw over rows whose flag is True; a missing flag counts as False.
        is_flagged = df[flag_column].fillna(False).astype(bool)
        return df["capacity_mw"].where(is_flagged, 0).groupby(df["county_id_fips"]).sum()

    by_county = df.groupby("county_id_fips")
    agg_df = by_county.agg(
        has_iso_project=pd.NamedAgg(column="iso_region", aggfunc=contains_iso_project),
        primary_iso_region=pd.NamedAgg(column="iso_region", aggfunc=get_primary_iso),
        capacity_mw=pd.NamedAgg(column="capacity_mw", aggfunc="sum"),
        co2e_tonnes_per_year=pd.NamedAgg(column="co2e_tonnes_per_year", aggfunc="sum"),
    )

    agg_df["actionable_capacity_mw"] = flagged_capacity_mw("is_actionable")
    agg_df["nearly_certain_capacity_mw"] = flagged_capacity_mw("is_nearly_certain")

    agg_df["actionable_n_projects"] = by_county["is_actionable"].sum()
    # sum() counts the True flags; count() would count every non-missing row, True or False.
    agg_df["nearly_certain_n_projects"] = by_county["is_nearly_certain"].sum()
    return agg_df.reset_index()

# County-level aggregates: "new" is built from the GS projects, "old" from the LBNL projects.
new_projects_counties, old_projects_counties = (
    aggregate_iso_projects_by_count(projects) for projects in (gs_projects, lbnl_projects)
)

## Number of counties with projects

In [None]:
# Distinct county FIPS codes present in each county-level aggregate.
new_county_ids = set(new_projects_counties["county_id_fips"])
old_county_ids = set(old_projects_counties["county_id_fips"])

# Counties that appear in only one of the two aggregates.
n_counties_with_projects_in_new_not_in_old = len(new_county_ids - old_county_ids)
n_counties_with_projects_in_old_not_in_new = len(old_county_ids - new_county_ids)

print(n_counties_with_projects_in_new_not_in_old)
print(n_counties_with_projects_in_old_not_in_new)

In [None]:
# Number of counties in the old aggregate (first line) and the new aggregate (second line).
print(len(old_projects_counties), len(new_projects_counties), sep="\n")

In [None]:
# Outer-join the old and new county aggregates so counties present in only one of them are
# kept; validate="1:1" asserts each county appears at most once on each side.
# The has_iso_project flags are cast to the nullable "boolean" dtype so rows missing from
# one side can hold a missing value instead of forcing an object column.
project_counties = old_projects_counties.merge(
    new_projects_counties,
    on="county_id_fips",
    how="outer",
    validate="1:1",
    suffixes=("_old", "_new"),
).astype({"has_iso_project_old": "boolean", "has_iso_project_new": "boolean"})

## Compare overall capacity between new and old data amongst counties with ISO projects

In [None]:
# A county qualifies when either version of the data reports an ISO project for it.
is_county_with_iso_projects = (
    project_counties["has_iso_project_new"] | project_counties["has_iso_project_old"]
)
print(is_county_with_iso_projects.value_counts())

# Work on a copy so the columns added to this subset later do not touch project_counties.
counties_with_iso_projects = project_counties[is_county_with_iso_projects].copy()

In [None]:
# Absolute and relative change in total county capacity (new minus old).
old_capacity_mw = counties_with_iso_projects["capacity_mw_old"]
counties_with_iso_projects["capacity_mw_diff"] = (
    counties_with_iso_projects["capacity_mw_new"] - old_capacity_mw
)
# A zero baseline has no meaningful percent change. Mask it so the result is missing
# rather than +/-inf, which would turn describe() means into inf and break histogram binning.
counties_with_iso_projects["capacity_mw_pct_change"] = (
    counties_with_iso_projects["capacity_mw_diff"]
    / old_capacity_mw.where(old_capacity_mw != 0)
    * 100
)

In [None]:
# Histogram of the county-level percent change in capacity, limited to counties whose
# primary ISO region in the new data is CAISO.
caiso_counties = counties_with_iso_projects[
    counties_with_iso_projects["primary_iso_region_new"].eq("CAISO")
]
caiso_counties["capacity_mw_pct_change"].plot.hist(bins=20)

In [None]:
# Magnitude of the percent change, ignoring direction.
counties_with_iso_projects["capacity_mw_abs_pct_change"] = (
    counties_with_iso_projects["capacity_mw_pct_change"].abs()
)

# Summary statistics of the percent change per primary ISO region (new data).
# NOTE(review): this describes the signed percent change, not the absolute column created
# just above — confirm that is intended.
(
    counties_with_iso_projects
    .groupby("primary_iso_region_new")["capacity_mw_pct_change"]
    .describe()
)

In [None]:
import json
from urllib.request import urlopen

import plotly.express as px
import plotly.offline as pyo

# Initialise plotly's notebook mode so figures render inline; see
# https://stackoverflow.com/questions/52771328/plotly-chart-not-showing-in-jupyter-notebook
pyo.init_notebook_mode()

# County GeoJSON from the plotly datasets repository (requires network access).
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)

# County map of the percent change in capacity. The colour scale is fixed to +/-100 %,
# so larger changes share the end colours.
fig = px.choropleth(
    counties_with_iso_projects,
    geojson=counties,
    locations='county_id_fips',
    color='capacity_mw_pct_change',
    color_continuous_scale="RdYlGn",
    range_color=(-100, 100),
    scope="usa",
    labels={'capacity_mw_pct_change': "Capacity MW % change between old and new data"},
)
# Drop the default margins; as the last expression, the returned figure is displayed.
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})

## Compare renewable + storage total capacity in each ISO Region

In [None]:
# Resource classes treated as "clean" for this comparison.
clean_fuel = ("renewable", "storage")

clean_gs_projects = gs_projects[gs_projects["resource_class"].isin(clean_fuel)]
clean_lbnl_projects = lbnl_projects[lbnl_projects["resource_class"].isin(clean_fuel)]

# Total clean capacity per ISO region in each dataset.
gs_clean_mw = clean_gs_projects.groupby("iso_region")["capacity_mw"].sum()
lbnl_clean_mw = clean_lbnl_projects.groupby("iso_region")["capacity_mw"].sum()

# Percent change from LBNL to GS, rounded to whole percent.
iso_pct_change = ((gs_clean_mw - lbnl_clean_mw) / lbnl_clean_mw * 100).round()
iso_pct_change

### Number of months of delay between LBNL and GS data

In [None]:
# Whole months between the latest queue-entry dates of the two datasets, per ISO region.
# Casting to 'timedelta64[M]' is rejected by pandas >= 2.0 (only s/ms/us/ns resolutions are
# supported), so floor-divide by the average month length numpy uses for that unit instead.
average_month = pd.Timedelta(seconds=2_629_746)  # 365.2425 days / 12
(gs_iso_region_max_queue_date - lbnl_iso_region_max_queue_date) // average_month

In [None]:
# Bar chart of the per-region percent change, with each bar labelled by its value.
ax = iso_pct_change.plot.bar(
    xlabel='ISO Region',
    ylabel='% change between LBNL and GS',
)
bars = ax.containers[0]
ax.bar_label(bars)
ax

- Did CAISO renewable capacity really go down in 7 months?
- ERCOT and SPP seem like reasonable increases.
- Did ISONE and NYISO really go up that much?
- Did MISO really go down 28%?