## Purpose
This notebook compares some high-level metrics across two versions of the `iso_projects_long_format` table. This is helpful for running sanity checks when updating queue data.

In [None]:
from dbcp.helpers import get_sql_engine
import pandas as pd

# Read the `iso_projects_long_format` table from the `data_mart` schema and
# switch its columns to pandas' inferred nullable dtypes in one step.
engine = get_sql_engine()
with engine.connect() as con:
    projects_long = pd.read_sql_table(
        "iso_projects_long_format",
        con,
        schema="data_mart",
    ).convert_dtypes()

# Latest queue-entry date in the loaded table.
print(projects_long["date_entered_queue"].max())

# Uncomment to write the loaded table out as a parquet snapshot:
# projects_long.to_parquet("iso_projects_long_format_gs_9e8eed8.parquet")

## Load projects

In [None]:
import pandas as pd

In [None]:
from pathlib import Path

# Output directories. NOTE(review): neither `parquet_dir` nor `data_mart_dir`
# is used by the read below, which takes a bare filename and therefore resolves
# against the kernel's working directory — confirm which location is intended.
parquet_dir = Path("/app/data/output")
data_mart_dir = parquet_dir / "data_mart"
# The "new" snapshot of the table that the rest of the notebook compares against
# the older one.
lbnl_23_iso_projects_long_format = pd.read_parquet("iso_projects_long_format_gs_9e8eed8.parquet")

In [None]:
# Load the older snapshot and rename the `ISO-NE` region label to `ISONE`
# (presumably so the label matches the newer snapshot — confirm).
# The load and the rename happen in one expression so that re-running the cell
# always starts from the file on disk; the previous trailing self-assignment
# (`x = x`) did nothing and has been dropped.
lbnl_22_iso_projects_long_format = pd.read_parquet(
    "iso_projects_long_format_gs_278cb90.parquet"
).assign(iso_region=lambda snapshot: snapshot["iso_region"].replace("ISO-NE", "ISONE"))

In [None]:
# Most recent `date_entered_queue` per `iso_region` in the newer snapshot.
new_region_max_date = lbnl_23_iso_projects_long_format.groupby("iso_region").date_entered_queue.max()
new_region_max_date

In [None]:
# Most recent `date_entered_queue` per `iso_region` in the older snapshot.
old_region_max_date = lbnl_22_iso_projects_long_format.groupby("iso_region").date_entered_queue.max()
old_region_max_date

In [None]:
# How far the latest queue-entry date moved per region (new minus old).
# The subtraction aligns on the region index, so a region present in only one
# snapshot shows up as a missing value.
new_region_max_date - old_region_max_date

## Load raw data

In [None]:
import dbcp

# Extract the raw 2023 LBNL queue workbook and keep only rows whose
# `queue_status` is 'active'.
# NOTE(review): `extract(...)` appears to return a mapping with a
# "lbnl_iso_queue" DataFrame — confirm against dbcp.extract.lbnl_iso_queue.
lbnl_uri = "gs://dgm-archive/lbnl_iso_queue/queues_2023_clean_data.xlsx"
raw_lbnl_23 = dbcp.extract.lbnl_iso_queue.extract(lbnl_uri)["lbnl_iso_queue"].query("queue_status == 'active'")

# Same extraction for the 2022 workbook. `lbnl_uri` is reused, so after this
# cell it points at the 2022 file.
lbnl_uri = "gs://dgm-archive/lbnl_iso_queue/queues_2022_clean_data.xlsx"
raw_lbnl_22 = dbcp.extract.lbnl_iso_queue.extract(lbnl_uri)["lbnl_iso_queue"].query("queue_status == 'active'")

# Row/column counts of the two filtered extracts (2022 first, then 2023).
print(raw_lbnl_22.shape)
print(raw_lbnl_23.shape)

## Compare Raw and data mart differences

In [None]:
def _with_total_capacity_and_region(raw_queue):
    """Return a new frame with `total_capacity_mw` added and `ISO-NE` renamed.

    `total_capacity_mw` is the sum of the three `capacity_mw_resource_*`
    columns, with missing values counted as 0. In `region`, the label
    `ISO-NE` is replaced by `ISONE`.

    The input is not modified. The raw frames come from `.query(...)`, and
    writing columns onto such a filtered frame in place can emit
    SettingWithCopyWarning; building a new frame with `.assign` avoids that
    and keeps the cell safe to re-run.
    """
    total_capacity_mw = (
        raw_queue["capacity_mw_resource_1"].fillna(0)
        + raw_queue["capacity_mw_resource_2"].fillna(0)
        + raw_queue["capacity_mw_resource_3"].fillna(0)
    )
    return raw_queue.assign(
        total_capacity_mw=total_capacity_mw,
        region=raw_queue["region"].replace({"ISO-NE": "ISONE"}),
    )


# Apply the same preparation to both years instead of repeating it inline.
raw_lbnl_22 = _with_total_capacity_and_region(raw_lbnl_22)
raw_lbnl_23 = _with_total_capacity_and_region(raw_lbnl_23)

In [None]:
# Total capacity per region in each raw extract (rows were already limited to
# active projects when the extracts were loaded).
raw_agg_22 = raw_lbnl_22.groupby("region").total_capacity_mw.sum()
raw_agg_23 = raw_lbnl_23.groupby("region").total_capacity_mw.sum()

# Disabled side-by-side view of the two raw extracts, kept for reference:
# raw_agg = pd.concat([raw_lbnl_22.groupby("region").total_capacity_mw.sum(), raw_lbnl_23.groupby("region").total_capacity_mw.sum()], axis=1)

# raw_agg.columns = ["22_capacity_mw", "23_capacity_mw"]
# raw_agg

In [None]:
# Total `capacity_mw` per `iso_region` in each data mart snapshot, for
# comparison with the per-region totals of the raw extracts.
mart_agg_22 = lbnl_22_iso_projects_long_format.groupby("iso_region").capacity_mw.sum()
mart_agg_23 = lbnl_23_iso_projects_long_format.groupby("iso_region").capacity_mw.sum()

In [None]:
def _raw_vs_mart(raw_capacity, mart_capacity):
    """Line up raw and data mart capacity per region and add their relative gap.

    `pct_diff` is (mart - raw) / raw, i.e. a fraction of the raw capacity
    rather than a value scaled to 0-100.
    """
    comparison = pd.concat([raw_capacity, mart_capacity], axis=1)
    comparison.columns = ["raw_capacity_mw", "mart_capacity_mw"]
    gap_mw = comparison.mart_capacity_mw - comparison.raw_capacity_mw
    comparison["pct_diff"] = gap_mw / comparison.raw_capacity_mw
    return comparison


agg_22 = _raw_vs_mart(raw_agg_22, mart_agg_22)
agg_23 = _raw_vs_mart(raw_agg_23, mart_agg_23)


In [None]:
# 2022: regions whose name contains "non-ISO", ordered by the relative gap
# between data mart and raw capacity (most negative first).
agg_22[agg_22.index.str.contains("non-ISO")].sort_values(by="pct_diff")

In [None]:
# 2023: regions whose name contains "non-ISO", ordered by the relative gap
# between data mart and raw capacity (most negative first).
agg_23[agg_23.index.str.contains("non-ISO")].sort_values(by="pct_diff")
# Disabled variant that shows every region instead of only the non-ISO ones:
# agg_23.sort_values(by="pct_diff")

Relative to the raw data, the data mart removes 5 percent more capacity in the West region for the 2023 data than for the 2022 data.

## Aggregate project dataframes by county

In [None]:
def aggregate_iso_projects_by_count(df):
    """Aggregate a long-format ISO projects table to one row per county.

    (The name says "count" but the grouping key is the county; the name is
    kept unchanged for existing callers.)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `county_id_fips`, `iso_region`,
        `capacity_mw`, `co2e_tonnes_per_year`, `is_actionable` and
        `is_nearly_certain`. The two `is_*` columns are used as boolean
        flags; missing flags are treated as False.

    Returns
    -------
    pandas.DataFrame
        One row per `county_id_fips` (as a regular column) with:
        `has_iso_project`, `primary_iso_region`, `capacity_mw`,
        `co2e_tonnes_per_year`, `actionable_capacity_mw`,
        `nearly_certain_capacity_mw`, `actionable_n_projects` and
        `nearly_certain_n_projects`.
    """

    def contains_iso_project(regions):
        # True when at least one non-missing region label lacks "non-ISO".
        # pd.isna covers None and NaN as well as pd.NA, so a missing label can
        # no longer reach the `in` test and raise a TypeError.
        return any("non-ISO" not in region for region in regions if not pd.isna(region))

    def get_primary_iso(regions):
        # Some counties have the same number of projects in several regions;
        # mode() then returns all of them and the first one is taken.
        # Return a scalar (or pd.NA when every label is missing) so the
        # aggregation gets one value per county rather than a Series.
        modes = regions.mode()
        return modes.iloc[0] if len(modes) else pd.NA

    agg_df = df.groupby("county_id_fips").agg(
        has_iso_project=pd.NamedAgg(column="iso_region", aggfunc=contains_iso_project),
        primary_iso_region=pd.NamedAgg(column="iso_region", aggfunc=get_primary_iso),
        capacity_mw=pd.NamedAgg(column="capacity_mw", aggfunc="sum"),
        co2e_tonnes_per_year=pd.NamedAgg(column="co2e_tonnes_per_year", aggfunc="sum"),
    )

    def flagged_capacity_mw(flag_column):
        # Per-county capacity of the rows where `flag_column` is True. Rows
        # that are not flagged contribute 0, so a county without any flagged
        # project gets 0 rather than a missing value.
        is_flagged = df[flag_column].fillna(False).astype(bool)
        return df["capacity_mw"].where(is_flagged, 0).groupby(df["county_id_fips"]).sum()

    agg_df["actionable_capacity_mw"] = flagged_capacity_mw("is_actionable")
    agg_df["nearly_certain_capacity_mw"] = flagged_capacity_mw("is_nearly_certain")

    # Number of flagged projects per county. Both use sum(): count() would
    # return the number of non-null rows, not the number of True flags.
    agg_df["actionable_n_projects"] = df.groupby("county_id_fips").is_actionable.sum()
    agg_df["nearly_certain_n_projects"] = df.groupby("county_id_fips").is_nearly_certain.sum()
    return agg_df.reset_index()

# County-level aggregates for the newer and the older snapshot, built with the
# same function so the two results are directly comparable.
new_projects_counties = aggregate_iso_projects_by_count(lbnl_23_iso_projects_long_format)
old_projects_counties = aggregate_iso_projects_by_count(lbnl_22_iso_projects_long_format)

## Number of counties with projects

In [None]:
# County FIPS codes that appear in each aggregated snapshot.
new_county_ids = set(new_projects_counties.county_id_fips)
old_county_ids = set(old_projects_counties.county_id_fips)

# Counties that have projects in only one of the two snapshots.
n_counties_with_projects_in_new_not_in_old = len(new_county_ids - old_county_ids)
n_counties_with_projects_in_old_not_in_new = len(old_county_ids - new_county_ids)

print(n_counties_with_projects_in_new_not_in_old)
print(n_counties_with_projects_in_old_not_in_new)

In [None]:
# Number of counties with at least one project: older snapshot first, then newer.
print(len(old_projects_counties))
print(len(new_projects_counties))

In [None]:
# Outer join on county so counties present in only one snapshot are kept; their
# columns from the other snapshot are missing. Shared columns get `_old` / `_new`.
merged = old_projects_counties.merge(new_projects_counties, how="outer", on="county_id_fips", suffixes=("_old", "_new"))

In [None]:
# Relative change in county capacity, as a fraction of the old capacity (not
# scaled to 0-100). Counties missing from either snapshot give a missing value,
# and an old capacity of 0 gives inf (or NaN when the new capacity is also 0).
merged["pct_mw_change"] = (merged.capacity_mw_new - merged.capacity_mw_old) / merged.capacity_mw_old

In [None]:
# NOTE(review): the imports in this cell would normally live in a single import
# cell at the top of the notebook; they are left here unchanged.
import plotly.offline as pyo
pyo.init_notebook_mode()

# https://stackoverflow.com/questions/52771328/plotly-chart-not-showing-in-jupyter-notebook
from urllib.request import urlopen
import json
# Download the county boundary GeoJSON from the plotly datasets repository;
# this cell therefore needs network access.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)


import plotly.express as px

# County map coloured by `pct_mw_change`, matching rows to shapes via `county_id_fips`.
# NOTE(review): `pct_mw_change` is computed as a fraction (no x100), so
# range_color=(-4, 4) spans -400% to +400% while the label reads "% Change" —
# confirm that the label and colour range are what is intended.
fig = px.choropleth(merged, geojson=counties, locations='county_id_fips', color='pct_mw_change',
                           color_continuous_scale="RdYlGn",
                           range_color=(-4, 4),
                           scope="usa",
                           labels={'pct_mw_change': "% Change in capacity between data on dev and LBNL 2023 branch"},
                          )
# Remove the figure margins; as the last expression this also displays the figure.
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})


In [None]:
# Total data mart capacity per ISO region for each snapshot, newer one first.
capacity_by_region_23 = lbnl_23_iso_projects_long_format.groupby("iso_region").capacity_mw.sum()
capacity_by_region_22 = lbnl_22_iso_projects_long_format.groupby("iso_region").capacity_mw.sum()

# Put the two totals next to each other, aligned on region.
region_caps = pd.concat([capacity_by_region_23, capacity_by_region_22], axis=1)
region_caps.columns = ["23_mw_capacity", "22_mw_capacity"]

In [None]:
# Per-region capacity of both snapshots as a table.
region_caps

In [None]:
# Same comparison as a grouped bar chart: one group per region, one bar per snapshot.
region_caps.plot.bar()