### Pull data from data warehouse

In [1]:
from dbcp.helpers import get_sql_engine
import pandas as pd

engine = get_sql_engine()

with engine.connect() as con:
    gridstatus_projects = pd.read_sql_table("gridstatus_projects", con, schema="data_warehouse")
    gridstatus_resource_capacity = pd.read_sql_table("gridstatus_resource_capacity", con, schema="data_warehouse")
    gridstatus_locations = pd.read_sql_table("gridstatus_locations", con, schema="data_warehouse")
    
    lbnl_projects = pd.read_sql_table("iso_projects", con, schema="data_warehouse")
    lbnl_resource_capacity = pd.read_sql_table("iso_resource_capacity", con, schema="data_warehouse")
    lbnl_locations = pd.read_sql_table("iso_locations", con, schema="data_warehouse")


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas  # type: ignore


In [2]:
lbnl_projects["entity"] = lbnl_projects["entity"].str.replace("ISO-NE", "ISONE")

In [3]:
gs_capacities = gridstatus_projects.merge(gridstatus_resource_capacity, how="left", validate="1:m")
# gs_capacities = gs_capacities.merge(gridstatus_locations, how="left", validate="m:m")

lbnl_capacities = lbnl_projects.merge(lbnl_resource_capacity, how="left", validate="1:m")
# lbnl_capacities = lbnl_capacities.merge(lbnl_locations, how="left", validate="m:m")

In [4]:
lbnl_capacities = lbnl_capacities[~lbnl_capacities.region.str.contains("non-ISO")]

In [5]:
resource_map = {
        "Battery Storage": "storage",
        "Biofuel": "renewable",
        "Biomass": "renewable",
        "Coal": "fossil",
        "Combustion Turbine": "fossil",
        "CSP": "renewable",
        "Fuel Cell": "renewable",
        "Geothermal": "renewable",
        "Hydro": "renewable",
        "Landfill Gas": "fossil",
        "Methane; Solar": "other",
        "Municipal Solid Waste": "fossil",
        "Natural Gas; Other; Storage; Solar": "fossil",
        "Natural Gas; Storage": "fossil",
        "Natural Gas": "fossil",
        "Nuclear": "other",
        "Offshore Wind": "renewable",
        "Oil; Biomass": "fossil",
        "Oil": "fossil",
        "Onshore Wind": "renewable",
        "Other Storage": "storage",
        "Other": "fossil",
        "Pumped Storage": "storage",
        "Solar; Biomass": "renewable",
        "Solar; Storage": "renewable",
        "Solar": "renewable",
        "Steam": pd.NA,
        "Transmission": "transmission",
        "Unknown": pd.NA,
        "Waste Heat": "fossil",
        "Wind; Storage": "renewable",
        pd.NA: pd.NA,  # not technically necessary but make it explicit
    }

gs_capacities["resource_class"] = gs_capacities["resource_clean"].map(resource_map)
lbnl_capacities["resource_class"] = lbnl_capacities["resource_clean"].map(resource_map)

In [6]:
clean_fuel = ("renewable", "storage")


clean_gs_capacities= gs_capacities[gs_capacities.resource_class.isin(clean_fuel)]
clean_lbnl_capacities = lbnl_capacities[lbnl_capacities.resource_class.isin(clean_fuel)]

In [49]:
gridstatus_projects.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8029 entries, 0 to 8028
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   project_id                  8029 non-null   int64         
 1   actual_completion_date      128 non-null    datetime64[ns]
 2   interconnecting_entity      0 non-null      object        
 3   point_of_interconnection    7596 non-null   object        
 4   project_name                2488 non-null   object        
 5   proposed_completion_date    4780 non-null   datetime64[ns]
 6   queue_date                  7828 non-null   datetime64[ns]
 7   queue_id                    8029 non-null   object        
 8   queue_status                8029 non-null   object        
 9   interconnection_status_raw  3906 non-null   object        
 10  utility                     6568 non-null   object        
 11  withdrawal_comment          1 non-null      object      

In [53]:
gs_iso_region_max_queue_date = gridstatus_projects[~gridstatus_projects.region.str.contains("non-ISO")].groupby("region").queue_date.max()
lbnl_iso_region_max_queue_date = lbnl_projects[~lbnl_projects.region.str.contains("non-ISO")].groupby("region").queue_date.max()
lbnl_iso_region_max_queue_date.index = lbnl_iso_region_max_queue_date.index.str.lower().str.replace("-", "")


(gs_iso_region_max_queue_date - lbnl_iso_region_max_queue_date).astype('timedelta64[M]')

region
caiso     7.0
ercot     2.0
isone    12.0
miso      3.0
nyiso     3.0
pjm       3.0
spp       9.0
Name: queue_date, dtype: float64

In [54]:
((clean_gs_capacities.groupby("entity").capacity_mw.sum() - clean_lbnl_capacities.groupby("entity").capacity_mw.sum()) / clean_lbnl_capacities.groupby("entity").capacity_mw.sum() * 100).round()

entity
CAISO    -9.0
ERCOT    13.0
ISONE    27.0
MISO    -22.0
NYISO   -11.0
PJM      -0.0
SPP      28.0
Name: capacity_mw, dtype: float64

Do these seem like reasonable changes given the delays for each ISO?

## Compare total clean capacity in warehouse and mart

In [14]:
with engine.connect() as con:
    iso_projects_wide_format = pd.read_sql_table("iso_projects_wide_format", con, schema="data_mart")

In [33]:
iso_projects_wide_format.iso_region.str.contains("non-ISO").isna().value_counts()

False    10683
True        16
Name: iso_region, dtype: int64

In [36]:
iso_projects_wide_format = iso_projects_wide_format[~iso_projects_wide_format.iso_region.str.contains("non-ISO").fillna(True)]
iso_projects_wide_format["total_capacity_mw"] = iso_projects_wide_format[["generation_capacity_mw_1", "generation_capacity_mw_2", "storage_capacity_mw"]].sum(axis=1)

clean_iso_projects_wide_format = iso_projects_wide_format[iso_projects_wide_format.resource_class.isin(clean_fuel)]
iso_projects_wide_format.groupby("entity").total_capacity_mw.sum()

entity
CAISO    181507.992035
ERCOT    226964.690000
ISONE     61618.855000
MISO     283188.650000
NYISO    104658.650000
PJM      245126.462800
SPP      148284.562000
Name: total_capacity_mw, dtype: float64

In [41]:
pd.concat([clean_gs_capacities.groupby("entity").capacity_mw.sum(), iso_projects_wide_format.groupby("entity").total_capacity_mw.sum()], axis=1)

Unnamed: 0_level_0,capacity_mw,total_capacity_mw
entity,Unnamed: 1_level_1,Unnamed: 2_level_1
CAISO,179044.322038,181507.992035
ERCOT,217171.09,226964.69
ISONE,41588.058,61618.855
MISO,221551.35,283188.65
NYISO,94341.65,104658.65
PJM,253160.8448,245126.4628
SPP,123289.422,148284.562


Why does the total capacity differ in the data_warehouse tables and `iso_projects_wide_format`? 