# Revisit De-duplication
LBNL queues were deduplicated based on a specific definition of "duplicate".  This notebook revisits the deduplication process to see if it 1) should and 2) can be applied directly to GridStatus data.

## Get Data
### LBNL Queues

In [1]:
import pandas as pd
from pathlib import Path
import numpy as np

from dbcp.extract.lbnl_iso_queue import extract


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
from dbcp.transform.lbnl_iso_queue import parse_date_columns
# partial implementation of transform. I don't want to include deduplication.
def partial_transform(active_projects: pd.DataFrame) -> pd.DataFrame:
    """Transform active iso queue data."""
    rename_dict = {
        "state": "raw_state_name",
        "county": "raw_county_name",
    }
    active_projects = active_projects.rename(columns=rename_dict)  # copy
    # Harmonize the interconnection_status_lbnl values.
    mapping = {
        "Feasability Study": "Feasibility Study",
        "Feasibility": "Feasibility Study",
        "Facilities Study": "Facility Study",
        "IA in Progress": "In Progress (unknown study)",
        "Unknown": "In Progress (unknown study)",
        "Withdrawn, Feasibility Study": "Withdrawn",
    }
    active_projects.loc[:, "interconnection_status_lbnl"] = active_projects.loc[
        :, "interconnection_status_lbnl"
    ].replace(mapping)
    # drop irrelevant columns (structurally all nan due to 'active' filter)
    active_projects.drop(columns=["date_withdrawn", "date_operational"], inplace=True)
    parse_date_columns(active_projects)
    return active_projects


source_path = Path("/app/data/raw/queues_2022_clean_data.xlsx")
raw_lbnl = extract(source_path)["lbnl_iso_queue"]
lbnl = partial_transform(raw_lbnl)


In [3]:
lbnl.shape, lbnl.columns

((29033, 30),
 Index(['queue_id', 'queue_status', 'queue_date_raw', 'queue_year', 'interconnection_date_raw', 'entity', 'project_name', 'developer', 'utility', 'county_1', 'county_2', 'county_3', 'raw_state_name', 'region', 'interconnection_service_type', 'point_of_interconnection', 'date_proposed_raw', 'year_proposed', 'interconnection_status_raw', 'interconnection_status_lbnl', 'resource_type_lbnl', 'resource_type_1', 'resource_type_2', 'resource_type_3', 'capacity_mw_resource_1', 'capacity_mw_resource_2', 'capacity_mw_resource_3', 'queue_date', 'interconnection_date', 'date_proposed'], dtype='object'))

In [4]:
lbnl.head(2)

Unnamed: 0,queue_id,queue_status,queue_date_raw,queue_year,interconnection_date_raw,entity,project_name,developer,utility,county_1,county_2,county_3,raw_state_name,region,interconnection_service_type,point_of_interconnection,date_proposed_raw,year_proposed,interconnection_status_raw,interconnection_status_lbnl,resource_type_lbnl,resource_type_1,resource_type_2,resource_type_3,capacity_mw_resource_1,capacity_mw_resource_2,capacity_mw_resource_3,queue_date,interconnection_date,date_proposed
0,GIA-97,withdrawn,1/7/2022,2022.0,,AEC,,,AEC,new madrid,,,MO,Southeast (non-ISO),Network,New Madrid - Essex 345kV,10/31/2024,2024.0,Withdrawn,Withdrawn,Solar,Solar,,,350.0,,,2022-01-07,NaT,2024-10-31
1,GIA-40,active,10/24/2009,2009.0,,AEC,,,AEC,new madrid,,,MO,Southeast (non-ISO),Network Resource,NM Switchyard (345 kV Bus),11/1/2011,2011.0,Upgrade Approved,IA Executed,Coal,Coal,,,20.0,,,2009-10-24,NaT,2011-11-01


### GridStatus

In [5]:
from dbcp.extract.gridstatus_isoqueues import extract as extract_gs
from dbcp.transform.gridstatus import (
    _transform_miso,
    _transform_caiso,
    _transform_pjm,
    _transform_ercot,
    _transform_spp,
    _transform_nyiso,
    _transform_isone,
    COLUMN_RENAME_DICT,
    _clean_resource_type,
)
def partial_transform_gs(raw_dfs: dict[str, pd.DataFrame]) -> pd.DataFrame:
    # exclude the normalization step
    # create one dataframe
    iso_cleaning_functions = {
        "miso": _transform_miso,
        "caiso": _transform_caiso,
        "pjm": _transform_pjm,
        "ercot": _transform_ercot,
        "spp": _transform_spp,
        "nyiso": _transform_nyiso,
        "isone": _transform_isone,
    }

    projects = []
    for iso, df in raw_dfs.items():
        # Apply rename
        renamed_df = df.rename(columns=COLUMN_RENAME_DICT).copy()

        # Apply iso specific cleaning functions
        renamed_df = iso_cleaning_functions[iso](renamed_df)

        renamed_df["region"] = iso
        renamed_df["entity"] = iso.upper()
        projects.append(renamed_df)

    active_projects = pd.concat(projects)
    active_projects["queue_status"] = active_projects.queue_status.str.lower()

    # parse dates
    date_cols = [col for col in list(active_projects) if "date" in col]
    for col in date_cols:
        active_projects[col] = pd.to_datetime(active_projects[col], utc=True)

    # create project_id
    active_projects["project_id"] = np.arange(len(active_projects), dtype=np.int32)

    # Normalize data
    # (
    #     normalized_projects,
    #     normalized_capacities,
    #     normalized_locations,
    # ) = _normalize_projects(active_projects)

    # harmonize types
    active_projects = _clean_resource_type(active_projects)
    return active_projects

raw_gs = extract_gs()
gs = partial_transform_gs(raw_gs)
gs.shape, gs.columns



((8233, 121),
 Index(['queue_id', 'project_name', 'interconnecting_entity', 'county', 'state', 'point_of_interconnection', 'utility', 'resource', 'capacity_mw', 'summer_capacity_mw',
        ...
        'Serv', 'I39', 'Dev', 'Zone', 'System Impact Study Completed', 'Feasiblity Study Status', 'Optional Interconnection Study Status', 'Project Status', 'project_id', 'resource_clean'], dtype='object', length=121))

## Analyze Duplicates
### ID Duplicates in LBNL

In [6]:
ids_lbnl = ['entity', 'queue_id']
lbnl.duplicated(subset=ids_lbnl, keep=False).agg(['mean', 'sum'])

mean      0.017532
sum     509.000000
dtype: float64

In [7]:
# most ID dupes are from withdrawn or operational projects
lbnl.query('queue_status == "active"').duplicated(subset=ids_lbnl, keep=False).agg(['mean', 'sum'])

mean      0.011125
sum     114.000000
dtype: float64

In [19]:
active_lbnl = lbnl.query('queue_status == "active"').copy()
is_id_dupe_active = active_lbnl.duplicated(subset=ids_lbnl, keep=False)
id_dupe_lbnl_active = active_lbnl.loc[is_id_dupe_active,:]

In [20]:
# only in non-ISO regions
id_dupe_lbnl_active['entity'].value_counts(dropna=False)

Duke          34
WAPA          29
DominionSC    24
SRP           15
LADWP         10
PNM            2
Name: entity, dtype: int64

In [21]:
# what is the duplicate structure of the rest of the columns?
# Excluding nulls, count the number of duplicates for each column.
# Compare the fraction of duplicates (and absolute number of duplicates).
# Values <= ~0.5 indicate that the column is a good candidate for differentiating ID dupes.
pd.concat(
    [
        (
            id_dupe_lbnl_active
            .dropna(subset=c)
            .duplicated(subset=ids_lbnl + [c], keep=False)
            .agg(['mean', 'sum', 'count'])
            .rename(c)
        )
    for c in id_dupe_lbnl_active.columns.difference(set(ids_lbnl))
    ],
    axis=1
).T.sort_values('mean')

Unnamed: 0,mean,sum,count
resource_type_3,0.0,0.0,1.0
developer,0.0,0.0,17.0
interconnection_date,0.0,0.0,2.0
interconnection_date_raw,0.0,0.0,2.0
capacity_mw_resource_2,0.142857,2.0,14.0
date_proposed_raw,0.160714,9.0,56.0
date_proposed,0.163636,9.0,55.0
year_proposed,0.358491,19.0,53.0
point_of_interconnection,0.443396,47.0,106.0
queue_date,0.509434,54.0,106.0


Based on the above, the columns that usually differentiate ID duplicates are:
* `date_proposed`
* `year_proposed`
* `capacity_mw_resource_1`
* `county_1`
* `point_of_interconnection`
* `queue_date`

Columns that are usually the same for ID duplicates are:
* `queue_year`
* `queue_status`
* `interconnection_service_type`
* `interconnection_status_lbnl`
* `utility`
* `resource_type_X`

**Assumption / value judgment: if the only differences are dates, then the project is probably the same. The date differences are probably due to the project being resubmitted for contingency.**

In [22]:
# zero dupes in GS! This matches up with the zero dupes in active LBNL projects because it only includes ISO regions.
ids_gs = ['region', 'queue_id']
gs.duplicated(subset=ids_gs, keep=False).agg(['mean', 'sum'])

mean    0.0
sum     0.0
dtype: float64

I lost a bunch of work here when the container crashed while I updated drivers in the host OS. I'm not going to re-do it, but here were some takeaways:
* PJM is missing proposed_completion_date due to a bug in GridStatus's ETL code.
* fixed some misidentified columns in GS ETL code
* updated LBNL duplicate prioritization to take the keep the record with the latest `date_proposed`, `queue_date`, and `interconnection_status_lbnl`
  * this fixes all but one ID duplicate (excepting LADWP and DominionSC, which have errors in the source data for queue_id, and WAPA, whose queue_id needs to be combined with state into a composite key)

In [23]:
dupe_keys_lbnl = [
        "point_of_interconnection_clean",  # string normalization on point_of_interconnection
        "capacity_mw_resource_1",
        "county_1",
        "raw_state_name",  # not often useful but is a nearly certain differentiator
        "utility_clean",  # utility.fillna(region)
        "resource_type_1",  # not often useful but is a nearly certain differentiator
    ]
dupe_keys_gs = [
        "point_of_interconnection_clean",
        "capacity_mw",
        "county",
        "state",
        "utility_clean",
        "resource",
    ]

In [24]:
def normalize_poi(ser: pd.Series) -> pd.Series:
    # Essentially a poor man's bag-of-words model.
    out = (
        ser
        .astype("string")
        .str.lower()
        .str.replace("-| +", " ", regex=True)
        .str.replace(r"(?:sub)station|kv| at |tbd", "", regex=True)
        .fillna("")
    )
    out = pd.Series(
       [" ".join(sorted(x)) for x in out.str.split()],
        index=out.index,
       dtype="string",
    ).str.strip()
    out.replace("", pd.NA, inplace=True)
    return out
gs.loc[:, 'point_of_interconnection_clean'] = normalize_poi(gs['point_of_interconnection'])
gs['utility_clean'] = gs['utility'].fillna(gs['region'])

In [25]:
gs[['point_of_interconnection', 'point_of_interconnection_clean']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8233 entries, 0 to 987
Data columns (total 2 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   point_of_interconnection        7706 non-null   string
 1   point_of_interconnection_clean  7705 non-null   string
dtypes: string(2)
memory usage: 193.0 KB


In [26]:
gs[dupe_keys_gs + ['utility', 'point_of_interconnection']].isna().agg(['mean', 'sum']).T

Unnamed: 0,mean,sum
point_of_interconnection_clean,0.064132,528.0
capacity_mw,0.002429,20.0
county,0.012268,101.0
state,0.003401,28.0
utility_clean,0.0,0.0
resource,0.013118,108.0
utility,0.210009,1729.0
point_of_interconnection,0.064011,527.0


In [27]:
is_dupe_gs = gs.duplicated(subset=dupe_keys_gs, keep=False)
is_dupe_gs.agg(['mean', 'sum'])

mean      0.091704
sum     755.000000
dtype: float64

In [28]:
# compare to LBNL
active_lbnl['utility_clean'] = active_lbnl['utility'].fillna(active_lbnl['entity'])
active_lbnl['point_of_interconnection_clean'] = normalize_poi(active_lbnl['point_of_interconnection'])
is_dupe_lbnl = active_lbnl.duplicated(subset=dupe_keys_lbnl, keep=False)
is_dupe_lbnl.agg(['mean', 'sum'])

mean       0.109495
sum     1122.000000
dtype: float64

In [30]:
active_lbnl.shape

(10247, 32)

In [29]:
active_lbnl[dupe_keys_lbnl + ['utility', 'point_of_interconnection']].isna().agg(['mean', 'sum']).T

Unnamed: 0,mean,sum
point_of_interconnection_clean,0.007514,77.0
capacity_mw_resource_1,0.016005,164.0
county_1,0.036791,377.0
raw_state_name,0.033961,348.0
utility_clean,0.0,0.0
resource_type_1,0.00039,4.0
utility,0.147946,1516.0
point_of_interconnection,0.006636,68.0


Now determine which duplicate to keep. Can use the same general approach as for LBNL queues, but GS doesn't have a standardized interconnection status, and PJM is currently missing proposed completion date (but will not in the future).