# Attempts at Deduplicating the ISO queue data
### Final function at bottom of notebook

In [1]:
import pandas as pd
import dbcp
import copy



In [6]:
#conn = 'postgresql://postgres:postgres@postgres/postgres'

# Annoying Setup
Want ISO projects up to but not including normalization - keep separate for first round of deduplication

Priority is the active projects.

In [220]:
# copied out of etl.py:etl_lbnlisoqueues
import logging
from pathlib import Path
from typing import Dict

import pandas as pd
import pandas_gbq
import pydata_google_auth
import sqlalchemy as sa

import dbcp
from dbcp.constants import WORKING_PARTITIONS
from dbcp.schemas import TABLE_SCHEMAS
from dbcp.workspace.datastore import DBCPDatastore
from pudl.output.pudltabl import PudlTabl

logger = logging.getLogger(__name__)

# Extract: read the LBNL ISO queue data for the configured working partition.
# The result is iterated with `.items()` as (table name, dataframe) pairs below.
ds = DBCPDatastore(sandbox=True, local_cache_path="/app/input")
lbnl_update_date = WORKING_PARTITIONS["lbnlisoqueues"]["update_date"]
lbnl_extractor = dbcp.extract.lbnlisoqueues.Extractor(ds)
lbnl_raw_dfs = lbnl_extractor.extract(update_date=lbnl_update_date)

In [221]:
# copied out of dbcp.transform.lbnlisoqueues.py:transform
# added "dbcp.transform.lbnlisoqueues." prefix as needed to internal funcs
import logging
from typing import Any, Dict, List

import pandas as pd

from dbcp.schemas import TABLE_SCHEMAS
from dbcp.transform.helpers import normalize_multicolumns_to_rows, parse_dates
from pudl.helpers import add_fips_ids as _add_fips_ids

# Transform copies so that `lbnl_raw_dfs` keeps the extracted frames as they were.
lbnl_transformed_dfs = dict(
    (name, raw_df.copy()) for name, raw_df in lbnl_raw_dfs.items()
)
# Called for its effect on the dict of frames; the return value is not used.
dbcp.transform.lbnlisoqueues._set_global_project_ids(lbnl_transformed_dfs)

# Each table is transformed by the module-level function that shares its name.
lbnl_table_names = (
    "active_iso_queue_projects",
    "completed_iso_queue_projects",
    "withdrawn_iso_queue_projects",
)
lbnl_transform_functions = {
    lbnl_table: getattr(dbcp.transform.lbnlisoqueues, lbnl_table)
    for lbnl_table in lbnl_table_names
}

for table_name, table_transform in lbnl_transform_functions.items():
    logger.info(f"LBNL ISO Queues: Transforming {table_name} table.")
    untransformed = lbnl_transformed_dfs[table_name]
    lbnl_transformed_dfs[table_name] = table_transform(untransformed)

In [222]:
lbnl_transformed_dfs['active_iso_queue_projects'].head(3)

Unnamed: 0_level_0,capacity_mw_resource_1,capacity_mw_resource_2,capacity_mw_resource_3,county_1,county_2,county_3,date_proposed_raw,developer,entity,interconnection_status_lbnl,...,queue_year,region,resource_type_1,resource_type_2,resource_type_3,resource_type_lbnl,state,utility,year_proposed,date_proposed
project_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,500.0,,,yellowstone,,,12/31/2023,,NWE,In Progress,...,2020,West (non-ISO),Battery,,,Battery,MT,Colstrip,2023.0,2023-12-31
1,500.0,,,yellowstone,,,12/31/2023,,PacifiCorp,In Progress,...,2020,West (non-ISO),Battery,,,Battery,MT,Colstrip,2023.0,2023-12-31
2,500.0,,,yellowstone,,,12/31/2023,,Avista,Not Started,...,2020,West (non-ISO),Battery,,,Battery,MT,Colstrip,2023.0,2023-12-31


# Deduplication

In [223]:
# `*_dups` reference the transformed frames as-is (duplicates included);
# `active` / `completed` / `withdrawn` are independent deep copies to experiment on.
active_dups = lbnl_transformed_dfs['active_iso_queue_projects']
active = active_dups.copy(deep=True)

completed_dups = lbnl_transformed_dfs['completed_iso_queue_projects']
completed = completed_dups.copy(deep=True)

withdrawn_dups = lbnl_transformed_dfs['withdrawn_iso_queue_projects']
withdrawn = withdrawn_dups.copy(deep=True)

### Drop exact duplicates
There is just one, in the `completed` table (1706 → 1705 rows after dropping it).

In [6]:
completed.shape

(1706, 22)

In [8]:
completed = completed.drop_duplicates()
completed.shape

(1705, 22)

### Which columns are enough to be considered a key?
Notes:
- `state`: didn't use because could span multiple states or be ambiguous what state (WY vs MT?)
- `point_of_interconnection`: could be used but needs some serious cleaning (see example below)
- `entity`: seems like this is a more specific version of `region`, where `region` groups non-ISO projects

In [6]:
active.columns

Index(['capacity_mw_resource_1', 'capacity_mw_resource_2',
       'capacity_mw_resource_3', 'county_1', 'county_2', 'county_3',
       'date_proposed_raw', 'developer', 'entity',
       'interconnection_status_lbnl', 'interconnection_status_raw',
       'point_of_interconnection', 'project_name', 'queue_date', 'queue_id',
       'queue_status', 'queue_year', 'region', 'resource_type_1',
       'resource_type_2', 'resource_type_3', 'resource_type_lbnl', 'state',
       'utility', 'year_proposed', 'date_proposed'],
      dtype='object')

In [6]:
active.head(5)

Unnamed: 0_level_0,capacity_mw_resource_1,capacity_mw_resource_2,capacity_mw_resource_3,county_1,county_2,county_3,date_proposed_raw,developer,entity,interconnection_status_lbnl,...,queue_year,region,resource_type_1,resource_type_2,resource_type_3,resource_type_lbnl,state,utility,year_proposed,date_proposed
project_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,500.0,,,yellowstone,,,12/31/2023,,NWE,In Progress,...,2020,West (non-ISO),Battery,,,Battery,MT,Colstrip,2023.0,2023-12-31
1,500.0,,,yellowstone,,,12/31/2023,,PacifiCorp,In Progress,...,2020,West (non-ISO),Battery,,,Battery,MT,Colstrip,2023.0,2023-12-31
2,500.0,,,yellowstone,,,12/31/2023,,Avista,Not Started,...,2020,West (non-ISO),Battery,,,Battery,MT,Colstrip,2023.0,2023-12-31
3,725.0,,,riverside,,,11/15/2024,,CAISO,Not Started,...,2019,CAISO,Battery,,,Battery,CA,,2024.0,2024-11-15
4,4.5,,,garfield,,,12/31/2021,,SPP,IA Executed,...,2015,SPP,Wind,,,Wind,OK,OKGE,2021.0,2021-12-31


In [7]:
active.entity.value_counts()

PJM           1541
MISO           580
ERCOT          527
SPP            498
CAISO          346
NYISO          308
ISO-NE         263
PacifiCorp     164
FPL            125
Duke           114
Dominion       108
BPA            103
SOCO           101
APS             90
TVA             84
SC              76
NVE             72
PNM             60
T-S             47
NWE             43
IP              42
SRP             33
Avista          33
LADWP           32
GTC             32
TEC             31
AEC             24
WAPA            23
LGE-KU          18
PSE             17
PSCO            17
TEP             15
IID             13
PGE             13
PRPA            10
EPE              8
SEC              8
BHC              6
JEA              6
CLPT             4
N-C              3
FMPP             1
Name: entity, dtype: int64

In [8]:
active.region.value_counts()

PJM                    1541
West (non-ISO)          848
Southeast (non-ISO)     728
MISO                    580
ERCOT                   527
SPP                     498
CAISO                   346
NYISO                   308
ISO-NE                  263
Name: region, dtype: int64

In [79]:
# Two candidate keys that share everything except the last column:
# `ids1` ends with `region`, `ids2` ends with `entity`.
shared_key_cols = ['county_1', 'resource_type_1', 'capacity_mw_resource_1', 'utility']
ids1 = shared_key_cols + ['region']
ids2 = shared_key_cols + ['entity']

In [80]:
active[active.duplicated(ids1, keep='first')].sort_values(ids1)

Unnamed: 0_level_0,capacity_mw_resource_1,capacity_mw_resource_2,capacity_mw_resource_3,county_1,county_2,county_3,date_proposed_raw,developer,entity,interconnection_status_lbnl,...,queue_year,region,resource_type_1,resource_type_2,resource_type_3,resource_type_lbnl,state,utility,year_proposed,date_proposed
project_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1081,75.0,,,abbeville,,,,,SC,In Progress,...,2018,Southeast (non-ISO),Solar,,,Solar,SC,Santee Cooper,,NaT
5193,150.0,,,acadia,,,8/31/2023,,MISO,Not Started,...,2020,MISO,Solar,,,Solar,LA,Entergy Louisiana LLC,2023.0,2023-08-31
3165,40.0,,,accomack,,,12/1/2021,,PJM,Not Started,...,2020,PJM,Battery,,,Battery,VA,DPL,2021.0,2021-12-01
4784,2.3,,,ada,,,01may2020,,IP,In Progress,...,2019,West (non-ISO),Hydro,,,Hydro,ID,Idaho Power,2020.0,2020-05-01
5092,200.0,,,ada,,,01oct2024,,IP,Not Started,...,2020,West (non-ISO),Solar,,,Solar,ID,Idaho Power,2024.0,2024-10-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3587,76.0,,,,,,,,Dominion,unknown,...,2017,Southeast (non-ISO),Solar,,,Solar,SC,Dominion,,NaT
692,80.0,,,,,,,,Dominion,unknown,...,2018,Southeast (non-ISO),Solar,,,Solar,SC,Dominion,,NaT
3195,80.0,,,,,,,,Dominion,unknown,...,2018,Southeast (non-ISO),Solar,,,Solar,SC,Dominion,,NaT
4995,80.0,,,,,,,,Dominion,unknown,...,2018,Southeast (non-ISO),Solar,,,Solar,SC,Dominion,,NaT


In [84]:
active[active.duplicated(ids2, keep='first')].sort_values(ids2)

Unnamed: 0_level_0,capacity_mw_resource_1,capacity_mw_resource_2,capacity_mw_resource_3,county_1,county_2,county_3,date_proposed_raw,developer,entity,interconnection_status_lbnl,...,queue_year,region,resource_type_1,resource_type_2,resource_type_3,resource_type_lbnl,state,utility,year_proposed,date_proposed
project_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1081,75.0,,,abbeville,,,,,SC,In Progress,...,2018,Southeast (non-ISO),Solar,,,Solar,SC,Santee Cooper,,NaT
5193,150.0,,,acadia,,,8/31/2023,,MISO,Not Started,...,2020,MISO,Solar,,,Solar,LA,Entergy Louisiana LLC,2023.0,2023-08-31
3165,40.0,,,accomack,,,12/1/2021,,PJM,Not Started,...,2020,PJM,Battery,,,Battery,VA,DPL,2021.0,2021-12-01
4784,2.3,,,ada,,,01may2020,,IP,In Progress,...,2019,West (non-ISO),Hydro,,,Hydro,ID,Idaho Power,2020.0,2020-05-01
5092,200.0,,,ada,,,01oct2024,,IP,Not Started,...,2020,West (non-ISO),Solar,,,Solar,ID,Idaho Power,2024.0,2024-10-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3587,76.0,,,,,,,,Dominion,unknown,...,2017,Southeast (non-ISO),Solar,,,Solar,SC,Dominion,,NaT
692,80.0,,,,,,,,Dominion,unknown,...,2018,Southeast (non-ISO),Solar,,,Solar,SC,Dominion,,NaT
3195,80.0,,,,,,,,Dominion,unknown,...,2018,Southeast (non-ISO),Solar,,,Solar,SC,Dominion,,NaT
4995,80.0,,,,,,,,Dominion,unknown,...,2018,Southeast (non-ISO),Solar,,,Solar,SC,Dominion,,NaT


In [35]:
active.columns

Index(['capacity_mw_resource_1', 'capacity_mw_resource_2',
       'capacity_mw_resource_3', 'county_1', 'county_2', 'county_3',
       'date_proposed_raw', 'developer', 'entity',
       'interconnection_status_lbnl', 'interconnection_status_raw',
       'point_of_interconnection', 'project_name', 'queue_date', 'queue_id',
       'queue_status', 'queue_year', 'region', 'resource_type_1',
       'resource_type_2', 'resource_type_3', 'resource_type_lbnl', 'state',
       'utility', 'year_proposed', 'date_proposed'],
      dtype='object')

These id sets are pretty similar - most commonly date proposed is different  
Rule: assume that different date proposed is just someone entering a project into the queue twice with an updated/uncertain date

In [81]:
# Dropping rows that differ only in these columns seems pretty reasonable:
# leave the date columns, the raw columns and the queue identifiers out of the key.
removed_cols = [
    'date_proposed',
    'year_proposed',
    'date_proposed_raw',
    'interconnection_status_raw',
    'queue_id',
    'queue_date',
    'queue_year',
]
# Every remaining column, in its original order, forms the comparison key.
cols = active.columns.drop(removed_cols).to_list()
active.loc[active.duplicated(subset=cols, keep='first')].sort_values(by=cols)

Unnamed: 0_level_0,capacity_mw_resource_1,capacity_mw_resource_2,capacity_mw_resource_3,county_1,county_2,county_3,date_proposed_raw,developer,entity,interconnection_status_lbnl,...,queue_year,region,resource_type_1,resource_type_2,resource_type_3,resource_type_lbnl,state,utility,year_proposed,date_proposed
project_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3881,0.0000,151.0,,floyd,,,44743,,SOCO,In Progress,...,2019,Southeast (non-ISO),Solar,Battery,,Solar+Battery,GA,Southern Company,2022.0,2022-07-01
4702,0.0000,,,hillsborough,,,01jan2021,,TEC,IA Executed,...,2017,Southeast (non-ISO),Gas,,,Gas,FL,Tampa Electric,2021.0,2021-01-01
2308,0.0000,,,lee,,,11/1/2024,,PJM,Not Started,...,2020,PJM,Battery,,,Battery,IL,ComEd,2024.0,2024-11-01
3695,0.0000,,,lee,,,11/1/2024,,PJM,Not Started,...,2020,PJM,Battery,,,Battery,IL,ComEd,2024.0,2024-11-01
3418,0.0076,,,,,,,,Dominion,unknown,...,2020,Southeast (non-ISO),Solar,,,Solar,SC,Dominion,,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4098,1350.0000,,,person,,,,,Duke,In Progress,...,2020,Southeast (non-ISO),Gas,,,Gas,NC,Duke Progress,,NaT
4643,1350.0000,,,person,,,,,Duke,In Progress,...,2019,Southeast (non-ISO),Gas,,,Gas,NC,Duke Progress,,NaT
1369,1350.0000,,,person,,,,,Duke,In Progress,...,2019,Southeast (non-ISO),Gas,,,Gas,NC,Duke Progress,,NaT
5490,1350.0000,,,person,,,,,Duke,In Progress,...,2019,Southeast (non-ISO),Gas,,,Gas,NC,Duke Progress,,NaT


In [83]:
active.interconnection_status_lbnl.head(10)

project_id
0    In Progress
1    In Progress
2    Not Started
3    Not Started
4    IA Executed
5    In Progress
6    In Progress
7    In Progress
8    In Progress
9    In Progress
Name: interconnection_status_lbnl, dtype: object

Next steps:  
- Clean `point_of_interconnection` and the other string fields. Simple rule-based string cleaning wasn't very effective because there are too many different cases, but it may be worth coming back to.
    - A better route may be fuzzy matching: compute a string-distance metric (e.g. Levenshtein edit distance) between values and treat pairs below a threshold as the same, instead of writing string cleaning rules.

### Which columns create the most "one off" rows?
- When you remove a column from the "key" and there are a lot of duplicates then this column was helping to distinguish rows
    - Maybe these are columns that deserve some attention - since they are part of the "key" then they should be clean
- There are a lot of rows where only `queue_id` is different. This means all the same data was entered twice with a different `queue_id` - can it be dropped?

In [32]:
# Leave-one-column-out check: for each column, count the rows that share all
# remaining columns (`queue_id` is always ignored) with at least one other row.
# A count well above the baseline means that column was telling rows apart.
cols = active.columns.to_list()
cols.remove('queue_id')

# `duplicated(...).sum()` counts the flagged rows directly, so there is no need
# to materialise and sort the duplicated frame just to take its length.
# `int(...)` keeps plain Python ints in the displayed result.
dups = {
    col: int(
        active.duplicated([other for other in cols if other != col], keep=False).sum()
    )
    for col in cols
}
sorted(dups.items(), key=lambda item: item[1])

[('capacity_mw_resource_3', 209),
 ('county_2', 209),
 ('county_3', 209),
 ('date_proposed_raw', 209),
 ('entity', 209),
 ('interconnection_status_lbnl', 209),
 ('queue_status', 209),
 ('queue_year', 209),
 ('region', 209),
 ('resource_type_1', 209),
 ('resource_type_2', 209),
 ('resource_type_3', 209),
 ('resource_type_lbnl', 209),
 ('state', 209),
 ('utility', 209),
 ('year_proposed', 209),
 ('date_proposed', 209),
 ('capacity_mw_resource_2', 212),
 ('interconnection_status_raw', 219),
 ('developer', 220),
 ('project_name', 223),
 ('county_1', 229),
 ('queue_date', 233),
 ('capacity_mw_resource_1', 356),
 ('point_of_interconnection', 395)]

This suggests a good key might be the following:  
`ids = ['point_of_interconnection', 'capacity_mw_resource_1', 'county_1', 'project_name', 'developer', 'region']`

In [89]:
active.project_name.dropna().value_counts().head(20)

Solar                                              71
Battery Storage                                    16
Offshore Wind                                      11
Bowman Street 115 kV SS  - Augusta Area Study       6
Augusta E. Side 115 kV SS  - Augusta Area Study     5
Fuel Cell                                           4
Solar + Battery Storage                             4
Solar CNR only                                      3
Puddledock Road 115 kV SS  - Augusta Area Study     3
Solar (QF)                                          3
Wind                                                3
Energy Storage                                      3
Limestone Solar                                     2
Offshore Wind Increase                              2
Solar + Battery                                     2
Combined Cycle Upgrade                              2
Solar + battery                                     2
Knickerbocker Solar                                 2
NY Wind Holbrook 2          

### Remove duplicates where only queue_id is different

In [35]:
# Rows that agree on every column except `queue_id` (all members of each group are shown).
ids = active.columns.drop('queue_id').to_list()
active.loc[active.duplicated(subset=ids, keep=False)].sort_values(by=ids)

Unnamed: 0_level_0,capacity_mw_resource_1,capacity_mw_resource_2,capacity_mw_resource_3,county_1,county_2,county_3,date_proposed_raw,developer,entity,interconnection_status_lbnl,...,queue_year,region,resource_type_1,resource_type_2,resource_type_3,resource_type_lbnl,state,utility,year_proposed,date_proposed
project_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2920,0.0,151.0,,floyd,,,44743,,SOCO,In Progress,...,2019,Southeast (non-ISO),Solar,Battery,,Solar+Battery,GA,Southern Company,2022.0,2022-07-01
3881,0.0,151.0,,floyd,,,44743,,SOCO,In Progress,...,2019,Southeast (non-ISO),Solar,Battery,,Solar+Battery,GA,Southern Company,2022.0,2022-07-01
590,0.0,,,lee,,,11/1/2024,,PJM,Not Started,...,2020,PJM,Battery,,,Battery,IL,ComEd,2024.0,2024-11-01
2308,0.0,,,lee,,,11/1/2024,,PJM,Not Started,...,2020,PJM,Battery,,,Battery,IL,ComEd,2024.0,2024-11-01
3695,0.0,,,lee,,,11/1/2024,,PJM,Not Started,...,2020,PJM,Battery,,,Battery,IL,ComEd,2024.0,2024-11-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4283,1000.0,,,san juan,,,9/30/2026,,APS,Not Started,...,2019,West (non-ISO),Wind,,,Wind,AZ,Arizona Public Service,2026.0,2026-09-30
1906,1350.0,,,person,,,,,Duke,In Progress,...,2019,Southeast (non-ISO),Gas,,,Gas,NC,Duke Progress,,NaT
4643,1350.0,,,person,,,,,Duke,In Progress,...,2019,Southeast (non-ISO),Gas,,,Gas,NC,Duke Progress,,NaT
1245,,400.0,,elmore,,,15dec2020,,IP,Not Started,...,2020,West (non-ISO),Solar,Wind,,Solar+Wind,ID,Idaho Power,2020.0,2020-12-15


### Clean up point_of_interconnection

In [42]:
active.point_of_interconnection.head(3)

project_id
0    500kV at Broadview Substation
1             Broadview substation
2      Broadview Substation 500 kV
Name: point_of_interconnection, dtype: object

In [38]:
active.point_of_interconnection.value_counts()

TBD                                         30
Whirlwind Substation 230kV                  12
Substation 1363; 161kV Substation           11
Rosamond 230kV Switching Station             9
Hopewell-Surry 230 kV                        9
                                            ..
New substation on 500 kV McNary-HPP Line     1
NG Wilbraham substation                      1
Liberty Center-Buckeye Tap 69 kV             1
NG Treasure Valley substation                1
Circuit LAV11 out of Lava substation         1
Name: point_of_interconnection, Length: 4470, dtype: int64

In [283]:
active_test = active.copy()

# Normalise the point-of-interconnection text so that word order, case and a few
# filler tokens stop distinguishing otherwise identical values.
# - `astype(str)` turns missing values into the literal string "nan"; "tbd" is
#   mapped onto that same placeholder.
# - Every replacement is a literal substring replacement. `regex=False` is passed
#   explicitly because the default of `regex` differs between pandas versions.
# - "substation" must be removed before "station".
active_test['poi_clean'] = (
    active_test['point_of_interconnection']
    .astype(str)
    .str.lower()
    .str.replace("substation", "", regex=False)
    .str.replace("kv", "", regex=False)
    .str.replace("-", " ", regex=False)
    .str.replace("station", "", regex=False)
    .str.replace(",", "", regex=False)
    .str.replace("tbd", "nan", regex=False)
)

# Sort the whitespace-separated tokens so "broadview 500" and "500 broadview" match.
active_test['poi_clean'] = [
    ' '.join(sorted(tokens)) for tokens in active_test['poi_clean'].str.split()
]
active_test['poi_clean'] = active_test['poi_clean'].str.strip()
active_test.poi_clean.value_counts()

nan                                   97
230 whirlwind                         17
230 hopewell surry                    13
1363; 161                             11
230 rosamond switching                 9
                                      ..
eweb's exact location nan. system.     1
115 george orangeburg st               1
115 chateaugay willis                  1
70 banos line los o'neil pgp           1
circuit lav11 lava of out              1
Name: poi_clean, Length: 4223, dtype: int64

In [72]:
active_test.poi_clean.value_counts().head(50)

tbd                                            30
230 whirlwind                                  17
230 hopewell surry                             13
1363; 161                                      11
230 imperial valley                             9
230 rosamond switching                          9
345 riverside                                   9
230 gates                                       8
230 central pinal                               8
polaris                                         8
345 gowanus                                     8
345 electric junction nelson                    8
345 3740;                                       8
230 windhub                                     8
345 creek stranger                              7
345 44200 hillje                                7
230 rawhide                                     7
county lucie northern st                        7
bullard                                         7
1685 1695 345 both ckts farmersvl moses tap     7


### Clean up county

In [76]:
import addfips

In [85]:
# still iterating on this
def add_fips_ids(df, state_col="state", county_col="county", vintage=2015):
    """
    Add State and County FIPS IDs to a dataframe.

    To just add State FIPS IDs, make county_col = None.

    Args:
        df: dataframe containing ``state_col`` and, when ``county_col`` is
            truthy, ``county_col``.
        state_col: column whose values are passed to ``addfips`` as ``state``.
        county_col: column whose values are passed to ``addfips`` as
            ``county``; a falsy value skips the county lookup entirely.
        vintage: forwarded to ``addfips.AddFIPS``.

    Returns:
        The dataframe produced by casting the key column(s) to the nullable
        string dtype, with ``state_id_fips`` (and ``county_id_fips`` when
        requested) added as nullable string columns.
    """
    # force the columns to be the nullable string types so we have a consistent
    # null value to filter out before feeding to addfips
    key_cols = [state_col] + ([county_col] if county_col else [])
    df = df.astype({col: pd.StringDtype() for col in key_cols})
    af = addfips.AddFIPS(vintage=vintage)

    def _log_coverage(level, codes):
        # An empty frame has no meaningful share; report 0 instead of
        # raising ZeroDivisionError.
        share = codes.notnull().sum() / len(codes) if len(codes) else 0.0
        logger.info(
            f"Assigned {level} FIPS codes for "
            f"{share:.2%} of records."
        )

    # Lookup the state and county FIPS IDs and add them to the dataframe.
    # The codes are built positionally and stored as nullable strings - the
    # leading zeros are important.
    state_codes = [
        af.get_state_fips(state=state) if pd.notnull(state) else pd.NA
        for state in df[state_col]
    ]
    df["state_id_fips"] = pd.array(state_codes, dtype=pd.StringDtype())
    _log_coverage("state", df["state_id_fips"])

    if county_col:
        county_codes = [
            af.get_county_fips(state=state, county=county)
            if pd.notnull(county) and pd.notnull(state) else pd.NA
            for state, county in zip(df[state_col], df[county_col])
        ]
        df["county_id_fips"] = pd.array(county_codes, dtype=pd.StringDtype())
        _log_coverage("county", df["county_id_fips"])
    return df

### Make sure NaNs are being handled appropriately

In [16]:
nan_test = active[['capacity_mw_resource_1', 'capacity_mw_resource_2', 'capacity_mw_resource_3']].head(2)
nan_test

Unnamed: 0_level_0,capacity_mw_resource_1,capacity_mw_resource_2,capacity_mw_resource_3
project_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,500.0,,
1,500.0,,


In [19]:
nan_test.duplicated(keep=False)

project_id
0    True
1    True
dtype: bool

### Playing with possible keys

In [97]:
active.columns

Index(['capacity_mw_resource_1', 'capacity_mw_resource_2',
       'capacity_mw_resource_3', 'county_1', 'county_2', 'county_3',
       'date_proposed_raw', 'developer', 'entity',
       'interconnection_status_lbnl', 'interconnection_status_raw',
       'point_of_interconnection', 'project_name', 'queue_date', 'queue_id',
       'queue_status', 'queue_year', 'region', 'resource_type_1',
       'resource_type_2', 'resource_type_3', 'resource_type_lbnl', 'state',
       'utility', 'year_proposed', 'date_proposed'],
      dtype='object')

#### Too much variability with project_name and developer, exclude from key

In [106]:
# Compare two candidate keys: `ids1` includes `project_name`, `ids2` is the same key without it.
ids1 = ['point_of_interconnection', 'capacity_mw_resource_1', 'county_1', 'developer', 'project_name', 'region', 'resource_type_1']
ids2 = [key_col for key_col in ids1 if key_col != 'project_name']
# All rows that share their key with at least one other row, under each key.
df1 = active.loc[active.duplicated(subset=ids1, keep=False)].sort_values(by=ids1)
df2 = active.loc[active.duplicated(subset=ids2, keep=False)].sort_values(by=ids2)
(len(df1), len(df2))

(467, 514)

In [111]:
# Rows flagged as duplicates under `ids2` but not under `ids1`.
# The index holds `project_id` labels, so select with `.loc` (label-based) rather
# than `.iloc` (position-based); `Index.difference` also gives a sorted,
# deterministic starting order instead of arbitrary set order.
active.loc[df2.index.difference(df1.index)].sort_values(ids2)[ids1].head(7)

Unnamed: 0_level_0,point_of_interconnection,capacity_mw_resource_1,county_1,developer,project_name,region,resource_type_1
project_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1813,230kV at Broadview Substation,80.0,yellowstone,,,West (non-ISO),Solar
3192,230kV at Broadview Substation,80.0,yellowstone,,Broadview Solar I,West (non-ISO),Solar
539,44200 Hillje 345kV,201.0,wharton,Sunchase Power,Danish Fields III Solar,ERCOT,Solar
439,44200 Hillje 345kV,201.0,wharton,Sunchase Power,Danish Fields Solar,ERCOT,Solar
836,44200 Hillje 345kV,201.0,wharton,Sunchase Power,Danish Fields II Solar,ERCOT,Solar
661,59903 Bearkat 345kV,250.0,glasscock,CIP,Kontiki 1 Wind (ERIK),ERCOT,Wind
3477,59903 Bearkat 345kV,250.0,glasscock,CIP,Kontiki 2 Wind (ERNEST),ERCOT,Wind


In [115]:
# Compare two candidate keys: `ids1` includes `developer`, `ids2` is the same key without it.
ids1 = ['point_of_interconnection', 'capacity_mw_resource_1', 'county_1', 'developer', 'region', 'resource_type_1']
ids2 = [key_col for key_col in ids1 if key_col != 'developer']
# All rows that share their key with at least one other row, under each key.
df1 = active.loc[active.duplicated(subset=ids1, keep=False)].sort_values(by=ids1)
df2 = active.loc[active.duplicated(subset=ids2, keep=False)].sort_values(by=ids2)
(len(df1), len(df2))

(514, 553)

In [117]:
# Rows flagged as duplicates under `ids2` but not under `ids1`.
# The index holds `project_id` labels, so select with `.loc` (label-based) rather
# than `.iloc` (position-based); `Index.difference` also gives a sorted,
# deterministic starting order instead of arbitrary set order.
active.loc[df2.index.difference(df1.index)].sort_values(ids2)[ids1].head(5)

Unnamed: 0_level_0,point_of_interconnection,capacity_mw_resource_1,county_1,developer,region,resource_type_1
project_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4103,11306 Dermott 138kV,52.0,scurry,"KCE TX 13, LLC",ERCOT,Battery
4163,11306 Dermott 138kV,52.0,scurry,"KCE TX 22, LLC",ERCOT,Battery
518,3650 Elgin SS 138kV,52.0,williamson,"KCE TX 19, LLC",ERCOT,Battery
829,3650 Elgin SS 138kV,52.0,williamson,"KCE TX 21, LLC",ERCOT,Battery
691,60718 Pecos Valley 138kV,203.0,pecos,BRP Lyra BESS LLC,ERCOT,Battery


Utility doesn't seem to distinguish much

In [127]:
# Compare two candidate keys: `ids2` is the base key, `ids1` is the base key plus `utility`.
ids2 = ['point_of_interconnection', 'capacity_mw_resource_1', 'county_1', 'region', 'resource_type_1']
ids1 = ids2 + ['utility']
# All rows that share their key with at least one other row, under each key.
df1 = active.loc[active.duplicated(subset=ids1, keep=False)].sort_values(by=ids1)
df2 = active.loc[active.duplicated(subset=ids2, keep=False)].sort_values(by=ids2)
(len(df1), len(df2))

(551, 553)

In [129]:
# Rows flagged as duplicates under `ids2` but not under `ids1`.
# The index holds `project_id` labels, so select with `.loc` (label-based) rather
# than `.iloc` (position-based); `Index.difference` also gives a sorted,
# deterministic starting order instead of arbitrary set order.
active.loc[df2.index.difference(df1.index)].sort_values(ids2)[ids1]

Unnamed: 0_level_0,point_of_interconnection,capacity_mw_resource_1,county_1,region,resource_type_1,utility
project_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
266,Roadrunner 115kV,110.0,lea,SPP,Solar,SWPS
5491,Roadrunner 115kV,110.0,lea,SPP,Solar,


Seems like the same project was added to the queue multiple times with different interconnection statuses

In [136]:
# Compare two candidate keys: `ids2` is the base key, `ids1` is the base key plus
# `interconnection_status_lbnl`.
ids2 = ['point_of_interconnection', 'capacity_mw_resource_1', 'county_1', 'region', 'resource_type_1']
ids1 = ids2 + ['interconnection_status_lbnl']
# All rows that share their key with at least one other row, under each key.
df1 = active.loc[active.duplicated(subset=ids1, keep=False)].sort_values(by=ids1)
df2 = active.loc[active.duplicated(subset=ids2, keep=False)].sort_values(by=ids2)
(len(df1), len(df2))

(506, 553)

In [138]:
# Rows flagged as duplicates under `ids2` but not under `ids1`.
# The index holds `project_id` labels, so select with `.loc` (label-based) rather
# than `.iloc` (position-based); `Index.difference` also gives a sorted,
# deterministic starting order instead of arbitrary set order.
active.loc[df2.index.difference(df1.index)].sort_values(ids2)[ids1].head(5)

Unnamed: 0_level_0,point_of_interconnection,capacity_mw_resource_1,county_1,region,resource_type_1,interconnection_status_lbnl
project_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1813,230kV at Broadview Substation,80.0,yellowstone,West (non-ISO),Solar,In Progress
3192,230kV at Broadview Substation,80.0,yellowstone,West (non-ISO),Solar,IA Executed
264,Allen 345 kV,100.0,allen,PJM,Solar,Not Started
2729,Allen 345 kV,100.0,allen,PJM,Solar,In Progress
3976,Arnold's Corner-Dahlgren 230 kV,100.0,king george,PJM,Battery,In Progress


### Second and third columns for a field
What to do about the second and third columns of a field (e.g. `county_2`, `capacity_mw_resource_2`)?

In [22]:
active['county_2'].dropna()

project_id
23          bates
270           nye
732          ford
735        castro
1026        meade
1120    roosevelt
1444     mitchell
1879     franklin
2666       jasper
2843          nye
2865         hand
2882       novata
3041      roberts
3726     mitchell
3792    wilbarger
3955         holt
4122       marion
4285    wilbarger
4421          kay
5204      hampton
5323       hughes
5448          kay
5464         ford
5507      nodaway
Name: county_2, dtype: object

In [78]:
active['capacity_mw_resource_3'].dropna()

project_id
3278    222.0
4358    375.0
4869     75.0
5113     40.0
Name: capacity_mw_resource_3, dtype: float64

- When two resource types are listed the `resource_type_lbnl` field lists them together, e.g. Solar+Battery
- When every other "key" column is the same and one row has just one resource type (Solar) while the other row has two resource types (Solar + Battery), treat them as the same project (Solar with an optional battery).
- Select the row with the most parts
- This should take care of the duplicates/disparities with `capacity_mw_resource_1` vs. `capacity_mw_resource_2` etc

In [130]:
# Compare two candidate keys: `ids2` is the base key, `ids1` is the base key plus
# `resource_type_lbnl`.
ids2 = ['point_of_interconnection', 'capacity_mw_resource_1', 'county_1', 'region', 'resource_type_1']
ids1 = ids2 + ['resource_type_lbnl']
# All rows that share their key with at least one other row, under each key.
df1 = active.loc[active.duplicated(subset=ids1, keep=False)].sort_values(by=ids1)
df2 = active.loc[active.duplicated(subset=ids2, keep=False)].sort_values(by=ids2)
(len(df1), len(df2))

(538, 553)

In [133]:
# Columns to display: the base key plus both resource-type descriptions.
ids = ['point_of_interconnection', 'capacity_mw_resource_1', 'county_1', 'region', 'resource_type_1', 'resource_type_lbnl', 'resource_type_2']
# Rows flagged as duplicates under `ids2` but not under `ids1`.
# The index holds `project_id` labels, so select with `.loc` (label-based) rather
# than `.iloc` (position-based); `Index.difference` also gives a sorted,
# deterministic starting order instead of arbitrary set order.
active.loc[df2.index.difference(df1.index)].sort_values(ids2)[ids].head(6)

Unnamed: 0_level_0,point_of_interconnection,capacity_mw_resource_1,county_1,region,resource_type_1,resource_type_lbnl,resource_type_2
project_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1447,Carson-Rogers Rd 500 kV,0.0,greensville,PJM,Solar,Solar,
317,Carson-Rogers Rd 500 kV,0.0,greensville,PJM,Solar,Solar+Battery,Battery
3194,Falcon 120 kV,100.0,eureka,West (non-ISO),Solar,Solar,
92,Falcon 120 kV,100.0,eureka,West (non-ISO),Solar,Solar+Battery,Battery
3127,Hopewell-Surry 230 kV,150.0,surry,PJM,Solar,Solar+Battery,Battery
892,PEGS 230kV Station,200.0,mckinley,West (non-ISO),Solar,Solar,


In [271]:
# Key that identifies a project, plus the derived column lists used below.
ids = ['point_of_interconnection', 'capacity_mw_resource_1', 'county_1', 'region', 'resource_type_1']
ids2 = ids + ['len_resource_type']
cols = ids2 + ['project_id']

# Copy of `active` with the length of the combined resource label and
# `project_id` moved from the index into a column.
active_test_og = (
    active
    .assign(len_resource_type=active.resource_type_lbnl.str.len())
    .reset_index()
)

# Longest resource label within each key group (NaN keys form their own groups).
active_test = active_test_og.groupby(ids, as_index=False, dropna=False)['len_resource_type'].max()

# Attach the project_id of the rows that carry that longest label.
active_test_og = active_test_og[cols]
active_test = active_test.merge(active_test_og, on=ids2)
# drop all of the repeats that got merged on
active_test = active_test.loc[~active_test.duplicated(ids, keep="first")]
active_test

Unnamed: 0,point_of_interconnection,capacity_mw_resource_1,county_1,region,resource_type_1,len_resource_type,project_id
0,Lieberman - North Benton 138 kV Line,74.0,caddo parish,SPP,Solar,5,5434
1,Oneida - Peterboro 115kV,20.0,madison,NYISO,Solar,5,2229
2,Three Forks-Dale 138 kV,150.0,madison,PJM,Solar,13,172
3,tap 138kV 44010 WAP - 42980 Nash.,101.0,brazoria,ERCOT,Battery,7,1041
4,tap 345kV 76009 Twin Buttes - 1444 Brown,110.0,concho,ERCOT,Solar,5,3480
...,...,...,...,...,...,...,...
5621,,950.0,owyhee,West (non-ISO),Wind,4,4376
5622,,1050.0,jerome,West (non-ISO),Wind,4,1390
5624,,1212.0,newberry,Southeast (non-ISO),Gas,3,3880
5625,,1265.0,anderson,Southeast (non-ISO),Gas,3,3614


In [272]:
ids = ['point_of_interconnection', 'capacity_mw_resource_1', 'county_1', 'region', 'resource_type_1']
# Baseline for comparison: keep the first row of every key group, whatever its resource label.
active.drop_duplicates(subset=ids, keep="first")

Unnamed: 0_level_0,capacity_mw_resource_1,capacity_mw_resource_2,capacity_mw_resource_3,county_1,county_2,county_3,date_proposed_raw,developer,entity,interconnection_status_lbnl,...,queue_year,region,resource_type_1,resource_type_2,resource_type_3,resource_type_lbnl,state,utility,year_proposed,date_proposed
project_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,500.0,,,yellowstone,,,12/31/2023,,NWE,In Progress,...,2020,West (non-ISO),Battery,,,Battery,MT,Colstrip,2023.0,2023-12-31
1,500.0,,,yellowstone,,,12/31/2023,,PacifiCorp,In Progress,...,2020,West (non-ISO),Battery,,,Battery,MT,Colstrip,2023.0,2023-12-31
2,500.0,,,yellowstone,,,12/31/2023,,Avista,Not Started,...,2020,West (non-ISO),Battery,,,Battery,MT,Colstrip,2023.0,2023-12-31
3,725.0,,,riverside,,,11/15/2024,,CAISO,Not Started,...,2019,CAISO,Battery,,,Battery,CA,,2024.0,2024-11-15
4,4.5,,,garfield,,,12/31/2021,,SPP,IA Executed,...,2015,SPP,Wind,,,Wind,OK,OKGE,2021.0,2021-12-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5634,10.0,,,tooele,,,12/31/2019,"Ophir Canyon Solar, LLC",PacifiCorp,IA Executed,...,2018,West (non-ISO),Solar,,,Solar,UT,PacifiCorp,2019.0,2019-12-31
5635,31.5,,,tooele,,,12/31/2019,"Ophir Canyon Solar, LLC",PacifiCorp,IA Executed,...,2018,West (non-ISO),Solar,,,Solar,UT,PacifiCorp,2019.0,2019-12-31
5636,4.3,,,jefferson,,,1/1/2021,Deschutes Valley Water District,PacifiCorp,IA Executed,...,2018,West (non-ISO),Hydro,,,Hydro,OR,PacifiCorp,2021.0,2021-01-01
5637,4.2,,,shasta,,,12/31/2018,Slate Creek Hydro Associates LP,PacifiCorp,IA Executed,...,2018,West (non-ISO),Hydro,,,Hydro,CA,PacifiCorp,2018.0,2018-12-31


### Transform function

In [315]:
def remove_duplicates(df):
    """Drop duplicate queue projects, keeping the row with the longest resource label.

    Rows are treated as the same project when they agree on a cleaned-up
    ``point_of_interconnection``, ``capacity_mw_resource_1``, ``resource_type_1``
    and either ``county_1``/``region`` (when ``county_1`` is present) or
    ``county``/``entity`` (otherwise). Within each such group the row whose
    ``resource_type_lbnl`` string is longest is kept; ties go to the first
    matching row.

    The input frame is left untouched: all helper columns are added to a copy.

    Args:
        df: DataFrame indexed by ``project_id`` containing the columns named above
            plus ``resource_type_lbnl``.

    Returns:
        A new DataFrame indexed by ``project_id`` and sorted by that index, with the
        same columns as ``df``. The key columns come first because of the merge.
    """
    # Work on a copy so the helper columns below never leak into the caller's frame.
    df = df.copy()

    # do some string cleaning on point_of_interconnection
    # for now "tbd" is mapped to "nan"
    # NOTE: these are plain substring replacements applied in order, so e.g. "at"
    # and "kv" are also removed when they occur inside a longer word.
    poi = df['point_of_interconnection'].astype(str).str.lower()
    for old, new in (
        ("substation", ""),
        ("kv", ""),
        ("-", " "),
        ("station", ""),
        (",", ""),
        ("at", ""),
        ("tbd", "nan"),
    ):
        # regex=False: every pattern is a literal; being explicit keeps behaviour
        # identical across pandas versions whose default for `regex` differs.
        poi = poi.str.replace(old, new, regex=False)

    # Sort the whitespace-separated tokens so word order does not matter
    # ("a b" and "b a" compare equal). Joining split tokens leaves no stray spaces.
    df['point_of_interconnection_clean'] = poi.str.split().map(
        lambda tokens: ' '.join(sorted(tokens))
    )

    # groupby this set of keys and keep the duplicate with the most listed resources
    # Note: "active" projects have county_1 and region, "completed" and "withdrawn" only have county and entity
    if 'county_1' in df.columns:
        key = ['point_of_interconnection_clean', 'capacity_mw_resource_1', 'county_1', 'region', 'resource_type_1']
    else:
        key = ['point_of_interconnection_clean', 'capacity_mw_resource_1', 'county', 'entity', 'resource_type_1']

    # Length of the combined resource label is the proxy for "number of resources listed".
    df['len_resource_type'] = df['resource_type_lbnl'].str.len()
    df = df.reset_index()

    # Longest label per key group; dropna=False keeps groups whose key contains NaN.
    longest = df.groupby(key, as_index=False, dropna=False)['len_resource_type'].max()
    df = longest.merge(df, on=key + ['len_resource_type'])
    # merge added duplicates with same len_resource_type, drop these
    df = df[~df.duplicated(key, keep="first")]

    # some final cleanup
    return (
        df.drop(columns=['len_resource_type', 'point_of_interconnection_clean'])
        .set_index('project_id')
        .sort_index()
    )

In [316]:
# Deduplicate each queue table.
active_no_dups = remove_duplicates(active)
completed_no_dups = remove_duplicates(completed)
withdrawn_no_dups = remove_duplicates(withdrawn)

# Report how many rows were dropped from each table, and the fraction of the table that is.
for label, before, after in (
    ("active", active, active_no_dups),
    ("completed", completed, completed_no_dups),
    ("withdrawn", withdrawn, withdrawn_no_dups),
):
    n_removed = len(before) - len(after)
    print(f"Num dups removed {label}: {n_removed}")
    print(f"Percentage: {n_removed/len(before)}")

Num dups removed active: 356
Percentage: 0.06313176095052314
Num dups removed completed: 95
Percentage: 0.05568581477139507
Num dups removed withdrawn: 304
Percentage: 0.04408352668213457
