# Clean county names and a second pass at deduplication

In [1]:
import pandas as pd
import dbcp
import copy



# Start with LBNL ISO queues

## Annoying Setup
Now include normalization and add fips - kept separate for first round of deduplication

In [2]:
# copied out of etl.py:etl_lbnlisoqueues
import logging
from pathlib import Path
from typing import Dict

import pandas as pd
import pandas_gbq
import pydata_google_auth
import sqlalchemy as sa

import dbcp
from dbcp.constants import WORKING_PARTITIONS
from dbcp.schemas import TABLE_SCHEMAS
from dbcp.workspace.datastore import DBCPDatastore
from pudl.output.pudltabl import PudlTabl

logger = logging.getLogger(__name__)

#Extract
ds = DBCPDatastore(sandbox=True, local_cache_path="/app/data/data_cache")
lbnl_raw_dfs = dbcp.extract.lbnlisoqueues.Extractor(ds).extract(
    update_date=WORKING_PARTITIONS["lbnlisoqueues"]["update_date"])

In [3]:
# copied out of dbcp.transform.lbnlisoqueues.py:transform
# added "dbcp.transform.lbnlisoqueues." prefix as needed to internal funcs
import logging
from typing import Any, Dict, List

import pandas as pd

from dbcp.schemas import TABLE_SCHEMAS
from dbcp.transform.helpers import normalize_multicolumns_to_rows, parse_dates
from pudl.helpers import add_fips_ids as _add_fips_ids

lbnl_transformed_dfs = {name: df.copy()
                        for name, df in lbnl_raw_dfs.items()}
dbcp.transform.lbnlisoqueues._set_global_project_ids(lbnl_transformed_dfs)

lbnl_transform_functions = {
    "active_iso_queue_projects": dbcp.transform.lbnlisoqueues.active_iso_queue_projects,
    "completed_iso_queue_projects": dbcp.transform.lbnlisoqueues.completed_iso_queue_projects,
    "withdrawn_iso_queue_projects": dbcp.transform.lbnlisoqueues.withdrawn_iso_queue_projects,
}

for table_name, transform_func in lbnl_transform_functions.items():
    logger.info(f"LBNL ISO Queues: Transforming {table_name} table.")
    lbnl_transformed_dfs[table_name] = transform_func(
        lbnl_transformed_dfs[table_name])

# pick up after normalization
lbnl_normalized_dfs = dbcp.transform.lbnlisoqueues.normalize_lbnl_dfs(lbnl_transformed_dfs)

In [4]:
lbnl_normalized_dfs.keys()

dict_keys(['iso_projects', 'iso_locations', 'iso_resource_capacity'])

In [6]:
lbnl_normalized_dfs['iso_projects'].head(3)

Unnamed: 0_level_0,region,date_proposed_raw,developer,entity,interconnection_status_lbnl,interconnection_status_raw,point_of_interconnection,project_name,queue_date,queue_id,...,year_proposed,date_proposed,date_operational,days_in_queue,queue_date_raw,year_operational,date_withdrawn_raw,withdrawl_reason,year_withdrawn,date_withdrawn
project_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,West (non-ISO),12/31/2023,,NWE,In Progress,Active,500kV at Broadview Substation,,2020-08-13,390,...,2023.0,2023-12-31,NaT,,,,,,,NaT
1,West (non-ISO),12/31/2023,,PacifiCorp,In Progress,In Progress,Broadview substation,,2020-07-17,C011,...,2023.0,2023-12-31,NaT,,,,,,,NaT
3,CAISO,11/15/2024,,CAISO,Not Started,,Valley Substation 500kV,MENIFEE POWER BANK,2019-04-15,1645,...,2024.0,2024-11-15,NaT,,,,,,,NaT


In [8]:
lbnl_normalized_dfs['iso_projects'].columns

Index(['region', 'date_proposed_raw', 'developer', 'entity',
       'interconnection_status_lbnl', 'interconnection_status_raw',
       'point_of_interconnection', 'project_name', 'queue_date', 'queue_id',
       'queue_status', 'queue_year', 'resource_type_lbnl', 'utility',
       'year_proposed', 'date_proposed', 'date_operational', 'days_in_queue',
       'queue_date_raw', 'year_operational', 'date_withdrawn_raw',
       'withdrawl_reason', 'year_withdrawn', 'date_withdrawn'],
      dtype='object')

In [7]:
lbnl_normalized_dfs['iso_locations'].head(3)

Unnamed: 0,project_id,county,state
0,0,yellowstone,MT
1,1,yellowstone,MT
2,3,riverside,CA


In [10]:
lbnl_normalized_dfs['iso_resource_capacity'].head(3)

Unnamed: 0,project_id,resource,capacity_mw
0,0,Battery,500.0
1,1,Battery,500.0
2,3,Battery,725.0


Some project id's got dropped?
Check normalization to see why some projects get dropped

In [23]:
import numpy as np
pi = lbnl_normalized_dfs['iso_resource_capacity']['project_id'].sort_values().to_list()
full = np.arange(0, 14241)
diff = set(full) - set(pi)

## Use add county fips with backup geocoding

In [4]:
locs = lbnl_normalized_dfs['iso_locations']

In [44]:
locs[locs.county.isnull()]

Unnamed: 0,project_id,county,state
5947,6451,,DC
6475,7026,,DC
7923,8555,,CA
8091,8728,,MA
8189,8830,,ME
8191,8832,,ME
8201,8842,,NH
8207,8848,,ME
8210,8851,,RI
8219,8860,,RI


In [5]:
lbnl_normalized_dfs['iso_locations'] = dbcp.transform.helpers.add_county_fips_with_backup_geocoding(
        lbnl_normalized_dfs['iso_locations'])

In [6]:
lbnl_normalized_dfs['iso_locations']

Unnamed: 0,project_id,county,state,state_id_fips,county_id_fips,locality_name,locality_type,containing_county
0,0,yellowstone,MT,30,30111,yellowstone,county,yellowstone
1,1,yellowstone,MT,30,30111,yellowstone,county,yellowstone
2,3,riverside,CA,06,06065,riverside,county,riverside
3,4,garfield,OK,40,40047,garfield,county,garfield
4,5,woodward,OK,40,40153,woodward,county,woodward
...,...,...,...,...,...,...,...,...
12865,13797,City of Portsmouth,VA,51,,Portsmouth,city,City of Portsmouth
12892,13830,City of Portsmouth,VA,51,,Portsmouth,city,City of Portsmouth
12907,13848,City of Hampton,VA,51,,Hampton,city,City of Hampton
12963,13914,City of Newport News,VA,51,,Newport News,city,City of Newport News


In [7]:
locs = lbnl_normalized_dfs['iso_locations']

There are still 125 unmatched county fips (previously there were 436)
For now, these are dropped

In [20]:
len(locs[locs.county_id_fips.isnull()])

125

In [21]:
locs

Unnamed: 0,project_id,county,state,state_id_fips,county_id_fips,locality_name,locality_type,containing_county
0,0,yellowstone,MT,30,30111,yellowstone,county,yellowstone
1,1,yellowstone,MT,30,30111,yellowstone,county,yellowstone
2,3,riverside,CA,06,06065,riverside,county,riverside
3,4,garfield,OK,40,40047,garfield,county,garfield
4,5,woodward,OK,40,40153,woodward,county,woodward
...,...,...,...,...,...,...,...,...
12865,13797,City of Portsmouth,VA,51,,Portsmouth,city,City of Portsmouth
12892,13830,City of Portsmouth,VA,51,,Portsmouth,city,City of Portsmouth
12907,13848,City of Hampton,VA,51,,Hampton,city,City of Hampton
12963,13914,City of Newport News,VA,51,,Newport News,city,City of Newport News


In [42]:
# should have columns project_id, county, state, state_id_fips, county_id_fips
locs_clean = locs[locs.county_id_fips.notnull()]
locs_clean = locs_clean.drop(['locality_name', 'locality_type', 'county'], axis=1)
locs_clean = locs_clean.rename(columns={'containing_county': 'county'})
locs_clean['county'] = locs_clean['county'].str.lower()
locs_clean = locs_clean[['project_id', 'county', 'state', 'state_id_fips', 'county_id_fips']]

In [43]:
locs_clean

Unnamed: 0,project_id,county,state,state_id_fips,county_id_fips
0,0,yellowstone,MT,30,30111
1,1,yellowstone,MT,30,30111
2,3,riverside,CA,06,06065
3,4,garfield,OK,40,40047
4,5,woodward,OK,40,40153
...,...,...,...,...,...
12693,13602,northampton county,NC,37,37131
12717,13632,caroline county,VA,51,51033
12724,13641,jefferson county,KY,21,21111
12738,13656,caroline county,VA,51,51033


In [45]:
lbnl_normalized_dfs['iso_locations'] = locs_clean
denorm = dbcp.transform.lbnlisoqueues.denormalize(lbnl_normalized_dfs)

In [46]:
denorm

Unnamed: 0,project_id,county,state,state_id_fips,county_id_fips,region,date_proposed_raw,developer,entity,interconnection_status_lbnl,...,date_operational,days_in_queue,queue_date_raw,year_operational,date_withdrawn_raw,withdrawl_reason,year_withdrawn,date_withdrawn,resource,capacity_mw
0,0,yellowstone,MT,30,30111,West (non-ISO),12/31/2023,,NWE,In Progress,...,NaT,,,,,,,NaT,Battery,500.00
1,1,yellowstone,MT,30,30111,West (non-ISO),12/31/2023,,PacifiCorp,In Progress,...,NaT,,,,,,,NaT,Battery,500.00
2,3,riverside,CA,06,06065,CAISO,11/15/2024,,CAISO,Not Started,...,NaT,,,,,,,NaT,Battery,725.00
3,4,garfield,OK,40,40047,SPP,12/31/2021,,SPP,IA Executed,...,NaT,,,,,,,NaT,Wind,4.50
4,5,woodward,OK,40,40153,SPP,,,SPP,In Progress,...,NaT,,,,,,,NaT,Battery,80.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14501,14172,,,,,,12/31/2021,,PJM,,...,NaT,35.0,11/17/2020,,22271.0,,2020.0,2030-12-23,,
14502,14181,,,,,,7/1/2021,,PJM,,...,NaT,31.0,3/27/2020,,22032.0,,2020.0,2030-04-28,Solar; Storage,3.72
14503,14211,,,,,,12/31/2023,,PJM,,...,NaT,10.0,3/31/2020,,22015.0,,2020.0,2030-04-11,Solar,20.00
14504,14216,,,,,,9/30/2022,,PJM,,...,NaT,48.0,10/20/2020,,22256.0,,2020.0,2030-12-08,Solar,80.00


In [23]:
old_locs = lbnl_normalized_dfs['iso_locations']

In [24]:
old_locs

Unnamed: 0,project_id,county,state
0,0,yellowstone,MT
1,1,yellowstone,MT
2,3,riverside,CA
3,4,garfield,OK
4,5,woodward,OK
...,...,...,...
13254,14233,Crawford,PA
13255,14235,Luzerne,PA
13256,14236,Adams,PA
13257,14237,Warren,NJ


In [25]:
lbnl_normalized_dfs['iso_locations'] = dbcp.transform.lbnlisoqueues.add_fips_codes(old_locs)

In [28]:
old_locs_denorm = dbcp.transform.lbnlisoqueues.denormalize(lbnl_normalized_dfs)

In [32]:
old_locs_denorm

Unnamed: 0,project_id,county,state,state_id_fips,county_id_fips,region,date_proposed_raw,developer,entity,interconnection_status_lbnl,...,date_operational,days_in_queue,queue_date_raw,year_operational,date_withdrawn_raw,withdrawl_reason,year_withdrawn,date_withdrawn,resource,capacity_mw
0,0,yellowstone,MT,30,30111,West (non-ISO),12/31/2023,,NWE,In Progress,...,NaT,,,,,,,NaT,Battery,500.00
1,1,yellowstone,MT,30,30111,West (non-ISO),12/31/2023,,PacifiCorp,In Progress,...,NaT,,,,,,,NaT,Battery,500.00
2,3,riverside,CA,06,06065,CAISO,11/15/2024,,CAISO,Not Started,...,NaT,,,,,,,NaT,Battery,725.00
3,4,garfield,OK,40,40047,SPP,12/31/2021,,SPP,IA Executed,...,NaT,,,,,,,NaT,Wind,4.50
4,5,woodward,OK,40,40153,SPP,,,SPP,In Progress,...,NaT,,,,,,,NaT,Battery,80.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14501,14172,,,,,,12/31/2021,,PJM,,...,NaT,35.0,11/17/2020,,22271.0,,2020.0,2030-12-23,,
14502,14181,,,,,,7/1/2021,,PJM,,...,NaT,31.0,3/27/2020,,22032.0,,2020.0,2030-04-28,Solar; Storage,3.72
14503,14211,,,,,,12/31/2023,,PJM,,...,NaT,10.0,3/31/2020,,22015.0,,2020.0,2030-04-11,Solar,20.00
14504,14216,,,,,,9/30/2022,,PJM,,...,NaT,48.0,10/20/2020,,22256.0,,2020.0,2030-12-08,Solar,80.00


In [47]:
denorm

Unnamed: 0,project_id,county,state,state_id_fips,county_id_fips,region,date_proposed_raw,developer,entity,interconnection_status_lbnl,...,date_operational,days_in_queue,queue_date_raw,year_operational,date_withdrawn_raw,withdrawl_reason,year_withdrawn,date_withdrawn,resource,capacity_mw
0,0,yellowstone,MT,30,30111,West (non-ISO),12/31/2023,,NWE,In Progress,...,NaT,,,,,,,NaT,Battery,500.00
1,1,yellowstone,MT,30,30111,West (non-ISO),12/31/2023,,PacifiCorp,In Progress,...,NaT,,,,,,,NaT,Battery,500.00
2,3,riverside,CA,06,06065,CAISO,11/15/2024,,CAISO,Not Started,...,NaT,,,,,,,NaT,Battery,725.00
3,4,garfield,OK,40,40047,SPP,12/31/2021,,SPP,IA Executed,...,NaT,,,,,,,NaT,Wind,4.50
4,5,woodward,OK,40,40153,SPP,,,SPP,In Progress,...,NaT,,,,,,,NaT,Battery,80.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14501,14172,,,,,,12/31/2021,,PJM,,...,NaT,35.0,11/17/2020,,22271.0,,2020.0,2030-12-23,,
14502,14181,,,,,,7/1/2021,,PJM,,...,NaT,31.0,3/27/2020,,22032.0,,2020.0,2030-04-28,Solar; Storage,3.72
14503,14211,,,,,,12/31/2023,,PJM,,...,NaT,10.0,3/31/2020,,22015.0,,2020.0,2030-04-11,Solar,20.00
14504,14216,,,,,,9/30/2022,,PJM,,...,NaT,48.0,10/20/2020,,22256.0,,2020.0,2030-12-08,Solar,80.00


In [48]:
old_locs_denorm.columns == denorm.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True])

In [33]:
lbnl_normalized_dfs['iso_locations']

Unnamed: 0,project_id,county,state,state_id_fips,county_id_fips
0,0,yellowstone,MT,30,30111
1,1,yellowstone,MT,30,30111
2,3,riverside,CA,06,06065
3,4,garfield,OK,40,40047
4,5,woodward,OK,40,40153
...,...,...,...,...,...
13254,14233,Crawford,PA,42,42039
13255,14235,Luzerne,PA,42,42079
13256,14236,Adams,PA,42,42001
13257,14237,Warren,NJ,34,34041


## Sanity check deduplication in top counties (pre normalization)

In [11]:
import copy

In [7]:
# copied out of etl.py:etl_lbnlisoqueues
import logging
from pathlib import Path
from typing import Dict

import pandas as pd
import pandas_gbq
import pydata_google_auth
import sqlalchemy as sa

import dbcp
from dbcp.constants import WORKING_PARTITIONS
from dbcp.schemas import TABLE_SCHEMAS
from dbcp.workspace.datastore import DBCPDatastore
from pudl.output.pudltabl import PudlTabl

logger = logging.getLogger(__name__)

#Extract
ds = DBCPDatastore(sandbox=True, local_cache_path="/app/data/data_cache")
lbnl_raw_dfs = dbcp.extract.lbnlisoqueues.Extractor(ds).extract(
    update_date=WORKING_PARTITIONS["lbnlisoqueues"]["update_date"])

In [8]:
# copied out of dbcp.transform.lbnlisoqueues.py:transform
# added "dbcp.transform.lbnlisoqueues." prefix as needed to internal funcs
import logging
from typing import Any, Dict, List

import pandas as pd

from dbcp.schemas import TABLE_SCHEMAS
from dbcp.transform.helpers import normalize_multicolumns_to_rows, parse_dates
from pudl.helpers import add_fips_ids as _add_fips_ids

lbnl_transformed_dfs = {name: df.copy()
                        for name, df in lbnl_raw_dfs.items()}
dbcp.transform.lbnlisoqueues._set_global_project_ids(lbnl_transformed_dfs)

lbnl_transform_functions = {
    "active_iso_queue_projects": dbcp.transform.lbnlisoqueues.active_iso_queue_projects,
    "completed_iso_queue_projects": dbcp.transform.lbnlisoqueues.completed_iso_queue_projects,
    "withdrawn_iso_queue_projects": dbcp.transform.lbnlisoqueues.withdrawn_iso_queue_projects,
}

for table_name, transform_func in lbnl_transform_functions.items():
    logger.info(f"LBNL ISO Queues: Transforming {table_name} table.")
    lbnl_transformed_dfs[table_name] = transform_func(
        lbnl_transformed_dfs[table_name])

In [13]:
active = copy.deepcopy(lbnl_transformed_dfs['active_iso_queue_projects'])
withdrawn = copy.deepcopy(lbnl_transformed_dfs['withdrawn_iso_queue_projects'])
completed = copy.deepcopy(lbnl_transformed_dfs['completed_iso_queue_projects'])

These top counties definitely have too much capacity...  
But it's not immediately apparent which rows to drop. Some of these are like battery 1 and battery 2, so maybe only one is actually going to be built. Maybe some random dropping could be good lol in these counties that are way too high.

In [17]:
active.groupby('county_1').sum()['capacity_mw_resource_1'].sort_values(ascending=False).head(50)

county_1
suffolk            14374.00
kern               13952.30
maricopa           13264.00
clark              13012.82
riverside          10282.00
nassau             10148.00
coconino            7560.00
san luis obispo     6789.00
kings               6355.00
los angeles         5743.00
san juan            5739.00
lee                 5234.00
wharton             5221.00
person              4752.00
madison             4728.80
sussex              4646.30
washington          4371.20
navajo              4318.00
klamath             4276.41
haskell             4012.00
lake                3937.00
fresno              3918.00
new york            3904.00
pinal               3849.00
la paz              3825.00
grant               3824.00
clinton             3777.20
san diego           3767.00
san bernardino      3717.00
queens              3702.50
logan               3681.00
yuma                3560.00
anderson            3552.00
brazoria            3376.00
jackson             3367.19
montgomery 

In [19]:
active[active['county_1'] == 'suffolk']

Unnamed: 0_level_0,capacity_mw_resource_1,county_1,state,region,resource_type_1,capacity_mw_resource_2,capacity_mw_resource_3,county_2,county_3,date_proposed_raw,...,queue_date,queue_id,queue_status,queue_year,resource_type_2,resource_type_3,resource_type_lbnl,utility,year_proposed,date_proposed
project_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
76,77.0,suffolk,NY,NYISO,Battery,,,,,05/2024,...,2020-05-04,1012,active,2020,,,Battery,LIPA,2024.0,2024-05-01
160,55.0,suffolk,NY,NYISO,Battery,,,,,05/2023,...,2019-12-21,966,active,2019,,,Battery,LIPA,2023.0,2023-05-01
291,24.0,suffolk,NY,NYISO,Battery,,,,,06/2021,...,2018-08-14,751,active,2018,,,Battery,LIPA,2021.0,2021-06-01
1022,100.0,suffolk,NY,NYISO,Battery,,,,,10/2021,...,2019-10-17,925,active,2019,,,Battery,LIPA,2021.0,2021-10-01
1058,150.0,suffolk,NY,NYISO,Battery,,,,,12/2023,...,2020-07-14,1049,active,2020,,,Battery,LIPA,2023.0,2023-12-01
1080,200.0,suffolk,NY,NYISO,Battery,,,,,12/2023,...,2020-07-06,1046,active,2020,,,Battery,LIPA,2023.0,2023-12-01
1109,1300.0,suffolk,NY,NYISO,Offshore Wind,,,,,11/2027,...,2020-05-21,1020,active,2020,,,Offshore Wind,LIPA,2027.0,2027-11-01
1131,1403.0,suffolk,NY,NYISO,Offshore Wind,,,,,12/2026,...,2020-04-30,1011,active,2020,,,Offshore Wind,LIPA,2026.0,2026-12-01
1186,30.0,suffolk,NY,NYISO,Battery,,,,,12/2022,...,2019-11-08,941,active,2019,,,Battery,LIPA,2022.0,2022-12-01
1334,96.0,suffolk,NY,NYISO,Wind,,,,,11/2022,...,2017-02-14,612,active,2017,,,Wind,LIPA,2022.0,2022-11-01


In [31]:
active[(active.queue_id == '1012') & (active.county_1 == 'suffolk')].values

array([[77.0, 'suffolk', 'NY', 'NYISO', 'Battery', nan, nan, nan, nan,
        '05/2024', 'Suffolk County Energy Storage II', 'NYISO',
        'In Progress', 'FES, SIS', 'Southold 69 kV Substation',
        'Suffolk County Storage II', Timestamp('2020-05-04 00:00:00'),
        '1012', 'active', 2020, nan, nan, 'Battery', 'LIPA', 2024.0,
        Timestamp('2024-05-01 00:00:00')]], dtype=object)

In [32]:
active[(active.queue_id == '0966') & (active.county_1 == 'suffolk')].values

array([[55.0, 'suffolk', 'NY', 'NYISO', 'Battery', nan, nan, nan, nan,
        '05/2023', 'Suffolk County Energy Storage, LLC', 'NYISO',
        'IA Executed', 'FES, SIS, FS', 'West Babylon - Lindenhurst 69kV',
        'Suffolk County Storage', Timestamp('2019-12-21 00:00:00'),
        '0966', 'active', 2019, nan, nan, 'Battery', 'LIPA', 2023.0,
        Timestamp('2023-05-01 00:00:00')]], dtype=object)

## Another pass at deduplication

Ideas:  
- Sanity check top counties
- Feature vector and cosine similarity to find duplicates
- String similarity score

In [2]:
# copied out of etl.py:etl_lbnlisoqueues
import logging
from pathlib import Path
from typing import Dict

import pandas as pd
import pandas_gbq
import pydata_google_auth
import sqlalchemy as sa

import dbcp
from dbcp.constants import WORKING_PARTITIONS
from dbcp.schemas import TABLE_SCHEMAS
from dbcp.workspace.datastore import DBCPDatastore
from pudl.output.pudltabl import PudlTabl

logger = logging.getLogger(__name__)

#Extract
ds = DBCPDatastore(sandbox=True, local_cache_path="/app/data/data_cache")
lbnl_raw_dfs = dbcp.extract.lbnlisoqueues.Extractor(ds).extract(
    update_date=WORKING_PARTITIONS["lbnlisoqueues"]["update_date"])

In [3]:
# copied out of dbcp.transform.lbnlisoqueues.py:transform
# added "dbcp.transform.lbnlisoqueues." prefix as needed to internal funcs
import logging
from typing import Any, Dict, List

import pandas as pd

from dbcp.schemas import TABLE_SCHEMAS
from dbcp.transform.helpers import normalize_multicolumns_to_rows, parse_dates
from pudl.helpers import add_fips_ids as _add_fips_ids

lbnl_transformed_dfs = {name: df.copy()
                        for name, df in lbnl_raw_dfs.items()}
dbcp.transform.lbnlisoqueues._set_global_project_ids(lbnl_transformed_dfs)

lbnl_transform_functions = {
    "active_iso_queue_projects": dbcp.transform.lbnlisoqueues.active_iso_queue_projects,
    "completed_iso_queue_projects": dbcp.transform.lbnlisoqueues.completed_iso_queue_projects,
    "withdrawn_iso_queue_projects": dbcp.transform.lbnlisoqueues.withdrawn_iso_queue_projects,
}

for table_name, transform_func in lbnl_transform_functions.items():
    logger.info(f"LBNL ISO Queues: Transforming {table_name} table.")
    lbnl_transformed_dfs[table_name] = transform_func(
        lbnl_transformed_dfs[table_name])

# pick up after normalization
lbnl_normalized_dfs = dbcp.transform.lbnlisoqueues.normalize_lbnl_dfs(lbnl_transformed_dfs)
# data enrichment
lbnl_normalized_dfs['iso_locations'] = dbcp.transform.lbnlisoqueues.add_county_fips_with_backup_geocoding(
    lbnl_normalized_dfs['iso_locations'])
lbnl_normalized_dfs['iso_locations'] = dbcp.transform.lbnlisoqueues.clean_county_names(lbnl_normalized_dfs['iso_locations'])
iso_for_tableau = dbcp.transform.lbnlisoqueues.denormalize(lbnl_normalized_dfs)

In [5]:
iso_for_tableau.groupby('county').sum()['capacity_mw'].sort_values(ascending=False)

county
kern                    68360.080253
riverside               43364.327000
san bernardino          38314.739993
clark                   31265.010012
middlesex               29556.427997
                            ...     
lemhi                       0.750000
wallowa                     0.360000
mcculloch                   0.000000
arlington                   0.000000
saint charles county      -15.000000
Name: capacity_mw, Length: 1596, dtype: float64

In [6]:
iso_for_tableau[iso_for_tableau.county == 'saint charles county']

Unnamed: 0,project_id,county,state,state_id_fips,county_id_fips,region,date_proposed_raw,developer,entity,interconnection_status_lbnl,...,date_operational,days_in_queue,queue_date_raw,year_operational,date_withdrawn_raw,withdrawl_reason,year_withdrawn,date_withdrawn,resource,capacity_mw
6311,6112,saint charles county,MO,29,29183,,,,MISO,IA Executed,...,NaT,,11/29/2007,,,,,NaT,Coal,-15.0


### Validate LBNL ISO

In [11]:
df = pd.read_csv('../data/output/iso_for_tableau.csv')

In [12]:
df.columns

Index(['project_id', 'county', 'state', 'state_id_fips', 'county_id_fips',
       'region', 'date_proposed_raw', 'developer', 'entity',
       'interconnection_status_lbnl', 'interconnection_status_raw',
       'point_of_interconnection', 'project_name', 'queue_date', 'queue_id',
       'queue_status', 'queue_year', 'resource_type_lbnl', 'utility',
       'year_proposed', 'date_proposed', 'date_operational', 'days_in_queue',
       'queue_date_raw', 'year_operational', 'date_withdrawn_raw',
       'withdrawl_reason', 'year_withdrawn', 'date_withdrawn', 'resource',
       'capacity_mw', 'co2e_tpy'],
      dtype='object')

In [14]:
df.head(30)

Unnamed: 0,project_id,county,state,state_id_fips,county_id_fips,region,date_proposed_raw,developer,entity,interconnection_status_lbnl,...,days_in_queue,queue_date_raw,year_operational,date_withdrawn_raw,withdrawl_reason,year_withdrawn,date_withdrawn,resource,capacity_mw,co2e_tpy
0,0,yellowstone,MT,30.0,30111.0,West (non-ISO),12/31/2023,,NWE,In Progress,...,,,,,,,,Battery,500.0,
1,1,yellowstone,MT,30.0,30111.0,West (non-ISO),12/31/2023,,PacifiCorp,In Progress,...,,,,,,,,Battery,500.0,
2,3,riverside,CA,6.0,6065.0,CAISO,11/15/2024,,CAISO,Not Started,...,,,,,,,,Battery,725.0,
3,4,garfield,OK,40.0,40047.0,SPP,12/31/2021,,SPP,IA Executed,...,,,,,,,,Wind,4.5,
4,5,woodward,OK,40.0,40153.0,SPP,,,SPP,In Progress,...,,,,,,,,Battery,80.0,
5,6,sherman,OR,41.0,41055.0,West (non-ISO),12/31/2021,,BPA,In Progress,...,,,,,,,,Solar,800.0,
6,8,benton,WA,53.0,53005.0,West (non-ISO),12/1/2019,,BPA,In Progress,...,,,,,,,,Wind,250.0,
7,9,coles,IL,17.0,17029.0,MISO,7/1/2022,,MISO,In Progress,...,,,,,,,,Solar,99.0,
8,10,jerome,ID,16.0,16053.0,West (non-ISO),01dec2022,,IP,In Progress,...,,,,,,,,Battery,,
9,10,jerome,ID,16.0,16053.0,West (non-ISO),01dec2022,,IP,In Progress,...,,,,,,,,Solar,400.0,


### Switch to geocoding FIPS on other datasets