In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
from pathlib import Path

In [4]:
path = Path('/app/data/raw/2023.05.24 OGW database.xlsx')
assert path.exists()

In [5]:
# dbcp.extract.eip_infrastructure.extract(path)
# vendor the extract function so this notebook can be easily rerun in the future without maintenance
def _convert_object_to_string_dtypes(df: pd.DataFrame) -> None:
    strings = df.select_dtypes("object")
    df.loc[:, list(strings.columns)] = strings.astype(pd.StringDtype())


def _downcast_ints(df: pd.DataFrame) -> None:
    ints = df.select_dtypes(np.int64)
    for col in ints.columns:
        ser = df.loc[:, col]
        assert (
            ser.ge(0).fillna(True).all()
        )  # didn't implement this for negative numbers
        assert np.all((ser.values >> 32) == 0)  # check for high bits
        df.loc[:, col] = ser.astype(pd.Int32Dtype())


def extract(path: Path) -> dict[str, pd.DataFrame]:
    """Load the EIP excel database sheets used downstream.

    Args:
        path (Path): location of the excel workbook.

    Returns:
        Dict[str, pd.DataFrame]: one dataframe per sheet read, keyed by table
        name (``eip_facilities``, ``eip_projects``, ``eip_air_constr_permits``).
    """
    # Excel sheet name -> output table name. Only these sheets are read.
    # "Air Construction" is kept because permit status is key to identifying
    # actionable projects.
    # Sheets named in the workbook but not read here: Company, Pipelines, NGA,
    # NAICS, CWA-NPDES, CWA Wetland, Air Operating, Glossary (useful for a data
    # dictionary), Data Sources, Map Layers, Other Permits, Test Collection,
    # Featured Facility Descriptors, MARAD, TEST, Pipeline Digitization.
    table_name_by_sheet = {
        "Facility": "eip_facilities",
        "Project": "eip_projects",
        "Air Construction": "eip_air_constr_permits",
    }
    sheets = pd.read_excel(path, sheet_name=list(table_name_by_sheet))

    tables = {}
    for sheet_name, table in sheets.items():
        # Both helpers modify the dataframe in place.
        _convert_object_to_string_dtypes(table)
        _downcast_ints(table)
        tables[table_name_by_sheet[sheet_name]] = table
    return tables

eip = extract(path)

In [6]:
eip.keys()

dict_keys(['eip_facilities', 'eip_projects', 'eip_air_constr_permits'])

In [7]:
{k: df.shape for k, df in eip.items()}

{'eip_facilities': (766, 50),
 'eip_projects': (938, 40),
 'eip_air_constr_permits': (945, 23)}

In [8]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [10]:
fac = eip['eip_facilities']
# cos = eip['eip_companies']
# air = eip['eip_air_constr_permits']

Outline of work
Two parts: data cleaning and data normalization/structuring
# Structuring and Normalizing

## Entity Relationships
### Entities
* facilities
* projects
* permits (air construction permits. there are many other permit types that I didn't integrate)

Cut pipelines and companies for now.

### Relationships
many : many
* facilities : projects
* projects : permits

one : many
* none

one : one
* none

no direct relationship
* facilities : permits (air construction permits are mediated through projects. Other permits not considered here do have direct relationships)

# Cleaning
Need to clean facilities, projects, and permits via the usual checklist. But I can ignore many unnecessary columns and prefix them 'raw_' to discourage use.
## Facilities Cleaning
- [x] Accuracy
- [x] Atomicity
- [ ] Consistency
- [x] Completeness
- [x] Uniformity
- [x] Validity
    - [x] Range Validation
    - [x] Uniqueness Validation
    - [x] Set Membership Validation
    - [x] Type Validation
    - [x] Cross-Field Validation

### Accuracy
I'm mostly using this table for location information, so I'll focus on the "street address" and "coordinates" columns. I don't have "golden data" to compare against, but I can at least spot check some items by googling them. \[Update: 3/3 spot checks of location are good. Obviously this is far from comprehensive but gives a small measure of confidence.]

In [11]:
fac.sample(3, random_state=42)

Unnamed: 0,id,name,created_on,modified_on,CCS/CCUS,CCS (ID),CCS,Company (ID),Company,Project (ID),Project,State,Facility Alias,Facility Description,Latest Updates,State Facility ID Number(s),Primary NAICS Code,Primary SIC Code,Street Address,City,ZIP Code,County or Parish,Associated Facilities (ID),Associated Facilities,Pipelines (ID),Pipelines,Air Operating (ID),Air Operating,CWA-NPDES (ID),CWA-NPDES,CWA Wetland (ID),CWA Wetland,Other Permits (ID),Other Permits,Congressional Representatives,Link to EJSCREEN Report,Estimated Population within 3 miles,Percent People of Color within 3 miles,Percent Low-Income within 3 miles,Percent under 5 Years Old within 3 miles,Percent People over 64 Years Old within 3 miles,Air Toxics Cancer Risk (NATA Cancer Risk),Respiratory Hazard Index,PM2.5 (ug/m3),O3 (ppb),Wastewater Discharge Indicator,Location,Facility Footprint,EPA FRS ID,Facility ID
428,3811,Hall Summit Compressor Station,2021-09-03T15:12:17.872272,2022-12-02T06:46:26.961136,,,,2557,"Gulf South Pipeline Company, LP[2557]",3948,Index 99 Expansion Project - Hall Summit Compr...,LA,Hall Summit Compressor Station-Index 99 Expans...,The Hall Summit Compressor Station provides co...,,23638.0,,4922.0,407 Hwy 371,Ringgold,71068.0,Bienville,,,3305,Index 99 Expansion Project[3305],,,,,,,,,,https://ejscreen.epa.gov/mapper/EJSCREEN_repor...,332.0,32.0,60.0,10.0,15.0,40.0,0.5,9.88,38.2,0.00017,POINT(-93.282466 32.248802),,,
659,5806,Willow Processing Facility,2022-11-10T15:21:43.285182,2023-02-02T19:54:50.596961,,,,2489,ConocoPhillips Company[2489],5807,Willow Processing Facility - Initial Construct...,AK,,The Willow Processing Facility is part of the ...,,,,,,,,North Slope Borough,5804.0,Willow Project[5804],5796,Willow Oil Pipeline[5796],,,,,,,,,,https://ejscreen.epa.gov/mapper/EJSCREEN_repor...,0.0,,,,,,,,,,POINT(-151.989387 70.143891),,,
350,1100,Tioga Compressor Station,2021-05-20T19:13:45.411472,2023-02-07T03:41:27.913254,,,,2711,"WBI Energy Transmission, Inc.[2711]",3126,Tioga Compressor Station - Expansion[3126],ND,Tioga Compressor Station-North Bakken Expansio...,The Tioga Compressor Station is an existing co...,,3692.0,486210.0,4922.0,,Tioga,58852.0,Williams,854.0,Elkhorn Creek Compressor Station[854],3339,North Bakken Expansion Project[3339],,,,,,,,,"Kelly Armstrong, Republican",https://ejscreen.epa.gov/mapper/EJSCREEN_repor...,1097.0,4.0,22.0,8.0,20.0,20.0,0.2,5.2,41.1,5.6e-07,POINT(-102.906744 48.403529),,110070595044.0,10381.0


Googling "Hall Summit Compressor Station" turns up the facility. [Street address](https://goo.gl/maps/bF4YDNvE2gxqeGLv6) and coordinates match. Company also matches.

Willow Processing Facility is correctly associated with ConocoPhillips and located in North Slope Borough, AK. There is no street address.

Tioga Compressor Station is correctly located in Williams County, ND. Owner also matches.

### Atomicity
By inspection I see that all the ID and associated name fields can contain multiple values: company, project, pipelines, and permits. The location fields are mercifully single valued

In [12]:
# street address does not look multi-valued but has other problems. Thankfully lat lon is still available
# a little more digging suggests bad addresses are because these have not yet been built.
# Can't check for sure until I can join project status on to facilities
pd.options.display.max_colwidth = 0
fac.loc[fac['Street Address'].str.len().nlargest(10).index, ['id', 'name', 'Street Address', 'Location']]

Unnamed: 0,id,name,Street Address,Location
11,750,Annova LNG Brownsville,USFWS Access Road (left from intersection of Boca Chica Blvd and Kingston Ave),POINT(-97.2675 26.00556)
464,4055,Wildhorse Terminal,N. Little Ave (3.5 mi S of the intersection of N. Little Ave and Hwy 33),POINT(-96.76277 35.93083)
354,1105,Turkey Creek Compressor Station,W on Onyx Rd (towards the intersection of Johnsons Landing Rd),POINT(-92.424444 30.939722)
283,1031,El Paso Natural Gas - Red Mountain Compressor Station,1.4 miles on Co Rd D0006 from the intersection with NM-418,POINT(-107.998849 32.257081)
507,4480,Lone Star Alkylate Production Facility,Approx. 1.8 miles SW from FM 1942 and Hatcherville Rd,POINT(-94.923882 29.84787)
88,829,Corpus Christi Polymer & Desalination Plant,7001 Joe Fulton International Trade Corridor STE 200,POINT(-97.49595 27.834238)
699,6019,New Generation Gillis Treating Facility,SE of the int. of Texas Eastern Rd and Al Cormier Rd,POINT(-93.140833 30.448611)
372,1124,Willcox and Dragoon Compressor Stations,Arzberger Rd (6 miles E of Kansas Settlement Rd),POINT(-109.662345 32.109089)
98,839,Delta LNG Terminal,LA Hwy 23 (22 mi S from West Pointe à la Hache),POINT(-89.873677 29.596179)
488,4316,Dos Picos Gas Plant,CR 1090 (approx. 20 mi SE from I20 and Hwy 158),POINT(-101.86781 31.88272)


In [13]:
# location is not multi-valued - exactly two decimal points per coordinate pair
# (raw string: '\.' in a normal string literal is an invalid escape sequence)
fac['Location'].str.count(r'\.').agg(['min', 'max'])

min    2
max    2
Name: Location, dtype: int64

In [14]:
# lots of missing facility IDs, but no multi-valued ones
fac['Facility ID'].describe()

count    381.000000  
mean     10195.451444
std      113.554118  
min      10000.000000
25%      10097.000000
50%      10195.000000
75%      10293.000000
max      10393.000000
Name: Facility ID, dtype: float64

### Completeness
Notable missing values and lack of missing values:
* 5 (0.7%) of facilities are missing linked Project IDs
* 35 (4.6%) missing "Location" (coordinates)
* 19 (2.4%) missing county (one missing state). But the true test is how successful `addfips` is with these pairs
* 88 to 100 (11.5% to 13.1%) missing EJ Screen metrics, depending on which metric

The `id` field is 100% complete.

Based on these nan counts, I should first try `addfips` on state/county pairs. If too many fail, the most complete option is to geocode via coordinates.

In [15]:
len(fac)

766

In [16]:
fac.isna().agg(['mean', 'sum']).T

Unnamed: 0,mean,sum
id,0.0,0.0
name,0.0,0.0
created_on,0.0,0.0
modified_on,0.0,0.0
CCS/CCUS,0.926893,710.0
CCS (ID),0.962141,737.0
CCS,0.962141,737.0
Company (ID),0.005222,4.0
Company,0.005222,4.0
Project (ID),0.006527,5.0


### Consistency - defer
Defer until I've cleaned the related datasets
### Uniformity
Important columns to check consistent representation:
* coordinates
* ID fields (check consistent array delimiters)

Secondary importance:
* street address (this is a luxury field)
* modified_on

#### Coordinates

In [17]:
# "POINT(-XX.X, YY.Y)" with 2 or 3 digits before the decimal and 2 to 7 digits after.
# Plus optional leading/trailing whitespace.
coord_pattern = r'\s*POINT\(-\d{2,3}\.\d{2,7} \d{2,3}\.\d{2,7}\)\s*'
# fullmatch, not match: match only anchors the start of the string, so trailing
# junk (or a second coordinate pair) after the closing paren would still pass.
fac['Location'].str.fullmatch(coord_pattern).agg(['mean', 'sum'])

mean    1.0  
sum     731.0
Name: Location, dtype: float64

In [18]:
# tighten criteria to 3+ digits after decimal
# Reveals that only 3 facilities have poor precision (plus or minus about a km)
coord_pattern = r'\s*POINT\(-\d{2,3}\.\d{3,7} \d{2,3}\.\d{3,7}\)\s*'
# fullmatch so the entire value must conform (match only anchors the start)
fac['Location'].str.fullmatch(coord_pattern).sum()

728

#### ID Fields
Want to check for consistent array delimiters.

In [19]:
# exclude ID cols with numeric types (no arrays present)
id_cols = [col for col in fac.columns if '(ID)' in col and pd.api.types.is_string_dtype(fac[col])]
id_cols

['Company (ID)',
 'Project (ID)',
 'Associated Facilities (ID)',
 'Pipelines (ID)',
 'Air Operating (ID)',
 'CWA-NPDES (ID)',
 'CWA Wetland (ID)',
 'Other Permits (ID)']

In [20]:
# one ID of 3-5 digits, then zero or more (comma, optional single space, ID) groups, then end of line.
# Tying the delimiter to the ID that follows it rejects trailing commas ('1234,')
# and undelimited runs of digits ('123456789').
array_pattern = r'\d{3,5}(?:, ?\d{3,5})*$'

In [21]:
test_case = pd.Series([
    '1234',
    '1234,567',
    '1234, 567',
    '12345, 678, 9012',
    '1234\t5678', # tab is bad, no comma
    '12, 3456', # too short
    '1234    5678', # too many spaces, no comma
])
pd.concat([test_case, test_case.str.match(array_pattern)], axis=1)

Unnamed: 0,0,1
0,1234,True
1,1234567,True
2,"1234, 567",True
3,"12345, 678, 9012",True
4,1234\t5678,False
5,"12, 3456",False
6,1234 5678,False


In [22]:
# all pass the formatting test
for col in id_cols:
    assert fac[col].str.match(array_pattern).all()

#### Date Modified

In [23]:
# to_datetime works on all values present
timestamps = pd.to_datetime(fac['modified_on'])
timestamps.dtypes, timestamps.isna().sum()

(dtype('<M8[ns]'), 0)

#### Street Address - defer
Hard to test, and I don't care that much if it's wrong. The best way to test is probably to outsource to a pre-built geocoder.

### Range Validation
Check IDs and Coordinates
#### Coordinates
All the extreme coordinates are real places! No "Null Island" dwellers either.

In [24]:
coord_pattern = r'^POINT\((?P<longitude>\-\d{2,3}\.\d{2,7}) (?P<latitude>\d{2,3}\.\d{2,7})\)'
# Build the numeric frame with .apply instead of assigning through
# coords.loc[:, col]: since pandas 2.0 a full-column .loc assignment keeps the
# existing (string/object) dtype, which would leave the coordinates non-numeric.
coords = (
    fac['Location']
    .str.extractall(coord_pattern)
    .droplevel('match')
    .apply(pd.to_numeric, errors='coerce')
)
coords.head()

Unnamed: 0,longitude,latitude
0,-103.525728,32.542358
1,-80.380335,40.331198
2,-105.77927,43.85124
3,-101.422777,35.641666
4,-84.250549,31.541712


In [25]:
coords.describe()

Unnamed: 0,longitude,latitude
count,731.0,731.0
mean,-93.99586,35.555141
std,14.107042,7.604859
min,-158.094996,17.710307
25%,-97.682529,30.010701
50%,-93.550833,32.269689
75%,-83.271619,39.917282
max,-64.754109,70.3199


In [27]:
# look at extreme coordinates
# max longitude
fac.loc[coords['longitude'].idxmax(), ['Location', 'City', 'ZIP Code', 'Facility Description']]

Location                POINT(-64.754109 17.710307)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          
City                    St Croix                                                                                                                                                          

In [28]:
# Min longitude
fac.loc[coords['longitude'].idxmin(), ['Location', 'City', 'ZIP Code', 'Facility Description']]

Location                POINT(-158.094996 21.305314)                                                                                                                                                                                          
City                    Kapolei                                                                                                                                                                                                               
ZIP Code                96707                                                                                                                                                                                                                 
Facility Description    The Kapolei Refinery is a crude oil refinery with a capacity of 94,000 barrels/day on Oahu. The refinery distributes fuel throughout the state via pipelines and barges and has a 5.4 million barrel storage capacity.
Name: 755, dtype: object

In [29]:
# max latitude
fac.loc[coords['latitude'].idxmax(), ['Location', 'City', 'ZIP Code', 'Facility Description']]

Location                POINT(-148.5573 70.3199)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [30]:
# Min Latitude
fac.loc[coords["latitude"].idxmin(), ['Location', 'City', 'ZIP Code', 'Facility Description']]

Location                POINT(-64.754109 17.710307)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          
City                    St Croix                                                                                                                                                          

#### IDs
There are lots of ID columns, but I only care about project IDs and associated facilities

In [31]:
# defined way up near the top
id_cols

['Company (ID)',
 'Project (ID)',
 'Associated Facilities (ID)',
 'Pipelines (ID)',
 'Air Operating (ID)',
 'CWA-NPDES (ID)',
 'CWA Wetland (ID)',
 'Other Permits (ID)']

In [32]:
# Split the comma-separated ID arrays into one column per position and convert
# each column to numeric. .apply returns new columns with numeric dtype; the
# previous .loc[:, col] assignment keeps the original string/object dtype under
# pandas >= 2.0.
proj_ids = (
    fac['Project (ID)']
    .str.split(',', expand=True)
    .apply(pd.to_numeric, errors='coerce')
)

proj_ids.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,2723.0,,,,,,,,,
1,2724.0,,,,,,,,,
2,2725.0,,,,,,,,,
3,2726.0,,,,,,,,,
4,2729.0,,,,,,,,,


In [33]:
# they all look in the same range
proj_ids.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
count,761.0,117.0,36.0,16.0,12.0,4.0,2.0,1.0,1.0,1.0
mean,4023.214192,4367.982906,4298.055556,4361.6875,4484.083333,5298.75,4983.0,3664.0,3665.0,4258.0
std,1248.24156,1106.984238,1084.447112,1014.719647,876.546266,882.664291,1866.761902,,,
min,2723.0,2732.0,2836.0,2855.0,3090.0,4161.0,3663.0,3664.0,3665.0,4258.0
25%,2941.0,3141.0,3183.25,3561.5,3933.25,4992.0,4323.0,3664.0,3665.0,4258.0
50%,3158.0,4330.0,4205.5,4472.0,4233.5,5363.5,4983.0,3664.0,3665.0,4258.0
75%,5338.0,5361.0,5329.75,5315.5,5274.25,5670.25,5643.0,3664.0,3665.0,4258.0
max,6411.0,6388.0,6126.0,5451.0,6046.0,6307.0,6303.0,3664.0,3665.0,4258.0


In [34]:
# Split the comma-separated ID arrays into one column per position and convert
# each column to numeric. .apply returns new columns with numeric dtype; the
# previous .loc[:, col] assignment keeps the original string/object dtype under
# pandas >= 2.0.
assoc_ids = (
    fac['Associated Facilities (ID)']
    .str.split(',', expand=True)
    .apply(pd.to_numeric, errors='coerce')
)

assoc_ids.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,,,,,,,,,,,
1,,,,,,,,,,,
2,,,,,,,,,,,
3,,,,,,,,,,,
4,,,,,,,,,,,


In [35]:
# they all look in the same range
assoc_ids.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
count,256.0,150.0,96.0,53.0,43.0,37.0,28.0,20.0,11.0,11.0,1.0
mean,3040.042969,3412.166667,3697.760417,4011.283019,4127.27907,4019.594595,3729.285714,3018.1,1009.727273,1061.545455,1016.0
std,2128.628533,2091.443294,2123.980542,2344.568005,2240.64823,2301.486156,2275.015276,2324.042959,41.499617,45.204787,
min,756.0,754.0,755.0,819.0,871.0,808.0,871.0,871.0,940.0,966.0,1016.0
25%,927.25,931.0,951.0,966.0,1045.5,1048.0,940.0,966.0,966.0,1027.0,1016.0
50%,3709.0,4107.0,4141.0,5521.0,5523.0,5521.0,5521.0,1062.0,1025.0,1097.0,1016.0
75%,5387.0,5522.75,5544.5,5851.0,5697.0,5544.0,5539.75,5524.0,1048.0,1097.0,1016.0
max,6311.0,6312.0,6312.0,6114.0,6078.0,6078.0,5546.0,5526.0,1048.0,1097.0,1016.0


### Uniqueness Validation
Check the `id` field (NOT `Facility ID`)

In [36]:
fac['id'].duplicated().sum()

0

### Set Membership Validation
Check state/county only. A few takeaways:
* state 'TBD' values need conversion to NULL
* a few states are arrays (but only one value, duplicated)
* 11 counties are multi-valued. Probably just take the first one.

In [37]:
from pudl.helpers import add_fips_ids


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [38]:
w_fips = add_fips_ids(fac[['State', 'County or Parish']], state_col='State', county_col='County or Parish', vintage=2020)
w_fips.head()

2023-07-18 20:18:47 [    INFO] catalystcoop.pudl.helpers:203 Assigned state FIPS codes for 98.69% of records.
2023-07-18 20:18:47 [    INFO] catalystcoop.pudl.helpers:219 Assigned county FIPS codes for 92.82% of records.


Unnamed: 0,State,County or Parish,state_id_fips,county_id_fips
0,NM,Lea,35,35025.0
1,PA,Washington,42,42125.0
2,WY,Campbell,56,56005.0
3,TX,Hutchison,48,
4,GA,Dougherty,13,13095.0


In [39]:
w_fips.shape

(766, 4)

In [40]:
# 9 bad states and 36 bad state/county combos
w_fips.describe()

Unnamed: 0,State,County or Parish,state_id_fips,county_id_fips
count,765,747,756,711
unique,53,354,47,339
top,TX,Jefferson,48,48245
freq,223,28,223,26


In [41]:
w_fips[w_fips['county_id_fips'].isna()].dropna(how='all', axis=0)

Unnamed: 0,State,County or Parish,state_id_fips,county_id_fips
3,TX,Hutchison,48.0,
133,LA,"Ascension, Iberville",22.0,
193,VI,St. Croix,78.0,
393,LA,"West Baton Rouge, Iberville",22.0,
396,LA,"West Baton Rouge, Iberville",22.0,
474,TX,"Midland, Glasscock",48.0,
505,TX,TBD,48.0,
511,TX,,48.0,
513,AZ,Mojave,4.0,
526,TX,TBD,48.0,


In [42]:
multi_county = w_fips[w_fips['County or Parish'].str.contains(',| and | or ', regex=True, na=False)]
multi_county

Unnamed: 0,State,County or Parish,state_id_fips,county_id_fips
133,LA,"Ascension, Iberville",22,
393,LA,"West Baton Rouge, Iberville",22,
396,LA,"West Baton Rouge, Iberville",22,
474,TX,"Midland, Glasscock",48,
538,LA,"St. Charles, St. John the Baptist, St. James",22,
577,TX,Calhoun or Howard,48,
633,WY,Lincoln and Sweetwater,56,
682,LA,"Jefferson Parish, Louisiana",22,
733,TX,"Chambers, Liberty, and Jefferson County",48,
734,TX,Chambers and Jefferson Counties,48,


In [43]:
multi_county['County or Parish'].str.split(',| and | or ', n=1, regex=True).str[0]

133    Ascension       
393    West Baton Rouge
396    West Baton Rouge
474    Midland         
538    St. Charles     
577    Calhoun         
633    Lincoln         
682    Jefferson Parish
733    Chambers        
734    Chambers        
745    Will            
Name: County or Parish, dtype: object

### Type Validation
All the ID columns and the coordinates are CSV string arrays that need parsing and conversion to numeric.
### Cross-Field Validation - Defer
A thorough cleaning would involve geocoding the given coordinates and making sure they match the given state, county values. Also reverse geocoding the given street address and computing distance vs given coordinates. But I'll defer that until we actually do something with the lat, lon values.

In [6]:
from dbcp.transform.eip_infrastructure import facilities_transform


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [7]:
trans = facilities_transform(extract(path)['eip_facilities'])

2023-07-19 16:21:00 [    INFO] catalystcoop.pudl.helpers:203 Assigned state FIPS codes for 99.61% of records.
2023-07-19 16:21:00 [    INFO] catalystcoop.pudl.helpers:219 Assigned county FIPS codes for 94.91% of records.
2023-07-19 16:21:00 [    INFO] catalystcoop.pudl.helpers:203 Assigned state FIPS codes for 92.31% of records.
2023-07-19 16:21:00 [    INFO] catalystcoop.pudl.helpers:219 Assigned county FIPS codes for 41.03% of records.


In [8]:
trans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 766 entries, 0 to 765
Data columns (total 51 columns):
 #   Column                                               Non-Null Count  Dtype         
---  ------                                               --------------  -----         
 0   facility_id                                          766 non-null    Int32         
 1   name                                                 766 non-null    string        
 2   raw_created_on                                       766 non-null    string        
 3   raw_modified_on                                      766 non-null    string        
 4   raw_is_ccs                                           56 non-null     string        
 5   ccs_id                                               29 non-null     float64       
 6   raw_company_id                                       762 non-null    string        
 7   raw_project_id                                       761 non-null    string        
 8   

In [47]:
trans['county_id_fips'].isna().agg(['mean', 'sum'])

mean    0.030026 
sum     23.000000
Name: county_id_fips, dtype: float64

In [50]:
# nothing can be done about these
trans.loc[trans['county_id_fips'].isna(), ['raw_state', 'raw_county_or_parish']]

Unnamed: 0,raw_state,raw_county_or_parish
193,VI,St. Croix
505,TX,TBD
511,TX,
526,TX,TBD
554,PR,
556,TX,TBD
570,,
599,LA,
637,AK,
644,LA,
