# Purpose
This notebook contains an initial exploration and implementation of the Ballot Ready data cleaning.

In [1]:
import dbcp
import pandas as pd

from pathlib import Path

from dbcp.transform.helpers import add_county_fips_with_backup_geocoding


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [3]:
source_path = Path("ballot_ready/2023_04_05_climate_partners_upcoming_races_with_counties.csv")
raw_dfs = dbcp.extract.ballot_ready.extract(source_path)
transformed = dbcp.transform.ballot_ready.transform(raw_dfs)

2023-08-11 00:44:00 [    INFO] catalystcoop.pudl.helpers:203 Assigned state FIPS codes for 100.00% of records.
2023-08-11 00:44:04 [    INFO] catalystcoop.pudl.helpers:219 Assigned county FIPS codes for 96.65% of records.


In [5]:
transformed["ballot_ready_counties"].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 221265 entries, 0 to 81681
Data columns (total 30 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   county                    221265 non-null  string        
 1   election_id               221265 non-null  Int64         
 2   election_name             221265 non-null  string        
 3   election_day              221265 non-null  datetime64[ns]
 4   race_id                   221265 non-null  Int64         
 5   is_primary                221265 non-null  boolean       
 6   is_runoff                 221265 non-null  boolean       
 7   is_unexpired              221265 non-null  boolean       
 8   position_id               221265 non-null  Int64         
 9   position_name             221265 non-null  string        
 10  sub_area_name             150748 non-null  string        
 11  sub_area_value            166573 non-null  string        
 12  sub

In [7]:
transformed["ballot_ready_counties"].isna().sum()

county                           0
election_id                      0
election_name                    0
election_day                     0
race_id                          0
is_primary                       0
is_runoff                        0
is_unexpired                     0
position_id                      0
position_name                    0
sub_area_name                70517
sub_area_value               54692
sub_area_name_secondary     203827
sub_area_value_secondary    203369
state                            0
level                            0
tier                             0
is_judicial                      0
is_retention                     0
number_of_seats                  0
normalized_position_id           0
normalized_position_name         0
position_description             8
frequency                        0
reference_year                   0
partisan_type                   25
race_created_at                  0
race_updated_at                  0
state_id_fips       

## Transform implementation

In [2]:
source_path = Path("ballot_ready/2023_04_05_climate_partners_upcoming_races_with_counties.csv")
raw_dfs = dbcp.extract.ballot_ready.extract(source_path)

In [3]:
ballot_ready = raw_dfs["raw_ballot_ready"]

In [4]:
# Turn the brace-wrapped, comma-separated string (e.g. "{Barrow County,Jackson County}")
# into a list of county names.
# The parsed frame is built from the raw extract with .assign() rather than by writing
# the column back in place: the frame stored in raw_dfs stays untouched, and the cell
# can be re-run safely (re-running the in-place version would slice the already-parsed
# lists and then fail to split them).
ballot_ready = raw_dfs["raw_ballot_ready"].assign(
    counties=lambda raw: raw["counties"].str[1:-1].str.split(",")
)

In [5]:
ballot_ready.counties

0                          [Barrow County, Jackson County]
1            [Colquitt County, Cook County, Thomas County]
2        [Brooks County, Colquitt County, Cook County, ...
3        [Columbia County, Luzerne County, Montour Coun...
4                                       [Allegheny County]
                               ...                        
81677                   [Travis County, Williamson County]
81678                      [Bastrop County, Travis County]
81679                                      [Travis County]
81680                   [Travis County, Williamson County]
81681                   [Travis County, Williamson County]
Name: counties, Length: 81682, dtype: object

In [6]:
# One row per (race, county): explode the county lists, then give the
# resulting column a singular name.
exp_ballot_ready = ballot_ready.explode("counties").rename(
    columns={"counties": "county"}
)
exp_ballot_ready.shape

(221518, 28)

In [7]:
duplicate_race = exp_ballot_ready.duplicated(subset=["county", "race_id"], keep=False)
duplicate_race.value_counts()

False    221012
True        506
dtype: int64

In [8]:
duplicate_race.sum()

506

In [9]:
# Some races list the same county more than once, so the exploded frame contains
# repeated (county, race_id) pairs; keep only the first row of each pair.
ballot_ready = exp_ballot_ready.drop_duplicates(subset=["county", "race_id"])

In [10]:
# Sanity check: every (county, race_id) pair must now be unique.
# Use `not` rather than `~`: `~` is bitwise inversion, and on a plain Python bool it
# yields -1 / -2 (both truthy), in which case the assertion could never fail.
assert not ballot_ready.duplicated(subset=["county", "race_id"], keep=False).any()

In [14]:
from pudl.helpers import add_fips_ids

In [15]:
geo = add_fips_ids(ballot_ready)

2023-08-11 00:33:04 [    INFO] catalystcoop.pudl.helpers:203 Assigned state FIPS codes for 100.00% of records.
2023-08-11 00:33:07 [    INFO] catalystcoop.pudl.helpers:219 Assigned county FIPS codes for 96.65% of records.


Counties that are in the wrong state.

In [20]:
# Rows that did not receive a county FIPS code, reduced to the county / state pair.
missing_county_fips = geo["county_id_fips"].isna()
geo.loc[missing_county_fips, ["county", "state"]]

Unnamed: 0,county,state
29,Boone County,WI
29,Lake County,WI
29,McHenry County,WI
60,Houston County,WI
63,Houston County,WI
...,...,...
81620,Marion County,LA
81620,Miller County,LA
81620,Newton County,LA
81620,Panola County,LA


In [30]:
DATETIME_COLUMNS = ["race_created_at", "race_updated_at", "election_day"]

# Switch to pandas' best-fit nullable dtypes first, then parse every timestamp
# column in a single assign() instead of looping over the columns.
geo = geo.convert_dtypes()
geo = geo.assign(**{col: pd.to_datetime(geo[col]) for col in DATETIME_COLUMNS})

In [31]:
geo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 221265 entries, 0 to 81681
Data columns (total 30 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   county                    221265 non-null  string        
 1   election_id               221265 non-null  Int64         
 2   election_name             221265 non-null  string        
 3   election_day              221265 non-null  datetime64[ns]
 4   race_id                   221265 non-null  Int64         
 5   is_primary                221265 non-null  boolean       
 6   is_runoff                 221265 non-null  boolean       
 7   is_unexpired              221265 non-null  boolean       
 8   position_id               221265 non-null  Int64         
 9   position_name             221265 non-null  string        
 10  sub_area_name             150748 non-null  string        
 11  sub_area_value            166573 non-null  string        
 12  sub

## Initial Analysis

In [2]:
import pandas as pd

df = pd.read_csv("/app/data/raw/2023_04_05_climate_partners_upcoming_races_with_counties.csv")

In [3]:
df.election_id.is_unique

False

In [4]:
df.election_id.value_counts()

4376    2634
5232    2583
5760    2581
4706    1742
5292    1741
        ... 
4865       1
5813       1
5963       1
4287       1
5850       1
Name: election_id, Length: 291, dtype: int64

In [5]:
df.query("election_id == 4376").head()

Unnamed: 0,counties,election_id,election_name,election_day,race_id,is_primary,is_runoff,is_unexpired,position_id,position_name,...,is_retention,number_of_seats,normalized_position_id,normalized_position_name,position_description,frequency,reference_year,partisan_type,race_created_at,race_updated_at
73962,"{Anderson County,Andrews County,Angelina Count...",4376,Texas General Election,2024-11-05,1521793,False,False,False,48,President of the United States,...,False,1,10,President of the United States of America,The President is the chief executive of the co...,{4},2024,partisan,2020-01-14 23:15:54,2020-01-14 23:15:54
73963,"{Austin County,Bastrop County,Blanco County,Br...",4376,Texas General Election,2024-11-05,1667754,False,False,False,46610,U.S. House of Representatives - Texas 10th Con...,...,False,1,40,U.S. Representative,The U.S. House of Representatives is one of tw...,{2},2022,partisan,2020-01-15 22:47:48,2020-01-15 22:47:48
73964,"{Andrews County,Bell County,Blanco County,Brow...",4376,Texas General Election,2024-11-05,1667850,False,False,False,46611,U.S. House of Representatives - Texas 11th Con...,...,False,1,40,U.S. Representative,The U.S. House of Representatives is one of tw...,{2},2022,partisan,2020-01-15 22:47:48,2020-01-15 22:47:48
73965,"{Denton County,Jack County,Johnson County,Palo...",4376,Texas General Election,2024-11-05,1667733,False,False,False,46612,U.S. House of Representatives - Texas 12th Con...,...,False,1,40,U.S. Representative,The U.S. House of Representatives is one of tw...,{2},2022,partisan,2020-01-15 22:47:48,2020-01-15 22:47:48
73966,"{Archer County,Armstrong County,Baylor County,...",4376,Texas General Election,2024-11-05,1667864,False,False,False,46613,U.S. House of Representatives - Texas 13th Con...,...,False,1,40,U.S. Representative,The U.S. House of Representatives is one of tw...,{2},2022,partisan,2020-01-15 22:47:48,2020-01-15 22:47:48


In [6]:
df.query("race_id == 2002453").counties.iloc[0]

'{Abbeville County,Anderson County,Banks County,Clarke County,Elbert County,Franklin County,Greene County,Hart County,Jackson County,Lincoln County,Madison County,McCormick County,Oconee County,Oconee County,Oglethorpe County,Stephens County,Taliaferro County,Wilkes County}'

In [7]:
df.race_id.is_unique

True

In [8]:
df.position_id.is_unique

False

In [9]:
df.position_id.value_counts()

49710     6
49666     5
49763     5
46998     5
277697    4
         ..
228368    1
228367    1
228358    1
228350    1
218495    1
Name: position_id, Length: 37179, dtype: int64

In [10]:
df.query("position_id == 49710")

Unnamed: 0,counties,election_id,election_name,election_day,race_id,is_primary,is_runoff,is_unexpired,position_id,position_name,...,is_retention,number_of_seats,normalized_position_id,normalized_position_name,position_description,frequency,reference_year,partisan_type,race_created_at,race_updated_at
0,"{Barrow County,Jackson County}",5954,Georgia Special General Election,2023-01-31,2020825,False,False,True,49710,Georgia House of Representatives - District 119,...,False,1,610,State Representative,State Representatives are members of the state...,{2},2022,partisan,2023-01-31 18:22:55,2023-01-31 18:22:55
144,"{Barrow County,Jackson County}",6213,Georgia HD 119 General Runoff Election,2023-02-28,2256527,False,True,True,49710,Georgia House of Representatives - District 119,...,False,1,610,State Representative,State Representatives are members of the state...,{2},2022,partisan,2023-02-16 21:41:07,2023-02-16 21:41:07
32104,"{Barrow County,Jackson County}",5380,Georgia Primary Election,2024-05-21,1688839,True,False,False,49710,Georgia House of Representatives - District 119,...,False,1,610,State Representative,State Representatives are members of the state...,{2},2022,partisan,2020-01-15 22:48:25,2020-01-15 22:48:25
41433,"{Barrow County,Jackson County}",5756,Georgia Primary Runoff Election,2024-07-23,2002086,True,True,False,49710,Georgia House of Representatives - District 119,...,False,1,610,State Representative,State Representatives are members of the state...,{2},2022,partisan,2022-08-25 22:34:19,2022-08-25 22:34:19
58215,"{Barrow County,Jackson County}",4435,Georgia General Election,2024-11-05,1671874,False,False,False,49710,Georgia House of Representatives - District 119,...,False,1,610,State Representative,State Representatives are members of the state...,{2},2022,partisan,2020-01-15 22:47:58,2020-01-15 22:47:58
80158,"{Barrow County,Jackson County}",5755,Georgia General Runoff Election,2024-12-03,2002085,False,True,False,49710,Georgia House of Representatives - District 119,...,False,1,610,State Representative,State Representatives are members of the state...,{2},2022,partisan,2022-08-25 22:34:19,2022-08-25 22:34:19


In [11]:
df.query("election_id == 4435").position_id.nunique()

1484

- `position` is the office / position someone is running for (president, Alaska House of Reps District 1). There can be multiple elections for a single position (special election, runoffs, general...)
- `election` is the election / event that elects someone to a position (special election, runoffs, general...). There can be multiple positions for an election. For example, the 2024 Georgia General Election has ~1500 positions.
- `race` is a unique combination of a position and an election. `race_id` is unique in the raw dataset.

## Explode

In [12]:
toy = pd.DataFrame({'A': [[0, 1, 2], [1], [2,3], [3, 4]],
                   'B': [1,2,3,4,]})

In [13]:
toy.explode("A")

Unnamed: 0,A,B
0,0,1
0,1,1
0,2,1
1,1,2
2,2,3
2,3,3
3,3,4
3,4,4


In [14]:
df.shape

(81682, 28)

In [15]:
df.counties.iloc[0]

'{Barrow County,Jackson County}'

In [16]:
type(df.counties.str[1:-1].str.split(",").iloc[0])

list

In [17]:
# Drop the surrounding "{" / "}" and split on commas to get a list of county names.
# NOTE: this overwrites the raw string column in place, so the cell is not safe to run twice.
df["counties"] = df["counties"].str.slice(1, -1).str.split(",")

In [18]:
# One row per (race, county): explode the county lists and rename the column
# to the singular form.
exp_df = df.explode("counties").rename(columns={"counties": "county"})
exp_df.shape

(221518, 28)

In [19]:
duplicate_race = exp_df.duplicated(subset=["county", "race_id"], keep=False)
duplicate_race.value_counts()

False    221012
True        506
dtype: int64

In [19]:
exp_df[duplicate_race].tail()

Unnamed: 0,county,election_id,election_name,election_day,race_id,is_primary,is_runoff,is_unexpired,position_id,position_name,...,is_retention,number_of_seats,normalized_position_id,normalized_position_name,position_description,frequency,reference_year,partisan_type,race_created_at,race_updated_at
78679,Lee County,5822,Mississippi General Runoff Election,2024-11-26,2011451,False,True,False,377536,Mississippi Supreme Court Justice - District 3...,...,False,1,4000,State Supreme Court Justice,The State Supreme Court is responsible for cor...,{8},2024,nonpartisan,2022-09-21 23:40:55,2022-09-21 23:40:55
80851,Cherokee County,5755,Georgia General Runoff Election,2024-12-03,2002405,False,True,False,276402,Georgia District Attorney - Appalachian Circuit,...,False,1,750,District Attorney,The District Attorney is the chief legal repre...,{4},2024,partisan,2022-08-25 22:34:26,2022-08-25 22:34:26
80851,Cherokee County,5755,Georgia General Runoff Election,2024-12-03,2002405,False,True,False,276402,Georgia District Attorney - Appalachian Circuit,...,False,1,750,District Attorney,The District Attorney is the chief legal repre...,{4},2024,partisan,2022-08-25 22:34:26,2022-08-25 22:34:26
80875,Oconee County,5755,Georgia General Runoff Election,2024-12-03,2002453,False,True,False,276425,Georgia District Attorney - Northern Circuit,...,False,1,750,District Attorney,The District Attorney is the chief legal repre...,{4},2024,partisan,2022-08-25 22:34:27,2022-08-25 22:34:27
80875,Oconee County,5755,Georgia General Runoff Election,2024-12-03,2002453,False,True,False,276425,Georgia District Attorney - Northern Circuit,...,False,1,750,District Attorney,The District Attorney is the chief legal repre...,{4},2024,partisan,2022-08-25 22:34:27,2022-08-25 22:34:27


In [25]:
df.query("race_id == 2002453").counties.iloc[0]

['Abbeville County',
 'Anderson County',
 'Banks County',
 'Clarke County',
 'Elbert County',
 'Franklin County',
 'Greene County',
 'Hart County',
 'Jackson County',
 'Lincoln County',
 'Madison County',
 'McCormick County',
 'Oconee County',
 'Oconee County',
 'Oglethorpe County',
 'Stephens County',
 'Taliaferro County',
 'Wilkes County']

In [20]:
df.query("race_id == 1658314").counties.iloc[0]

['Alcorn County',
 'Attala County',
 'Benton County',
 'Calhoun County',
 'Carroll County',
 'Chickasaw County',
 'Choctaw County',
 'Clay County',
 'Coahoma County',
 'Crittenden County',
 'DeSoto County',
 'Fayette County',
 'Grenada County',
 'Hardeman County',
 'Itawamba County',
 'Lafayette County',
 'Lee County',
 'Lee County',
 'Leflore County',
 'Lowndes County',
 'Marshall County',
 'McNairy County',
 'Monroe County',
 'Montgomery County',
 'Noxubee County',
 'Oktibbeha County',
 'Panola County',
 'Phillips County',
 'Pontotoc County',
 'Prentiss County',
 'Quitman County',
 'Shelby County',
 'Tallahatchie County',
 'Tate County',
 'Tippah County',
 'Tishomingo County',
 'Tunica County',
 'Union County',
 'Webster County',
 'Winston County',
 'Yalobusha County']

In [21]:
# Full-row de-duplication is enough here: race_id is unique in the raw data, so a
# repeated (county, race_id) pair can only come from one raw row that lists a county
# twice, and such a row explodes into identical rows. The assert in the next cell
# verifies that no (county, race_id) duplicates remain.
exp_df = exp_df.drop_duplicates()

In [22]:
assert not exp_df.duplicated(subset=["county", "race_id"], keep=False).any()

## Are there duplicates in the counties column?
Yup! It looks like Ballot Ready is doing something weird that produces duplicate counties for some of the races. I checked to make sure there aren't two Lee Counties in Mississippi ;)

In [22]:
def are_elements_unique(lst):
    """Return True when ``lst`` contains no repeated elements.

    Args:
        lst: a sized collection of hashable items (e.g. a list of county names).

    Returns:
        bool: True if every element occurs exactly once (an empty collection
        counts as unique), False if any element is repeated.
    """
    # A set collapses repeats, so the sizes match only when nothing is duplicated.
    return len(set(lst)) == len(lst)

In [44]:
# Flag the races whose county list repeats at least one county, then pull those rows.
are_counties_not_unique = ~df["counties"].apply(are_elements_unique)
counties_not_unique = df.loc[are_counties_not_unique]

In [52]:
counties_not_unique

Unnamed: 0,counties,election_id,election_name,election_day,race_id,is_primary,is_runoff,is_unexpired,position_id,position_name,sub_area_name,sub_area_value,sub_area_name_secondary,sub_area_value_secondary,state,level,tier,is_judicial,is_retention,number_of_seats,normalized_position_id,normalized_position_name,position_description,frequency,reference_year,partisan_type,race_created_at,race_updated_at
4196,"[Alcorn County, Attala County, Benton County, ...",5378,Mississippi Primary Election,2023-08-08,1658314,True,False,False,232562,Mississippi Public Service Commission - Northe...,,Northern District,,,MS,state,2,False,False,1,470,State Utilities Board//Public Services Board//...,State Utility or Public Service Boards are res...,{4},2023,partisan,2020-01-14 23:20:37,2020-01-14 23:20:37
4252,"[Alcorn County, Attala County, Benton County, ...",5378,Mississippi Primary Election,2023-08-08,1658145,True,False,False,232563,Mississippi Transportation Commission - Northe...,,Northern District,,,MS,state,2,False,False,1,420,State Transportation Board,The State Transportation Board is responsible ...,{4},2023,partisan,2020-01-14 23:20:37,2020-01-14 23:20:37
5786,"[Alcorn County, Attala County, Benton County, ...",5749,Mississippi Primary Runoff Election,2023-08-29,1990313,True,True,False,232562,Mississippi Public Service Commission - Northe...,,Northern District,,,MS,state,2,False,False,1,470,State Utilities Board//Public Services Board//...,State Utility or Public Service Boards are res...,{4},2023,partisan,2022-08-18 20:42:46,2022-11-02 16:30:23
5842,"[Alcorn County, Attala County, Benton County, ...",5749,Mississippi Primary Runoff Election,2023-08-29,1990310,True,True,False,232563,Mississippi Transportation Commission - Northe...,,Northern District,,,MS,state,2,False,False,1,420,State Transportation Board,The State Transportation Board is responsible ...,{4},2023,partisan,2022-08-18 20:42:46,2022-11-02 16:30:23
9799,"[Alcorn County, Attala County, Benton County, ...",4225,Mississippi General Election,2023-11-07,1538252,False,False,False,232562,Mississippi Public Service Commission - Northe...,,Northern District,,,MS,state,2,False,False,1,470,State Utilities Board//Public Services Board//...,State Utility or Public Service Boards are res...,{4},2023,partisan,2020-01-14 23:16:30,2021-09-30 19:37:57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78674,"[Attala County, Carroll County, Chickasaw Coun...",5822,Mississippi General Runoff Election,2024-11-26,2010350,False,True,False,377520,Mississippi Appellate Court Judge - District 3...,District,3,Position,1,MS,state,2,True,False,1,4010,State Appellate Court Justice,The Court of Appeals hears cases assigned by t...,{8},2024,nonpartisan,2022-09-21 22:46:18,2022-09-21 22:46:18
78678,"[Alcorn County, Attala County, Benton County, ...",5822,Mississippi General Runoff Election,2024-11-26,2010696,False,True,False,377535,Mississippi Supreme Court Justice - District 3...,District,3,Position,1,MS,state,2,True,False,1,4000,State Supreme Court Justice,The State Supreme Court is responsible for cor...,{8},2024,nonpartisan,2022-09-21 23:01:49,2022-09-21 23:01:49
78679,"[Alcorn County, Attala County, Benton County, ...",5822,Mississippi General Runoff Election,2024-11-26,2011451,False,True,False,377536,Mississippi Supreme Court Justice - District 3...,District,3,Position,2,MS,state,2,True,False,1,4000,State Supreme Court Justice,The State Supreme Court is responsible for cor...,{8},2024,nonpartisan,2022-09-21 23:40:55,2022-09-21 23:40:55
80851,"[Cherokee County, Cherokee County, Dawson Coun...",5755,Georgia General Runoff Election,2024-12-03,2002405,False,True,False,276402,Georgia District Attorney - Appalachian Circuit,,Appalachian Circuit,,,GA,regional,3,False,False,1,750,District Attorney,The District Attorney is the chief legal repre...,{4},2024,partisan,2022-08-25 22:34:26,2022-08-25 22:34:26


In [45]:
counties_not_unique.state.value_counts()

MN    62
FL    44
IL    22
RI    14
MS    12
WI    10
MO    10
OH     8
GA     8
MT     8
WY     7
MI     4
AL     3
NY     2
KY     2
MA     2
NH     2
Name: state, dtype: int64

In [53]:
are_counties_not_unique.value_counts(normalize=True) * 100

False    99.730663
True      0.269337
Name: counties, dtype: float64

In [56]:
counties_not_unique

Unnamed: 0,counties,election_id,election_name,election_day,race_id,is_primary,is_runoff,is_unexpired,position_id,position_name,sub_area_name,sub_area_value,sub_area_name_secondary,sub_area_value_secondary,state,level,tier,is_judicial,is_retention,number_of_seats,normalized_position_id,normalized_position_name,position_description,frequency,reference_year,partisan_type,race_created_at,race_updated_at
4196,"[Alcorn County, Attala County, Benton County, ...",5378,Mississippi Primary Election,2023-08-08,1658314,True,False,False,232562,Mississippi Public Service Commission - Northe...,,Northern District,,,MS,state,2,False,False,1,470,State Utilities Board//Public Services Board//...,State Utility or Public Service Boards are res...,{4},2023,partisan,2020-01-14 23:20:37,2020-01-14 23:20:37
4252,"[Alcorn County, Attala County, Benton County, ...",5378,Mississippi Primary Election,2023-08-08,1658145,True,False,False,232563,Mississippi Transportation Commission - Northe...,,Northern District,,,MS,state,2,False,False,1,420,State Transportation Board,The State Transportation Board is responsible ...,{4},2023,partisan,2020-01-14 23:20:37,2020-01-14 23:20:37
5786,"[Alcorn County, Attala County, Benton County, ...",5749,Mississippi Primary Runoff Election,2023-08-29,1990313,True,True,False,232562,Mississippi Public Service Commission - Northe...,,Northern District,,,MS,state,2,False,False,1,470,State Utilities Board//Public Services Board//...,State Utility or Public Service Boards are res...,{4},2023,partisan,2022-08-18 20:42:46,2022-11-02 16:30:23
5842,"[Alcorn County, Attala County, Benton County, ...",5749,Mississippi Primary Runoff Election,2023-08-29,1990310,True,True,False,232563,Mississippi Transportation Commission - Northe...,,Northern District,,,MS,state,2,False,False,1,420,State Transportation Board,The State Transportation Board is responsible ...,{4},2023,partisan,2022-08-18 20:42:46,2022-11-02 16:30:23
9799,"[Alcorn County, Attala County, Benton County, ...",4225,Mississippi General Election,2023-11-07,1538252,False,False,False,232562,Mississippi Public Service Commission - Northe...,,Northern District,,,MS,state,2,False,False,1,470,State Utilities Board//Public Services Board//...,State Utility or Public Service Boards are res...,{4},2023,partisan,2020-01-14 23:16:30,2021-09-30 19:37:57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78674,"[Attala County, Carroll County, Chickasaw Coun...",5822,Mississippi General Runoff Election,2024-11-26,2010350,False,True,False,377520,Mississippi Appellate Court Judge - District 3...,District,3,Position,1,MS,state,2,True,False,1,4010,State Appellate Court Justice,The Court of Appeals hears cases assigned by t...,{8},2024,nonpartisan,2022-09-21 22:46:18,2022-09-21 22:46:18
78678,"[Alcorn County, Attala County, Benton County, ...",5822,Mississippi General Runoff Election,2024-11-26,2010696,False,True,False,377535,Mississippi Supreme Court Justice - District 3...,District,3,Position,1,MS,state,2,True,False,1,4000,State Supreme Court Justice,The State Supreme Court is responsible for cor...,{8},2024,nonpartisan,2022-09-21 23:01:49,2022-09-21 23:01:49
78679,"[Alcorn County, Attala County, Benton County, ...",5822,Mississippi General Runoff Election,2024-11-26,2011451,False,True,False,377536,Mississippi Supreme Court Justice - District 3...,District,3,Position,2,MS,state,2,True,False,1,4000,State Supreme Court Justice,The State Supreme Court is responsible for cor...,{8},2024,nonpartisan,2022-09-21 23:40:55,2022-09-21 23:40:55
80851,"[Cherokee County, Cherokee County, Dawson Coun...",5755,Georgia General Runoff Election,2024-12-03,2002405,False,True,False,276402,Georgia District Attorney - Appalachian Circuit,,Appalachian Circuit,,,GA,regional,3,False,False,1,750,District Attorney,The District Attorney is the chief legal repre...,{4},2024,partisan,2022-08-25 22:34:26,2022-08-25 22:34:26


In [50]:
counties_not_unique.to_csv("races_with_duplicate_counties.csv")

## Missing fips codes

In [30]:
import dbcp

br = dbcp.etl.etl_ballot_ready()["br_election_data"]


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd
2023-08-14 23:25:13 [    INFO] catalystcoop.pudl.helpers:203 Assigned state FIPS codes for 100.00% of records.
2023-08-14 23:25:17 [    INFO] catalystcoop.pudl.helpers:219 Assigned county FIPS codes for 96.65% of records.


In [37]:
br.state_id_fips.isna().value_counts()

False    221265
Name: state_id_fips, dtype: int64

In [58]:
br.county_id_fips.isna().value_counts(normalize=True) * 100

False    96.653334
True      3.346666
Name: county_id_fips, dtype: float64

In [61]:
br[br.county_id_fips.isna()].iloc[2]

county                                                         McHenry County
election_id                                                              5209
election_name                                      Wisconsin Primary Election
election_day                                              2023-02-21 00:00:00
race_id                                                               1956520
is_primary                                                               True
is_runoff                                                               False
is_unexpired                                                            False
position_id                                                            384597
position_name               Wisconsin Appeals Court Judge - District 4, Se...
sub_area_name                                                        District
sub_area_value                                                              4
sub_area_name_secondary                                         

In [40]:
br[br.county_id_fips.isna()].to_csv("exploded_counties_with_incorrect_state.csv")