In [1]:
import pandas as pd
import dbcp

source_uri = "gs://dgm-archive/ballot_ready/BallotReady_upcoming_races_with_counties_08_14_2023.csv"
raw_dfs = dbcp.extract.ballot_ready.extract(source_uri)
raw_ballot_ready = raw_dfs["raw_ballot_ready"]
br_election_data = dbcp.transform.ballot_ready._explode_counties(raw_ballot_ready)


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd
2023-09-12 03:57:46 [    INFO] catalystcoop.pudl.helpers:203 Assigned state FIPS codes for 100.00% of records.
2023-09-12 03:57:50 [    INFO] catalystcoop.pudl.helpers:219 Assigned county FIPS codes for 99.61% of records.


In [2]:
br_election_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 188074 entries, 1543 to 82775
Data columns (total 29 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   election_id               188074 non-null  Int64         
 1   election_name             188074 non-null  string        
 2   election_day              188074 non-null  datetime64[ns]
 3   race_id                   188074 non-null  Int64         
 4   is_primary                188074 non-null  boolean       
 5   is_runoff                 188074 non-null  boolean       
 6   is_unexpired              188074 non-null  boolean       
 7   position_id               188074 non-null  Int64         
 8   position_name             188074 non-null  string        
 9   sub_area_name             114957 non-null  string        
 10  sub_area_value            125790 non-null  string        
 11  sub_area_name_secondary   11501 non-null   string        
 12  

In [3]:
br_election_data.nunique()

election_id                   298
election_name                 189
election_day                   87
race_id                     82760
is_primary                      2
is_runoff                       2
is_unexpired                    2
position_id                 37406
position_name               35306
sub_area_name                  41
sub_area_value               2559
sub_area_name_secondary        18
sub_area_value_secondary      183
raw_state                      51
level                           5
tier                            3
is_judicial                     2
is_retention                    2
number_of_seats                19
normalized_position_id        191
normalized_position_name      191
frequency                      15
reference_year                  8
partisan_type                   3
raw_county                   1878
race_created_at             27027
race_updated_at             40152
state_id_fips                  51
county_id_fips               3143
dtype: int64

### Frequency
Looks like it should be a position field, but there is one erroneous value. Apply a manual correction.

In [4]:
freq_group = br_election_data.groupby("position_id")["frequency"]
freq_group.nunique().value_counts()

1    37405
2        1
Name: frequency, dtype: int64

In [5]:
freq_group.unique()[freq_group.nunique() > 1]

position_id
156594    [[4], [2]]
Name: frequency, dtype: object

In [6]:
br_election_data.query("position_id == 156594")

Unnamed: 0,election_id,election_name,election_day,race_id,is_primary,is_runoff,is_unexpired,position_id,position_name,sub_area_name,sub_area_value,sub_area_name_secondary,sub_area_value_secondary,raw_state,level,tier,is_judicial,is_retention,number_of_seats,normalized_position_id,normalized_position_name,frequency,reference_year,partisan_type,raw_county,race_created_at,race_updated_at,state_id_fips,county_id_fips
78383,4317,California General Election,2024-11-05,2020782,False,False,False,156594,San Jose City Mayor,,,,,CA,city,3,False,False,1,1500,City Executive//Mayor,[4],2024,nonpartisan,Santa Clara County,2023-01-26 22:12:14.544,2023-01-26 22:12:14.544,6,6085
78384,5367,California Primary Election,2024-03-05,2020783,True,False,False,156594,San Jose City Mayor,,,,,CA,city,3,False,False,1,1500,City Executive//Mayor,[2],2022,nonpartisan,Santa Clara County,2023-01-26 22:12:14.646,2023-01-26 22:12:14.646,6,6085


Not sure if this is on instance of a non unique frequency is a ballot ready issue or expected.

### reference year

In [7]:
ref_group = br_election_data.groupby("position_id")["reference_year"]
ref_group.nunique().value_counts()

1    37405
2        1
Name: reference_year, dtype: int64

In [8]:
ref_group.unique()[ref_group.nunique() > 1]

position_id
156594    [2024, 2022]
Name: reference_year, dtype: object

Ok same same problem county.

In [9]:
# manually assign a new position id
new_index = br_election_data.position_id.max() + 1
assert new_index not in br_election_data.position_id
br_election_data.loc[br_election_data.race_id == 2020783, "position_id"] = new_index

## Normalization

In [10]:
id_cols = ["election_id", "position_id", "race_id"]
levels = pd.concat([
    br_election_data.groupby(id_col).nunique().le(1).all().rename(id_col) for id_col in id_cols
    ], axis=1)

In [11]:
levels.sort_values(id_cols, ascending=False).dropna()  # dropna just removes the id fields themselves, which are 'missing' due to being in the index

Unnamed: 0,election_id,position_id,race_id
raw_state,True,True,True
state_id_fips,True,True,True
election_name,True,False,True
election_day,True,False,True
position_name,False,True,True
sub_area_name,False,True,True
sub_area_value,False,True,True
sub_area_name_secondary,False,True,True
sub_area_value_secondary,False,True,True
level,False,True,True


In [12]:
# are all fields unique on some level?
levels.dropna().any(axis=1).all()

False

In [13]:
# which ones fail?
levels.loc[~levels.any(axis=1)]

Unnamed: 0,election_id,position_id,race_id
race_id,False,False,
raw_county,False,False,False
county_id_fips,False,False,False


`race_id` is an ID, so it should fail in this test. Counties should be a m:m relationship with districts, which are either position level or maybe race level fields. I'm not sure which one because I'm not sure which (if either) encodes the temporal changes in geography due to redistricting. Have to test it. [update: it's position level. So it doesn't change over time, despite the fact that the underlying districts do.]

In [14]:
# a race is a specific instance of a position in an election. It is a m:m relationship between position and election.
br_election_data.groupby(["election_id", "position_id"])['race_id'].nunique().value_counts()

1    82760
Name: race_id, dtype: int64

In [15]:
election_fields = set(levels.loc[levels.election_id.fillna(False)].index)
election_fields

{'election_day', 'election_name', 'raw_state', 'state_id_fips'}

In [16]:
position_fields = set(levels.loc[levels.position_id.fillna(False)].index) - election_fields - set(id_cols)
position_fields

{'frequency',
 'is_judicial',
 'is_retention',
 'level',
 'normalized_position_id',
 'normalized_position_name',
 'number_of_seats',
 'partisan_type',
 'position_name',
 'reference_year',
 'sub_area_name',
 'sub_area_name_secondary',
 'sub_area_value',
 'sub_area_value_secondary',
 'tier'}

In [17]:
race_fields = set(levels.loc[levels.race_id.fillna(False)].index) - election_fields - position_fields - set(id_cols)
race_fields

{'is_primary',
 'is_runoff',
 'is_unexpired',
 'race_created_at',
 'race_updated_at'}

### Check geography relationships
It turns out that counties are consistent between races for the same position. So it is a position-level attribute.

In [18]:
counties_by_race = br_election_data.groupby(id_cols[1:])['county_id_fips'].agg(lambda x: set(x.unique()))
counties_by_race.shape

(82760,)

In [19]:
counties_by_race.head()

position_id  race_id
2            1472432    {02070, 02013, 02180, 02063, 02158, 02195, 022...
3            1536258    {01079, 01057, 01025, 01067, 01065, 01075, 010...
             1729666    {01079, 01057, 01025, 01067, 01065, 01075, 010...
4            1446390    {04021, 04013, 04027, 04015, 04017, 04019, 040...
5            1377495    {05039, 05109, 05149, 05147, 05043, 05023, 050...
Name: county_id_fips, dtype: object

In [20]:
# Do counties differ within a position group? [no.]
from functools import reduce
diffs = counties_by_race.groupby(level='position_id').agg(lambda x: reduce(set.union, x) - reduce(set.intersection, x))
diffs.apply(len).value_counts()

0    37407
Name: county_id_fips, dtype: int64

## Normalize

### Elections


In [21]:
br_elections = br_election_data.drop_duplicates(subset='election_id')[list(election_fields) + ['election_id']].copy()

assert br_elections.duplicated(subset=list(election_fields)).sum() == 0

### Positions

In [22]:
br_positions = br_election_data.drop_duplicates(subset='position_id')[list(position_fields) + ['position_id']].copy()

In [23]:
br_positions.shape

(37407, 16)

In [24]:
assert br_positions.duplicated(subset=list(position_fields)).sum() == 0

AssertionError: 

In [None]:
# is normalized_position_id the de-duplicated version of position_id? [Update: no]
br_positions.loc[br_positions.duplicated(subset=list(position_fields), keep=False),:].sort_values(list(position_fields))

Unnamed: 0,is_judicial,position_name,sub_area_name,sub_area_value_secondary,sub_area_value,is_retention,tier,frequency,sub_area_name_secondary,number_of_seats,normalized_position_name,reference_year,normalized_position_id,partisan_type,level
515,False,Adair County Sheriff,,,,False,3,[4],,1,County Sheriff,2024,980,partisan,county
10357,False,Adair County Sheriff,,,,False,3,[4],,1,County Sheriff,2024,980,partisan,county
21489,False,Adair County Sheriff,,,,False,3,[4],,1,County Sheriff,2024,980,partisan,county
21175,False,Adams County Auditor,,,,False,3,[4],,1,County Auditor,2024,930,partisan,county
21522,False,Adams County Auditor,,,,False,3,[4],,1,County Auditor,2024,930,partisan,county
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64396,True,Florida Appeals Court Judge - District 1 (Reta...,District,,1,True,2,[6],,1,State Appellate Court Justice - Retention,2024,4052,nonpartisan,state
55388,True,Lincoln County Probate Judge,,,,False,3,[4],,1,County Court Judge - Probate//County Court Jud...,2024,4475,partisan,county
82303,True,Lincoln County Probate Judge,,,,False,3,[4],,1,County Court Judge - Probate//County Court Jud...,2024,4475,partisan,county
65265,True,New York Supreme Court - District 10,District,,10,False,3,[14],,1,State Trial Court Judge - General,2023,4027,partisan,state


In [25]:
br_election_data.groupby('normalized_position_id')['position_id'].nunique().value_counts()

1       41
2       20
4       11
5        8
3        8
        ..
282      1
1535     1
467      1
36       1
141      1
Name: position_id, Length: 84, dtype: int64

In [26]:
br_election_data.groupby('normalized_position_id')[list(position_fields)].nunique().le(1).all()

frequency                   False
position_name               False
sub_area_value_secondary    False
is_retention                 True
reference_year              False
number_of_seats             False
sub_area_value              False
sub_area_name_secondary     False
level                       False
tier                         True
normalized_position_name     True
partisan_type               False
is_judicial                  True
normalized_position_id       True
sub_area_name               False
dtype: bool

I guess I'll just leave the dupes in. The IDs are unique, and I need to be able to define them.

### Races

In [27]:
race_fields_ids = race_fields | set(id_cols)
br_position_election_assoc = br_election_data.drop_duplicates('race_id')[list(race_fields_ids)].copy()

In [28]:
br_position_election_assoc.shape

(82760, 8)

In [29]:
assert br_position_election_assoc.duplicated(subset=race_fields_ids).sum() == 0

### Position : Counties

In [30]:
br_position_county_assoc = br_election_data.groupby(['position_id', 'county_id_fips'], as_index=False)['raw_county'].first()

In [31]:
br_position_county_assoc.shape

(90751, 3)

## Test

In [32]:
# check all columns are accounted for
assert set(br_election_data.columns).symmetric_difference(set(id_cols) | election_fields | position_fields | race_fields | {'raw_county', 'county_id_fips'}) == set()