In [1]:
import pandas as pd
import dbcp

# Archived BallotReady CSV (the file name carries the snapshot date 08_14_2023).
source_uri = "gs://dgm-archive/ballot_ready/BallotReady_upcoming_races_with_counties_08_14_2023.csv"
# extract() returns a mapping of tables; the raw frame is stored under "raw_ballot_ready".
raw_dfs = dbcp.extract.ballot_ready.extract(source_uri)
raw_ballot_ready = raw_dfs["raw_ballot_ready"]
# NOTE(review): _explode_counties is a private helper of the transform module;
# presumably it yields one row per race/county pair — confirm against dbcp.
br_election_data = dbcp.transform.ballot_ready._explode_counties(raw_ballot_ready)


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd
2023-09-01 15:36:43 [    INFO] catalystcoop.pudl.helpers:203 Assigned state FIPS codes for 100.00% of records.
2023-09-01 15:36:45 [    INFO] catalystcoop.pudl.helpers:219 Assigned county FIPS codes for 99.61% of records.


In [18]:
# Summarize the columns, non-null counts and dtypes of the exploded frame.
br_election_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 188074 entries, 1543 to 82775
Data columns (total 30 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   election_id               188074 non-null  Int64         
 1   election_name             188074 non-null  string        
 2   election_day              188074 non-null  datetime64[ns]
 3   race_id                   188074 non-null  Int64         
 4   is_primary                188074 non-null  boolean       
 5   is_runoff                 188074 non-null  boolean       
 6   is_unexpired              188074 non-null  boolean       
 7   position_id               188074 non-null  Int64         
 8   position_name             188074 non-null  string        
 9   sub_area_name             114957 non-null  string        
 10  sub_area_value            125790 non-null  string        
 11  sub_area_name_secondary   11501 non-null   string        
 12  

In [5]:
# Flag missing county names.
# NOTE(review): the recorded output has two columns (raw_county, raw_county.1), so
# `raw_county` appears to be a duplicated column label in this frame and the
# expression returns a DataFrame rather than a Series — confirm upstream.
br_election_data.raw_county.isna()

Unnamed: 0,raw_county,raw_county.1
1543,False,False
1545,False,False
1546,False,False
9958,False,False
41760,False,False
...,...,...
82771,False,True
82772,False,True
82773,False,True
82774,False,True


## Elections

In [2]:
# Column overview (non-null counts, dtypes) before checking which fields belong to each entity.
br_election_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 188790 entries, 0 to 82775
Data columns (total 29 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   election_id               188790 non-null  Int64         
 1   election_name             188790 non-null  string        
 2   election_day              188790 non-null  datetime64[ns]
 3   race_id                   188790 non-null  Int64         
 4   is_primary                188790 non-null  boolean       
 5   is_runoff                 188790 non-null  boolean       
 6   is_unexpired              188790 non-null  boolean       
 7   position_id               188790 non-null  Int64         
 8   position_name             188790 non-null  string        
 9   sub_area_name             115480 non-null  string        
 10  sub_area_value            126339 non-null  string        
 11  sub_area_name_secondary   11501 non-null   string        
 12  sub

In [3]:
# Columns that describe an election.
election_fields = ["election_id", "election_name", "election_day"]
# Every election_id must carry at most one distinct value in each election column.
br_election_data.groupby("election_id")[election_fields].nunique().le(1).all()

election_id      True
election_name    True
election_day     True
dtype: bool

## Position

In [4]:
# Columns that describe a position.
position_fields = [
    "reference_year", "position_id", "position_name",
    "sub_area_name", "sub_area_value",
    "sub_area_name_secondary", "sub_area_value_secondary",
    "raw_state", "level", "tier",
    "is_judicial", "is_retention",
    "normalized_position_id", "normalized_position_name",
    "frequency", "partisan_type",
]

# Every position_id must carry at most one distinct value in each position column.
br_election_data.groupby("position_id")[position_fields].nunique().le(1).all()

reference_year              False
position_id                  True
position_name                True
sub_area_name                True
sub_area_value               True
sub_area_name_secondary      True
sub_area_value_secondary     True
raw_state                    True
level                        True
tier                         True
is_judicial                  True
is_retention                 True
normalized_position_id       True
normalized_position_name     True
frequency                   False
partisan_type                True
dtype: bool

### Frequency

In [5]:
# Group the frequency column by position so its per-position variation can be inspected.
freq_group = br_election_data.groupby("position_id")["frequency"]
# How many positions have 1, 2, ... distinct frequency values.
freq_group.nunique().value_counts()

1    37413
2        1
Name: frequency, dtype: int64

In [6]:
# List the distinct frequency values for the positions that have more than one.
freq_group.unique()[freq_group.nunique() > 1]

position_id
156594    [[4], [2]]
Name: frequency, dtype: object

In [7]:
# Show every row for position 156594, the id flagged by the frequency check.
br_election_data.query("position_id == 156594")

Unnamed: 0,election_id,election_name,election_day,race_id,is_primary,is_runoff,is_unexpired,position_id,position_name,sub_area_name,sub_area_value,sub_area_name_secondary,sub_area_value_secondary,raw_state,level,tier,is_judicial,is_retention,number_of_seats,normalized_position_id,normalized_position_name,frequency,reference_year,partisan_type,raw_county,race_created_at,race_updated_at,state_id_fips,county_id_fips
78383,4317,California General Election,2024-11-05,2020782,False,False,False,156594,San Jose City Mayor,,,,,CA,city,3,False,False,1,1500,City Executive//Mayor,[4],2024,nonpartisan,Santa Clara County,2023-01-26 22:12:14.544,2023-01-26 22:12:14.544,6,6085
78384,5367,California Primary Election,2024-03-05,2020783,True,False,False,156594,San Jose City Mayor,,,,,CA,city,3,False,False,1,1500,City Executive//Mayor,[2],2022,nonpartisan,Santa Clara County,2023-01-26 22:12:14.646,2023-01-26 22:12:14.646,6,6085


Not sure whether this one instance of a non-unique frequency is a BallotReady data issue or expected behavior.

In [15]:
# Give the conflicting primary race (race_id 2020783) its own, previously unused
# position_id so that frequency/reference_year become unique per position.
# NOTE: this mutates br_election_data in place, so later cells see the new id.
new_index = br_election_data.position_id.max() + 1
# `x in series` tests the index labels, not the values, so compare against the
# column values explicitly to verify that the new id is really unused.
assert not br_election_data.position_id.eq(new_index).any()
br_election_data.loc[br_election_data.race_id == 2020783, "position_id"] = new_index

In [16]:
# Repeat the per-position consistency check: at most one distinct value per column.
br_election_data.groupby("position_id")[position_fields].nunique().le(1).all()

reference_year              True
position_id                 True
position_name               True
sub_area_name               True
sub_area_value              True
sub_area_name_secondary     True
sub_area_value_secondary    True
raw_state                   True
level                       True
tier                        True
is_judicial                 True
is_retention                True
normalized_position_id      True
normalized_position_name    True
frequency                   True
partisan_type               True
dtype: bool

### reference year

In [38]:
# Group the reference_year column by position to inspect its per-position variation.
ref_group = br_election_data.groupby("position_id")["reference_year"]
# How many positions have 1, 2, ... distinct reference years.
ref_group.nunique().value_counts()

1    37413
2        1
Name: reference_year, dtype: int64

In [39]:
# List the distinct reference years for the positions that have more than one.
ref_group.unique()[ref_group.nunique() > 1]

position_id
156594    [2024, 2022]
Name: reference_year, dtype: object

OK, this is the same problem on the same position (156594, the Santa Clara County rows shown above).

## Race

In [57]:
# Columns that describe a race, including its location columns.
race_fields = [
    "race_id", "is_primary", "is_runoff", "is_unexpired",
    "number_of_seats", "race_created_at", "race_updated_at",
    "raw_state", "raw_county", "state_id_fips", "county_id_fips",
]

# Every race_id must carry at most one distinct value in each race column.
br_election_data.groupby("race_id")[race_fields].nunique().le(1).all()

race_id             True
is_primary          True
is_runoff           True
is_unexpired        True
number_of_seats     True
race_created_at     True
race_updated_at     True
raw_state           True
raw_county         False
state_id_fips       True
county_id_fips     False
dtype: bool

In [65]:
# Races that span more than one county, with their number of distinct counties.
# Select raw_county before aggregating so nunique() is computed for that column
# only, instead of for every race column and then discarding all but one.
br_election_data.groupby("race_id")["raw_county"].nunique().to_frame().query("raw_county > 1")

Unnamed: 0_level_0,raw_county
race_id,Unnamed: 1_level_1
1368973,5
1368974,4
1368976,39
1368979,39
1368981,6
...,...
2778001,14
2778002,6
2778003,6
2778004,5


## Normalize

### Elections


In [67]:
# Elections table: keep the election columns, then one row per distinct combination.
br_elections = br_election_data[election_fields].drop_duplicates().copy()

assert br_elections.election_id.is_unique

### Positions

In [51]:
# Positions table: one row per distinct combination of the position columns.
br_positions = br_election_data.drop_duplicates(subset=position_fields)[position_fields].copy()

# position_id is the intended key. Name the offending ids in the failure message,
# because a bare assert only reports an empty AssertionError.
assert br_positions.position_id.is_unique, (
    "position_id is not unique in br_positions; duplicated ids: "
    f"{br_positions.position_id[br_positions.position_id.duplicated()].unique().tolist()}"
)

AssertionError: 

### Races

In [73]:
# Add the foreign keys to the race columns. dict.fromkeys() removes repeats while
# keeping order, so re-running this cell does not append the keys a second time
# (which would yield duplicated column labels in br_races).
race_fields = list(dict.fromkeys([*race_fields, "election_id", "position_id"]))
# Races table: one row per distinct combination of the race columns.
br_races = br_election_data.drop_duplicates(subset=race_fields)[race_fields].copy()

In [78]:
# True when de-duplicating on the race columns dropped no rows from the source frame.
len(br_races) == len(br_election_data)

True

In [79]:
# Display the normalized races table.
br_races

Unnamed: 0,race_id,is_primary,is_runoff,is_unexpired,number_of_seats,race_created_at,race_updated_at,raw_state,raw_county,state_id_fips,county_id_fips,election_id,position_id
0,1365674,False,False,False,3,2020-01-14 23:08:21.016,2021-09-30 19:37:57.406,IL,Macon County,17,17115,4206,247556
1,1365754,False,False,False,1,2020-01-14 23:08:21.016,2023-02-02 19:54:36.541,IL,Cook County,17,17031,4206,226054
2,1365856,False,False,False,1,2020-01-14 23:08:21.016,2023-02-02 19:54:49.430,IL,Cook County,17,17031,4206,226051
3,1365861,False,False,False,1,2020-01-14 23:08:21.016,2023-02-02 19:53:14.016,IL,Cook County,17,17031,4206,226035
4,1365863,False,False,False,1,2020-01-14 23:08:21.016,2023-02-02 19:53:39.554,IL,Cook County,17,17031,4206,226030
...,...,...,...,...,...,...,...,...,...,...,...,...,...
82771,2778453,False,True,False,1,2023-08-09 23:45:46.281,2023-08-09 23:45:46.281,TN,Davidson County,47,47037,5002,233401
82772,2778454,False,True,False,1,2023-08-09 23:45:56.423,2023-08-09 23:45:56.423,TN,Davidson County,47,47037,5002,233376
82773,2778472,False,False,False,1,2023-08-10 17:01:23.443,2023-08-10 17:01:23.443,SC,Greenville County,45,45045,4693,416058
82774,2778473,True,False,False,1,2023-08-10 17:01:23.478,2023-08-10 17:01:23.478,SC,Greenville County,45,45045,5392,416058


## Test

In [6]:
import pandas as pd
import dbcp

# NOTE(review): same source file as the first cell; here the full transform is run on it.
source_uri = "gs://dgm-archive/ballot_ready/BallotReady_upcoming_races_with_counties_08_14_2023.csv"
# raw_df is the mapping returned by extract(); a later cell indexes it with "raw_ballot_ready".
raw_df = dbcp.extract.ballot_ready.extract(source_uri)
# transform() returns a mapping of tables; later cells call .keys() and .values() on it.
transformed = dbcp.transform.ballot_ready.transform(raw_df)

2023-09-01 15:37:47 [    INFO] catalystcoop.pudl.helpers:203 Assigned state FIPS codes for 100.00% of records.
2023-09-01 15:37:49 [    INFO] catalystcoop.pudl.helpers:219 Assigned county FIPS codes for 99.61% of records.


In [7]:
# Shape (rows, columns) of the raw BallotReady frame.
raw_df["raw_ballot_ready"].shape

(82776, 29)

## construct data mart table

In [19]:
import pandas as pd
import dbcp

# NOTE(review): identical to the extract/transform cell in the Test section;
# running it again only rebinds `raw_df` and `transformed`.
source_uri = "gs://dgm-archive/ballot_ready/BallotReady_upcoming_races_with_counties_08_14_2023.csv"
raw_df = dbcp.extract.ballot_ready.extract(source_uri)
transformed = dbcp.transform.ballot_ready.transform(raw_df)

2023-09-01 15:46:39 [    INFO] catalystcoop.pudl.helpers:203 Assigned state FIPS codes for 100.00% of records.
2023-09-01 15:46:41 [    INFO] catalystcoop.pudl.helpers:219 Assigned county FIPS codes for 99.61% of records.


In [29]:
# Load the three BallotReady tables from the data_warehouse schema over a single
# connection, in the order elections, races, positions.
engine = dbcp.helpers.get_sql_engine()
with engine.connect() as con:
    br_elections, br_races, br_positions = (
        pd.read_sql_table(table_name, con, schema="data_warehouse")
        for table_name in ("br_elections", "br_races", "br_positions")
    )

In [30]:
# Denormalize: attach election and position attributes to every race row.
# validate="m:1" makes pandas raise if a key is not unique on the right-hand table.
br_election_data = (
    br_races
    .merge(br_elections, how="left", on="election_id", validate="m:1")
    .merge(br_positions, how="left", on="position_id", validate="m:1")
)

In [32]:
# Column overview (non-null counts, dtypes) of the merged frame.
br_election_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 188074 entries, 0 to 188073
Data columns (total 29 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   race_id                   188074 non-null  int64         
 1   raw_county                188074 non-null  object        
 2   is_primary                188074 non-null  bool          
 3   is_runoff                 188074 non-null  bool          
 4   is_unexpired              188074 non-null  bool          
 5   number_of_seats           188074 non-null  int64         
 6   race_created_at           188074 non-null  datetime64[ns]
 7   race_updated_at           188074 non-null  datetime64[ns]
 8   raw_state                 188074 non-null  object        
 9   state_id_fips             188074 non-null  object        
 10  county_id_fips            188074 non-null  object        
 11  election_id               188074 non-null  int64         
 12  po

In [8]:
# Names of the tables returned by transform().
transformed.keys()

dict_keys(['br_elections', 'br_positions', 'br_races'])

In [9]:
# Print the column index of each transformed table, followed by a blank line.
for df in transformed.values():
    print(df.columns, end="\n\n")

Index(['election_id', 'election_name', 'election_day'], dtype='object')

Index(['reference_year', 'position_id', 'position_name', 'sub_area_name', 'sub_area_value', 'sub_area_name_secondary', 'sub_area_value_secondary', 'raw_state', 'level', 'tier', 'is_judicial', 'is_retention', 'normalized_position_id', 'normalized_position_name', 'frequency', 'partisan_type'], dtype='object')

Index(['race_id', 'is_primary', 'is_runoff', 'is_unexpired', 'number_of_seats', 'race_created_at', 'race_updated_at', 'raw_state', 'raw_county', 'raw_county', 'state_id_fips', 'county_id_fips', 'election_id', 'position_id'], dtype='object')



In [10]:
# Count duplicated (race_id, raw_county) rows; a result containing only False means none.
# NOTE(review): the printed br_races columns list `raw_county` twice, so this
# subset may pick up both copies of the column — confirm.
transformed["br_races"].duplicated(subset=["race_id", "raw_county"]).value_counts()

False    188074
dtype: int64

## Fips business

In [11]:
# Archived Census county file (tl_2021_us_county per the file name).
census_uri = "gs://dgm-archive/census/tl_2021_us_county.zip"
# NOTE(review): _extract_census_counties is a private helper of dbcp.extract.fips_tables;
# the returned frame is queried by GEOID in the next cell.
fips = dbcp.extract.fips_tables._extract_census_counties(census_uri)

In [12]:
# Look up the census county record whose GEOID is 02066.
fips.query("GEOID == '02066'")

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,GEOID,NAME,NAMELSAD,LSAD,CLASSFP,MTFCC,CSAFP,CBSAFP,METDIVFP,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
2956,2,66,2804401,2066,Copper River,Copper River Census Area,5,H5,G4020,,,,S,63952335592,1217429937,62.034479,-143.9221674,"POLYGON ((-147.74430 61.42628, -147.71096 61.4..."


In [13]:
# Work on the races table produced by transform().
br_races = transformed["br_races"]

# Rows carrying county FIPS 02261; the recorded output of this cell is empty.
br_races.query("county_id_fips == '02261'")

Unnamed: 0,race_id,is_primary,is_runoff,is_unexpired,number_of_seats,race_created_at,race_updated_at,raw_state,raw_county,raw_county.1,state_id_fips,county_id_fips,election_id,position_id


In [14]:
# Independent copy of the 02261 rows; the corrections loop further down copies and re-labels it.
valdez = br_races.query("county_id_fips == '02261'").copy()
valdez.shape

(0, 14)

In [15]:
# Rebind br_races to a copy without the 02261 rows; `valdez` still holds those rows.
br_races = br_races[br_races.county_id_fips != '02261'].copy()

In [16]:
# Replacement county name / FIPS pairs applied to the rows held in `valdez`.
valdez_corrections = [
    {'raw_county': 'Chugach Census Area', 'county_id_fips': '02063'},
    {'raw_county': 'Copper River Census Area', 'county_id_fips': '02066'},
]

# Build one full copy of the valdez rows per correction, overwriting the listed fields.
valdez_corrections_dfs = []
for cor in valdez_corrections:
    corrected_df = valdez.copy()
    for field, value in cor.items():
        # Scalar assignment: every row of the copy receives the same value.
        # NOTE(review): the recorded concat output shows two raw_county columns
        # (raw_county, raw_county.1) with only one of them updated — verify how
        # this assignment behaves when the label is duplicated.
        corrected_df[field] = value
    valdez_corrections_dfs.append(corrected_df)

In [17]:
# Preview the corrected rows stacked on top of the remaining races.
# NOTE: the result is only displayed here; it is not assigned to a variable.
pd.concat(valdez_corrections_dfs + [br_races])

Unnamed: 0,race_id,is_primary,is_runoff,is_unexpired,number_of_seats,race_created_at,race_updated_at,raw_state,raw_county,raw_county.1,state_id_fips,county_id_fips,election_id,position_id
1543,1385101,False,False,False,1,2020-01-14 23:09:11.733,2021-09-30 19:37:57.406,AK,Valdez-Cordova Census Area,Chugach Census Area,02,02063,4325,225906
1545,1385103,False,False,False,1,2020-01-14 23:09:11.733,2021-09-30 19:37:57.406,AK,Valdez-Cordova Census Area,Chugach Census Area,02,02063,4325,225908
1546,1385104,False,False,False,1,2020-01-14 23:09:11.733,2021-09-30 19:37:57.406,AK,Valdez-Cordova Census Area,Chugach Census Area,02,02063,4325,225909
9958,1472432,False,False,False,1,2020-01-14 23:13:16.097,2020-01-14 23:13:16.097,AK,Valdez-Cordova Census Area,Chugach Census Area,02,02063,4325,2
41760,1675001,False,False,False,1,2020-01-15 22:48:10.749,2020-01-15 22:48:10.749,AK,Valdez-Cordova Census Area,Chugach Census Area,02,02063,4325,46255
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82771,2778453,False,True,False,1,2023-08-09 23:45:46.281,2023-08-09 23:45:46.281,TN,Davidson County,,47,47037,5002,233401
82772,2778454,False,True,False,1,2023-08-09 23:45:56.423,2023-08-09 23:45:56.423,TN,Davidson County,,47,47037,5002,233376
82773,2778472,False,False,False,1,2023-08-10 17:01:23.443,2023-08-10 17:01:23.443,SC,Greenville County,,45,45045,4693,416058
82774,2778473,True,False,False,1,2023-08-10 17:01:23.478,2023-08-10 17:01:23.478,SC,Greenville County,,45,45045,5392,416058


In [34]:
# Display the races table currently bound to br_races.
br_races

Unnamed: 0,race_id,is_primary,is_runoff,is_unexpired,number_of_seats,race_created_at,race_updated_at,raw_state,raw_county,state_id_fips,county_id_fips,election_id,position_id
0,1365674,False,False,False,3,2020-01-14 23:08:21.016,2021-09-30 19:37:57.406,IL,Macon County,17,17115,4206,247556
1,1365754,False,False,False,1,2020-01-14 23:08:21.016,2023-02-02 19:54:36.541,IL,Cook County,17,17031,4206,226054
2,1365856,False,False,False,1,2020-01-14 23:08:21.016,2023-02-02 19:54:49.430,IL,Cook County,17,17031,4206,226051
3,1365861,False,False,False,1,2020-01-14 23:08:21.016,2023-02-02 19:53:14.016,IL,Cook County,17,17031,4206,226035
4,1365863,False,False,False,1,2020-01-14 23:08:21.016,2023-02-02 19:53:39.554,IL,Cook County,17,17031,4206,226030
...,...,...,...,...,...,...,...,...,...,...,...,...,...
82771,2778453,False,True,False,1,2023-08-09 23:45:46.281,2023-08-09 23:45:46.281,TN,Davidson County,47,47037,5002,233401
82772,2778454,False,True,False,1,2023-08-09 23:45:56.423,2023-08-09 23:45:56.423,TN,Davidson County,47,47037,5002,233376
82773,2778472,False,False,False,1,2023-08-10 17:01:23.443,2023-08-10 17:01:23.443,SC,Greenville County,45,45045,4693,416058
82774,2778473,True,False,False,1,2023-08-10 17:01:23.478,2023-08-10 17:01:23.478,SC,Greenville County,45,45045,5392,416058
