In [2]:
import pandas as pd
from google.cloud import storage

In [74]:
# Save old Ballot Ready data locally
client = storage.Client()
bucket = client.bucket("dgm-archive")
# versions=True returns every object version as a separate blob, and `prefix` can match
# more than one object, so materialize the listing and check it before writing to disk.
blobs = list(bucket.list_blobs(prefix="ballot_ready/Climate Partners_Upcoming Races_All Tiers_20240524.csv", versions=True))
if len(blobs) != 1:
    # There should only be one file that has this name. The previous guard used a bare
    # `exit` (never called), so extra matches silently overwrote the local file.
    raise RuntimeError(
        f"Expected exactly one matching blob, found {len(blobs)}: "
        f"{[(blob.name, blob.generation) for blob in blobs]}"
    )
blobs[0].download_to_filename("ballot_ready_2024_05_24.csv")

In [15]:
# Save new Ballot Ready data locally (2025-2026 extract; loaded below as `new_br`)
client = storage.Client()
bucket = client.bucket("dgm-archive")
# versions=True returns every object version as a separate blob, and `prefix` can match
# more than one object, so materialize the listing and check it before writing to disk.
blobs = list(bucket.list_blobs(prefix="ballot_ready/Climate Partners_Upcoming Races_2025-2026_20240826.csv", versions=True))
if len(blobs) != 1:
    # There should only be one file that has this name. The previous guard used a bare
    # `exit` (never called), so extra matches silently overwrote the local file.
    raise RuntimeError(
        f"Expected exactly one matching blob, found {len(blobs)}: "
        f"{[(blob.name, blob.generation) for blob in blobs]}"
    )
blobs[0].download_to_filename("ballot_ready_2024_08_26.csv")



In [16]:
# Load both Ballot Ready extracts downloaded above:
#   old_br <- ballot_ready_2024_05_24.csv
#   new_br <- ballot_ready_2024_08_26.csv
old_br, new_br = (
    pd.read_csv(csv_path)
    for csv_path in ("ballot_ready_2024_05_24.csv", "ballot_ready_2024_08_26.csv")
)

  old_br = pd.read_csv("ballot_ready_2024_05_24.csv")


In [None]:
# Confirm the data doesn't overlap: for each key, print the rows of the new extract
# whose value also appears in the old extract. An empty frame means no overlap.
for shared_key in ("election_id", "race_id", "election_day"):  # elections, races, election dates
    also_in_old = new_br[shared_key].isin(old_br[shared_key])
    print(new_br[also_in_old])

Empty DataFrame
Columns: [id, election_id, election_name, election_day, race_id, geofence_id, is_primary, is_runoff, is_unexpired, position_id, mtfcc, geo_id, position_name, sub_area_name, sub_area_value, sub_area_name_secondary, sub_area_value_secondary, state, level, tier, is_judicial, is_retention, number_of_seats, normalized_position_id, normalized_position_name, position_description, frequency, reference_year, partisan_type, counties, race_created_at, race_updated_at]
Index: []

[0 rows x 32 columns]
Empty DataFrame
Columns: [id, election_id, election_name, election_day, race_id, geofence_id, is_primary, is_runoff, is_unexpired, position_id, mtfcc, geo_id, position_name, sub_area_name, sub_area_value, sub_area_name_secondary, sub_area_value_secondary, state, level, tier, is_judicial, is_retention, number_of_seats, normalized_position_id, normalized_position_name, position_description, frequency, reference_year, partisan_type, counties, race_created_at, race_updated_at]
Index: []



In [None]:
# Cross-check the no-overlap result using the election dates each extract covers.
last_old_election_day = old_br.election_day.max()
first_new_election_day = new_br.election_day.min()
print(f"Final election date in old data: {last_old_election_day}")
print(f"First election date in new  data: {first_new_election_day}")

Final election date in old data: 2024-12-14
First election date in new  data: 2025-02-04


In [78]:
# Compare the two schemas in both directions.
columns_added = set(new_br.columns).difference(old_br.columns)      # only in the new extract
columns_dropped = set(old_br.columns).difference(new_br.columns)    # only in the old extract - none!
print(f"New columns in the new data: {columns_added}")
print(f"Columns no longer in the new data: {columns_dropped}")

New columns in the new data: set()
Columns no longer in the new data: set()


There are some geographic fields in the data that we aren't currently using. The [Ballot Ready](https://support.ballotready.org/interpreting-mtfcc-and-geoid) documentation notes:

"Mtfcc and geo_id fields should be treated as pairs. Meaning that there could be more than one record in the census file with the same geo_id, but the mtfcc value identifies the type of census entity. BallotReady datasets should be joined to the census file on both the mtfcc and geo_id."

"mtfcc values that start with X will not have any corresponding entry in the census file. These mtfcc/geo_id pairs are for custom boundaries that BallotReady collected, that are not available via the census. Note that there's not one clear explanation about how to use the custom mtfcc values."

In [None]:
# mtfcc is a 5 digit MAF/TIGER feature class code.
# Codes starting with X come from Ballot Ready's own research.
new_br["mtfcc"].head(5)

0    G5420
1    G5420
2    X0102
3    X0102
4    G5420
Name: mtfcc, dtype: object

In [None]:
# GEO IDs vary in length based on what information they contain.
# https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html
#
# Length legend for the counts displayed below:
#    2: State FIPS
#    4: State FIPS + Congressional district
#    5: State FIPS + County FIPS
#    7: State FIPS + 5-digit place
#    8: Not a valid length described by the Census - e.g., 53059.C7 - need to be normalized
#    9: Not a valid length described by the Census - e.g., 4205-2-13 - need to be normalized
#   10: State FIPS + County FIPS + County sub-division
#   12: State FIPS + County FIPS + Tract + Block Group
#   13: Not a valid length described by the Census - e.g., 53063.8 R/S/B - need to be normalized
#   15: State FIPS + County FIPS + Tract + Block
#   16: State FIPS + County FIPS + Tract + Block + Suffix
geo_id_lengths = new_br["geo_id"].str.len()
geo_id_lengths.value_counts().sort_index()

2        770
4        992
5      64599
7     102909
8          2
9         12
10     75211
12      8605
13         4
15        14
16         6
Name: geo_id, dtype: int64

Regarding the `geofence_id`, the documentation notes:
"Depending on the scope of your export, there can be multiple geofences for the same mtfcc/geo_id pair that are distinguished by the valid_from and valid_to fields. That's how we track how the boundaries for a given political jurisdiction can change over time (due to redistricting, annexations, etc.)."

In [56]:
# True when no (race_id, geofence_id) combination appears on more than one row.
# NOTE(review): this shows the pairs are unique, not that each race_id maps to a single
# geofence_id; a per-race count of distinct geofence_id values would confirm that.
race_geofence_pairs = new_br.set_index(['race_id', 'geofence_id']).index
race_geofence_pairs.is_unique

True

We don't see `valid_to` and `valid_from` fields in our CSV, and each race is only associated with one `geofence_id` in the data sample. For now, we use these geographic fields only to validate our geocoding; until we need more granular data, they don't appear to serve our use case any better than the existing geocoding workflow.

### Transformed Data

Let's compare the geo ID and the geocoded state and county FIPS columns to ensure geocoding works as expected.

In [10]:
# Load the transformed data mart output and show the first and last election day it covers.
transformed_br = pd.read_parquet('../../../data/output/data_mart/br_election_data.parquet')
transformed_election_days = transformed_br.election_day
print(transformed_election_days.min(), transformed_election_days.max(), sep="\n")

2025-02-04 00:00:00
2026-12-12 00:00:00


In [13]:
# Print a schema summary of the old Ballot Ready extract.
# NOTE(review): the recorded output is a NameError, so this cell was last run in a kernel
# where `old_br` had not been loaded; run the CSV import cell first.
old_br.info()

NameError: name 'old_br' is not defined

In [12]:
# Print a schema summary of the transformed frame: row count, per-column non-null counts, and dtypes.
transformed_br.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226079 entries, 0 to 226078
Data columns (total 31 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   race_id                   226079 non-null  Int64         
 1   is_primary                226079 non-null  boolean       
 2   is_runoff                 226079 non-null  boolean       
 3   is_unexpired              226079 non-null  boolean       
 4   number_of_seats           226079 non-null  Int64         
 5   race_created_at           226079 non-null  datetime64[ns]
 6   race_updated_at           226079 non-null  datetime64[ns]
 7   election_id               226079 non-null  Int64         
 8   position_id               226079 non-null  Int64         
 9   election_name             226079 non-null  string        
 10  election_day              226079 non-null  datetime64[ns]
 11  position_name             226079 non-null  string        
 12  re