In [1]:
import numpy as np
import pandas as pd
import matplotlib as mp
import csv
import os
import glob

## Step 1: Reformat and Ingest Data

In [2]:
# Combine county files into single dataframe
# Column names applied to every precinct results file (the files carry no header row).
PRECINCT_COLUMNS = [
    'county_code', 'county_name', 'elec_num', 'elec_date', 'elec_name',
    'precinct_id', 'poll_loc', 'total_reg', 'total_reg_r', 'total_reg_d',
    'total_reg_other', 'contest_name', 'district', 'contest_code',
    'cand_or_issue', 'cand_party', 'cand_id', 'doe_num', 'vote_total',
]


def merge_files(directory):
    """Read every file matching a glob pattern and stack the rows into one dataframe.

    Parameters
    ----------
    directory : str
        Glob pattern (for example ``"some/folder/*"``) handed to ``glob.glob``.

    Returns
    -------
    pandas.DataFrame
        The rows of all matched files, in sorted file-path order. Each file's
        own 0-based row labels are kept, so the resulting index is not unique.
        An empty dataframe is returned when the pattern matches nothing.
    """
    # Sort so the row order does not depend on the filesystem's listing order.
    target_files = sorted(glob.glob(directory))

    # Read everything first and concatenate once; concatenating inside the loop
    # re-copies all previously read rows on every iteration.
    frames = [
        pd.read_table(file, names=PRECINCT_COLUMNS, encoding_errors='replace')
        for file in target_files
    ]

    # pd.concat raises on an empty list, so keep the "no files" result explicit.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [3]:
# Glob pattern for the per-county precinct result files. The default is the
# original author's local checkout; set the FL_PRECINCT_GLOB environment
# variable to point at the data on another machine.
location = os.environ.get(
    "FL_PRECINCT_GLOB",
    "C:\\Users\\canor\\Documents\\GitHub\\FL-Political-Analysis\\Florida Analysis\\FL 2022 by Precinct\\*",
)

df = merge_files(location)

In [4]:
# Examine dataframe for size, dtypes, and NaNs
# (info() reports the row count plus each column's non-null count and dtype)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 609323 entries, 0 to 1031
Data columns (total 19 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   county_code      609323 non-null  object
 1   county_name      609323 non-null  object
 2   elec_num         609323 non-null  int64 
 3   elec_date        609323 non-null  object
 4   elec_name        609323 non-null  object
 5   precinct_id      608549 non-null  object
 6   poll_loc         569737 non-null  object
 7   total_reg        609323 non-null  int64 
 8   total_reg_r      609323 non-null  int64 
 9   total_reg_d      609323 non-null  int64 
 10  total_reg_other  609323 non-null  int64 
 11  contest_name     609323 non-null  object
 12  district         609323 non-null  object
 13  contest_code     609323 non-null  int64 
 14  cand_or_issue    609323 non-null  object
 15  cand_party       609323 non-null  object
 16  cand_id          609323 non-null  int64 
 17  doe_num          

In [5]:
# Examine sample just to get a visual sense of the shape of the data.
# A fixed random_state keeps the displayed rows the same on every re-run.
df.sample(10, random_state=42)

Unnamed: 0,county_code,county_name,elec_num,elec_date,elec_name,precinct_id,poll_loc,total_reg,total_reg_r,total_reg_d,total_reg_other,contest_name,district,contest_code,cand_or_issue,cand_party,cand_id,doe_num,vote_total
30926,PAL,Palm Beach,26906,11/08/2022,2022 General Election,4502,4502,1984,0,0,0,Port of Palm Beach District,Group 4,421340,Doreen Benson,REP,111927282,81541,346
10914,PAS,Pasco,26906,11/08/2022,2022 General Election,103,103,2298,0,0,0,Retention of Craig C. Villanti,Second District Court of Appeal,520208,Yes,NOP,113787369,10,753
2535,BRE,Brevard,26906,11/08/2022,2022 General Election,500,Precinct 500,1775,0,0,0,Governor and Lieutenant Governor,,160000,Roos / Rorabaugh,LPF,109876282,82220,2
25940,DAD,Miami-Dade,26906,11/08/2022,2022 General Election,4590,PRECINCT 459.0,908,0,0,0,Chief Financial Officer,,160500,OverVotes,,0,901,0
749,ORA,Orange,26906,11/08/2022,2022 General Election,632,PCT 632,2734,0,0,0,United States Senator,,120000,Dennis Misigoy,LPF,109912678,79517,6
4635,MRN,Marion,26906,11/08/2022,2022 General Election,4060,4060,3371,0,0,0,Retention of Charles T. Canady,Justice of the Supreme Court,500001,Yes,NOP,113445454,10,751
54014,PAL,Palm Beach,26906,11/08/2022,2022 General Election,3718,3718,2374,0,0,0,Retention of Cory J. Ciklin,Fourth District Court of Appeal,520402,OverVotes,,0,901,1
7276,VOL,Volusia,26906,11/08/2022,2022 General Election,906,906,1763,0,0,0,Retention of Jorge Labarga,Justice of the Supreme Court,500004,OverVotes,,0,901,1
5188,SAR,Sarasota,26906,11/08/2022,2022 General Election,515,515,1831,0,0,0,Retention of Charles T. Canady,Justice of the Supreme Court,500001,Yes,NOP,113445454,10,535
1904,DUV,Duval,26906,11/08/2022,2022 General Election,811,811,2211,0,0,0,Representative in Congress,District 4,140040,UnderVotes,,0,902,20


## Step 2: Removing Duplicates

In [6]:
# Count rows that exactly repeat an earlier row across every column
df.duplicated().sum()

0

No dupes! Great!

## Step 3: Removing Extraneous Columns
Having already cleaned all of the other data, I know that there are a handful of columns we can remove off the bat:

In [7]:
# Party registration breakdowns and the election number/name are not used in later steps
unneeded_columns = [
    'total_reg_r',
    'total_reg_d',
    'total_reg_other',
    'elec_num',
    'elec_name',
]
df = df.drop(columns=unneeded_columns)

In [8]:
# Re-check the schema to confirm the extraneous columns are gone
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 609323 entries, 0 to 1031
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   county_code    609323 non-null  object
 1   county_name    609323 non-null  object
 2   elec_date      609323 non-null  object
 3   precinct_id    608549 non-null  object
 4   poll_loc       569737 non-null  object
 5   total_reg      609323 non-null  int64 
 6   contest_name   609323 non-null  object
 7   district       609323 non-null  object
 8   contest_code   609323 non-null  int64 
 9   cand_or_issue  609323 non-null  object
 10  cand_party     609323 non-null  object
 11  cand_id        609323 non-null  int64 
 12  doe_num        609323 non-null  int64 
 13  vote_total     609323 non-null  int64 
dtypes: int64(5), object(9)
memory usage: 69.7+ MB


## Step 4: Standardizing Contest Names

Since we're only interested in six races (President, Governor, U.S. Senate, State Senate, U.S. Representative, and State Representative), we need to comb through this list and identify any entries in `contest_name` that suggest they refer to one of those races. We could use regex string matching, but I'll do it manually to make sure there isn't some weird description that some random county used that we wouldn't predict.

In [9]:
# Look at all listed races and select those that are relevant
# (unique() returns each distinct contest_name once)
df['contest_name'].unique()

array(['United States Senator', 'Representative in Congress',
       'Governor and Lieutenant Governor', 'Attorney General',
       'Chief Financial Officer', 'Commissioner of Agriculture',
       'State Senator', 'State Representative', 'County Commissioner',
       'Retention of Charles T. Canady', 'Retention of John D. Couriel',
       'Retention of Jamie Grosshans', 'Retention of Jorge Labarga',
       'Retention of Ricky Polston', 'Retention of Ross L. Bilbrey',
       'Retention of Susan Kelsey', 'Retention of Bobby Long',
       'Retention of Lori S. Rowe', 'Retention of Bo Winokur',
       'Circuit Judge', 'Alachua Soil and Water Conservation District',
       'Amendment No. 1: Limitation on Assessment of Real Property Used for Residential Purposes',
       'Amendment No. 2: Abolishing the Constitution Revision Commission',
       'Amendment No. 3: Additional Homestead Property Tax Exemption for Specified Critical Public Services Workforce',
       'School Board', 'Retention of

Above, we can see that there are a lot of races represented in the data. While contest names were not fully standardized prior to 2018, it looks like they are standardized in subsequent elections. To confirm, I manually reviewed the list above for non-standard descriptions that might refer to one of our races; we'll run the following standardization step anyway.

In [10]:
# Canonical contest label -> every alternative spelling that should collapse into it.
contest_name_variants = {
    'President of the United States': ['PRESIDENT OF THE UNITED STATES'],
    'U.S. Senator': ['United States Senator', 'UNITED STATES SENATOR'],
    'U.S. Representative': [
        'Congress 10', 'Congress 9', 'Congress 15', 'Congress 17',
        'Representative in Congress', 'U.S. REPRESENTATIVE',
        'REPRESENTATIVE IN CONGRESS',
    ],
    'Governor and Lieutenant Governor': ['GOVERNOR AND  LT.GOVERNOR', 'Governor'],
    'State Senator': ['STATE SENATOR', 'Senate 14'],
    'State Representative': [
        'STATE REPRESENTATIVE', 'House 39', 'House 40', 'House 41', 'House 42',
    ],
}

# Flatten into a single variant -> canonical lookup. No canonical label is also
# listed as a variant, so one replace pass gives the same result as chaining them.
variant_to_canonical = {
    variant: canonical
    for canonical, variants in contest_name_variants.items()
    for variant in variants
}

df['contest_name'] = df['contest_name'].replace(variant_to_canonical)



## Step 5: Narrowing to Races
And now we can more confidently narrow to our six contest types:

In [11]:
# Contest types we keep for the analysis
target_contests = [
    'President of the United States',
    'U.S. Senator',
    'U.S. Representative',
    'Governor and Lieutenant Governor',
    'State Senator',
    'State Representative',
]

is_target_contest = df['contest_name'].isin(target_contests)

# reset_index() without drop=True keeps the previous row labels as an 'index' column
df = df.loc[is_target_contest].reset_index()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142119 entries, 0 to 142118
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   index          142119 non-null  int64 
 1   county_code    142119 non-null  object
 2   county_name    142119 non-null  object
 3   elec_date      142119 non-null  object
 4   precinct_id    141921 non-null  object
 5   poll_loc       134587 non-null  object
 6   total_reg      142119 non-null  int64 
 7   contest_name   142119 non-null  object
 8   district       142119 non-null  object
 9   contest_code   142119 non-null  int64 
 10  cand_or_issue  142119 non-null  object
 11  cand_party     142119 non-null  object
 12  cand_id        142119 non-null  int64 
 13  doe_num        142119 non-null  int64 
 14  vote_total     142119 non-null  int64 
dtypes: int64(6), object(9)
memory usage: 16.3+ MB


## Step 6: Dealing With NaN Cells

In [12]:
# Collect every row that has a null in at least one column
has_any_null = df.isnull().any(axis='columns')
df_nan = df.loc[has_any_null]

df_nan

Unnamed: 0,index,county_code,county_name,elec_date,precinct_id,poll_loc,total_reg,contest_name,district,contest_code,cand_or_issue,cand_party,cand_id,doe_num,vote_total
5937,0,BRO,Broward,11/08/2022,1,,1715,U.S. Senator,,120000,Marco Rubio,REP,0,81751,590
5938,1,BRO,Broward,11/08/2022,2,,4361,U.S. Senator,,120000,Marco Rubio,REP,0,81751,1710
5939,2,BRO,Broward,11/08/2022,3,,4477,U.S. Senator,,120000,Marco Rubio,REP,0,81751,1093
5940,3,BRO,Broward,11/08/2022,4,,3260,U.S. Senator,,120000,Marco Rubio,REP,0,81751,414
5941,4,BRO,Broward,11/08/2022,5,,3563,U.S. Senator,,120000,Marco Rubio,REP,0,81751,171
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142111,400,WAS,Washington,11/08/2022,,Vernon City Hall,0,State Senator,District 2,240020,UnderVotes,,0,902,5
142112,401,WAS,Washington,11/08/2022,,New Hope - Campbells Park,610,State Senator,District 2,240020,UnderVotes,,0,902,5
142113,402,WAS,Washington,11/08/2022,,Orange Hill Fire Station,1572,State Senator,District 2,240020,UnderVotes,,0,902,9
142114,403,WAS,Washington,11/08/2022,,Armory - Chipley,1545,State Senator,District 2,240020,UnderVotes,,0,902,14


Looks like we have nulls in `poll_loc` and `precinct_id` -- the former is fine since we won't be doing anything with that column immediately, but the latter is unusual so let's take a closer look.

In [15]:
# Narrow to the rows whose precinct_id specifically is missing
precinct_id_missing = df['precinct_id'].isnull()
df_nan_precinct_id = df.loc[precinct_id_missing]

df_nan_precinct_id

Unnamed: 0,index,county_code,county_name,elec_date,precinct_id,poll_loc,total_reg,contest_name,district,contest_code,cand_or_issue,cand_party,cand_id,doe_num,vote_total
141855,0,WAS,Washington,11/08/2022,,Caryville,421,U.S. Senator,,120000,Marco Rubio,REP,0,81751,121
141856,1,WAS,Washington,11/08/2022,,Five Points,1023,U.S. Senator,,120000,Marco Rubio,REP,0,81751,562
141857,2,WAS,Washington,11/08/2022,,Hinson Crossroads,600,U.S. Senator,,120000,Marco Rubio,REP,0,81751,329
141858,3,WAS,Washington,11/08/2022,,First Baptist Church - Chipley,3669,U.S. Senator,,120000,Marco Rubio,REP,0,81751,1742
141859,4,WAS,Washington,11/08/2022,,Vernon City Hall,0,U.S. Senator,,120000,Marco Rubio,REP,0,81751,738
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142111,400,WAS,Washington,11/08/2022,,Vernon City Hall,0,State Senator,District 2,240020,UnderVotes,,0,902,5
142112,401,WAS,Washington,11/08/2022,,New Hope - Campbells Park,610,State Senator,District 2,240020,UnderVotes,,0,902,5
142113,402,WAS,Washington,11/08/2022,,Orange Hill Fire Station,1572,State Senator,District 2,240020,UnderVotes,,0,902,9
142114,403,WAS,Washington,11/08/2022,,Armory - Chipley,1545,State Senator,District 2,240020,UnderVotes,,0,902,14


Looks like all of the nulls here are in Washington County, and they definitely contain vote totals. Let's confirm.

In [16]:
# Any rows missing precinct_id outside Washington (WAS)? An empty frame means there are none.
outside_washington = df_nan_precinct_id['county_code'].ne('WAS')
df_nan_precinct_id.loc[outside_washington]

Unnamed: 0,index,county_code,county_name,elec_date,precinct_id,poll_loc,total_reg,contest_name,district,contest_code,cand_or_issue,cand_party,cand_id,doe_num,vote_total


Confirmed. It's surprising that the precinct info isn't included in the dataset -- we may be able to cross reference the 2020 data using polling locations (which tend not to change much year-to-year, especially in smaller counties like Washington) to get the precinct IDs.

In [32]:
# List the distinct polling locations among the rows that have no precinct_id
df_nan_precinct_id['poll_loc'].unique()


array(['Caryville', 'Five Points', 'Hinson Crossroads',
       'First Baptist Church - Chipley', 'Vernon City Hall',
       'New Hope - Campbells Park', 'Orange Hill Fire Station',
       'Armory - Chipley', 'Wausau'], dtype=object)

In [26]:
# Lookup of number -> polling-location name for Washington County.
# NOTE(review): presumably these numbers are the precinct IDs taken from an
# earlier election's data -- confirm before using them to fill `precinct_id`.
# Several names are shortened relative to the `poll_loc` values in this dataset
# (e.g. 'First Baptist Church' vs 'First Baptist Church - Chipley'), so an
# exact-match lookup against `poll_loc` will need normalizing first.
wash_precincts = {1: "Caryville", 
                  2: 'Five Points', 
                  3: 'Hinson Crossroads', 
                  4: 'First Baptist Church', 
                  5: 'Vernon City Hall',
                  6: 'Campbells Park',
                  7: 'Orange Hill Fire Station',
                  8: 'Armory',
                  9: 'Wausau'}


# TODO: fill the missing `precinct_id` values from wash_precincts. The draft
# below was left unfinished (and dict.items() cannot be indexed with [0]).
# df_nan_precinct_id['precinct_id'] = df_nan_precinct_id.apply(
#     lambda row: wash_precincts.items()[0] if 
# )

# dict_keys has no `.type` attribute (accessing it raises AttributeError);
# display the keys themselves.
wash_precincts.keys()

dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9])

## Step 7: Check for Completeness
Finally, let's just double check to make sure all counties and races are present in our final output:

In [58]:
# Florida has 67 counties: list the ones present and fail loudly if any are missing,
# rather than relying on reading the printed count.
counties = df['county_name'].unique()
print(counties)
print(len(counties))
assert len(counties) == 67, f"expected 67 counties, found {len(counties)}"
# All counties are present and accounted for 

['Alachua' 'Baker' 'Bay' 'Bradford' 'Brevard' 'Broward' 'Calhoun'
 'Charlotte' 'Citrus' 'Clay' 'Collier' 'Columbia' 'Miami-Dade' 'Desoto'
 'Dixie' 'Duval' 'Escambia' 'Flagler' 'Franklin' 'Gadsden' 'Gilchrist'
 'Glades' 'Gulf' 'Hamilton' 'Hardee' 'Hendry' 'Hernando' 'Highlands'
 'Hillsborough' 'Holmes' 'Indian River' 'Jackson' 'Jefferson' 'Lafayette'
 'Lake' 'Lee' 'Leon' 'Levy' 'Liberty' 'Madison' 'Manatee' 'Monroe'
 'Marion' 'Martin' 'Nassau' 'Okaloosa' 'Okeechobee' 'Orange' 'Osceola'
 'Palm Beach' 'Pasco' 'Pinellas' 'Polk' 'Putnam' 'Santa Rosa' 'Sarasota'
 'Seminole' 'St. Johns' 'St. Lucie' 'Sumter' 'Suwannee' 'Taylor' 'Union'
 'Volusia' 'Wakulla' 'Walton' 'Washington']
67


All 67 counties are represented in the dataset, so this looks complete.

In [59]:
# Distinct contest types left after filtering, followed by how many there are
races = df['contest_name'].unique()
print(races, len(races), sep='\n')

['U.S. Senator' 'U.S. Representative' 'Governor and Lieutenant Governor'
 'State Senator' 'State Representative']
5


2022 had no presidential contest, so this looks complete.

## Step 8: Save to CSV

In [60]:
# df.to_csv('fl_2022_cleaned.csv')