In [18]:
import numpy as np
import pandas as pd
import matplotlib as mp
import csv
import os
import glob

## Step 1: Reformat and Ingest Data

In [19]:
# Combine county files into single dataframe
def merge_files(directory):
    """Read every file matching a glob pattern and stack them into one DataFrame.

    Parameters
    ----------
    directory : str
        Glob pattern handed to ``glob.glob`` (for example ``"some/folder/*"``).
        Every match is read with ``pd.read_table`` as a headerless file with
        19 columns.

    Returns
    -------
    pandas.DataFrame
        The rows of all matched files, stacked in sorted path order. Each row
        keeps the row label it had inside its own file, so labels repeat
        across files. An empty DataFrame is returned when nothing matches.
    """
    column_names = [
        'county_code', 'county_name', 'elec_num', 'elec_date', 'elec_name',
        'precinct_id', 'poll_loc', 'total_reg', 'total_reg_r', 'total_reg_d',
        'total_reg_other', 'contest_name', 'district', 'contest_code',
        'cand_or_issue', 'cand_party', 'cand_id', 'doe_num', 'vote_total',
    ]

    # glob.glob gives no ordering guarantee; sorting keeps the row order
    # identical from run to run and machine to machine.
    target_files = sorted(glob.glob(directory))
    if not target_files:
        # pd.concat raises on an empty list, so keep the empty-frame result.
        return pd.DataFrame()

    # Read everything first and concatenate once: concatenating inside the
    # loop re-copies all previously loaded rows on every iteration.
    frames = [
        pd.read_table(file, names=column_names, encoding_errors='replace')
        for file in target_files
    ]
    return pd.concat(frames)

In [20]:
# Glob pattern for the raw 2020 precinct files. It defaults to the original
# local checkout; set the FL_2020_PRECINCT_GLOB environment variable to load
# the files from somewhere else without editing this cell.
location = os.environ.get(
    "FL_2020_PRECINCT_GLOB",
    "C:\\Users\\canor\\Documents\\GitHub\\FL-Political-Analysis\\Florida Analysis\\FL 2020 by Precinct\\*",
)

df = merge_files(location)

# Stop here with a clear message if the pattern matched nothing, rather than
# failing later on a missing column.
if df.empty:
    raise FileNotFoundError(f"No precinct data could be loaded from {location!r}")

In [21]:
# Examine dataframe for size, dtypes, and NaNs
# (the per-column non-null counts reveal which columns contain missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 553307 entries, 0 to 839
Data columns (total 19 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   county_code      553307 non-null  object
 1   county_name      553307 non-null  object
 2   elec_num         553307 non-null  int64 
 3   elec_date        553307 non-null  object
 4   elec_name        553307 non-null  object
 5   precinct_id      553307 non-null  object
 6   poll_loc         452021 non-null  object
 7   total_reg        553307 non-null  int64 
 8   total_reg_r      553307 non-null  int64 
 9   total_reg_d      553307 non-null  int64 
 10  total_reg_other  553307 non-null  int64 
 11  contest_name     553307 non-null  object
 12  district         553307 non-null  object
 13  contest_code     553307 non-null  int64 
 14  cand_or_issue    553307 non-null  object
 15  cand_party       553307 non-null  object
 16  cand_id          553307 non-null  int64 
 17  doe_num          5

In [22]:
# Examine sample just to get a visual sense of the shape of the data.
# random_state pins the draw so the same ten rows appear on every run.
df.sample(10, random_state=0)

Unnamed: 0,county_code,county_name,elec_num,elec_date,elec_name,precinct_id,poll_loc,total_reg,total_reg_r,total_reg_d,total_reg_other,contest_name,district,contest_code,cand_or_issue,cand_party,cand_id,doe_num,vote_total
250,GAD,Gadsden,10866,11/03/2020,2020 General Election,1,Pre 1 Centenary United Methodi,1458,0,0,0,Representative in Congress,District 5,140050,Gary Adler,REP,103559929,75184,415
1576,OKA,Okaloosa,10866,11/03/2020,2020 General Election,10,10,6885,0,0,0,Retention of Joseph Lewis Jr,First District Court of Appeal,520101,No,NOP,104990520,20,1322
1783,LEO,Leon,10866,11/03/2020,2020 General Election,4101,4101-4401 -- Premier Health & Fitness Center,1602,0,0,0,Representative in Congress,District 2,140020,UnderVotes,,0,902,347
10339,HIL,Hillsborough,10866,11/03/2020,2020 General Election,920,920,270,0,0,0,Sheriff,,335000,Ron McMullen,NPA,110518450,76113,24
2923,PAL,Palm Beach,10866,11/03/2020,2020 General Election,2202,,8,0,0,0,President of the United States,,100000,De La Fuente / Richardson,REF,0,74772,0
37766,PAL,Palm Beach,10866,11/03/2020,2020 General Election,2094,,688,0,0,0,Retention of Alan O. Forst,Fourth District Court of Appeal,520401,OverVotes,,0,901,0
5956,BRO,Broward,10866,11/03/2020,2020 General Election,J008,J008,509,0,0,0,Representative in Congress,District 20,140200,Alcee L. Hastings,DEM,101602313,75013,266
114,HAM,Hamilton,10866,11/03/2020,2020 General Election,3,3 - White Springs Library,954,0,0,0,State Senator,District 3,240030,Marva Harris Preston,REP,101469260,74381,390
31,BAK,Baker,10866,11/03/2020,2020 General Election,4B,Taylor Voting House,646,0,0,0,President of the United States,,100000,De La Fuente / Richardson,REF,0,74772,0
31219,BRO,Broward,10866,11/03/2020,2020 General Election,C023,C023,5954,0,0,0,Circuit Judge,"17th Judicial Circuit, Group 16",551716,George Odom Jr,NOP,103254918,74590,1675


## Step 2: Removing Duplicates

In [23]:
# Count rows that exactly repeat an earlier row across every column
df.duplicated().sum()

0

No dupes! Great!

## Step 3: Removing Extraneous Columns
Having already cleaned the 2012 and 2014 data, I know that there are a handful of columns we can remove off the bat:

In [24]:
# Columns that are not needed for the rest of the cleaning
extraneous_columns = [
    'total_reg_r',
    'total_reg_d',
    'total_reg_other',
    'elec_num',
    'elec_name',
]
df = df.drop(extraneous_columns, axis='columns')

In [25]:
# Re-inspect the remaining columns, their dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 553307 entries, 0 to 839
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   county_code    553307 non-null  object
 1   county_name    553307 non-null  object
 2   elec_date      553307 non-null  object
 3   precinct_id    553307 non-null  object
 4   poll_loc       452021 non-null  object
 5   total_reg      553307 non-null  int64 
 6   contest_name   553307 non-null  object
 7   district       553307 non-null  object
 8   contest_code   553307 non-null  int64 
 9   cand_or_issue  553307 non-null  object
 10  cand_party     553307 non-null  object
 11  cand_id        553307 non-null  int64 
 12  doe_num        553307 non-null  int64 
 13  vote_total     553307 non-null  int64 
dtypes: int64(5), object(9)
memory usage: 63.3+ MB


## Step 4: Standardizing Contest Names

Since we're only interested in six races (President, Governor, U.S. Senate, State Senate, U.S. Representative, and State Representative), we need to comb through the list of contest names and identify any entries in `contest_name` that appear to refer to one of those races. We could use regex string matching, but I'll do it manually to make sure we don't miss an unusual description from an individual county that a pattern wouldn't anticipate.

In [26]:
# Look at all listed races and select those that are relevant
# (unique() returns each distinct contest_name once)
df['contest_name'].unique()

array(['President of the United States', 'Representative in Congress',
       'State Representative', 'County Commissioner',
       'Retention of Carlos G. Muñiz', 'Retention of Joseph Lewis Jr',
       'Retention of Scott Makar', 'Retention of Rachel Nordby',
       'Retention of Tim Osterhaus', 'Retention of Clay Roberts',
       'Retention of Adam S. Tanenbaum',
       'Amendment No. 1: Citizenship Requirement to Vote in Florida Elections',
       'Amendment No. 2: Raising Florida’s Minimum Wage',
       'Amendment No. 3: All Voters Vote in Primary Elections for State Legislature, Governor, and Cabinet',
       'Amendment No. 4: Voter Approval of Constitutional Amendments',
       'Amendment No. 5: Limitations on Homestead Property Tax Assessments; increased portability period to transfer accrued benefit',
       'Amendment No. 6: Ad Valorem Tax Discount for Spouses of Certain Deceased Veterans Who Had Permanent, Combat-Related Disabilities',
       'State Senator', 'Clerk of the Ci

Above, we can see that there are a lot of races represented in the data. While contest names were not fully standardized prior to 2018, it looks like they are standardized in subsequent elections. To confirm, I manually reviewed the list above for non-standard descriptions that might refer to one of our races; we'll run the following standardization step anyway.

In [27]:
# Canonical contest name -> every alternative spelling that should collapse to it
canonical_aliases = {
    'President of the United States': ['PRESIDENT OF THE UNITED STATES'],
    'U.S. Senator': ['United States Senator', 'UNITED STATES SENATOR'],
    'U.S. Representative': ['Congress 10', 'Congress 9', 'Congress 15', 'Congress 17', 'Representative in Congress', 'U.S. REPRESENTATIVE', 'REPRESENTATIVE IN CONGRESS'],
    'Governor and Lieutenant Governor': ['GOVERNOR AND  LT.GOVERNOR', 'Governor'],
    'State Senator': ['STATE SENATOR', 'Senate 14'],
    'State Representative': ['STATE REPRESENTATIVE', 'House 39', 'House 40', 'House 41', 'House 42'],
}

# Invert into a flat alias -> canonical lookup. No canonical name is itself an
# alias, so a single replacement pass gives the same result as chained passes.
contest_name_map = {
    alias: canonical
    for canonical, aliases in canonical_aliases.items()
    for alias in aliases
}

df['contest_name'] = df['contest_name'].replace(contest_name_map)



## Step 5: Narrowing to Races
And now we can more confidently narrow to our six contest types:

In [28]:
# The six contest types this analysis keeps
races_of_interest = [
    'President of the United States',
    'U.S. Senator',
    'U.S. Representative',
    'Governor and Lieutenant Governor',
    'State Senator',
    'State Representative',
]

is_race_of_interest = df['contest_name'].isin(races_of_interest)

# NOTE(review): reset_index() without drop=True keeps the previous row labels
# as a new 'index' column -- confirm that column is wanted downstream.
df = df[is_race_of_interest].reset_index()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122811 entries, 0 to 122810
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   index          122811 non-null  int64 
 1   county_code    122811 non-null  object
 2   county_name    122811 non-null  object
 3   elec_date      122811 non-null  object
 4   precinct_id    122811 non-null  object
 5   poll_loc       100493 non-null  object
 6   total_reg      122811 non-null  int64 
 7   contest_name   122811 non-null  object
 8   district       122811 non-null  object
 9   contest_code   122811 non-null  int64 
 10  cand_or_issue  122811 non-null  object
 11  cand_party     122811 non-null  object
 12  cand_id        122811 non-null  int64 
 13  doe_num        122811 non-null  int64 
 14  vote_total     122811 non-null  int64 
dtypes: int64(6), object(9)
memory usage: 14.1+ MB


## Step 6: Dealing With NaN Cells

In [29]:
# Collect every row that has a missing value in at least one column
has_missing_value = df.isnull().any(axis='columns')
df_nan = df.loc[has_missing_value]

df_nan

Unnamed: 0,index,county_code,county_name,elec_date,precinct_id,poll_loc,total_reg,contest_name,district,contest_code,cand_or_issue,cand_party,cand_id,doe_num,vote_total
37703,125,DUV,Duval,11/03/2020,1001,,3307,President of the United States,,100000,Trump / Pence,REP,0,74773,172
37704,126,DUV,Duval,11/03/2020,1002,,3457,President of the United States,,100000,Trump / Pence,REP,0,74773,91
37705,127,DUV,Duval,11/03/2020,1003,,2114,President of the United States,,100000,Trump / Pence,REP,0,74773,51
37706,128,DUV,Duval,11/03/2020,1004,,2617,President of the United States,,100000,Trump / Pence,REP,0,74773,64
37707,129,DUV,Duval,11/03/2020,1005,,3961,President of the United States,,100000,Trump / Pence,REP,0,74773,140
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97832,18781,PAL,Palm Beach,11/03/2020,5124,,394,State Representative,District 91,260910,UnderVotes,,0,902,19
97833,18782,PAL,Palm Beach,11/03/2020,5126,,491,State Representative,District 91,260910,UnderVotes,,0,902,24
97834,18783,PAL,Palm Beach,11/03/2020,5158,,1779,State Representative,District 91,260910,UnderVotes,,0,902,70
97835,18784,PAL,Palm Beach,11/03/2020,5160,,1741,State Representative,District 91,260910,UnderVotes,,0,902,78


It looks like the only nulls in this dataset are in the `poll_loc` column. That's great -- we can just leave those in for the time being.

## Step 7: Check for Completeness
Finally, let's just double check to make sure all counties and races are present in our final output:

In [30]:
# List the distinct counties present in the cleaned data
counties = df['county_name'].unique()
print(counties)
print(len(counties))
# Florida has 67 counties; check the count instead of eyeballing it
assert len(counties) == 67, f"expected 67 counties, found {len(counties)}"

['Alachua' 'Baker' 'Bay' 'Bradford' 'Brevard' 'Broward' 'Calhoun'
 'Charlotte' 'Citrus' 'Clay' 'Collier' 'Columbia' 'Miami-Dade' 'Desoto'
 'Dixie' 'Duval' 'Escambia' 'Flagler' 'Franklin' 'Gadsden' 'Gilchrist'
 'Glades' 'Gulf' 'Hamilton' 'Hardee' 'Hendry' 'Hernando' 'Highlands'
 'Hillsborough' 'Holmes' 'Indian River' 'Jackson' 'Jefferson' 'Lafayette'
 'Lake' 'Lee' 'Leon' 'Levy' 'Liberty' 'Madison' 'Manatee' 'Monroe'
 'Marion' 'Martin' 'Nassau' 'Okaloosa' 'Okeechobee' 'Orange' 'Osceola'
 'Palm Beach' 'Pasco' 'Pinellas' 'Polk' 'Putnam' 'Santa Rosa' 'Sarasota'
 'Seminole' 'St. Johns' 'St. Lucie' 'Sumter' 'Suwannee' 'Taylor' 'Union'
 'Volusia' 'Wakulla' 'Walton' 'Washington']
67


All 67 counties are represented in the dataset, so this looks complete.

In [31]:
# Show the distinct contest names that remain, followed by how many there are
races = df['contest_name'].unique()
print(races, len(races), sep='\n')

['President of the United States' 'U.S. Representative'
 'State Representative' 'State Senator']
4


2020 had no gubernatorial or U.S. Senate contests, so this looks complete.

## Step 8: Save to CSV

In [32]:
# df.to_csv('fl_2020_cleaned.csv')

In [34]:
# Show only the rows whose county_code is 'WAS'
df[df['county_code'] == 'WAS']

Unnamed: 0,index,county_code,county_name,elec_date,precinct_id,poll_loc,total_reg,contest_name,district,contest_code,cand_or_issue,cand_party,cand_id,doe_num,vote_total
122643,0,WAS,Washington,11/03/2020,1,CARYVILLE TOWN HALL,475,President of the United States,,100000,Trump / Pence,REP,0,74773,185
122644,1,WAS,Washington,11/03/2020,2,FIVE POINT COMMUNITY CENTER,1053,President of the United States,,100000,Trump / Pence,REP,0,74773,735
122645,2,WAS,Washington,11/03/2020,3,HINSON CROSSROADS FIRE STATION,641,President of the United States,,100000,Trump / Pence,REP,0,74773,425
122646,3,WAS,Washington,11/03/2020,4,AGRICULTURE CENTER - CHIPLEY,4018,President of the United States,,100000,Trump / Pence,REP,0,74773,2264
122647,4,WAS,Washington,11/03/2020,5,VERNON CITY HALL,1629,President of the United States,,100000,Trump / Pence,REP,0,74773,956
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122806,163,WAS,Washington,11/03/2020,8,ARMORY - CHIPLEY,1733,U.S. Representative,District 2,140020,UnderVotes,,0,902,183
122807,164,WAS,Washington,11/03/2020,9,WAUSAU TOWN HALL,1134,U.S. Representative,District 2,140020,UnderVotes,,0,902,58
122808,165,WAS,Washington,11/03/2020,11,EBRO CITY HALL,439,U.S. Representative,District 2,140020,UnderVotes,,0,902,57
122809,166,WAS,Washington,11/03/2020,12,GREENHEAD,2393,U.S. Representative,District 2,140020,UnderVotes,,0,902,123
