In [1]:
import numpy as np
import pandas as pd
import matplotlib as mp
import csv
import os
import glob

## Step 1: Reformat and Ingest Data

In [2]:
# Combine county files into single dataframe
def merge_files(directory):
    """Read every file matched by a glob pattern and stack them into one DataFrame.

    Parameters
    ----------
    directory : str
        Glob pattern (for example ``"some/folder/*"``) handed to ``glob.glob``.
        Every match is read with ``pd.read_table`` as a headerless file and
        given the 19 column names listed in ``column_names``.

    Returns
    -------
    pandas.DataFrame
        The rows of all matched files, stacked in sorted-path order. Each
        file keeps its own row labels, so index values repeat across files.
        When the pattern matches nothing, an empty frame that still carries
        the expected columns is returned.
    """
    column_names = [
        'county_code', 'county_name', 'elec_num', 'elec_date', 'elec_name',
        'precinct_id', 'poll_loc', 'total_reg', 'total_reg_r', 'total_reg_d',
        'total_reg_other', 'contest_name', 'district', 'contest_code',
        'cand_or_issue', 'cand_party', 'cand_id', 'doe_num', 'vote_total',
    ]

    # Sort so the row order does not depend on the order glob happens to return.
    target_files = sorted(glob.glob(directory))
    if not target_files:
        # pd.concat raises on an empty list; keep the schema so later
        # column-based steps still work on the (empty) result.
        return pd.DataFrame(columns=column_names)

    frames = [
        pd.read_table(file, names=column_names, encoding_errors='replace')
        for file in target_files
    ]
    # Concatenate once instead of re-copying the accumulated frame per file.
    return pd.concat(frames)

In [3]:
# Glob pattern for the 2014 precinct-level county files. The default is the
# path this notebook was originally run with; set the FL_2014_PRECINCT_GLOB
# environment variable to point at the data on another machine.
location = os.environ.get(
    "FL_2014_PRECINCT_GLOB",
    "C:\\Users\\canor\\Documents\\GitHub\\FL-Political-Analysis\\Florida Analysis\\FL 2014 by Precinct\\*",
)

df = merge_files(location)

In [4]:
# Examine dataframe for size, dtypes, and NaNs
# (a non-null count below the total entry count marks a column with missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 385323 entries, 0 to 1119
Data columns (total 19 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   county_code      385323 non-null  object 
 1   county_name      385323 non-null  object 
 2   elec_num         385323 non-null  int64  
 3   elec_date        385323 non-null  object 
 4   elec_name        385323 non-null  object 
 5   precinct_id      385323 non-null  object 
 6   poll_loc         340897 non-null  object 
 7   total_reg        385323 non-null  int64  
 8   total_reg_r      385323 non-null  int64  
 9   total_reg_d      385323 non-null  int64  
 10  total_reg_other  385323 non-null  int64  
 11  contest_name     385323 non-null  object 
 12  district         349577 non-null  object 
 13  contest_code     385323 non-null  int64  
 14  cand_or_issue    385323 non-null  object 
 15  cand_party       365969 non-null  object 
 16  cand_id          380165 non-null  float6

In [5]:
# Examine sample just to get a visual sense of the shape of the data
# (no random_state is passed, so a different 10 rows are drawn on every run)
df.sample(10)

Unnamed: 0,county_code,county_name,elec_num,elec_date,elec_name,precinct_id,poll_loc,total_reg,total_reg_r,total_reg_d,total_reg_other,contest_name,district,contest_code,cand_or_issue,cand_party,cand_id,doe_num,vote_total
1071,SEM,Seminole,10218,11/04/2014,2014 General Election,2,Precinct 2,266005,105244,88123,72638,Circuit Judge,Group 23,551823,John Moser,NP,110484111.0,61693,685
3542,SAR,Sarasota,10218,11/04/2014,2014 General Election,323,323,1113,0,0,0,Sarasota County Charter Review Board,District 2,450120,Richard Dorfman,REP,115733640.0,62351,350
40049,BRO,Broward,10218,11/04/2014,2014 General Election,Y002,Y002 565,2663,0,0,0,State Representative,District 100,261000,"Joseph S. ""Joe"" Geller",DEM,109046184.0,60349,472
8701,LEO,Leon,10218,11/04/2014,2014 General Election,5214,5214 -- First Church Of The Nazarene,2233,0,0,0,Commissioner of Agriculture,,160800,Thaddeus Thad Hamilton,DEM,101335958.0,60304,706
28186,BRO,Broward,10218,11/04/2014,2014 General Election,T006,T006 395,719,0,0,0,Commissioner of Agriculture,,160800,Thaddeus Thad Hamilton,DEM,101335958.0,60304,162
12857,DAD,Miami-Dade,10218,11/04/2014,2014 General Election,274.0,PCT 274 0162,1444,0,0,0,Chief Financial Officer,,160500,Jeff Atwater,REP,112630053.0,55287,80
12480,ORA,Orange,10218,11/04/2014,2014 General Election,417,Precinct 417 131,7366,0,0,0,Attorney General,,160200,Pam Bondi,REP,110471335.0,60487,1109
5082,ESC,Escambia,10218,11/04/2014,2014 General Election,112,,6187,0,0,0,Commissioner of Agriculture,,160800,Thaddeus Thad Hamilton,DEM,101335958.0,60304,796
2171,LEE,Lee,10218,11/04/2014,2014 General Election,24,POLL 024,2553,0,0,0,Amendment No. 3: PROSPECTIVE APPOINTMENT OF CE...,,900300,UnderVotes,,0.0,902,144
2745,PIN,Pinellas,10218,11/04/2014,2014 General Election,147,Riviera UMC 036,1132,0,0,0,County Commissioner,District 2,380020,"Patricia ""Pat"" Gerard",DEM,106737519.0,62193,321


## Step 2: Removing Duplicates

In [6]:
# Count rows that exactly repeat an earlier row across all columns
df.duplicated().sum()

0

No dupes! Great!

## Step 3: Removing Extraneous Columns
Having already cleaned the 2012 data, I know that there are a handful of columns we can remove off the bat:

In [7]:
# Drop the per-party registration counts and the election number/name columns
df = df.drop(
    ['total_reg_r', 'total_reg_d', 'total_reg_other', 'elec_num', 'elec_name'],
    axis='columns',
)

In [8]:
# Confirm the drop: 14 of the original 19 columns should remain
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 385323 entries, 0 to 1119
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   county_code    385323 non-null  object 
 1   county_name    385323 non-null  object 
 2   elec_date      385323 non-null  object 
 3   precinct_id    385323 non-null  object 
 4   poll_loc       340897 non-null  object 
 5   total_reg      385323 non-null  int64  
 6   contest_name   385323 non-null  object 
 7   district       349577 non-null  object 
 8   contest_code   385323 non-null  int64  
 9   cand_or_issue  385323 non-null  object 
 10  cand_party     365969 non-null  object 
 11  cand_id        380165 non-null  float64
 12  doe_num        385323 non-null  int64  
 13  vote_total     385323 non-null  int64  
dtypes: float64(1), int64(4), object(9)
memory usage: 44.1+ MB


## Step 4: Standardizing Contest Names

Since we're only interested in six races (President, Governor, U.S. Senate, State Senate, U.S. Representative, and State Representative) and there was no presidential or U.S. Senate race in Florida in 2014, we need to comb through this list and identify any entries in `contest_name` that appear to refer to one of the remaining four races.

In [9]:
# Look at all listed races and select those that are relevant
# (the distinct values also reveal spelling/case variants of the same race)
df['contest_name'].unique()

array(['U.S. Representative', 'Governor and Lieutenant Governor',
       'Attorney General', 'Chief Financial Officer',
       'Commissioner of Agriculture', 'Tax Collector',
       'County Commissioner', 'Retention of Robert T. Benton',
       'Retention of Joseph Lewis, Jr.', 'Retention of Scott Makar',
       'Retention of Tim Osterhaus', 'Retention of Clay Roberts',
       'Amendment No. 1: Water and Land Conservation - Dedicates funds to acquire and restore Florida conservation and recreation lands',
       'Amendment No. 2: Use of Marijuana for Certain Medical Conditions',
       'Amendment No. 3: PROSPECTIVE APPOINTMENT OF CERTAIN JUDICIAL VACANCIES',
       'State Representative', 'Clerk of the Circuit Court',
       'School Board', 'Canaveral Port Authority',
       'Retention of Wendy W. Berger', 'Retention of Kerry I. Evander',
       'Retention of Charles Alan Lawson',
       'Retention of Richard B. Orfinger',
       'Retention of William David Palmer',
       'Retention o

Above, we can see that 1) there are a lot of races represented in the data and 2) they aren't all uniformly named. Going through the above set of race names, I have extracted the ones that correspond to races we're interested in. Using dictionaries, we can standardize the naming convention for our races.

In [10]:
# Map each variant spelling found in the raw data onto one canonical contest name.
df['contest_name'] = df['contest_name'].replace({
    # U.S. House
    'Congress 10': 'U.S. Representative',
    'Congress 9': 'U.S. Representative',
    'Congress 15': 'U.S. Representative',
    'Congress 17': 'U.S. Representative',
    'Representative in Congress': 'U.S. Representative',
    'U.S. REPRESENTATIVE': 'U.S. Representative',
    # Governor
    'GOVERNOR AND  LT.GOVERNOR': 'Governor and Lieutenant Governor',
    'Governor': 'Governor and Lieutenant Governor',
    # State House
    'STATE REPRESENTATIVE': 'State Representative',
    'House 39': 'State Representative',
    'House 40': 'State Representative',
    'House 41': 'State Representative',
    'House 42': 'State Representative',
    # State Senate
    'STATE SENATOR': 'State Senator',
    'Senate 14': 'State Senator',
})


## Step 5: Narrowing to Races
And now we can more confidently narrow to our four races:

In [11]:
# Keep only the rows that belong to the four contests of interest.
df = df.loc[
    df['contest_name'].isin([
        'U.S. Representative',
        'Governor and Lieutenant Governor',
        'State Representative',
        'State Senator',
    ])
].reset_index()  # no drop=True, so the previous row labels are kept as an 'index' column

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82603 entries, 0 to 82602
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          82603 non-null  int64  
 1   county_code    82603 non-null  object 
 2   county_name    82603 non-null  object 
 3   elec_date      82603 non-null  object 
 4   precinct_id    82603 non-null  object 
 5   poll_loc       73261 non-null  object 
 6   total_reg      82603 non-null  int64  
 7   contest_name   82603 non-null  object 
 8   district       76582 non-null  object 
 9   contest_code   82603 non-null  int64  
 10  cand_or_issue  82603 non-null  object 
 11  cand_party     81637 non-null  object 
 12  cand_id        81876 non-null  float64
 13  doe_num        82603 non-null  int64  
 14  vote_total     82603 non-null  int64  
dtypes: float64(1), int64(5), object(9)
memory usage: 9.5+ MB


## Step 6: Dealing With NaN Cells

In [12]:
# Collect every row that has a null in at least one column
df_nan = df.loc[df.isnull().any(axis='columns')]

df_nan

Unnamed: 0,index,county_code,county_name,elec_date,precinct_id,poll_loc,total_reg,contest_name,district,contest_code,cand_or_issue,cand_party,cand_id,doe_num,vote_total
1987,0,BRE,Brevard,11/04/2014,100,,1914,U.S. Representative,District 8,140080,Bill Posey,REP,100928199.0,60309,730
1988,1,BRE,Brevard,11/04/2014,100,,1914,U.S. Representative,District 8,140080,Gabriel Rothblatt,DEM,119321677.0,60345,337
1989,2,BRE,Brevard,11/04/2014,100,,1914,U.S. Representative,District 8,140080,WriteinVotes,,0.0,900,2
1990,3,BRE,Brevard,11/04/2014,100,,1914,U.S. Representative,District 8,140080,OverVotes,,0.0,901,0
1991,4,BRE,Brevard,11/04/2014,100,,1914,U.S. Representative,District 8,140080,UnderVotes,,0.0,902,58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77037,7398,SEM,Seminole,11/04/2014,9,Precinct 9,266005,Governor and Lieutenant Governor,,160000,Rick Scott,REP,103093132.0,61253,848
77038,7399,SEM,Seminole,11/04/2014,9,Precinct 9,266005,Governor and Lieutenant Governor,,160000,Times Over Voted,NP,,901,0
77039,7400,SEM,Seminole,11/04/2014,9,Precinct 9,266005,Governor and Lieutenant Governor,,160000,Write-in,NP,,900,4
77041,7406,SEM,Seminole,11/04/2014,9,Precinct 9,266005,State Senator,District 10,240100,Number of Under Votes,NP,,902,0


In [13]:
# Isolate the rows whose 'cand_party' is null so we can check that they are
# only under- and overvote rows, which we can discard
df_nan_cand_party = df.loc[df['cand_party'].isnull()]

df_nan_cand_party

Unnamed: 0,index,county_code,county_name,elec_date,precinct_id,poll_loc,total_reg,contest_name,district,contest_code,cand_or_issue,cand_party,cand_id,doe_num,vote_total
70359,577,POL,Polk,11/04/2014,102,GREEN POND BAPTIST CHURCH 5995 GREENPOND CHURC...,1068,U.S. Representative,District 10,140100,OverVotes,,0.0,901,0
70360,578,POL,Polk,11/04/2014,103,POLO PARK 426 POLO PARK BLVD,3182,U.S. Representative,District 10,140100,OverVotes,,0.0,901,0
70361,579,POL,Polk,11/04/2014,104,POLO PARK EAST 525 POLO PARK EAST BLVD,2477,U.S. Representative,District 10,140100,OverVotes,,0.0,901,0
70362,580,POL,Polk,11/04/2014,105,RIDGE MOBILE HOME PARK CLUBHOUSE 49473 HIGHWAY 27,1098,U.S. Representative,District 10,140100,OverVotes,,0.0,901,0
70363,581,POL,Polk,11/04/2014,107,NEW LIFE COMMUNITY CHURCH 530 COMMONWEALTH AVE SW,3870,U.S. Representative,District 10,140100,OverVotes,,0.0,901,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72774,8677,POL,Polk,11/04/2014,410,CALVARY FIRST ASSEMBLY OF GOD 4550 E JOHNSON AVE,898,State Senator,District 14,240140,UnderVotes,,0.0,902,38
72775,8678,POL,Polk,11/04/2014,412,POINCIANA COMMUNITY CENTER 395 MARIGOLD AVE,6517,State Senator,District 14,240140,UnderVotes,,0.0,902,113
72776,8679,POL,Polk,11/04/2014,419,TUSCANY PRESERVE AT LAKE MARION 1850 PACIFIC ROAD,4959,State Senator,District 14,240140,UnderVotes,,0.0,902,94
72777,8680,POL,Polk,11/04/2014,422,SUNLAKE TERRACE ESTATES 6555-16 OLD LAKE WILSO...,208,State Senator,District 14,240140,UnderVotes,,0.0,902,2


In [14]:

# List the distinct cand_or_issue labels among the null-party rows;
# we expect to see only the over/undervote markers, which we can discard
df_nan_cand_party['cand_or_issue'].unique()

array(['OverVotes', 'UnderVotes'], dtype=object)

Looks like rows containing NaNs in cand_party are either rows describing overvotes and undervotes, or pertain to nonpartisan races that are not relevant to this analysis. Let's double check that latter assumption, and then drop those rows:

In [15]:
# Contests of any null-party row that is NOT an UnderVotes/OverVotes row;
# an empty array means every null-party row is an under- or overvote
df_nan_cand_party[~df_nan_cand_party['cand_or_issue'].isin(['UnderVotes', 'OverVotes'])]['contest_name'].unique()

array([], dtype=object)

Assumptions confirmed. We can safely drop any rows that have a NaN value in the `cand_party` column.

In [16]:
# Keep only the rows that have a party recorded
df = df[df['cand_party'].notna()]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81637 entries, 0 to 82602
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          81637 non-null  int64  
 1   county_code    81637 non-null  object 
 2   county_name    81637 non-null  object 
 3   elec_date      81637 non-null  object 
 4   precinct_id    81637 non-null  object 
 5   poll_loc       72295 non-null  object 
 6   total_reg      81637 non-null  int64  
 7   contest_name   81637 non-null  object 
 8   district       75950 non-null  object 
 9   contest_code   81637 non-null  int64  
 10  cand_or_issue  81637 non-null  object 
 11  cand_party     81637 non-null  object 
 12  cand_id        80910 non-null  float64
 13  doe_num        81637 non-null  int64  
 14  vote_total     81637 non-null  int64  
dtypes: float64(1), int64(5), object(9)
memory usage: 10.0+ MB


Looks like we still have nulls in `poll_loc` (which is fine), `district`, and `cand_id` -- let's take a look at those.

In [17]:
# Pull out just the rows where the district column is null and peek at the first five
df_nan_district = df.loc[df['district'].isnull()]
df_nan_district.head()

Unnamed: 0,index,county_code,county_name,elec_date,precinct_id,poll_loc,total_reg,contest_name,district,contest_code,cand_or_issue,cand_party,cand_id,doe_num,vote_total
57918,1477,PAL,Palm Beach,11/4/2014,1002,1002,0,Governor and Lieutenant Governor,,160000,Adrian Wyllie,LPF,106930498.0,60196,0
57919,1478,PAL,Palm Beach,11/4/2014,1004,1004,2,Governor and Lieutenant Governor,,160000,Adrian Wyllie,LPF,106930498.0,60196,0
57920,1479,PAL,Palm Beach,11/4/2014,1006,1006,0,Governor and Lieutenant Governor,,160000,Adrian Wyllie,LPF,106930498.0,60196,0
57921,1480,PAL,Palm Beach,11/4/2014,1008,1008,0,Governor and Lieutenant Governor,,160000,Adrian Wyllie,LPF,106930498.0,60196,0
57922,1481,PAL,Palm Beach,11/4/2014,1010,1010,0,Governor and Lieutenant Governor,,160000,Adrian Wyllie,LPF,106930498.0,60196,0


This is a little weird. Looks like we have NaNs in the district column for Governor and Lt. Governor candidates, in rows where vote tallies are indeed present. Is this just for the Libertarian Party candidate (as we can see in the above head window)? Let's check.

In [18]:
# Is this only present for the Governor's race? Yes -- it is the only contest that appears.
df_nan_district['contest_name'].unique()

array(['Governor and Lieutenant Governor'], dtype=object)

In [19]:
# Is this only present in Palm Beach? Doesn't look like it -- more than one county appears.
df_nan_district['county_name'].unique()

array(['Palm Beach', 'Polk', 'Seminole'], dtype=object)

In [20]:
# Is this only present for LPF party candidates? Doesn't look like it -- several parties appear.
df_nan_district['cand_party'].unique()

array(['LPF', 'DEM', 'NPA', 'REP', 'NP'], dtype=object)

It looks like the only races that we're interested in that have null values in the `district` column are statewide and don't really have a district anyway. We'll just replace these NaNs with blank cells and move on.

In [21]:
# Statewide races have no district, so replace the missing values with an
# empty string. The filled column is assigned back onto the frame instead of
# calling fillna(inplace=True) on the selected column, which pandas does not
# guarantee will modify `df` itself.
# NOTE(review): if the other cleaned dataframes label these rows 'Statewide',
# that value should be used here instead of '' -- confirm.
df = df.assign(district=df['district'].fillna(''))

In [22]:
# Confirm that 'district' no longer contains nulls
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81637 entries, 0 to 82602
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          81637 non-null  int64  
 1   county_code    81637 non-null  object 
 2   county_name    81637 non-null  object 
 3   elec_date      81637 non-null  object 
 4   precinct_id    81637 non-null  object 
 5   poll_loc       72295 non-null  object 
 6   total_reg      81637 non-null  int64  
 7   contest_name   81637 non-null  object 
 8   district       81637 non-null  object 
 9   contest_code   81637 non-null  int64  
 10  cand_or_issue  81637 non-null  object 
 11  cand_party     81637 non-null  object 
 12  cand_id        80910 non-null  float64
 13  doe_num        81637 non-null  int64  
 14  vote_total     81637 non-null  int64  
dtypes: float64(1), int64(5), object(9)
memory usage: 10.0+ MB


Okay, we're at a point where the only remaining nulls are in `cand_id` and `poll_loc`, neither of which is really vital. We can move on from filling in NaNs.

## Step 7: Check for Completeness
Finally, let's just double check to make sure all counties and races are present in our final output:

In [23]:
# List the distinct counties, then how many there are
counties = df['county_name'].unique()
print(counties, len(counties), sep='\n')
# All counties are present and accounted for 

['Alachua' 'Baker' 'Bay' 'Bradford' 'Brevard' 'Broward' 'Calhoun'
 'Charlotte' 'Citrus' 'Clay' 'Collier' 'Columbia' 'Miami-Dade' 'Desoto'
 'Dixie' 'Duval' 'Escambia' 'Flagler' 'Franklin' 'Gadsden' 'Gilchrist'
 'Glades' 'Gulf' 'Hamilton' 'Hardee' 'Hendry' 'Hernando' 'Highlands'
 'Hillsborough' 'Holmes' 'Indian River' 'Jackson' 'Jefferson' 'Lafayette'
 'Lake' 'Lee' 'Leon' 'Levy' 'Liberty' 'Madison' 'Manatee' 'Monroe'
 'Marion' 'Martin' 'Nassau' 'Okaloosa' 'Okeechobee' 'Orange' 'Osceola'
 'Palm Beach' 'Pasco' 'Pinellas' 'Polk' 'Putnam' 'Santa Rosa' 'Sarasota'
 'Seminole' 'St. Johns' 'St. Lucie' 'Sumter' 'Suwannee' 'Taylor' 'Union'
 'Volusia' 'Wakulla' 'Walton' 'Washington']
67


All 67 counties are represented in the dataset, so this looks complete.

In [24]:
# List the distinct contests that remain, then how many there are
races = df['contest_name'].unique()
print(races, len(races), sep='\n')

['U.S. Representative' 'Governor and Lieutenant Governor'
 'State Representative' 'State Senator']
4


2014 had no presidential or U.S. Senate contest in Florida, so these four races are the complete set.

## Step 8: Save to CSV

In [25]:
# Export is currently disabled; uncomment the line below to write the cleaned frame.
# As written, to_csv would also write the row labels as an unnamed first column.
# df.to_csv('fl_2014_cleaned.csv')