In [31]:
import numpy as np
import pandas as pd
import matplotlib as mp
import csv
import os
import glob

## Step 1: Reformat and Ingest Data

In [32]:
# Combine county files into single dataframe
def merge_files(directory):
    """Read every file matching a glob pattern and stack them into one DataFrame.

    Parameters
    ----------
    directory : str
        Glob pattern handed to ``glob.glob`` (for example a folder path ending in ``*``).

    Returns
    -------
    pandas.DataFrame
        Rows of all matched files with the 19 column names below applied.
        Each file keeps its own row numbers as the index, so index values
        repeat across files. If nothing matches the pattern, an empty frame
        with the same columns is returned.
    """
    column_names = ['county_code', 'county_name', 'elec_num', 'elec_date', 'elec_name',
                    'precinct_id', 'poll_loc', 'total_reg', 'total_reg_r', 'total_reg_d',
                    'total_reg_other', 'contest_name', 'district', 'contest_code',
                    'cand_or_issue', 'cand_party', 'cand_id', 'doe_num', 'vote_total']

    # Sort so the row order does not depend on the order glob happens to return.
    target_files = sorted(glob.glob(directory))

    frames = [
        pd.read_table(file, names=column_names, encoding_errors='replace')
        for file in target_files
    ]

    # pd.concat raises on an empty list, so handle "no files matched" explicitly.
    if not frames:
        return pd.DataFrame(columns=column_names)

    # Concatenate once; growing a frame inside the loop re-copies all earlier rows on every pass.
    return pd.concat(frames)

In [33]:
# Glob pattern matching the per-county 2016 precinct files. Set the
# FL_2016_PRECINCT_GLOB environment variable to point at another copy of the
# data; when it is unset, the original local path is used.
location = os.environ.get(
    "FL_2016_PRECINCT_GLOB",
    "C:\\Users\\canor\\Documents\\GitHub\\FL-Political-Analysis\\Florida Analysis\\FL 2016 by Precinct\\*",
)

# Read every matching county file into a single dataframe
df = merge_files(location)

In [34]:
# Examine dataframe for size, dtypes, and NaNs:
# a column whose non-null count is below the number of entries contains NaNs
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 626689 entries, 0 to 1663
Data columns (total 19 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   county_code      626689 non-null  object 
 1   county_name      626689 non-null  object 
 2   elec_num         626689 non-null  int64  
 3   elec_date        626689 non-null  object 
 4   elec_name        626689 non-null  object 
 5   precinct_id      626689 non-null  object 
 6   poll_loc         609703 non-null  object 
 7   total_reg        626689 non-null  int64  
 8   total_reg_r      626689 non-null  int64  
 9   total_reg_d      626689 non-null  int64  
 10  total_reg_other  626689 non-null  int64  
 11  contest_name     626689 non-null  object 
 12  district         614659 non-null  object 
 13  contest_code     626503 non-null  float64
 14  cand_or_issue    626689 non-null  object 
 15  cand_party       605224 non-null  object 
 16  cand_id          621634 non-null  float64
 17

In [35]:
# Examine sample just to get a visual sense of the shape of the data.
# Fixed random_state so the same ten rows are shown on every run.
df.sample(10, random_state=0)

Unnamed: 0,county_code,county_name,elec_num,elec_date,elec_name,precinct_id,poll_loc,total_reg,total_reg_r,total_reg_d,total_reg_other,contest_name,district,contest_code,cand_or_issue,cand_party,cand_id,doe_num,vote_total
15519,PAL,Palm Beach,10282,11/08/2016,2016 General Election,2012,"2012, 2026",87,0,0,901642,Amendment No. 5: Homestead Tax exemption for C...,"Low-income, Long-term Residents; Determinatio...",900500.0,Yes for Approval,NOP,0.0,10.0,52
10093,LEE,Lee,10282,11/08/2016,2016 General Election,69,PCT. 069,4184,0,0,423938,Amendment No. 2: Use of Marijuana for Debilita...,,900200.0,UnderVotes,,0.0,902.0,186
3880,BRO,Broward,10282,11/08/2016,2016 General Election,C002,C002 32,1013,0,0,1179255,Retention of Spencer D. Levine,Fourth District Court of Appeal,520405.0,No,NOP,105198202.0,20.0,184
7186,DUV,Duval,10282,11/08/2016,2016 General Election,612,612.0,4496,0,0,565665,Public Defender,4th Judicial Circuit,220040.0,UnderVotes,,0.0,902.0,724
66282,BRO,Broward,10282,11/08/2016,2016 General Election,X036,X036 549,2301,0,0,1179255,Amendment No. 2: Use of Marijuana for Debilita...,,900200.0,OverVotes,,0.0,901.0,0
26795,DAD,Miami-Dade,10282,11/08/2016,2016 General Election,409,PCT 409/460/464/487 0257,2445,0,0,1379230,Retention of Edwin A. Scales,Third District Court of Appeal,520301.0,No,NOP,113920061.0,20.0,356
5689,CLL,Collier,10282,11/08/2016,2016 General Election,440,PCT 440 0044,3422,0,0,191353,State Senator,District 28,240280.0,Kathleen Passidomo,REP,103146644.0,64407.0,2186
8314,LAK,Lake,10282,11/08/2016,2016 General Election,69,69 0069,697,0,0,222432,Amendment No. 1: Rights of Electricity Consume...,,900100.0,OverVotes,,0.0,901.0,0
8196,OSC,Osceola,10282,11/08/2016,2016 General Election,502,502 - FIRE STATION 52 - PINE G,3964,0,0,196326,Retention of James A Edwards,Fifth District Court of Appeal,520502.0,OverVotes,,0.0,901.0,0
2730,PUT,Putnam,10282,11/08/2016,2016 General Election,27,Southwest Fire Dept,215,0,0,0,County Commissioner,District 3,380030.0,WriteinVotes,,0.0,900.0,0


## Step 2: Removing Duplicates

In [36]:
# Count rows that are exact copies (in every column) of an earlier row
df.duplicated().sum()

154

Looks like we have some dupes! Let's take a look and just make sure we're good to drop them.

In [37]:
# Pull every copy of each duplicated row (keep=False flags the first occurrence too) for inspection
df_dupe = df.loc[df.duplicated(keep=False)]

Upon closer investigation, it looks like all of the dupes are from two precincts in Miami-Dade, and none of the rows register any vote totals -- safe to drop!

In [38]:
# Keep the first occurrence of each duplicated row and drop the rest
df = df.drop_duplicates()

## Step 3: Removing Extraneous Columns
Having already cleaned the 2012 and 2014 data, I know that there are a handful of columns we can remove off the bat:

In [39]:
# Columns that are not needed for the analysis
extraneous_columns = ['total_reg_r', 'total_reg_d', 'total_reg_other', 'elec_num', 'elec_name']
df = df.drop(columns=extraneous_columns)

In [40]:
# Confirm the five columns are gone and re-check dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 626535 entries, 0 to 1663
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   county_code    626535 non-null  object 
 1   county_name    626535 non-null  object 
 2   elec_date      626535 non-null  object 
 3   precinct_id    626535 non-null  object 
 4   poll_loc       609549 non-null  object 
 5   total_reg      626535 non-null  int64  
 6   contest_name   626535 non-null  object 
 7   district       614505 non-null  object 
 8   contest_code   626349 non-null  float64
 9   cand_or_issue  626535 non-null  object 
 10  cand_party     605070 non-null  object 
 11  cand_id        621480 non-null  float64
 12  doe_num        626349 non-null  float64
 13  vote_total     626535 non-null  int64  
dtypes: float64(3), int64(2), object(9)
memory usage: 71.7+ MB


## Step 4: Standardizing Contest Names

Since we're only interested in six races (President, Governor, U.S. Senate, State Senate, U.S. Representative, and State Representative), we need to comb through this list and identify any entries in `contest_name` that suggest they refer to one of those races. We could use regex string matching, but I'll do it manually to make sure there isn't some weird description that some random county used that we wouldn't predict.

In [41]:
# Look at all listed races and select those that are relevant
# (including any alternate spellings of the same race)
df['contest_name'].unique()

array(['President of the United States', 'United States Senator',
       'Representative in Congress', 'State Senator',
       'Clerk of the Circuit Court and Comptroller', 'Sheriff',
       'County Commissioner', 'Retention of Charles T. Canady',
       'Retention of Jorge Labarga', 'Retention of Ricky L. Polston',
       'Retention of Ross Bilbrey', 'Retention of Susan Kelsey',
       'Retention of Lori S. Rowe', 'Retention of Kent Wetherell',
       'Retention of Bo Winokur', 'Retention of Jim Wolf',
       'Amendment No. 1: Rights of Electricity Consumers Regarding Solar Energy Choice',
       'Amendment No. 2: Use of Marijuana for Debilitating Medical Conditions',
       'Amendment No. 3: Tax Exemption for Totally and Permanently Disabled First Responders',
       'Amendment No. 5: Homestead Tax exemption for Certain Senior, Low-income, Long-term Residents; Determination of Just Value',
       'State Representative', 'Tax Collector',
       'Superintendent of Schools', 'Beach Mosq

Above, we can see that 1) there are a lot of races represented in the data and 2) they aren't all uniformly named. Going through the above set of contest names, I have extracted the ones that correspond to races we're interested in. Using dictionaries, we can standardize the naming convention for our races.

In [42]:
# Standard contest name -> the variant spellings that should be folded into it
contest_aliases = {
    'President of the United States': ['PRESIDENT OF THE UNITED STATES'],
    'U.S. Senator': ['United States Senator', 'UNITED STATES SENATOR'],
    'U.S. Representative': ['Congress 10', 'Congress 9', 'Congress 15', 'Congress 17',
                            'Representative in Congress', 'U.S. REPRESENTATIVE',
                            'REPRESENTATIVE IN CONGRESS'],
    'Governor and Lieutenant Governor': ['GOVERNOR AND  LT.GOVERNOR', 'Governor'],
    'State Senator': ['STATE SENATOR', 'Senate 14'],
    'State Representative': ['STATE REPRESENTATIVE', 'House 39', 'House 40', 'House 41', 'House 42'],
}

# Flatten to {variant: standard name} and apply everything in a single replace
variant_to_standard = {
    variant: standard
    for standard, variants in contest_aliases.items()
    for variant in variants
}
df['contest_name'] = df['contest_name'].replace(variant_to_standard)



## Step 5: Narrowing to Races
And now we can more confidently narrow to our six contest types:

In [43]:
# The six contest types in scope for the analysis
races_of_interest = [
    'President of the United States',
    'U.S. Senator',
    'U.S. Representative',
    'Governor and Lieutenant Governor',
    'State Senator',
    'State Representative',
]

# Keep only those contests. reset_index() renumbers the rows and keeps the
# previous index as a new 'index' column.
is_race_of_interest = df['contest_name'].isin(races_of_interest)
df = df[is_race_of_interest].reset_index()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165059 entries, 0 to 165058
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   index          165059 non-null  int64  
 1   county_code    165059 non-null  object 
 2   county_name    165059 non-null  object 
 3   elec_date      165059 non-null  object 
 4   precinct_id    165059 non-null  object 
 5   poll_loc       159720 non-null  object 
 6   total_reg      165059 non-null  int64  
 7   contest_name   165059 non-null  object 
 8   district       160366 non-null  object 
 9   contest_code   165059 non-null  float64
 10  cand_or_issue  165059 non-null  object 
 11  cand_party     162347 non-null  object 
 12  cand_id        163755 non-null  float64
 13  doe_num        165059 non-null  float64
 14  vote_total     165059 non-null  int64  
dtypes: float64(3), int64(3), object(9)
memory usage: 18.9+ MB


## Step 6: Dealing With NaN Cells

In [44]:
# Flag the rows that have a null in at least one column, then pull them out
has_null = df.isna().any(axis=1)
df_nan = df[has_null]

df_nan

Unnamed: 0,index,county_code,county_name,elec_date,precinct_id,poll_loc,total_reg,contest_name,district,contest_code,cand_or_issue,cand_party,cand_id,doe_num,vote_total
2185,0,BAY,Bay,11/08/2016,1,,2003,President of the United States,,100000.0,Trump / Pence,REP,0.0,65072.0,1122
2186,1,BAY,Bay,11/08/2016,1,,2003,President of the United States,,100000.0,Clinton / Kaine,DEM,0.0,65058.0,189
2187,2,BAY,Bay,11/08/2016,1,,2003,President of the United States,,100000.0,Johnson / Weld,LPF,0.0,69370.0,25
2188,3,BAY,Bay,11/08/2016,1,,2003,President of the United States,,100000.0,Castle / Bradley,CPF,0.0,69385.0,0
2189,4,BAY,Bay,11/08/2016,1,,2003,President of the United States,,100000.0,Stein / Baraka,GRE,0.0,69377.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154706,8113,SEM,SEMINOLE,11/8/2016,76,Precinct 76,4117,U.S. Senator,,120000.0,Write-in,,,900.0,5
154707,8114,SEM,SEMINOLE,11/8/2016,77,Precinct 77,2782,U.S. Senator,,120000.0,Write-in,,,900.0,3
154708,8115,SEM,SEMINOLE,11/8/2016,78,Precinct 78,4173,U.S. Senator,,120000.0,Write-in,,,900.0,6
154709,8116,SEM,SEMINOLE,11/8/2016,79,Precinct 79,3698,U.S. Senator,,120000.0,Write-in,,,900.0,9


In [45]:
# Pull the rows with nulls in 'cand_party' so we can check what kind of rows they are
df_nan_cand_party = df[df['cand_party'].isna()]

df_nan_cand_party

Unnamed: 0,index,county_code,county_name,elec_date,precinct_id,poll_loc,total_reg,contest_name,district,contest_code,cand_or_issue,cand_party,cand_id,doe_num,vote_total
143191,1002,POL,Polk,11/08/2016,101,Outreach Baptist,2577,President of the United States,,100000.0,WriteInVotes,,0.0,0.0,17
143192,1003,POL,Polk,11/08/2016,102,Green Pond Baptist Church,1149,President of the United States,,100000.0,WriteInVotes,,0.0,0.0,2
143193,1004,POL,Polk,11/08/2016,103,Polo Park,4043,President of the United States,,100000.0,WriteInVotes,,0.0,0.0,22
143194,1005,POL,Polk,11/08/2016,104,Polo Park East,2901,President of the United States,,100000.0,WriteInVotes,,0.0,0.0,14
143195,1006,POL,Polk,11/08/2016,105,Ridge Mobile Home Park Clubhouse,1752,President of the United States,,100000.0,WriteInVotes,,0.0,0.0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154706,8113,SEM,SEMINOLE,11/8/2016,76,Precinct 76,4117,U.S. Senator,,120000.0,Write-in,,,900.0,5
154707,8114,SEM,SEMINOLE,11/8/2016,77,Precinct 77,2782,U.S. Senator,,120000.0,Write-in,,,900.0,3
154708,8115,SEM,SEMINOLE,11/8/2016,78,Precinct 78,4173,U.S. Senator,,120000.0,Write-in,,,900.0,6
154709,8116,SEM,SEMINOLE,11/8/2016,79,Precinct 79,3698,U.S. Senator,,120000.0,Write-in,,,900.0,9


In [46]:

# List the distinct 'cand_or_issue' labels on the null-'cand_party' rows to see
# whether they are all tally rows (overvotes, undervotes, write-ins) rather than candidates
df_nan_cand_party['cand_or_issue'].unique()

array(['WriteInVotes', 'OverVotes', 'UnderVotes', 'Times Blank Voted',
       'Times Over Voted', 'Write-in 30', 'Write-in'], dtype=object)

Looks like rows containing NaNs in `cand_party` are all rows describing overvotes, undervotes, or write-ins. While these might be interesting to analyze, they're outside the scope of our analysis; we can drop them.

In [47]:
# Drop the rows with no 'cand_party' (the overvote, undervote and write-in tallies),
# then re-check which columns still contain nulls
df = df.dropna(subset=['cand_party'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 162347 entries, 0 to 165058
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   index          162347 non-null  int64  
 1   county_code    162347 non-null  object 
 2   county_name    162347 non-null  object 
 3   elec_date      162347 non-null  object 
 4   precinct_id    162347 non-null  object 
 5   poll_loc       157008 non-null  object 
 6   total_reg      162347 non-null  int64  
 7   contest_name   162347 non-null  object 
 8   district       159136 non-null  object 
 9   contest_code   162347 non-null  float64
 10  cand_or_issue  162347 non-null  object 
 11  cand_party     162347 non-null  object 
 12  cand_id        161867 non-null  float64
 13  doe_num        162347 non-null  float64
 14  vote_total     162347 non-null  int64  
dtypes: float64(3), int64(3), object(9)
memory usage: 19.8+ MB


Looks like we still have nulls in `poll_loc` (which is fine), `district`, and `cand_id` -- let's take a look at those.

In [48]:
# Create a df with just the rows containing a null value in the district column
df_nan_district = df[df['district'].isna()]
# Fixed random_state so the same five rows are shown on every run
df_nan_district.sample(5, random_state=0)

Unnamed: 0,index,county_code,county_name,elec_date,precinct_id,poll_loc,total_reg,contest_name,district,contest_code,cand_or_issue,cand_party,cand_id,doe_num,vote_total
142904,715,POL,Polk,11/08/2016,225,Church of Jesus Christ L.D.S.,2827,President of the United States,,100000.0,Stein / Baraka,GRE,0.0,69377.0,18
143103,914,POL,Polk,11/08/2016,307,Lena Vista United Methodist Church,1754,President of the United States,,100000.0,De La Fuente / Steinberg,REF,0.0,69403.0,1
143096,907,POL,Polk,11/08/2016,250,Trinity Christian Center,2773,President of the United States,,100000.0,De La Fuente / Steinberg,REF,0.0,69403.0,1
153931,7338,SEM,SEMINOLE,11/8/2016,21,Precinct 21,4638,U.S. Senator,,120000.0,Basil E. Dalack,NPA,112148742.0,65789.0,8
154330,7737,SEM,SEMINOLE,11/8/2016,20,Precinct 20,6680,U.S. Senator,,120000.0,Steven Machat,NPA,118948766.0,64869.0,29


Looks like we have NaNs in the `district` column for at least some rows of President and U.S. Senate data, in rows where vote tallies are indeed present. Let's look a little deeper.

In [49]:
# Which contests have a null district? Only the POTUS and U.S. Senate races.
df_nan_district['contest_name'].unique()

array(['President of the United States', 'U.S. Senator'], dtype=object)

In [50]:
# Which counties have a null district? Only Polk and Seminole.
df_nan_district['county_name'].unique()

array(['Polk', 'SEMINOLE'], dtype=object)

In [51]:
# Is this only present for third party candidates? Doesn't look like it -- REP and DEM rows are affected too.
df_nan_district['cand_party'].unique()

array(['REP', 'DEM', 'LPF', 'CPF', 'GRE', 'REF', 'NPA'], dtype=object)

It looks like the only races that we're interested in that have null values in the `district` column are statewide and don't really have a district anyway. We'll just replace these NaNs with blank cells and move on.

In [52]:
# The remaining null districts belong to statewide races, so fill them with an empty string.
# Assign the result back instead of calling fillna(inplace=True) on df['district']:
# under pandas Copy-on-Write the in-place call acts on a temporary and leaves df unchanged.
df = df.fillna({'district': ''})

In [53]:
# Confirm 'district' no longer contains nulls
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 162347 entries, 0 to 165058
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   index          162347 non-null  int64  
 1   county_code    162347 non-null  object 
 2   county_name    162347 non-null  object 
 3   elec_date      162347 non-null  object 
 4   precinct_id    162347 non-null  object 
 5   poll_loc       157008 non-null  object 
 6   total_reg      162347 non-null  int64  
 7   contest_name   162347 non-null  object 
 8   district       162347 non-null  object 
 9   contest_code   162347 non-null  float64
 10  cand_or_issue  162347 non-null  object 
 11  cand_party     162347 non-null  object 
 12  cand_id        161867 non-null  float64
 13  doe_num        162347 non-null  float64
 14  vote_total     162347 non-null  int64  
dtypes: float64(3), int64(3), object(9)
memory usage: 19.8+ MB


Okay, we're at a point where the only remaining nulls are in `cand_id` and `poll_loc`, neither of which is really vital. We can move on from filling in NaNs.

## Step 7: Check for Completeness
Finally, let's just double check to make sure all counties and races are present in our final output:

In [54]:
counties = df['county_name'].unique()
print(counties)
print(len(counties))
# Florida has 67 counties; fail loudly if one is missing or a county shows up under two spellings
assert len(counties) == 67, f"expected 67 counties, found {len(counties)}"

['Alachua' 'Baker' 'Bay' 'Bradford' 'Brevard' 'Broward' 'Calhoun'
 'Charlotte' 'Citrus' 'Clay' 'Collier' 'Columbia' 'Miami-Dade' 'Desoto'
 'Dixie' 'Duval' 'Escambia' 'Flagler' 'Franklin' 'Gadsden' 'Gilchrist'
 'Glades' 'Gulf' 'Hamilton' 'Hardee' 'Hendry' 'Hernando' 'Highlands'
 'Hillsborough' 'Holmes' 'Indian River' 'Jackson' 'Jefferson' 'Lafayette'
 'Lake' 'Lee' 'Leon' 'Levy' 'Liberty' 'Madison' 'Manatee' 'Monroe'
 'Marion' 'Martin' 'Nassau' 'Okaloosa' 'Okeechobee' 'Orange' 'Osceola'
 'Palm Beach' 'Pasco' 'Pinellas' 'Polk' 'Putnam' 'Santa Rosa' 'Sarasota'
 'SEMINOLE' 'St. Johns' 'St. Lucie' 'Sumter' 'Suwannee' 'Taylor' 'Union'
 'Volusia' 'Wakulla' 'Walton' 'Washington']
67


All 67 counties are represented in the dataset, so this looks complete.

In [55]:
races = df['contest_name'].unique()
print(races)
print(len(races))
# 2016 had no gubernatorial contest, so exactly these five standardized contests should remain
expected_races = {
    'President of the United States',
    'U.S. Senator',
    'U.S. Representative',
    'State Senator',
    'State Representative',
}
assert set(races) == expected_races, f"unexpected set of contests: {sorted(races)}"

['President of the United States' 'U.S. Senator' 'U.S. Representative'
 'State Senator' 'State Representative']
5


2016 had no gubernatorial contest, so this looks complete.

## Step 8: Save to CSV

In [56]:
# df.to_csv('fl_2016_cleaned.csv')