In [42]:
import numpy as np
import pandas as pd
import matplotlib as mp
import csv
import os
import glob

## Step 1: Reformat and Ingest Data

In [43]:
# Combine county files into single dataframe
def merge_files(directory):
    """Read every file matching a glob pattern and stack the rows into one DataFrame.

    Parameters
    ----------
    directory : str
        Glob pattern handed to ``glob.glob`` (for example ``"some/folder/*"``).

    Returns
    -------
    pandas.DataFrame
        The rows of all matched files, each parsed by ``pd.read_table`` with the
        fixed 19 column names below. Every file keeps its own row labels, so the
        resulting index is not unique. An empty DataFrame is returned when the
        pattern matches nothing.
    """
    column_names = [
        'county_code', 'county_name', 'elec_num', 'elec_date', 'elec_name',
        'precinct_id', 'poll_loc', 'total_reg', 'total_reg_r', 'total_reg_d',
        'total_reg_other', 'contest_name', 'district', 'contest_code',
        'cand_or_issue', 'cand_party', 'cand_id', 'doe_num', 'vote_total',
    ]
    # Sort the matches so the row order does not depend on how the filesystem
    # happens to enumerate the directory.
    frames = [
        pd.read_table(path, names=column_names, encoding_errors='replace')
        for path in sorted(glob.glob(directory))
    ]
    if not frames:
        # pd.concat raises on an empty list; keep the "no files -> empty frame" result.
        return pd.DataFrame()
    # Concatenate once instead of re-copying the accumulated frame on every iteration.
    return pd.concat(frames)

In [44]:
# Glob pattern for the raw 2018 precinct files. It defaults to the original local
# checkout; set the FL_2018_PRECINCT_GLOB environment variable to point somewhere
# else so the notebook can run on another machine without editing this cell.
location = os.environ.get(
    "FL_2018_PRECINCT_GLOB",
    "C:\\Users\\canor\\Documents\\GitHub\\FL-Political-Analysis\\Florida Analysis\\FL 2018 by Precinct\\*",
)

df = merge_files(location)

In [45]:
# Examine dataframe for size, dtypes, and NaNs
# (info() prints the index range, the non-null count per column, and each dtype)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 743093 entries, 0 to 1343
Data columns (total 19 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   county_code      743093 non-null  object
 1   county_name      743093 non-null  object
 2   elec_num         743093 non-null  int64 
 3   elec_date        743093 non-null  object
 4   elec_name        743093 non-null  object
 5   precinct_id      743093 non-null  object
 6   poll_loc         733018 non-null  object
 7   total_reg        743093 non-null  int64 
 8   total_reg_r      743093 non-null  int64 
 9   total_reg_d      743093 non-null  int64 
 10  total_reg_other  743093 non-null  int64 
 11  contest_name     743093 non-null  object
 12  district         743093 non-null  object
 13  contest_code     743093 non-null  int64 
 14  cand_or_issue    743093 non-null  object
 15  cand_party       743093 non-null  object
 16  cand_id          743093 non-null  int64 
 17  doe_num          

In [46]:
# Peek at ten randomly chosen rows to get a visual feel for how the data is laid out
df.sample(n=10)

Unnamed: 0,county_code,county_name,elec_num,elec_date,elec_name,precinct_id,poll_loc,total_reg,total_reg_r,total_reg_d,total_reg_other,contest_name,district,contest_code,cand_or_issue,cand_party,cand_id,doe_num,vote_total
39253,DAD,Miami-Dade,10481,11/06/2018,2018 General Election,409,PCT 409/460/464/487 0257,2547,0,0,0,Representative in Congress,District 25,140250,UnderVotes,,0,902,36
6192,PAL,Palm Beach,10481,11/06/2018,2018 General Election,1098,"1098, 1124",1008,0,0,0,United States Senator,,120000,Bill Nelson,DEM,113049868,70482,291
35259,DAD,Miami-Dade,10481,11/06/2018,2018 General Election,369,PCT 301/369 0173,0,0,0,0,Amendment No. 5: Supermajority Vote Required t...,"Authorize, or Raise State Taxes or Fees",900005,OverVotes,,0,901,0
1,SAR,Sarasota,10481,11/06/2018,2018 General Election,101,101,129,0,0,0,United States Senator,,120000,Bill Nelson,DEM,113049868,70482,63
24629,PAL,Palm Beach,10481,11/06/2018,2018 General Election,1370,1370,319,0,0,0,Palm Beach Soil and Water Conservation District,Group 1,871451,David E. Legg,NOP,112337790,72663,60
6278,LEO,Leon,10481,11/06/2018,2018 General Election,2509,2509 -- ALARM International Ministries,1844,0,0,0,Retention of Brad Thomas,First District Court of Appeal,520103,UnderVotes,,0,902,215
25946,PIN,Pinellas,10481,11/06/2018,2018 General Election,507,507,2134,0,0,0,Retention of Darryl C. Casanueva,Second District Court of Appeal,520202,OverVotes,,0,901,0
30390,BRO,Broward,10481,11/06/2018,2018 General Election,N009,N009,1224,0,0,0,Representative in Congress,District 23,140230,Don Endriss,NPA,102206295,70870,5
78661,DAD,Miami-Dade,10481,11/06/2018,2018 General Election,802,PCT 802 0512,1900,0,0,0,Amendment No. 7: First Responder and Military ...,,900007,No for Rejection,NOP,0,20,477
45988,PAL,Palm Beach,10481,11/06/2018,2018 General Election,3076,"3052, 3058, 3060, 3066, 3076",165,0,0,0,Circuit Judge,"15th Judicial Circuit, Group 13",551513,Scott Ryan Kerner,NOP,112074024,70225,31


## Step 2: Removing Duplicates

In [47]:
# Count the rows that exactly repeat an earlier row across every column
df.duplicated().sum()

212

Looks like we have some dupes! Let's take a look and just make sure we're good to drop them.

In [48]:
# Collect every member of each duplicate group (keep=False flags all copies, not
# only the repeats) so the groups can be inspected before anything is dropped.
# duplicated() already returns a boolean mask, so no `== True` comparison is needed.
df_dupe = df[df.duplicated(keep=False)]

In [49]:
# Which precincts do the duplicated rows belong to?
df_dupe.precinct_id.unique()

array([463, 658], dtype=object)

In [50]:
# Keep only the duplicated rows whose vote total is nonzero, then preview them
df_dupe_vt = df_dupe.loc[df_dupe['vote_total'].ne(0)]
df_dupe_vt.head(5)

Unnamed: 0,county_code,county_name,elec_num,elec_date,elec_name,precinct_id,poll_loc,total_reg,total_reg_r,total_reg_d,total_reg_other,contest_name,district,contest_code,cand_or_issue,cand_party,cand_id,doe_num,vote_total


Upon closer investigation, it looks like all of the dupes are from two precincts in Miami-Dade, and none of the rows register any vote totals -- safe to drop!

In [51]:
# Remove the repeated rows, retaining the first occurrence of each
df = df.drop_duplicates(keep='first')

## Step 3: Removing Extraneous Columns
Having already cleaned the 2012, 2014, and 2016 data, I know that there are a handful of columns we can remove right off the bat:

In [52]:
# Drop the five columns already identified as extraneous
df = df.drop(
    labels=['total_reg_r', 'total_reg_d', 'total_reg_other', 'elec_num', 'elec_name'],
    axis='columns',
)

In [53]:
# Re-inspect the frame to confirm the column drop and the reduced row count
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 742881 entries, 0 to 1343
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   county_code    742881 non-null  object
 1   county_name    742881 non-null  object
 2   elec_date      742881 non-null  object
 3   precinct_id    742881 non-null  object
 4   poll_loc       732806 non-null  object
 5   total_reg      742881 non-null  int64 
 6   contest_name   742881 non-null  object
 7   district       742881 non-null  object
 8   contest_code   742881 non-null  int64 
 9   cand_or_issue  742881 non-null  object
 10  cand_party     742881 non-null  object
 11  cand_id        742881 non-null  int64 
 12  doe_num        742881 non-null  int64 
 13  vote_total     742881 non-null  int64 
dtypes: int64(5), object(9)
memory usage: 85.0+ MB


## Step 4: Standardizing Contest Names

Since we're only interested in six races (President, Governor, U.S. Senate, State Senate, U.S. Representative, and State Representative), we need to comb through this list and identify any entries in `contest_name` that refer to one of those races. We could use regex string matching, but I'll do it manually to make sure we don't miss an unusual description from an individual county that a pattern wouldn't anticipate.

In [54]:
# List every distinct contest name so the relevant races can be picked out by hand
df.contest_name.unique()

array(['United States Senator', 'Representative in Congress', 'Governor',
       'Attorney General', 'Chief Financial Officer',
       'Commissioner of Agriculture', 'State Senator',
       'County Commissioner', 'Retention of Alan Lawson',
       'Retention of Harvey Jay', 'Retention of Stephanie Ray',
       'Retention of Brad Thomas', 'Retention of Kemmerly Thomas',
       'Retention of Allen Winsor', 'Circuit Judge', 'County Court Judge',
       'School Board', 'Alachua Soil and Water Conservation District',
       'Amendment No. 1: Increased Homestead Property Tax Exemption',
       'Amendment No. 2: Limitations on Property Tax Assessments',
       'Amendment No. 3: Voter Control of Gambling in Florida',
       'Amendment No. 4: Voting Restoration Amendment',
       'Amendment No. 5: Supermajority Vote Required to Impose, Authorize, or Raise State Taxes or Fees',
       'Amendment No. 6: Rights of Crime Victims; Judges',
       'Amendment No. 7: First Responder and Military Member

Above, we can see that 1) there are a lot of races represented in the data and 2) they look like they're finally standardized across the state! This makes things much easier, but for the sake of sticking to a standardized process, I'll go through the above set of contest names. I have extracted the ones that correspond to races we're interested in. Using dictionaries, we can standardize the naming convention for our races.

In [55]:
# Map every known spelling of a target race onto one canonical contest name.
# Names that are not listed here are left exactly as they are.
df['contest_name'] = df['contest_name'].replace({
    # President
    'PRESIDENT OF THE UNITED STATES': 'President of the United States',
    # U.S. Senate
    'United States Senator': 'U.S. Senator',
    'UNITED STATES SENATOR': 'U.S. Senator',
    # U.S. House
    'Congress 10': 'U.S. Representative',
    'Congress 9': 'U.S. Representative',
    'Congress 15': 'U.S. Representative',
    'Congress 17': 'U.S. Representative',
    'Representative in Congress': 'U.S. Representative',
    'U.S. REPRESENTATIVE': 'U.S. Representative',
    'REPRESENTATIVE IN CONGRESS': 'U.S. Representative',
    # Governor
    'GOVERNOR AND  LT.GOVERNOR': 'Governor and Lieutenant Governor',
    'Governor': 'Governor and Lieutenant Governor',
    # State Senate
    'STATE SENATOR': 'State Senator',
    'Senate 14': 'State Senator',
    # State House
    'STATE REPRESENTATIVE': 'State Representative',
    'House 39': 'State Representative',
    'House 40': 'State Representative',
    'House 41': 'State Representative',
    'House 42': 'State Representative',
})



## Step 5: Narrowing to Races
And now we can more confidently narrow to our six contest types:

In [56]:
# Keep only the rows for the six standardized contest types.
# NOTE(review): reset_index() without drop=True keeps the previous row labels as a
# new `index` column -- confirm that column is wanted downstream.
df = (
    df.loc[
        df['contest_name'].isin([
            'President of the United States',
            'U.S. Senator',
            'U.S. Representative',
            'Governor and Lieutenant Governor',
            'State Senator',
            'State Representative',
        ])
    ]
    .reset_index()
)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137322 entries, 0 to 137321
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   index          137322 non-null  int64 
 1   county_code    137322 non-null  object
 2   county_name    137322 non-null  object
 3   elec_date      137322 non-null  object
 4   precinct_id    137322 non-null  object
 5   poll_loc       135287 non-null  object
 6   total_reg      137322 non-null  int64 
 7   contest_name   137322 non-null  object
 8   district       137322 non-null  object
 9   contest_code   137322 non-null  int64 
 10  cand_or_issue  137322 non-null  object
 11  cand_party     137322 non-null  object
 12  cand_id        137322 non-null  int64 
 13  doe_num        137322 non-null  int64 
 14  vote_total     137322 non-null  int64 
dtypes: int64(6), object(9)
memory usage: 15.7+ MB


## Step 6: Dealing With NaN Cells

In [57]:
# Collect every row that has a missing value in at least one column
df_nan = df.loc[df.isnull().any(axis='columns')]

df_nan

Unnamed: 0,index,county_code,county_name,elec_date,precinct_id,poll_loc,total_reg,contest_name,district,contest_code,cand_or_issue,cand_party,cand_id,doe_num,vote_total
26206,4956,DAD,Miami-Dade,11/06/2018,100,,3951,U.S. Senator,,120000,Rick Scott,REP,103093132,71039,199
26207,4957,DAD,Miami-Dade,11/06/2018,100,,3951,U.S. Senator,,120000,Bill Nelson,DEM,113049868,70482,820
26208,4958,DAD,Miami-Dade,11/06/2018,100,,3951,U.S. Senator,,120000,WriteinVotes,,0,900,2
26209,4959,DAD,Miami-Dade,11/06/2018,100,,3951,U.S. Senator,,120000,OverVotes,,0,901,0
26210,4960,DAD,Miami-Dade,11/06/2018,100,,3951,U.S. Senator,,120000,UnderVotes,,0,902,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49870,26239,DUV,Duval,11/06/2018,1415,,2261,Governor and Lieutenant Governor,,160000,UnderVotes,,0,902,10
49871,26258,DUV,Duval,11/06/2018,1415,,2261,State Representative,District 15,260150,Wyman Duggan,REP,103393517,69999,605
49872,26259,DUV,Duval,11/06/2018,1415,,2261,State Representative,District 15,260150,Tracye Ann Polson,DEM,118326666,70061,970
49873,26260,DUV,Duval,11/06/2018,1415,,2261,State Representative,District 15,260150,OverVotes,,0,901,0


It looks like the only nulls in this dataset are in the `poll_loc` column. That's great -- we can just leave those in for the time being.

## Step 7: Check for Completeness
Finally, let's just double check to make sure all counties and races are present in our final output:

In [58]:
# List the distinct counties that survived the cleaning steps
counties = df['county_name'].unique()
print(counties)
print(len(counties))
# All counties are present and accounted for
# (enforced here so a missing county fails loudly instead of relying on a visual check)
assert len(counties) == 67, f"expected 67 counties, found {len(counties)}"

['Alachua' 'Baker' 'Bay' 'Bradford' 'Brevard' 'Broward' 'Calhoun'
 'Charlotte' 'Citrus' 'Clay' 'Collier' 'Columbia' 'Miami-Dade' 'Desoto'
 'Dixie' 'Duval' 'Escambia' 'Flagler' 'Franklin' 'Gadsden' 'Gilchrist'
 'Glades' 'Gulf' 'Hamilton' 'Hardee' 'Hendry' 'Hernando' 'Highlands'
 'Hillsborough' 'Holmes' 'Indian River' 'Jackson' 'Jefferson' 'Lafayette'
 'Lake' 'Lee' 'Leon' 'Levy' 'Liberty' 'Madison' 'Manatee' 'Monroe'
 'Marion' 'Martin' 'Nassau' 'Okaloosa' 'Okeechobee' 'Orange' 'Osceola'
 'Palm Beach' 'Pasco' 'Pinellas' 'Polk' 'Putnam' 'Santa Rosa' 'Sarasota'
 'Seminole' 'St. Johns' 'St. Lucie' 'Sumter' 'Suwannee' 'Taylor' 'Union'
 'Volusia' 'Wakulla' 'Walton' 'Washington']
67


All 67 counties are represented in the dataset, so this looks complete.

In [59]:
# List the distinct contests that remain after filtering
races = df['contest_name'].unique()
print(races)
print(len(races))
# 2018 had no presidential contest, so exactly these five races should be present;
# fail loudly if a race is missing or an unexpected name slipped through.
assert set(races) == {
    'U.S. Senator',
    'U.S. Representative',
    'Governor and Lieutenant Governor',
    'State Senator',
    'State Representative',
}, f"unexpected set of races: {sorted(races)}"

['U.S. Senator' 'U.S. Representative' 'Governor and Lieutenant Governor'
 'State Senator' 'State Representative']
5


2018 had no presidential contest, so this looks complete.

## Step 8: Save to CSV

In [60]:
# NOTE(review): the export is currently disabled; uncomment the line below to write
# the cleaned frame to disk. As written it would also write the row index and the
# `index` column -- confirm whether those are wanted in the CSV.
# df.to_csv('fl_2018_cleaned.csv')