In [231]:
import numpy as np
import pandas as pd
import matplotlib as mp
import csv
import os
import glob

## Step 1: Reformat and Ingest Data

In [232]:
# Combine county files into single dataframe
def merge_files(directory):
    """Read every file matching a glob pattern and stack the rows into one DataFrame.

    Parameters
    ----------
    directory : str
        Despite the name, this is a glob pattern (it is passed straight to
        ``glob.glob``), e.g. ``"path/to/folder/*"``.

    Returns
    -------
    pandas.DataFrame
        Rows of all matched files, in sorted-path order, with the 19 column
        names below. Row labels are NOT reset, so each file's rows keep their
        own 0..n-1 labels. If nothing matches the pattern, an empty frame
        with the same 19 columns is returned.
    """
    column_names = ['county_code', 'county_name', 'elec_num', 'elec_date', 'elec_name', 'precinct_id', 'poll_loc', 'total_reg', 'total_reg_r', 'total_reg_d', 'total_reg_other', 'contest_name', 'district', 'contest_code', 'cand_or_issue', 'cand_party', 'cand_id', 'doe_num', 'vote_total']
    # Sort the matches so the row order does not depend on filesystem listing order.
    target_files = sorted(glob.glob(directory))
    # Read everything first and concatenate once; concatenating inside the loop
    # re-copies all previously loaded rows on every iteration.
    frames = [
        pd.read_table(file, names=column_names, encoding_errors='replace')
        for file in target_files
    ]
    if not frames:
        # pd.concat([]) raises, so hand back an empty frame that still has the expected columns.
        return pd.DataFrame(columns=column_names)
    return pd.concat(frames)

In [233]:
# Glob pattern for the per-county precinct files. The default is the original
# hard-coded local path; set the FL_2014_PRECINCT_GLOB environment variable to
# point at the data elsewhere without editing the notebook.
location = os.environ.get(
    "FL_2014_PRECINCT_GLOB",
    "C:\\Users\\canor\\Documents\\GitHub\\FL-Political-Analysis\\Florida Analysis\\FL 2014 by Precinct\\*",
)

df = merge_files(location)

# Fail here with a clear message rather than letting later cells break on an empty frame.
if df.empty:
    raise FileNotFoundError(f"No precinct rows were loaded from {location!r}")

In [234]:
# Summarize the combined frame: entry count, per-column non-null counts (to spot NaNs), and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 385323 entries, 0 to 1119
Data columns (total 19 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   county_code      385323 non-null  object 
 1   county_name      385323 non-null  object 
 2   elec_num         385323 non-null  int64  
 3   elec_date        385323 non-null  object 
 4   elec_name        385323 non-null  object 
 5   precinct_id      385323 non-null  object 
 6   poll_loc         340897 non-null  object 
 7   total_reg        385323 non-null  int64  
 8   total_reg_r      385323 non-null  int64  
 9   total_reg_d      385323 non-null  int64  
 10  total_reg_other  385323 non-null  int64  
 11  contest_name     385323 non-null  object 
 12  district         349577 non-null  object 
 13  contest_code     385323 non-null  int64  
 14  cand_or_issue    385323 non-null  object 
 15  cand_party       365969 non-null  object 
 16  cand_id          380165 non-null  float6

In [235]:
# Preview ten random rows; the fixed seed keeps the preview identical on every Restart & Run All
df.sample(10, random_state=0)

Unnamed: 0,county_code,county_name,elec_num,elec_date,elec_name,precinct_id,poll_loc,total_reg,total_reg_r,total_reg_d,total_reg_other,contest_name,district,contest_code,cand_or_issue,cand_party,cand_id,doe_num,vote_total
32208,BRO,Broward,10218,11/04/2014,2014 General Election,V024,V024 454,1950,0,0,0,Commissioner of Agriculture,,160800,Adam Putnam,REP,113549963.0,60369,325
5076,POL,Polk,10218,11/04/2014,2014 General Election,243,SHEPHERDS COMMUNITY UNITED METHODIST 2165 SHEP...,4671,2008,1492,1171,Amendment 3,,900300,UnderVotes,,0.0,902,169
2812,BRE,Brevard,10218,11/04/2014,2014 General Election,133,,0,0,0,0,Governor and Lieutenant Governor,,160000,Scott / Lopez-Cantera,REP,103093132.0,61253,0
6982,SEM,Seminole,10218,11/04/2014,2014 General Election,78,Precinct 78,266005,105244,88123,72638,Circuit Judge,Group 13,551813,Number of Under Votes,NP,,902,0
147,GAD,Gadsden,10218,11/04/2014,2014 General Election,3,Pre. 3 Concord Fire Dept,1031,0,0,0,Attorney General,,160200,Pam Bondi,REP,110471335.0,60487,305
242,CLM,Columbia,10218,11/04/2014,2014 General Election,112,112 Supv Elections Office,552,0,0,0,Retention of Clay Roberts,First District Court of Appeal,520105,Yes,NOP,105003932.0,10,154
2213,BRE,Brevard,10218,11/04/2014,2014 General Election,126,,0,0,0,0,Attorney General,,160200,Bill Wohlsifer,LPF,111846393.0,61259,0
29356,BRO,Broward,10218,11/04/2014,2014 General Election,T023,T023 412,1293,0,0,0,Broward Soil and Water Conservation District,Group 4,830064,OverVotes,,0.0,901,0
560,LEO,Leon,10218,11/04/2014,2014 General Election,1301,1301-1302 -- Lincoln Neighborhood Ctr,980,0,0,0,Chief Financial Officer,,160500,"William ""Will"" Rankin",DEM,114807511.0,61112,336
4446,PAS,Pasco,10218,11/04/2014,2014 General Election,75,075 - MAGNOLIA VALLEY 74,3215,0,0,0,County Commissioner,District 2,380020,UnderVotes,,0.0,902,70


## Step 2: Removing Duplicates

In [236]:
# Count rows that exactly repeat an earlier row across all columns
df.duplicated().sum()

0

No dupes! Great!

## Step 3: Removing Extraneous Columns

In [237]:
# Remove the columns this notebook treats as extraneous.
# errors='ignore' keeps the cell safe to re-run once the columns are already gone
# (a plain drop would raise KeyError the second time).
df = df.drop(columns=['total_reg_r', 'total_reg_d', 'total_reg_other', 'elec_num', 'elec_name'], errors='ignore')
# Fixed seed so the preview is the same on every Restart & Run All
df.sample(5, random_state=0)

Unnamed: 0,county_code,county_name,elec_date,precinct_id,poll_loc,total_reg,contest_name,district,contest_code,cand_or_issue,cand_party,cand_id,doe_num,vote_total
30961,DAD,Miami-Dade,11/04/2014,626.0,PCT 626/648 0396,564,Governor and Lieutenant Governor,,160000,Scott / Lopez-Cantera,REP,103093132.0,61253,173
1551,CHA,Charlotte,11/04/2014,23.0,ENG BEACH VILLAS 0029,1064,Amendment No. 3: PROSPECTIVE APPOINTMENT OF CE...,,900300,No for Rejection,NOP,0.0,20,309
3451,POL,Polk,11/04/2014,314.0,1ST BAPTIST CHURCH OF LUCERNE PARK 5650 STATE ...,2969,2nd DCA - Sleet,Second District Court of Appeal,520203,No,NOP,110555691.0,20,426
5769,SEM,Seminole,11/04/2014,66.0,Precinct 66,266005,Commissioner of Agriculture,,160800,Write-in,NP,,900,2
13910,DAD,Miami-Dade,11/04/2014,304.0,PCT 304 0173,6749,Governor and Lieutenant Governor,,160000,Scott / Lopez-Cantera,REP,103093132.0,61253,637


## Step 4: Narrowing to Races

We're only interested in: Presidential, Gubernatorial, U.S. Congress, U.S. Senate, State House, and State Senate races.

In [238]:
# List every distinct contest name so the relevant races (and their spelling variants) can be picked out
df['contest_name'].unique()

array(['U.S. Representative', 'Governor and Lieutenant Governor',
       'Attorney General', 'Chief Financial Officer',
       'Commissioner of Agriculture', 'Tax Collector',
       'County Commissioner', 'Retention of Robert T. Benton',
       'Retention of Joseph Lewis, Jr.', 'Retention of Scott Makar',
       'Retention of Tim Osterhaus', 'Retention of Clay Roberts',
       'Amendment No. 1: Water and Land Conservation - Dedicates funds to acquire and restore Florida conservation and recreation lands',
       'Amendment No. 2: Use of Marijuana for Certain Medical Conditions',
       'Amendment No. 3: PROSPECTIVE APPOINTMENT OF CERTAIN JUDICIAL VACANCIES',
       'State Representative', 'Clerk of the Circuit Court',
       'School Board', 'Canaveral Port Authority',
       'Retention of Wendy W. Berger', 'Retention of Kerry I. Evander',
       'Retention of Charles Alan Lawson',
       'Retention of Richard B. Orfinger',
       'Retention of William David Palmer',
       'Retention o

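Above, we can see that 1) there are a lot of races represented in the data, and 2) different counties label the same race differently (e.g. 'U.S. REPRESENTATIVE' vs. 'Representative in Congress'), so we standardize those names before narrowing to the races we care about.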

In [242]:
# Canonical contest name -> the county-specific spellings that should collapse onto it
canonical_contests = {
    'U.S. Representative': ['Congress 10', 'Congress 9', 'Congress 15', 'Congress 17', 'Representative in Congress', 'U.S. REPRESENTATIVE'],
    'Governor and Lieutenant Governor': ['GOVERNOR AND  LT.GOVERNOR', 'Governor'],
    'State Representative': ['STATE REPRESENTATIVE', 'House 39', 'House 40', 'House 41', 'House 42'],
    'State Senator': ['STATE SENATOR', 'Senate 14'],
}

# Flatten to alias -> canonical so a single replace() call handles every variant
contest_aliases = {
    alias: canonical
    for canonical, aliases in canonical_contests.items()
    for alias in aliases
}

df['contest_name'] = df['contest_name'].replace(contest_aliases)


In [None]:
# Contests kept for this analysis
races_of_interest = [
    'U.S. Representative',
    'Governor and Lieutenant Governor',
    'State Representative',
    'State Senator',
]

is_relevant = df['contest_name'].isin(races_of_interest)
# reset_index() without drop=True keeps the previous row labels as a new 'index' column
df = df[is_relevant].reset_index()

df.info()

## Step 4b: Dealing With NaN Cells

In [244]:
# Collect every row that has a null in at least one column
has_null = df.isna().any(axis='columns')
df_nan = df.loc[has_null]
df_nan

Unnamed: 0,index,county_code,county_name,elec_date,precinct_id,poll_loc,total_reg,contest_name,district,contest_code,cand_or_issue,cand_party,cand_id,doe_num,vote_total
1987,0,BRE,Brevard,11/04/2014,100,,1914,U.S. Representative,District 8,140080,Bill Posey,REP,100928199.0,60309,730
1988,1,BRE,Brevard,11/04/2014,100,,1914,U.S. Representative,District 8,140080,Gabriel Rothblatt,DEM,119321677.0,60345,337
1989,2,BRE,Brevard,11/04/2014,100,,1914,U.S. Representative,District 8,140080,WriteinVotes,,0.0,900,2
1990,3,BRE,Brevard,11/04/2014,100,,1914,U.S. Representative,District 8,140080,OverVotes,,0.0,901,0
1991,4,BRE,Brevard,11/04/2014,100,,1914,U.S. Representative,District 8,140080,UnderVotes,,0.0,902,58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77037,7398,SEM,Seminole,11/04/2014,9,Precinct 9,266005,Governor and Lieutenant Governor,,160000,Rick Scott,REP,103093132.0,61253,848
77038,7399,SEM,Seminole,11/04/2014,9,Precinct 9,266005,Governor and Lieutenant Governor,,160000,Times Over Voted,NP,,901,0
77039,7400,SEM,Seminole,11/04/2014,9,Precinct 9,266005,Governor and Lieutenant Governor,,160000,Write-in,NP,,900,4
77041,7406,SEM,Seminole,11/04/2014,9,Precinct 9,266005,State Senator,District 10,240100,Number of Under Votes,NP,,902,0


In [245]:
# Pull out the rows with no party recorded, to verify they are only over/undervote tallies (which we can discard)
no_party = df['cand_party'].isna()
df_nan_cand_party = df.loc[no_party]
df_nan_cand_party

Unnamed: 0,index,county_code,county_name,elec_date,precinct_id,poll_loc,total_reg,contest_name,district,contest_code,cand_or_issue,cand_party,cand_id,doe_num,vote_total
70359,577,POL,Polk,11/04/2014,102,GREEN POND BAPTIST CHURCH 5995 GREENPOND CHURC...,1068,U.S. Representative,District 10,140100,OverVotes,,0.0,901,0
70360,578,POL,Polk,11/04/2014,103,POLO PARK 426 POLO PARK BLVD,3182,U.S. Representative,District 10,140100,OverVotes,,0.0,901,0
70361,579,POL,Polk,11/04/2014,104,POLO PARK EAST 525 POLO PARK EAST BLVD,2477,U.S. Representative,District 10,140100,OverVotes,,0.0,901,0
70362,580,POL,Polk,11/04/2014,105,RIDGE MOBILE HOME PARK CLUBHOUSE 49473 HIGHWAY 27,1098,U.S. Representative,District 10,140100,OverVotes,,0.0,901,0
70363,581,POL,Polk,11/04/2014,107,NEW LIFE COMMUNITY CHURCH 530 COMMONWEALTH AVE SW,3870,U.S. Representative,District 10,140100,OverVotes,,0.0,901,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72774,8677,POL,Polk,11/04/2014,410,CALVARY FIRST ASSEMBLY OF GOD 4550 E JOHNSON AVE,898,State Senator,District 14,240140,UnderVotes,,0.0,902,38
72775,8678,POL,Polk,11/04/2014,412,POINCIANA COMMUNITY CENTER 395 MARIGOLD AVE,6517,State Senator,District 14,240140,UnderVotes,,0.0,902,113
72776,8679,POL,Polk,11/04/2014,419,TUSCANY PRESERVE AT LAKE MARION 1850 PACIFIC ROAD,4959,State Senator,District 14,240140,UnderVotes,,0.0,902,94
72777,8680,POL,Polk,11/04/2014,422,SUNLAKE TERRACE ESTATES 6555-16 OLD LAKE WILSO...,208,State Senator,District 14,240140,UnderVotes,,0.0,902,2


In [246]:

# List the distinct cand_or_issue labels found among the null-party rows
df_nan_cand_party['cand_or_issue'].unique()

array(['OverVotes', 'UnderVotes'], dtype=object)

Looks like rows containing NaNs in cand_party are either rows describing overvotes and undervotes, or pertain to nonpartisan races that are not relevant to this analysis. Let's double check that latter assumption, and then drop those rows:

In [247]:
# Contests of any null-party rows that are NOT over/undervotes; an empty array means there are none
df_nan_cand_party[~df_nan_cand_party['cand_or_issue'].isin(['UnderVotes', 'OverVotes'])]['contest_name'].unique()

array([], dtype=object)

Assumptions confirmed.

In [248]:
# Record entry and non-null counts before the null-party rows are dropped
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82603 entries, 0 to 82602
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          82603 non-null  int64  
 1   county_code    82603 non-null  object 
 2   county_name    82603 non-null  object 
 3   elec_date      82603 non-null  object 
 4   precinct_id    82603 non-null  object 
 5   poll_loc       73261 non-null  object 
 6   total_reg      82603 non-null  int64  
 7   contest_name   82603 non-null  object 
 8   district       76582 non-null  object 
 9   contest_code   82603 non-null  int64  
 10  cand_or_issue  82603 non-null  object 
 11  cand_party     81637 non-null  object 
 12  cand_id        81876 non-null  float64
 13  doe_num        82603 non-null  int64  
 14  vote_total     82603 non-null  int64  
dtypes: float64(1), int64(5), object(9)
memory usage: 9.5+ MB


In [249]:
# Drop every row whose cand_party is null (found above to be OverVotes/UnderVotes rows only)
# NOTE(review): over/undervote rows that carry a non-null party value are not removed by this --
# confirm whether those should be filtered as well (e.g. on cand_or_issue or doe_num).
df = df.dropna(subset=['cand_party'])
# Re-check the counts: cand_party should now have no nulls
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81637 entries, 0 to 82602
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          81637 non-null  int64  
 1   county_code    81637 non-null  object 
 2   county_name    81637 non-null  object 
 3   elec_date      81637 non-null  object 
 4   precinct_id    81637 non-null  object 
 5   poll_loc       72295 non-null  object 
 6   total_reg      81637 non-null  int64  
 7   contest_name   81637 non-null  object 
 8   district       75950 non-null  object 
 9   contest_code   81637 non-null  int64  
 10  cand_or_issue  81637 non-null  object 
 11  cand_party     81637 non-null  object 
 12  cand_id        80910 non-null  float64
 13  doe_num        81637 non-null  int64  
 14  vote_total     81637 non-null  int64  
dtypes: float64(1), int64(5), object(9)
memory usage: 10.0+ MB


Looks like we still have nulls in `poll_loc` (which is fine), `district`, and `cand_id` -- let's take a look at those.

In [250]:
# Isolate the rows whose district value is missing and preview the first five
missing_district = df['district'].isna()
df_nan_district = df.loc[missing_district]
df_nan_district.head()

Unnamed: 0,index,county_code,county_name,elec_date,precinct_id,poll_loc,total_reg,contest_name,district,contest_code,cand_or_issue,cand_party,cand_id,doe_num,vote_total
57918,1477,PAL,Palm Beach,11/4/2014,1002,1002,0,Governor and Lieutenant Governor,,160000,Adrian Wyllie,LPF,106930498.0,60196,0
57919,1478,PAL,Palm Beach,11/4/2014,1004,1004,2,Governor and Lieutenant Governor,,160000,Adrian Wyllie,LPF,106930498.0,60196,0
57920,1479,PAL,Palm Beach,11/4/2014,1006,1006,0,Governor and Lieutenant Governor,,160000,Adrian Wyllie,LPF,106930498.0,60196,0
57921,1480,PAL,Palm Beach,11/4/2014,1008,1008,0,Governor and Lieutenant Governor,,160000,Adrian Wyllie,LPF,106930498.0,60196,0
57922,1481,PAL,Palm Beach,11/4/2014,1010,1010,0,Governor and Lieutenant Governor,,160000,Adrian Wyllie,LPF,106930498.0,60196,0


This is a little weird. Looks like we have NaNs in the district column for Governor and Lt. Governor candidates, in rows where vote tallies are indeed present. Is this just for the libertarian party candidate (as we can see in the above head window)? Let's check.

In [251]:
# Is this only present for the Governor's race? Yes -- it is the only contest with a null district.
df_nan_district['contest_name'].unique()

array(['Governor and Lieutenant Governor'], dtype=object)

In [252]:
# Is this only present in Palm Beach? No -- it shows up in more than one county.
df_nan_district['county_name'].unique()

array(['Palm Beach', 'Polk', 'Seminole'], dtype=object)

In [253]:
# Is this only present for LPF party candidates? No -- several parties are affected.
df_nan_district['cand_party'].unique()

array(['LPF', 'DEM', 'NPA', 'REP', 'NP'], dtype=object)

It looks like the only races that we're interested in that have null values in the `district` column are statewide and don't really have a district anyway. We'll just replace these NaNs with blank cells and move on.

In [254]:
# We can fill NaNs in this column for rows for statewide races with a blank to be consistent with other dataframes.
# The filled column is assigned back instead of calling fillna(inplace=True) on df['district']:
# the in-place form operates on a temporary Series, which pandas warns about and which
# leaves df unchanged when Copy-on-Write is enabled.
df = df.assign(district=df['district'].fillna(''))
# Fixed seed so the preview is the same on every Restart & Run All
df.sample(5, random_state=0)

Unnamed: 0,index,county_code,county_name,elec_date,precinct_id,poll_loc,total_reg,contest_name,district,contest_code,cand_or_issue,cand_party,cand_id,doe_num,vote_total
42355,2899,LEE,Lee,11/04/2014,33,POLL 033,2494,U.S. Representative,District 19,140190,UnderVotes,,0.0,902,46
41083,4131,LAK,Lake,11/04/2014,48,48 0048,2729,U.S. Representative,District 10,140100,Michael McKenna,DEM,120752577.0,60384,525
9156,20460,BRO,Broward,11/04/2014,Q042,Q042 287,1451,Governor and Lieutenant Governor,,160000,Scott / Lopez-Cantera,REP,103093132.0,61253,369
52160,7896,ORA,Orange,11/04/2014,308,Precinct 308 83,2199,Governor and Lieutenant Governor,,160000,UnderVotes,,0.0,902,10
77304,1268,STJ,St. Johns,11/04/2014,203,Heritage Landing,4462,Governor and Lieutenant Governor,,160000,Burkett / Matos,NPA,108852312.0,63420,5


In [255]:
# Confirm that district no longer has nulls after the fill
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81637 entries, 0 to 82602
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          81637 non-null  int64  
 1   county_code    81637 non-null  object 
 2   county_name    81637 non-null  object 
 3   elec_date      81637 non-null  object 
 4   precinct_id    81637 non-null  object 
 5   poll_loc       72295 non-null  object 
 6   total_reg      81637 non-null  int64  
 7   contest_name   81637 non-null  object 
 8   district       81637 non-null  object 
 9   contest_code   81637 non-null  int64  
 10  cand_or_issue  81637 non-null  object 
 11  cand_party     81637 non-null  object 
 12  cand_id        80910 non-null  float64
 13  doe_num        81637 non-null  int64  
 14  vote_total     81637 non-null  int64  
dtypes: float64(1), int64(5), object(9)
memory usage: 10.0+ MB


Okay, we're at a point where the only remaining nulls are in `cand_id` and `poll_loc`, neither of which is really vital. We can move on from filling in NaNs.

## Step 5: Standardizing Contest Names

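Earlier we saw that different counties recorded the Governor's race (and a few other statewide races, but we're not interested in those) slightly differently. Those names were already standardized in Step 4, so here we just confirm that only the expected contest names remain.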

In [256]:
# Show the distinct contest names that remain, followed by how many there are
races = df['contest_name'].unique()
print(races, len(races), sep='\n')

['U.S. Representative' 'Governor and Lieutenant Governor'
 'State Representative' 'State Senator']
4


In [257]:
# def select_races(df):
#     df = df[df.contest_name.isin(['Governor',\
#                                 'United States Senator',\
#                                 'U.S. REPRESENTATIVE',\
#                                 'Representative in Congress',\
#                                 'GOVERNOR AND  LT.GOVERNOR',\
#                                 'STATE SENATOR',\
#                                 'STATE REPRESENTATIVE',\
#                                 'Governor and Lieutenant Governor',\
#                                 'U.S. Representative',\
#                                 'State Senator',\
#                                 'State Representative'])]\
#                                 .reset_index()
#     return df

In [258]:
# df_narrowed = select_races(df)
# df_narrowed.info()

## Step (Final): Check for completeness
Finally, let's just double check to make sure all counties and races are present in our final output:

In [259]:
counties = df['county_name'].unique()
print(counties)
print(len(counties))
# All 67 Florida counties should be present; fail loudly if any dropped out during cleaning
assert len(counties) == 67, f"expected 67 counties, found {len(counties)}"

['Alachua' 'Baker' 'Bay' 'Bradford' 'Brevard' 'Broward' 'Calhoun'
 'Charlotte' 'Citrus' 'Clay' 'Collier' 'Columbia' 'Miami-Dade' 'Desoto'
 'Dixie' 'Duval' 'Escambia' 'Flagler' 'Franklin' 'Gadsden' 'Gilchrist'
 'Glades' 'Gulf' 'Hamilton' 'Hardee' 'Hendry' 'Hernando' 'Highlands'
 'Hillsborough' 'Holmes' 'Indian River' 'Jackson' 'Jefferson' 'Lafayette'
 'Lake' 'Lee' 'Leon' 'Levy' 'Liberty' 'Madison' 'Manatee' 'Monroe'
 'Marion' 'Martin' 'Nassau' 'Okaloosa' 'Okeechobee' 'Orange' 'Osceola'
 'Palm Beach' 'Pasco' 'Pinellas' 'Polk' 'Putnam' 'Santa Rosa' 'Sarasota'
 'Seminole' 'St. Johns' 'St. Lucie' 'Sumter' 'Suwannee' 'Taylor' 'Union'
 'Volusia' 'Wakulla' 'Walton' 'Washington']
67
