In [1]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [2]:
# Raw input files, relative to the notebook's working directory.
house_precinct_csv = "data/raw/HOUSE_precinct_general.csv"  # precinct-level HOUSE returns
dime_csv = "data/raw/dime.csv"  # DIME candidate table

In [16]:
# Column dtypes for the HOUSE precinct file (taken from its documentation).
# Everything is read as text except the three count-like columns below.
_house_text_columns = [
    'precinct', 'office', 'party_detailed', 'party_simplified', 'mode',
    'county_name', 'county_fips', 'jurisdiction_name', 'jurisdiction_fips',
    'candidate', 'district', 'dataverse', 'stage', 'state', 'special',
    'writein', 'state_po', 'state_fips', 'state_cen', 'state_ic', 'date',
    'readme_check',
]
_house_integer_columns = ['votes', 'year', 'magnitude']
official_dtypes = {
    **dict.fromkeys(_house_text_columns, str),
    **dict.fromkeys(_house_integer_columns, int),
}

# Load the precinct-level HOUSE returns with the documented column dtypes.
house = pd.read_csv(filepath_or_buffer=house_precinct_csv, dtype=official_dtypes)

In [17]:
# Keep only rows whose postal abbreviation is one of the states of interest.
states = ['MI', 'MN', 'PA', 'WI']
in_selected_state = house['state_po'].isin(states)
house = house.loc[in_selected_state]

In [18]:
# Keep only rows for the US House office.
house = house.loc[house['office'].eq('US HOUSE')]

In [19]:
# Drop rows that report exactly zero votes.
house = house.loc[house['votes'].ne(0)]

In [20]:
# Drop the rows whose jurisdiction is the "{STATISTICAL ADJUSTMENTS}" placeholder.
house = house.loc[house['jurisdiction_name'].ne("{STATISTICAL ADJUSTMENTS}")]

In [21]:
# Michigan additionally reports absentee votes as separate rows (mode == 'ABSENTEE').
# It is unclear how those ballots map back to a precinct, so they are dropped here.
# TODO: check how excluding them affects the analysis.
house = house.loc[house['mode'].ne('ABSENTEE')]

In [22]:
# Build the key used to match the `district` column in DIME: state postal code
# followed by the district string with its first character removed
# (presumably a 3-character zero-padded district such as '001' -> '01'; verify).
# `assign` returns a new frame, so the column is not written into a filtered
# slice of the original data (which raises SettingWithCopyWarning).
house = house.assign(
    state_po_district=house['state_po'] + house['district'].str[1:]
)

In [23]:
# Split Michigan precinct strings into a precinct code and a ward code.
# Rows outside Michigan get NaN in both new columns.
michigan = house['state'] == "MICHIGAN"

# Split on the FIRST comma only and force exactly two columns (0 and 1), so the
# result has the same shape whether a precinct has no comma, one, or several.
mi_parts = (
    house.loc[michigan, 'precinct']
    .str.split(',', n=1, expand=True)
    .reindex(columns=[0, 1])
    .astype(object)
)

house = house.assign(
    # Left-pad the precinct part with zeros to at least 3 characters.
    precinct_code=mi_parts[0].apply('{0:0>3}'.format),
    # Remove the literal 'WARD' tag, trim, default a missing ward to '0',
    # then left-pad with zeros to at least 2 characters.
    ward_code=(
        mi_parts[1]
        .str.replace('WARD', '', regex=False)
        .str.strip()
        .fillna('0')
        .apply('{0:0>2}'.format)
    ),
)



In [29]:
# TODO: precinct codes containing letters or dashes (e.g. '0-22L', '1-A', '1-B')
# still need to be checked.
michigan_precinct_codes = house.loc[michigan, 'precinct_code']
michigan_precinct_codes.unique()

array(['001', '010', '011', '012', '013', '014', '015', '016', '018',
       '019', '002', '020', '021', '003', '302', '304', '305', '306',
       '004', '005', '501', '006', '007', '008', '009', '017', '022',
       '023', '024', '025', '026', '027', '028', '029', '030', '031',
       '032', '033', '034', '035', '036', '037', '038', '039', '040',
       '041', '042', '043', '044', '045', '046', '047', '048', '049',
       '050', '051', '052', '053', '054', '055', '056', '057', '058',
       '059', '060', '061', '062', '063', '064', '065', '066', '067',
       '068', '069', '070', '071', '072', '073', '074', '075', '076',
       '077', '0-22L', '0-23', '1-W', '3-S', '1-A', '1-B', '10-A', '11-A',
       '12-A', '13-A', '14-A', '15-A', '16-A', '16-B', '17-A', '17-B',
       '18-A', '19-A', '19-B', '2-A', '20-A', '21-A', '22-A', '22-B',
       '23-A', '23-B', '24-A', '24-B', '25-A', '3-A', '3-B', '31-A',
       '31-B', '32-A', '33-A', '34-A', '34-B', '34-C', '35-A', '35-B',
       '35-C',

In [26]:
# Michigan precinct id = county FIPS + ward code + precinct code (string concatenation).
house.loc[michigan, 'precinct_id'] = (
    house.loc[michigan, 'county_fips']
    .add(house.loc[michigan, 'ward_code'])
    .add(house.loc[michigan, 'precinct_code'])
)

356394    2604300001
356395    2613100001
356396    2600300001
356397    2609700001
356398    2603300001
             ...    
414097    2616300095
414098    2616300095
414129    2616300096
414130    2616300096
414131    2616300096
Name: precinct_id, Length: 15914, dtype: object

In [None]:
# Votes per candidate within each (state, district, precinct) ...
total_by_prec_cand = (
    house
    .groupby(['state_po', 'state_po_district', 'precinct', 'candidate'])['votes']
    .sum()
    .reset_index()
)
# ... and the overall vote total of each (state, district, precinct).
total_by_prec = (
    total_by_prec_cand
    .groupby(['state_po', 'state_po_district', 'precinct'])['votes']
    .sum()
    .reset_index()
    .rename(columns={'votes': 'total_votes'})
)

In [None]:
# Display the per-precinct vote totals computed in the previous cell.
total_by_prec

In [None]:
# Votes per candidate within each (state, district, county) ...
total_by_county_cand = (
    house
    .groupby(['state_po', 'state_po_district', 'county_fips', 'candidate'])['votes']
    .sum()
    .reset_index()
)
# ... and the overall vote total of each (state, district, county).
total_by_county = (
    total_by_county_cand
    .groupby(['state_po', 'state_po_district', 'county_fips'])['votes']
    .sum()
    .reset_index()
    .rename(columns={'votes': 'total_votes'})
)

In [None]:
# Attach the county total to every candidate row and compute the vote share.
result = pd.merge(total_by_county_cand, total_by_county, on = ['state_po', 'state_po_district', 'county_fips'])
result['pct_vote'] = result['votes']/result['total_votes']

# Normalised candidate name used for fuzzy matching against DIME:
# lower-case, periods removed, surrounding whitespace trimmed.
# regex=False makes '.' a literal period on every pandas version; the default
# of `regex` changed between pandas releases.
result['candidate_house'] = (
    result['candidate']
    .str.lower()
    .str.replace('.', '', regex=False)
    .str.strip()
)

In [None]:
#DIME
dime = pd.read_csv(dime_csv)
# sort_values returns a new frame; the result has to be assigned for the sort
# to take effect. Index labels are preserved, so later index-aligned
# operations on `dime` are unaffected.
dime = dime.sort_values(by=['district', 'name'])
# Drop columns that are not used in the rest of the notebook.
dime = dime.drop(columns=['cycle', 'seat', 'ran_general', 'winner', 'recipient_type'])


In [None]:
# Split DIME's `name` at the first comma only: column 0 holds the text before
# the comma, column 1 everything after it.
split = dime['name'].str.split(pat=",", n=1, expand=True)

In [None]:
# Change party number codes to letter codes.
# The column is cast to object first: writing strings into a numeric column
# is deprecated in recent pandas and is slated to raise.
party_letters = {100: 'D', 200: 'R', 328: 'I'}
dime['party'] = dime['party'].astype(object)
for party_code, party_letter in party_letters.items():
    dime.loc[dime['party'] == party_code, 'party'] = party_letter

In [None]:
#Create new candidate name to match with HOUSE df, and new id column to match with census
# candidate_dime = "<ffname> <text before the first comma of name>", with
# periods removed, the standalone token "mr" removed and whitespace trimmed.
# The word-boundary pattern keeps names that merely contain the letters "mr"
# intact; a plain substring replace would mangle them.
dime['candidate_dime'] = (
    (dime['ffname'] + " " + split[0])
    .str.replace('.', '', regex=False)
    .str.replace(r'\bmr\b', '', regex=True)
    .str.strip()
)

# census_match = party + first three characters of name + state.
# NOTE(review): this concatenation needs every `party` value to be a string;
# party codes other than 100/200/328 are left unmapped above - confirm none occur.
dime['census_match'] = dime['party'] + dime['name'].astype(str).str[:3] + dime['state']
dime

In [None]:
def checker(wrong_options,correct_options):
    """Match every name in `wrong_options` to its best candidate in `correct_options`.

    Parameters
    ----------
    wrong_options : iterable of str
        Names to look up.
    correct_options : list of str
        Reference names to match against.

    Returns
    -------
    (names_array, ratio_array) : tuple of two lists, aligned with `wrong_options`
        names_array[i] is the matched reference name (None when there is
        nothing to match against); ratio_array[i] is the match score. Exact
        matches score the integer 100 and unmatched names score 0; otherwise
        the score is whatever `process.extractOne` reports with the
        `fuzz.token_set_ratio` scorer.
    """
    names_array = []
    ratio_array = []
    for wrong_option in wrong_options:
        # Exact hit: skip the fuzzy scorer entirely.
        if wrong_option in correct_options:
            names_array.append(wrong_option)
            # Numeric, like the fuzzy scores, so the list is not mixed str/int.
            ratio_array.append(100)
            continue

        # Only call the fuzzy matcher when there is something to match against.
        best = None
        if len(correct_options) > 0:
            best = process.extractOne(wrong_option, correct_options, scorer=fuzz.token_set_ratio)

        if best is None:
            names_array.append(None)
            ratio_array.append(0)
        else:
            names_array.append(best[0])
            ratio_array.append(best[1])
    return names_array,ratio_array

In [None]:
# Minnesota: unique normalised HOUSE candidate names ...
result_MN = result.loc[result['state_po'].isin(['MN']), 'candidate_house'].drop_duplicates()
# ... and the DIME rows of the same state.
dime_MN = dime.loc[dime['state'].isin(['MN'])]

str2Match = result_MN.tolist()
# Missing DIME names are replaced by the placeholder '######'.
strOptions = dime_MN['candidate_dime'].fillna('######').tolist()

# Best DIME name and match score for every HOUSE name, in the same order.
name_match, ratio_match = checker(str2Match, strOptions)
merged_MN = pd.DataFrame({
    'candidate_house': pd.Series(str2Match),
    'candidate_dime': pd.Series(name_match),
    'correct_ratio': pd.Series(ratio_match).astype(float),
})

merged_MN


In [None]:
# Wisconsin: unique normalised HOUSE candidate names ...
result_WI = result.loc[result['state_po'].isin(['WI']), 'candidate_house'].drop_duplicates()
# ... and the DIME rows of the same state.
dime_WI = dime.loc[dime['state'].isin(['WI'])]

str2Match = result_WI.tolist()
# Missing DIME names are replaced by the placeholder '######'.
strOptions = dime_WI['candidate_dime'].fillna('######').tolist()

# Best DIME name and match score for every HOUSE name, in the same order.
name_match, ratio_match = checker(str2Match, strOptions)
merged_WI = pd.DataFrame({
    'candidate_house': pd.Series(str2Match),
    'candidate_dime': pd.Series(name_match),
    'correct_ratio': pd.Series(ratio_match).astype(float),
})

merged_WI

In [None]:
# Pennsylvania: unique normalised HOUSE candidate names ...
result_PA = result.loc[result['state_po'].isin(['PA']), 'candidate_house'].drop_duplicates()
# ... and the DIME rows of the same state.
dime_PA = dime.loc[dime['state'].isin(['PA'])]

str2Match = result_PA.tolist()
# Missing DIME names are replaced by the placeholder '######'.
strOptions = dime_PA['candidate_dime'].fillna('######').tolist()

# Best DIME name and match score for every HOUSE name, in the same order.
name_match, ratio_match = checker(str2Match, strOptions)
merged_PA = pd.DataFrame({
    'candidate_house': pd.Series(str2Match),
    'candidate_dime': pd.Series(name_match),
    'correct_ratio': pd.Series(ratio_match).astype(float),
})

merged_PA

In [None]:
# Michigan: unique normalised HOUSE candidate names ...
result_MI = result.loc[result['state_po'].isin(['MI']), 'candidate_house'].drop_duplicates()
# ... and the DIME rows of the same state.
dime_MI = dime.loc[dime['state'].isin(['MI'])]

str2Match = result_MI.tolist()
# Missing DIME names are replaced by the placeholder '######'.
strOptions = dime_MI['candidate_dime'].fillna('######').tolist()

# Best DIME name and match score for every HOUSE name, in the same order.
name_match, ratio_match = checker(str2Match, strOptions)
merged_MI = pd.DataFrame({
    'candidate_house': pd.Series(str2Match),
    'candidate_dime': pd.Series(name_match),
    'correct_ratio': pd.Series(ratio_match).astype(float),
})

merged_MI

In [None]:
#concatenate results of fuzzy matching by state
# Every match row is tagged with the state it was computed for. The matching
# was done per state, so the same candidate_house string can appear in several
# states (possibly with different DIME matches).
state_matches = {'MI': merged_MI, 'MN': merged_MN, 'PA': merged_PA, 'WI': merged_WI}
merged = pd.concat(
    [matches.assign(state_po=state) for state, matches in state_matches.items()],
    sort=False,
    ignore_index=True,
)

#merge with results of fuzzy matching
# Joining on state AND name keeps one match per result row; joining on the
# name alone would duplicate result rows for names present in more than one
# state. validate= makes the merge fail loudly if that ever stops holding.
result = result.merge(
    merged,
    how='left',
    on=['state_po', 'candidate_house'],
    validate='many_to_one',
)
result

In [None]:
# Left-join the DIME columns (including the ideology score) onto the matched
# candidates, keyed on the matched DIME name.
result = pd.merge(result, dime, how='left', on='candidate_dime')
result

In [None]:
# Calculate the min correct ratio within each (district, county).
# transform() broadcasts the group minimum back onto every row and overwrites
# 'min' if it already exists, so this cell can be re-run; join() would raise
# on the duplicate column the second time.
result = result.assign(
    min=result.groupby(['state_po_district', 'county_fips'])['correct_ratio'].transform('min')
)

# Keep only counties where the min correct ratio is at least 75
result = result[result['min']>=75]


In [None]:
# Compute ideology score weighted by % votes.
# `result` is a filtered frame at this point; assign() builds a new frame
# instead of writing a column into that slice (SettingWithCopyWarning).
result = result.assign(wgt_cfscore=result['recipient_cfscore'] * result['pct_vote'])
result

In [None]:
# Vote-weighted ideology score per (district, county).
# min_count=1 leaves the score as NaN when no candidate in the group has a
# score; the default sum() would return 0.0 for such groups. Groups in which
# only some candidates lack a score are still summed over the available ones.
result_agg = (
    result
    .groupby(['state_po_district', 'county_fips'])['wgt_cfscore']
    .sum(min_count=1)
    .reset_index()
)
result_agg

In [None]:
# Ideology bands on the vote-weighted score (lower bound inclusive, upper exclusive):
#   score < -1            -> '-3'  Very Liberal
#   -1   <= score < -0.5  -> '-2'  Strong Liberal
#   -0.5 <= score < 0     -> '-1'  Lean Liberal
#   0    <= score < 0.5   -> '1'   Lean Conservative
#   0.5  <= score < 1     -> '2'   Strong Conservative
#   score >= 1            -> '3'   Very Conservative
# Rows whose score is NaN match no band and keep '' in both columns.
cf_score = result_agg['wgt_cfscore']
cf_bands = [
    (cf_score < -1, '-3', 'Very Liberal'),
    ((cf_score >= -1) & (cf_score < -0.5), '-2', 'Strong Liberal'),
    ((cf_score >= -0.5) & (cf_score < 0), '-1', 'Lean Liberal'),
    ((cf_score >= 0) & (cf_score < 0.5), '1', 'Lean Conservative'),
    ((cf_score >= 0.5) & (cf_score < 1), '2', 'Strong Conservative'),
    (cf_score >= 1, '3', 'Very Conservative'),
]

result_agg['cf_category'] = ''
result_agg['cf_label'] = ''
for in_band, band_code, band_label in cf_bands:
    result_agg.loc[in_band, 'cf_category'] = band_code
    result_agg.loc[in_band, 'cf_label'] = band_label

result_agg

In [None]:
# Write the county-level ideology table to disk without the index column.
result_agg.to_csv(path_or_buf='data/ideology_county.csv', index=False)

In [None]:
# Michigan VTD-level demographics.
census_mi_vtd_csv = 'data/MI_VTD_demographics.csv'
# VTD_ID is not covered by the HOUSE dtype mapping, so it is forced to str
# here: the county FIPS is later taken from it with the .str accessor, which
# fails if pandas parses an all-digit id column as integers.
mi_vtd = pd.read_csv(census_mi_vtd_csv, dtype = {**official_dtypes, 'VTD_ID': str})
mi_vtd

In [None]:
# Turn each group proportion into a head count by multiplying with total_pop.
# NOTE: the prop_* columns are overwritten in place, so running this cell a
# second time multiplies by total_pop again.
share_columns = ['prop_white', 'prop_Black', 'prop_Hispanic', 'prop_AIA', 'prop_Asian']
mi_vtd[share_columns] = mi_vtd[share_columns].mul(mi_vtd['total_pop'], axis=0)
mi_vtd

In [None]:
# County FIPS = first five characters of the VTD identifier.
mi_vtd['county_fips'] = mi_vtd['VTD_ID'].str.slice(0, 5)
mi_vtd

In [None]:
# Sum population and per-group counts from VTD level up to county level.
# Maps each output column to the VTD column that is summed into it.
county_sums = {
    'total_pop': 'total_pop',
    'total_White': 'prop_white',
    'total_Black': 'prop_Black',
    'total_Hispanic': 'prop_Hispanic',
    'total_AIA': 'prop_AIA',
    'total_Asian': 'prop_Asian',
}
mi_county = (
    mi_vtd
    .groupby(['county_fips'])
    .agg(**{target: (source, 'sum') for target, source in county_sums.items()})
    .reset_index()
)
mi_county

In [None]:
# County-level share of each group: group total divided by total population.
for group_name in ['White', 'Black', 'Hispanic', 'AIA', 'Asian']:
    mi_county['prop_' + group_name] = mi_county['total_' + group_name].div(mi_county['total_pop'])
mi_county

In [None]:
# Left-join the ideology table onto the Michigan county demographics.
mi_result = pd.merge(mi_county, result_agg, how='left', on='county_fips')
mi_result

In [None]:
# Number of rows per ideology label.
mi_result.loc[:, 'cf_label'].value_counts()

In [None]:
# Keep only the columns that go into the exported table, in this order.
export_columns = [
    'county_fips',
    'total_pop',
    'prop_White',
    'prop_Black',
    'prop_Hispanic',
    'prop_AIA',
    'prop_Asian',
    'wgt_cfscore',
    'cf_category',
    'cf_label',
]
mi_result = mi_result[export_columns]
mi_result

In [None]:
# Drop rows with any missing value. The result is reassigned rather than
# using inplace=True, because mi_result is a column subset of another frame
# and in-place modification of it triggers SettingWithCopyWarning.
mi_result = mi_result.dropna()
mi_result

In [None]:
# Write the matched Michigan county table to disk without the index column.
mi_result.to_csv(path_or_buf='data/mi_matched_county.csv', index=False)