In [1]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [2]:
house_precinct_csv = 'data/raw/HOUSE_precinct_general.csv'
dime_csv = 'data/raw/dime.csv'

In [3]:
#HOUSE official dtypes from documentation
official_dtypes = {'precinct':str,'office':str, 'party_detailed':str, 'party_simplified':str,
'mode':str,'votes':int, 'county_name':str, 'county_fips':str, 'jurisdiction_name':str,
'jurisdiction_fips':str, 'candidate':str, 'district':str, 'dataverse':str,'year':int,
'stage':str, 'state':str, 'special':str, 'writein':str, 'state_po':str, 'state_fips':str,
'state_cen':str, 'state_ic':str, 'date':str, 'readme_check':str,'magnitude':int}

house =  pd.read_csv(house_precinct_csv, dtype = official_dtypes)

In [4]:
#Filter states
states = ['MI', 'MN', 'PA', 'WI']
house = house[house['state_po'].isin(states)]

In [5]:
#Filter office to US House only
house = house[house['office'].isin(['US HOUSE'])]

In [6]:
#Filter precincts with zero votes
house = house[house['votes']!=0]


In [7]:
#Concatenate state and district to match district column in DIME
house['state_po_district'] = house['state_po'] + house['district'].str[1:]

In [8]:
#Totals by precinct
total_by_prec_cand = house.groupby(['state_po', 'state_po_district', 'precinct','candidate'])['votes'].sum().reset_index()
total_by_prec = total_by_prec_cand.groupby(['state_po', 'state_po_district', 'precinct'])['votes'].sum().reset_index()
total_by_prec = total_by_prec.rename(columns = {'votes': 'total_votes'})

In [None]:
#Totals by county
total_by_county_cand = house.groupby(['state_po', 'state_po_district', 'county_fips','candidate'])['votes'].sum().reset_index()
total_by_county = total_by_county_cand.groupby(['state_po', 'state_po_district', 'county_fips'])['votes'].sum().reset_index()
total_by_county = total_by_county.rename(columns = {'votes': 'total_votes'})

In [9]:
result = pd.merge(total_by_county_cand, total_by_county, on = ['state_po', 'state_po_district', 'county_fips'])
result['pct_vote'] = result['votes']/result['total_votes']
result['candidate_house'] = result['candidate'].str.lower()
result['candidate_house'] = result['candidate_house'].str.replace('.', '')
result['candidate_house'] = result['candidate_house'].str.strip()

  result['candidate_house'] = result['candidate_house'].str.replace('.', '')


In [10]:
#DIME
dime =  pd.read_csv(dime_csv)
dime.sort_values(by=['district', 'name'])
dime = dime.drop(columns=['cycle', 'seat', 'ran_general', 'winner', 'recipient_type'])


In [11]:
# new data frame with split name
split = dime['name'].str.split(",", n = 1, expand = True)

In [12]:
#Create new candidate name to match with HOUSE df
dime['candidate_dime'] = dime['ffname'] + " " + split[0]
dime['candidate_dime'] = dime['candidate_dime'].str.replace('.', '')
dime['candidate_dime'] = dime['candidate_dime'].str.replace('mr', '')
dime['candidate_dime'] = dime['candidate_dime'].str.strip()
dime

  dime['candidate_dime'] = dime['candidate_dime'].str.replace('.', '')


Unnamed: 0.1,Unnamed: 0,Cand_ID,bonica_rid,name,ffname,party,state,district,recipient_cfscore,recipient_cfscore_dyn,district_partisanship,candidate_dime
0,58722,H8WI05157,cand140787,"garcia, ramon hyron mr",ramon hyron,100,WI,WI05,-1.589,-1.416,0.944,ramon hyron garcia
1,58777,H8PA15161,cand140710,"leiner, william jr.",william .,100,PA,PA15,-1.737,-1.540,-0.082,william leiner
2,58778,H8PA12085,cand140709,"tarasi, elizabeth m",elizabeth m,100,PA,PA17,-1.257,-1.023,0.473,elizabeth m tarasi
3,58779,H8PA12077,cand140708,"anthony, aaron",aaron,100,PA,PA12,-1.815,-1.662,0.128,aaron anthony
4,58780,H8PA12069,cand140707,"prigg, thomas lee",thomas lee,100,PA,PA14,-1.694,-1.570,-1.374,thomas lee prigg
...,...,...,...,...,...,...,...,...,...,...,...,...
273,64756,H8MN05247,cand146300,"torres ray, patricia",patricia,100,MN,MN05,-1.463,-1.294,-1.625,patricia torres ray
274,64757,H8MN05239,cand146299,"omar, ilhan",ilhan,100,MN,MN05,-1.515,-1.364,-1.625,ilhan omar
275,64758,H8MN05221,cand146298,"anderson kelliher, margaret",margaret,100,MN,MN05,-0.867,-0.740,-1.625,margaret anderson kelliher
276,64759,H8MI12120,cand146295,"niemuth, niles",niles,328,MI,MI12,-1.906,-1.804,-0.827,niles niemuth


In [13]:
def checker(wrong_options,correct_options):
    names_array=[]
    ratio_array=[]    
    for wrong_option in wrong_options:
        if wrong_option in correct_options:
           names_array.append(wrong_option)
           ratio_array.append('100')
        else:   
            x=process.extractOne(wrong_option,correct_options,scorer=fuzz.token_set_ratio)
            names_array.append(x[0])
            ratio_array.append(x[1])
    return names_array,ratio_array

In [14]:
result_MN = result[result['state_po'].isin(['MN'])]
result_MN = result_MN['candidate_house'].drop_duplicates()
dime_MN = dime[dime['state'].isin(['MN'])]

str2Match = result_MN.tolist()
strOptions = dime_MN['candidate_dime'].fillna('######').tolist()

name_match,ratio_match=checker(str2Match,strOptions)
merged_MN= pd.DataFrame()
merged_MN['candidate_house']=pd.Series(str2Match)
merged_MN['candidate_dime']=pd.Series(name_match)
merged_MN['correct_ratio']=pd.Series(ratio_match).astype(float)

merged_MN


Unnamed: 0,candidate_house,candidate_dime,correct_ratio
0,dan feehan,daniel feehan,87.0
1,jim hagedorn,james hagedorn,85.0
2,writein,erik paulsen,42.0
3,angie craig,angela dawn craig,71.0
4,jason lewis,jason mark lewis,100.0
5,dean phillips,dean phillips,100.0
6,erik paulsen,erik paulsen,100.0
7,betty mccollum,betty mccollum,100.0
8,greg ryan,gregory ryan,86.0
9,susan pendergast sindt,soren christian sorensen,43.0


In [15]:
result_WI = result[result['state_po'].isin(['WI'])]
result_WI = result_WI['candidate_house'].drop_duplicates()
dime_WI = dime[dime['state'].isin(['WI'])]

str2Match = result_WI.tolist()
strOptions = dime_WI['candidate_dime'].fillna('######').tolist()

name_match,ratio_match=checker(str2Match,strOptions)
merged_WI = pd.DataFrame()
merged_WI['candidate_house']=pd.Series(str2Match)
merged_WI['candidate_dime']=pd.Series(name_match)
merged_WI['correct_ratio']=pd.Series(ratio_match).astype(float)

merged_WI

Unnamed: 0,candidate_house,candidate_dime,correct_ratio
0,bryan steil,bryan george steil,100.0
1,ken yorgan,mark pocan,50.0
2,randy bryce,randy bryce,100.0
3,joseph kexel,paul nehlen,35.0
4,mark pocan,mark pocan,100.0
5,rick cruz,mark pocan,32.0
6,joey wayne reed,kyle frenette,43.0
7,bradley jason burt,bradley thomas dr boivin,62.0
8,ron kind,ronald james kind,67.0
9,steve toft,steve toft,100.0


In [16]:
result_PA = result[result['state_po'].isin(['PA'])]
result_PA = result_PA['candidate_house'].drop_duplicates()
dime_PA = dime[dime['state'].isin(['PA'])]

str2Match = result_PA.tolist()
strOptions = dime_PA['candidate_dime'].fillna('######').tolist()

name_match,ratio_match=checker(str2Match,strOptions)
merged_PA = pd.DataFrame()
merged_PA['candidate_house'] = pd.Series(str2Match)
merged_PA['candidate_dime'] = pd.Series(name_match)
merged_PA['correct_ratio'] = pd.Series(ratio_match).astype(float)

merged_PA

Unnamed: 0,candidate_house,candidate_dime,correct_ratio
0,brian k fitzpatrick,brian fitzpatrick,100.0
1,henry scott wallace,henry scott wallace,100.0
2,brendan f boyle,brendan f boyle,100.0
3,david torres,david wertime,64.0
4,bryan e leib,bryan leib,100.0
5,dwight evans,dwight evans,100.0
6,daniel david,daniel e david,100.0
7,madeleine dean cunnane,madeleine dean,100.0
8,mary gay scanlon,mary gay scanlon,100.0
9,pearl kim,pearl kim,100.0


In [17]:
result_MI = result[result['state_po'].isin(['MI'])]
result_MI = result_MI['candidate_house'].drop_duplicates()
dime_MI = dime[dime['state'].isin(['MI'])]

str2Match = result_MI.tolist()
strOptions = dime_MI['candidate_dime'].fillna('######').tolist()

name_match,ratio_match=checker(str2Match,strOptions)
merged_MI = pd.DataFrame()
merged_MI['candidate_house'] = pd.Series(str2Match)
merged_MI['candidate_dime'] = pd.Series(name_match)
merged_MI['correct_ratio'] = pd.Series(ratio_match).astype(float)

merged_MI

Unnamed: 0,candidate_house,candidate_dime,correct_ratio
0,jack bergman,john bergman,75.0
1,matthew w morgan,matthew wade morgan,93.0
2,bill huizenga,william p the hon huizenga,76.0
3,rob davidson,robert michael davidson,80.0
4,ronald e graeser,paul colin clements,46.0
5,cathy albro,catherine albro,77.0
6,justin amash,justin amash,100.0
7,ted gerrard,leonard schwartz,44.0
8,joe farrington,weldon frederick wooden,49.0
9,jerry hilliard,ryan hill,61.0


In [18]:
#concatenate results of fuzzy matching by state
merged = pd.concat([merged_MI, merged_MN, merged_PA, merged_WI], sort=False)

In [19]:
#merge with results of fuzzy matching
result = result.merge(merged, how = 'left', on = 'candidate_house')
result

Unnamed: 0,state_po,state_po_district,precinct,candidate,votes,total_votes,pct_vote,candidate_house,candidate_dime,correct_ratio
0,MI,MI01,1,JACK BERGMAN,127276,218026,0.583765,jack bergman,john bergman,75.0
1,MI,MI01,1,MATTHEW W MORGAN,90750,218026,0.416235,matthew w morgan,matthew wade morgan,93.0
2,MI,MI01,"1, WARD 1",JACK BERGMAN,352,666,0.528529,jack bergman,john bergman,75.0
3,MI,MI01,"1, WARD 1",MATTHEW W MORGAN,314,666,0.471471,matthew w morgan,matthew wade morgan,93.0
4,MI,MI01,10,JACK BERGMAN,894,2107,0.424300,jack bergman,john bergman,75.0
...,...,...,...,...,...,...,...,...,...,...
42102,WI,WI08,VILLAGE OF WITTENBERG WARDS 1-2,MIKE GALLAGHER,412,670,0.614925,mike gallagher,michael john gallagher,78.0
42103,WI,WI08,VILLAGE OF WRIGHTSTOWN WARD 4,BEAU LIEGEOIS,32,145,0.220690,beau liegeois,beau liegeois,100.0
42104,WI,WI08,VILLAGE OF WRIGHTSTOWN WARD 4,MIKE GALLAGHER,113,145,0.779310,mike gallagher,michael john gallagher,78.0
42105,WI,WI08,VILLAGE OF WRIGHTSTOWN WARDS 1-3,BEAU LIEGEOIS,690,2358,0.292621,beau liegeois,beau liegeois,100.0


In [20]:
# merge with dime data to add ideaology
result = result.merge(dime, how = 'left', on = 'candidate_dime')
result

Unnamed: 0,state_po,state_po_district,precinct,candidate,votes,total_votes,pct_vote,candidate_house,candidate_dime,correct_ratio,...,Cand_ID,bonica_rid,name,ffname,party,state,district,recipient_cfscore,recipient_cfscore_dyn,district_partisanship
0,MI,MI01,1,JACK BERGMAN,127276,218026,0.583765,jack bergman,john bergman,75.0,...,H6MI01226,cand137055,"bergman, john",john,200,MI,MI01,1.086,1.044,0.233
1,MI,MI01,1,MATTHEW W MORGAN,90750,218026,0.416235,matthew w morgan,matthew wade morgan,93.0,...,H8MI01149,cand140638,"morgan, matthew wade",matthew wade,100,MI,MI01,-1.474,-1.318,0.233
2,MI,MI01,"1, WARD 1",JACK BERGMAN,352,666,0.528529,jack bergman,john bergman,75.0,...,H6MI01226,cand137055,"bergman, john",john,200,MI,MI01,1.086,1.044,0.233
3,MI,MI01,"1, WARD 1",MATTHEW W MORGAN,314,666,0.471471,matthew w morgan,matthew wade morgan,93.0,...,H8MI01149,cand140638,"morgan, matthew wade",matthew wade,100,MI,MI01,-1.474,-1.318,0.233
4,MI,MI01,10,JACK BERGMAN,894,2107,0.424300,jack bergman,john bergman,75.0,...,H6MI01226,cand137055,"bergman, john",john,200,MI,MI01,1.086,1.044,0.233
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42102,WI,WI08,VILLAGE OF WITTENBERG WARDS 1-2,MIKE GALLAGHER,412,670,0.614925,mike gallagher,michael john gallagher,78.0,...,H6WI08155,cand137709,"gallagher, michael john",michael john,200,WI,WI08,1.106,1.107,0.171
42103,WI,WI08,VILLAGE OF WRIGHTSTOWN WARD 4,BEAU LIEGEOIS,32,145,0.220690,beau liegeois,beau liegeois,100.0,...,H8WI08045,cand142828,"liegeois, beau",beau,100,WI,WI08,-1.494,-1.319,0.171
42104,WI,WI08,VILLAGE OF WRIGHTSTOWN WARD 4,MIKE GALLAGHER,113,145,0.779310,mike gallagher,michael john gallagher,78.0,...,H6WI08155,cand137709,"gallagher, michael john",michael john,200,WI,WI08,1.106,1.107,0.171
42105,WI,WI08,VILLAGE OF WRIGHTSTOWN WARDS 1-3,BEAU LIEGEOIS,690,2358,0.292621,beau liegeois,beau liegeois,100.0,...,H8WI08045,cand142828,"liegeois, beau",beau,100,WI,WI08,-1.494,-1.319,0.171


In [21]:
# Calculate the min correct ratio by county
result = result.join(result.groupby(['state_po_district', 'county_fips'])['correct_ratio'].agg(['min']), on = ['state_po_district', 'county_fips'])

# Keep only precincts where min correct ratio is at least 75
result = result[result['min']>=75]


In [22]:
# Compute ideology score weighted by % votes
result['wgt_cfscore'] = result['recipient_cfscore'] * result['pct_vote']
result

Unnamed: 0,state_po,state_po_district,precinct,candidate,votes,total_votes,pct_vote,candidate_house,candidate_dime,correct_ratio,...,name,ffname,party,state,district,recipient_cfscore,recipient_cfscore_dyn,district_partisanship,min,wgt_cfscore
0,MI,MI01,1,JACK BERGMAN,127276,218026,0.583765,jack bergman,john bergman,75.0,...,"bergman, john",john,200,MI,MI01,1.086,1.044,0.233,75.0,0.633969
1,MI,MI01,1,MATTHEW W MORGAN,90750,218026,0.416235,matthew w morgan,matthew wade morgan,93.0,...,"morgan, matthew wade",matthew wade,100,MI,MI01,-1.474,-1.318,0.233,75.0,-0.613530
2,MI,MI01,"1, WARD 1",JACK BERGMAN,352,666,0.528529,jack bergman,john bergman,75.0,...,"bergman, john",john,200,MI,MI01,1.086,1.044,0.233,75.0,0.573982
3,MI,MI01,"1, WARD 1",MATTHEW W MORGAN,314,666,0.471471,matthew w morgan,matthew wade morgan,93.0,...,"morgan, matthew wade",matthew wade,100,MI,MI01,-1.474,-1.318,0.233,75.0,-0.694949
4,MI,MI01,10,JACK BERGMAN,894,2107,0.424300,jack bergman,john bergman,75.0,...,"bergman, john",john,200,MI,MI01,1.086,1.044,0.233,75.0,0.460790
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42102,WI,WI08,VILLAGE OF WITTENBERG WARDS 1-2,MIKE GALLAGHER,412,670,0.614925,mike gallagher,michael john gallagher,78.0,...,"gallagher, michael john",michael john,200,WI,WI08,1.106,1.107,0.171,78.0,0.680107
42103,WI,WI08,VILLAGE OF WRIGHTSTOWN WARD 4,BEAU LIEGEOIS,32,145,0.220690,beau liegeois,beau liegeois,100.0,...,"liegeois, beau",beau,100,WI,WI08,-1.494,-1.319,0.171,78.0,-0.329710
42104,WI,WI08,VILLAGE OF WRIGHTSTOWN WARD 4,MIKE GALLAGHER,113,145,0.779310,mike gallagher,michael john gallagher,78.0,...,"gallagher, michael john",michael john,200,WI,WI08,1.106,1.107,0.171,78.0,0.861917
42105,WI,WI08,VILLAGE OF WRIGHTSTOWN WARDS 1-3,BEAU LIEGEOIS,690,2358,0.292621,beau liegeois,beau liegeois,100.0,...,"liegeois, beau",beau,100,WI,WI08,-1.494,-1.319,0.171,78.0,-0.437176


In [23]:
result_agg = result.groupby(['state_po_district', 'county_fips'])['wgt_cfscore'].sum().reset_index()
result_agg

Unnamed: 0,state_po_district,precinct,wgt_cfscore
0,MI01,1,0.020439
1,MI01,"1, WARD 1",-0.120967
2,MI01,10,-0.387792
3,MI01,11,0.109885
4,MI01,12,-0.152033
...,...,...,...
10338,WI08,VILLAGE OF WAUSAUKEE WARD 1,0.352591
10339,WI08,VILLAGE OF WINNECONNE WARDS 1-4,0.310655
10340,WI08,VILLAGE OF WITTENBERG WARDS 1-2,0.104806
10341,WI08,VILLAGE OF WRIGHTSTOWN WARD 4,0.532207


In [24]:
#Lean Liberal (0 through -0.5)
#Strong Liberal (-0.5 through -1)
#Very Liberal (-1 and lower)
#Lean Conservative (0 through 0.5)
#Strong Conservative (0.5 through 1)
#Very Conservative (1 and higher)
result_agg['cf_category'] = ''
result_agg.loc[(result_agg['wgt_cfscore'] < -1), 'cf_category'] = '-3'
result_agg.loc[(result_agg['wgt_cfscore'] >= -1) & (result_agg['wgt_cfscore'] <= -0.5), 'cf_category'] = '-2'
result_agg.loc[(result_agg['wgt_cfscore'] >= -0.5) & (result_agg['wgt_cfscore'] < 0), 'cf_category'] = '-1'
result_agg.loc[(result_agg['wgt_cfscore'] >= 0) & (result_agg['wgt_cfscore'] < 0.5), 'cf_category'] = '1'
result_agg.loc[(result_agg['wgt_cfscore'] >= 0.5) & (result_agg['wgt_cfscore'] < 1), 'cf_category'] = '2'
result_agg.loc[(result_agg['wgt_cfscore'] >= 1), 'cf_category'] = '3'

result_agg['cf_label'] = ''
result_agg.loc[(result_agg['cf_category'] == '-3'), 'cf_label'] = 'Very Liberal'
result_agg.loc[(result_agg['cf_category'] == '-2'), 'cf_label'] = 'Strong Liberal'
result_agg.loc[(result_agg['cf_category'] == '-1'), 'cf_label'] = 'Lean Liberal'
result_agg.loc[(result_agg['cf_category'] == '1'), 'cf_label'] = 'Lean Conservative'
result_agg.loc[(result_agg['cf_category'] == '2'), 'cf_label'] = 'Strong Conservative'
result_agg.loc[(result_agg['cf_category'] == '3'), 'cf_label'] = 'Very Conservative'

result_agg




Unnamed: 0,state_po_district,precinct,wgt_cfscore,cf_category,cf_label
0,MI01,1,0.020439,1,Lean Conservative
1,MI01,"1, WARD 1",-0.120967,-1,Lean Liberal
2,MI01,10,-0.387792,-1,Lean Liberal
3,MI01,11,0.109885,1,Lean Conservative
4,MI01,12,-0.152033,-1,Lean Liberal
...,...,...,...,...,...
10338,WI08,VILLAGE OF WAUSAUKEE WARD 1,0.352591,1,Lean Conservative
10339,WI08,VILLAGE OF WINNECONNE WARDS 1-4,0.310655,1,Lean Conservative
10340,WI08,VILLAGE OF WITTENBERG WARDS 1-2,0.104806,1,Lean Conservative
10341,WI08,VILLAGE OF WRIGHTSTOWN WARD 4,0.532207,2,Strong Conservative


In [25]:
#export to csv
result_agg.to_csv('data/ideology_county.csv', index = False)