In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [2]:
house_precinct_csv = 'data/raw/HOUSE_precinct_general.csv'
dime_csv = 'data/raw/dime.csv'

In [3]:
#HOUSE official dtypes from documentation
official_dtypes = {'precinct':str,'office':str, 'party_detailed':str, 'party_simplified':str,
'mode':str,'votes':int, 'county_name':str, 'county_fips':str, 'jurisdiction_name':str,
'jurisdiction_fips':str, 'candidate':str, 'district':str, 'dataverse':str,'year':int,
'stage':str, 'state':str, 'special':str, 'writein':str, 'state_po':str, 'state_fips':str,
'state_cen':str, 'state_ic':str, 'date':str, 'readme_check':str,'magnitude':int}

house =  pd.read_csv(house_precinct_csv, dtype = official_dtypes)

In [4]:
#Filter states
states = ['MI', 'MN', 'PA', 'WI']
house = house[house['state_po'].isin(states)]

In [5]:
#Filter office to US House only
house = house[house['office'].isin(['US HOUSE'])]

In [6]:
#Filter precincts with zero votes
house = house[house['votes']!=0]

In [7]:
#Filter Statistical Adjustments
house = house[house['jurisdiction_name']!="{STATISTICAL ADJUSTMENTS}"]

In [8]:
# Michigan also reports the 'absentee' vote (mode == 'ABSENTEE')
# Is unclear how to match the absentee ballot votes to a precinct
# For this reason, I am dropping them here.
# We should check how this might affect our analysis.
house = house[house['mode']!='ABSENTEE']

In [9]:
#Concatenate state and district to match district column in DIME
house['state_po_district'] = house['state_po'] + house['district'].str[1:]

In [10]:
#Each state reports precincts differently
#The following is only for MI
michigan = house['state'] == "MICHIGAN"
house[['precinct_code', 'ward_code']] = house.loc[michigan, 'precinct'].str.split(',', expand = True)
house['precinct_code'] = house.loc[michigan, 'precinct_code'].apply('{0:0>3}'.format)
house['ward_code'] = house.loc[michigan, 'ward_code'].str.replace('WARD','')
house['ward_code'] = house.loc[michigan, 'ward_code'].str.strip()
house['ward_code'] = house.loc[michigan, 'ward_code'].fillna('0')
house['ward_code'] = house.loc[michigan, 'ward_code'].apply('{0:0>2}'.format)

In [11]:
#Note: We still need to check the precincts that have code with letters like '0-22L', '1-A', '1-B' for MI
#Dropping them from now
house_mi = house.loc[michigan]
house_mi['tocheck'] = np.where(house_mi.precinct_code.str.contains('-'), '1', '0')
house_mi['precinct'][(house_mi['tocheck'] == '1')]
house_mi = house_mi[house_mi['tocheck']!='1']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  house_mi['tocheck'] = np.where(house_mi.precinct_code.str.contains('-'), '1', '0')


In [12]:
#Create NAME20 to match census data
#In census data NAME20 = COUNTYFIPS + MCDFIPS + WARD + PRECINCT
#In house data we have jurisdiction_fips = state_fips + county_fips + mcd_fips
#and we created ward_code and precinct_code
#so we have to remove state_fips from jurisdiction_fips and add ward_code and precinct_code
house_mi['NAME20'] = house_mi['jurisdiction_fips'].str[2:] + house_mi['ward_code'] + house_mi['precinct_code']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  house_mi['NAME20'] = house_mi['jurisdiction_fips'].str[2:] + house_mi['ward_code'] + house_mi['precinct_code']


In [13]:
#Edit candidate names based on inspection
house_mi['candidate']  = house_mi['candidate'] .str.replace('FRED UPTON','FREDERICK STEPHEN UPTON')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  house_mi['candidate']  = house_mi['candidate'] .str.replace('FRED UPTON','FREDERICK STEPHEN UPTON')


In [14]:
#Totals by precinct for MI
total_by_prec_cand_mi = house_mi.groupby(['state_po', 'state_po_district', 'NAME20','candidate'])['votes'].sum().reset_index()
total_by_prec_cand_mi


Unnamed: 0,state_po,state_po_district,NAME20,candidate,votes
0,MI,MI01,0010104000001,JACK BERGMAN,449
1,MI,MI01,0010104000001,MATTHEW W MORGAN,214
2,MI,MI01,0011246000001,JACK BERGMAN,395
3,MI,MI01,0011246000001,MATTHEW W MORGAN,181
4,MI,MI01,0011932000001,JACK BERGMAN,365
...,...,...,...,...,...
15315,MI,MI14,1638245300001,MARC S HERSCHFUS,681
15316,MI,MI14,1638245300001,PHILIP KOLODY,8
15317,MI,MI14,1638245300002,BRENDA LAWRENCE,221
15318,MI,MI14,1638245300002,MARC S HERSCHFUS,410


In [15]:
total_by_prec_mi = total_by_prec_cand_mi.groupby(['state_po', 'state_po_district', 'NAME20'])['votes'].sum().reset_index()
total_by_prec_mi = total_by_prec_mi.rename(columns = {'votes': 'total_votes'})
total_by_prec_mi

Unnamed: 0,state_po,state_po_district,NAME20,total_votes
0,MI,MI01,0010104000001,663
1,MI,MI01,0011246000001,576
2,MI,MI01,0011932000001,562
3,MI,MI01,0013482000001,675
4,MI,MI01,0013574000001,302
...,...,...,...,...
4747,MI,MI14,1633670000004,954
4748,MI,MI14,1633670000005,1376
4749,MI,MI14,1633670000006,1195
4750,MI,MI14,1638245300001,968


In [16]:
result = pd.merge(total_by_prec_cand_mi, total_by_prec_mi, on = ['state_po', 'state_po_district', 'NAME20'])
result['pct_vote'] = result['votes']/result['total_votes']
result['candidate_house'] = result['candidate'].str.lower()
result['candidate_house'] = result['candidate_house'].str.replace('.', '')
result['candidate_house'] = result['candidate_house'].str.strip()
result

  result['candidate_house'] = result['candidate_house'].str.replace('.', '')


Unnamed: 0,state_po,state_po_district,NAME20,candidate,votes,total_votes,pct_vote,candidate_house
0,MI,MI01,0010104000001,JACK BERGMAN,449,663,0.677225,jack bergman
1,MI,MI01,0010104000001,MATTHEW W MORGAN,214,663,0.322775,matthew w morgan
2,MI,MI01,0011246000001,JACK BERGMAN,395,576,0.685764,jack bergman
3,MI,MI01,0011246000001,MATTHEW W MORGAN,181,576,0.314236,matthew w morgan
4,MI,MI01,0011932000001,JACK BERGMAN,365,562,0.649466,jack bergman
...,...,...,...,...,...,...,...,...
15315,MI,MI14,1638245300001,MARC S HERSCHFUS,681,968,0.703512,marc s herschfus
15316,MI,MI14,1638245300001,PHILIP KOLODY,8,968,0.008264,philip kolody
15317,MI,MI14,1638245300002,BRENDA LAWRENCE,221,636,0.347484,brenda lawrence
15318,MI,MI14,1638245300002,MARC S HERSCHFUS,410,636,0.644654,marc s herschfus


In [17]:
#Commenting the lines where we computed the total by county for the second deliverable
#Totals by county
#total_by_county_cand = house.groupby(['state_po', 'state_po_district', 'county_fips','candidate'])['votes'].sum().reset_index()
#total_by_county = total_by_county_cand.groupby(['state_po', 'state_po_district', 'county_fips'])['votes'].sum().reset_index()
#total_by_county = total_by_county.rename(columns = {'votes': 'total_votes'})

In [18]:
#result = pd.merge(total_by_county_cand, total_by_county, on = ['state_po', 'state_po_district', 'county_fips'])
#result['pct_vote'] = result['votes']/result['total_votes']
#result['candidate_house'] = result['candidate'].str.lower()
#result['candidate_house'] = result['candidate_house'].str.replace('.', '')
#result['candidate_house'] = result['candidate_house'].str.strip()

In [19]:
#DIME
dime =  pd.read_csv(dime_csv)
dime.sort_values(by=['district', 'name'])
dime = dime.drop(columns=['cycle', 'seat', 'ran_general', 'winner', 'recipient_type'])

In [20]:
# new data frame with split name
split = dime['name'].str.split(",", n = 1, expand = True)

In [21]:
# Change party number codes to letter codes
dime.loc[dime['party'] == 100, 'party'] = 'D'
dime.loc[dime['party'] == 200, 'party'] = 'R'
dime.loc[dime['party'] == 328, 'party'] = 'I'

In [22]:
#Create new candidate name to match with HOUSE df, and new id column to match with census
dime['candidate_dime'] = dime['ffname'] + " " + split[0]
dime['candidate_dime'] = dime['candidate_dime'].str.replace('.', '')
dime['candidate_dime'] = dime['candidate_dime'].str.replace('mr', '')
dime['candidate_dime'] = dime['candidate_dime'].str.strip()

dime['census_match'] = dime['party'] + dime['name'].astype(str).str[:3] + dime['state']

  dime['candidate_dime'] = dime['candidate_dime'].str.replace('.', '')


In [23]:
def checker(wrong_options,correct_options):
    names_array=[]
    ratio_array=[]    
    for wrong_option in wrong_options:
        if wrong_option in correct_options:
           names_array.append(wrong_option)
           ratio_array.append('100')
        else:   
            x=process.extractOne(wrong_option,correct_options,scorer=fuzz.token_set_ratio)
            names_array.append(x[0])
            ratio_array.append(x[1])
    return names_array,ratio_array

In [24]:
result_MN = result[result['state_po'].isin(['MN'])]
result_MN = result_MN['candidate_house'].drop_duplicates()
dime_MN = dime[dime['state'].isin(['MN'])]

str2Match = result_MN.tolist()
strOptions = dime_MN['candidate_dime'].fillna('######').tolist()

name_match,ratio_match=checker(str2Match,strOptions)
merged_MN= pd.DataFrame()
merged_MN['candidate_house']=pd.Series(str2Match)
merged_MN['candidate_dime']=pd.Series(name_match)
merged_MN['correct_ratio']=pd.Series(ratio_match).astype(float)

merged_MN


  merged_MN['candidate_house']=pd.Series(str2Match)
  merged_MN['candidate_dime']=pd.Series(name_match)
  merged_MN['correct_ratio']=pd.Series(ratio_match).astype(float)


Unnamed: 0,candidate_house,candidate_dime,correct_ratio


In [25]:
result_WI = result[result['state_po'].isin(['WI'])]
result_WI = result_WI['candidate_house'].drop_duplicates()
dime_WI = dime[dime['state'].isin(['WI'])]

str2Match = result_WI.tolist()
strOptions = dime_WI['candidate_dime'].fillna('######').tolist()

name_match,ratio_match=checker(str2Match,strOptions)
merged_WI = pd.DataFrame()
merged_WI['candidate_house']=pd.Series(str2Match)
merged_WI['candidate_dime']=pd.Series(name_match)
merged_WI['correct_ratio']=pd.Series(ratio_match).astype(float)

merged_WI

  merged_WI['candidate_house']=pd.Series(str2Match)
  merged_WI['candidate_dime']=pd.Series(name_match)
  merged_WI['correct_ratio']=pd.Series(ratio_match).astype(float)


Unnamed: 0,candidate_house,candidate_dime,correct_ratio


In [26]:
result_PA = result[result['state_po'].isin(['PA'])]
result_PA = result_PA['candidate_house'].drop_duplicates()
dime_PA = dime[dime['state'].isin(['PA'])]

str2Match = result_PA.tolist()
strOptions = dime_PA['candidate_dime'].fillna('######').tolist()

name_match,ratio_match=checker(str2Match,strOptions)
merged_PA = pd.DataFrame()
merged_PA['candidate_house'] = pd.Series(str2Match)
merged_PA['candidate_dime'] = pd.Series(name_match)
merged_PA['correct_ratio'] = pd.Series(ratio_match).astype(float)

merged_PA

  merged_PA['candidate_house'] = pd.Series(str2Match)
  merged_PA['candidate_dime'] = pd.Series(name_match)
  merged_PA['correct_ratio'] = pd.Series(ratio_match).astype(float)


Unnamed: 0,candidate_house,candidate_dime,correct_ratio


In [27]:
result_MI = result[result['state_po'].isin(['MI'])]
result_MI = result_MI['candidate_house'].drop_duplicates()
dime_MI = dime[dime['state'].isin(['MI'])]

str2Match = result_MI.tolist()
strOptions = dime_MI['candidate_dime'].fillna('######').tolist()

name_match,ratio_match=checker(str2Match,strOptions)
merged_MI = pd.DataFrame()
merged_MI['candidate_house'] = pd.Series(str2Match)
merged_MI['candidate_dime'] = pd.Series(name_match)
merged_MI['correct_ratio'] = pd.Series(ratio_match).astype(float)

merged_MI.sort_values(by=['correct_ratio'])

Unnamed: 0,candidate_house,candidate_dime,correct_ratio
41,marc joseph sosnowski,jeremy michael peruski,42.0
40,d etta wilcoxon,weldon frederick wooden,42.0
46,philip kolody,william r wild,44.0
7,ted gerrard,leonard schwartz,44.0
45,marc s herschfus,candius stearns,45.0
4,ronald e graeser,paul colin clements,46.0
12,kathy goodwin,matt longjohn,46.0
23,andrea kirby,rashida tlaib,48.0
8,joe farrington,weldon frederick wooden,49.0
36,gary walkowicz,a rocky raczkowski,50.0


In [28]:
#concatenate results of fuzzy matching by state
#merged = pd.concat([merged_MI, merged_MN, merged_PA, merged_WI], sort=False)

In [29]:
#merge with results of fuzzy matching
result = result.merge(merged_MI, how = 'left', on = 'candidate_house')
result

Unnamed: 0,state_po,state_po_district,NAME20,candidate,votes,total_votes,pct_vote,candidate_house,candidate_dime,correct_ratio
0,MI,MI01,0010104000001,JACK BERGMAN,449,663,0.677225,jack bergman,john bergman,75.0
1,MI,MI01,0010104000001,MATTHEW W MORGAN,214,663,0.322775,matthew w morgan,matthew wade morgan,93.0
2,MI,MI01,0011246000001,JACK BERGMAN,395,576,0.685764,jack bergman,john bergman,75.0
3,MI,MI01,0011246000001,MATTHEW W MORGAN,181,576,0.314236,matthew w morgan,matthew wade morgan,93.0
4,MI,MI01,0011932000001,JACK BERGMAN,365,562,0.649466,jack bergman,john bergman,75.0
...,...,...,...,...,...,...,...,...,...,...
15315,MI,MI14,1638245300001,MARC S HERSCHFUS,681,968,0.703512,marc s herschfus,candius stearns,45.0
15316,MI,MI14,1638245300001,PHILIP KOLODY,8,968,0.008264,philip kolody,william r wild,44.0
15317,MI,MI14,1638245300002,BRENDA LAWRENCE,221,636,0.347484,brenda lawrence,brenda lulenar lawrence,100.0
15318,MI,MI14,1638245300002,MARC S HERSCHFUS,410,636,0.644654,marc s herschfus,candius stearns,45.0


In [30]:
# merge with dime data to add ideology
result = result.merge(dime, how = 'left', on = 'candidate_dime')
result

Unnamed: 0,state_po,state_po_district,NAME20,candidate,votes,total_votes,pct_vote,candidate_house,candidate_dime,correct_ratio,...,bonica_rid,name,ffname,party,state,district,recipient_cfscore,recipient_cfscore_dyn,district_partisanship,census_match
0,MI,MI01,0010104000001,JACK BERGMAN,449,663,0.677225,jack bergman,john bergman,75.0,...,cand137055,"bergman, john",john,R,MI,MI01,1.086,1.044,0.233,RberMI
1,MI,MI01,0010104000001,MATTHEW W MORGAN,214,663,0.322775,matthew w morgan,matthew wade morgan,93.0,...,cand140638,"morgan, matthew wade",matthew wade,D,MI,MI01,-1.474,-1.318,0.233,DmorMI
2,MI,MI01,0011246000001,JACK BERGMAN,395,576,0.685764,jack bergman,john bergman,75.0,...,cand137055,"bergman, john",john,R,MI,MI01,1.086,1.044,0.233,RberMI
3,MI,MI01,0011246000001,MATTHEW W MORGAN,181,576,0.314236,matthew w morgan,matthew wade morgan,93.0,...,cand140638,"morgan, matthew wade",matthew wade,D,MI,MI01,-1.474,-1.318,0.233,DmorMI
4,MI,MI01,0011932000001,JACK BERGMAN,365,562,0.649466,jack bergman,john bergman,75.0,...,cand137055,"bergman, john",john,R,MI,MI01,1.086,1.044,0.233,RberMI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15315,MI,MI14,1638245300001,MARC S HERSCHFUS,681,968,0.703512,marc s herschfus,candius stearns,45.0,...,cand142531,"stearns, candius mrs.",candius .,R,MI,MI09,1.179,1.169,-0.021,RsteMI
15316,MI,MI14,1638245300001,PHILIP KOLODY,8,968,0.008264,philip kolody,william r wild,44.0,...,cand144684,"wild, william r.",william r.,D,MI,MI13,0.023,0.147,-2.429,DwilMI
15317,MI,MI14,1638245300002,BRENDA LAWRENCE,221,636,0.347484,brenda lawrence,brenda lulenar lawrence,100.0,...,cand42832,"lawrence, brenda lulenar",brenda lulenar,D,MI,MI14,-0.709,-0.684,-2.671,DlawMI
15318,MI,MI14,1638245300002,MARC S HERSCHFUS,410,636,0.644654,marc s herschfus,candius stearns,45.0,...,cand142531,"stearns, candius mrs.",candius .,R,MI,MI09,1.179,1.169,-0.021,RsteMI


In [31]:
# Calculate the min correct ratio by precinct
result = result.join(result.groupby(['state_po_district', 'NAME20'])['correct_ratio'].agg(['min']), on = ['state_po_district', 'NAME20'])

# Keep only precincts where min correct ratio is at least 70
result = result[result['min']>=70]
result


Unnamed: 0,state_po,state_po_district,NAME20,candidate,votes,total_votes,pct_vote,candidate_house,candidate_dime,correct_ratio,...,name,ffname,party,state,district,recipient_cfscore,recipient_cfscore_dyn,district_partisanship,census_match,min
0,MI,MI01,0010104000001,JACK BERGMAN,449,663,0.677225,jack bergman,john bergman,75.0,...,"bergman, john",john,R,MI,MI01,1.086,1.044,0.233,RberMI,75.0
1,MI,MI01,0010104000001,MATTHEW W MORGAN,214,663,0.322775,matthew w morgan,matthew wade morgan,93.0,...,"morgan, matthew wade",matthew wade,D,MI,MI01,-1.474,-1.318,0.233,DmorMI,75.0
2,MI,MI01,0011246000001,JACK BERGMAN,395,576,0.685764,jack bergman,john bergman,75.0,...,"bergman, john",john,R,MI,MI01,1.086,1.044,0.233,RberMI,75.0
3,MI,MI01,0011246000001,MATTHEW W MORGAN,181,576,0.314236,matthew w morgan,matthew wade morgan,93.0,...,"morgan, matthew wade",matthew wade,D,MI,MI01,-1.474,-1.318,0.233,DmorMI,75.0
4,MI,MI01,0011932000001,JACK BERGMAN,365,562,0.649466,jack bergman,john bergman,75.0,...,"bergman, john",john,R,MI,MI01,1.086,1.044,0.233,RberMI,75.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13395,MI,MI13,1632200000454,BRENDA JONES,2,4,0.500000,brenda jones,brenda jones,100.0,...,"jones, brenda",brenda,D,MI,MI13,-0.559,-0.376,-2.429,DjonMI,100.0
13396,MI,MI13,1632200000454,RASHIDA TLAIB,2,4,0.500000,rashida tlaib,rashida tlaib,100.0,...,"tlaib, rashida",rashida,D,MI,MI13,-1.378,-1.174,-2.429,DtlaMI,100.0
13411,MI,MI13,1632200000458,BRENDA JONES,5,10,0.500000,brenda jones,brenda jones,100.0,...,"jones, brenda",brenda,D,MI,MI13,-0.559,-0.376,-2.429,DjonMI,100.0
13412,MI,MI13,1632200000458,RASHIDA TLAIB,5,10,0.500000,rashida tlaib,rashida tlaib,100.0,...,"tlaib, rashida",rashida,D,MI,MI13,-1.378,-1.174,-2.429,DtlaMI,100.0


In [32]:
# Compute ideology score weighted by % votes
result['wgt_cfscore'] = result['recipient_cfscore'] * result['pct_vote']
result

Unnamed: 0,state_po,state_po_district,NAME20,candidate,votes,total_votes,pct_vote,candidate_house,candidate_dime,correct_ratio,...,ffname,party,state,district,recipient_cfscore,recipient_cfscore_dyn,district_partisanship,census_match,min,wgt_cfscore
0,MI,MI01,0010104000001,JACK BERGMAN,449,663,0.677225,jack bergman,john bergman,75.0,...,john,R,MI,MI01,1.086,1.044,0.233,RberMI,75.0,0.735466
1,MI,MI01,0010104000001,MATTHEW W MORGAN,214,663,0.322775,matthew w morgan,matthew wade morgan,93.0,...,matthew wade,D,MI,MI01,-1.474,-1.318,0.233,DmorMI,75.0,-0.475771
2,MI,MI01,0011246000001,JACK BERGMAN,395,576,0.685764,jack bergman,john bergman,75.0,...,john,R,MI,MI01,1.086,1.044,0.233,RberMI,75.0,0.744740
3,MI,MI01,0011246000001,MATTHEW W MORGAN,181,576,0.314236,matthew w morgan,matthew wade morgan,93.0,...,matthew wade,D,MI,MI01,-1.474,-1.318,0.233,DmorMI,75.0,-0.463184
4,MI,MI01,0011932000001,JACK BERGMAN,365,562,0.649466,jack bergman,john bergman,75.0,...,john,R,MI,MI01,1.086,1.044,0.233,RberMI,75.0,0.705320
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13395,MI,MI13,1632200000454,BRENDA JONES,2,4,0.500000,brenda jones,brenda jones,100.0,...,brenda,D,MI,MI13,-0.559,-0.376,-2.429,DjonMI,100.0,-0.279500
13396,MI,MI13,1632200000454,RASHIDA TLAIB,2,4,0.500000,rashida tlaib,rashida tlaib,100.0,...,rashida,D,MI,MI13,-1.378,-1.174,-2.429,DtlaMI,100.0,-0.689000
13411,MI,MI13,1632200000458,BRENDA JONES,5,10,0.500000,brenda jones,brenda jones,100.0,...,brenda,D,MI,MI13,-0.559,-0.376,-2.429,DjonMI,100.0,-0.279500
13412,MI,MI13,1632200000458,RASHIDA TLAIB,5,10,0.500000,rashida tlaib,rashida tlaib,100.0,...,rashida,D,MI,MI13,-1.378,-1.174,-2.429,DtlaMI,100.0,-0.689000


In [33]:
result['cf_category'] = ''
result.loc[(result['wgt_cfscore'] < -1), 'cf_category'] = '-3'
result.loc[(result['wgt_cfscore'] >= -1) & (result['wgt_cfscore'] <= -0.5), 'cf_category'] = '-2'
result.loc[(result['wgt_cfscore'] >= -0.5) & (result['wgt_cfscore'] < 0), 'cf_category'] = '-1'
result.loc[(result['wgt_cfscore'] >= 0) & (result['wgt_cfscore'] < 0.5), 'cf_category'] = '1'
result.loc[(result['wgt_cfscore'] >= 0.5) & (result['wgt_cfscore'] < 1), 'cf_category'] = '2'
result.loc[(result['wgt_cfscore'] >= 1), 'cf_category'] = '3'

result['cf_label'] = ''
result.loc[(result['cf_category'] == '-3'), 'cf_label'] = 'Very Liberal'
result.loc[(result['cf_category'] == '-2'), 'cf_label'] = 'Strong Liberal'
result.loc[(result['cf_category'] == '-1'), 'cf_label'] = 'Lean Liberal'
result.loc[(result['cf_category'] == '1'), 'cf_label'] = 'Lean Conservative'
result.loc[(result['cf_category'] == '2'), 'cf_label'] = 'Strong Conservative'
result.loc[(result['cf_category'] == '3'), 'cf_label'] = 'Very Conservative'
result

Unnamed: 0,state_po,state_po_district,NAME20,candidate,votes,total_votes,pct_vote,candidate_house,candidate_dime,correct_ratio,...,state,district,recipient_cfscore,recipient_cfscore_dyn,district_partisanship,census_match,min,wgt_cfscore,cf_category,cf_label
0,MI,MI01,0010104000001,JACK BERGMAN,449,663,0.677225,jack bergman,john bergman,75.0,...,MI,MI01,1.086,1.044,0.233,RberMI,75.0,0.735466,2,Strong Conservative
1,MI,MI01,0010104000001,MATTHEW W MORGAN,214,663,0.322775,matthew w morgan,matthew wade morgan,93.0,...,MI,MI01,-1.474,-1.318,0.233,DmorMI,75.0,-0.475771,-1,Lean Liberal
2,MI,MI01,0011246000001,JACK BERGMAN,395,576,0.685764,jack bergman,john bergman,75.0,...,MI,MI01,1.086,1.044,0.233,RberMI,75.0,0.744740,2,Strong Conservative
3,MI,MI01,0011246000001,MATTHEW W MORGAN,181,576,0.314236,matthew w morgan,matthew wade morgan,93.0,...,MI,MI01,-1.474,-1.318,0.233,DmorMI,75.0,-0.463184,-1,Lean Liberal
4,MI,MI01,0011932000001,JACK BERGMAN,365,562,0.649466,jack bergman,john bergman,75.0,...,MI,MI01,1.086,1.044,0.233,RberMI,75.0,0.705320,2,Strong Conservative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13395,MI,MI13,1632200000454,BRENDA JONES,2,4,0.500000,brenda jones,brenda jones,100.0,...,MI,MI13,-0.559,-0.376,-2.429,DjonMI,100.0,-0.279500,-1,Lean Liberal
13396,MI,MI13,1632200000454,RASHIDA TLAIB,2,4,0.500000,rashida tlaib,rashida tlaib,100.0,...,MI,MI13,-1.378,-1.174,-2.429,DtlaMI,100.0,-0.689000,-2,Strong Liberal
13411,MI,MI13,1632200000458,BRENDA JONES,5,10,0.500000,brenda jones,brenda jones,100.0,...,MI,MI13,-0.559,-0.376,-2.429,DjonMI,100.0,-0.279500,-1,Lean Liberal
13412,MI,MI13,1632200000458,RASHIDA TLAIB,5,10,0.500000,rashida tlaib,rashida tlaib,100.0,...,MI,MI13,-1.378,-1.174,-2.429,DtlaMI,100.0,-0.689000,-2,Strong Liberal


In [34]:
#Map Census to House data
census_house_mi_csv = 'data/mi_match_vtd_house.csv'
map_census_house_mi = pd.read_csv(census_house_mi_csv, dtype = {'GEOID20':str, 'NAME20':str, 'jurisdiction_fips':str})
map_census_house_mi = map_census_house_mi[['GEOID20', 'NAME20']]
map_census_house_mi['VTD_ID'] = map_census_house_mi['GEOID20']

In [35]:
mi_result = result.merge(map_census_house_mi, how = 'left', on = 'NAME20')

In [36]:
#Census data
census_mi_vtd_csv = 'data/MI_VTD_demographics.csv'
mi_vtd = pd.read_csv(census_mi_vtd_csv)
# Drop rows where VTD has ZZZ
remove = mi_vtd['VTD_ID'].str.contains('Z')
mi_vtd = mi_vtd[ ~remove]
mi_vtd

Unnamed: 0,VTD_ID,total_pop,pop_density,prop_white,prop_Black,prop_Hispanic,prop_AIA,prop_Asian
0,26001001001,1207.804879,0.000007,0.979628,0.001625,0.006426,0.003294,0.004178
1,26001001002,900.160254,0.000005,0.970183,0.000000,0.009807,0.002979,0.007299
2,26001001003,734.259490,0.000004,0.956542,0.000000,0.019545,0.018574,0.005339
3,26001001004,1153.336515,0.000017,0.957575,0.010796,0.005826,0.009611,0.010857
4,26001001005,728.053661,0.000008,0.965826,0.002006,0.013346,0.009739,0.002126
...,...,...,...,...,...,...,...,...
4800,26165165017,2220.556261,0.000024,0.928959,0.003546,0.017013,0.013569,0.030324
4801,26165165018,649.342851,0.000007,0.967061,0.003378,0.022804,0.008446,0.000000
4802,26165165019,524.447372,0.000006,0.959952,0.000709,0.011411,0.021145,0.005098
4803,26165165020,784.265861,0.000009,0.955280,0.002365,0.011091,0.019053,0.010148


In [37]:
mi_result = mi_result.merge(mi_vtd, how = 'left', on = 'VTD_ID')
mi_result

Unnamed: 0,state_po,state_po_district,NAME20,candidate,votes,total_votes,pct_vote,candidate_house,candidate_dime,correct_ratio,...,cf_label,GEOID20,VTD_ID,total_pop,pop_density,prop_white,prop_Black,prop_Hispanic,prop_AIA,prop_Asian
0,MI,MI01,0010104000001,JACK BERGMAN,449,663,0.677225,jack bergman,john bergman,75.0,...,Strong Conservative,26001001001,26001001001,1207.804879,0.000007,0.979628,0.001625,0.006426,0.003294,0.004178
1,MI,MI01,0010104000001,MATTHEW W MORGAN,214,663,0.322775,matthew w morgan,matthew wade morgan,93.0,...,Lean Liberal,26001001001,26001001001,1207.804879,0.000007,0.979628,0.001625,0.006426,0.003294,0.004178
2,MI,MI01,0011246000001,JACK BERGMAN,395,576,0.685764,jack bergman,john bergman,75.0,...,Strong Conservative,26001001002,26001001002,900.160254,0.000005,0.970183,0.000000,0.009807,0.002979,0.007299
3,MI,MI01,0011246000001,MATTHEW W MORGAN,181,576,0.314236,matthew w morgan,matthew wade morgan,93.0,...,Lean Liberal,26001001002,26001001002,900.160254,0.000005,0.970183,0.000000,0.009807,0.002979,0.007299
4,MI,MI01,0011932000001,JACK BERGMAN,365,562,0.649466,jack bergman,john bergman,75.0,...,Strong Conservative,26001001003,26001001003,734.259490,0.000004,0.956542,0.000000,0.019545,0.018574,0.005339
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2654,MI,MI13,1632200000454,BRENDA JONES,2,4,0.500000,brenda jones,brenda jones,100.0,...,Lean Liberal,,,,,,,,,
2655,MI,MI13,1632200000454,RASHIDA TLAIB,2,4,0.500000,rashida tlaib,rashida tlaib,100.0,...,Strong Liberal,,,,,,,,,
2656,MI,MI13,1632200000458,BRENDA JONES,5,10,0.500000,brenda jones,brenda jones,100.0,...,Lean Liberal,,,,,,,,,
2657,MI,MI13,1632200000458,RASHIDA TLAIB,5,10,0.500000,rashida tlaib,rashida tlaib,100.0,...,Strong Liberal,,,,,,,,,


In [38]:
mi_result.dropna(inplace=True)
mi_result = mi_result[['NAME20', 'VTD_ID','total_pop', 'pop_density','prop_white','prop_Black','prop_Hispanic','prop_AIA','prop_Asian', 'wgt_cfscore', 'pct_vote', 'cf_category','cf_label']]
mi_result

Unnamed: 0,NAME20,VTD_ID,total_pop,pop_density,prop_white,prop_Black,prop_Hispanic,prop_AIA,prop_Asian,wgt_cfscore,pct_vote,cf_category,cf_label
0,0010104000001,26001001001,1207.804879,0.000007,0.979628,0.001625,0.006426,0.003294,0.004178,0.735466,0.677225,2,Strong Conservative
1,0010104000001,26001001001,1207.804879,0.000007,0.979628,0.001625,0.006426,0.003294,0.004178,-0.475771,0.322775,-1,Lean Liberal
2,0011246000001,26001001002,900.160254,0.000005,0.970183,0.000000,0.009807,0.002979,0.007299,0.744740,0.685764,2,Strong Conservative
3,0011246000001,26001001002,900.160254,0.000005,0.970183,0.000000,0.009807,0.002979,0.007299,-0.463184,0.314236,-1,Lean Liberal
4,0011932000001,26001001003,734.259490,0.000004,0.956542,0.000000,0.019545,0.018574,0.005339,0.705320,0.649466,2,Strong Conservative
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2643,1636508000011,26163163819,2640.978283,0.000704,0.900795,0.000000,0.000000,0.000000,0.099205,0.016989,0.010665,1,Lean Conservative
2644,1636508000012,26163163820,2020.426589,0.000945,0.882879,0.017812,0.049494,0.006434,0.043381,-0.004318,0.008178,-1,Lean Liberal
2645,1636508000012,26163163820,2020.426589,0.000945,0.882879,0.017812,0.049494,0.006434,0.043381,-0.670097,0.486989,-2,Strong Liberal
2646,1636508000012,26163163820,2020.426589,0.000945,0.882879,0.017812,0.049494,0.006434,0.043381,0.501053,0.492193,2,Strong Conservative


In [39]:
#Export to csv
mi_result.to_csv('data/mi_matched_precinct.csv', index = False)

In [40]:
#result_agg = result.groupby(['state_po_district', 'county_fips'])['wgt_cfscore'].sum().reset_index()
#result_agg

In [41]:
#Lean Liberal (0 through -0.5)
#Strong Liberal (-0.5 through -1)
#Very Liberal (-1 and lower)
#Lean Conservative (0 through 0.5)
#Strong Conservative (0.5 through 1)
#Very Conservative (1 and higher)
# result_agg['cf_category'] = ''
# result_agg.loc[(result_agg['wgt_cfscore'] < -1), 'cf_category'] = '-3'
# result_agg.loc[(result_agg['wgt_cfscore'] >= -1) & (result_agg['wgt_cfscore'] <= -0.5), 'cf_category'] = '-2'
# result_agg.loc[(result_agg['wgt_cfscore'] >= -0.5) & (result_agg['wgt_cfscore'] < 0), 'cf_category'] = '-1'
# result_agg.loc[(result_agg['wgt_cfscore'] >= 0) & (result_agg['wgt_cfscore'] < 0.5), 'cf_category'] = '1'
# result_agg.loc[(result_agg['wgt_cfscore'] >= 0.5) & (result_agg['wgt_cfscore'] < 1), 'cf_category'] = '2'
# result_agg.loc[(result_agg['wgt_cfscore'] >= 1), 'cf_category'] = '3'

# result_agg['cf_label'] = ''
# result_agg.loc[(result_agg['cf_category'] == '-3'), 'cf_label'] = 'Very Liberal'
# result_agg.loc[(result_agg['cf_category'] == '-2'), 'cf_label'] = 'Strong Liberal'
# result_agg.loc[(result_agg['cf_category'] == '-1'), 'cf_label'] = 'Lean Liberal'
# result_agg.loc[(result_agg['cf_category'] == '1'), 'cf_label'] = 'Lean Conservative'
# result_agg.loc[(result_agg['cf_category'] == '2'), 'cf_label'] = 'Strong Conservative'
# result_agg.loc[(result_agg['cf_category'] == '3'), 'cf_label'] = 'Very Conservative'

# result_agg

In [42]:
#export to csv
#result_agg.to_csv('data/ideology_county.csv', index = False)

In [43]:
#mi_result.dropna(inplace=True)
#export to csv
#mi_result.to_csv('data/mi_matched_county.csv', index = False)

In [44]:
# for x in ['prop_white', 'prop_Black', 'prop_Hispanic', 'prop_AIA', 'prop_Asian']:
#     mi_vtd[x] = mi_vtd['total_pop'] * mi_vtd[x]
# mi_vtd

In [45]:
# mi_vtd['county_fips'] = mi_vtd['VTD_ID'].str[:5]
# mi_vtd

In [46]:
# mi_county = mi_vtd.groupby(['county_fips']).agg(
#      total_pop = ('total_pop','sum'),
#      total_White = ('prop_white','sum'),
#      total_Black = ('prop_Black','sum'),
#      total_Hispanic = ('prop_Hispanic','sum'),
#      total_AIA = ('prop_AIA','sum'),
#      total_Asian = ('prop_Asian','sum'),
#      ).reset_index()
# mi_county

In [47]:
# mi_county['prop_White'] = mi_county['total_White'] / mi_county['total_pop']
# mi_county['prop_Black'] = mi_county['total_Black'] / mi_county['total_pop']
# mi_county['prop_Hispanic'] = mi_county['total_Hispanic'] / mi_county['total_pop']
# mi_county['prop_AIA'] = mi_county['total_AIA'] / mi_county['total_pop']
# mi_county['prop_Asian'] = mi_county['total_Asian'] / mi_county['total_pop']
# mi_county

In [48]:
# mi_result = mi_county.merge(result_agg, how = 'left', on = 'county_fips')
# mi_result

In [49]:
#mi_result = mi_result[['county_fips','total_pop','prop_White','prop_Black','prop_Hispanic','prop_AIA','prop_Asian', 'wgt_cfscore','cf_category','cf_label']]
