# Imports

In [1]:
import pandas as pd
import numpy as np
import regex as re

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

### District Makeup Data

In [2]:
#importing data
partisan_lean_districts = pd.read_csv('./data/partisan-lean/partisan_lean_DISTRICTS.csv')
elasticity_districts = pd.read_csv('./data/elasticity-scores/elasticity-by-district.csv')

In [3]:
#print shapes
print(partisan_lean_districts.shape)
print(elasticity_districts.shape)

(435, 2)
(435, 2)


In [4]:
#merge dataframes on district
district_makeup = pd.merge(elasticity_districts,partisan_lean_districts,how='inner', on= 'district').sort_values(by='district')

In [5]:
#resetting index
district_makeup.reset_index(inplace=True,drop=True)

In [6]:
#new combined df
print(district_makeup.shape)
district_makeup.head()

(435, 3)


Unnamed: 0,district,elasticity,pvi_538
0,AK-1,1.16,R+15.21
1,AL-1,0.9,R+28.61
2,AL-2,0.97,R+30.97
3,AL-3,0.79,R+31.37
4,AL-4,0.86,R+59.81


In [7]:
#regex to add 0's to make all the district names uniform for later contatenation
district_makeup.district = district_makeup.district.str.replace('-(\d{1})$',r'-0\1')

In [8]:
#changing pvi to negative and positive range and then extracting just number
district_makeup.pvi_538 = district_makeup.pvi_538.str.replace('(D)[+]','D-')

#getting rid of party letter
district_makeup.pvi_538 = district_makeup.pvi_538.str.replace('\w([-+].+)',r'\1')

#removing + signs
district_makeup.pvi_538 = district_makeup.pvi_538.str.replace('[+]',' ')

In [9]:
district_makeup.head()

Unnamed: 0,district,elasticity,pvi_538
0,AK-01,1.16,15.21
1,AL-01,0.9,28.61
2,AL-02,0.97,30.97
3,AL-03,0.79,31.37
4,AL-04,0.86,59.81


## Trump Scores

In [10]:
#importing csvs
trump_averages = pd.read_csv('./data/trump-score/trump_averages.csv')
trump_predictions = pd.read_csv('./data/trump-score/trump_vote_predictions.csv')

In [11]:
#print shapes
print(trump_averages.shape)
print(trump_predictions.shape)

(1729, 11)
(55369, 15)


In [12]:
#preview df
trump_averages.head()

Unnamed: 0,congress,chamber,bioguide,last_name,state,district,party,votes,agree_pct,predicted_agree,net_trump_vote
0,0,house,A000055,Aderholt,AL,4.0,Republican,107,0.971963,0.950011,63.0
1,115,house,A000055,Aderholt,AL,4.0,R,95,0.968421,0.946349,63.0
2,116,house,A000055,Aderholt,AL,4.0,R,12,1.0,0.979001,63.0
3,0,house,A000367,Amash,MI,3.0,Republican,108,0.592593,0.814675,9.4
4,115,house,A000367,Amash,MI,3.0,R,96,0.541667,0.847386,9.4


In [13]:
#preview df
trump_averages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1729 entries, 0 to 1728
Data columns (total 11 columns):
congress           1729 non-null int64
chamber            1729 non-null object
bioguide           1729 non-null object
last_name          1729 non-null object
state              1729 non-null object
district           1410 non-null float64
party              1729 non-null object
votes              1729 non-null int64
agree_pct          1729 non-null float64
predicted_agree    1729 non-null float64
net_trump_vote     1729 non-null float64
dtypes: float64(4), int64(2), object(5)
memory usage: 148.7+ KB


In [14]:
#splitting out the house
trump_avgs_house = trump_averages[trump_averages['chamber'] == 'house']

In [15]:
#trump house overview
trump_avgs_house.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1410 entries, 0 to 1409
Data columns (total 11 columns):
congress           1410 non-null int64
chamber            1410 non-null object
bioguide           1410 non-null object
last_name          1410 non-null object
state              1410 non-null object
district           1410 non-null float64
party              1410 non-null object
votes              1410 non-null int64
agree_pct          1410 non-null float64
predicted_agree    1410 non-null float64
net_trump_vote     1410 non-null float64
dtypes: float64(4), int64(2), object(5)
memory usage: 132.2+ KB


In [16]:
#convert district to int to remove decimal then to a string
trump_avgs_house.district = trump_avgs_house.district.astype(int)
trump_avgs_house.district = trump_avgs_house.district.astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [17]:
#make district column from state and district columns
trump_avgs_house['district'] = trump_avgs_house.state.astype(str) + '-' + trump_avgs_house.district

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [18]:
#standardizing all house districts
trump_avgs_house.district = trump_avgs_house.district.str.replace('-(\d{1})$',r'-0\1')

In [19]:
#fixing at-large districts

trump_avgs_house.district = trump_avgs_house.district.str.replace('00$','01')

In [20]:
#dropping state column because it's in the district name
trump_avgs_house.drop(columns='state',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [21]:
#splitting out the 115th congress
trump115 = trump_avgs_house[trump_avgs_house['congress'] == 115]

In [22]:
#splitting out the 116th congress
trump116 = trump_avgs_house[trump_avgs_house['congress'] == 116]

In [23]:
#dropping columns
trump115 = trump115.drop(columns=['chamber','bioguide'])

## Election Results

### 2018

In [24]:
#importing data
election2018 = pd.read_csv('./data/2018-house/2018_house_election_results.csv')

In [25]:
#preview df
election2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 896 entries, 0 to 895
Data columns (total 10 columns):
raceid         896 non-null object
keyrace        896 non-null bool
uncontested    896 non-null bool
first          896 non-null object
last           896 non-null object
gender         896 non-null object
party          896 non-null object
winner         896 non-null bool
votes          896 non-null int64
pct            896 non-null float64
dtypes: bool(3), float64(1), int64(1), object(5)
memory usage: 51.7+ KB


In [26]:
election2018.drop_duplicates(inplace=True)

In [27]:
#preview df
print(election2018.shape)
election2018.head()

(826, 10)


Unnamed: 0,raceid,keyrace,uncontested,first,last,gender,party,winner,votes,pct
0,ALH01,False,False,Bradley,Byrne,male,R,True,152308,63.3
1,ALH01,False,False,Robert,Kennedy,male,D,False,88365,36.7
2,ALH02,False,False,Martha,Roby,female,R,True,138582,61.5
3,ALH02,False,False,Tabitha,Isner,female,D,False,86581,38.5
4,ALH03,False,False,Mike,Rogers,male,R,True,147481,63.8


In [28]:
#rename columns
election2018.columns = ['district', 'keyrace', 'uncontested','first','last',
                        'gender','party','winner','votes','pct']

In [29]:
#reformatting district column
election2018.district = election2018.district.str.replace('([A-Z]{2})H',r'\1-')

In [30]:
#seeing how many unique districts are in the df
#missing 23 districts
election2018.district.nunique()

412

### 2016

In [31]:
#importing data
elections_1976_2016 = pd.read_csv('./data/election_results_76-16/1976-2016-house.csv')

In [32]:
#preview df
print(elections_1976_2016.shape)
elections_1976_2016.head()

(28272, 16)


Unnamed: 0,year,state,state_po,state_fips,state_cen,state_ic,office,district,stage,special,candidate,party,writein,candidatevotes,totalvotes,version
0,1990,Alaska,AK,2,94,81,US House,0,gen,False,John S. Devens,democrat,False,91677,191647,20171005
1,2016,Alaska,AK,2,94,81,US House,0,gen,False,Bernie Souphanavong,none,False,9093,308198,20171005
2,2006,Alaska,AK,2,94,81,US House,0,gen,False,Alexander Crawford,libertarian,False,4029,234645,20171005
3,1984,Alaska,AK,2,94,81,US House,0,gen,False,Donald E. Young,republican,False,113582,206437,20171005
4,2016,Alaska,AK,2,94,81,US House,0,gen,False,Don Young,republican,False,155088,308198,20171005


In [33]:
#splitting out 2016
election2016 = elections_1976_2016[(elections_1976_2016['year']==2016) & (elections_1976_2016['office']=='US House')]

In [34]:
#new vote percentage column
election2016['vote%'] = election2016['candidatevotes'] / election2016['totalvotes']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [35]:
#preview df
print(election2016.shape)
election2016.head()

(1441, 17)


Unnamed: 0,year,state,state_po,state_fips,state_cen,state_ic,office,district,stage,special,candidate,party,writein,candidatevotes,totalvotes,version,vote%
1,2016,Alaska,AK,2,94,81,US House,0,gen,False,Bernie Souphanavong,none,False,9093,308198,20171005,0.029504
4,2016,Alaska,AK,2,94,81,US House,0,gen,False,Don Young,republican,False,155088,308198,20171005,0.503209
33,2016,Alaska,AK,2,94,81,US House,0,gen,False,,,True,1228,308198,20171005,0.003984
63,2016,Alaska,AK,2,94,81,US House,0,gen,False,Steve Lindbeck,democrat,False,111019,308198,20171005,0.36022
71,2016,Alaska,AK,2,94,81,US House,0,gen,False,Jim C. McDermott,libertarian,False,31770,308198,20171005,0.103083


In [36]:
#changing 0 districts to 1
election2016.district = election2016.district.replace(to_replace=0,value=1)

In [37]:
#changing it to a str
election2016.district = election2016.district.astype(str)

In [38]:
#formatting the district
election2016.district = election2016.state_po + '-' + election2016.district

In [39]:
#adding a zero before the single digit districts 
election2016.district = election2016.district.str.replace('-(\d{1})$',r'-0\1')

## Fixing 2018 districts

In [40]:
#list of all the districts that are in the 2016 dataset
election2016_districts = election2016.district.sort_values().unique().tolist()

In [41]:
#list of all the districts that are in the 2018 dataset
election2018_districts = election2018.district.sort_values().unique().tolist()

In [42]:
#districts not in the 2018 dataset
different_districts = np.setdiff1d(election2016_districts,election2018_districts).tolist()
print(len(different_districts))
different_districts

23


['IA-01',
 'IA-02',
 'IA-03',
 'IA-04',
 'IN-01',
 'IN-02',
 'IN-03',
 'IN-04',
 'IN-05',
 'IN-06',
 'IN-07',
 'IN-08',
 'IN-09',
 'KS-01',
 'KS-02',
 'KS-03',
 'KS-04',
 'KY-01',
 'KY-02',
 'KY-03',
 'KY-04',
 'KY-05',
 'KY-06']

In [43]:
#missing 2018 districts entered by hand
missing_2018districts = pd.DataFrame(data = 
                    [['IA-01',False,False,'Abby','Finkenauer','female','D',True,170342,51.0],
                    ['IA-01',False,False,'Rod','Blum','male','R',False,153442,45.9],
                    ['IA-02',False,False,'Dave','Loebsack','male','D',True,171446,54.8],
                    ['IA-02',False,False,'Christopher','Peters','male','R',False,133287,42.6],
                    ['IA-03',False,False,'Cindy','Axne','female','D',True,175642,49.3],
                    ['IA-03',False,False,'David','Young','male','R',False,167933,47.2],
                    ['IA-04',False,False,'Steve','King','male','R',True,157676,50.4],
                    ['IA-04',False,False,'J.D.','Scholten','male','D',False,147246,47.0],
                    ['IN-01',False,False,'Peter','Visclosky','male','D',True,159611,65.1],
                    ['IN-01',False,False,'Mark','Leyva','male','R',False,85594,34.9],
                    ['IN-02',False,False,'Jackie','Walorski','female','R',True,125499,54.8],
                    ['IN-02',False,False,'Mel','Hall','male','D',False,103363,45.2],
                    ['IN-03',False,False,'Jim','Banks','male','R',True,158927,64.7],
                    ['IN-03',False,False,'Courtney','Tritch','female','D',False,86610,35.3],
                    ['IN-04',False,False,'Jim','Baird','male','R',True,156539,64.1],
                    ['IN-04',False,False,'Tobi','Beck','male','D',False,87824,35.9],
                    ['IN-05',False,False,'Susan','Brooks','female','R',True,180035,56.8],
                    ['IN-05',False,False,'Dee','Thornton','female','D',False,137142,43.2],
                    ['IN-06',False,False,'Greg','Pence','male','R',True,154260,63.8],
                    ['IN-06',False,False,'Jeannine','Lake','female','D',False,79430,32.9],
                    ['IN-07',False,False,'André','Carson','male','D',True,141139,64.9],
                    ['IN-07',False,False,'Wayne','Harmon','male','R',False,76457,35.1],
                    ['IN-08',False,False,'Larry','Bucshon','male','R',True,157396,64.4],
                    ['IN-08',False,False,'William','Tanoos','male','D',False,86895,35.6],
                    ['IN-09',False,False,'Trey','Hollingsworth','male','R',True,153271,56.5],
                    ['IN-09',False,False,'Liz','Watson','female','D',False,118090,43.5],
                    ['KS-01',False,False,'Roger','Marshall','male','R',True,153082,68.1],
                    ['KS-01',False,False,'Alan','LaPolice','male','D',False,71558,31.9],
                    ['KS-02',False,False,'Steve','Watkins','male','R',True,126098,47.6],
                    ['KS-02',False,False,'Paul','Davis','male','D',False,123859,46.8],
                    ['KS-03',False,False,'Sharice','Davids','female','D',True,170518,53.6],
                    ['KS-03',False,False,'Kevin','Yoder','male','R',False,139762,43.9],
                    ['KS-04',False,False,'Ron','Estes','male','R',True,143459,59.3],
                    ['KS-04',False,False,'James','Thompson','male','D',False,98304,40.7],
                    ['KY-01',False,False,'James','Comer','male','R',True,172167,68.6],
                    ['KY-01',False,False,'Paul','Walker','male','D',False,78849,31.4],
                    ['KY-02',False,False,'Brett','Guthrie','male','R',True,171700,66.7],
                    ['KY-02',False,False,'Hank','Linderman','male','D',False,79964,31.1],
                    ['KY-03',False,False,'John','Yarmouth','male','D',True,173002,62.1],
                    ['KY-03',False,False,'Vickie','Gibson','female','R',False,101930,36.6],
                    ['KY-04',False,False,'Thomas','Massie','male','R',True,162946,62.2],
                    ['KY-04',False,False,'Seth','Hall','male','D',False,90536,34.6],
                    ['KY-05',False,False,'Harold','Rogers','male','R',True,172093,78.9],
                    ['KY-05',False,False,'Kenneth','Stepp','male','D',False,45890,21.1],
                    ['KY-06',False,False,'Andy','Barr','male','R',True,154468,51.0],
                    ['KY-06',False,False,'Amy','McGrath','female','D',False,144736,47.8]]
                    ,columns = election2018.columns)

In [44]:
#length of missing districts which is double the missing districts because 2 candidates
len(missing_2018districts)

46

In [45]:
#merging missing 2018 districts with rest of 2018 df
election2018 = pd.concat([election2018,missing_2018districts],axis=0)

In [46]:
#number of unique districts in 2018 df

election2018.district.nunique()

435

## Filtering out Election Winners

In [47]:
election2018_winners = election2018[election2018.winner == True].drop_duplicates()

election2018_winners_districts = election2018_winners.district.sort_values().unique().tolist()

In [48]:
#the districts where there were no winners
no_winner_districts = np.setdiff1d(election2016_districts,election2018_winners_districts).tolist()
print(len(no_winner_districts))
no_winner_districts

3


['GA-07', 'NM-02', 'UT-04']

In [49]:
#too close to call on election night but Woodall won
election2018[election2018.district == 'GA-07']

Unnamed: 0,district,keyrace,uncontested,first,last,gender,party,winner,votes,pct
233,GA-07,True,False,Rob,Woodall,male,R,False,140430,50.1
234,GA-07,True,False,Carolyn,Bourdeaux,female,D,False,140011,49.9


In [50]:
#changing status to win
election2018.loc[election2018['last']=='Woodall', 'winner'] = True

In [51]:
#Torres Small won
election2018[election2018.district == 'NM-02']

Unnamed: 0,district,keyrace,uncontested,first,last,gender,party,winner,votes,pct
538,NM-02,True,False,Xochitl,Torres Small,female,D,False,101481,50.9
539,NM-02,True,False,Yvette,Herrell,female,R,False,97767,49.1


In [52]:
#changing status to win
election2018.loc[election2018['last']=='Torres Small', 'winner'] = True

In [53]:
#McAdams Won
election2018[election2018.district == 'UT-04']

Unnamed: 0,district,keyrace,uncontested,first,last,gender,party,winner,votes,pct
827,UT-04,True,False,Ben,McAdams,male,D,False,134964,50.1
828,UT-04,True,False,Mia,Love,female,R,False,134270,49.9


In [54]:
#changing status to win
election2018.loc[(election2018['first'] == 'Ben') & (election2018['last'] == 'McAdams'),'winner'] = True

In [55]:
election2018_winners = election2018[election2018.winner == True].drop_duplicates()

In [56]:
#finally have 435 district winners!
election2018_winners.district.nunique()

435

## Final cleaning up of dataframe

In [57]:
election2018_winners.head()

Unnamed: 0,district,keyrace,uncontested,first,last,gender,party,winner,votes,pct
0,AL-01,False,False,Bradley,Byrne,male,R,True,152308,63.3
2,AL-02,False,False,Martha,Roby,female,R,True,138582,61.5
4,AL-03,False,False,Mike,Rogers,male,R,True,147481,63.8
6,AL-04,False,False,Robert,Aderholt,male,R,True,183968,79.9
8,AL-05,False,False,Mo,Brooks,male,R,True,158373,61.1


In [58]:
#making margin column
election2018_winners['margin'] = election2018_winners['pct'] - (100-election2018_winners['pct'])

In [59]:
#changing democratic margin to negative and republican as positive
election2018_winners['margin'] = np.where(election2018_winners['party']=='D',abs(election2018_winners['margin'])*-1,election2018_winners['margin'])

In [60]:
#dropping keyrace and winner columns
election2018_winners.drop(columns=['keyrace','winner'],inplace=True)

In [61]:
#binarize gender

election2018_winners.gender = np.where(election2018_winners.gender == 'male', 1, 0 )

## Merge dataframes

In [62]:
#merging winners with trump agree percentage
pd.set_option('display.max_columns', 22)
final_df = pd.merge(election2018_winners, trump115,on='district')

In [63]:
final_df.head()

Unnamed: 0,district,uncontested,first,last,gender,party_x,votes_x,pct,margin,congress,last_name,party_y,votes_y,agree_pct,predicted_agree,net_trump_vote
0,AL-01,False,Bradley,Byrne,1,R,152308,63.3,26.6,115,Byrne,R,96,0.958333,0.94584,29.4
1,AL-02,False,Martha,Roby,0,R,138582,61.5,23.0,115,Roby,R,95,0.957895,0.947442,31.9
2,AL-03,False,Mike,Rogers,1,R,147481,63.8,27.6,115,Rogers,R,96,0.958333,0.948683,33.0
3,AL-04,False,Robert,Aderholt,1,R,183968,79.9,59.8,115,Aderholt,R,95,0.968421,0.946349,63.0
4,AL-05,False,Mo,Brooks,1,R,158373,61.1,22.2,115,Brooks,R,92,0.836957,0.947222,33.4


In [64]:
final_df.drop(columns=['predicted_agree','net_trump_vote'],inplace=True)

In [65]:
final_df.columns = ['district','uncontested','116first','116last','116gender','116party','2018votes','2018pct','2018margin','congress','115last','115party','115trump_votes_y','115agree_pct']

In [66]:
final_df = pd.merge(final_df, district_makeup, on='district')

In [67]:
final_df.shape

(447, 16)

In [68]:
final_df.to_csv('./data/final_df.csv')