# External data handling

This notebook is used to scrape data and tables online in order to build small datasets with relevant information for our analysis:
- US_states: a dataframe with all the abbreviation codes for each US state; the ISO codes (e.g. `US-AL`) are used by the plotting tool we use to create maps (origin: https://en.wikipedia.org/wiki/List_of_U.S._state_and_territory_abbreviations)
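- results_by_state_V2: a dataframe with the results of the 2000–2016 presidential elections by state, with the number of votes and the percentage of votes for the Republican and Democratic candidates (origin: the per-election Wikipedia pages, e.g. https://en.wikipedia.org/wiki/2000_United_States_presidential_election; related page: https://en.wikipedia.org/wiki/2020_United_States_presidential_election_by_state)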

In [2]:
import pandas as pd

## US_states dataframe

In [3]:
# Download the first table of the Wikipedia abbreviations page.
abbrev_raw = pd.read_html('https://en.wikipedia.org/wiki/List_of_U.S._state_and_territory_abbreviations')[0]

# Discard the first 8 rows; the first remaining row provides the column labels.
abbrev_body = abbrev_raw.iloc[8:, :]
abbrev_body.columns = abbrev_body.iloc[0]

# Skip 3 more rows and the last column, then renumber the rows from 0.
abbrev_body = abbrev_body.iloc[3:, :-1].reset_index(drop=True)

# Copy the two leading columns to the end under explicit labels ...
abbrev_body['Name'] = abbrev_body.iloc[:, 0]
abbrev_body['status'] = abbrev_body.iloc[:, 1]
# ... and remove the two original leading columns.
abbrev_codes = abbrev_body.iloc[:, 2:]

# Retain the rows whose status is 'State' or 'Federal district'; the status
# column itself is not kept in the result.
kept_statuses = ['State', 'Federal district']
is_kept = abbrev_codes['status'].isin(kept_statuses)
US_states = abbrev_codes.loc[is_kept].drop(columns='status')
US_states

8,ISO,ANSI,ANSI.1,USPS,USCG,GPO,AP,Name
0,US-AL,AL,1,AL,AL,Ala.,Ala.,Alabama
1,US-AK,AK,2,AK,AK,Alaska,Alaska,Alaska
2,US-AZ,AZ,4,AZ,AZ,Ariz.,Ariz.,Arizona
3,US-AR,AR,5,AR,AR,Ark.,Ark.,Arkansas
4,US-CA,CA,6,CA,CF,Calif.,Calif.,California
5,US-CO,CO,8,CO,CL,Colo.,Colo.,Colorado
6,US-CT,CT,9,CT,CT,Conn.,Conn.,Connecticut
7,US-DE,DE,10,DE,DL,Del.,Del.,Delaware
8,US-DC,DC,11,DC,DC,D.C.,D.C.,District of Columbia
9,US-FL,FL,12,FL,FL,Fla.,Fla.,Florida


In [4]:
# Cache the US_states table on disk: reuse the saved CSV when it exists,
# otherwise create it from the US_states dataframe currently in memory.
try:
    US_states = pd.read_csv('../Data/External_Data/US_states_V2.csv')
except FileNotFoundError:
    # Only a missing file means "not cached yet". Any other failure (e.g. an
    # unreadable or malformed CSV) is left to surface instead of being
    # silently swallowed and followed by an overwrite of the file.
    US_states.to_csv('../Data/External_Data/US_states_V2.csv', index=False)

## US_states_political_inclination dataframe

In [5]:
# Scrape the per-state results table of each presidential election page and
# bring every year to the same labels:
#   Republican, Republican%, Democratic, Democratic%, Total, State, year
# The positional indices used below depend on the layout of each page's table.

# --- 2000 ---
df_2000 = pd.read_html('https://en.wikipedia.org/wiki/2000_United_States_presidential_election', match='Alabama')[0]
df_2000 = df_2000.iloc[:, [2,3,5,6,28,29]]
df_2000.columns = ['Republican', 'Republican%', 'Democratic', 'Democratic%', 'Total', 'State']
# keep only the rows whose State value is at most 2 characters long
df_2000 = df_2000[df_2000['State'].str.len()<=2].reset_index(drop=True)
df_2000['year'] = 2000

# --- 2004 ---
df_2004 = pd.read_html('https://en.wikipedia.org/wiki/2004_United_States_presidential_election', match='Alabama')[0]
df_2004 = df_2004.iloc[:, [2,3,5,6,25,26]]
df_2004.columns = ['Republican', 'Republican%', 'Democratic', 'Democratic%', 'Total', 'State']
df_2004 = df_2004[df_2004['State'].str.len()<=2].reset_index(drop=True)
df_2004['year'] = 2004

# --- 2008 --- (from here on the selected columns are labelled Democratic first)
df_2008 = pd.read_html('https://en.wikipedia.org/wiki/2008_United_States_presidential_election', match='Alabama')[0]
df_2008 = df_2008.iloc[:, [2,3,5,6,25,26]]
df_2008.columns = ['Democratic', 'Democratic%', 'Republican', 'Republican%', 'Total', 'State']
df_2008 = df_2008[df_2008['State'].str.len()<=2].reset_index(drop=True)
df_2008['year'] = 2008

# --- 2012 ---
df_2012 = pd.read_html('https://en.wikipedia.org/wiki/2012_United_States_presidential_election', match='Alabama')[0]
# explicit copy so the 'State' assignment below works on an independent frame
df_2012 = df_2012.iloc[1:, [1,2,4,5,18,19]].copy()
df_2012.columns = ['Democratic', 'Democratic%', 'Republican', 'Republican%', 'Total', 'State']
# strip the literal '–AL' suffix so those rows pass the length filter below
df_2012['State'] = df_2012['State'].str.replace('–AL', '', regex=False)
df_2012 = df_2012[df_2012['State'].str.len()<=2].reset_index(drop=True)
df_2012['year'] = 2012

# --- 2016 --- (second table matching 'Alaska'; State is taken from column 0)
df_2016 = pd.read_html('https://en.wikipedia.org/wiki/2016_United_States_presidential_election', match='Alaska')[1]
df_2016 = df_2016.iloc[:, [1,2,4,5,21,0]]
df_2016.columns = ['Democratic', 'Democratic%', 'Republican', 'Republican%', 'Total', 'State']
# rows removed by label; presumably the non-state rows of this table (district
# splits / totals) — TODO confirm against the page
df_2016 = df_2016.drop([20, 21, 30, 31, 32, 57]).reset_index(drop=True)
# NOTE(review): the State values are replaced by those of the 2000 frame,
# aligned on the row index — this assumes both frames list the same states in
# the same order; confirm.
df_2016['State'] = df_2000['State']
df_2016['year'] = 2016

# Stack all years in one pass. DataFrame.append was removed in pandas 2.0;
# pd.concat aligns the columns by name, so the different column orders of the
# yearly frames are handled.
results_by_state = pd.concat(
    [df_2000, df_2004, df_2008, df_2012, df_2016],
    ignore_index=True,
)

# save in csv file
results_by_state.to_csv('../Data/External_Data/results_by_state_V2.csv', index=False)
results_by_state

Unnamed: 0,Republican,Republican%,Democratic,Democratic%,Total,State,year
0,941173.0,56.48%,692611.0,41.57%,1666272.0,AL,2000
1,167398.0,58.62%,79004.0,27.67%,285560.0,AK,2000
2,781652.0,51.02%,685341.0,44.73%,1532016.0,AZ,2000
3,472940.0,51.31%,422768.0,45.86%,921781.0,AR,2000
4,4567429.0,41.65%,5861203.0,53.45%,10965856.0,CA,2000
...,...,...,...,...,...,...,...
255,1221747,36.83%,1742718,52.54%,3317019,WA,2016
256,489371,68.50%,188794,26.43%,714423,WV,2016
257,1405284,47.22%,1382536,46.45%,2976150,WI,2016
258,174419,68.17%,55973,21.88%,255849,WY,2016
