# External data handling

This notebook is used to scrape data and tables online in order to build small datasets with relevant information for our analysis:
- US_states : a dataframe with the corresponding ISO 3166-2 code for each US state; these codes are used by the plotting tool we use to create maps (origin : https://en.wikipedia.org/wiki/ISO_3166-2:US)
- US_states_political_inclination : a dataframe which contains the political inclination (D for Democratic or R for Republican) of each state for each presidential election (origin : https://en.wikipedia.org/wiki/Red_states_and_blue_states)
- US_states_population : a dataframe which contains the population of each state (origin : https://www.beerinstitute.org/member-portal/2020-brewers-almanac/)
- US_states_beer_consumption_per_capita : a dataframe which contains the beer consumption per capita for each state (origin : https://www.beerinstitute.org/member-portal/2020-brewers-almanac/)

In [13]:
import pandas as pd

## US_states dataframe

In [14]:
# Take the first table found on the Wikipedia ISO 3166-2:US page.
US_states = pd.read_html('https://en.wikipedia.org/wiki/ISO_3166-2:US')[0]
# Disabled: US_states['Subdivision name (en)'] = US_states['Subdivision name (en)'].apply(lambda x: 'United States, ' + x)
US_states = (
    US_states
    # Drop the first three characters of every code (the displayed result is 'AL', 'AK', ...).
    .assign(Code=US_states['Code'].map(lambda code: code[3:]))
    # The subdivision category column is not kept in the final dataframe.
    .drop(columns=['Subdivision category'])
)
US_states.head()

Unnamed: 0,Code,Subdivision name (en)
0,AL,Alabama
1,AK,Alaska
2,AZ,Arizona
3,AR,Arkansas
4,CA,California


In [15]:
# Reuse the stored CSV when it exists; otherwise create it from the dataframe
# built in the previous cell.
# Only a missing file triggers the write: any other failure (malformed CSV,
# permission error, interruption) is allowed to surface instead of silently
# overwriting the existing file.
try:
    US_states = pd.read_csv('../Data/External_Data/US_states.csv')
except FileNotFoundError:
    US_states.to_csv('../Data/External_Data/US_states.csv', index=False)

## US_states_political_inclination dataframe

In [16]:
# Get every table from the Wikipedia page, then keep the first one that has a
# 'Year' column (the per-state election results table).
tables = pd.read_html('https://en.wikipedia.org/wiki/Red_states_and_blue_states')
Red_blue_states = next((table for table in tables if 'Year' in table.columns), None)
if Red_blue_states is None:
    # Fail loudly rather than continuing with an undefined (or stale) dataframe
    # if the page layout changed.
    raise ValueError("No table with a 'Year' column was found on the Red states and blue states page")
# The header has two levels; drop the second one so only the top-level labels remain.
Red_blue_states.columns = Red_blue_states.columns.droplevel(1)
Red_blue_states.head(8)

Unnamed: 0,Year,1972,1976,1980,1984,1988,1992,1996,2000,2004,2008,2012,2016,2020
0,Democratic candidate,George McGovern,Jimmy Carter,Jimmy Carter,Walter Mondale,Michael Dukakis,Bill Clinton,Bill Clinton,Al Gore,John Kerry,Barack Obama,Barack Obama,Hillary Clinton,Joe Biden
1,Republican candidate,Richard Nixon,Gerald Ford,Ronald Reagan,Ronald Reagan,George H. W. Bush,George H. W. Bush,Bob Dole,George W. Bush,George W. Bush,John McCain,Mitt Romney,Donald Trump,Donald Trump
2,National popular vote,Nixon,Carter,Reagan,Reagan,Bush,Clinton,Clinton,Gore,Bush,Obama,Obama,Clinton,Biden
3,Alabama,Nixon,Carter,Reagan,Reagan,Bush,Bush,Dole,Bush,Bush,McCain,Romney,Trump,Trump
4,Alaska,Nixon,Ford,Reagan,Reagan,Bush,Bush,Dole,Bush,Bush,McCain,Romney,Trump,Trump
5,Arizona,Nixon,Ford,Reagan,Reagan,Bush,Bush,Clinton,Bush,Bush,McCain,Romney,Trump,Biden
6,Arkansas,Nixon,Carter,Reagan,Reagan,Bush,Clinton,Clinton,Bush,Bush,McCain,Romney,Trump,Trump
7,California,Nixon,Ford,Reagan,Reagan,Bush,Clinton,Clinton,Gore,Kerry,Obama,Obama,Clinton,Biden


Let's extract the Democratic vs Republican candidates list for each year

In [17]:
# The first two rows hold the Democratic and Republican candidates; transpose
# them so that each election year becomes a row.
candidates = Red_blue_states[:2].T.reset_index()
# Promote the first row to column labels, then discard it from the data.
candidates.columns = candidates.iloc[0]
candidates = candidates[1:]
# Reduce every entry to its last whitespace-separated word (the surname).
# NOTE: DataFrame.applymap is deprecated in recent pandas releases in favour of DataFrame.map.
candidates = candidates.applymap(lambda entry: entry.split()[-1])
candidates

Unnamed: 0,Year,Democratic candidate,Republican candidate
1,1972,McGovern,Nixon
2,1976,Carter,Ford
3,1980,Carter,Reagan
4,1984,Mondale,Reagan
5,1988,Dukakis,Bush
6,1992,Clinton,Bush
7,1996,Clinton,Dole
8,2000,Gore,Bush
9,2004,Kerry,Bush
10,2008,Obama,McCain


Get a nice table of each candidate who won in each state by year of election

In [18]:
# Skip the three leading rows (the two candidate rows and the national popular
# vote) so that only the per-state rows remain, renumber them from 0, and label
# the first column 'State' instead of 'Year'.
US_states_politics = (
    Red_blue_states.iloc[3:]
    .reset_index(drop=True)
    .rename(columns={'Year': 'State'})
)
US_states_politics.head()

Unnamed: 0,State,1972,1976,1980,1984,1988,1992,1996,2000,2004,2008,2012,2016,2020
0,Alabama,Nixon,Carter,Reagan,Reagan,Bush,Bush,Dole,Bush,Bush,McCain,Romney,Trump,Trump
1,Alaska,Nixon,Ford,Reagan,Reagan,Bush,Bush,Dole,Bush,Bush,McCain,Romney,Trump,Trump
2,Arizona,Nixon,Ford,Reagan,Reagan,Bush,Bush,Clinton,Bush,Bush,McCain,Romney,Trump,Biden
3,Arkansas,Nixon,Carter,Reagan,Reagan,Bush,Clinton,Clinton,Bush,Bush,McCain,Romney,Trump,Trump
4,California,Nixon,Ford,Reagan,Reagan,Bush,Clinton,Clinton,Gore,Kerry,Obama,Obama,Clinton,Biden


Finally, let's get the political inclination of each state for each election

In [19]:
# Surname of the Democratic candidate for each election year.
democrat_by_year = dict(zip(candidates['Year'], candidates['Democratic candidate']))

US_states_political_inclination = US_states_politics.copy()
# For each year and each state: the state is Democratic ('D') when its winner is
# that year's Democratic candidate, otherwise it is Republican ('R').
# The comparison is made per year, so a surname that appears on the Democratic
# ticket in one election cannot flip the result of another election.
for year in US_states_politics.columns[1:]:
    # A KeyError here means the candidates table has no entry for this year.
    democrat = democrat_by_year[str(year)]
    US_states_political_inclination[year] = (
        US_states_politics[year].eq(democrat).map({True: 'D', False: 'R'})
    )

US_states_political_inclination.head()

Unnamed: 0,State,1972,1976,1980,1984,1988,1992,1996,2000,2004,2008,2012,2016,2020
0,Alabama,R,D,R,R,R,R,R,R,R,R,R,R,R
1,Alaska,R,R,R,R,R,R,R,R,R,R,R,R,R
2,Arizona,R,R,R,R,R,R,D,R,R,R,R,R,D
3,Arkansas,R,D,R,R,R,D,D,R,R,R,R,R,R
4,California,R,R,R,R,R,D,D,D,D,D,D,D,D


Create a .csv file with the dataframe

In [20]:
# If the csv file containing US_states_political_inclination file does not exist yet, create it
# Reuse the stored CSV when it exists; otherwise create it from the dataframe
# computed above.
# Only a missing file triggers the write: other errors (malformed CSV,
# permission problems, interruption) are raised instead of being swallowed.
try:
    US_states_political_inclination = pd.read_csv('../Data/External_Data/US_states_political_inclination.csv')
except FileNotFoundError:
    US_states_political_inclination.to_csv('../Data/External_Data/US_states_political_inclination.csv', index=False)

## US_states_population dataframe

In [21]:
# Load the per-state population table from the locally stored CSV file.
US_states_population = pd.read_csv(
    filepath_or_buffer='../Data/External_Data/US_states_population.csv',
)
US_states_population.head()

Unnamed: 0.1,Unnamed: 0,1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Alabama,4040389,4091025,4139269,4193114,4232965,4262731,4290403,4320281,4351037,...,4798834,4815564,4830460,4842481,4853160,4864745,4875120,4887871,4903185,4921532
1,Alaska,550043,569273,587073,596993,600624,601345,604918,608846,615205,...,722038,730399,737045,736307,737547,741504,739786,737438,731545,731158
2,Arizona,3665339,3762394,3867333,3993390,4147561,4306908,4432308,4552207,4667277,...,6473497,6556629,6634999,6733840,6833596,6945452,7048876,7171646,7278717,7421401
3,Arkansas,2350624,2370666,2394098,2423743,2450605,2480121,2504858,2524007,2538202,...,2940407,2952109,2959549,2967726,2978407,2990410,3002997,3013825,3017804,3030522
4,California,29811427,30414114,30875920,31147208,31317179,31493525,31780829,32217708,32682794,...,37641823,37960782,38280824,38625139,38953142,39209127,39399349,39557045,39512223,39368078


## US_states_beer_consumption_per_capita

In [22]:
# Load the per-state beer consumption table from the locally stored CSV file.
# Values are in gallons per capita per year.
US_states_beer_consumption = pd.read_csv(
    filepath_or_buffer='../Data/External_Data/US_states_beer_consumption_per_capita.csv',
)
US_states_beer_consumption.head()

Unnamed: 0,State,1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Alabama,29.2,28.9,29.3,29.0,29.7,29.3,29.7,29.5,29.9,...,30.4,30.4,29.0,29.5,29.3,30.0,28.9,28.4,29.0,29.1
1,Alaska,41.3,39.7,40.3,36.7,38.9,38.3,35.1,36.2,37.0,...,28.8,28.8,29.0,28.3,28.9,27.2,26.5,27.0,27.2,26.1
2,Arizona,41.9,40.9,40.6,40.5,38.8,38.9,40.3,39.3,39.1,...,29.7,29.5,29.0,28.8,27.4,27.1,26.4,26.2,25.7,27.0
3,Arkansas,29.0,28.8,29.1,28.5,28.4,28.0,29.0,28.5,29.2,...,26.3,26.3,25.9,24.9,24.3,24.0,23.8,23.3,23.2,23.9
4,California,34.6,32.5,31.1,30.4,29.7,29.0,28.2,28.6,28.2,...,26.8,25.9,25.7,26.2,26.0,25.5,25.3,24.8,24.6,24.5
