In [64]:
# Wikipedia tables are scraped to CSV with the wikitablescrape package:
# https://github.com/rocheio/wiki-table-scrape

In [32]:
import wikitablescrape
import pandas as pd
import numpy as np
import pickle

# Accumulator for every metric collected in this notebook: each key is a
# metric name (e.g. 'afam_pop') and each value is a {state name: value} dict.
state_ranks = dict()

In [91]:
# African-American Population
# Scrape the Wikipedia page, then load the CSV expected under ./afam_pop/.
page_url = 'https://en.wikipedia.org/wiki/List_of_U.S._states_by_African-American_population'
wikitablescrape.scrape(url=page_url, output_name='afam_pop')
df = pd.read_csv('afam_pop/afam_pop.csv')


In [93]:
# Pair each 'State or territory' entry with its 'Rank' from the frame in `df`.
td = {
    place: rank
    for place, rank in zip(df['State or territory'], df['Rank'])
}
state_ranks['afam_pop'] = td

In [96]:
# Land Area
# Scrape the Wikipedia page, then load the CSV expected under ./land_area/.
page_url = 'https://en.wikipedia.org/wiki/List_of_U.S._states_by_area'
wikitablescrape.scrape(url=page_url, output_name='land_area')
df = pd.read_csv('land_area/land_area.csv')

In [97]:
# reset_index() moves the index levels into ordinary columns; the first of
# those is used as the key and 'level_4' as the value.
# NOTE(review): presumably the CSV's layout made read_csv build a multi-level
# index, and row 0 is a leftover header row -- confirm against the CSV.
area_table = df.reset_index()
area_rows = area_table.iloc[1:]  # skip the first row
td = dict(zip(area_rows.iloc[:, 0],
              area_rows.loc[:, 'level_4']))
# Only plain-str values are kept (this drops e.g. NaN floats) and cast to int.
state_ranks['land_area'] = {
    name: int(value)
    for name, value in td.items()
    if type(value) is str
}


In [98]:
# Fertility Rate (2017)
# Scrape the Wikipedia page, then load the CSV expected under ./fertility_rate/.
page_url = 'https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_fertility_rate'
wikitablescrape.scrape(url=page_url, output_name='fertility_rate')
df = pd.read_csv('fertility_rate/fertility_rate.csv')

In [109]:
# Pair the first column of `df` with its 'TFR2017' column.
td = dict(zip(df.iloc[:, 0],
              df.loc[:, 'TFR2017']))
# Keep only entries whose value starts with a digit and convert them to float.
# str(v)[:1] is used instead of v[0] so that NaN cells, empty strings and
# already-numeric values no longer raise TypeError/IndexError; for non-empty
# strings the result is identical to the previous v[0] check.
td = {k: float(v) for k, v in td.items() if str(v)[:1].isdigit()}
state_ranks['fertility_rate17'] = td

In [110]:
# GDP per capita
# Scrape the Wikipedia page, then load the CSV expected under ./gdp_pc/.
page_url = 'https://en.wikipedia.org/wiki/List_of_U.S._states_by_GDP_per_capita'
wikitablescrape.scrape(url=page_url, output_name='gdp_pc')
df = pd.read_csv('gdp_pc/gdp_pc.csv')

In [114]:
# Pair each 'State' with its 'Rank' from the frame in `df`.
td = dict(zip(df.loc[:, 'State'],
              df.loc[:, 'Rank']))
# Keep only entries whose rank starts with a digit and convert them to int.
# str(v)[:1] is used instead of v[0] so that NaN cells, empty strings and
# already-numeric ranks no longer raise TypeError/IndexError; for non-empty
# strings the result is identical to the previous v[0] check.
td = {k: int(v) for k, v in td.items() if str(v)[:1].isdigit()}
state_ranks['gdp_pc_rank'] = td

In [127]:
# Pair each 'State' with its '2017' value; the values are strings, so the
# comma separators are removed before converting to float.
td = {
    state: float(amount.replace(',', ''))
    for state, amount in zip(df.loc[:, 'State'], df.loc[:, '2017'])
}
state_ranks['gdp_pc'] = td

In [129]:
# Income Inequality
# Scrape the Wikipedia page, then load the CSV expected under ./income_ineq/.
page_url = 'https://en.wikipedia.org/wiki/List_of_U.S._states_by_Gini_coefficient'
wikitablescrape.scrape(url=page_url, output_name='income_ineq')
df = pd.read_csv('income_ineq/income_ineq.csv')

In [143]:
# Keys come from the second column of `df`. Gini values are strings; any
# leading/trailing '.' characters are stripped before converting to float.
state_ranks['income_ineq'] = {
    state: float(gini.strip('.'))
    for state, gini in zip(df.iloc[:, 1], df.loc[:, 'Gini Coefficient'])
}

# Same keys, mapped to the table's 'Rank' column as-is.
state_ranks['income_ineq_rank'] = {
    state: rank
    for state, rank in zip(df.iloc[:, 1], df.loc[:, 'Rank'])
}

In [148]:
# Population Density
# Scrape the Wikipedia page, then load the CSV expected under ./pop_density/.
page_url = 'https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States_by_population_density'
wikitablescrape.scrape(url=page_url, output_name='pop_density')
df = pd.read_csv('pop_density/pop_density.csv')

In [152]:
# Keys come from the first column of `df`; values are stored unconverted.
state_ranks['pop_dens_rank'] = {
    place: rank
    for place, rank in zip(df.iloc[:, 0], df.loc[:, 'Pop. dens. Rank'])
}

state_ranks['pop_dens_km'] = {
    place: density
    for place, density in zip(df.iloc[:, 0], df.loc[:, 'Density (Pop./ km2)'])
}


In [183]:
# 538 Partisan Lean
# The CSV was built by copy/pasting the FiveThirtyEight data:
# https://github.com/fivethirtyeight/data/tree/master/partisan-lean
# NOTE: DC's PVI is imputed with California's value (D+24): DC is heavily
# Democratic, but its PVI cannot be calculated directly.

df = pd.read_csv('pvi_538.txt')
# Each 'pvi_538' entry is split into its first character (the party letter)
# and the number from index 2 onward. Entries starting with 'R' are stored as
# positive values; all other entries are stored as negative values.
td = {
    state: magnitude if party == 'R' else -magnitude
    for state, party, magnitude in (
        (name, pvi[0], abs(float(pvi[2:])))
        for name, pvi in zip(df['state'], df['pvi_538'])
    )
}
td['District of Columbia'] = -24.0
state_ranks['partisan_lean'] = td

In [188]:
# Census regions
# Source: https://github.com/cphalpert/census-regions/
df = pd.read_csv('us census bureau regions and divisions.csv')

# Map every 'State' entry to its 'Region' ...
state_ranks['census_region'] = {
    state: region
    for state, region in zip(df['State'], df['Region'])
}

# ... and to its 'Division'.
state_ranks['census_division'] = {
    state: division
    for state, division in zip(df['State'], df['Division'])
}

In [192]:
# Persist the collected metrics to state_ranks.pickle. The context manager
# closes the file handle as soon as the dump finishes, so the data is flushed
# to disk rather than left in an open, never-closed file object.
with open('state_ranks.pickle', 'wb') as pickle_file:
    pickle.dump(state_ranks, pickle_file)