In [64]:
import pandas as pd
import geopandas as gpd
import numpy as np

%matplotlib inline

In [198]:
# shape file
us_shape = gpd.read_file('../../src/data/cb_2018_us_state_500k.shp')

# president voting data
us_presidency = pd.read_csv('../../src/data/president_county_candidate.csv')

# Covid data per state
df_covid = pd.read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv')

# Covid data for all of the USA
df_us_total = pd.read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us.csv')

# Population estimates for all of the USA
state_population = pd.read_csv('../../src/data/state_population.csv')

# governers per state

url = 'https://en.wikipedia.org/wiki/List_of_current_United_States_governors'
governers_list = pd.read_html(url)
df_gov = governers_list[0]#[['State', 'Governer', 'Party.1']]
df_gov = df_gov.iloc[:,[0,2,4]]

In [58]:
us_shape.head()

Unnamed: 0,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,geometry
0,28,1779790,0400000US28,28,MS,Mississippi,0,121533519481,3926919758,"MULTIPOLYGON (((-88.50297 30.21523, -88.49176 ..."
1,37,1027616,0400000US37,37,NC,North Carolina,0,125923656064,13466071395,"MULTIPOLYGON (((-75.72681 35.93584, -75.71827 ..."
2,40,1102857,0400000US40,40,OK,Oklahoma,0,177662925723,3374587997,"POLYGON ((-103.00257 36.52659, -103.00219 36.6..."
3,51,1779803,0400000US51,51,VA,Virginia,0,102257717110,8528531774,"MULTIPOLYGON (((-75.74241 37.80835, -75.74151 ..."
4,54,1779805,0400000US54,54,WV,West Virginia,0,62266474513,489028543,"POLYGON ((-82.64320 38.16909, -82.64300 38.169..."


# Create a column called 'percent republican' 

## The percentage of republican voters per state

In [53]:
trump_votes = us_presidency[us_presidency.candidate == 'Donald Trump'].groupby(['state']).votes.sum()
all_votes = us_presidency.groupby(['state']).votes.sum()
percent_red = trump_votes/all_votes * 100

# Normalize data by total cases in the US

In [95]:
# Normalize state covid by total us cases at that time
df_covid_total = df_covid.merge(df_us_total, left_on='date', right_on='date')
df_covid_total['cases_norm'] = df_covid_total.cases_x/df_covid_total.cases_y
df_covid_total['deaths_norm'] = df_covid_total.deaths_x/df_covid_total.deaths_y
df_covid_total_norm = df_covid_total.replace(np.nan, 0)[['date','state', 'cases_norm', 'deaths_norm']]
df_covid_total_norm

Unnamed: 0,date,state,cases_norm,deaths_norm
0,2020-01-21,Washington,1.000000,0.000000
1,2020-01-22,Washington,1.000000,0.000000
2,2020-01-23,Washington,1.000000,0.000000
3,2020-01-24,Illinois,0.500000,0.000000
4,2020-01-24,Washington,0.500000,0.000000
...,...,...,...,...
13814,2020-11-08,Virginia,0.019102,0.015574
13815,2020-11-08,Washington,0.012157,0.010721
13816,2020-11-08,West Virginia,0.002823,0.002109
13817,2020-11-08,Wisconsin,0.028021,0.010066


# Filter the month to the month of July percent cases and deaths

In [94]:
df_covid_total.date = pd.to_datetime(df_covid_total.date)
df_covid_total['month'] = df_covid_total.date.dt.strftime('%m')
percent_cases_state = df_covid_total[df_covid_total.month == '07'].groupby(['state']).cases_norm.mean() * 100 
percent_deaths_state = df_covid_total[df_covid_total.month == '07'].groupby(['state']).deaths_norm.mean() * 100 

In [22]:
df_us_total.head()

Unnamed: 0,date,cases,deaths
0,2020-01-21,1,0
1,2020-01-22,1,0
2,2020-01-23,1,0
3,2020-01-24,2,0
4,2020-01-25,3,0


# Pull STATE NAME and population estimates 

In [52]:
df_state_data = state_population[['STATE', 'NAME', 'POPESTIMATE2019']]

# Create a "change in cases" column

In [153]:
# Merge total cases with united states cases 
df_covid_total = df_covid.merge(df_us_total,
                                left_on='date',
                                right_on='date',
                                suffixes=('_state', '_national'))

# convert date to datetime object
df_covid_total.date = pd.to_datetime(df_covid_total.date)

# create a months column and then take the mean
df_covid_total['month'] = df_covid_total.date.dt.strftime('%m')


# April cases
df_covid_total_april = df_covid_total[df_covid_total.month == '04']
april_cases = df_covid_total_april.groupby(['state']).cases_state.mean()

# July cases
df_covid_total_july = df_covid_total[df_covid_total.month == '07']
july_cases = df_covid_total_july.groupby(['state']).cases_state.mean()

# Increase in Cases
change_in_cases = july_cases - april_cases

change_in_cases
# df_covid_total['cases_state'] = cases_state
# df_covid_total['deaths_state'] = deaths_state


df_covid_pop = df_covid_total.merge(df_state_data, left_on= 'state', right_on='NAME')
df_covid_pop

Unnamed: 0,date,state,fips,cases_state,deaths_state,cases_national,deaths_national,month,STATE,NAME,POPESTIMATE2019
0,2020-01-21,Washington,53,1,0,1,0,01,53,Washington,7614893
1,2020-01-22,Washington,53,1,0,1,0,01,53,Washington,7614893
2,2020-01-23,Washington,53,1,0,1,0,01,53,Washington,7614893
3,2020-01-24,Washington,53,1,0,2,0,01,53,Washington,7614893
4,2020-01-25,Washington,53,1,0,3,0,01,53,Washington,7614893
...,...,...,...,...,...,...,...,...,...,...,...
12868,2020-11-04,West Virginia,54,25987,472,9576594,234223,11,54,West Virginia,1792147
12869,2020-11-05,West Virginia,54,26547,480,9698098,235331,11,54,West Virginia,1792147
12870,2020-11-06,West Virginia,54,27087,487,9830895,236554,11,54,West Virginia,1792147
12871,2020-11-07,West Virginia,54,27742,502,9957051,237567,11,54,West Virginia,1792147


In [154]:
df_covid_pop

Unnamed: 0,date,state,fips,cases_state,deaths_state,cases_national,deaths_national,month,STATE,NAME,POPESTIMATE2019
0,2020-01-21,Washington,53,1,0,1,0,01,53,Washington,7614893
1,2020-01-22,Washington,53,1,0,1,0,01,53,Washington,7614893
2,2020-01-23,Washington,53,1,0,1,0,01,53,Washington,7614893
3,2020-01-24,Washington,53,1,0,2,0,01,53,Washington,7614893
4,2020-01-25,Washington,53,1,0,3,0,01,53,Washington,7614893
...,...,...,...,...,...,...,...,...,...,...,...
12868,2020-11-04,West Virginia,54,25987,472,9576594,234223,11,54,West Virginia,1792147
12869,2020-11-05,West Virginia,54,26547,480,9698098,235331,11,54,West Virginia,1792147
12870,2020-11-06,West Virginia,54,27087,487,9830895,236554,11,54,West Virginia,1792147
12871,2020-11-07,West Virginia,54,27742,502,9957051,237567,11,54,West Virginia,1792147


In [155]:
# Turn state delta into a dataframe

case_delta = pd.DataFrame(change_in_cases)
case_delta.columns = ['case_delta']

In [156]:
df_covid_pop_state_delta = df_covid_pop.merge(case_delta, how='inner', on='state')

In [159]:
df_covid_pop_state_delta.columns = [x.lower() for x in df_covid_pop_state_delta.columns]

In [161]:
df_covid_pop_state_delta.iloc[:, 8]


0        53
1        53
2        53
3        53
4        53
         ..
12868    54
12869    54
12870    54
12871    54
12872    54
Name: state, Length: 12873, dtype: int64

In [178]:
columns = df_covid_pop_state_delta.columns.to_list()
columns[8] = 'drop'
df_covid_pop_state_delta.columns = columns

In [181]:
df_covid_pop_state_delta = df_covid_pop_state_delta.drop(columns=['drop', 'name'])

# Merge the the state part data on state

In [192]:
df_gov.columns = ['state', 'governer', 'party']
party = df_gov[['state', 'party']]
party.party[22] = 'Democratic'
party.party[47] = 'Republican'


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [201]:
df_covid_pop_state_party = df_covid_pop_state_delta.merge(party, on='state')

In [203]:
us_shape

Unnamed: 0,statefp,statens,affgeoid,geoid,stusps,name,lsad,aland,awater,geometry
0,28,1779790,0400000US28,28,MS,Mississippi,0,121533519481,3926919758,"MULTIPOLYGON (((-88.50297 30.21523, -88.49176 ..."
1,37,1027616,0400000US37,37,NC,North Carolina,0,125923656064,13466071395,"MULTIPOLYGON (((-75.72681 35.93584, -75.71827 ..."
2,40,1102857,0400000US40,40,OK,Oklahoma,0,177662925723,3374587997,"POLYGON ((-103.00257 36.52659, -103.00219 36.6..."
3,51,1779803,0400000US51,51,VA,Virginia,0,102257717110,8528531774,"MULTIPOLYGON (((-75.74241 37.80835, -75.74151 ..."
4,54,1779805,0400000US54,54,WV,West Virginia,0,62266474513,489028543,"POLYGON ((-82.64320 38.16909, -82.64300 38.169..."
5,22,1629543,0400000US22,22,LA,Louisiana,0,111897594374,23753621895,"MULTIPOLYGON (((-88.86770 29.86155, -88.86566 ..."
6,26,1779789,0400000US26,26,MI,Michigan,0,146600952990,103885855702,"MULTIPOLYGON (((-83.19159 42.03537, -83.18993 ..."
7,25,606926,0400000US25,25,MA,Massachusetts,0,20205125364,7129925486,"MULTIPOLYGON (((-70.23405 41.28565, -70.22361 ..."
8,16,1779783,0400000US16,16,ID,Idaho,0,214049787659,2391722557,"POLYGON ((-117.24267 44.39655, -117.23484 44.3..."
9,12,294478,0400000US12,12,FL,Florida,0,138949136250,31361101223,"MULTIPOLYGON (((-80.17628 25.52505, -80.17395 ..."


# Merge on shape file data

In [208]:
us_shape.statefp = us_shape.statefp.astype(int)

In [211]:
us_shape.columns = [x.lower() for x in us_shape.columns]
final_shape_data = us_shape.merge(df_covid_pop_state_party, left_on='statefp', right_on='fips')


In [214]:
final_shape_data.to_pickle('../../src/data/shape_covid_pop.pickle')

In [219]:
percent_red = pd.DataFrame(percent_red)
percent_red.columns = ['percent_red']

In [None]:
final_shape_data.merge(percent_red, 