In [1]:
import pandas as pd
import requests
pd.set_option('max_rows', 100) 

In [2]:
url = 'https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports/03-31-2020.csv'

### Get daily data from the url

In [3]:
header = {
  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
  "X-Requested-With": "XMLHttpRequest"
}

r = requests.get(url, headers=header)
df = pd.read_html(r.text, header=[0], index_col=1)[0]

#### quick glance at the data

In [4]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Lat,Long_,Confirmed,Deaths,Recovered,Active
count,0.0,2432.0,2432.0,2434.0,2434.0,2434.0,2434.0
mean,,35.593296,-76.856383,352.295399,17.299507,73.144618,186.210764
std,,11.408142,44.898252,3988.014789,329.031803,1470.015611,2672.700243
min,,-42.8821,-159.596679,0.0,0.0,0.0,-6.0
25%,,33.176592,-94.69932,2.0,0.0,0.0,0.0
50%,,37.547916,-86.096751,6.0,0.0,0.0,0.0
75%,,41.406791,-79.272587,31.0,1.0,0.0,0.0
max,,71.7069,178.065,105792.0,12428.0,63153.0,77635.0


### Aggregate by state

In [5]:
state_totals = df[df.Country_Region=='US'].groupby(by='Province_State').sum()

In [6]:
state_totals.describe()

Unnamed: 0.1,Unnamed: 0,Lat,Long_,Confirmed,Deaths,Recovered,Active
count,58.0,58.0,58.0,58.0,58.0,58.0,58.0
mean,0.0,1397.261622,-3327.74741,3244.344828,66.775862,121.103448,0.0
std,0.0,1154.954186,2887.627484,10179.628981,208.85066,922.296384,0.0
min,0.0,0.0,-13378.805429,0.0,0.0,0.0,0.0
25%,0.0,418.858961,-5190.570613,252.5,4.25,0.0,0.0
50%,0.0,1152.194028,-3017.705127,659.5,12.5,0.0,0.0
75%,0.0,2292.925088,-864.637952,2343.0,52.75,0.0,0.0
max,0.0,4531.701456,145.6739,75833.0,1550.0,7024.0,0.0


In [7]:
state_totals.sort_values(by='Confirmed', ascending=False)

Unnamed: 0_level_0,Unnamed: 0,Lat,Long_,Confirmed,Deaths,Recovered,Active
Province_State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
New York,0.0,2389.536539,-4229.693579,75833,1550,0,0
New Jersey,0.0,846.015259,-1567.071444,18696,267,0,0
California,0.0,1795.311328,-5787.848423,8210,173,0,0
Michigan,0.0,2971.190821,-5777.476718,7615,259,0,0
Florida,0.0,1583.394696,-4540.237308,6741,85,0,0
Massachusetts,0.0,548.2932,-930.501649,6620,89,0,0
Illinois,0.0,2134.881632,-4722.348891,5994,99,0,0
Washington,0.0,1608.118427,-4108.874073,5432,225,0,0
Louisiana,0.0,1863.15607,-5508.646964,5237,239,0,0
Pennsylvania,0.0,2446.615349,-4655.077407,4963,63,0,0


In [8]:
state_totals.sort_values(by='Deaths', ascending=False)

Unnamed: 0_level_0,Unnamed: 0,Lat,Long_,Confirmed,Deaths,Recovered,Active
Province_State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
New York,0.0,2389.536539,-4229.693579,75833,1550,0,0
New Jersey,0.0,846.015259,-1567.071444,18696,267,0,0
Michigan,0.0,2971.190821,-5777.476718,7615,259,0,0
Louisiana,0.0,1863.15607,-5508.646964,5237,239,0,0
Washington,0.0,1608.118427,-4108.874073,5432,225,0,0
California,0.0,1795.311328,-5787.848423,8210,173,0,0
Georgia,0.0,4531.701456,-11543.485056,3929,111,0,0
Illinois,0.0,2134.881632,-4722.348891,5994,99,0,0
Massachusetts,0.0,548.2932,-930.501649,6620,89,0,0
Florida,0.0,1583.394696,-4540.237308,6741,85,0,0


In [9]:
df[df.Province_State=='New Jersey'].sort_values(by='Confirmed', ascending=False)

Unnamed: 0_level_0,Unnamed: 0,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
,,Unassigned,New Jersey,US,2020-03-31 23:43:56,0.0,0.0,3686,247,0,0,"Unassigned, New Jersey, US"
34003.0,,Bergen,New Jersey,US,2020-03-31 23:43:56,40.960109,-74.071643,2909,4,0,0,"Bergen, New Jersey, US"
34013.0,,Essex,New Jersey,US,2020-03-31 23:43:56,40.786813,-74.24658,1900,4,0,0,"Essex, New Jersey, US"
34017.0,,Hudson,New Jersey,US,2020-03-31 23:43:56,40.737662,-74.075157,1606,2,0,0,"Hudson, New Jersey, US"
34039.0,,Union,New Jersey,US,2020-03-31 23:43:56,40.658354,-74.306801,1418,0,0,0,"Union, New Jersey, US"
34031.0,,Passaic,New Jersey,US,2020-03-31 23:43:56,41.032386,-74.299541,1294,1,0,0,"Passaic, New Jersey, US"
34023.0,,Middlesex,New Jersey,US,2020-03-31 23:43:56,40.436299,-74.41427,1277,0,0,0,"Middlesex, New Jersey, US"
34025.0,,Monmouth,New Jersey,US,2020-03-31 23:43:56,40.265497,-74.222407,1140,2,0,0,"Monmouth, New Jersey, US"
34029.0,,Ocean,New Jersey,US,2020-03-31 23:43:56,39.889545,-74.280893,1022,1,0,0,"Ocean, New Jersey, US"
34027.0,,Morris,New Jersey,US,2020-03-31 23:43:56,40.860953,-74.545537,841,1,0,0,"Morris, New Jersey, US"


In [10]:
df[df.Province_State=='New Jersey'].sort_values(by='Deaths', ascending=False)

Unnamed: 0_level_0,Unnamed: 0,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
,,Unassigned,New Jersey,US,2020-03-31 23:43:56,0.0,0.0,3686,247,0,0,"Unassigned, New Jersey, US"
34013.0,,Essex,New Jersey,US,2020-03-31 23:43:56,40.786813,-74.24658,1900,4,0,0,"Essex, New Jersey, US"
34003.0,,Bergen,New Jersey,US,2020-03-31 23:43:56,40.960109,-74.071643,2909,4,0,0,"Bergen, New Jersey, US"
34007.0,,Camden,New Jersey,US,2020-03-31 23:43:56,39.803438,-74.963888,228,3,0,0,"Camden, New Jersey, US"
34017.0,,Hudson,New Jersey,US,2020-03-31 23:43:56,40.737662,-74.075157,1606,2,0,0,"Hudson, New Jersey, US"
34025.0,,Monmouth,New Jersey,US,2020-03-31 23:43:56,40.265497,-74.222407,1140,2,0,0,"Monmouth, New Jersey, US"
34031.0,,Passaic,New Jersey,US,2020-03-31 23:43:56,41.032386,-74.299541,1294,1,0,0,"Passaic, New Jersey, US"
34011.0,,Cumberland,New Jersey,US,2020-03-31 23:43:56,39.371994,-75.107126,18,1,0,0,"Cumberland, New Jersey, US"
34035.0,,Somerset,New Jersey,US,2020-03-31 23:43:56,40.564657,-74.61683,413,1,0,0,"Somerset, New Jersey, US"
34027.0,,Morris,New Jersey,US,2020-03-31 23:43:56,40.860953,-74.545537,841,1,0,0,"Morris, New Jersey, US"


In [11]:
df[df.Province_State=='New York'].sort_values(by='Confirmed', ascending=False)

Unnamed: 0_level_0,Unnamed: 0,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
36061.0,,New York City,New York,US,2020-03-31 23:43:56,40.767273,-73.971526,43119,932,0,0,"New York City, New York, US"
36119.0,,Westchester,New York,US,2020-03-31 23:43:56,41.162784,-73.757417,9967,10,0,0,"Westchester, New York, US"
36059.0,,Nassau,New York,US,2020-03-31 23:43:56,40.740665,-73.589419,8544,48,0,0,"Nassau, New York, US"
36103.0,,Suffolk,New York,US,2020-03-31 23:43:56,40.883201,-72.801217,6713,44,0,0,"Suffolk, New York, US"
36087.0,,Rockland,New York,US,2020-03-31 23:43:56,41.150279,-74.025605,2863,8,0,0,"Rockland, New York, US"
36071.0,,Orange,New York,US,2020-03-31 23:43:56,41.403375,-74.302408,1556,5,0,0,"Orange, New York, US"
36029.0,,Erie,New York,US,2020-03-31 23:43:56,42.76249,-78.730637,499,8,0,0,"Erie, New York, US"
36027.0,,Dutchess,New York,US,2020-03-31 23:43:56,41.764861,-73.743567,484,3,0,0,"Dutchess, New York, US"
36055.0,,Monroe,New York,US,2020-03-31 23:43:56,43.146389,-77.693229,257,8,0,0,"Monroe, New York, US"
36067.0,,Onondaga,New York,US,2020-03-31 23:43:56,43.004919,-76.199712,249,1,0,0,"Onondaga, New York, US"


In [12]:
df[df.Province_State=='New Hampshire'].sort_values(by='Confirmed', ascending=False)

Unnamed: 0_level_0,Unnamed: 0,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
33015.0,,Rockingham,New Hampshire,US,2020-03-31 23:43:56,42.984997,-71.128834,137,1,0,0,"Rockingham, New Hampshire, US"
33011.0,,Hillsborough,New Hampshire,US,2020-03-31 23:43:56,42.915378,-71.720025,99,2,0,0,"Hillsborough, New Hampshire, US"
33009.0,,Grafton,New Hampshire,US,2020-03-31 23:43:56,43.937798,-71.820572,34,0,0,0,"Grafton, New Hampshire, US"
33017.0,,Strafford,New Hampshire,US,2020-03-31 23:43:56,43.291833,-71.02336,28,0,0,0,"Strafford, New Hampshire, US"
33013.0,,Merrimack,New Hampshire,US,2020-03-31 23:43:56,43.29663,-71.681157,27,0,0,0,"Merrimack, New Hampshire, US"
33001.0,,Belknap,New Hampshire,US,2020-03-31 23:43:56,43.516373,-71.416842,12,0,0,0,"Belknap, New Hampshire, US"
33003.0,,Carroll,New Hampshire,US,2020-03-31 23:43:56,43.874986,-71.204302,12,0,0,0,"Carroll, New Hampshire, US"
33005.0,,Cheshire,New Hampshire,US,2020-03-31 23:43:56,42.92016,-72.25311,4,0,0,0,"Cheshire, New Hampshire, US"
33019.0,,Sullivan,New Hampshire,US,2020-03-31 23:43:56,43.360941,-72.222031,4,0,0,0,"Sullivan, New Hampshire, US"
,,Unassigned,New Hampshire,US,2020-03-31 23:43:56,0.0,0.0,0,0,0,0,"Unassigned, New Hampshire, US"


In [13]:
population = pd.read_csv('data/SCPRC-EST2019-18+POP-RES.csv')
population.sort_values(by='POPESTIMATE2019', ascending=False).head(10)

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,NAME,POPESTIMATE2019,POPEST18PLUS2019,PCNT_POPEST18PLUS
0,10,0,0,0,United States,328239523,255200373,77.7
5,40,4,9,6,California,39512223,30617582,77.5
44,40,3,7,48,Texas,28995881,21596071,74.5
10,40,3,5,12,Florida,21477737,17247808,80.3
33,40,1,2,36,New York,19453561,15425262,79.3
39,40,1,2,42,Pennsylvania,12801989,10167376,79.4
14,40,2,3,17,Illinois,12671821,9853946,77.8
36,40,2,3,39,Ohio,11689100,9111081,77.9
11,40,3,5,13,Georgia,10617423,8113542,76.4
34,40,3,5,37,North Carolina,10488084,8187369,78.1


In [14]:
geog = pd.read_csv('data/us_geography.csv')
geog.head(10)

Unnamed: 0,State,tot_sq_mi,tot_sq_km,land_sq_mi,land_sq_km,water_sq_mi,water_sq_km,inland_sq_mi,inland_sq_km,coast_sq_mi,coast_sq_km,gl_sq_mi,gl_sq_km,terr_sq_mi,terr_sq_km,latitude,longitude
0,United States,3796742,9833517,3531905,9147593,264837,685924,85647,221824,42337,109652,60094,155643,76759,198806,,
1,Alabama,52420,135767,50645,131171,1775,4597,1058,2740,517,1340,—,—,199,516,32.739632,-86.843459
2,Alaska,665384,1723337,570641,1477953,94743,245383,19304,49997,26119,67647,—,—,49320,127739,63.346191,-152.837068
3,Arizona,113990,295234,113594,294207,396,1026,396,1026,—,—,—,—,—,—,34.209964,-111.602401
4,Arkansas,53179,137732,52035,134771,1143,2961,1143,2961,—,—,—,—,—,—,34.895526,-92.444626
5,California,163695,423967,155779,403466,7916,20501,2833,7339,245,634,—,—,4837,12528,37.148573,-119.540651
6,Colorado,104094,269601,103642,268431,452,1170,452,1170,—,—,—,—,—,—,38.993575,-105.507774
7,Connecticut,5543,14357,4842,12542,701,1816,171,443,530,1372,—,—,—,—,41.579784,-72.746667
8,Delaware,2489,6446,1949,5047,540,1399,91,237,355,920,—,—,94,242,38.99355,-75.447374
9,District of Columbia,68,177,61,158,7,19,7,19,—,—,—,—,—,—,38.904148,-77.017094


### Merge state population date with state COVID-19 data

In [15]:
state_columns=['Confirmed', 'Deaths', 'Recovered', 'Active', ]

In [16]:
state_with_population = pd.merge(state_totals[state_columns], population[['NAME', 'POPESTIMATE2019']], left_on='Province_State', right_on='NAME')
state_with_population['fraction_confirmed'] = state_with_population['Confirmed'] / state_with_population['POPESTIMATE2019'] * 100.0
state_with_population['deaths'] = state_with_population['Deaths'] / state_with_population['POPESTIMATE2019'] * 100.0
state_with_population['death_rate?'] = state_with_population['Deaths'] / state_with_population['Confirmed'] * 100.0


In [17]:
state_with_population.sort_values(by='death_rate?', ascending=False)

Unnamed: 0,Confirmed,Deaths,Recovered,Active,NAME,POPESTIMATE2019,fraction_confirmed,deaths,death_rate?
18,5237,239,0,0,Louisiana,4648794,0.112653,0.005141,4.563681
45,293,13,0,0,Vermont,623989,0.046956,0.002083,4.43686
47,5432,225,0,0,Washington,7614893,0.071334,0.002955,4.142121
36,568,23,0,0,Oklahoma,3956971,0.014354,0.000581,4.049296
22,7615,259,0,0,Michigan,9986857,0.07625,0.002593,3.401182
7,319,10,0,0,Delaware,973764,0.032759,0.001027,3.134796
10,3929,111,0,0,Georgia,10617423,0.037005,0.001045,2.825146
37,690,18,0,0,Oregon,4217737,0.016359,0.000427,2.608696
26,198,5,0,0,Montana,1068778,0.018526,0.000468,2.525253
1,119,3,0,0,Alaska,731545,0.016267,0.00041,2.521008


In [18]:
state_with_population.sort_values(by='fraction_confirmed', ascending=False)

Unnamed: 0,Confirmed,Deaths,Recovered,Active,NAME,POPESTIMATE2019,fraction_confirmed,deaths,death_rate?
32,75833,1550,0,0,New York,19453561,0.389816,0.007968,2.043965
30,18696,267,0,0,New Jersey,8882190,0.210489,0.003006,1.428113
18,5237,239,0,0,Louisiana,4648794,0.112653,0.005141,4.563681
21,6620,89,0,0,Massachusetts,6892503,0.096046,0.001291,1.344411
6,3128,69,0,0,Connecticut,3565287,0.087735,0.001935,2.205882
22,7615,259,0,0,Michigan,9986857,0.07625,0.002593,3.401182
47,5432,225,0,0,Washington,7614893,0.071334,0.002955,4.142121
8,495,9,0,0,District of Columbia,705749,0.070138,0.001275,1.818182
5,2966,69,0,0,Colorado,5758736,0.051504,0.001198,2.326365
13,5994,99,0,0,Illinois,12671821,0.047302,0.000781,1.651652


In [19]:
state_with_population.sort_values(by='deaths', ascending=False)

Unnamed: 0,Confirmed,Deaths,Recovered,Active,NAME,POPESTIMATE2019,fraction_confirmed,deaths,death_rate?
32,75833,1550,0,0,New York,19453561,0.389816,0.007968,2.043965
18,5237,239,0,0,Louisiana,4648794,0.112653,0.005141,4.563681
30,18696,267,0,0,New Jersey,8882190,0.210489,0.003006,1.428113
47,5432,225,0,0,Washington,7614893,0.071334,0.002955,4.142121
22,7615,259,0,0,Michigan,9986857,0.07625,0.002593,3.401182
45,293,13,0,0,Vermont,623989,0.046956,0.002083,4.43686
6,3128,69,0,0,Connecticut,3565287,0.087735,0.001935,2.205882
21,6620,89,0,0,Massachusetts,6892503,0.096046,0.001291,1.344411
8,495,9,0,0,District of Columbia,705749,0.070138,0.001275,1.818182
5,2966,69,0,0,Colorado,5758736,0.051504,0.001198,2.326365


In [20]:
state_with_population.describe()

Unnamed: 0,Confirmed,Deaths,Recovered,Active,POPESTIMATE2019,fraction_confirmed,deaths,death_rate?
count,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0
mean,3680.0,75.686275,0.0,0.0,6436069.0,0.041844,0.000892,1.928334
std,10794.788457,221.479434,0.0,0.0,7360660.0,0.0603,0.001382,0.996244
min,108.0,0.0,0.0,0.0,578759.0,0.008892,0.0,0.0
25%,395.5,7.5,0.0,0.0,1789606.0,0.015881,0.000245,1.307126
50%,987.0,18.0,0.0,0.0,4467673.0,0.022541,0.000427,1.818182
75%,3047.0,59.0,0.0,0.0,7446805.0,0.037886,0.000813,2.332113
max,75833.0,1550.0,0.0,0.0,39512220.0,0.389816,0.007968,4.563681
