In [1]:
from io import StringIO

import pandas as pd
import requests
# Show up to 100 rows when a DataFrame is displayed. The fully qualified key
# is required: the bare 'max_rows' pattern matches more than one option in
# recent pandas releases (e.g. 'styler.render.max_rows') and raises OptionError.
pd.set_option('display.max_rows', 100)

In [2]:
# GitHub page of the 03-29-2020 daily report in the CSSEGISandData/COVID-19
# repository (split across two literals only for readability).
url = (
    'https://github.com/CSSEGISandData/COVID-19/tree/master/'
    'csse_covid_19_data/csse_covid_19_daily_reports/03-29-2020.csv'
)

### Get daily data from the url

In [3]:
# Browser-like request headers sent with the page request.
header = {
  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
  "X-Requested-With": "XMLHttpRequest"
}

# Bound the wait on a stalled connection and stop on an HTTP error status,
# instead of handing an error page to the HTML parser (which would otherwise
# fail later with an unrelated "no tables found" style error).
r = requests.get(url, headers=header, timeout=30)
r.raise_for_status()

# Parse the first HTML table on the page: row 0 supplies the column names and
# column 0 becomes the index. The markup is wrapped in StringIO because passing
# literal HTML as a plain string to read_html is deprecated in newer pandas.
# NOTE(review): .iloc[:-1] discards the final table row — confirm that row is
# not a real data record.
df = pd.read_html(StringIO(r.text), header=[0], index_col=0)[0].iloc[:-1]

#### quick glance at the data

In [4]:
# Rows whose Country_Region is 'US', ordered alphabetically by Province_State.
(
    df.loc[df['Country_Region'] == 'US']
      .sort_values('Province_State')
)

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
,1047.0,Dallas,Alabama,US,2020-03-29 23:08:25,32.326881,-87.108667,2,0,0,0,"Dallas, Alabama, US"
,1055.0,Etowah,Alabama,US,2020-03-29 23:08:25,34.045673,-86.040519,6,0,0,0,"Etowah, Alabama, US"
,1053.0,Escambia,Alabama,US,2020-03-29 23:08:25,31.125679,-87.159187,1,0,0,0,"Escambia, Alabama, US"
,1051.0,Elmore,Alabama,US,2020-03-29 23:08:25,32.597854,-86.144153,13,0,0,0,"Elmore, Alabama, US"
,1009.0,Blount,Alabama,US,2020-03-29 23:08:25,33.982109,-86.567906,5,0,0,0,"Blount, Alabama, US"
...,...,...,...,...,...,...,...,...,...,...,...,...
,56031.0,Platte,Wyoming,US,2020-03-29 23:08:25,42.132991,-104.966331,0,0,0,0,"Platte, Wyoming, US"
,56043.0,Washakie,Wyoming,US,2020-03-29 23:08:25,43.904516,-107.680187,1,0,0,0,"Washakie, Wyoming, US"
,56005.0,Campbell,Wyoming,US,2020-03-29 23:08:25,44.248861,-105.547440,1,0,0,0,"Campbell, Wyoming, US"
,56009.0,Converse,Wyoming,US,2020-03-29 23:08:25,42.972723,-105.508185,1,0,0,0,"Converse, Wyoming, US"


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the daily report.
df.describe()

Unnamed: 0,FIPS,Lat,Long_,Confirmed,Deaths,Recovered,Active
count,3148.0,3433.0,3433.0,3433.0,3433.0,3433.0,3433.0
mean,30472.462834,36.862569,-82.775425,209.761142,9.881736,43.426158,115.234489
std,15295.72416,10.053753,39.143801,2956.602158,234.89099,1161.951657,2024.815327
min,1001.0,-42.8821,-170.132,0.0,0.0,0.0,-6.0
25%,18180.5,33.953399,-97.632172,0.0,0.0,0.0,0.0
50%,29182.0,38.071225,-89.046796,2.0,0.0,0.0,0.0
75%,45089.5,41.729806,-81.727132,10.0,0.0,0.0,0.0
max,99999.0,71.7069,178.065,97689.0,10779.0,62570.0,73880.0


### Aggregate by state

In [6]:
# Per-state totals over the US rows of the daily report.
# numeric_only=True restricts the aggregation to numeric columns; without it,
# pandas 2.x also tries to aggregate the text columns (Admin2, Last_Update,
# Combined_Key, ...) instead of silently dropping them.
# NOTE(review): FIPS, Lat and Long_ are summed as well, which is not a
# meaningful quantity — only the case-count columns should be read from this.
state_totals = (
    df[df.Country_Region == 'US']
    .groupby(by='Province_State')
    .sum(numeric_only=True)
)

In [7]:
# Summary statistics of the per-state sums.
state_totals.describe()

Unnamed: 0,FIPS,Lat,Long_,Confirmed,Deaths,Recovered,Active
count,59.0,59.0,59.0,59.0,59.0,59.0,59.0
mean,1625887.0,2050.698623,-4918.486094,2387.898305,41.813559,45.169492,0.0
std,1995046.0,1723.851856,4444.368446,7897.064059,129.749291,346.953448,0.0
min,0.0,-14.271,-25057.879425,0.0,0.0,0.0,0.0
25%,160012.0,561.685748,-7320.23271,151.5,2.0,0.0,0.0
50%,1288529.0,1947.589655,-4708.049596,503.0,9.0,0.0,0.0
75%,2269284.0,3302.767521,-1044.429181,1686.5,30.5,0.0,0.0
max,12256520.0,8041.803434,145.6739,59648.0,965.0,2665.0,0.0


In [8]:
# States ranked by confirmed cases, largest first.
state_totals.sort_values(
    'Confirmed',
    ascending=False,
)

Unnamed: 0_level_0,FIPS,Lat,Long_,Confirmed,Deaths,Recovered,Active
Province_State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
New York,2235844.0,2637.737383,-4679.399365,59648,965,0,0
New Jersey,714441.0,846.015259,-1567.071444,13386,161,0,0
California,351364.0,2194.949775,-7002.258461,5852,124,0,0
Michigan,2164889.0,3655.617539,-7070.178085,5488,132,0,0
Massachusetts,350196.0,630.994149,-1071.277031,4955,48,0,0
Illinois,1744404.0,4064.049006,-9096.189667,4596,66,0,0
Washington,2068521.0,1842.11767,-4708.049596,4465,198,0,0
Florida,808550.0,1939.030577,-5540.949842,4246,56,0,0
Louisiana,1412096.0,1989.915353,-5876.747946,3540,151,0,0
Pennsylvania,2818489.0,2734.81125,-5200.226295,3432,41,0,0


In [9]:
# States ranked by confirmed cases, largest first.
# NOTE(review): this expression is identical to the preceding cell's.
state_totals.sort_values(by=['Confirmed'], ascending=[False])

Unnamed: 0_level_0,FIPS,Lat,Long_,Confirmed,Deaths,Recovered,Active
Province_State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
New York,2235844.0,2637.737383,-4679.399365,59648,965,0,0
New Jersey,714441.0,846.015259,-1567.071444,13386,161,0,0
California,351364.0,2194.949775,-7002.258461,5852,124,0,0
Michigan,2164889.0,3655.617539,-7070.178085,5488,132,0,0
Massachusetts,350196.0,630.994149,-1071.277031,4955,48,0,0
Illinois,1744404.0,4064.049006,-9096.189667,4596,66,0,0
Washington,2068521.0,1842.11767,-4708.049596,4465,198,0,0
Florida,808550.0,1939.030577,-5540.949842,4246,56,0,0
Louisiana,1412096.0,1989.915353,-5876.747946,3540,151,0,0
Pennsylvania,2818489.0,2734.81125,-5200.226295,3432,41,0,0


In [10]:
# Per-state sums over the US rows, ranked by deaths (largest first).
# numeric_only=True restricts the aggregation to numeric columns; without it,
# pandas 2.x also tries to aggregate the text columns rather than dropping them.
(
    df[df.Country_Region == 'US']
    .groupby(by='Province_State')
    .sum(numeric_only=True)
    .sort_values(by='Deaths', ascending=False)
)

Unnamed: 0_level_0,FIPS,Lat,Long_,Confirmed,Deaths,Recovered,Active
Province_State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
New York,2235844.0,2637.737383,-4679.399365,59648,965,0,0
Washington,2068521.0,1842.11767,-4708.049596,4465,198,0,0
New Jersey,714441.0,846.015259,-1567.071444,13386,161,0,0
Louisiana,1412096.0,1989.915353,-5876.747946,3540,151,0,0
Michigan,2164889.0,3655.617539,-7070.178085,5488,132,0,0
California,351364.0,2194.949775,-7002.258461,5852,124,0,0
Georgia,2092677.0,5216.556611,-13288.811358,2651,80,0,0
Illinois,1744404.0,4064.049006,-9096.189667,4596,66,0,0
Florida,808550.0,1939.030577,-5540.949842,4246,56,0,0
Massachusetts,350196.0,630.994149,-1071.277031,4955,48,0,0


In [11]:
# New Jersey rows, highest confirmed count first.
(
    df.loc[df['Province_State'] == 'New Jersey']
      .sort_values('Confirmed', ascending=False)
)

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
,,Unassigned,New Jersey,US,2020-03-29 23:08:25,0.0,0.0,3020,56,0,0,"Unassigned, New Jersey, US"
,34003.0,Bergen,New Jersey,US,2020-03-29 23:08:25,40.960109,-74.071643,2169,23,0,0,"Bergen, New Jersey, US"
,34013.0,Essex,New Jersey,US,2020-03-29 23:08:25,40.786813,-74.24658,1227,13,0,0,"Essex, New Jersey, US"
,34017.0,Hudson,New Jersey,US,2020-03-29 23:08:25,40.737662,-74.075157,974,6,0,0,"Hudson, New Jersey, US"
,34023.0,Middlesex,New Jersey,US,2020-03-29 23:08:25,40.436299,-74.41427,938,12,0,0,"Middlesex, New Jersey, US"
,34039.0,Union,New Jersey,US,2020-03-29 23:08:25,40.658354,-74.306801,896,7,0,0,"Union, New Jersey, US"
,34025.0,Monmouth,New Jersey,US,2020-03-29 23:08:25,40.265497,-74.222407,870,8,0,0,"Monmouth, New Jersey, US"
,34031.0,Passaic,New Jersey,US,2020-03-29 23:08:25,41.032386,-74.299541,831,10,0,0,"Passaic, New Jersey, US"
,34029.0,Ocean,New Jersey,US,2020-03-29 23:08:25,39.889545,-74.280893,759,6,0,0,"Ocean, New Jersey, US"
,34027.0,Morris,New Jersey,US,2020-03-29 23:08:25,40.860953,-74.545537,566,8,0,0,"Morris, New Jersey, US"


In [12]:
# New York rows, lowest confirmed count first.
(
    df.loc[df['Province_State'] == 'New York']
      .sort_values('Confirmed')
)

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
,36123.0,Yates,New York,US,2020-03-29 23:08:25,42.635055,-77.103699,0,0,0,0,"Yates, New York, US"
,36005.0,Bronx,New York,US,2020-03-29 23:08:25,40.852093,-73.862828,0,0,0,0,"Bronx, New York, US"
,36081.0,Queens,New York,US,2020-03-29 23:08:25,40.710881,-73.816847,0,0,0,0,"Queens, New York, US"
,36099.0,Seneca,New York,US,2020-03-29 23:08:25,42.78081,-76.824971,0,0,0,0,"Seneca, New York, US"
,,Unassigned,New York,US,2020-03-29 23:08:25,0.0,0.0,0,159,0,0,"Unassigned, New York, US"
,36049.0,Lewis,New York,US,2020-03-29 23:08:25,43.784416,-75.44904,0,0,0,0,"Lewis, New York, US"
,36047.0,Kings,New York,US,2020-03-29 23:08:25,40.636182,-73.949356,0,0,0,0,"Kings, New York, US"
,36085.0,Richmond,New York,US,2020-03-29 23:08:25,40.585822,-74.148086,0,0,0,0,"Richmond, New York, US"
,36097.0,Schuyler,New York,US,2020-03-29 23:08:25,42.39184,-76.87733,1,0,0,0,"Schuyler, New York, US"
,36035.0,Fulton,New York,US,2020-03-29 23:08:25,43.113639,-74.417988,1,0,0,0,"Fulton, New York, US"


In [13]:
# New York rows, lowest confirmed count first.
# NOTE(review): this expression is identical to the preceding cell's.
df.loc[df['Province_State'] == 'New York'].sort_values(by=['Confirmed'])

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
,36123.0,Yates,New York,US,2020-03-29 23:08:25,42.635055,-77.103699,0,0,0,0,"Yates, New York, US"
,36005.0,Bronx,New York,US,2020-03-29 23:08:25,40.852093,-73.862828,0,0,0,0,"Bronx, New York, US"
,36081.0,Queens,New York,US,2020-03-29 23:08:25,40.710881,-73.816847,0,0,0,0,"Queens, New York, US"
,36099.0,Seneca,New York,US,2020-03-29 23:08:25,42.78081,-76.824971,0,0,0,0,"Seneca, New York, US"
,,Unassigned,New York,US,2020-03-29 23:08:25,0.0,0.0,0,159,0,0,"Unassigned, New York, US"
,36049.0,Lewis,New York,US,2020-03-29 23:08:25,43.784416,-75.44904,0,0,0,0,"Lewis, New York, US"
,36047.0,Kings,New York,US,2020-03-29 23:08:25,40.636182,-73.949356,0,0,0,0,"Kings, New York, US"
,36085.0,Richmond,New York,US,2020-03-29 23:08:25,40.585822,-74.148086,0,0,0,0,"Richmond, New York, US"
,36097.0,Schuyler,New York,US,2020-03-29 23:08:25,42.39184,-76.87733,1,0,0,0,"Schuyler, New York, US"
,36035.0,Fulton,New York,US,2020-03-29 23:08:25,43.113639,-74.417988,1,0,0,0,"Fulton, New York, US"


In [14]:
# Load the population estimates table from the local data directory, then show
# the ten rows with the largest POPESTIMATE2019 values.
population = pd.read_csv('data/SCPRC-EST2019-18+POP-RES.csv')
population.sort_values('POPESTIMATE2019', ascending=False).iloc[:10]

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,NAME,POPESTIMATE2019,POPEST18PLUS2019,PCNT_POPEST18PLUS
0,10,0,0,0,United States,328239523,255200373,77.7
5,40,4,9,6,California,39512223,30617582,77.5
44,40,3,7,48,Texas,28995881,21596071,74.5
10,40,3,5,12,Florida,21477737,17247808,80.3
33,40,1,2,36,New York,19453561,15425262,79.3
39,40,1,2,42,Pennsylvania,12801989,10167376,79.4
14,40,2,3,17,Illinois,12671821,9853946,77.8
36,40,2,3,39,Ohio,11689100,9111081,77.9
11,40,3,5,13,Georgia,10617423,8113542,76.4
34,40,3,5,37,North Carolina,10488084,8187369,78.1


In [15]:
# Load the per-state geography table from the local data directory and preview
# its first ten rows.
geog = pd.read_csv('data/us_geography.csv')
geog.iloc[:10]

Unnamed: 0,State,tot_sq_mi,tot_sq_km,land_sq_mi,land_sq_km,water_sq_mi,water_sq_km,inland_sq_mi,inland_sq_km,coast_sq_mi,coast_sq_km,gl_sq_mi,gl_sq_km,terr_sq_mi,terr_sq_km,latitude,longitude
0,United States,3796742,9833517,3531905,9147593,264837,685924,85647,221824,42337,109652,60094,155643,76759,198806,,
1,Alabama,52420,135767,50645,131171,1775,4597,1058,2740,517,1340,—,—,199,516,32.739632,-86.843459
2,Alaska,665384,1723337,570641,1477953,94743,245383,19304,49997,26119,67647,—,—,49320,127739,63.346191,-152.837068
3,Arizona,113990,295234,113594,294207,396,1026,396,1026,—,—,—,—,—,—,34.209964,-111.602401
4,Arkansas,53179,137732,52035,134771,1143,2961,1143,2961,—,—,—,—,—,—,34.895526,-92.444626
5,California,163695,423967,155779,403466,7916,20501,2833,7339,245,634,—,—,4837,12528,37.148573,-119.540651
6,Colorado,104094,269601,103642,268431,452,1170,452,1170,—,—,—,—,—,—,38.993575,-105.507774
7,Connecticut,5543,14357,4842,12542,701,1816,171,443,530,1372,—,—,—,—,41.579784,-72.746667
8,Delaware,2489,6446,1949,5047,540,1399,91,237,355,920,—,—,94,242,38.99355,-75.447374
9,District of Columbia,68,177,61,158,7,19,7,19,—,—,—,—,—,—,38.904148,-77.017094


In [16]:
# Only the two population columns used for the merge below: name and estimate.
population.loc[:, ['NAME', 'POPESTIMATE2019']]

Unnamed: 0,NAME,POPESTIMATE2019
0,United States,328239523
1,Alabama,4903185
2,Alaska,731545
3,Arizona,7278717
4,Arkansas,3017804
5,California,39512223
6,Colorado,5758736
7,Connecticut,3565287
8,Delaware,973764
9,District of Columbia,705749


### Merge state population data with state COVID-19 data

In [17]:
# Case-count columns taken from the state totals into the merged table.
state_columns = [
    'Confirmed',
    'Deaths',
    'Recovered',
    'Active',
]

In [18]:
# Attach each state's POPESTIMATE2019 value to its case counts. The join
# matches the 'Province_State' index level of state_totals against NAME and,
# being the default inner join, keeps only names present in both tables.
state_with_population = state_totals[state_columns].merge(
    population[['NAME', 'POPESTIMATE2019']],
    left_on='Province_State',
    right_on='NAME',
)

# The three derived columns are all percentages (ratio * 100):
#   fraction_confirmed - confirmed cases relative to population
#   deaths             - deaths relative to population
#   death_rate?        - deaths relative to confirmed cases
state_with_population['fraction_confirmed'] = (
    state_with_population['Confirmed']
    .div(state_with_population['POPESTIMATE2019'])
    .mul(100.0)
)
state_with_population['deaths'] = (
    state_with_population['Deaths']
    .div(state_with_population['POPESTIMATE2019'])
    .mul(100.0)
)
state_with_population['death_rate?'] = (
    state_with_population['Deaths']
    .div(state_with_population['Confirmed'])
    .mul(100.0)
)


In [19]:
# States ranked by deaths as a percentage of confirmed cases, highest first.
state_with_population.sort_values(
    'death_rate?',
    ascending=False,
)

Unnamed: 0,Confirmed,Deaths,Recovered,Active,NAME,POPESTIMATE2019,fraction_confirmed,deaths,death_rate?
45,235,12,0,0,Vermont,623989,0.037661,0.001923,5.106383
47,4465,198,0,0,Washington,7614893,0.058635,0.0026,4.43449
18,3540,151,0,0,Louisiana,4648794,0.076149,0.003248,4.265537
36,429,16,0,0,Oklahoma,3956971,0.010842,0.000404,3.729604
10,2651,80,0,0,Georgia,10617423,0.024968,0.000753,3.017729
7,232,6,0,0,Delaware,973764,0.023825,0.000616,2.586207
22,5488,132,0,0,Michigan,9986857,0.054952,0.001322,2.405248
37,548,13,0,0,Oregon,4217737,0.012993,0.000308,2.372263
46,890,20,0,0,Virginia,8535519,0.010427,0.000234,2.247191
16,330,7,0,0,Kansas,2913314,0.011327,0.00024,2.121212


In [20]:
# States ranked by confirmed cases as a percentage of population, highest first.
state_with_population.sort_values(
    'fraction_confirmed',
    ascending=False,
)

Unnamed: 0,Confirmed,Deaths,Recovered,Active,NAME,POPESTIMATE2019,fraction_confirmed,deaths,death_rate?
32,59648,965,0,0,New York,19453561,0.306617,0.004961,1.617825
30,13386,161,0,0,New Jersey,8882190,0.150706,0.001813,1.202749
18,3540,151,0,0,Louisiana,4648794,0.076149,0.003248,4.265537
21,4955,48,0,0,Massachusetts,6892503,0.07189,0.000696,0.968718
47,4465,198,0,0,Washington,7614893,0.058635,0.0026,4.43449
6,1993,34,0,0,Connecticut,3565287,0.0559,0.000954,1.705971
22,5488,132,0,0,Michigan,9986857,0.054952,0.001322,2.405248
8,342,5,0,0,District of Columbia,705749,0.048459,0.000708,1.461988
5,2307,47,0,0,Colorado,5758736,0.040061,0.000816,2.037278
45,235,12,0,0,Vermont,623989,0.037661,0.001923,5.106383


In [21]:
# States ranked by deaths as a percentage of population, highest first.
state_with_population.sort_values(
    'deaths',
    ascending=False,
)

Unnamed: 0,Confirmed,Deaths,Recovered,Active,NAME,POPESTIMATE2019,fraction_confirmed,deaths,death_rate?
32,59648,965,0,0,New York,19453561,0.306617,0.004961,1.617825
18,3540,151,0,0,Louisiana,4648794,0.076149,0.003248,4.265537
47,4465,198,0,0,Washington,7614893,0.058635,0.0026,4.43449
45,235,12,0,0,Vermont,623989,0.037661,0.001923,5.106383
30,13386,161,0,0,New Jersey,8882190,0.150706,0.001813,1.202749
22,5488,132,0,0,Michigan,9986857,0.054952,0.001322,2.405248
6,1993,34,0,0,Connecticut,3565287,0.0559,0.000954,1.705971
5,2307,47,0,0,Colorado,5758736,0.040061,0.000816,2.037278
10,2651,80,0,0,Georgia,10617423,0.024968,0.000753,3.017729
8,342,5,0,0,District of Columbia,705749,0.048459,0.000708,1.461988


In [22]:
# Summary statistics of the merged table, including the derived percentage columns.
state_with_population.describe()

Unnamed: 0,Confirmed,Deaths,Recovered,Active,POPESTIMATE2019,fraction_confirmed,deaths,death_rate?
count,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0
mean,2755.901961,48.235294,0.0,0.0,6436069.0,0.030883,0.000567,1.667706
std,8445.276067,138.628942,0.0,0.0,7360660.0,0.046258,0.000898,1.047909
min,86.0,0.0,0.0,0.0,578759.0,0.005583,0.0,0.0
25%,267.0,3.5,0.0,0.0,1789606.0,0.011991,0.000139,1.06576
50%,774.0,12.0,0.0,0.0,4467673.0,0.015739,0.000261,1.546392
75%,2150.0,35.5,0.0,0.0,7446805.0,0.02728,0.000504,2.060989
max,59648.0,965.0,0.0,0.0,39512220.0,0.306617,0.004961,5.106383


## Something strange about the data read directly from GitHub

In [23]:
# Let pandas fetch the page itself (no custom request headers) and parse every
# HTML table it finds; read_html returns a list of DataFrames, shown below.
dfx = pd.read_html(url)
dfx

[      Unnamed: 0     FIPS     Admin2  Province_State      Country_Region  \
 0            NaN  45001.0  Abbeville  South Carolina                  US   
 1            NaN  22001.0     Acadia       Louisiana                  US   
 2            NaN  51001.0   Accomack        Virginia                  US   
 3            NaN  16001.0        Ada           Idaho                  US   
 4            NaN  19001.0      Adair            Iowa                  US   
 ...          ...      ...        ...             ...                 ...   
 3429         NaN      NaN        NaN             NaN           Venezuela   
 3430         NaN      NaN        NaN             NaN             Vietnam   
 3431         NaN      NaN        NaN             NaN  West Bank and Gaza   
 3432         NaN      NaN        NaN             NaN              Zambia   
 3433         NaN      NaN        NaN             NaN            Zimbabwe   
 
               Last_Update        Lat       Long_  Confirmed  Deaths  \
 0