In [1]:
import functools
import glob
import os
import pandas as pd
import numpy as np

# Data

In [2]:
# CSSE data: read every daily report and stack them into a single frame.
# `sort=True` orders the union of all columns alphabetically; columns that a
# given report lacks are filled with NaN.
# Paths are sorted because glob returns matches in arbitrary order, which would
# otherwise make the row order differ from machine to machine.
csv_paths = sorted(glob.glob("data/csse_covid19_daily_reports/*.csv"))
data = pd.concat([pd.read_csv(csv_path) for csv_path in csv_paths], sort=True)

In [3]:
# Country col names changed at some point
# Rows are expected to carry a value in only one of the two columns, so take the
# first non-null value across them (empty string when both are missing).
# Vectorised: avoids calling a Python lambda once per row.
country_cols = ['Country/Region', 'Country_Region']
data['Country'] = data[country_cols].bfill(axis=1).iloc[:, 0].fillna('')

In [4]:
# Date col names changed at some point
# Rows are expected to carry a value in only one of the two columns, so take the
# first non-null timestamp string across them (empty string when both are missing).
# Vectorised: avoids calling a Python lambda once per row.
date_cols = ['Last Update', 'Last_Update']
data['Last_Updated'] = data[date_cols].bfill(axis=1).iloc[:, 0].fillna('')

In [5]:
# Region col names changed at some point
# Rows are expected to carry a value in only one of the two columns, so take the
# first non-null value across them (empty string when both are missing, e.g. for
# country-level rows with no province).
# Vectorised: avoids calling a Python lambda once per row.
region_cols = ['Province/State', 'Province_State']
data['Province'] = data[region_cols].bfill(axis=1).iloc[:, 0].fillna('')

In [6]:
# Set date index
# Parse the merged 'Last_Updated' strings into timestamps, index the frame by
# them (the 'Last_Updated' column itself is kept), then sort rows in ascending
# date order.
last_updated_index = pd.DatetimeIndex(data['Last_Updated'])
data = data.set_index(last_updated_index).sort_index()

In [7]:
# Preview the five most recent rows to sanity-check the merged columns and index.
data.tail(n=5)

Unnamed: 0_level_0,Active,Admin2,Combined_Key,Confirmed,Country/Region,Country_Region,Deaths,FIPS,Last Update,Last_Update,Lat,Latitude,Long_,Longitude,Province/State,Province_State,Recovered,Country,Last_Updated,Province
Last_Updated,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2020-03-29 23:14:06,75.0,,"Australian Capital Territory, Australia",77.0,,Australia,0.0,,,2020-03-29 23:14:06,-35.4735,,149.0124,,,Australian Capital Territory,2.0,Australia,2020-03-29 23:14:06,Australian Capital Territory
2020-03-29 23:14:06,293.0,,"South Australia, Australia",299.0,,Australia,0.0,,,2020-03-29 23:14:06,-34.9285,,138.6007,,,South Australia,6.0,Australia,2020-03-29 23:14:06,South Australia
2020-03-29 23:14:06,61.0,,"Tasmania, Australia",66.0,,Australia,0.0,,,2020-03-29 23:14:06,-42.8821,,147.3272,,,Tasmania,5.0,Australia,2020-03-29 23:14:06,Tasmania
2020-03-29 23:14:06,574.0,,"Victoria, Australia",769.0,,Australia,4.0,,,2020-03-29 23:14:06,-37.8136,,144.9631,,,Victoria,191.0,Australia,2020-03-29 23:14:06,Victoria
2020-03-29 23:14:06,281.0,,"Western Australia, Australia",311.0,,Australia,2.0,,,2020-03-29 23:14:06,-31.9505,,115.8605,,,Western Australia,28.0,Australia,2020-03-29 23:14:06,Western Australia


In [8]:
#data['Country/Region'].unique()

# Countries for visualization

In [11]:
# Select countries for comparison.
# The CSSE daily reports renamed some countries over time (e.g. 'Mainland China'
# later appears as 'China', and 'South Korea' as 'Republic of Korea' or
# 'Korea, South'). Filtering on the early names alone silently drops the later
# rows, so map the later spellings back onto the names used here first.
# NOTE(review): the alias list reflects the published CSSE reports; confirm it
# against data['Country'].unique() for the files actually loaded.
country_aliases = {
    'China': 'Mainland China',
    'Republic of Korea': 'South Korea',
    'Korea, South': 'South Korea',
}
countries_for_viz = ['Mainland China', 'South Korea', 'Italy', 'US', 'Mexico']

country_names = data['Country'].replace(country_aliases)
is_selected = country_names.isin(countries_for_viz)

# .assign() returns a new frame, so later edits to `df` never touch `data`.
# .to_numpy() assigns positionally: the date index contains duplicate labels,
# so label-based alignment is deliberately avoided.
df = data[is_selected].assign(Country=country_names[is_selected].to_numpy())

In [14]:
# Check last update in Mexico
# Show the five latest Mexico rows (the frame is already sorted by date).
is_mexico = df['Country'].eq('Mexico')
df.loc[is_mexico].tail()

Unnamed: 0_level_0,Active,Admin2,Combined_Key,Confirmed,Country/Region,Country_Region,Deaths,FIPS,Last Update,Last_Update,Lat,Latitude,Long_,Longitude,Province/State,Province_State,Recovered,Country,Last_Updated,Province
Last_Updated,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2020-03-25 23:33:04,396.0,,Mexico,405.0,,Mexico,5.0,,,2020-03-25 23:33:04,23.6345,,-102.5528,,,,4.0,Mexico,2020-03-25 23:33:04,
2020-03-26 23:48:18,465.0,,Mexico,475.0,,Mexico,6.0,,,2020-03-26 23:48:18,23.6345,,-102.5528,,,,4.0,Mexico,2020-03-26 23:48:18,
2020-03-27 23:23:03,573.0,,Mexico,585.0,,Mexico,8.0,,,2020-03-27 23:23:03,23.6345,,-102.5528,,,,4.0,Mexico,2020-03-27 23:23:03,
2020-03-28 23:05:25,701.0,,Mexico,717.0,,Mexico,12.0,,,2020-03-28 23:05:25,23.6345,,-102.5528,,,,4.0,Mexico,2020-03-28 23:05:25,
2020-03-29 23:08:13,828.0,,Mexico,848.0,,Mexico,16.0,,,2020-03-29 23:08:13,23.6345,,-102.5528,,,,4.0,Mexico,2020-03-29 23:08:13,


In [15]:
# Keep these columns
# Only the case count and the two location labels are carried forward.
kept_columns = ['Confirmed', 'Country', 'Province']
df = df.loc[:, kept_columns]
df

Unnamed: 0_level_0,Confirmed,Country,Province
Last_Updated,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-22 17:00:00,4.0,Mainland China,Hunan
2020-01-22 17:00:00,1.0,Mainland China,Ningxia
2020-01-22 17:00:00,,Mainland China,Shaanxi
2020-01-22 17:00:00,2.0,Mainland China,Shandong
2020-01-22 17:00:00,9.0,Mainland China,Shanghai
...,...,...,...
2020-03-29 23:08:25,120.0,US,Oklahoma
2020-03-29 23:08:25,3.0,US,Oklahoma
2020-03-29 23:08:25,0.0,US,Florida
2020-03-29 23:08:25,3.0,US,Kentucky
