In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
pd.plotting.register_matplotlib_converters()

In [9]:
api.dataset_list_files??

In [10]:
from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()
files = api.dataset_download_files('sudalairajkumar/novel-corona-virus-2019-dataset', unzip=True)

In [11]:
# download google coordinate file
import requests

from shapely.geometry import mapping, shape
from shapely.prepared import prep

data = requests.get("https://raw.githubusercontent.com/datasets/geo-countries/master/data/countries.geojson").json()

# WORLD dataframe uses ISO_A3, so we extract that one from the data
iso_a3_lookup = {}
for feature in data['features']:
    iso_a3_lookup[feature['properties']['ISO_A3']] = prep(shape(feature['geometry']))
    
def get_iso(point):
    for country, geom in iso_a3_lookup.items():
        if geom.contains(point):
            return country
    return 'unknown'

In [49]:
df = pd.read_csv('covid_19_data.csv', parse_dates = ['ObservationDate', 'Last Update'])
df.columns = [i.replace('/', '').replace(' ', '') for i in df.columns]
df.ObservationDate = pd.to_datetime(df.ObservationDate, format='%m/%d/%Y')
df.ProvinceState.replace('None', np.nan, inplace=True)
df.ProvinceState.fillna(df.CountryRegion, inplace=True)
df.ProvinceState = df.ProvinceState.apply(lambda x: x.strip())
df.CountryRegion = df.CountryRegion.apply(lambda x: x.strip())

# some adjustments to fit with the other datasets
df.CountryRegion = df.CountryRegion.apply(lambda x: 'China' if x == 'Mainland China' else x)
for col in ['CountryRegion', 'ProvinceState']:
    df[col] = df[col].apply(lambda x: 'United Kingdom' if x == 'UK' else x)
    df[col] = df[col].apply(lambda x: x.lstrip('(').rstrip(')').rstrip(',').lstrip('\'').rstrip('\''))

# check if we need a ulid
if len(df['ProvinceState'].drop_duplicates()) == len(df[['ProvinceState', 'CountryRegion']].drop_duplicates()):
    print('Provinces are unique and can be used as identifier')
else:
    print('Unfortunately, Provinces are not unique.')
    
# unique land identifier
df['ULID'] = df.ProvinceState + ' - ' + df.CountryRegion
df.sample(10)

Unfortunately, Provinces are not unique.


Unnamed: 0,SNo,ObservationDate,ProvinceState,CountryRegion,LastUpdate,Confirmed,Deaths,Recovered,ULID
2947,2948,2020-03-02,Guangdong,China,2020-03-02 15:03:23,1350.0,7.0,1059.0,Guangdong - China
1617,1618,2020-02-16,"Toronto, ON",Canada,2020-02-04 00:13:06,2.0,0.0,0.0,"Toronto, ON - Canada"
4190,4191,2020-03-08,"Cobb County, GA",US,2020-03-07 16:53:03,1.0,0.0,0.0,"Cobb County, GA - US"
975,976,2020-02-07,"Madison, WI",US,2020-02-05 21:53:02,1.0,0.0,0.0,"Madison, WI - US"
3376,3377,2020-03-04,"Norfolk County, MA",US,2020-03-03 14:33:03,1.0,0.0,0.0,"Norfolk County, MA - US"
4491,4492,2020-03-09,"Shelby County, TN",US,2020-03-08 16:13:36,1.0,0.0,0.0,"Shelby County, TN - US"
5471,5472,2020-03-14,Texas,US,2020-03-14 22:33:03,57.0,0.0,0.0,Texas - US
5488,5489,2020-03-14,Algeria,Algeria,2020-03-14 12:33:03,37.0,3.0,12.0,Algeria - Algeria
1857,1858,2020-02-19,Sri Lanka,Sri Lanka,2020-02-08 03:43:03,1.0,0.0,1.0,Sri Lanka - Sri Lanka
2142,2143,2020-02-23,Inner Mongolia,China,2020-02-23 09:43:02,75.0,0.0,27.0,Inner Mongolia - China


In [42]:
print('Dataset contains {} rows for {} countries. Latest data point is from {}. Last updated on {}'.format(len(df), df.CountryRegion.nunique(), df.ObservationDate.max(), df.LastUpdate.max()))

Dataset contains 5632 rows for 155 countries. Latest data point is from 2020-03-14 00:00:00. Last updated on 2020-03-14 23:53:02


In [43]:
# this DF rewrites the one above as a time series, but it also contains lat/long data
# use it to create a lat/long data lookup
df_conf = pd.read_csv('time_series_covid_19_confirmed.csv')
df_conf.columns = [i.replace('/', '').replace(' ', '') for i in df_conf.columns]
df_conf.ProvinceState.replace('None', np.nan, inplace=True)
df_conf.ProvinceState.fillna(df_conf.CountryRegion, inplace=True)
df_conf.ProvinceState = df_conf.ProvinceState.apply(lambda x: x.strip())
df_conf.CountryRegion = df_conf.CountryRegion.apply(lambda x: x.strip())
df_conf['ULID'] = df_conf.ProvinceState + ' - ' + df_conf.CountryRegion
from shapely.geometry import Point
geometry = [Point(xy) for xy in zip(df_conf['Long'], df_conf['Lat'])]
coord_lookup = dict(zip(df_conf.ULID, geometry))

### Use the coordinates provided in the dataset to replace place names with ISO names from Google API

In [53]:
iso_lookup = {}
for name, point in coord_lookup.items():
    iso_lookup[name] = get_iso(point)
for j in [i for i in df.ULID.unique() if i not in df_conf.ULID.values]:
    if j.endswith('US'):
        iso_lookup[j] = 'USA'
    elif j.endswith('Canada'):
        iso_lookup[j] = 'CAN'
for k in list(set([i for i in df.ULID if i not in iso_lookup.keys()])):
    if 'Diamond Princess' in k:
        iso_lookup[k] = ''
iso_lookup['Australia - Australia'] = 'AUS'
iso_lookup['Ivory Coast - Ivory Coast'] = 'CIV'
iso_lookup['Bavaria - Germany'] = 'DEU'
iso_lookup['Cruise Ship - Others'] = 'unknown'
iso_lookup['From Diamond Princess - Israel'] = 'ISR'
iso_lookup['North Ireland - North Ireland'] = 'GBR'
iso_lookup['Republic of Ireland - Republic of Ireland'] = 'IRL'
iso_lookup['Hong Kong - Hong Kong'] = 'HKG'
iso_lookup['Philippines - Philippines'] = 'PHL'
iso_lookup['Cruise Ship - Others'] = 'unknown'
iso_lookup['New Zealand - New Zealand'] = 'NZL'
iso_lookup['Northern Territory - Australia'] = 'AUS'
iso_lookup['Faroe Islands - Faroe Islands'] = 'FRO'
iso_lookup['Grand Princess Cruise Ship - US'] = 'USA'
iso_lookup['Maldives - Maldives'] = 'MDV'
iso_lookup['Cyprus - Cyprus'] = 'CYP'
iso_lookup['Hong Kong - Hong Kong SAR'] = 'HKG'
iso_lookup['Grand Princess - US'] = 'USA'
iso_lookup['Hawaii - US'] = 'USA'
iso_lookup['Channel Islands - Channel Islands'] = 'GB'
iso_lookup['Macau - Macau'] = iso_lookup['Macau - China']
iso_lookup['Vatican City - Vatican City'] = 'VAT'
iso_lookup['South Korea - South Korea'] = iso_lookup['Korea, South - Korea, South']
iso_lookup['Czech Republic - Czech Republic'] = iso_lookup['Czechia - Czechia']
iso_lookup['Gibraltar - Gibraltar'] = iso_lookup['Gibraltar - United Kingdom']
iso_lookup['Saint Barthelemy - Saint Barthelemy'] = iso_lookup['Saint Barthelemy - France']
iso_lookup['St. Martin - St. Martin'] = iso_lookup['St Martin - France']
iso_lookup['Palestine - Palestine'] = 'PSE'
iso_lookup['Taiwan - Taiwan'] = iso_lookup['Taiwan* - Taiwan*']

In [54]:
df['iso_a3'] = df.ULID.apply(lambda x: iso_lookup[x])
df.sample(10)

KeyError: 'Congo (Kinshasa - Congo (Kinshasa'

In [51]:
for k in list(set([i for i in df.ULID if i not in iso_lookup.keys()])):
    if 'Diamond Princess' in k:
        iso_lookup[k] = ''
list(set([i for i in df.ULID if i not in iso_lookup.keys()]))

['Vatican City - Vatican City',
 'South Korea - South Korea',
 'Czech Republic - Czech Republic',
 'Gibraltar - Gibraltar',
 'Saint Barthelemy - Saint Barthelemy',
 'Congo (Kinshasa - Congo (Kinshasa',
 'St. Martin - St. Martin',
 'Macau - Macau',
 'Palestine - Palestine',
 'Taiwan - Taiwan']