# COVID-19 Case Counts from Johns Hopkins University

**[Work in progress]**

This notebook creates .csv files with cumulative confirmed cases and deaths for ingestion into the Knowledge Graph.

Data source: [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
from pathlib import Path
import pandas as pd
import dateutil

In [2]:
# Lift pandas' display limits so that frames are rendered in full.
pd.set_option("display.max_rows", None)  # display all rows
pd.set_option("display.max_columns", None)  # display all columns

In [3]:
# Root of the Neo4j installation, taken from the NEO4J_HOME environment variable.
# The CSV files read and written by this notebook live in its import/ folder.
neo4j_home_env = os.getenv('NEO4J_HOME')
if neo4j_home_env is None:
    # Path(None) would raise an opaque TypeError; fail with an actionable message instead.
    raise EnvironmentError(
        "The NEO4J_HOME environment variable is not set; "
        "set it to the Neo4j installation directory before running this notebook."
    )
NEO4J_HOME = Path(neo4j_home_env)
print(NEO4J_HOME)

/Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-4af96121-2328-4e2f-ba60-6d8b728a26d5/installation-4.0.3


In [4]:
# Load the JHU case counts with every column read as a string, replace missing
# values with empty strings, and trim surrounding whitespace from the two
# location columns.
df = (
    pd.read_csv(NEO4J_HOME / "import/02a-JHUCasesGlobal.csv", dtype='str')
    .fillna('')
    .assign(
        country=lambda cases: cases['country'].str.strip(),
        admin1=lambda cases: cases['admin1'].str.strip(),
    )
)

In [5]:
# Preview the first five rows of the loaded case counts.
df.head(n=5)

Unnamed: 0,country,admin1,cummulativeConfirmed,date,cummulativeDeaths
0,China,Anhui,1,2020-01-22,0
1,China,Beijing,14,2020-01-22,0
2,China,Chongqing,6,2020-01-22,0
3,China,Fujian,1,2020-01-22,0
4,China,Guangdong,26,2020-01-22,0


### Standardize country names to match GeoNames.org

In [6]:
# Reference table of special location names. All columns are read as strings,
# and anything following a '#' on a line is treated as a comment and ignored.
ref = pd.read_csv(
    "../reference_data/SpecialLocations.csv",
    dtype='str',
    comment='#',
)

Convert dataframe into a dictionary

In [7]:
# Pair each reference name with its [geoname, type] list ...
ref['val'] = ref[['geoname', 'type']].values.tolist()
# ... flatten the table into [name, [geoname, type]] records ...
name_list = ref[['name', 'val']].values.tolist()
# ... and index those records by name (a later duplicate name overrides an earlier one).
name_dict = dict(name_list)

In [8]:
def standardize_locations(row):
    """Translate a row's location names using the ``name_dict`` lookup table.

    ``name_dict`` maps a source name to a ``[geoname, type]`` pair. The
    country name is looked up first, then the (possibly already cleared)
    admin1 name:

    * type ``'Country'``: the country is replaced by the geoname; when the
      match came from the admin1 name, admin1 is cleared as well.
    * type ``'Admin1'`` (admin1 lookup only): admin1 is replaced by the geoname.
    * type ``'CruiseShip'``: country and admin1 are cleared and the geoname
      is returned as the cruise ship.

    Names without an entry in ``name_dict`` are passed through unchanged.

    Parameters
    ----------
    row : mapping
        Any object supporting ``row['country']`` and ``row['admin1']``,
        e.g. a DataFrame row passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    tuple
        ``(country, admin1, cruiseship)``; ``cruiseship`` is ``''`` unless a
        cruise ship entry was matched.
    """
    country = row['country']
    admin1 = row['admin1']
    cruiseship = ''

    cname = name_dict.get(country)
    if cname is not None:
        if cname[1] == 'Country':
            country = cname[0]
        elif cname[1] == 'CruiseShip':
            country = ''
            admin1 = ''
            cruiseship = cname[0]

    # Looked up after the country step on purpose: a cruise ship match above
    # has already reset admin1 to ''.
    aname = name_dict.get(admin1)
    if aname is not None:
        if aname[1] == 'Admin1':
            admin1 = aname[0]
        elif aname[1] == 'Country':
            # dependent territories according to ISO are represented at the country level
            admin1 = ''
            country = aname[0]
        elif aname[1] == 'CruiseShip':
            country = ''
            admin1 = ''
            cruiseship = aname[0]

    return country, admin1, cruiseship

In [9]:
# Standardize every row; result_type="expand" spreads the returned
# (country, admin1, cruiseship) tuple over three columns.
standardized = df.apply(standardize_locations, axis=1, result_type="expand")
df[['country', 'admin1', 'cruiseship']] = standardized

Check that country names match GeoNames

In [10]:
# Left-join the standardized country names onto the GeoNames country table;
# rows without a matching GeoNames name come back with missing values.
countries = pd.read_csv(NEO4J_HOME / "import/00e-GeoNamesCountry.csv")
loc0 = (
    df[['country']]
    .copy()
    .merge(countries, how='left', left_on='country', right_on='name')
)

In [11]:
# After blanking missing values, an empty 'name' marks a country that found
# no partner in the GeoNames table.
loc0 = loc0.fillna('')
loc0 = loc0[loc0['name'] == '']
country_mismatches = loc0['country'].unique()
print("Country name mismatches:", country_mismatches, sep="\n")
# TODO comma in string 'Bonaire, Saint Eustatius and Saba' is not handled properly

Country name mismatches:
['' 'Bonaire, Saint Eustatius and Saba']


Check that admin1 names match GeoNames

In [12]:
# Left-join the non-empty admin1 names onto the GeoNames admin1 table;
# rows without a matching GeoNames name come back with missing values.
admin1 = pd.read_csv(NEO4J_HOME / "import/00f-GeoNamesAdmin1.csv")
loc1 = df.loc[df['admin1'] != '', ['admin1']]
loc1 = loc1.merge(admin1, how='left', left_on='admin1', right_on='name')

In [13]:
# After blanking missing values, an empty 'name' marks an admin1 division
# that found no partner in the GeoNames table.
loc1 = loc1.fillna('')
loc1 = loc1[loc1['name'] == '']
loc1_mismatches = loc1['admin1'].unique()
print("Admin1 name mismatches:", loc1_mismatches, sep="\n")

Admin1 name mismatches:
[]


#### Save cases for countries

In [14]:
# Country-level rows: a country name is present and admin1 is empty.
is_country_level = (df['country'] != '') & (df['admin1'] == '')
cases_country = df.loc[
    is_country_level,
    ['country', 'date', 'cummulativeConfirmed', 'cummulativeDeaths'],
]
cases_country.to_csv(NEO4J_HOME / "import/02a-JHUCasesGlobalCountry.csv", index=False)
cases_country.head()

Unnamed: 0,country,date,cummulativeConfirmed,cummulativeDeaths
15,Macao,2020-01-22,1,0
24,Japan,2020-01-22,2,0
25,South Korea,2020-01-22,1,0
26,Taiwan,2020-01-22,1,0
27,Thailand,2020-01-22,2,0


#### Save cases for admin1 divisions

In [15]:
# Admin1-level rows: both a country name and an admin1 name are present.
is_admin1_level = (df['country'] != '') & (df['admin1'] != '')
cases_admin1 = df.loc[
    is_admin1_level,
    ['country', 'admin1', 'date', 'cummulativeConfirmed', 'cummulativeDeaths'],
]
cases_admin1.to_csv(NEO4J_HOME / "import/02a-JHUCasesGlobalAdmin1.csv", index=False)
cases_admin1.head()

Unnamed: 0,country,admin1,date,cummulativeConfirmed,cummulativeDeaths
0,China,Anhui,2020-01-22,1,0
1,China,Beijing,2020-01-22,14,0
2,China,Chongqing,2020-01-22,6,0
3,China,Fujian,2020-01-22,1,0
4,China,Guangdong,2020-01-22,26,0


#### Save cases for cruise ships

In [16]:
# Cruise ship rows: the cruiseship column was filled in during standardization.
is_cruiseship = df['cruiseship'] != ''
cases_cruiseship = df.loc[
    is_cruiseship,
    ['cruiseship', 'date', 'cummulativeConfirmed', 'cummulativeDeaths'],
]
cases_cruiseship.to_csv(NEO4J_HOME / "import/02a-JHUCasesGlobalCruiseShip.csv", index=False)
cases_cruiseship.head()

Unnamed: 0,cruiseship,date,cummulativeConfirmed,cummulativeDeaths
865,Diamond Princess,2020-02-07,61,0
928,Diamond Princess,2020-02-08,61,0
991,Diamond Princess,2020-02-09,64,0
1054,Diamond Princess,2020-02-10,135,0
1117,Diamond Princess,2020-02-11,135,0
