# Assigns location to Strain Data from CNCB
**[Work in progress]**

This notebook standardizes location information for viral strains from CNCB for ingestion into a Knowledge Graph.

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import pandas as pd
#import dateutil
from pathlib import Path
#from os import path

In [2]:
# Show complete DataFrames in cell outputs (no row or column truncation).
pd.set_option('display.max_rows', None)  # display all rows
pd.set_option('display.max_columns', None)  # display all columns

In [3]:
# Directory that holds the Neo4j import CSV files, taken from the NEO4J_IMPORT
# environment variable. Path(None) raises an opaque TypeError, so fail early
# with an actionable message when the variable is not set.
neo4j_import_dir = os.getenv('NEO4J_IMPORT')
if neo4j_import_dir is None:
    raise RuntimeError(
        "Environment variable NEO4J_IMPORT is not set; "
        "it must point to the Neo4j import directory."
    )
NEO4J_IMPORT = Path(neo4j_import_dir)
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-19636412-9e74-4bac-8a4c-c6c8b49bb9d3/installation-4.1.0/import


## Standardize Location data for SARS-CoV-2 Strain metadata

TODO: This code should be replaced with a general solution.

Below is a workaround for now.

In [4]:
# Read every column of the strain table as text and represent missing values
# as empty strings.
strain_csv = NEO4J_IMPORT / "01d-CNCBStrain.csv"
df = pd.read_csv(strain_csv, dtype='str')
df = df.fillna('')

In [5]:
# Preview the first rows of the strain table as loaded.
df.head()

Unnamed: 0,id,name,alias,taxonomyId,hostTaxonomyId,collectionDate,location
0,NMDC60013088-01,BetaCoV/Wuhan/HBCDC-HB-01/2019,NMDC60013088-01;EPI_ISL_402132,taxonomy:2697049,taxonomy:9606,2019-12-30,China / Hubei
1,https://www.gisaid.org/EPI_ISL_402132,BetaCoV/Wuhan/HBCDC-HB-01/2019,NMDC60013088-01;EPI_ISL_402132,taxonomy:2697049,taxonomy:9606,2019-12-30,China / Hubei
2,https://www.gisaid.org/EPI_ISL_403963,hCoV-19/Thailand/74/2020,EPI_ISL_403963,taxonomy:2697049,taxonomy:9606,2020-01-13,Thailand/ Nonthaburi Province
3,https://www.gisaid.org/EPI_ISL_403962,hCoV-19/Thailand/61/2020,EPI_ISL_403962,taxonomy:2697049,taxonomy:9606,2020-01-08,Thailand/ Nonthaburi Province
4,NMDC60013085-01,BetaCoV/Wuhan/IVDC-HB-04/2020,NMDC60013085-01;EPI_ISL_402120,taxonomy:2697049,taxonomy:9606,2020-01-01,China / Hubei / Wuhan


#### Split locations

In [6]:
# Number of '/' separators in each location string,
# e.g. "China / Hubei" -> 1 and "China / Hubei / Wuhan" -> 2.
df['locationLevels'] = df['location'].str.count('/')

### Standardize country names to match GeoNames.org

In [7]:
# Split "country / admin1 / admin2 / city" into one column per level.
# str.split(expand=True) only creates as many columns as the deepest location
# present in the data, so assigning its result directly to four columns raises
# a ValueError when no location has all four levels. Levels that are absent
# from the split result are set to '' instead.
location_parts = df['location'].str.split('/', n=3, expand=True)
for level, column in enumerate(['country', 'admin1', 'admin2', 'city']):
    if level in location_parts.columns:
        # Remove the blanks that surround the '/' separators.
        df[column] = location_parts[level].str.strip()
    else:
        df[column] = ''
# Rows with fewer levels than the deepest location get None from the split.
df.fillna('', inplace=True)

Fix country names

In [8]:
# Load the special-location reference table. Lines starting with '#' are
# treated as comments and all columns are read as strings. The cells below use
# its 'name', 'geoname' and 'type' columns.
ref = pd.read_csv("../../reference_data/SpecialLocations.csv", comment='#', dtype='str')

Convert dataframe into a dictionary

In [9]:
# Pair each replacement name with its location type: [geoname, type].
ref['val'] = ref[['geoname', 'type']].values.tolist()
# [name, [geoname, type]] records, one per reference row.
name_list = ref[['name', 'val']].values.tolist()
# Lookup: name as found in the data -> [geoname, type].
name_dict = dict(name_list)

In [10]:
def standardize_country(row):
    """Return the standardized country name for a row.

    Looks up ``row['country']`` in the module-level ``name_dict``. When an
    entry exists and its type is 'Country', the mapped geoname is returned;
    in every other case the original country value is returned unchanged.
    """
    original = row['country']
    entry = name_dict.get(original)
    if entry is not None and entry[1] == 'Country':
        return entry[0]
    return original

In [11]:
# Drop the invisible left-to-right mark (U+200E) that precedes "Romania" in
# some records. The match is a plain literal, not a regular expression.
country_names = df['country']
df['country'] = country_names.str.replace('\u200eRomania', 'Romania', regex=False)

In [12]:
# Replace each row's country with the value returned by standardize_country
# (applied row by row).
df['country'] = df.apply(standardize_country, axis=1)

Check for country mismatches

In [13]:
# Left-join the strain countries onto the GeoNames country table; strain
# countries without a GeoNames match end up with missing values in 'name'.
countries = pd.read_csv(NEO4J_IMPORT / "00e-GeoNamesCountry.csv")
loc0 = (
    df[['country']]
    .copy()
    .merge(countries, left_on='country', right_on='name', how='left')
)

In [14]:
# Keep only the rows that found no GeoNames country and list their names.
loc0 = loc0.fillna('').query("name == ''")
print("Country name mismatches:")
loc0['country'].unique()

Country name mismatches:


array(['ISRAEL', 'Bahrein'], dtype=object)

Standardize Admin1 names to match GeoNames.org

In [15]:
def standardize_admin1(row):
    """Return the standardized first-level administrative name for a row.

    Looks up ``row['admin1']`` in the module-level ``name_dict``. When an
    entry exists and its type is 'Admin1', the mapped geoname is returned;
    in every other case the original admin1 value is returned unchanged.
    """
    original = row['admin1']
    entry = name_dict.get(original)
    if entry is not None and entry[1] == 'Admin1':
        return entry[0]
    return original

In [16]:
# Replace each row's admin1 with the value returned by standardize_admin1
# (applied row by row).
df['admin1'] = df.apply(standardize_admin1, axis=1)

Check for Admin1 mismatches

In [17]:
# Left-join the non-empty admin1 values onto the GeoNames Admin1 table;
# values without a match end up with missing values in 'name'.
admin1 = pd.read_csv(NEO4J_IMPORT / "00f-GeoNamesAdmin1.csv")
loc1 = (
    df[['admin1']]
    .copy()
    .query("admin1 != ''")
    .merge(admin1, left_on='admin1', right_on='name', how='left')
)

In [18]:
# Keep only the rows that found no GeoNames Admin1 entry and list their names.
loc1 = loc1.fillna('').query("name == ''")
print("Admin1 name mismatches:")
loc1['admin1'].unique()

Admin1 name mismatches:


array(['Leuven', 'Kathmandu', 'Rhone-Alpes', 'Rome', 'Sihanoukville',
       'Hong Kong', 'North Rhine Westphalia', 'Kerala State', 'Haarlem',
       'Blaricum', 'Hardinxveld Giessendam', 'Naarden', 'Zeewolde',
       'Nootdorp', 'Oisterwijk', 'Tilburg', 'Rotterdam', 'NSW',
       'Helsinki', 'Argovie', 'Basel', 'Tessin', 'Zuid Holland',
       'Noord Brabant', 'Noord Holland', 'Cork', 'Munich', 'Talca',
       'Limerick', 'Grand-Est', 'Normandie', 'Hauts de France',
       'Huldenberg', 'Kraainem', 'Sint-Niklaas', 'Brussels', 'Genève',
       'Ho Chi Minh city', 'Tbilisi', 'IDF', 'Bourgogne-France-Comté',
       'Hangzhou', 'Saint Petersburg', 'Copenhagen', 'Quangning',
       'Vinhphuc', 'Riyadh', 'Hawali', 'Holsbeek', 'Kessel-Lo',
       'Couthuin', 'Comunitat_Valenciana', 'Zielonogorskie',
       'Ile de France', 'Bretagne', 'Dasman', 'Prague', 'ARA',
       'New York City', 'Castilla y León', 'Liège', 'Amazonas State',
       'Gilgit', 'Dilbeek', 'Kasterlee', 'Schoten', 'Sint-Piet

Check for Admin2 mismatches

In [19]:
# Left-join the non-empty admin2 values onto the GeoNames Admin2 table;
# values without a match end up with missing values in 'name'.
admin2 = pd.read_csv(NEO4J_IMPORT / "00g-GeoNamesAdmin2.csv")
loc2 = (
    df[['admin2']]
    .copy()
    .query("admin2 != ''")
    .merge(admin2, left_on='admin2', right_on='name', how='left')
)

In [20]:
# Keep only the rows that found no GeoNames Admin2 entry and list their names.
loc2 = loc2.fillna('').query("name == ''")
print("Admin2 name mismatches:")
loc2['admin2'].unique()

Admin2 name mismatches:


array(['Wuhan', 'Zhuhai City', 'Shenzhen City', 'Chicago', 'Los Angeles',
       'Phoenix', 'Guangzhou City', "Pu'er", 'Clayton', 'Munich',
       'Hangzhou', 'Sydney', 'Yongchuan', 'Chengdu', 'Kunming',
       'Pingxiang', 'Zhongxian', 'Huaian', 'Qingdao', 'Contamines',
       'Hefei', 'Jingzhou', 'Guangzhou', 'Tianmen', 'Suzhou',
       'Heinsberg District', 'Weifang', 'Yorkshire', 'Kirkland', 'Tacoma',
       'Strasbourg', 'Rouen', 'Compiègne', 'Reims', 'Pontoise',
       'Crépy en Valois', 'Feira de Santana', 'Seattle',
       'Crépy-en -Valois', 'Montreux-Chateau', 'Thise', 'Dane county',
       'Château-Thierry', 'Garches', 'Longjumeau', 'Rennes',
       'South Yorkshire', 'Northamtonshire', 'London', 'Dane',
       'San Francisco County', 'NanChang', 'Manhattan', 'Brest', 'Tours',
       'Crouy en Thelle', 'Levallois-Perret', 'Meudon la Forêt', 'Vanves',
       'Lyon', 'Saint-Priest', 'Macon', 'Venissieux', 'Valence',
       'Bourg-en-Bresse', 'Privas', 'Brooklyn', 'Fuyang', 'Co

Check city names

In [21]:
# Load the GeoNames city table and take an independent copy of the strain
# 'city' column for the mismatch check below.
city = pd.read_csv(NEO4J_IMPORT / "00h-GeoNamesCity.csv")
loc3 = df[['city']].copy()

In [22]:
# Keep only the rows that actually carry a city name.
loc3 = loc3.query("city != ''")

In [23]:
# Show the rows that have a city-level location.
loc3

Unnamed: 0,city
544,Sheffield
10047,Havlickuv Brod
35887,Bilbao


In [24]:
# Left-join the non-empty city values onto the GeoNames city table;
# values without a match end up with missing values in 'name'.
loc3 = (
    loc3
    .query("city != ''")
    .merge(city, left_on='city', right_on='name', how='left')
)

In [25]:
# Keep only the rows that found no GeoNames city and list their names.
loc3 = loc3.fillna('').query("name == ''")
print("City name mismatches:")
loc3['city'].unique()

City name mismatches:


array([], dtype=object)

In [26]:
# Columns written to the output file: the strain metadata plus the
# standardized location levels (the raw 'location' string is left out).
strain_columns = [
    'id', 'name', 'alias', 'taxonomyId', 'hostTaxonomyId', 'collectionDate',
    'country', 'admin1', 'admin2', 'city', 'locationLevels',
]
strains = df[strain_columns]
strains.head()

Unnamed: 0,id,name,alias,taxonomyId,hostTaxonomyId,collectionDate,country,admin1,admin2,city,locationLevels
0,NMDC60013088-01,BetaCoV/Wuhan/HBCDC-HB-01/2019,NMDC60013088-01;EPI_ISL_402132,taxonomy:2697049,taxonomy:9606,2019-12-30,China,Hubei,,,1
1,https://www.gisaid.org/EPI_ISL_402132,BetaCoV/Wuhan/HBCDC-HB-01/2019,NMDC60013088-01;EPI_ISL_402132,taxonomy:2697049,taxonomy:9606,2019-12-30,China,Hubei,,,1
2,https://www.gisaid.org/EPI_ISL_403963,hCoV-19/Thailand/74/2020,EPI_ISL_403963,taxonomy:2697049,taxonomy:9606,2020-01-13,Thailand,Nonthaburi,,,1
3,https://www.gisaid.org/EPI_ISL_403962,hCoV-19/Thailand/61/2020,EPI_ISL_403962,taxonomy:2697049,taxonomy:9606,2020-01-08,Thailand,Nonthaburi,,,1
4,NMDC60013085-01,BetaCoV/Wuhan/IVDC-HB-04/2020,NMDC60013085-01;EPI_ISL_402120,taxonomy:2697049,taxonomy:9606,2020-01-01,China,Hubei,Wuhan,,2


In [27]:
# Write the standardized strain table without the DataFrame index.
# NOTE(review): this overwrites the same file that was read as input at the
# top of the notebook. The saved table no longer has the 'location' column, so
# re-running the notebook on its own output would presumably fail at the
# location-splitting step. Confirm the upstream step regenerates the file first.
strains.to_csv(NEO4J_IMPORT / "01d-CNCBStrain.csv", index=False)