# Assigns location to Strain Data from CNCB
**[Work in progress]**

This notebook standardizes the location information of viral strains from CNCB for ingestion into a Knowledge Graph.

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import pandas as pd
#import dateutil
from pathlib import Path
#from os import path

In [2]:
# Lift pandas' display limits so frames are rendered without truncation.
pd.set_option('display.max_rows', None)  # display all rows
pd.set_option('display.max_columns', None)  # display all columns

In [3]:
# Resolve the Neo4j installation directory from the NEO4J_HOME environment
# variable. os.getenv() returns None when the variable is unset and Path(None)
# fails with an opaque TypeError, so check explicitly and fail with a clear
# message instead.
neo4j_home_env = os.getenv('NEO4J_HOME')
if neo4j_home_env is None:
    raise EnvironmentError(
        "The NEO4J_HOME environment variable is not set; it must point to the "
        "Neo4j installation directory that contains the import/ folder."
    )
NEO4J_HOME = Path(neo4j_home_env)
print(NEO4J_HOME)

/Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-4af96121-2328-4e2f-ba60-6d8b728a26d5/installation-4.0.3


## Standardize Location data for SARS-CoV-2 Strain metadata

**TODO:** Replace this code with a general solution.

The code below is a temporary workaround.

In [4]:
# Load the CNCB strain table with every column read as text, then replace
# missing values with empty strings.
cncb_strain_csv = NEO4J_HOME / "import/01d-CNCBStrain.csv"
df = pd.read_csv(cncb_strain_csv, dtype='str').fillna('')

In [5]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,id,name,alias,taxonomyId,hostTaxonomyId,collectionDate,location
0,NMDC60013088-01,BetaCoV/Wuhan/HBCDC-HB-01/2019,NMDC60013088-01;EPI_ISL_402132,taxonomy:2697049,taxonomy:9606,2019-12-30,China / Hubei
1,https://www.gisaid.org/EPI_ISL_402132,BetaCoV/Wuhan/HBCDC-HB-01/2019,NMDC60013088-01;EPI_ISL_402132,taxonomy:2697049,taxonomy:9606,2019-12-30,China / Hubei
2,https://www.gisaid.org/EPI_ISL_403963,BetaCoV/Nonthaburi/74/2020,EPI_ISL_403963,taxonomy:2697049,taxonomy:9606,2020-01-13,Thailand/ Nonthaburi Province
3,https://www.gisaid.org/EPI_ISL_403962,BetaCoV/Nonthaburi/61/2020,EPI_ISL_403962,taxonomy:2697049,taxonomy:9606,2020-01-08,Thailand/ Nonthaburi Province
4,NMDC60013085-01,BetaCoV/Wuhan/IVDC-HB-04/2020,NMDC60013085-01;EPI_ISL_402120,taxonomy:2697049,taxonomy:9606,2020-01-01,China / Hubei / Wuhan


#### Split locations

In [6]:
# Count the '/' separators in each location string
# (e.g. 'China / Hubei' -> 1, 'China / Hubei / Wuhan' -> 2).
slash_count = df['location'].str.count('/')
df['locationLevels'] = slash_count

In [7]:
# Split "country / admin1 / admin2 / city" into one column per level.
# str.split(expand=True) only yields as many columns as the deepest location
# present in the data, so reindex to exactly four columns; without this the
# assignment raises "Columns must be same length as key" whenever no row
# specifies all four levels.
location_columns = ['country', 'admin1', 'admin2', 'city']
location_parts = (
    df['location']
    .str.split('/', n=3, expand=True)
    .reindex(columns=range(len(location_columns)))
)
for position, column in enumerate(location_columns):
    # Columns added by reindex hold float NaN; cast to object so the .str
    # accessor accepts them. Missing levels remain NaN after stripping.
    df[column] = location_parts[position].astype(object).str.strip()

Fix country names

In [8]:
# Normalize variant country spellings; the first entry also removes a stray
# left-to-right mark (U+200E) in front of "Romania".
country_fixes = [
    ('\u200eRomania', 'Romania'),
    ('Czech Republic', 'Czechia'),
    ('Czech republic', 'Czechia'),
    ('Luxemburg', 'Luxembourg'),
]
for variant, replacement in country_fixes:
    df['country'] = df['country'].str.replace(variant, replacement)
# Location levels that were absent after the split become empty strings.
df.fillna('', inplace=True)

Check for country mismatches

In [9]:
# Left-join the strain countries onto the GeoNames country table by name;
# rows without a match get NaN in the joined columns.
countries = pd.read_csv(NEO4J_HOME / "import/00e-GeoNamesCountry.csv")
loc0 = df[['country']].merge(
    countries, how='left', left_on='country', right_on='name'
)

In [10]:
# A blank 'name' after the left join means the country had no GeoNames match.
loc0 = loc0.fillna('')
loc0 = loc0[loc0['name'] == '']
print("Country name mismatches:")
loc0['country'].unique()

Country name mismatches:


array([], dtype=object)

Fix Admin1 names

In [11]:
# Lookup table from two-letter abbreviations to full names, built from a US
# part and a Canadian part and merged in that order.
_us_states = {
    'AK': 'Alaska',
    'AL': 'Alabama',
    'AR': 'Arkansas',
    'AS': 'American Samoa',
    'AZ': 'Arizona',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DC': 'District of Columbia',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'GU': 'Guam',
    'HI': 'Hawaii',
    'IA': 'Iowa',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'MA': 'Massachusetts',
    'MD': 'Maryland',
    'ME': 'Maine',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MO': 'Missouri',
    'MP': 'Northern Mariana Islands',
    'MS': 'Mississippi',
    'MT': 'Montana',
    # NOTE(review): 'National' is not a state or territory; confirm this
    # entry is wanted when expanding location abbreviations.
    'NA': 'National',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'NE': 'Nebraska',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NV': 'Nevada',
    'NY': 'New York',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'PR': 'Puerto Rico',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VA': 'Virginia',
    'VI': 'Virgin Islands',
    'VT': 'Vermont',
    'WA': 'Washington',
    'WI': 'Wisconsin',
    'WV': 'West Virginia',
    'WY': 'Wyoming',
}
_canadian_provinces = {
    'AB': 'Alberta',
    'BC': 'British Columbia',
    'MB': 'Manitoba',
    'NB': 'New Brunswick',
    'NL': 'Newfoundland and Labrador',
    'NT': 'Northwest Territories',
    'NS': 'Nova Scotia',
    'NU': 'Nunavut',
    'ON': 'Ontario',
    'PE': 'Prince Edward Island',
    'QC': 'Quebec',
    'SK': 'Saskatchewan',
    'YT': 'Yukon',
}
# The two tables share no keys, so the merge keeps every entry.
states = {**_us_states, **_canadian_provinces}

In [12]:
# Expand abbreviations to full names; values that have no entry in `states`
# pass through unchanged.
df['admin1'] = df['admin1'].map(lambda code: states.get(code, code))
# Then normalize a few known spelling variants.
admin1_fixes = {
    'Nonthaburi Province': 'Nonthaburi',
    'Guangdong Province': 'Guangdong',
    'Ile De France': 'Ile-de-France',
}
for variant, replacement in admin1_fixes.items():
    df['admin1'] = df['admin1'].str.replace(variant, replacement)
df.fillna('', inplace=True)

Check for Admin1 mismatches

In [13]:
# Left-join every non-empty admin1 value onto the GeoNames admin1 table by
# name; unmatched rows get NaN in the joined columns.
admin1 = pd.read_csv(NEO4J_HOME / "import/00f-GeoNamesAdmin1.csv")
has_admin1 = df['admin1'] != ''
loc1 = df.loc[has_admin1, ['admin1']].merge(
    admin1, how='left', left_on='admin1', right_on='name'
)

In [14]:
# A blank 'name' after the left join means the admin1 value had no match.
loc1 = loc1.fillna('')
loc1 = loc1[loc1['name'] == '']
print("Admin1 name mismatches:")
loc1['admin1'].unique()

Admin1 name mismatches:


array(['Leuven', 'Kathmandu', 'Rome', 'Sihanoukville', 'Hong Kong',
       'North Rhine Westphalia', 'Kerala State', 'Haarlem', 'Blaricum',
       'Hardinxveld Giessendam', 'Naarden', 'Zeewolde', 'Nootdorp',
       'Oisterwijk', 'Tilburg', 'Rotterdam', 'NSW', 'Helsinki', 'Argovie',
       'Basel', 'Tessin', 'Zuid Holland', 'Noord Brabant',
       'Noord Holland', 'Cork', 'Munich', 'Talca', 'Limerick',
       'Grand-Est', 'Normandie', 'Hauts de France', 'Huldenberg',
       'Kraainem', 'Sint-Niklaas', 'Brussels', 'Genève',
       'Ho Chi Minh city', 'Tbilisi', 'IDF', 'Bourgogne-France-Comté',
       'Hangzhou', 'Saint Petersburg', 'Copenhagen', 'Quangning',
       'Vinhphuc', 'Riyadh', 'Hawali', 'Holsbeek', 'Kessel-Lo',
       'Couthuin', 'Comunitat_Valenciana', 'Zielonogorskie',
       'Ile de France', 'Bretagne', 'Dasman', 'Prague', 'ARA',
       'New York City', 'Castilla y León', 'Liège', 'Amazonas State',
       'Gilgit', 'Dilbeek', 'Kasterlee', 'Schoten', 'Sint-Pieters-Woluwe',
  

Check for Admin2 mismatches

In [15]:
# Left-join every non-empty admin2 value onto the GeoNames admin2 table by
# name; unmatched rows get NaN in the joined columns.
admin2 = pd.read_csv(NEO4J_HOME / "import/00g-GeoNamesAdmin2.csv")
has_admin2 = df['admin2'] != ''
loc2 = df.loc[has_admin2, ['admin2']].merge(
    admin2, how='left', left_on='admin2', right_on='name'
)

In [16]:
# A blank 'name' after the left join means the admin2 value had no match.
loc2 = loc2.fillna('')
loc2 = loc2[loc2['name'] == '']
print("Admin2 name mismatches:")
loc2['admin2'].unique()

Admin2 name mismatches:


array(['Wuhan', 'Zhuhai City', 'Shenzhen City', 'Chicago', 'Los Angeles',
       'Phoenix', 'Guangzhou City', "Pu'er", 'Clayton', 'Munich',
       'Hangzhou', 'Sydney', 'Yongchuan', 'Chengdu', 'Kunming',
       'Pingxiang', 'Zhongxian', 'Huaian', 'Qingdao', 'Hefei', 'Jingzhou',
       'Guangzhou', 'Tianmen', 'Suzhou', 'Heinsberg District', 'Weifang',
       'Yorkshire', 'Kirkland', 'Tacoma', 'Strasbourg', 'Rouen',
       'Compiègne', 'Reims', 'Pontoise', 'Crépy en Valois',
       'Feira de Santana', 'Seattle', 'Crépy-en -Valois',
       'Montreux-Chateau', 'Thise', 'Dane county', 'Château-Thierry',
       'Garches', 'Longjumeau', 'Rennes', 'South Yorkshire',
       'Northamtonshire', 'London', 'Dane', 'San Francisco County',
       'NanChang', 'Manhattan', 'Brest', 'Tours', 'Crouy en Thelle',
       'Levallois-Perret', 'Meudon la Forêt', 'Vanves', 'Lyon',
       'Saint-Priest', 'Macon', 'Venissieux', 'Valence',
       'Bourg-en-Bresse', 'Privas', 'Brooklyn', 'Fuyang', 'Compiegne',
    

Check for city mismatches

In [17]:
# Load the GeoNames city table and take an independent one-column copy of
# the strain cities to check against it.
city = pd.read_csv(NEO4J_HOME / "import/00h-GeoNamesCity.csv")
loc3 = df.loc[:, ['city']].copy()

In [18]:
# Keep only the rows that actually carry a city value.
loc3 = loc3[loc3['city'] != '']

In [19]:
# Display the current contents of loc3 for inspection.
loc3

Unnamed: 0,city
531,Sheffield
9707,Havlickuv Brod


In [20]:
# Drop rows without a city, then left-join the remainder onto the GeoNames
# city table by name; unmatched rows get NaN in the joined columns.
loc3 = loc3[loc3['city'] != ''].merge(
    city, how='left', left_on='city', right_on='name'
)

In [21]:
# A blank 'name' after the left join means the city had no match.
loc3 = loc3.fillna('')
loc3 = loc3[loc3['name'] == '']
print("City name mismatches:")
loc3['city'].unique()

City name mismatches:


array([], dtype=object)

In [22]:
# Select the output columns: the strain metadata followed by the split
# location levels and the level count (the raw 'location' column is left out).
strain_columns = [
    'id', 'name', 'alias', 'taxonomyId', 'hostTaxonomyId', 'collectionDate',
    'country', 'admin1', 'admin2', 'city', 'locationLevels',
]
strains = df[strain_columns]
strains.head()

Unnamed: 0,id,name,alias,taxonomyId,hostTaxonomyId,collectionDate,country,admin1,admin2,city,locationLevels
0,NMDC60013088-01,BetaCoV/Wuhan/HBCDC-HB-01/2019,NMDC60013088-01;EPI_ISL_402132,taxonomy:2697049,taxonomy:9606,2019-12-30,China,Hubei,,,1
1,https://www.gisaid.org/EPI_ISL_402132,BetaCoV/Wuhan/HBCDC-HB-01/2019,NMDC60013088-01;EPI_ISL_402132,taxonomy:2697049,taxonomy:9606,2019-12-30,China,Hubei,,,1
2,https://www.gisaid.org/EPI_ISL_403963,BetaCoV/Nonthaburi/74/2020,EPI_ISL_403963,taxonomy:2697049,taxonomy:9606,2020-01-13,Thailand,Nonthaburi,,,1
3,https://www.gisaid.org/EPI_ISL_403962,BetaCoV/Nonthaburi/61/2020,EPI_ISL_403962,taxonomy:2697049,taxonomy:9606,2020-01-08,Thailand,Nonthaburi,,,1
4,NMDC60013085-01,BetaCoV/Wuhan/IVDC-HB-04/2020,NMDC60013085-01;EPI_ISL_402120,taxonomy:2697049,taxonomy:9606,2020-01-01,China,Hubei,Wuhan,,2


In [23]:
# NOTE(review): this is the same path the notebook reads its input from, so
# the input file is overwritten and no longer contains the raw 'location'
# column; re-running the notebook from the top would then fail on that
# column. Confirm that overwriting in place is intended.
strain_csv_out = NEO4J_HOME / "import/01d-CNCBStrain.csv"
strains.to_csv(strain_csv_out, index=False)