# Assigns location to Strain Data from CNCB
**[Work in progress]**

This notebook standardizes location information for viral strains from CNCB for ingestion into a Knowledge Graph.

Author: Peter Rose (pwrose@ucsd.edu)

In [59]:
import os
import pandas as pd
from pathlib import Path
from py2neo import Graph

In [60]:
# Lift pandas' display limits so that frames are rendered in full.
pd.set_option("display.max_rows", None)  # display all rows
pd.set_option("display.max_columns", None)  # display all columns

In [61]:
# Resolve the Neo4j installation directory from the environment.
# Fail with an explicit message when the variable is missing instead of
# letting Path(None) raise an opaque TypeError.
neo4j_home_value = os.getenv('NEO4J_HOME')
if not neo4j_home_value:
    raise EnvironmentError(
        "The NEO4J_HOME environment variable is not set; "
        "it must point to the Neo4j installation directory."
    )
NEO4J_HOME = Path(neo4j_home_value)
print(NEO4J_HOME)

/Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-4af96121-2328-4e2f-ba60-6d8b728a26d5/installation-4.0.3


## Standardize Location data for SARS-CoV-2 Strain metadata

**TODO:** Replace this code with a general solution.

Below is a workaround for now.

In [62]:
# Read the CNCB strain table with every column as text and replace missing
# values by empty strings.
strain_csv = NEO4J_HOME / "import/01d-CNCBStrain.csv"
df = pd.read_csv(strain_csv, dtype='str').fillna('')

In [63]:
# Preview the first five rows of the strain table.
df.iloc[:5]

Unnamed: 0,id,name,alias,taxonomyId,hostTaxonomyId,collectionDate,location
0,NMDC60013088-01,BetaCoV/Wuhan/HBCDC-HB-01/2019,NMDC60013088-01;EPI_ISL_402132,taxonomy:2697049,taxonomy:9606,2019-12-30,China / Hubei
1,https://www.gisaid.org/EPI_ISL_402132,BetaCoV/Wuhan/HBCDC-HB-01/2019,NMDC60013088-01;EPI_ISL_402132,taxonomy:2697049,taxonomy:9606,2019-12-30,China / Hubei
2,https://www.gisaid.org/EPI_ISL_403963,BetaCoV/Nonthaburi/74/2020,EPI_ISL_403963,taxonomy:2697049,taxonomy:9606,2020-01-13,Thailand/ Nonthaburi Province
3,https://www.gisaid.org/EPI_ISL_403962,BetaCoV/Nonthaburi/61/2020,EPI_ISL_403962,taxonomy:2697049,taxonomy:9606,2020-01-08,Thailand/ Nonthaburi Province
4,NMDC60013085-01,BetaCoV/Wuhan/IVDC-HB-04/2020,NMDC60013085-01;EPI_ISL_402120,taxonomy:2697049,taxonomy:9606,2020-01-01,China / Hubei / Wuhan


#### Split locations

In [64]:
# The number of '/' separators in the raw location string.
slash_counts = df['location'].str.count('/')
df['locationLevels'] = slash_counts

In [65]:
# Split the location string on '/' into at most four parts.
# str.split(expand=True) only creates as many columns as the longest split
# in the data, so pad the result to exactly four columns; otherwise the
# assignment below fails whenever no row has three separators.
location_parts = (
    df['location']
    .str.split('/', n=3, expand=True)
    .reindex(columns=range(4))
)
df[['location1', 'location2', 'location3', 'location4']] = location_parts

#### Clean up extra white space and NaN

In [66]:
def _strip_if_object(column):
    """Strip surrounding white space from object-dtype columns; pass others through unchanged."""
    if column.dtype == "object":
        return column.str.strip()
    return column


# Trim white space column by column, then replace missing values with empty strings.
df = df.apply(_strip_if_object)
df = df.fillna('')

In [67]:
# Show the rows whose location string contains three '/' separators.
df[df['locationLevels'] == 3]

Unnamed: 0,id,name,alias,taxonomyId,hostTaxonomyId,collectionDate,location,locationLevels,location1,location2,location3,location4
544,https://www.gisaid.org/EPI_ISL_414500,hCoV-19/England/Sheff01/2020,EPI_ISL_414500,taxonomy:2697049,taxonomy:9606,2020-03-04,United Kingdom / England / Yorkshire / Sheffield,3,United Kingdom,England,Yorkshire,Sheffield
9786,https://www.gisaid.org/EPI_ISL_434572,hCoV-19/Czech Republic/2308/2020,EPI_ISL_434572,taxonomy:2697049,taxonomy:9606,2020-04-14,Czech Republic / Vysocina Region / Bela - Lede...,3,Czech Republic,Vysocina Region,Bela - Ledec na Sazavou,Havlickuv Brod


In [68]:
# Distinct values of the first location component.
location1 = pd.unique(df['location1'])

In [69]:
# Number of distinct first-level location names.
n_location1 = len(location1)
print(n_location1)

86


In [70]:
# Connect to the Knowledge Graph.
# The endpoint and credentials can be overridden through the KG_BOLT_URI,
# KG_USER and KG_PASSWORD environment variables so they need not be edited
# in the notebook; the defaults are the values previously hard-coded here.
graph = Graph(
    os.getenv("KG_BOLT_URI", "bolt://132.249.238.185:7687"),
    user=os.getenv("KG_USER", "reader"),
    password=os.getenv("KG_PASSWORD", "demo"),
)

In [100]:
# Cypher query used for every location lookup below.
# It searches the "locations" full-text index with the $location parameter,
# follows one :IN relationship from each hit to a Location node, and returns
# the names and labels of both nodes together with the full-text score.
# At most three rows are returned per lookup.
query = """
CALL db.index.fulltext.queryNodes("locations", $location) YIELD node, score
MATCH (node)-[:IN]->(next:Location)
RETURN node.name, labels(node), next.name, labels(next), score
LIMIT 3
"""

In [101]:
# Run the lookup query once per distinct first-level location and tag each
# result frame with the name that was searched for.
df_list = [
    graph.run(query, location=place).to_data_frame().assign(location1=place)
    for place in location1
]

# Stack the per-location results into a single frame.
df_loc1 = pd.concat(df_list)

In [73]:
# Known failure noted by the author: Czech Republic
# Show up to the first 100 lookup results.
df_loc1.iloc[:100]

Unnamed: 0,node.name,labels(node),next.name,labels(next),score,location1
0,China,"[Location, Country]",Eastern Asia,"[Location, UNSubRegion]",5.306468,China
0,Thailand,"[Location, Country]",South-eastern Asia,"[Location, UNSubRegion]",6.248669,Thailand
0,United States,"[Location, Country]",Northern America,"[Location, UNSubRegion]",8.596201,United States
0,France,"[Location, Country]",Western Europe,"[Location, UNSubRegion]",5.255381,France
0,Australia,"[Location, Country]",Australia and New Zealand,"[Location, UNSubRegion]",5.687883,Australia
0,Germany,"[Location, Country]",Western Europe,"[Location, UNSubRegion]",5.987919,Germany
0,Singapore,"[Location, Country]",South-eastern Asia,"[Location, UNSubRegion]",5.987919,Singapore
0,United Kingdom,"[Location, Country]",Northern Europe,"[Location, UNSubRegion]",8.596201,United Kingdom
0,South Korea,"[Location, Country]",Eastern Asia,"[Location, UNSubRegion]",6.904882,South Korea
0,Japan,"[Location, Country]",Eastern Asia,"[Location, UNSubRegion]",6.248669,Japan


In [102]:
# Distinct non-empty values of the second location component.
has_location2 = df['location2'] != ''
location2 = df.loc[has_location2, 'location2'].unique()

In [103]:
# Number of distinct second-level location names.
n_location2 = len(location2)
print(n_location2)

891


In [104]:
# Print the distinct second-level location names for manual inspection.
print(str(location2))

['Hubei' 'Nonthaburi Province' 'Zhejiang' 'Guangdong Province' 'Guangdong'
 'Washington' 'Illinois' 'Taiwan' 'California' 'Arizona' 'Ile-de-France'
 'Yunnan' 'Victoria' 'Bavaria' 'Gyeonggi-do' 'Aichi' 'Queensland'
 'New South Wales' 'Leuven' 'Chongqing' 'Sichuan' 'Ile De France'
 'Jiangxi' 'Jiangsu' 'Shandong' 'Tokyo' 'Thanh Hoa' 'Kyoto' 'Wisconsin'
 'Massachusetts' 'Kathmandu' 'Beijing' 'Rhone-Alpes' 'Nara' 'Osaka'
 'Guangxi' 'Rome' 'Fujian' 'Sihanoukville' 'Texas' 'Anhui' 'Hong Kong'
 'Chungcheongnam-do' 'Seoul' 'Baden-Wuerttemberg' 'Sao Paulo' 'Kanagawa'
 'Lombardy' 'Mexico City' 'Ontario' 'Zurich' 'North Rhine Westphalia'
 'Auckland' 'Kerala State' 'Wales' 'Haarlem' 'Blaricum'
 'Hardinxveld Giessendam' 'Naarden' 'Utrecht' 'Zeewolde' 'Nootdorp'
 'Oisterwijk' 'Tilburg' 'Rotterdam' 'NSW' 'Helsinki' 'Shanghai' 'Geneva'
 'Argovie' 'Vaud' 'Basel' 'England' 'Tessin' 'Scotland' 'Zuid Holland'
 'Noord Brabant' 'Noord Holland' 'Gelderland' 'New York' 'Cork' 'MN'
 'Munich' 'Talca' 'Santiago' 

In [105]:
# Restrict the lookup to a small window of the distinct second-level names.
# The window is recomputed from df rather than by slicing location2 in place,
# so re-running this cell always yields the same subset instead of slicing
# an already-sliced array.
LOCATION2_START, LOCATION2_STOP = 60, 80
location2 = df.query("location2 != ''")['location2'].unique()[LOCATION2_START:LOCATION2_STOP]
print(location2)

['Zeewolde' 'Nootdorp' 'Oisterwijk' 'Tilburg' 'Rotterdam' 'NSW' 'Helsinki'
 'Shanghai' 'Geneva' 'Argovie' 'Vaud' 'Basel' 'England' 'Tessin'
 'Scotland' 'Zuid Holland' 'Noord Brabant' 'Noord Holland' 'Gelderland'
 'New York']


In [106]:
# Run the lookup query once per selected second-level location and tag each
# result frame with the name that was searched for.
df_list = [
    graph.run(query, location=place).to_data_frame().assign(location2=place)
    for place in location2
]

# Stack the per-location results into a single frame.
df_loc2 = pd.concat(df_list)

In [107]:
# Show up to the first 100 lookup results for the second-level locations.
df_loc2.iloc[:100]

Unnamed: 0,node.name,labels(node),next.name,labels(next),score,location2
0,Zeewolde,"[Location, City]",Gemeente Zeewolde,"[Location, Admin2]",5.987919,Zeewolde
1,Gemeente Zeewolde,"[Location, Admin2]",Flevoland,"[Location, Admin1]",4.480581,Zeewolde
0,Gemeente Pijnacker-Nootdorp,"[Location, Admin2]",South Holland,"[Location, Admin1]",3.735385,Nootdorp
0,Oisterwijk,"[Location, City]",Gemeente Oisterwijk,"[Location, Admin2]",5.987919,Oisterwijk
1,Gemeente Oisterwijk,"[Location, Admin2]",North Brabant,"[Location, Admin1]",4.480581,Oisterwijk
0,Tilburg,"[Location, City]",Gemeente Tilburg,"[Location, Admin2]",5.987919,Tilburg
1,Gemeente Tilburg,"[Location, Admin2]",North Brabant,"[Location, Admin1]",4.480581,Tilburg
0,Rotterdam,"[Location, City]",Gemeente Rotterdam,"[Location, Admin2]",5.816166,Rotterdam
1,Rotterdam,"[Location, City]",Schenectady County,"[Location, Admin2]",5.816166,Rotterdam
2,Gemeente Rotterdam,"[Location, Admin2]",South Holland,"[Location, Admin1]",4.352064,Rotterdam
