# Example queries on COVID-19 Knowledge Graph
[Work in progress]

In [1]:
import os
import time
import pandas as pd
from py2neo import Graph

In [2]:
# Lift pandas' display limits so that result tables are rendered in full.
pd.set_option("display.max_rows", None)     # display all rows
pd.set_option("display.max_columns", None)  # display all columns

In [3]:
# Neo4j installation directory, read from the environment (None when unset).
# The shell cells in this notebook interpolate this variable as "$NEO4J_HOME".
NEO4J_HOME = os.environ.get('NEO4J_HOME')
print(NEO4J_HOME)

/Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-4af96121-2328-4e2f-ba60-6d8b728a26d5/installation-4.0.3


Start neo4j database if it is not already running

In [4]:
status = !"$NEO4J_HOME"/bin/neo4j status
if not 'Neo4j is running' in status:
   !"$NEO4J_HOME"/bin/neo4j start

Directories in use:
  home:         /Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-4af96121-2328-4e2f-ba60-6d8b728a26d5/installation-4.0.3
  config:       /Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-4af96121-2328-4e2f-ba60-6d8b728a26d5/installation-4.0.3/conf
  logs:         /Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-4af96121-2328-4e2f-ba60-6d8b728a26d5/installation-4.0.3/logs
  plugins:      /Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-4af96121-2328-4e2f-ba60-6d8b728a26d5/installation-4.0.3/plugins
  import:       /Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-4af96121-2328-4e2f-ba60-6d8b728a26d5/installation-4.0.3/import
  data:         /Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-4af96121-2328-4e2f-ba60-6d8b728a2

Wait until the database has started up

In [5]:
status = !"$NEO4J_HOME"/bin/neo4j status
while not 'Neo4j is running' in status:
    time.sleep(15)
    status = !"$NEO4J_HOME"/bin/neo4j status
    status = str(status)
    print(status)

# sometimes neo4j needs more time to be ready
time.sleep(15)

['Neo4j is running at pid 4218']


In [6]:
# Connect to the local database over Bolt. The password can be supplied through
# the NEO4J_PASSWORD environment variable; when it is not set, the notebook's
# original default is used so existing setups keep working.
graph = Graph("bolt://localhost:7687/", password=os.getenv("NEO4J_PASSWORD", "neo4jbinder"))

### List Organisms in KG

In [7]:
# Select every Organism node and return its name, scientific name and id
# (the id is exposed under the column name `taxonomy`).
query = """
MATCH (p:Organism)
RETURN p.name as name, p.scientificName as scientificName, p.id as taxonomy
"""
# Run the query and display the result as a pandas DataFrame.
graph.run(query).to_data_frame()

Unnamed: 0,name,scientificName,taxonomy
0,SARS-CoV-2,Severe acute respiratory syndrome coronavirus 2,taxonomy:2697049
1,MERS-CoV,Middle East respiratory syndrome-related coron...,taxonomy:1335626
2,SARS-CoV,Severe acute respiratory syndrome-related coro...,taxonomy:694009
3,human,Homo sapiens,taxonomy:9606
4,intermediate horseshoe bat,Rhinolophus affinis,taxonomy:59477
5,horseshoe bat,Rhinolophus,taxonomy:49442
6,Malayan pangolin,Manis javanica,taxonomy:9974
7,palm civet,Paradoxurus,taxonomy:71116
8,carnivores,Canidae,taxonomy:9608


### List Coronavirus Outbreaks

In [8]:
# Follow CAUSES relationships from Organism to Outbreak and return the organism
# details together with the outbreak id and its start date.
query = """
MATCH (p:Organism)-[:CAUSES]->(o:Outbreak)
RETURN p.name as name, p.scientificName as scientificName, p.id as taxonomy, o.id as outbreak, o.startDate as startDate
"""
# Run the query and display the result as a pandas DataFrame.
graph.run(query).to_data_frame()

Unnamed: 0,name,scientificName,taxonomy,outbreak,startDate
0,SARS-CoV-2,Severe acute respiratory syndrome coronavirus 2,taxonomy:2697049,COVID-19,2019
1,MERS-CoV,Middle East respiratory syndrome-related coron...,taxonomy:1335626,MERS,2012
2,SARS-CoV,Severe acute respiratory syndrome-related coro...,taxonomy:694009,SARS,2003


### List Strains that are mentioned in PubMed Central Articles

In [9]:
# Publications that mention a Strain, joined with the Organism that carries
# that strain; rows are ordered by the strain's collection date.
# NOTE(review): the `host_id` column is filled from `s.id`, i.e. the Strain
# node's id, not the Organism's. Confirm whether the alias should be
# `strain_id` or whether the value should be `o.id`.
query = """
MATCH (p:Publication)-[:MENTIONS]->(s:Strain)<-[:CARRIES]-(o:Organism)
RETURN p.id as pmc, s.name as name, s.collectionDate  as collectionDate, o.name as host, s.id as host_id
ORDER by s.collectionDate
"""
# Run the query and display the result as a pandas DataFrame.
graph.run(query).to_data_frame()
# TODO where do the 2013 bat strains come from??

Unnamed: 0,pmc,name,collectionDate,host,host_id
0,pmc:PMC7095418,bat/Yunnan/RaTG13/2013,2013-07-24,intermediate horseshoe bat,https://www.gisaid.org/EPI_ISL_402131
1,pmc:PMC7067954,Wuhan-Hu-1,2019-12-04,human,ncbiprotein:NC_045512
2,pmc:PMC7060195,Wuhan-Hu-1,2019-12-04,human,ncbiprotein:NC_045512
3,pmc:PMC7062204,Wuhan-Hu-1,2019-12-04,human,ncbiprotein:NC_045512
4,pmc:PMC7089605,Wuhan-Hu-1,2019-12-04,human,ncbiprotein:NC_045512
5,pmc:PMC7092824,Wuhan-Hu-1,2019-12-04,human,ncbiprotein:NC_045512
6,pmc:PMC7036342,Wuhan-Hu-1,2019-12-04,human,ncbiprotein:NC_045512
7,pmc:PMC7045880,Wuhan/IVDC-HB-01/2019,2019-12-30,human,https://www.gisaid.org/EPI_ISL_402119
8,pmc:PMC7095418,Wuhan/WIV04/2019,2019-12-30,human,https://www.gisaid.org/EPI_ISL_402124
9,pmc:PMC7095418,Wuhan/WIV02/2019,2019-12-30,human,https://www.gisaid.org/EPI_ISL_402127


### List Gene and Protein information for Reference Genome

In [10]:
# Walk Strain -[:HAS]-> Gene -[:ENCODES]-> Protein and return one row per
# (strain, gene, protein) combination, ordered by the strain's collection date.
# The strain id is exposed under the column name `referenceGenome`.
query = """
MATCH (s:Strain)-[:HAS]->(g:Gene)-[:ENCODES]->(p:Protein)
RETURN s.id as referenceGenome, s.name as name, s.collectionDate  as collectionDate, 
       g.name as gene, g.id as geneId, p.name as protein, p.id as protein_id 
ORDER by s.collectionDate
"""
# Run the query and display the result as a pandas DataFrame.
graph.run(query).to_data_frame()

Unnamed: 0,referenceGenome,name,collectionDate,gene,geneId,protein,protein_id
0,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-04,ORF1ab,ncbigene:43740578,endoRNAse,ncbiprotein:YP_009725310
1,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-04,ORF1ab,ncbigene:43740578,nsp8,ncbiprotein:YP_009725304
2,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-04,ORF1ab,ncbigene:43740578,nsp9,ncbiprotein:YP_009725305
3,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-04,ORF1ab,ncbigene:43740578,nsp6,ncbiprotein:YP_009725302
4,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-04,ORF1ab,ncbigene:43740578,ORF1a polyprotein,ncbiprotein:YP_009725295
5,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-04,ORF1ab,ncbigene:43740578,nsp3,ncbiprotein:YP_009725299
6,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-04,ORF1ab,ncbigene:43740578,helicase,ncbiprotein:YP_009725308
7,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-04,ORF1ab,ncbigene:43740578,nsp7,ncbiprotein:YP_009725303
8,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-04,ORF1ab,ncbigene:43740578,nsp11,ncbiprotein:YP_009725312
9,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-04,ORF1ab,ncbigene:43740578,ORF1ab polyprotein,ncbiprotein:YP_009724389


### Cases in a specific County (Admin2)

In [11]:
# Name of the Admin2 node to look up; passed to the query as a parameter.
admin2 = 'San Diego County'

# Cases nodes dated 2020-04-30 that are REPORTED_IN the Admin2 node whose name
# equals the $admin2 parameter. Returns the cumulative confirmed and death
# counts (the property names are spelled `cummulative...` in the graph).
query = """
MATCH (c:Cases{date: date("2020-04-30")})-[:REPORTED_IN]->(a:Admin2{name: $admin2})
RETURN a.name as name, c.cummulativeConfirmed as confirmed, c.cummulativeDeaths as deaths
"""
# The keyword argument binds the Python value to the $admin2 Cypher parameter.
graph.run(query, admin2=admin2).to_data_frame()

Unnamed: 0,name,confirmed,deaths
0,San Diego County,3432,120


### Aggregate cases by State (Admin1)

In [12]:
# For the COVID-19 outbreak, take the Cases nodes dated 2020-04-30, follow
# REPORTED_IN to their Admin2 node and IN to the enclosing Admin1 node, then
# sum confirmed cases and deaths per Admin1 name. Rows are ordered by the
# summed deaths.
query = """
MATCH (o:Outbreak{id: "COVID-19"})<-[:RELATED_TO]-(c:Cases{date: date("2020-04-30")})-[:REPORTED_IN]->(a:Admin2)-[:IN]->(a1:Admin1)
RETURN a1.name as state, sum(c.cummulativeConfirmed) as confirmed, sum(c.cummulativeDeaths) as deaths
ORDER BY deaths
"""
# Run the query and display the result as a pandas DataFrame.
graph.run(query).to_data_frame()

Unnamed: 0,state,confirmed,deaths
0,Rhode Island,7534,0
1,Wyoming,559,1
2,Alaska,355,5
3,South Dakota,2449,13
4,Hawaii,609,16
5,Montana,453,16
6,North Dakota,1067,19
7,Utah,4308,41
8,West Virginia,1125,41
9,Vermont,863,49


In [14]:
# Stop the Neo4j server using the launcher under NEO4J_HOME. Cells that use
# `graph` need the database to be started again before they can be re-run.
!"$NEO4J_HOME"/bin/neo4j stop

Stopping Neo4j........ stopped


## Old queries need to be updated

### List person demographics and strain information for California

In [15]:
# query = """
# MATCH (a:Admin1)<-[:LOCATED_IN]-(p:Person)-[:CARRIES]->(s:Strain)
# WHERE a.name = 'California'
# RETURN p.age as age, p.sex as sex, p.exposure_location as exposure_location, s.name as strain, s.clade as clade
# """
# graph.run(query).to_data_frame()

#### Same query using parameterized Cypher
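Parameters to Cypher queries can be passed as key-value arguments. Parameters in Cypher are named and are prefixed with a dollar sign (e.g. `$admin1`); the legacy curly-brace form (e.g. `{admin1}`) is no longer supported as of Neo4j 4.0.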

In [16]:
# admin1 = 'California'

# query = """
# MATCH (a:Admin1{name: {admin1}})<-[:LOCATED_IN]-(p:Person)-[:CARRIES]->(s:Strain)
# RETURN p.age as age, p.sex as sex, p.exposure_location as exposure_location, 
#        s.name as strain, s.clade as clade, s.date as date
# ORDER BY s.date
# """
# graph.run(query, admin1=admin1).to_data_frame().head(100)

### Where did clade A originate?

In [17]:
# clade = 'A'

# query = """
# MATCH (s:Strain)--(a:Country)
# WHERE s.clade STARTS WITH {clade}
# RETURN s.clade as clade, s.name, s.date, a.name
# ORDER BY s.date
# """
# graph.run(query, clade=clade).to_data_frame().head(100)

### Find persons that imported the virus from another location

In [18]:
# query = """
# MATCH (c:Admin1)<-[:LOCATED_IN]-(p:Person)-[:CARRIES]->(s:Strain)
# WHERE c.name <> p.exposure_location
# RETURN c.name as `state/province`, p.age as age, p.sex as sex, p.exposure_location as exposure_location, 
#        s.name as strain, s.clade as clade
# ORDER BY p.exposure_location
# """
# graph.run(query).to_data_frame()