# Example queries for biological entities on COVID-19 Knowledge Graph
[Work in progress]

This notebook demonstrates how to run Cypher queries in a Jupyter Notebook by connecting to a database server.

In [31]:
import datetime
import pandas as pd
from py2neo import Graph

In [32]:
# Lift pandas' display limits so query results are rendered in full.
pd.set_option("display.max_rows", None)  # display all rows
pd.set_option("display.max_columns", None)  # display all columns

In [33]:
# Connect to the Knowledge Graph database server over Bolt using the "reader" account.
graph = Graph(
    "bolt://132.249.238.185:7687",
    user="reader",
    password="demo",
)

### List Node Metadata

In [34]:
# Cypher: return the descriptive properties of every NodeMetadata node.
query = """
MATCH (n:NodeMetadata)
RETURN n.name, n.shortDescription, n.description, n.example, n.details
"""
# Run the query, then render the result as a DataFrame (last expression is displayed).
node_metadata = graph.run(query)
node_metadata.to_data_frame()

Unnamed: 0,n.name,n.shortDescription,n.description,n.example,n.details
0,Location,Geographic location,A geograpic location,"World, ..., Country, State, Country, City, Cru...",
1,World,The World,Top level location,,
2,UNRegion,Continental regions,Continental regions according to the M49 stan...,Americas,https://unstats.un.org/unsd/methodology/m49/
3,UNSubRegion,Subcontinental regions,Subcontinental regions according to the M49 st...,Latin America and the Caribbean,https://unstats.un.org/unsd/methodology/m49/
4,UNIntermediateRegion,Subdivisions of subcontinental regions,Subdivisions of subcontinental regions accordi...,Caribbean,https://unstats.un.org/unsd/methodology/m49/
5,Country,Countries and dependent Territories,Countries and dependent Territories defined b...,United States,http://www.geonames.org/
6,Admin1,"State, Province, Municipality","First administrative divisions, e.g, State, Pr...",California,http://www.geonames.org/
7,Admin2,County,Second administrative divisions: County in the US,San Diego County,http://www.geonames.org/
8,City,City,City,San Diego,http://www.geonames.org/
9,CruiseShip,Cruise ship,Cruise ship,Diamond Princess,http://www.productontology.org/doc/Cruise_ship


### List Organisms in KG

In [35]:
# Cypher: list every Organism node with its name, scientific name, and id
# (the id is returned under the column name "taxonomy").
query = """
MATCH (p:Organism)
RETURN p.name as name, p.scientificName as scientificName, p.id as taxonomy
"""
# Run the query, then render the result as a DataFrame (last expression is displayed).
organisms = graph.run(query)
organisms.to_data_frame()

Unnamed: 0,name,scientificName,taxonomy
0,SARS-CoV-2,,taxonomy:2697049
1,MERS-CoV,,taxonomy:1335626
2,SARS-CoV,,taxonomy:694009
3,human,,taxonomy:9606
4,intermediate horseshoe bat,,taxonomy:59477
5,Malayan horseshoe bat,,taxonomy:608659
6,horseshoe bat,,taxonomy:49442
7,Malayan pangolin,,taxonomy:9974
8,palm civet,,taxonomy:71116
9,carnivores,,taxonomy:9608


### List Coronavirus Outbreaks

In [36]:
# Cypher: follow CAUSES relationships from Pathogen nodes to Outbreak nodes and
# report the pathogen alongside the outbreak id and start date.
query = """
MATCH (p:Pathogen)-[:CAUSES]->(o:Outbreak)
RETURN p.name as name, p.scientificName as scientificName, p.id as taxonomy, o.id as outbreak, o.startDate as startDate
"""
# Run the query, then render the result as a DataFrame (last expression is displayed).
outbreaks = graph.run(query)
outbreaks.to_data_frame()

Unnamed: 0,name,scientificName,taxonomy,outbreak,startDate
0,SARS-CoV-2,,taxonomy:2697049,COVID-19,2019
1,MERS-CoV,,taxonomy:1335626,MERS,2012
2,SARS-CoV,,taxonomy:694009,SARS,2003


### List Strains that are mentioned in PubMed Central Articles

In [37]:
# Cypher: publications that MENTION a strain, joined with the host that CARRIES
# that strain, ordered by the strain's collection date.
query = """
MATCH (p:Publication)-[:MENTIONS]->(s:Strain)<-[:CARRIES]-(h:Host)
RETURN p.id as pmc, s.name as name, s.collectionDate  as collectionDate, h.name as host, h.id as host_id
ORDER by s.collectionDate
"""
# Materialize the full result, then preview only the first 20 rows.
strain_mentions = graph.run(query).to_data_frame()
strain_mentions.head(20)

Unnamed: 0,pmc,name,collectionDate,host,host_id
0,pmc:PMC7166773,TG13,2013-07-24,intermediate horseshoe bat,taxonomy:59477
1,pmc:PMC7166309,TG13,2013-07-24,intermediate horseshoe bat,taxonomy:59477
2,pmc:PMC7095418,TG13,2013-07-24,intermediate horseshoe bat,taxonomy:59477
3,pmc:PMC7118693,TG13,2013-07-24,intermediate horseshoe bat,taxonomy:59477
4,pmc:PMC7205519,TG13,2013-07-24,intermediate horseshoe bat,taxonomy:59477
5,pmc:PMC7194065,TG13,2013-07-24,intermediate horseshoe bat,taxonomy:59477
6,pmc:PMC7156227,TG13,2013-07-24,intermediate horseshoe bat,taxonomy:59477
7,pmc:PMC7182198,TG13,2013-07-24,intermediate horseshoe bat,taxonomy:59477
8,pmc:PMC7230814,TG13,2013-07-24,intermediate horseshoe bat,taxonomy:59477
9,pmc:PMC7106073,TG13,2013-07-24,intermediate horseshoe bat,taxonomy:59477


### List Gene and Protein information for Reference Genome
This query lists the genes and proteins encoded by the SARS-CoV-2 reference genome. This is the first genome of SARS-CoV-2 collected in Wuhan on Dec. 5, 2019.

In [38]:
# Cypher: walk Strain -[HAS]-> Gene -[ENCODES]-> Protein and report the ids and
# names found along the path, ordered by the strain's collection date.
query = """
MATCH (s:Strain)-[:HAS]->(g:Gene)-[:ENCODES]->(p:Protein)
RETURN s.id as referenceGenome, s.name as name, s.collectionDate  as collectionDate, 
       g.name as gene, g.id as geneId, p.name as protein, p.id as protein_id 
ORDER by s.collectionDate
"""
# Run the query, then render the result as a DataFrame (last expression is displayed).
genes_and_proteins = graph.run(query)
genes_and_proteins.to_data_frame()

Unnamed: 0,referenceGenome,name,collectionDate,gene,geneId,protein,protein_id
0,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-09,ORF1ab,ncbigene:43740578-266-21555,Replicase polyprotein 1ab,md5:e6608b50fcd6e004708a875615ddf2d9
1,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-09,ORF1ab,ncbigene:43740578-266-13483,Replicase polyprotein 1a,md5:e781b58591b8dbdd15f84dcbdec82105
2,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-09,S,ncbigene:43740568-21563-25384,Spike glycoprotein,md5:4c35f09aac2f7be4f3cffd30c6aecac8
3,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-09,ORF3a,ncbigene:43740569-25393-26220,Protein 3a,md5:f5c8b89ceac3f14e456577557df1ef40
4,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-09,E,ncbigene:43740570-26245-26472,Envelope small membrane protein,md5:375e0f905c315e06a99c80b736c125d2
5,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-09,M,ncbigene:43740571-26523-27191,Membrane protein,md5:1cd6abff79ad3633e17582eb0e576539
6,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-09,ORF6,ncbigene:43740572-27202-27387,Non-structural protein 6,md5:90b50e0be9abd893bd32b163d6933f0c
7,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-09,ORF7a,ncbigene:43740573-27394-27759,Protein 7a,md5:f65213344e2e68de1cae4feb9c5e07b1
8,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-09,ORF7b,ncbigene:43740574-27756-27887,Protein non-structural 7b,md5:c7f0179da4ca26456ee905081f485cc8
9,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-09,ORF8,ncbigene:43740577-27894-28259,Non-structural protein 8,md5:3a77ad9207d29beacfbf6f18b0da9e26


### Find Mutations in the SARS-CoV-2 "S" gene
The S gene encodes the spike protein, which is translated by the host cell (human or other species) and binds the human ACE2 receptor to gain entry into the cell. Here we compare the missense mutations found at different locations.

In [39]:
# Parameterized Cypher query; it is only defined here and must be run with a
# `region` parameter bound to $region.
# Pattern: the Gene named 'S' HAS_VARIANT a Variant whose variantConsequence is
# 'missense_variant'; the same Variant is linked by HAS_VARIANT from a Strain that
# was FOUND_IN a Location, and that Location reaches the USRegion named $region
# through a variable-length chain of IN relationships ([:IN*]).
# Returns the variant name, location name, and region name, sorted by variant
# name and then location name.
query = """
MATCH (g:Gene{name:'S'})-[:HAS_VARIANT]->(v:Variant{variantConsequence:'missense_variant'})
<-[:HAS_VARIANT]-(s:Strain)-[:FOUND_IN]->(l:Location)-[:IN*]->(r:USRegion{name:$region}) 
RETURN v.name AS mutation, l.name AS location, r.name AS region
ORDER BY v.name, l.name
"""

##### Variants in the US West Region

In [40]:
# Bind $region for the current `query` and preview the first ten result rows.
region = 'West Region'
west_variants = graph.run(query, {"region": region}).to_data_frame()
west_variants.head(10)

Unnamed: 0,mutation,location,region
0,S:c.1042Gca>Aca,Washington,West Region
1,S:c.1042Gca>Aca,Washington,West Region
2,S:c.1151cCt>cTt,King County,West Region
3,S:c.1151cCt>cTt,San Diego,West Region
4,S:c.1153Act>Gct,Washington County,West Region
5,S:c.1183Gtc>Atc,San Diego,West Region
6,S:c.1214gAt>gTt,Arizona,West Region
7,S:c.1240Caa>Gaa,Arizona,West Region
8,S:c.1240Caa>Gaa,Arizona,West Region
9,S:c.1240Caa>Gaa,Arizona,West Region


##### Variants in the US Northeast Region

In [41]:
# Bind $region for the current `query` and preview the first ten result rows.
region = 'Northeast Region'
northeast_variants = graph.run(query, {"region": region}).to_data_frame()
northeast_variants.head(10)

Unnamed: 0,mutation,location,region
0,S:c.1076aGc>aAc,Queens,Northeast Region
1,S:c.1150Cct>Tct,Queens,Northeast Region
2,S:c.1150Cct>Tct,Queens,Northeast Region
3,S:c.13Ctt>Ttt,Brooklyn,Northeast Region
4,S:c.13Ctt>Ttt,Brooklyn,Northeast Region
5,S:c.13Ctt>Ttt,Brooklyn,Northeast Region
6,S:c.13Ctt>Ttt,Hudson County,Northeast Region
7,S:c.13Ctt>Ttt,Manhattan,Northeast Region
8,S:c.13Ctt>Ttt,Massachusetts,Northeast Region
9,S:c.13Ctt>Ttt,Massachusetts,Northeast Region


## List COVID Case Data

### Cases in a specific County (Admin2) 

#### At a specific date

In [42]:
# Current calendar date in UTC. A timezone-aware "now" is used because
# datetime.datetime.utcnow() is deprecated since Python 3.12.
today = datetime.datetime.now(datetime.timezone.utc).date()
# The day before `today`; the case queries below are run for this date.
yesterday = today - datetime.timedelta(days=1)

In [43]:
# Name of the Admin2 node to report on.
admin2 = 'San Diego County'

# Cypher: the Cases node dated $day that is REPORTED_IN the Admin2 node named
# $admin2, with its cumulative confirmed and death counts.
query = """
MATCH (c:Cases{date: date($day)})-[:REPORTED_IN]->(a:Admin2{name: $admin2})
RETURN a.name as name, c.cummulativeConfirmed as confirmed, c.cummulativeDeaths as deaths, c.date as dateUTC
"""
# Bind both query parameters, run, and render as a DataFrame.
county_cases = graph.run(query, {"admin2": admin2, "day": yesterday})
county_cases.to_data_frame()

Unnamed: 0,name,confirmed,deaths,dateUTC
0,San Diego County,8619,296,2020-06-08


#### All available dates

In [44]:
# Cypher: every Cases node REPORTED_IN the Admin2 node named $admin2, newest
# date first.
query = """
MATCH (c:Cases)-[:REPORTED_IN]->(a:Admin2{name: $admin2})
RETURN a.name as name, c.cummulativeConfirmed as confirmed, c.cummulativeDeaths as deaths, c.date as dateUTC
ORDER BY c.date DESC
"""
# Bind $admin2, run, and render as a DataFrame.
county_history = graph.run(query, {"admin2": admin2})
county_history.to_data_frame()

Unnamed: 0,name,confirmed,deaths,dateUTC
0,San Diego County,8619,296,2020-06-08
1,San Diego County,8345,296,2020-06-07
2,San Diego County,8345,296,2020-06-06
3,San Diego County,7940,288,2020-06-05
4,San Diego County,7798,283,2020-06-04
5,San Diego County,7674,276,2020-06-03
6,San Diego County,7554,269,2020-06-02
7,San Diego County,7481,269,2020-06-01
8,San Diego County,7481,269,2020-05-31
9,San Diego County,7385,269,2020-05-30


### Aggregate cases by State (Admin1)

In [45]:
# Cypher: for the Outbreak with id "COVID-19", take the Cases nodes dated $date
# that are REPORTED_IN an Admin2 node, and sum their cumulative counts per parent
# Admin1 node (Admin2 -[IN]-> Admin1), sorted by deaths in descending order.
query = """
MATCH (o:Outbreak{id: "COVID-19"})<-[:RELATED_TO]-(c:Cases{date: $date})-[:REPORTED_IN]->(a:Admin2)-[:IN]->(a1:Admin1)
RETURN a1.name as state, sum(c.cummulativeDeaths) as deaths, sum(c.cummulativeConfirmed) as confirmed, c.date as dateUTC
ORDER BY deaths DESC
"""
# Bind $date, run, and render as a DataFrame.
state_totals = graph.run(query, {"date": yesterday})
state_totals.to_data_frame()

Unnamed: 0,state,deaths,confirmed,dateUTC
0,New York,30417,378799,2020-06-08
1,New Jersey,12214,164212,2020-06-08
2,Massachusetts,7344,103262,2020-06-08
3,Pennsylvania,5953,80339,2020-06-08
4,Illinois,5828,126915,2020-06-08
5,Michigan,5494,53745,2020-06-08
6,California,4614,131037,2020-06-08
7,Connecticut,4084,43820,2020-06-08
8,Florida,2675,63875,2020-06-08
9,,2513,50503,2020-06-08
