# Example queries for biological entities on COVID-19 Knowledge Graph
[Work in progress]

This notebook demonstrates how to run Cypher queries in a Jupyter Notebook by connecting to a database server.

In [45]:
import os
import time
import pandas as pd
from py2neo import Graph

In [46]:
# Lift pandas' truncation limits so query results render in full:
# passing None removes the cap on displayed rows and on displayed columns.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [47]:
# Connection settings for the knowledge graph database server.
# Each value may be overridden with an environment variable so that the
# endpoint or account can be changed without editing the notebook; the
# defaults are the server address and reader/demo account used originally.
KG_URI = os.environ.get("COVID19_KG_URI", "bolt://132.249.238.185:7687")
KG_USER = os.environ.get("COVID19_KG_USER", "reader")
KG_PASSWORD = os.environ.get("COVID19_KG_PASSWORD", "demo")

# `graph` is the handle every query cell below uses via graph.run(...).
graph = Graph(KG_URI, user=KG_USER, password=KG_PASSWORD)

### List Node Metadata

In [48]:
# Return the descriptive properties (name, short/long description, example,
# details) of every NodeMetadata node.
query = """
MATCH (n:NodeMetadata)
RETURN n.name, n.shortDescription, n.description, n.example, n.details
"""
node_metadata = graph.run(query).to_data_frame()
node_metadata

Unnamed: 0,n.name,n.shortDescription,n.description,n.example,n.details
0,Location,Geographic location,A geograpic location,"World, ..., Country, State, Country, City, Cru...",
1,World,The World,Top level location,,
2,UNRegion,Continental regions,Continental regions according to the M49 stan...,Americas,https://unstats.un.org/unsd/methodology/m49/
3,UNSubRegion,Subcontinental regions,Subcontinental regions according to the M49 st...,Latin America and the Caribbean,https://unstats.un.org/unsd/methodology/m49/
4,UNIntermediateRegion,Subdivisions of subcontinental regions,Subdivisions of subcontinental regions accordi...,Caribbean,https://unstats.un.org/unsd/methodology/m49/
5,Country,Countries and dependent Territories,Countries and dependent Territories defined b...,United States,http://www.geonames.org/
6,Admin1,"State, Province, Municipality","First administrative divisions, e.g, State, Pr...",California,http://www.geonames.org/
7,Admin2,County,Second administrative divisions: County in the US,San Diego County,http://www.geonames.org/
8,City,City,City,San Diego,http://www.geonames.org/
9,USRegion,Regions of the US,Regions defined by the US Census Bureau,West Region,https://www.census.gov/geographies/reference-f...


### List Organisms in KG

In [49]:
# One row per Organism node: its name, scientific name, and id
# (the id is returned under the column name "taxonomy").
query = """
MATCH (p:Organism)
RETURN p.name as name, p.scientificName as scientificName, p.id as taxonomy
"""
organisms = graph.run(query).to_data_frame()
organisms

Unnamed: 0,name,scientificName,taxonomy
0,SARS-CoV-2,Severe acute respiratory syndrome coronavirus 2,taxonomy:2697049
1,MERS-CoV,Middle East respiratory syndrome-related coron...,taxonomy:1335626
2,SARS-CoV,Severe acute respiratory syndrome-related coro...,taxonomy:694009
3,human,Homo sapiens,taxonomy:9606
4,intermediate horseshoe bat,Rhinolophus affinis,taxonomy:59477
5,Malayan horseshoe bat,Rhinolophus malayanus,taxonomy:608659
6,horseshoe bat,Rhinolophus,taxonomy:49442
7,Malayan pangolin,Manis javanica,taxonomy:9974
8,palm civet,Paradoxurus,taxonomy:71116
9,carnivores,Canidae,taxonomy:9608


### List Coronavirus Outbreaks

In [50]:
# Pair each Organism with the Outbreak it is linked to through a CAUSES
# relationship, returning the outbreak id and start date alongside the
# organism's name, scientific name, and id.
query = """
MATCH (p:Organism)-[:CAUSES]->(o:Outbreak)
RETURN p.name as name, p.scientificName as scientificName, p.id as taxonomy, o.id as outbreak, o.startDate as startDate
"""
outbreaks = graph.run(query).to_data_frame()
outbreaks

Unnamed: 0,name,scientificName,taxonomy,outbreak,startDate
0,SARS-CoV-2,Severe acute respiratory syndrome coronavirus 2,taxonomy:2697049,COVID-19,2019
1,MERS-CoV,Middle East respiratory syndrome-related coron...,taxonomy:1335626,MERS,2012
2,SARS-CoV,Severe acute respiratory syndrome-related coro...,taxonomy:694009,SARS,2003


### List Strains that are mentioned in PubMed Central Articles

In [51]:
# Strains that a Publication MENTIONS, together with the Organism that
# CARRIES each strain, ordered by the strain's collection date.
# s.id is the id of the Strain node itself (not of the host organism), so it
# is returned as "strain_id"; the host is identified by o.name.
query = """
MATCH (p:Publication)-[:MENTIONS]->(s:Strain)<-[:CARRIES]-(o:Organism)
RETURN p.id as pmc, s.name as name, s.collectionDate  as collectionDate, o.name as host, s.id as strain_id
ORDER by s.collectionDate
"""
# Show only the first 20 rows of the ordered result.
graph.run(query).to_data_frame().head(20)

Unnamed: 0,pmc,name,collectionDate,host,host_id
0,pmc:PMC7106203,BetaCoV/Wuhan/WH-01/2019,2019-12-26,human,https://www.gisaid.org/EPI_ISL_406798
1,pmc:PMC7106203,BetaCoV/Wuhan/IPBCAMS-WH-03/2019,2019-12-30,human,https://www.gisaid.org/EPI_ISL_403930
2,pmc:PMC7106203,WIV02,2019-12-30,human,https://www.gisaid.org/EPI_ISL_402127
3,pmc:PMC7106203,WIV05,2019-12-30,human,https://www.gisaid.org/EPI_ISL_402128
4,pmc:PMC7045880,BetaCoV/Wuhan/IVDC-HB-01/2019,2019-12-30,human,https://www.gisaid.org/EPI_ISL_402119
5,pmc:PMC7224157,WIV04,2019-12-30,human,https://www.gisaid.org/EPI_ISL_402124
6,pmc:PMC7228367,Wuhan-Hu-1,2019-12-30,human,https://www.gisaid.org/EPI_ISL_402125
7,pmc:PMC7228367,BetaCoV/Wuhan/IPBCAMS-WH-03/2019,2019-12-30,human,https://www.gisaid.org/EPI_ISL_403930
8,pmc:PMC7228367,WIV04,2019-12-30,human,https://www.gisaid.org/EPI_ISL_402124
9,pmc:PMC7108203,BetaCoV/Wuhan/IVDC-HB-05/2019,2019-12-30,human,https://www.gisaid.org/EPI_ISL_402121


### List Gene and Protein information for Reference Genome
This query lists the genes and proteins encoded by the SARS-CoV-2 reference genome. This is the first genome of SARS-CoV-2 collected in Wuhan on Dec. 5, 2019.

In [52]:
# Follow Strain -[:HAS]-> Gene -[:ENCODES]-> Protein and list, per match, the
# strain (id, name, collection date), the gene (name, id) and the protein
# (name, id), ordered by the strain's collection date.
query = """
MATCH (s:Strain)-[:HAS]->(g:Gene)-[:ENCODES]->(p:Protein)
RETURN s.id as referenceGenome, s.name as name, s.collectionDate  as collectionDate, 
       g.name as gene, g.id as geneId, p.name as protein, p.id as protein_id 
ORDER by s.collectionDate
"""
genome_proteins = graph.run(query).to_data_frame()
genome_proteins

Unnamed: 0,referenceGenome,name,collectionDate,gene,geneId,protein,protein_id
0,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-06,ORF1ab,ncbigene:4374057826621555,Replicase polyprotein 1ab,md5:e6608b50fcd6e004708a875615ddf2d9
1,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-06,ORF1ab,ncbigene:4374057826613483,Replicase polyprotein 1a,md5:e781b58591b8dbdd15f84dcbdec82105
2,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-06,S,ncbigene:437405682156325384,Spike glycoprotein,md5:4c35f09aac2f7be4f3cffd30c6aecac8
3,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-06,ORF3a,ncbigene:437405692539326220,Protein 3a,md5:f5c8b89ceac3f14e456577557df1ef40
4,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-06,E,ncbigene:437405702624526472,Envelope small membrane protein,md5:375e0f905c315e06a99c80b736c125d2
5,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-06,M,ncbigene:437405712652327191,Membrane protein,md5:1cd6abff79ad3633e17582eb0e576539
6,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-06,ORF6,ncbigene:437405722720227387,Non-structural protein 6,md5:90b50e0be9abd893bd32b163d6933f0c
7,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-06,ORF7a,ncbigene:437405732739427759,Protein 7a,md5:f65213344e2e68de1cae4feb9c5e07b1
8,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-06,ORF7b,ncbigene:437405742775627887,Protein non-structural 7b,md5:c7f0179da4ca26456ee905081f485cc8
9,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-06,ORF8,ncbigene:437405772789428259,Non-structural protein 8,md5:3a77ad9207d29beacfbf6f18b0da9e26


### Find Mutations in the SARS-CoV-2 "S" gene
The S gene encodes the spike protein. It is translated by the host cell (human or other species), and the resulting spike protein binds the human ACE2 receptor to gain entry into the cell. Here we compare the missense mutations found in strains from different locations.

In [53]:
# Parameterised Cypher query (this cell only defines it; nothing is executed).
# It matches missense variants of the gene named 'S' that are also linked to a
# Strain, where that strain is FOUND_IN a Location connected through one or
# more IN relationships to the USRegion whose name equals the $region parameter.
# Rows are (mutation, location, region), ordered by mutation then location.
# The cells that follow run this same `query` with different `region` values,
# so this cell must be executed before them.
query = """
MATCH (g:Gene{name:'S'})-[:HAS_VARIANT]->(v:Variant{variantConsequence:'missense_variant'})
<-[:HAS_VARIANT]-(s:Strain)-[:FOUND_IN]->(l:Location)-[:IN*]->(r:USRegion{name:$region}) 
RETURN v.name AS mutation, l.name AS location, r.name AS region
ORDER BY v.name, l.name
"""

##### Variants in the US West Region

In [54]:
# Run the previously defined `query` with $region bound to the West Region
# and display at most the first 100 rows.
region = 'West Region'
west_variants = graph.run(query, region=region).to_data_frame()
west_variants.head(100)

Unnamed: 0,mutation,location,region
0,S:c.1042Gca>Aca,Washington,West Region
1,S:c.1042Gca>Aca,Washington,West Region
2,S:c.1151cCt>cTt,King County,West Region
3,S:c.1151cCt>cTt,San Diego,West Region
4,S:c.1153Act>Gct,Washington County,West Region
5,S:c.1183Gtc>Atc,San Diego,West Region
6,S:c.1214gAt>gTt,Arizona,West Region
7,S:c.1240Caa>Gaa,Arizona,West Region
8,S:c.1240Caa>Gaa,Arizona,West Region
9,S:c.1240Caa>Gaa,Arizona,West Region


##### Variants in the US Northeast Region

In [55]:
# Run the previously defined `query` with $region bound to the Northeast Region
# and display at most the first 100 rows.
region = 'Northeast Region'
northeast_variants = graph.run(query, region=region).to_data_frame()
northeast_variants.head(100)

Unnamed: 0,mutation,location,region
0,S:c.1076aGc>aAc,Queens,Northeast Region
1,S:c.1150Cct>Tct,Queens,Northeast Region
2,S:c.1150Cct>Tct,Queens,Northeast Region
3,S:c.13Ctt>Ttt,Brooklyn,Northeast Region
4,S:c.13Ctt>Ttt,Brooklyn,Northeast Region
5,S:c.13Ctt>Ttt,Brooklyn,Northeast Region
6,S:c.13Ctt>Ttt,Hudson County,Northeast Region
7,S:c.13Ctt>Ttt,Manhattan,Northeast Region
8,S:c.13Ctt>Ttt,Massachusetts,Northeast Region
9,S:c.13Ctt>Ttt,Massachusetts,Northeast Region


### Cases in a specific County (Admin2)

In [57]:
# Cases nodes dated 2020-06-04 that are REPORTED_IN the Admin2 node whose name
# equals the $admin2 parameter; returns that name with the cumulative confirmed
# and cumulative death counts stored on the Cases node.
query = """
MATCH (c:Cases{date: date("2020-06-04")})-[:REPORTED_IN]->(a:Admin2{name: $admin2})
RETURN a.name as name, c.cummulativeConfirmed as confirmed, c.cummulativeDeaths as deaths
"""

# County to look up, passed to the query as the $admin2 parameter.
admin2 = 'San Diego County'
county_cases = graph.run(query, admin2=admin2).to_data_frame()
county_cases

Unnamed: 0,name,confirmed,deaths
0,San Diego County,7798,283


### Aggregate cases by State (Admin1)

In [29]:
# For Cases nodes dated 2020-06-04 that are RELATED_TO the "COVID-19" Outbreak
# and REPORTED_IN an Admin2, follow the Admin2's IN relationship to its Admin1
# and sum cumulative deaths and confirmed counts per Admin1 name.
# Rows are sorted by the summed deaths, largest first.
query = """
MATCH (o:Outbreak{id: "COVID-19"})<-[:RELATED_TO]-(c:Cases{date: date("2020-06-04")})-[:REPORTED_IN]->(a:Admin2)-[:IN]->(a1:Admin1)
RETURN a1.name as state, sum(c.cummulativeDeaths) as deaths, sum(c.cummulativeConfirmed) as confirmed
ORDER BY deaths DESC
"""
state_totals = graph.run(query).to_data_frame()
state_totals

Unnamed: 0,state,deaths,confirmed
0,New York,30174,375133
1,New Jersey,11970,162104
2,Massachusetts,7190,101698
3,Pennsylvania,5832,78335
4,Illinois,5644,123329
5,Michigan,5475,53474
6,California,4401,119524
7,Connecticut,4007,42998
8,Florida,2570,59291
9,,2435,48191
