# Example queries for biological entities on COVID-19 Knowledge Graph
[Work in progress]

This notebook demonstrates how to run Cypher queries in a Jupyter Notebook by connecting to a database server.

In [1]:
import datetime
import os

import pandas as pd
from py2neo import Graph

In [2]:
# Lift pandas' display limits so result tables are never truncated.
pd.set_option("display.max_rows", None)  # display all rows
pd.set_option("display.max_columns", None)  # display all columns

In [3]:
# Connection to the Neo4j server that hosts the knowledge graph.
# Each setting can be overridden through an environment variable so the notebook
# can be pointed at another instance without editing this cell; when a variable
# is unset, the original endpoint and account are used.
# NOTE(review): "reader"/"demo" looks like a shared read-only demo login — confirm
# before reusing this pattern with real credentials.
graph = Graph(
    os.environ.get("COVID19_KG_URI", "bolt://132.249.238.185:7687"),
    user=os.environ.get("COVID19_KG_USER", "reader"),
    password=os.environ.get("COVID19_KG_PASSWORD", "demo"),
)

### List Node Metadata

In [4]:
# Fetch the name, descriptions, example and details of every NodeMetadata node.
query = """
MATCH (n:NodeMetadata)
RETURN n.name, n.shortDescription, n.description, n.example, n.details
"""
node_metadata = graph.run(query).to_data_frame()
node_metadata

Unnamed: 0,n.name,n.shortDescription,n.description,n.example,n.details
0,Location,Geographic location,A geograpic location,"World, ..., Country, State, Country, City, Cru...",
1,World,The World,Top level location,,
2,UNRegion,Continental regions,Continental regions according to the M49 stan...,Americas,https://unstats.un.org/unsd/methodology/m49/
3,UNSubRegion,Subcontinental regions,Subcontinental regions according to the M49 st...,Latin America and the Caribbean,https://unstats.un.org/unsd/methodology/m49/
4,UNIntermediateRegion,Subdivisions of subcontinental regions,Subdivisions of subcontinental regions accordi...,Caribbean,https://unstats.un.org/unsd/methodology/m49/
5,Country,Countries and dependent Territories,Countries and dependent Territories defined b...,United States,http://www.geonames.org/
6,Admin1,"State, Province, Municipality","First administrative divisions, e.g, State, Pr...",California,http://www.geonames.org/
7,Admin2,County,Second administrative divisions: County in the US,San Diego County,http://www.geonames.org/
8,City,City,City,San Diego,http://www.geonames.org/
9,CruiseShip,Cruise ship,Cruise ship,Diamond Princess,http://www.productontology.org/doc/Cruise_ship


### List Organisms in KG

In [5]:
# Name, scientific name and id (returned as "taxonomy") of each Organism node.
query = """
MATCH (p:Organism)
RETURN p.name as name, p.scientificName as scientificName, p.id as taxonomy
"""
organisms = graph.run(query).to_data_frame()
organisms

Unnamed: 0,name,scientificName,taxonomy
0,SARS-CoV-2,,taxonomy:2697049
1,MERS-CoV,,taxonomy:1335626
2,SARS-CoV,,taxonomy:694009
3,human,,taxonomy:9606
4,house mouse,,taxonomy:10090
5,intermediate horseshoe bat,,taxonomy:59477
6,Malayan horseshoe bat,,taxonomy:608659
7,horseshoe bat,,taxonomy:49442
8,Malayan pangolin,,taxonomy:9974
9,palm civet,,taxonomy:71116


### List Coronavirus Outbreaks

In [6]:
# Pair each Pathogen with the Outbreak it CAUSES, including the outbreak's start date.
query = """
MATCH (p:Pathogen)-[:CAUSES]->(o:Outbreak)
RETURN p.name as name, p.scientificName as scientificName, p.id as taxonomy, o.id as outbreak, o.startDate as startDate
"""
outbreaks = graph.run(query).to_data_frame()
outbreaks

Unnamed: 0,name,scientificName,taxonomy,outbreak,startDate
0,SARS-CoV-2,,taxonomy:2697049,COVID-19,2019
1,MERS-CoV,,taxonomy:1335626,MERS,2012
2,SARS-CoV,,taxonomy:694009,SARS,2003


### List Strains that are mentioned in PubMed Central Articles

In [7]:
# Strains mentioned by publications, together with the host that carries each
# strain, sorted by ascending collection date.
# The 20-row cap is applied by the database (LIMIT) rather than by .head(20) on
# the client, so only the rows that are displayed are transferred from the server.
query = """
MATCH (p:Publication)-[:MENTIONS]->(s:Strain)<-[:CARRIES]-(h:Host)
RETURN p.id as pmc, s.name as name, s.collectionDate as collectionDate, h.name as host, h.id as host_id
ORDER BY s.collectionDate
LIMIT 20
"""
graph.run(query).to_data_frame()

Unnamed: 0,pmc,name,collectionDate,host,host_id
0,pmc:PMC7166309,BetaCoV/pangolin/Guangxi/P1E/2017,2017-01-01,Malayan pangolin,taxonomy:9974
1,pmc:PMC7166309,BetaCoV/pangolin/Guangxi/P4L/2017,2017-01-01,Malayan pangolin,taxonomy:9974
2,pmc:PMC7166309,BetaCoV/pangolin/Guangxi/P5E/2017,2017-01-01,Malayan pangolin,taxonomy:9974
3,pmc:PMC7166309,BetaCoV/pangolin/Guangxi/P2V/2017,2017-01-01,Malayan pangolin,taxonomy:9974
4,pmc:PMC7166309,BetaCoV/pangolin/Guangxi/P5L/2017,2017-01-01,Malayan pangolin,taxonomy:9974
5,pmc:PMC7228214,BetaCoV/pangolin/Guangxi/P4L/2017,2017-01-01,Malayan pangolin,taxonomy:9974
6,pmc:PMC7166309,BetaCoV/pangolin/Guandong/1/2020,2019-01-01,Malayan pangolin,taxonomy:9974
7,pmc:PMC7228214,BetaCoV/pangolin/Guandong/1/2020,2019-01-01,Malayan pangolin,taxonomy:9974
8,pmc:PMC7205519,BetaCoV/pangolin/Guandong/1/2020,2019-01-01,Malayan pangolin,taxonomy:9974
9,pmc:PMC7256558,hCoV-19/bat/Yunnan/RmYN02/2019,2019-06-25,Malayan horseshoe bat,taxonomy:608659


### List Gene and Protein information for Reference Genome
This query lists the genes and proteins encoded by the SARS-CoV-2 reference genome. This is the first genome of SARS-CoV-2 collected in Wuhan on Dec. 5, 2019.

In [8]:
# For every Strain that HAS genes, list each Gene and the Protein it ENCODES,
# sorted by the strain's collection date.
query = """
MATCH (s:Strain)-[:HAS]->(g:Gene)-[:ENCODES]->(p:Protein)
RETURN s.id as referenceGenome, s.name as name, s.collectionDate  as collectionDate, 
       g.name as gene, g.id as geneId, p.name as protein, p.id as protein_id 
ORDER by s.collectionDate
"""
genes_and_proteins = graph.run(query).to_data_frame()
genes_and_proteins

Unnamed: 0,referenceGenome,name,collectionDate,gene,geneId,protein,protein_id
0,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-10,ORF1ab,ncbigene:43740578-266-21555,Replicase polyprotein 1ab,md5:e6608b50fcd6e004708a875615ddf2d9
1,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-10,ORF1ab,ncbigene:43740578-266-13483,Replicase polyprotein 1a,md5:e781b58591b8dbdd15f84dcbdec82105
2,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-10,S,ncbigene:43740568-21563-25384,Spike glycoprotein,md5:4c35f09aac2f7be4f3cffd30c6aecac8
3,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-10,ORF3a,ncbigene:43740569-25393-26220,Protein 3a,md5:f5c8b89ceac3f14e456577557df1ef40
4,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-10,E,ncbigene:43740570-26245-26472,Envelope small membrane protein,md5:375e0f905c315e06a99c80b736c125d2
5,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-10,M,ncbigene:43740571-26523-27191,Membrane protein,md5:1cd6abff79ad3633e17582eb0e576539
6,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-10,ORF6,ncbigene:43740572-27202-27387,Non-structural protein 6,md5:90b50e0be9abd893bd32b163d6933f0c
7,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-10,ORF7a,ncbigene:43740573-27394-27759,Protein 7a,md5:f65213344e2e68de1cae4feb9c5e07b1
8,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-10,ORF7b,ncbigene:43740574-27756-27887,Protein non-structural 7b,md5:c7f0179da4ca26456ee905081f485cc8
9,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-10,ORF8,ncbigene:43740577-27894-28259,Non-structural protein 8,md5:3a77ad9207d29beacfbf6f18b0da9e26


## List COVID Case Data

### Cases in a specific County (Admin2) 

#### At a specific date

In [9]:
# Current calendar date in UTC. A timezone-aware clock is used because
# datetime.datetime.utcnow() is deprecated since Python 3.12; the resulting
# date is the same.
today = datetime.datetime.now(datetime.timezone.utc).date()
# The UTC day before `today`.
yesterday = today - datetime.timedelta(days=1)

In [10]:
admin2 = 'San Diego County'

# Cumulative confirmed cases and deaths reported in one Admin2 region on one day.
# $admin2 and $day are bound as query parameters rather than formatted into the text.
query = """
MATCH (c:Cases{date: date($day)})-[:REPORTED_IN]->(a:Admin2{name: $admin2})
RETURN a.name as name, c.cummulativeConfirmed as confirmed, c.cummulativeDeaths as deaths, c.date as dateUTC
"""
county_cases_on_day = graph.run(query, day=yesterday, admin2=admin2).to_data_frame()
county_cases_on_day

Unnamed: 0,name,confirmed,deaths,dateUTC
0,San Diego County,8619,296,2020-06-09


#### All available dates

In [11]:
# Every Cases record reported in the Admin2 region named by `admin2`
# (set in an earlier cell), most recent date first.
query = """
MATCH (c:Cases)-[:REPORTED_IN]->(a:Admin2{name: $admin2})
RETURN a.name as name, c.cummulativeConfirmed as confirmed, c.cummulativeDeaths as deaths, c.date as dateUTC
ORDER BY c.date DESC
"""
county_case_history = graph.run(query, admin2=admin2).to_data_frame()
county_case_history

Unnamed: 0,name,confirmed,deaths,dateUTC
0,San Diego County,8619,296,2020-06-09
1,San Diego County,8619,296,2020-06-08
2,San Diego County,8345,296,2020-06-07
3,San Diego County,8345,296,2020-06-06
4,San Diego County,7940,288,2020-06-05
5,San Diego County,7798,283,2020-06-04
6,San Diego County,7674,276,2020-06-03
7,San Diego County,7554,269,2020-06-02
8,San Diego County,7481,269,2020-06-01
9,San Diego County,7481,269,2020-05-31


### Aggregate cases by State (Admin1)

In [12]:
# Sum the Admin2-level cumulative counts of the COVID-19 outbreak per parent
# Admin1 name for a single day, largest death total first.
query = """
MATCH (o:Outbreak{id: "COVID-19"})<-[:RELATED_TO]-(c:Cases{date: $date})-[:REPORTED_IN]->(a:Admin2)-[:IN]->(a1:Admin1)
RETURN a1.name as state, sum(c.cummulativeDeaths) as deaths, sum(c.cummulativeConfirmed) as confirmed, c.date as dateUTC
ORDER BY deaths DESC
"""
state_totals = graph.run(query, date=yesterday).to_data_frame()
state_totals

Unnamed: 0,state,deaths,confirmed,dateUTC
0,New York,30458,379482,2020-06-09
1,New Jersey,12303,164541,2020-06-09
2,Massachusetts,7397,103534,2020-06-09
3,Pennsylvania,6014,80870,2020-06-09
4,Illinois,5919,127678,2020-06-09
5,Michigan,5821,60009,2020-06-09
6,California,4701,133848,2020-06-09
7,Connecticut,4097,43893,2020-06-09
8,Florida,2728,64951,2020-06-09
9,,2554,51020,2020-06-09
