# Example queries run on COVID-19 Knowledge Graph on Server
[Work in progress]

This notebook demonstrates how to run Cypher queries in a Jupyter Notebook by connecting to a database server.

In [1]:
import os
import time
import pandas as pd
from py2neo import Graph

In [2]:
# Lift pandas' display limits so query results are rendered without truncation.
pd.set_option("display.max_rows", None)  # display all rows
pd.set_option("display.max_columns", None)  # display all columns

In [3]:
# Connect to the COVID-19 knowledge graph over Bolt.
# The defaults are the server address and the reader/demo account this notebook
# has always used; set the COVID19_KG_* environment variables to point the
# notebook at a different server or account without editing this cell.
graph = Graph(
    os.environ.get("COVID19_KG_URI", "bolt://132.249.238.185:7687"),
    user=os.environ.get("COVID19_KG_USER", "reader"),
    password=os.environ.get("COVID19_KG_PASSWORD", "demo"),
)

### List Organisms in KG

In [4]:
# One row per Organism node: common name, scientific name, and node id
# (returned under the column name "taxonomy").
query = """
MATCH (p:Organism)
RETURN p.name as name, p.scientificName as scientificName, p.id as taxonomy
"""
(
    graph
    .run(query)
    .to_data_frame()
)

Unnamed: 0,name,scientificName,taxonomy
0,SARS-CoV-2,Severe acute respiratory syndrome coronavirus 2,taxonomy:2697049
1,MERS-CoV,Middle East respiratory syndrome-related coron...,taxonomy:1335626
2,SARS-CoV,Severe acute respiratory syndrome-related coro...,taxonomy:694009
3,human,Homo sapiens,taxonomy:9606
4,intermediate horseshoe bat,Rhinolophus affinis,taxonomy:59477
5,horseshoe bat,Rhinolophus,taxonomy:49442
6,Malayan pangolin,Manis javanica,taxonomy:9974
7,palm civet,Paradoxurus,taxonomy:71116
8,carnivores,Canidae,taxonomy:9608
9,European mink,Mustela lutreola,taxonomy:9666


### List Coronavirus Outbreaks

In [5]:
# Follow CAUSES relationships from each Organism to the Outbreak it caused and
# report the organism alongside the outbreak id and start date.
query = """
MATCH (p:Organism)-[:CAUSES]->(o:Outbreak)
RETURN p.name as name, p.scientificName as scientificName, p.id as taxonomy, o.id as outbreak, o.startDate as startDate
"""
(
    graph
    .run(query)
    .to_data_frame()
)

Unnamed: 0,name,scientificName,taxonomy,outbreak,startDate
0,SARS-CoV-2,Severe acute respiratory syndrome coronavirus 2,taxonomy:2697049,COVID-19,2019
1,MERS-CoV,Middle East respiratory syndrome-related coron...,taxonomy:1335626,MERS,2012
2,SARS-CoV,Severe acute respiratory syndrome-related coro...,taxonomy:694009,SARS,2003


### List Strains that are mentioned in PubMed Central Articles

In [6]:
# Each row pairs a Publication with a Strain it mentions and the Organism that
# carries that strain (the host), ordered by the strain's collection date.
# s.id is the identifier of the strain itself, not of the host, so it is
# returned as "strainId" (it was previously mislabeled "host_id").
query = """
MATCH (p:Publication)-[:MENTIONS]->(s:Strain)<-[:CARRIES]-(o:Organism)
RETURN p.id as pmc, s.name as name, s.collectionDate as collectionDate, o.name as host, s.id as strainId
ORDER BY s.collectionDate
"""
graph.run(query).to_data_frame().head(20)  # show only the first 20 rows
# TODO where do the 2013 bat strains come from??

Unnamed: 0,pmc,name,collectionDate,host,host_id
0,pmc:PMC7166773,bat/Yunnan/RaTG13/2013,2013-07-24,intermediate horseshoe bat,https://www.gisaid.org/EPI_ISL_402131
1,pmc:PMC7095418,bat/Yunnan/RaTG13/2013,2013-07-24,intermediate horseshoe bat,https://www.gisaid.org/EPI_ISL_402131
2,pmc:PMC7166309,bat/Yunnan/RaTG13/2013,2013-07-24,intermediate horseshoe bat,https://www.gisaid.org/EPI_ISL_402131
3,pmc:PMC7118693,bat/Yunnan/RaTG13/2013,2013-07-24,intermediate horseshoe bat,https://www.gisaid.org/EPI_ISL_402131
4,pmc:PMC7156227,bat/Yunnan/RaTG13/2013,2013-07-24,intermediate horseshoe bat,https://www.gisaid.org/EPI_ISL_402131
5,pmc:PMC7106073,bat/Yunnan/RaTG13/2013,2013-07-24,intermediate horseshoe bat,https://www.gisaid.org/EPI_ISL_402131
6,pmc:PMC7166309,pangolin/Guangxi/P4L/2017,2017-05-05,Malayan pangolin,https://www.gisaid.org/EPI_ISL_410538
7,pmc:PMC7166309,pangolin/Guangxi/P5E/2017,2017-05-05,Malayan pangolin,https://www.gisaid.org/EPI_ISL_410541
8,pmc:PMC7166309,pangolin/Guangxi/P2V/2017,2017-05-05,Malayan pangolin,https://www.gisaid.org/EPI_ISL_410542
9,pmc:PMC7166309,pangolin/Guangxi/P5L/2017,2017-05-05,Malayan pangolin,https://www.gisaid.org/EPI_ISL_410540


### List Gene and Protein information for Reference Genome
This query lists the genes and proteins encoded by the SARS-CoV-2 reference genome. This is the first genome of SARS-CoV-2 collected in Wuhan on Dec. 5, 2019.

In [7]:
# Walk Strain -[:HAS]-> Gene -[:ENCODES]-> Protein and return one row per
# (strain, gene, protein) combination, ordered by strain collection date.
query = """
MATCH (s:Strain)-[:HAS]->(g:Gene)-[:ENCODES]->(p:Protein)
RETURN s.id as referenceGenome, s.name as name, s.collectionDate  as collectionDate, 
       g.name as gene, g.id as geneId, p.name as protein, p.id as protein_id 
ORDER by s.collectionDate
"""
(
    graph
    .run(query)
    .to_data_frame()
)

Unnamed: 0,referenceGenome,name,collectionDate,gene,geneId,protein,protein_id
0,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-05,ORF1ab,ncbigene:43740578,ORF1ab polyprotein,ncbiprotein:YP_009724389
1,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-05,ORF1ab,ncbigene:43740578,nsp10,ncbiprotein:YP_009725306
2,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-05,ORF1ab,ncbigene:43740578,nsp3,ncbiprotein:YP_009725299
3,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-05,ORF1ab,ncbigene:43740578,3'-to-5' exonuclease,ncbiprotein:YP_009725309
4,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-05,ORF1ab,ncbigene:43740578,nsp4,ncbiprotein:YP_009725300
5,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-05,ORF1ab,ncbigene:43740578,nsp9,ncbiprotein:YP_009725305
6,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-05,ORF1ab,ncbigene:43740578,nsp11,ncbiprotein:YP_009725312
7,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-05,ORF1ab,ncbigene:43740578,nsp8,ncbiprotein:YP_009725304
8,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-05,ORF1ab,ncbigene:43740578,nsp2,ncbiprotein:YP_009725298
9,ncbiprotein:NC_045512,Wuhan-Hu-1,2019-12-05,ORF1ab,ncbigene:43740578,nsp7,ncbiprotein:YP_009725303


### Cases in a specific County (Admin2)

In [8]:
# County (Admin2 node name) to look up; bound to the $admin2 query parameter.
admin2 = 'San Diego County'

# Cumulative confirmed cases and deaths reported in that county on 2020-05-04.
query = """
MATCH (c:Cases{date: date("2020-05-04")})-[:REPORTED_IN]->(a:Admin2{name: $admin2})
RETURN a.name as name, c.cummulativeConfirmed as confirmed, c.cummulativeDeaths as deaths
"""
query_parameters = {"admin2": admin2}
graph.run(query, query_parameters).to_data_frame()

Unnamed: 0,name,confirmed,deaths
0,San Diego County,3927,139


### Aggregate cases by State (Admin1)

In [9]:
# Sum the 2020-05-04 county-level (Admin2) COVID-19 case records up to their
# parent Admin1 region, sorted by total deaths in ascending order.
query = """
MATCH (o:Outbreak{id: "COVID-19"})<-[:RELATED_TO]-(c:Cases{date: date("2020-05-04")})-[:REPORTED_IN]->(a:Admin2)-[:IN]->(a1:Admin1)
RETURN a1.name as state, sum(c.cummulativeConfirmed) as confirmed, sum(c.cummulativeDeaths) as deaths
ORDER BY deaths
"""
(
    graph
    .run(query)
    .to_data_frame()
)

Unnamed: 0,state,confirmed,deaths
0,Rhode Island,8519,0
1,Wyoming,596,1
2,Alaska,370,9
3,Montana,457,16
4,Hawaii,612,17
5,North Dakota,1225,19
6,South Dakota,2668,21
7,Utah,4919,45
8,West Virginia,1224,48
9,Vermont,898,52
