In [1]:
from graph_vis import display_graph
from open_neo4j import open_driver, get_graph, open_gds, make_graph_projection
import pandas as pd
import numpy as np
import re
import os
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

node_pattern = re.compile(r'\(n[0-9]+:[A-Za-z]+\)')
edge_pattern = re.compile(r'\[:[A-Za-z_]+\]')

driver = open_driver()
spoke = open_gds("spoke") # spoke
genelab = open_gds("spoke-genelab-v0.0.2") # genelab database
composite = open_gds("compositenasa") # connected composite db

# MetaGraph for GeneLab DB

In [2]:
metagraph = get_graph(driver, 'call apoc.meta.graph', 'spoke-genelab-v0.0.2')
display_graph(metagraph, hide_edge=False, meta=True)

GraphWidget(layout=Layout(height='500px', width='100%'))

# Check node count in GeneLab 

In [3]:
query = """
MATCH (node)
RETURN
  head(labels(node)) AS label,
  count(*) AS count
ORDER BY count DESC
"""

df = genelab.run_cypher(query)
df.head(20)

Unnamed: 0,label,count
0,MGene,105984
1,Gene,19755
2,Assay,1698
3,Study,69
4,Anatomy,25
5,MetaNode,6
6,CellType,3


# Check edge count in GeneLab 

In [4]:
query = """
MATCH ()-[rel]->()
RETURN
  type(rel) AS rel_type,
  count(*) AS count
ORDER BY count DESC
"""

df = genelab.run_cypher(query)
df.head(20)

Unnamed: 0,rel_type,count
0,MEASURED_AmMG,5046101
1,IS_ORTHOLOG_MGiG,60972
2,PERFORMED_SpA,1698
3,INVESTIGATED_AiA,1665
4,INVESTIGATED_AiCT,55
5,MetaRelationship,5


# Find the number of assays that are connected to Anatomy nodes in SPOKE

In [5]:
query = """
CALL {
    USE compositenasa.glds
    MATCH (n0:Anatomy)-[]-(n1:Assay)
    WITH n0, SIZE(COLLECT(DISTINCT n1)) as n_assays
    RETURN COLLECT(n0.identifier) AS ids, n_assays
}
CALL {
    USE compositenasa.gldsspoke
    WITH ids
    MATCH (n:Anatomy) WHERE n.identifier IN ids
    RETURN DISTINCT n.identifier as Node, n.name as Node_Name
}
RETURN DISTINCT Node, Node_Name, n_assays ORDER BY n_assays DESC
"""
composite.run_cypher(query) 

Unnamed: 0,Node,Node_Name,n_assays
0,UBERON:0000178,blood,335
1,UBERON:0000468,multicellular organism,291
2,UBERON:0002107,liver,232
3,UBERON:0002106,spleen,176
4,UBERON:0004538,left kidney,97
5,UBERON:0001115,left lobe of liver,91
6,UBERON:0001377,quadriceps femoris,79
7,UBERON:0000966,retina,70
8,UBERON:0001911,mammary gland,63
9,UBERON:0001389,soleus muscle,36


# Forwarding GeneLab values to SPOKE

#### Here we find the Assays that measured gene expression in the eye. Then we group the upregulated Genes by the Symptoms they are connected to in SPOKE. Finally, we calculate the average log2fc for each Symptom.

In [6]:
query = """
CALL {
    USE compositenasa.glds
    MATCH (n0:Anatomy {identifier:'UBERON:0000970'})-[]-(n1:Assay)-[r:MEASURED_AmMG {direction:'%s'}]->(n2:MGene)-[]->(n3:Gene)
    RETURN DISTINCT n3.identifier AS ids, AVG(r.log2fc) as mean_log2fc
}
 
CALL {
    USE compositenasa.gldsspoke
    WITH ids
    MATCH (n0:Gene)-[]-(n1:Symptom) WHERE n0.identifier = ids
    RETURN DISTINCT n1.identifier as Node, n1.name as Node_Name, n0
}
WITH Node, Node_Name, AVG(mean_log2fc) as avg_log2fc
RETURN DISTINCT Node, Node_Name, avg_log2fc ORDER BY avg_log2fc DESC LIMIT 10 
"""
composite.run_cypher(query % 'upregulates')

Unnamed: 0,Node,Node_Name,avg_log2fc
0,D007022,Hypotension,3.532166
1,C537048,Renotubular dysgenesis,3.532166
2,D051436,Chronic kidney disease,2.145078
3,D053099,Azotemia,1.910546
4,D003318,Corneal opacity,1.795103
5,C535342,Zonular cataract,1.73977
6,C566162,Aculeiform cataract,1.73977
7,C563426,Pulverulent cataract,1.73977
8,D004844,Epistaxis,1.631696
9,D008595,Menorrhagia,1.631696


# MetaGraph for GeneLab DB

In [7]:
# find assay where factor 1 = space flight
query = """MATCH (n:Assay) WHERE "['Space Flight']" in n.factors_1 AND "['Ground Control']" in n.factors_2 RETURN DISTINCT n.identifier as n"""
sf_assays = list(genelab.run_cypher(query).n.unique())

# end disease
disease = 'DOID:9428' # intracranial-hypertension

# MetaPaths from GeneLab
gs_metapath = "(a:Assay)-[r:MEASURED_AmMG]->(:MGene)-[]->(n:Gene)"
gs_filter = "\nWHERE r.direction = 'upregulates' AND a.identifier IN %s" % sf_assays

# DWPC
##### Degree Weighted Path Count (DWPC) scores the connectivity between nodes by taking the number of forward and reverse degrees into consideration.
##### Read about how DWPC was used to predict [Disease-Gene](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1004259) and [Disease-Compound](https://elifesciences.org/articles/26726) relationships.
![alt text](jupyter_example_images_dwpc.png "DWPC")
### Look at weighted DWPC for BiologicalProcesses using upregulated genes in Space Flight v Control assays
![alt text](bp_dwpc_metagraph.png "BP DWPC")

In [15]:
from spokecloud import get_inner_dwpc_degree_str, OVERALL_WEIGHTED_DWPC_QUERY , plot_dwpc_paths
spoke_metapath = "(n0:Gene)-[:PARTICIPATES_GpBP]-(n1:BiologicalProcess)-[:PARTICIPATES_GpBP]-(n2:Gene)-[:ASSOCIATES_DaG]-(n3:Disease {identifier:'%s'})"
spoke_metapath = spoke_metapath % disease
weight_prop = 'log2fc'
edges = [edge[2:-1] for edge in edge_pattern.findall(spoke_metapath)]
degree_str = get_inner_dwpc_degree_str(edges)
group_by_node = 'n1'

query = OVERALL_WEIGHTED_DWPC_QUERY % (gs_metapath, gs_filter, spoke_metapath, gs_metapath[:-1], gs_filter, spoke_metapath, degree_str, group_by_node, group_by_node, weight_prop, group_by_node, group_by_node)
dwpc_df = composite.run_cypher(query)
inner_nodes = list(dwpc_df.identifier.values[:8])
plot_dwpc_paths(genelab, driver, spoke_metapath, group_by_node, gs_metapath, gs_filter, inner_nodes, max_show = 10)
dwpc_df.head(10)


GraphWidget(layout=Layout(height='500px', width='100%'))

Unnamed: 0,identifier,name,weighted_DWPC
0,GO:0019835,cytolysis,3.962651e-07
1,GO:0032200,telomere organization,2.245425e-07
2,GO:0060333,type II interferon-mediated signaling pathway,2.112674e-07
3,GO:0048877,homeostasis of number of retina cells,2.007509e-07
4,GO:0051095,regulation of helicase activity,1.941231e-07
5,GO:0006954,inflammatory response,1.627973e-07
6,GO:0071357,cellular response to type I interferon,1.504975e-07
7,GO:0000122,negative regulation of transcription by RNA po...,1.482725e-07
8,GO:1903706,regulation of hemopoiesis,1.445508e-07
9,GO:0034340,response to type I interferon,1.363305e-07


In [16]:
print(query)


CALL {
    USE compositenasa.glds
    MATCH (eye:Anatomy)-[]-(a:Assay)-[r:MEASURED_AmMG]->(:MGene)-[]->(n:Gene) 
WHERE r.direction = 'upregulates' AND a.identifier IN ['GLDS-255-8e37d1fa27e648b5dd35d0c8f19d6408', 'GLDS-102-8e37d1fa27e648b5dd35d0c8f19d6408', 'GLDS-47-8e37d1fa27e648b5dd35d0c8f19d6408', 'GLDS-137-8e37d1fa27e648b5dd35d0c8f19d6408', 'GLDS-162-8e37d1fa27e648b5dd35d0c8f19d6408', 'GLDS-100-8e37d1fa27e648b5dd35d0c8f19d6408', 'GLDS-326-8e37d1fa27e648b5dd35d0c8f19d6408', 'GLDS-173-8e37d1fa27e648b5dd35d0c8f19d6408', 'GLDS-423-8e37d1fa27e648b5dd35d0c8f19d6408', 'GLDS-21-8e37d1fa27e648b5dd35d0c8f19d6408', 'GLDS-4-8e37d1fa27e648b5dd35d0c8f19d6408', 'GLDS-352-8e37d1fa27e648b5dd35d0c8f19d6408', 'GLDS-25-8e37d1fa27e648b5dd35d0c8f19d6408', 'GLDS-421-8e37d1fa27e648b5dd35d0c8f19d6408', 'GLDS-161-8e37d1fa27e648b5dd35d0c8f19d6408', 'GLDS-194-8e37d1fa27e648b5dd35d0c8f19d6408', 'GLDS-513-8e37d1fa27e648b5dd35d0c8f19d6408'] AND eye.identifier IN ['UBERON:0000970', 'UBERON:0004548', 'UBERON:0000

### Look at weighted DWPC for Symptoms using upregulated genes in Space Flight v Control assays

In [9]:
spoke_metapath = "(n0:Gene)-[:ASSOCIATES_GaS]-(n1:Symptom)-[:PRESENTS_DpS]-(n2:Disease {identifier:'%s'})"
spoke_metapath = spoke_metapath % disease
edges = [edge[2:-1] for edge in edge_pattern.findall(spoke_metapath)]
degree_str = get_inner_dwpc_degree_str(edges)
query = OVERALL_WEIGHTED_DWPC_QUERY % (gs_metapath, gs_filter, spoke_metapath, gs_metapath[:-1], gs_filter, spoke_metapath, degree_str, group_by_node, group_by_node, weight_prop, group_by_node, group_by_node)
dwpc_df = composite.run_cypher(query)
inner_nodes = list(dwpc_df.identifier.values[:3])
plot_dwpc_paths(genelab, driver, spoke_metapath, group_by_node, gs_metapath, gs_filter, inner_nodes, max_show = 10)
dwpc_df.head(10)

GraphWidget(layout=Layout(height='500px', width='100%'))

Unnamed: 0,identifier,name,weighted_DWPC
0,D003128,Coma,0.000651
1,D014474,Unconsciousness,0.00041
2,D006935,Hypercapnia,0.000353
3,D006261,Headache,0.000277
4,D006985,Hyperventilation,0.000257
5,D012021,"Reflex, Abnormal",0.000256
6,D015354,"Vision, Low",0.000185
7,D004172,Diplopia,0.000172
8,D014786,Vision Disorders,0.000161
9,D014012,Tinnitus,0.000159


### Look at weighted DWPC for BiologicalProcess using upregulated genes in Space Flight v Control assays in the Eye
![alt text](bp_dwpc_metagraph_eye.png "BP DWPC")

In [10]:
eye_anatomies = ['UBERON:0000970', 'UBERON:0004548', 'UBERON:0000966']
gs_metapath = "(eye:Anatomy)-[]-(a:Assay)-[r:MEASURED_AmMG]->(:MGene)-[]->(n:Gene)"
gs_filter = "\nWHERE r.direction = 'upregulates' AND a.identifier IN %s AND eye.identifier IN %s" % (sf_assays, eye_anatomies)

spoke_metapath = "(n0:Gene)-[:PARTICIPATES_GpBP]-(n1:BiologicalProcess)-[:PARTICIPATES_GpBP]-(n2:Gene)-[:ASSOCIATES_DaG]-(n3:Disease {identifier:'%s'})"
spoke_metapath = spoke_metapath % disease
edges = [edge[2:-1] for edge in edge_pattern.findall(spoke_metapath)]
degree_str = get_inner_dwpc_degree_str(edges)
query = OVERALL_WEIGHTED_DWPC_QUERY % (gs_metapath, gs_filter, spoke_metapath, gs_metapath[:-1], gs_filter, spoke_metapath, degree_str, group_by_node, group_by_node, weight_prop, group_by_node, group_by_node)
dwpc_df = composite.run_cypher(query)
inner_nodes = list(dwpc_df.identifier.values[:8])
plot_dwpc_paths(genelab, driver, spoke_metapath, group_by_node, gs_metapath, gs_filter, inner_nodes, max_show = 10)
dwpc_df.head(10)

GraphWidget(layout=Layout(height='500px', width='100%'))

Unnamed: 0,identifier,name,weighted_DWPC
0,GO:0019835,cytolysis,3.962651e-07
1,GO:0032200,telomere organization,2.245425e-07
2,GO:0060333,type II interferon-mediated signaling pathway,2.112674e-07
3,GO:0048877,homeostasis of number of retina cells,2.007509e-07
4,GO:0051095,regulation of helicase activity,1.941231e-07
5,GO:0006954,inflammatory response,1.627973e-07
6,GO:0071357,cellular response to type I interferon,1.504975e-07
7,GO:0000122,negative regulation of transcription by RNA po...,1.482725e-07
8,GO:1903706,regulation of hemopoiesis,1.445508e-07
9,GO:0034340,response to type I interferon,1.363305e-07


### Look at weighted DWPC for Symptoms using upregulated genes in Space Flight v Control assays in the Eye

In [11]:
spoke_metapath = "(n0:Gene)-[:ASSOCIATES_GaS]-(n1:Symptom)-[:PRESENTS_DpS]-(n2:Disease {identifier:'%s'})"
spoke_metapath = spoke_metapath % disease
edges = [edge[2:-1] for edge in edge_pattern.findall(spoke_metapath)]
degree_str = get_inner_dwpc_degree_str(edges)
query = OVERALL_WEIGHTED_DWPC_QUERY % (gs_metapath, gs_filter, spoke_metapath, gs_metapath[:-1], gs_filter, spoke_metapath, degree_str, group_by_node, group_by_node, weight_prop, group_by_node, group_by_node)
dwpc_df = composite.run_cypher(query)
inner_nodes = list(dwpc_df.identifier.values[:3])
plot_dwpc_paths(genelab, driver, spoke_metapath, group_by_node, gs_metapath, gs_filter, inner_nodes, max_show = 10)
dwpc_df.head(10)

GraphWidget(layout=Layout(height='500px', width='100%'))

Unnamed: 0,identifier,name,weighted_DWPC
0,D006261,Headache,3.1e-05
1,D004172,Diplopia,1e-05
2,D015354,"Vision, Low",8e-06
3,D012021,"Reflex, Abnormal",7e-06
4,D014786,Vision Disorders,7e-06
5,D014012,Tinnitus,3e-06
6,D001766,Blindness,2e-06
7,D003128,Coma,2e-06


# Make Projection

![alt text](jupyter_example_images_projection_gds.png "Projection")

In [12]:
from spokecloud import get_projection_info
projection_name='genelab_proj'
G=spoke.graph.get(projection_name)
node_info_df = get_projection_info(spoke, projection_name)
node_info_df.head()

Unnamed: 0,nodeId,Node,Node_Name,Node_Type
0,2496884,9833,MELK,Gene
1,1001069,inchikey:JYGXADMDTFJGBT-ZSWZSQRESA-N,"(9S,10R,11S,13S,14S,17R)-11,17-dihydroxy-17-(2...",Compound
2,2492538,3312,HSPA8,Gene
3,2493458,4783,NFIL3,Gene
4,2495116,7048,TGFBR2,Gene


# Run PageRank
##### The PageRank algorithm measures the importance of each node within the graph, based on the number incoming relationships and the importance of the corresponding source nodes. The underlying assumption roughly speaking is that a page is only as important as the pages that link to it. Here, we are creating a PageRank embedding for each comorbidity.
![alt text](jupyter_example_images_pagerank.png "PageRank")

#### Here we are using all upregulated genes in Space Flight v Control assays in the Eye as restart nodes and setting the dampingFactor = 0.33.
#### Read more about how to configure PageRank [here](https://neo4j.com/docs/graph-data-science/current/algorithms/page-rank/)

In [13]:

from spokecloud import display_top_nodes, PAGERANK_WEIGHT_QUERY, PAGERANK_SINGLE_GENELIST_QUERY, run_multi_pagerank

dampingFactor = 0.33
filter_str = ''
pagerank_df = composite.run_cypher(PAGERANK_SINGLE_GENELIST_QUERY%(gs_metapath, gs_filter, 'Gene', projection_name, dampingFactor))
pagerank_df = node_info_df.merge(pagerank_df, on='nodeId')
top_node_df = display_top_nodes(pagerank_df, driver, n_top=10, score_col='score', asc=False, db_name='spoke')
pagerank_df.head()


GraphWidget(layout=Layout(height='500px', width='100%'))

Unnamed: 0,nodeId,Node,Node_Name,Node_Type,score
0,2496884,9833,MELK,Gene,0.769196
1,1001069,inchikey:JYGXADMDTFJGBT-ZSWZSQRESA-N,"(9S,10R,11S,13S,14S,17R)-11,17-dihydroxy-17-(2...",Compound,0.000779
2,2492538,3312,HSPA8,Gene,0.839952
3,2493458,4783,NFIL3,Gene,0.75884
4,2495116,7048,TGFBR2,Gene,0.166235


In [17]:
print(PAGERANK_SINGLE_GENELIST_QUERY)


CALL {
    USE compositenasa.glds
    MATCH %s%s
    RETURN COLLECT(DISTINCT n.identifier) AS ids
}
CALL {
    USE compositenasa.gldsspoke
    WITH ids
    MATCH (g:%s) WHERE g.identifier IN ids
    RETURN DISTINCT ID(g) AS overlap_genes
}
WITH COLLECT(overlap_genes) as sourceNodes
CALL {
    USE compositenasa.gldsspoke
    WITH sourceNodes
    CALL gds.pageRank.stream('%s', { sourceNodes:sourceNodes, maxIterations:40,dampingFactor:%s})
    YIELD nodeId, score
    RETURN nodeId, score 
}
RETURN nodeId, score ORDER BY score DESC



#### Here we are running PageRank for each upregulated genes in Space Flight v Control assays in the Eye. This results in an embedding vector for each gene which is then multiplied by the average log2fc.

In [14]:
pagerank_weight_df = composite.run_cypher(PAGERANK_WEIGHT_QUERY % (gs_metapath, gs_filter, weight_prop))
multi_pagerank_df = run_multi_pagerank(spoke, G, node_info_df, pagerank_weight_df)
top_node_df = display_top_nodes(multi_pagerank_df, driver, n_top=10, score_col='mean_pr', asc=False, db_name='spoke')

GraphWidget(layout=Layout(height='500px', width='100%'))

In [20]:
query = """MATCH (n:Assay) WHERE "['Ground Control']" in n.factors_1 RETURN DISTINCT n.identifier as n"""
list(genelab.run_cypher(query).n.unique())

['GLDS-173-ec2ef062472a03059fc9d18ffbeb6415',
 'GLDS-21-a9111b1e09b475016847c828371b2c84',
 'GLDS-21-150b1510550b43ee09451f2d8a0bd32c',
 'GLDS-21-ec2ef062472a03059fc9d18ffbeb6415',
 'GLDS-25-ec2ef062472a03059fc9d18ffbeb6415',
 'GLDS-513-ec2ef062472a03059fc9d18ffbeb6415']

In [22]:
query = """MATCH (n:Assay) WHERE "['Ground Control']" in n.factors_1 RETURN DISTINCT n.factors_2 as n"""
genelab.run_cypher(query).n.values

array([list(["['Vivarium Control']"]),
       list(["['Hindlimb Suspension and Reloading']"]),
       list(["['Hindlimb Suspension']"])], dtype=object)