In [1]:
import ipycytoscape
import pandas as pd
from covidkg import Node, Edge, KnowledgeGraph

# Ipycytoscape Wrapper

In [2]:
# Read in nodes and interactions data
nodes_data = pd.read_csv("data/nodes_data.csv")
interactions_data = pd.read_csv("data/interactions_data.csv")
genes = pd.read_csv("data/01c-NCBIRefSeq.csv")

# Modify interactions data to include protein names.
# Both lookup tables are the same ID -> Name mapping from nodes_data, relabelled once for
# the viral side and once for the human side. `rename` returns a new frame, so the
# column selection itself is never mutated.
sars_names = nodes_data[["ID", "Name"]].rename(
    columns={"ID": "SARS_COV2_Protein_ID", "Name": "SARS_Name"}
)
human_names = nodes_data[["ID", "Name"]].rename(
    columns={"ID": "Human_Protein_ID", "Name": "Human_Name"}
)
# Name the join key explicitly instead of relying on "whatever columns happen to be shared".
interactions_data = interactions_data.merge(
    sars_names, how="left", on="SARS_COV2_Protein_ID"
)
interactions_data = interactions_data.merge(
    human_names, how="left", on="Human_Protein_ID"
)

# Generate taxonomy ID, color code, and UniprotID link
SARS_COV2_TAXONOMY_ID = "2697049"  # rows with this TaxonomyID are drawn in blue, all others in red
tax_id = nodes_data["TaxonomyID"].astype(str)
color = ["blue" if taxon == SARS_COV2_TAXONOMY_ID else "red" for taxon in tax_id.values]
# Identifiers look like "uniprot:P0DTC4" or "uniprot:P0DTD1-PRO_0000449619". Take the
# accession between the "uniprot:" prefix and the first "-" rather than a fixed 6-character
# slice, so that 10-character UniProt accessions are not truncated.
uniprot_accession = nodes_data["Identifier"].str.extract(r"^uniprot:([^-]+)", expand=False)
href = "https://www.uniprot.org/uniprot/" + uniprot_accession

In [3]:
# Display the node table loaded from data/nodes_data.csv.
nodes_data

Unnamed: 0,End Pos,ID,Identifier,Length,Name,Sequence,Start Pos,TaxonomyID
0,26472.0,375e0f905c315e06a99c80b736c125d2,uniprot:P0DTC4,75.0,E,MYSFVSEETGTLIVNSVLLFLAFVVFLLVTLAILTALRLCAYCCNI...,26245.0,2697049
1,27191.0,1cd6abff79ad3633e17582eb0e576539,uniprot:P0DTC5,222.0,M,MADSNGTITVEELKKLLEQWNLVIGFLFLTWICLLQFAYANRNRFL...,26523.0,2697049
2,805.0,5c2c364f44079728c451280435c4236a,uniprot:P0DTD1-PRO_0000449619,180.0,NSP1,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...,266.0,2697049
3,13484.0,63d2c81f37726f44c600eb5225676a66,uniprot:P0DTC1-PRO_0000449645,13.0,NSP11,SADAQSFLNGFAV,13442.0,2697049
4,13441.0,af0cec59296f3c845a7b04500cd6886b,uniprot:P0DTD1-PRO_0000449628,139.0,NSP10,AGNATEVPANSTVLSFCAFAVDAAKAYKDYLASGGQPITNCVKMLC...,13025.0,2697049
...,...,...,...,...,...,...,...,...
354,,uniprot:Q9ULX6,uniprot:Q9ULX6,,AKP8L_HUMAN,,,9606
355,,uniprot:Q9UDR5,uniprot:Q9UDR5,,AASS_HUMAN,,,9606
356,,uniprot:Q9UBU6,uniprot:Q9UBU6,,FA8A1_HUMAN,,,9606
357,,uniprot:Q9NQC3,uniprot:Q9NQC3,,RTN4_HUMAN,,,9606


In [4]:
# Display the interaction table after the SARS_Name / Human_Name columns were merged in.
interactions_data

Unnamed: 0,SARS_COV2_Protein_ID,Human_Protein_ID,SARS_Name,Human_Name
0,375e0f905c315e06a99c80b736c125d2,uniprot:Q8IWA5,E,CTL2_HUMAN
1,375e0f905c315e06a99c80b736c125d2,uniprot:Q86VM9,E,ZCH18_HUMAN
2,375e0f905c315e06a99c80b736c125d2,uniprot:Q6UX04,E,CWC27_HUMAN
3,375e0f905c315e06a99c80b736c125d2,uniprot:P25440,E,BRD2_HUMAN
4,375e0f905c315e06a99c80b736c125d2,uniprot:O60885,E,BRD4_HUMAN
...,...,...,...,...
327,b100d0849c3e01fc6c75ead0e916ce0f,uniprot:P19784,N,CSK22_HUMAN
328,b100d0849c3e01fc6c75ead0e916ce0f,uniprot:P11940,N,PABP1_HUMAN
329,b100d0849c3e01fc6c75ead0e916ce0f,uniprot:O43818,N,U3IP2_HUMAN
330,4c35f09aac2f7be4f3cffd30c6aecac8,uniprot:Q9C0B5,SPIKE,ZDHC5_HUMAN


In [5]:
# Preview the first rows of the table read from data/01c-NCBIRefSeq.csv.
genes.head()

Unnamed: 0,genbank_id,gene,start,end,ncbigene_id,ncbiprotein_id,product,Sequence
0,ncbiprotein:NC_045512,ORF1ab,266,21555,ncbigene:43740578,ncbiprotein:YP_009724389,ORF1ab polyprotein,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...
1,ncbiprotein:NC_045512,ORF1ab,266,13483,ncbigene:43740578,ncbiprotein:YP_009725295,ORF1a polyprotein,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...
2,ncbiprotein:NC_045512,S,21563,25384,ncbigene:43740568,ncbiprotein:YP_009724390,surface glycoprotein,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
3,ncbiprotein:NC_045512,ORF3a,25393,26220,ncbigene:43740569,ncbiprotein:YP_009724391,ORF3a protein,MDLFMRIFTIGTVTLKQGEIKDATPSDFVRATATIPIQASLPFGWL...
4,ncbiprotein:NC_045512,E,26245,26472,ncbigene:43740570,ncbiprotein:YP_009724392,envelope protein,MYSFVSEETGTLIVNSVLLFLAFVVFLLVTLAILTALRLCAYCCNI...


In [6]:
# Knowledge graph instance that create_nodes / create_edges below fill in.
kg = KnowledgeGraph()

def create_nodes(n_name, n_id, tax_id, href, color, kg):
    """Add one ``Node`` to ``kg`` for each entry of the parallel input sequences.

    The sequences are walked positionally, so pandas Series are handled correctly
    whatever their index is (``series[i]`` is a label lookup and fails on a
    filtered or re-indexed Series).

    Args:
        n_name: Node names; its length decides how many nodes are added.
        n_id: Node IDs, parallel to ``n_name``.
        tax_id: Taxonomy IDs, parallel to ``n_name``.
        href: Link for each node, parallel to ``n_name``.
        color: Color for each node, parallel to ``n_name``.
        kg: Graph object exposing ``add_node``.

    Raises:
        ValueError: If any sequence is shorter than ``n_name``.
    """
    expected = len(n_name)
    parallel = {"n_id": n_id, "tax_id": tax_id, "href": href, "color": color}
    for label, values in parallel.items():
        if len(values) < expected:
            raise ValueError(
                f"{label} has {len(values)} entries but n_name has {expected}"
            )
    # n_name is the shortest input after the check above, so zip stops after
    # exactly len(n_name) nodes, as the index loop did.
    for name, node_id, taxonomy, link, shade in zip(n_name, n_id, tax_id, href, color):
        # The final constructor argument is passed as an empty string.
        kg.add_node(Node(name, node_id, taxonomy, link, shade, ""))

def create_edges(source_name, target_name, source_id, target_id, kg):
    """Add one ``Edge`` to ``kg`` for each entry of the parallel input sequences.

    Each edge joins ``Node(source_name, source_id)`` to
    ``Node(target_name, target_id)``. The sequences are walked positionally, so
    pandas Series work whatever their index is, as well as arrays and lists.

    Args:
        source_name: Source node names; its length decides how many edges are added.
        target_name: Target node names, parallel to ``source_name``.
        source_id: Source node IDs, parallel to ``source_name``.
        target_id: Target node IDs, parallel to ``source_name``.
        kg: Graph object exposing ``add_edge``.

    Raises:
        ValueError: If any sequence is shorter than ``source_name``.
    """
    expected = len(source_name)
    parallel = {"target_name": target_name, "source_id": source_id, "target_id": target_id}
    for label, values in parallel.items():
        if len(values) < expected:
            raise ValueError(
                f"{label} has {len(values)} entries but source_name has {expected}"
            )
    # source_name is the shortest input after the check above, so zip yields
    # exactly len(source_name) edges.
    for src_name, dst_name, src_id, dst_id in zip(
        source_name, target_name, source_id, target_id
    ):
        kg.add_edge(Edge(Node(src_name, src_id), Node(dst_name, dst_id)))
        
# Register every row of nodes_data as a node, with its taxonomy ID, link and color.
create_nodes(nodes_data["Name"], nodes_data["ID"], tax_id, href, color, kg)

# Edge inputs, in create_edges' positional order:
# source name, target name, source ID, target ID.
edge_columns = ["SARS_Name", "Human_Name", "SARS_COV2_Protein_ID", "Human_Protein_ID"]
create_edges(*(interactions_data[column].values for column in edge_columns), kg)

In [7]:
# Color code: {Blue: Viral, Red: Human}
# NOTE(review): 50 presumably sets the layout's node spacing (the widget repr shows
# 'nodeSpacing': 50) — confirm against KnowledgeGraph.load_graph.
kg.load_graph(50)

CytoscapeWidget(cytoscape_layout={'name': 'cola', 'nodeSpacing': 50}, cytoscape_style=[{'selector': 'node', 'c…

In [8]:
# View stats
# One row per key of kg.stats, shown from the largest value to the smallest.
df_nodes = list(kg.stats.keys())
df_nchildren = [kg.stats[node_name] for node_name in df_nodes]
df = pd.DataFrame().assign(
    **{"Nodes": df_nodes, "Number of Children": df_nchildren}
)
df.sort_values(by="Number of Children", ascending=False)

Unnamed: 0,Nodes,Number of Children
22,ORF8,47
6,NSP13,40
11,NSP7,32
1,M,30
24,ORF9C,26
19,NSP8,24
5,NSP12,20
20,NSP9,16
25,N,15
23,ORF9B,11


# Neo4j

In [23]:
import os
import numpy as np
# NOTE(review): this rebinds `Node`, which the first cell imported from covidkg. After this
# cell has run, re-running the create_nodes / create_edges cell would resolve `Node` to
# py2neo's class instead of covidkg's.
from py2neo import Graph, Node, Relationship

In [10]:
# Read the Neo4j installation directory from the environment (None when the variable is unset)
# and echo it so the reader can see which installation is used.
NEO4J_HOME = os.environ.get('NEO4J_HOME')
print(NEO4J_HOME)

C:\Users\ericy\.Neo4jDesktop\neo4jDatabases\database-648d6182-7a2c-4fe8-b4cb-63ba7c4737da\installation-4.0.3


In [11]:
graph = Graph("bolt://localhost:11006", auth=("neo4j", "neo4jbinder"))
!"$NEO4J_HOME"\bin\neo4j start

Neo4j service started


In [18]:
# Cypher queries that read a CSV file with LOAD CSV and only RETURN selected columns;
# none of them writes to the graph. query2-query5 are defined here but not run in this cell.
# query1: interaction ID plus the SARS-CoV-2 and human protein ID columns.
query1 = """
LOAD CSV WITH HEADERS FROM 'FILE:///interactions.csv' AS row 
RETURN row.Interaction_ID, row.SARS_COV2_Protein_ID, row.Human_Protein_ID
"""
# query2: human protein ID and name columns.
query2 = """
LOAD CSV WITH HEADERS FROM 'FILE:///human_data.csv' AS row 
RETURN row.Human_Protein_ID, row.Human_Protein_Name
"""
# query3: viral protein ID, identifier, name, sequence, length and start/end position columns.
query3 = """
LOAD CSV WITH HEADERS FROM 'FILE:///virus_data.csv' AS row 
RETURN row.SARS_COV2_Protein_ID, row.SARS_COV2_Identifier, row.SARS_COV2_Protein_Name, row.Sequence, row.Length, row.`Start Pos`, row.`End Pos`
"""
# query4: alternative IntAct / UniProt ID columns for human proteins.
query4 = """
LOAD CSV WITH HEADERS FROM 'FILE:///human_alias.csv' AS row 
RETURN row.Human_Protein_ID, row.Alt_intact_ID, row.Alt_uniprot_ID
"""
# query5: alternative UniProt / IntAct ID columns for viral proteins.
query5 = """
LOAD CSV WITH HEADERS FROM 'FILE:///virus_alias.csv' AS row 
RETURN row.SARS_COV2_Protein_ID, row.Alt_uniprot_ID, row.Alt_intact_ID
"""
# Run the interactions query and keep the result as a DataFrame; its column names carry
# the "row." prefix from the RETURN clause.
interactions_neo4j = graph.run(query1).to_data_frame()
interactions_neo4j

Unnamed: 0,row.Interaction_ID,row.SARS_COV2_Protein_ID,row.Human_Protein_ID
0,intact:EBI-25490454,375e0f905c315e06a99c80b736c125d2,uniprot:Q8IWA5
1,intact:EBI-25490454,375e0f905c315e06a99c80b736c125d2,uniprot:Q86VM9
2,intact:EBI-25490454,375e0f905c315e06a99c80b736c125d2,uniprot:Q6UX04
3,intact:EBI-25490454,375e0f905c315e06a99c80b736c125d2,uniprot:P25440
4,intact:EBI-25490454,375e0f905c315e06a99c80b736c125d2,uniprot:O60885
...,...,...,...
327,intact:EBI-25490574,b100d0849c3e01fc6c75ead0e916ce0f,uniprot:P19784
328,intact:EBI-25490574,b100d0849c3e01fc6c75ead0e916ce0f,uniprot:P11940
329,intact:EBI-25490574,b100d0849c3e01fc6c75ead0e916ce0f,uniprot:O43818
330,intact:EBI-25490625,4c35f09aac2f7be4f3cffd30c6aecac8,uniprot:Q9C0B5


In [32]:
# Group human protein IDs under the viral protein ID they are listed with,
# keeping the order in which rows appear in interactions_neo4j.
node_dict = {}
sars = interactions_neo4j["row.SARS_COV2_Protein_ID"].values
humans = interactions_neo4j["row.Human_Protein_ID"].values
for viral_id, human_id in zip(sars, humans):
    node_dict.setdefault(viral_id, []).append(human_id)

# Write one SARS_COV2 node per viral protein, then a new Human node and a "derives"
# relationship for each of its partners (a Human node is created per interaction row).
for viral_id, partners in node_dict.items():
    curr_sars = Node("SARS_COV2", name=viral_id, num_interactions=len(partners))
    graph.create(curr_sars)
    for human_id in partners:
        curr_hum = Node("Human", name=human_id, parent=viral_id)
        graph.create(curr_hum)
        graph.create(Relationship(curr_sars, "derives", curr_hum))

In [33]:
# Cypher: return every node in the graph together with its internal id.
node_query = """
MATCH (n)
RETURN id(n) as id, n
"""

In [None]:
# Cypher: delete every node, with DETACH also removing the relationships attached to it.
# This cell only defines the query text; it does not run it.
delete = """
MATCH (n)
DETACH DELETE n
"""

In [34]:
# Run node_query and show the records as a list of dicts with keys 'id' and 'n'.
graph.run(node_query).data()

[{'id': 0,
  'n': (_0:SARS_COV2 {name: '375e0f905c315e06a99c80b736c125d2', num_interactions: 6})},
 {'id': 1,
  'n': (_1:Human {name: 'uniprot:Q8IWA5', parent: '375e0f905c315e06a99c80b736c125d2'})},
 {'id': 2,
  'n': (_2:Human {name: 'uniprot:Q86VM9', parent: '375e0f905c315e06a99c80b736c125d2'})},
 {'id': 3,
  'n': (_3:Human {name: 'uniprot:Q6UX04', parent: '375e0f905c315e06a99c80b736c125d2'})},
 {'id': 4,
  'n': (_4:Human {name: 'uniprot:P25440', parent: '375e0f905c315e06a99c80b736c125d2'})},
 {'id': 5,
  'n': (_5:Human {name: 'uniprot:O60885', parent: '375e0f905c315e06a99c80b736c125d2'})},
 {'id': 6,
  'n': (_6:Human {name: 'uniprot:O00203', parent: '375e0f905c315e06a99c80b736c125d2'})},
 {'id': 7,
  'n': (_7:SARS_COV2 {name: '1cd6abff79ad3633e17582eb0e576539', num_interactions: 30})},
 {'id': 8,
  'n': (_8:Human {name: 'uniprot:Q9BSJ2', parent: '1cd6abff79ad3633e17582eb0e576539'})},
 {'id': 9,
  'n': (_9:Human {name: 'uniprot:Q9BQT8', parent: '1cd6abff79ad3633e17582eb0e576539'})},
 

In [35]:
# Cypher: for every directed relationship, return the ids of its start and end nodes
# and the relationship type.
relationship_query = """
MATCH (n)-[r]->(m)
RETURN id(n) as source, id(m) as target, type(r) as edge
"""

In [36]:
# Run relationship_query and show the records as a list of dicts
# with keys 'source', 'target' and 'edge'.
graph.run(relationship_query).data()

[{'source': 0, 'target': 1, 'edge': 'derives'},
 {'source': 0, 'target': 2, 'edge': 'derives'},
 {'source': 0, 'target': 3, 'edge': 'derives'},
 {'source': 0, 'target': 4, 'edge': 'derives'},
 {'source': 0, 'target': 5, 'edge': 'derives'},
 {'source': 0, 'target': 6, 'edge': 'derives'},
 {'source': 7, 'target': 8, 'edge': 'derives'},
 {'source': 7, 'target': 9, 'edge': 'derives'},
 {'source': 7, 'target': 10, 'edge': 'derives'},
 {'source': 7, 'target': 11, 'edge': 'derives'},
 {'source': 7, 'target': 12, 'edge': 'derives'},
 {'source': 7, 'target': 13, 'edge': 'derives'},
 {'source': 7, 'target': 14, 'edge': 'derives'},
 {'source': 7, 'target': 15, 'edge': 'derives'},
 {'source': 7, 'target': 16, 'edge': 'derives'},
 {'source': 7, 'target': 17, 'edge': 'derives'},
 {'source': 7, 'target': 18, 'edge': 'derives'},
 {'source': 7, 'target': 19, 'edge': 'derives'},
 {'source': 7, 'target': 20, 'edge': 'derives'},
 {'source': 7, 'target': 21, 'edge': 'derives'},
 {'source': 7, 'target': 22,

In [17]:
# !"$NEO4J_HOME"\bin\neo4j stop