In [1]:
import ipycytoscape
import pandas as pd
from covidkg import Node, Edge, KnowledgeGraph

In [2]:
# Read in nodes and interactions data
nodes_data = pd.read_csv("data/nodes_data.csv")
interactions_data = pd.read_csv("data/interactions_data.csv")
genes = pd.read_csv("data/01c-NCBIRefSeq.csv")

# Modify interactions data to include protein names.
# Both lookup tables are the same ID -> Name mapping taken from nodes_data;
# they differ only in column names, which are chosen to match the two key
# columns of interactions_data.
sars_names = nodes_data[["ID", "Name"]].rename(
    columns={"ID": "SARS_COV2_Protein_ID", "Name": "SARS_Name"}
)
human_names = nodes_data[["ID", "Name"]].rename(
    columns={"ID": "Human_Protein_ID", "Name": "Human_Name"}
)
# The join key is spelled out instead of being inferred from whichever column
# names happen to overlap. how="left" keeps every interaction row; an ID that
# is missing from nodes_data simply gets a NaN name.
interactions_data = interactions_data.merge(
    sars_names, how="left", on="SARS_COV2_Protein_ID"
)
interactions_data = interactions_data.merge(
    human_names, how="left", on="Human_Protein_ID"
)

# Generate taxonomy ID, color code, and UniprotID link
SARS_COV2_TAXONOMY_ID = "2697049"
tax_id = nodes_data["TaxonomyID"].astype(str)
# Blue for nodes carrying the SARS-CoV-2 taxonomy ID, red for everything else.
color = ["blue" if i == SARS_COV2_TAXONOMY_ID else "red" for i in tax_id.values]
# Identifiers look like "uniprot:P0DTC4" or "uniprot:P0DTD1-PRO_0000449619".
# Capture the accession that follows the prefix, up to the first
# non-alphanumeric character, rather than slicing a fixed six characters:
# UniProt accessions can also be ten characters long, and a fixed slice would
# truncate those into a broken link.
accession = nodes_data["Identifier"].str.extract(
    r"^uniprot:([A-Za-z0-9]+)", expand=False
)
href = "https://www.uniprot.org/uniprot/" + accession

In [3]:
# Preview the first rows of the node table loaded from data/nodes_data.csv.
nodes_data.head()

Unnamed: 0,End Pos,ID,Identifier,Length,Name,Sequence,Start Pos,TaxonomyID
0,26472.0,375e0f905c315e06a99c80b736c125d2,uniprot:P0DTC4,75.0,E,MYSFVSEETGTLIVNSVLLFLAFVVFLLVTLAILTALRLCAYCCNI...,26245.0,2697049
1,27191.0,1cd6abff79ad3633e17582eb0e576539,uniprot:P0DTC5,222.0,M,MADSNGTITVEELKKLLEQWNLVIGFLFLTWICLLQFAYANRNRFL...,26523.0,2697049
2,805.0,5c2c364f44079728c451280435c4236a,uniprot:P0DTD1-PRO_0000449619,180.0,NSP1,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...,266.0,2697049
3,13484.0,63d2c81f37726f44c600eb5225676a66,uniprot:P0DTC1-PRO_0000449645,13.0,NSP11,SADAQSFLNGFAV,13442.0,2697049
4,13441.0,af0cec59296f3c845a7b04500cd6886b,uniprot:P0DTD1-PRO_0000449628,139.0,NSP10,AGNATEVPANSTVLSFCAFAVDAAKAYKDYLASGGQPITNCVKMLC...,13025.0,2697049


In [4]:
# Preview the first rows of the gene table loaded from data/01c-NCBIRefSeq.csv.
genes.head()

Unnamed: 0,genbank_id,gene,start,end,ncbigene_id,ncbiprotein_id,product,Sequence
0,ncbiprotein:NC_045512,ORF1ab,266,21555,ncbigene:43740578,ncbiprotein:YP_009724389,ORF1ab polyprotein,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...
1,ncbiprotein:NC_045512,ORF1ab,266,13483,ncbigene:43740578,ncbiprotein:YP_009725295,ORF1a polyprotein,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...
2,ncbiprotein:NC_045512,S,21563,25384,ncbigene:43740568,ncbiprotein:YP_009724390,surface glycoprotein,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
3,ncbiprotein:NC_045512,ORF3a,25393,26220,ncbigene:43740569,ncbiprotein:YP_009724391,ORF3a protein,MDLFMRIFTIGTVTLKQGEIKDATPSDFVRATATIPIQASLPFGWL...
4,ncbiprotein:NC_045512,E,26245,26472,ncbigene:43740570,ncbiprotein:YP_009724392,envelope protein,MYSFVSEETGTLIVNSVLLFLAFVVFLLVTLAILTALRLCAYCCNI...


In [5]:
# Graph object that create_nodes / create_edges fill through add_node / add_edge.
kg = KnowledgeGraph()

def create_nodes(n_name, n_id, tax_id, href, color, kg):
    """Add one ``Node`` to ``kg`` for each position of the parallel inputs.

    Parameters
    ----------
    n_name, n_id, tax_id, href, color : sequence
        Equal-length, position-aligned collections (lists, arrays or pandas
        Series). Element ``k`` of each is passed to ``Node`` in this order.
    kg : object
        Graph exposing ``add_node``; it is mutated in place.

    Raises
    ------
    ValueError
        If the input sequences do not all have the same length.
    """
    columns = (n_name, n_id, tax_id, href, color)
    if len({len(column) for column in columns}) > 1:
        raise ValueError("create_nodes: all input sequences must have the same length")

    # zip walks every input by position. Indexing with [i] is a *label* lookup
    # on a pandas Series, so it fails (or pairs the wrong rows with the
    # position-indexed color list) whenever the Series index is not 0..n-1,
    # e.g. after filtering or sorting the source frame.
    for name, node_id, taxonomy, link, shade in zip(*columns):
        # The sixth positional argument of Node is left as an empty string.
        kg.add_node(Node(name, node_id, taxonomy, link, shade, ""))

def create_edges(source_name, target_name, source_id, target_id, kg):
    """Add one ``Edge`` to ``kg`` for each position of the parallel inputs.

    For every position, a source ``Node(name, id)`` and a target
    ``Node(name, id)`` are built and wrapped in an ``Edge``.

    Parameters
    ----------
    source_name, target_name, source_id, target_id : sequence
        Equal-length, position-aligned collections (lists, arrays or pandas
        Series) describing the two endpoints of each edge.
    kg : object
        Graph exposing ``add_edge``; it is mutated in place.

    Raises
    ------
    ValueError
        If the input sequences do not all have the same length.
    """
    columns = (source_name, target_name, source_id, target_id)
    if len({len(column) for column in columns}) > 1:
        raise ValueError("create_edges: all input sequences must have the same length")

    # Iterate by position with zip; [i] on a pandas Series is a label lookup
    # and breaks when the index is not 0..n-1.
    for src_name, dst_name, src_id, dst_id in zip(*columns):
        kg.add_edge(Edge(Node(src_name, src_id), Node(dst_name, dst_id)))
        
# One node per row of nodes_data.
create_nodes(nodes_data["Name"], nodes_data["ID"], tax_id, href, color, kg)

# One edge per row of interactions_data; columns are handed over as plain
# arrays in the order create_edges expects (source name, target name,
# source id, target id).
edge_columns = (
    "SARS_Name",
    "Human_Name",
    "SARS_COV2_Protein_ID",
    "Human_Protein_ID",
)
edge_arrays = [interactions_data[column].values for column in edge_columns]
create_edges(*edge_arrays, kg)

In [6]:
# Color code: {Blue: Viral, Red: Human}
# NOTE(review): the widget repr in this cell's output shows 'nodeSpacing': 50,
# so the argument is presumably the cola layout's node spacing — confirm
# against KnowledgeGraph.load_graph.
kg.load_graph(50)

CytoscapeWidget(cytoscape_layout={'name': 'cola', 'nodeSpacing': 50}, cytoscape_style=[{'selector': 'node', 'c…

In [7]:
# View stats
# Turn kg.stats into a two-column table (key -> "Nodes", value ->
# "Number of Children") and show it with the largest counts first.
df_nodes = list(kg.stats.keys())
df_nchildren = list(kg.stats.values())
df = pd.DataFrame({"Nodes": df_nodes, "Number of Children": df_nchildren})
df.sort_values(by="Number of Children", ascending=False)

Unnamed: 0,Nodes,Number of Children
22,ORF8,47
6,NSP13,40
11,NSP7,32
1,M,30
24,ORF9C,26
19,NSP8,24
5,NSP12,20
20,NSP9,16
25,N,15
23,ORF9B,11
