# <p style="text-align: center;">RNA Knowledge Graph Analysis</p>
    
***
***

**Authors:** [ECavalleri](https://mail.google.com/mail/u/0/?view=cm&fs=1&tf=1&to=emanuele.cavalleri@unimi.it), [ACabri](https://mail.google.com/mail/u/0/?view=cm&fs=1&tf=1&to=alberto.cabri@unimi.it)

**GitHub Repositories:** [testRNA-KG](https://github.com/emanuelecavalleri/testRNA-KG), [PheKnowLator](https://github.com/callahantiff/PheKnowLator/)
  
<br>  
  
**Purpose:** In the following notebook, we will be loading the simplified RNA-centered KG represented as the N-Triples (.nt) file generated by PheKnowLator. We will then proceed to analyze and visualize its network topology.

<br>

**Assumptions:**   
- Knowledge graphs ➞ `./testRNA-KG/resources/knowledge_graphs`
<br>

**Dependencies:**   
- **Scripts**: This notebook makes use of the [`GRAPE`](https://github.com/AnacletoLAB/grape/) tool.  
- **Data**: All downloaded and generated data sources are provided through [this](https://drive.google.com/drive/folders/1sev5zczMviX7UVqMhTpkFXG43K3nQa9f) dedicated Google Drive repository. 
_____
***

## Table of Contents
***

### [Preprocessing](#pre-processing)


### [Neo4j](#neo4j)   


### [GRAPE](#grape)  

____
***

## Set-Up Environment
***

In [None]:
# Run this once to install/update GRAPE and its companion packages.
# The %pip magic (rather than !pip) installs into the environment of the running kernel.
#%pip install --upgrade grape ensmallen embiggen graphviz

In [None]:
# import needed libraries
import pandas as pd
import numpy as np
from typing import Union
import re
import requests
from tqdm import tqdm

from grape import Graph, GraphVisualizer
from grape.embedders import Node2VecCBOWEnsmallen

tqdm.pandas()

***
# Preprocessing  <a class="anchor" id="pre-processing"></a>

This section defines the file path used to load the graph and builds the node and edge tables, each annotated with a type.

In [None]:
# Folder holding the PheKnowLator output and the pieces of the graph file name.
fpath = "./testRNA-KG/resources/knowledge_graphs/"
graphname = "PheKnowLator_v3.1.1_full_instance_inverseRelations_OWLNETS"
graphext = ".nt"

# Full path of the N-Triples file loaded in the next cell.
graph_fname = f"{fpath}{graphname}{graphext}"

# One name per space-separated field of a line; the fourth field is discarded after loading.
colnames = ["subject", "predicate", "object", "unused"]

In [None]:
# Parse the space-separated graph file and discard the last column, which only
# contains the dot symbol that terminates each line.
fulldata = pd.read_csv(
    graph_fname,
    sep=' ',
    header=None,
    names=colnames,
).drop(columns=[colnames[3]])
fulldata.head()

### Build the nodes dataframe
The node list is the union of all subjects and objects found in the graph file. Each node is then annotated with its entity type in a new column named "type".

In [None]:
# Every distinct subject or object is a node. The values are de-duplicated with
# Series.unique(), which keeps first-appearance order, instead of a Python set:
# iteration order of a set of strings changes between interpreter runs (hash
# randomisation), which made the row order of nodes_df non-reproducible.
endpoint_values = pd.concat(
    [fulldata[colnames[0]], fulldata[colnames[2]]],
    ignore_index=True,
).dropna()  # missing subjects/objects are not nodes
nodes_df = pd.DataFrame({"name": endpoint_values.unique()})
nodes_df.head()

In [None]:
# Report how many distinct nodes were found in the graph file.
print(f'Number of nodes in {graph_fname}: {len(nodes_df)}')

In [None]:
# Full mapping for all node types in RNA-KG
RNAonly = False # when false all nodes are considered otherwise only RNA nodes are selected

# NCBI gene-type suffixes that get a friendlier name; any other suffix is used verbatim.
_NCBI_TYPE_RENAMES = {
    "others": "otherRNA",
    "pseudo": "Pseudogene",
    "unknown": "unknown RNA",
}

# Ordered (substrings, node type) rules for RNA resources. The first rule with a
# substring contained in the URI wins, so the order is significant.
_RNA_URI_RULES = (
    (("https://www.mirbase.org/",), "miRNA"),
    (("https://www.addgene.org/",), "gRNA"),
    (("https://www.ncbi.nlm.nih.gov/nuccore/",), "Viral RNA"),
    (("http://web.mit.edu/sirna/",), "s(i/h)RNA"),
    (("https://hanlab.uth.edu/HeRA",), "eRNA"),
    (("http://bigdata.ibp.ac.cn/piRBase",), "piRNA"),
    # NOTE(review): snoDB URIs are labelled "rRNA" exactly as before - confirm this is intended.
    (("http://scottgroup.med.usherbrooke.ca/snoDB",), "rRNA"),
    (("tRNA", "trna", "TRNA"), "tRNA"),
    (("tRF", "trf"), "tRF"),
    (("tsRNA",), "tsRNA"),
    (("https://go.drugbank.com/drugs/",), "RNA drug"),
    (("https://eskip-finder.org",), "ASO"),
    (("https://www.aptagen.com/aptamer-details",), "Aptamer"),
    (("retained_intron",), "Retained intron"),
    (("tbdb.io/tboxes/", "penchovsky"), "Riboswitch"),
    (("http://rfamlive.xfam.org/",), "Ribozyme"),
)

# Ordered rules for everything that is not an RNA resource (only consulted when
# RNAonly is False). Specific prefixes must stay ahead of the shorter prefixes
# that would also match them (e.g. MFOMD before MF).
# Two branches of the previous if/elif chain could never be reached and are not
# listed: a second ".../obo/VO" -> "Vaccine" test, and ".../obo/OGG" -> "Genome",
# which was shadowed by the "Gene" rule below.
# NOTE(review): confirm whether OGG was meant to be "Gene" or "Genome".
_NON_RNA_URI_RULES = (
    (("http://purl.obolibrary.org/obo/MONDO",
      "purl.obolibrary.org/obo/DOID",
      "ghr.nlm.nih.gov/condition",
      "rarediseases.info.nih.gov/diseases"), "Disease"),
    (("purl.obolibrary.org/obo/IDO",), "Infectious disease"),
    (("purl.obolibrary.org/obo/MFOMD",), "Mental disease"),
    (("http://purl.obolibrary.org/obo/GO",), "GO"),
    (("http://purl.obolibrary.org/obo/CHR",), "Chromosome"),
    (("http://purl.obolibrary.org/obo/SO",), "Sequence"),
    (("http://purl.obolibrary.org/obo/VO",), "Vaccine"),
    (("http://purl.obolibrary.org/obo/CHEBI",), "Chemical"),
    (("http://purl.obolibrary.org/obo/PR",
      "http://purl.obolibrary.org/obo/vo/ontorat/PR"), "Protein"),
    (("http://purl.obolibrary.org/obo/PW",
      "https://reactome.org/content/detail/"), "Pathway"),
    (("http://purl.obolibrary.org/obo/FOODON",), "Food"),
    (("http://purl.obolibrary.org/obo/MF",), "Mental functioning"),
    (("http://purl.obolibrary.org/obo/OGMS",), "General medical science"),
    (("http://purl.obolibrary.org/obo/MAXO",), "Medical action"),
    (("https://www.ncbi.nlm.nih.gov/snp/",), "Variant (SNP)"),
    (("http://purl.obolibrary.org/obo/NBO",), "Neuro behaviour"),
    (("https://www.genome.gov/genetics-glossary/",), "Biological role"),
    (("http://purl.obolibrary.org/obo/CARO",
      "http://purl.obolibrary.org/obo/UBERON",
      "http://sig.uw.edu/fma",
      "http://purl.obolibrary.org/obo/FMA"), "Anatomy"),
    (("http://purl.obolibrary.org/obo/NCIT",), "NCI thesaurus"),
    (("http://purl.obolibrary.org/obo/FBbt",), "Drosophila anatomy"),
    (("http://purl.obolibrary.org/obo/CL_",), "Cell"),
    (("http://purl.obolibrary.org/obo/CLO",
      "http://www.ebi.ac.uk/cellline"), "Cell line"),
    (("http://purl.obolibrary.org/obo/HP",
      "http://purl.obolibrary.org/obo/PATO",
      "http://purl.obolibrary.org/obo/UPHENO"), "Phenotype"),
    (("http://purl.obolibrary.org/obo/GNO",), "Glycan"),
    (("http://purl.obolibrary.org/obo/BFO",), "Basic formal"),
    (("http://purl.obolibrary.org/obo/ENVO",), "Environment"),
    (("http://purl.obolibrary.org/obo/ECTO",), "Environmental exposure"),
    (("www.ncbi.nlm.nih.gov/gene",
      "http://purl.obolibrary.org/obo/OGG"), "Gene"),
    (("www.ncbi.nlm.nih.gov/Taxonomy/Browser",
      "purl.obolibrary.org/obo/NCBITaxon"), "Species"),
    (("https://www.encodeproject.org/targets",), "Epigenetic modification"),
    (("crdd.osdd.net/raghava/dbem?",), "Histone modification"),
    (("bigdata.ibp.ac.cn/SmProt/SmProt.php?ID",), "Small protein"),
    (("snomedct", "SNOMEDCT"), "snomedct"),
    (("http://www.ebi.ac.uk/efo/EFO",), "Experimental factor"),
    (("http://purl.obolibrary.org/obo/HsapDv",), "Human developmental stage"),
    (("http://www.w3.org/2002/07/owl#Nothing",), "owlNothing"),
)


def _match_uri_rules(uri, rules):
    """Return the node type of the first rule with a substring found in `uri`, else None."""
    for needles, node_type in rules:
        if any(needle in uri for needle in needles):
            return node_type
    return None


def uri2ntype(uri: str)->Union[str,None]:
    """Classify a node URI into an RNA-KG node type.

    The checks are applied in this order and the first hit is returned:

    1. URIs containing exactly one ``gene/<id>?<type>`` pattern: the text after
       the ``?`` with its final character removed is the type ("others",
       "pseudo" and "unknown" are renamed, see ``_NCBI_TYPE_RENAMES``).
    2. Substring rules for RNA resources (``_RNA_URI_RULES``).
    3. Only when the module-level flag ``RNAonly`` is False: substring rules
       for the remaining resources (``_NON_RNA_URI_RULES``), then, for any other
       ``http://purl.obolibrary.org/obo/`` URI, the text between ``obo/`` and
       the first underscore.

    Parameters
    ----------
    uri : str
        Node URI as it appears in the graph file.

    Returns
    -------
    str or None
        The node type, or None when no rule applies.
    """
    # match regular expression for all RNA genes in ncbi format
    pieces = re.split(r"gene/[\w\-]+[?]", uri)
    if len(pieces) == 2:  # exactly one match: the second piece carries the RNA type
        ncbi_type = pieces[1][:-1]  # drop the trailing character (presumably the closing '>')
        return _NCBI_TYPE_RENAMES.get(ncbi_type, ncbi_type)

    # regular expression didn't match -> continue with direct string matching
    rna_type = _match_uri_rules(uri, _RNA_URI_RULES)
    if rna_type is not None or RNAonly:
        return rna_type

    other_type = _match_uri_rules(uri, _NON_RNA_URI_RULES)
    if other_type is not None:
        return other_type

    if "http://purl.obolibrary.org/obo/" in uri:    # all unmapped obo types are dealt with here
        return uri.split("obo/")[1].split('_', 1)[0]

    return None

In [None]:
%%time
# Classify every node URI; tqdm only adds a progress bar around the iteration.
ntypes_list = [uri2ntype(node_uri) for node_uri in tqdm(nodes_df["name"].values)]

nodes_df.loc[:,"type"] = ntypes_list
nodes_df.tail()

In [None]:
# Distinct values of the "type" column; URIs that no rule matched show up as a missing value.
nodes_df['type'].unique()

In [None]:
# Report how many nodes could not be classified, then label them explicitly.
print("Unassigned node types:", nodes_df.type.isna().sum())
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# `nodes_df.type`: with pandas Copy-on-Write that chained in-place call only
# modifies a temporary Series, and the dropna() below would then silently
# discard every unclassified node.
nodes_df["type"] = nodes_df["type"].fillna('undefined')

nodes_df = nodes_df.dropna()
nodes_df.tail()

### Build the edges dataframe

In [None]:
# Collect the distinct predicate URIs of the graph, one row per edge type.
# Series.unique() keeps first-appearance order, whereas the Python set used
# before produced a different row order on every interpreter run.
ety_df = pd.DataFrame({"name": fulldata[colnames[1]].unique()})
ety_df.head()

In [None]:
def camel_case_split(identifier):
    """Break a camelCase identifier into its words.

    A word ends where a lowercase letter is followed by an uppercase one, where
    an uppercase letter is followed by a capitalised word, or at the end of the
    string, e.g. "overexpressedIn" -> ["overexpressed", "In"].
    """
    word_pattern = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    # The pattern has no capturing group, so findall returns the full matches.
    return re.findall(word_pattern, identifier)

# automatically match all OBO items for the specified ontologies
hdr = {'Accept': 'application/json'}
ontos = ["ro","bspo","vo","clo","mondo","ogg","cl","mf"]
tomatch = "http://purl.obolibrary.org/obo/"

# Seconds to wait for one OLS response before moving on to the next ontology.
_OLS_TIMEOUT = 10

# Predicates whose label is set by hand; checked in this order.
_FIXED_EDGE_LABELS = (
    ("http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "Type"),
    ("http://www.w3.org/2000/01/rdf-schema#subClassOf", "SubClass of"),
    ("http://semanticscience.org/resource/SIO_000420", "Has expression"),
    ("http://purl.obolibrary.org/obo/CLO_0054408", "Overexpresses gene"),
    ("http://purl.obolibrary.org/obo/CLO_0054409", "Adenoma formation induced by cell lineage cells in mice"),
)

# Predicates under these prefixes are labelled from the part of the URI after '#'.
_FRAGMENT_LABELLED_PREFIXES = (
    "http://purl.obolibrary.org/obo/uberon/core",
    "http://purl.obolibrary.org/obo/mondo",
    "http://purl.obolibrary.org/obo/so",
    "http://purl.obolibrary.org/obo/envo",
    "http://purl.obolibrary.org/obo/pr",
    "http://purl.obolibrary.org/obo/pato",
    "http://purl.obolibrary.org/obo/pw",
    "http://purl.obolibrary.org/obo/exo.obo",
    "http://purl.obolibrary.org/obo/cl",
    "http://purl.obolibrary.org/obo/nbo",
    "http://purl.obolibrary.org/obo/MF",
    "http://www.obofoundry.org/ro",
    "http://purl.obolibrary.org/obo/chebi",
)


def _ols_property_label(iri):
    """Return the label of property `iri` from the OLS4 API, or None if no ontology yields one."""
    # The previous loop queried every ontology and kept the LAST successful
    # answer; walking the list backwards and stopping at the first hit gives
    # the same precedence with fewer requests.
    for oy in reversed(ontos):
        endpoint = f"https://www.ebi.ac.uk/ols4/api/ontologies/{oy}/properties"
        try:
            # `params` lets requests URL-encode the IRI (a raw '#' would cut the
            # query short) and `headers` sends hdr as HTTP headers; passing hdr
            # positionally made it the query string instead.
            res = requests.get(endpoint, params={"iri": iri}, headers=hdr,
                               timeout=_OLS_TIMEOUT).json()
            return res['_embedded']['properties'][0]['label'].lower().capitalize()
        except (requests.RequestException, ValueError, LookupError, TypeError, AttributeError):
            # Best effort: network failure, non-JSON body or no matching
            # property in this ontology -> try the next one.
            continue
    return None


def uri2etype(uri: str)->Union[str,None]:
    """Return a human-readable label for a predicate (edge type) URI.

    Resolution order (same precedence as before, manual rules override OLS):

    1. Hand-written labels in ``_FIXED_EDGE_LABELS``.
    2. URIs under one of ``_FRAGMENT_LABELLED_PREFIXES`` that contain a ``#``:
       the fragment after ``#`` is split on camel case and underscores and
       capitalised, e.g. ``...#hasPart`` -> "Has part". A matching URI without
       ``#`` used to raise IndexError; it now falls through to step 3.
    3. Lookup of the property label in the EBI OLS4 API over the ontologies
       listed in ``ontos``.

    Parameters
    ----------
    uri : str
        Predicate URI including one leading and one trailing delimiter
        character, which are stripped before lookup (``uri[1:-1]``).

    Returns
    -------
    str or None
        The label, or None when no rule matches and OLS returns nothing.
    """
    for needle, fixed_label in _FIXED_EDGE_LABELS:
        if needle in uri:
            return fixed_label

    iri = uri[1:-1]
    if "#" in iri and any(prefix in uri for prefix in _FRAGMENT_LABELLED_PREFIXES):
        # new manually set edges by splitting over the # symbol of the uri
        fragment = iri.split("#")[1]
        words = ' '.join(camel_case_split(fragment))
        return words.replace('_', ' ').lower().capitalize()

    return _ols_property_label(iri)

In [None]:
%%time
# Resolve a label for every distinct predicate URI; tqdm only adds a progress bar.
etypes_list = [uri2etype(edge_uri) for edge_uri in tqdm(ety_df["name"].values)]

ety_df.loc[:,"type"] = etypes_list
ety_df.tail()

In [None]:
# Count the predicates left without a label. They are only reported:
# no fill or drop is applied to them in this cell.
unassigned_edge_types = ety_df.type.isna().sum()
print(f"Unassigned edge types: {unassigned_edge_types}")

In [None]:
%%time
# add the type column to the original graph structure
edges_df = fulldata.copy()

# Build the predicate -> label lookup once and map the whole column through it.
# The previous per-row lambda rescanned all of ety_df for every edge and fed a
# positional np.where result into label-based Series indexing.
type_by_predicate = dict(zip(ety_df["name"], ety_df["type"]))
edges_df["type"] = edges_df[colnames[1]].map(type_by_predicate)
edges_df.tail()

***
# Neo4j  <a class="anchor" id="neo4j"></a>

In [None]:
# Load the tab-separated NodeLabels file that accompanies the graph. The path is
# derived from fpath/graphname so it cannot drift from the graph file loaded
# during preprocessing (it was previously spelled out a second time).
properties = pd.read_csv(f"{fpath}{graphname}_NodeLabels.txt", sep='\t')

# Keep only the rows that describe nodes.
node_properties = properties[properties['entity_type'] == 'NODES']
node_properties

In [None]:
# Outer-join the typed nodes with their metadata (rows found on either side are
# kept) and retain only the columns that are exported.
neo4jnodes_df = nodes_df.merge(
    node_properties,
    how='outer',
    left_on='name',
    right_on='entity_uri',
).loc[:, ['name', 'type', 'label', 'description/definition', 'synonym']]
neo4jnodes_df

In [None]:
# Metadata rows that describe relations (edge types).
edge_properties = properties[properties['entity_type'] == 'RELATIONS']

# Outer-join the edge types with their metadata; the computed "type" column is
# exported under the name "label".
neo4jedges_df = (
    ety_df
    .merge(edge_properties, how='outer', left_on='name', right_on='entity_uri')
    .loc[:, ['name', 'type', 'description/definition', 'synonym']]
    .rename(columns={'type': 'label'})
)
neo4jedges_df

In [None]:
# Attach the edge-type metadata to every triple of the graph.
# NOTE: this rebinds neo4jedges_df without its 'name' column, so the cell cannot
# be re-run on its own; re-run the previous cell first.
neo4jedges_df = neo4jedges_df.merge(
    edges_df,
    how='outer',
    left_on='name',
    right_on='predicate',
).loc[:, ['subject','predicate','object','label','description/definition','synonym']]
neo4jedges_df

In [None]:
# Write both tables as CSV files without the pandas row index.
neo4jnodes_df.to_csv('nodes.csv', index=False)
neo4jedges_df.to_csv('relationships.csv', index=False)

***
# GRAPE  <a class="anchor" id="grape"></a>

In [None]:
%%time
# Load the typed node and edge tables into a directed GRAPE graph.
# No edge weight column is passed.
directed_graph_options = dict(
    edges_df=edges_df,
    nodes_df=nodes_df,
    node_name_column="name",
    node_type_column="type",
    edge_src_column="subject",
    edge_dst_column="object",
    edge_type_column="type",
    node_types_separator="|",
    directed=True,
    name="graph",
)
graph = Graph.from_pd(**directed_graph_options)

graph

In [None]:
# Save the string representation of the graph next to the knowledge-graph files.
htmlrep = fpath+"RNAgraphReport.html"

# The context manager closes the file even when rendering or writing fails, and
# the explicit encoding makes the output independent of the platform default.
with open(htmlrep, "w", encoding="utf-8") as ff:
    ff.write(str(graph))

In [None]:
# Diameter of the graph currently bound to `graph` (the directed one when cells run top to bottom).
graph.get_diameter()

In [None]:
# Reload the same tables as an undirected graph. This rebinds `graph`, so every
# later cell works on the undirected version. No edge weight column is passed.
undirected_graph_options = dict(
    edges_df=edges_df,
    nodes_df=nodes_df,
    node_name_column="name",
    node_type_column="type",
    edge_src_column="subject",
    edge_dst_column="object",
    edge_type_column="type",
    node_types_separator="|",
    directed=False,
    name="graph",
)
graph = Graph.from_pd(**undirected_graph_options)

# Node2Vec (CBOW) embedding computed with walks of length 5.
engine = Node2VecCBOWEnsmallen(walk_length=5)
embedding = engine.fit_transform(graph)

# Fit the visualizer on the embedding and plot the edge types.
vis = GraphVisualizer(graph)
vis.fit_edges(embedding)
vis.plot_edge_types(k=9)

In [None]:
# Fit the visualizer on the node embedding, then plot the nodes by type.
# NOTE(review): k=9 presumably caps the number of types shown - confirm against the GRAPE docs.
vis.fit_nodes(embedding)
vis.plot_node_types(k=9)

### Predictions using Node2Vec

In [None]:
# Split the graph into a train and a test part (train_size=0.7).
holdout_parts = graph.connected_holdout(train_size=0.7)
train, test = holdout_parts
train.enable()

# Visualize the held-out edges, using the full graph as support.
vis = GraphVisualizer(graph=test, support=graph)

vis.fit_negative_and_positive_edges(embedding)
vis.plot_positive_and_negative_edges()