In [14]:
import sys
sys.path.append("..")

%matplotlib inline
%load_ext autoreload
%autoreload 1
%aimport pyfantom.network_tools

from orangecontrib.bio.ontology import OBOOntology, OBOObject
import networkx as nx
from pylab import * 
from itertools import repeat
import pandas as pd
import re
from pprint import pprint
from collections import Counter

from pyfantom.network_tools import *
# Show full cell contents when displaying DataFrames (no column-width truncation).
# NOTE(review): newer pandas releases reportedly expect None instead of a
# negative value here — confirm against the installed pandas version.
pd.set_option('display.max_colwidth', -1)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Load a tab-separated delimiter-node table from disk, indexed by its first column.
# NOTE(review): this cell has no execution count and `delim_df` is rebuilt from
# the ontology further down — presumably this is the manually annotated copy
# and the cell is meant to run after the table has been exported; confirm.
delim_df = pd.read_csv("../data/delimiter_nodes.2.csv", index_col=0, sep='\t')

# Create a Cytoscape Network from the Ontology
We want to create a network that is as comprehensive as possible. For that we use our tweaked ontology from 
the previous notebook. 

In [7]:
# Load the tweaked ontology from the previous notebook. The context manager
# closes the file handle after loading instead of leaking it
# (assumes OBOOntology.load consumes the file eagerly — TODO confirm).
obo_corr = OBOOntology()
with open("../data/ff-phase2-140729.corr.obo") as obo_file:
    obo_corr.load(obo_file)

# Per-sample table; its `obo_id` column holds the ontology id of each sample.
col_vars = pd.read_csv("../data/column_vars.processed.csv", index_col=0)

In [8]:
# Ontology id of the generic "human sample" term.
HUMAN_SAMPLE = "FF:0000210"

Let's check if our manual entries are in there:

In [9]:
# GS:0001 is one of our manually added entries: list its child terms.
obo_corr.child_terms("GS:0001")

{OBOObject(id='FF:11796-124C5', name=CD4+CD25+CD45RA+ naive regulatory T cells, donor2, ...),
 OBOObject(id='FF:11793-124C2', name=CD4+CD25+CD45RA+ naive regulatory T cells expanded, donor1, ...),
 OBOObject(id='FF:11907-125F8', name=CD4+CD25+CD45RA+ naive regulatory T cells, donor3, ...)}

In [10]:
# Reverse check: list the parent terms of one sample; the manual GS entry
# should show up among them.
obo_corr.parent_terms("FF:11796-124C5")

{OBOObject(id='EFO:0002091', name=biological replicate, ...),
 OBOObject(id='FF:0000002', name=in vivo cell sample, ...),
 OBOObject(id='GS:0001', name=human CD4-positiveCD25-positiveCD45RA-positive naive regulatory T cells sample, ...),
 OBOObject(id='FF:0000210', name=human sample, ...)}

## Take advantage of all relations
We know from the previous notebook that there are more relations than 'is_a'. Especially relevant are the 'derives_from' and 'develops_from' annotations, as we are interested in a hierarchy of cell types.

#### What kind of entities do we have in the Ontology?
(what comes before the colon in the ids) 

In [13]:
# Root terms of the corrected ontology; their id prefixes show which kinds of
# entities it contains.
obo_corr.root_terms()

[OBOObject(id='BFO:0000040', name=material entity, ...),
 OBOObject(id='DOID:4', name=disease, ...),
 OBOObject(id='EFO:0000001', name=experimental factor, ...),
 OBOObject(id='FF:0000001', name=sample, ...),
 OBOObject(id='GO:0005575', name=cellular_component, ...),
 OBOObject(id='NCBITaxon:1', name=root, ...),
 OBOObject(id='UBERON:0001062', name=anatomical entity, ...)]

In [5]:
!cat ../data/ff-phase2-140729.obo | grep -e "^id" | cut -d" " -f2 | cut -d":" -f1 | sort | uniq -c | sort -nr

   4144 FF
   1225 UBERON
    472 CL
    208 DOID
     40 CHEBI
     38 NCBITaxon
     20 EFO
      6 UO
      2 OBI
      2 GO
      2 BFO
      1 treated_with
      1 SpecificallyDependentContinuant
      1 Role
      1 ProcessualEntity
      1 part_of
      1 NCBITaxon_2759
      1 NCBITaxon_10088
      1 MaterialEntity
      1 is_model_for
      1 immediate_transformation_of
      1 IAO
      1 Disposition
      1 develops_from
      1 derives_from
      1 CL_0002321
      1 CL_0000057
      1 CL_0000056


We note the `UBERON` and `CL` (=cell) entities. 

[Uberon](http://uberon.github.io/) is an ontology of anatomical entities, used here to annotate cell compartments. 
Cell entities derive from uberon entities at a certain level. 

## Create the network bottom up
* add all parent nodes of *all* samples (-> we can't miss samples like that) 
* include all relations, not only is_a and not only descendants of human_sample (otherwise we would lose the UBERON cell compartment annotation) 
* we need a useful stopping criterion (otherwise we blow up the network with useless information)

## $\Rightarrow$ Create List of *delimiter nodes*
We curate a list of nodes that mark a stopping point in their respective branch. I.e. we deem any information closer to the root than these nodes *non-informative*. This is the case if
* The node is ubiquitous, i.e. applies to all samples (-> no information gain) 
* The node represents irrelevant information (e.g. time units)

Count, for each term, the number of samples it applies to: 

In [17]:
# For every sample, walk over all of its super terms and count how often each
# term id is encountered across the whole sample table.
cntr = Counter(
    term.id
    for obo_id in col_vars.obo_id
    for term in obo_corr.super_terms(obo_id)
)

Make a dataframe out of it, mark all ubiquitous nodes as delimiter: 

In [47]:
# One row per term, most frequent first. A term reached from at least as many
# samples as there are rows in col_vars is flagged as a delimiter (1), every
# other term starts out unflagged (0).
delim_df = pd.DataFrame(
    [
        {
            "obo_id": term_id,
            "name": obo_corr.term(term_id).name,
            "count": n_hits,
            "delimiter": int(n_hits >= len(col_vars)),
        }
        for term_id, n_hits in cntr.most_common()
    ]
).set_index("obo_id")

In [48]:
# Order the table by descending sample count and peek at the top rows.
delim_df = delim_df.sort_values(by="count", ascending=False)
delim_df.head()

Unnamed: 0_level_0,count,delimiter,name
obo_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BFO:0000040,1829,1,material entity
NCBITaxon:33154,1829,1,Opisthokonta
NCBITaxon:33213,1829,1,Bilateria
NCBITaxon:6072,1829,1,Eumetazoa
NCBITaxon:117570,1829,1,Teleostomi


#### Mark time units as delimiters: 

In [49]:
# Regular expressions matching term names that only encode a time unit.
# Raw strings avoid the invalid "\d" escape in a normal string literal, and the
# patterns carry no capture groups because they are only used for matching
# (groups make pandas' str.contains emit a "match groups" warning).
exclude_patterns = [
    r"\d+ days sample$",
    r"\d+ minutes sample$",
    r"\d+ hr",
]

In [50]:
# Flag every term whose name matches one of the time-unit patterns.
for pattern in exclude_patterns:
    is_time_unit = delim_df["name"].str.contains(pattern)
    delim_df.loc[is_time_unit, "delimiter"] = 1

  from ipykernel import kernelapp as app


#### Exclude all terms that are not part of the ontologies relevant for us: 

In [51]:
# Flag every term whose id does not start with one of the relevant ontology
# prefixes. The group is non-capturing so that str.contains does not warn
# about unused match groups; the set of matched ids is unchanged.
delim_df.loc[~delim_df.index.str.contains(r"^(?:FF|UBERON|GS|CL)"), 'delimiter'] = 1

  if __name__ == '__main__':


### We manually annotate the delimiter nodes in the csv file. 

In [52]:
# Export the table as tab-separated text (the starting point for the manual
# annotation) and read it straight back in, indexed by its first column.
delim_df.to_csv(path_or_buf="../data/delimiter_nodes.tsv", sep='\t')
delim_df = pd.read_csv(
    "../data/delimiter_nodes.tsv",
    index_col=0,
    sep="\t",
)

In [142]:
# Load the ontology file used for the network. The context manager closes the
# file handle after loading instead of leaking it
# (assumes OBOOntology.load consumes the file eagerly — TODO confirm).
obo2 = OBOOntology()
with open("../data/ff-phase2-140729.corr.3.obo") as obo_file:
    obo2.load(obo_file)

# Follow all relation types of interest, not only is_a.
obo2.set_rel_type(["is_a", "derives_from", "develops_from", "part_of"])

# Seed the graph with one node per primary-cell sample.
graph2 = nx.Graph()
graph2.add_nodes_from(col_vars[col_vars.sample_type == "primary cell"].obo_id)

In [143]:
def filter_fun(obo_id):
    """Tell whether an ontology id belongs to one of the ontologies we keep.

    The prefix is everything before the first ``:`` of ``obo_id`` (the whole
    string when it contains no colon).

    Returns:
        True if the prefix is one of FF, UBERON, GO, GS or CL, else False.
    """
    prefix, _, _ = obo_id.partition(":")
    return prefix in ("FF", "UBERON", "GO", "GS", "CL")

In [144]:
# Quick check of filter_fun on a CL id.
filter_fun("CL:0002564")

True

In [145]:
# Iterate over a snapshot of the current nodes: the helper is handed graph2 and
# the graph holds additional, non-sample nodes afterwards, so looping over the
# live graph2.nodes() would fail with "dictionary changed size during
# iteration" on networkx versions where nodes() returns a view.
# NOTE(review): `delimiter_nodes` is not assigned in any visible cell —
# presumably it is derived from the annotated delimiter table; confirm.
for node in list(graph2.nodes()):
    add_superelements_to_graph2(obo2, node, graph2, delimiter_nodes)

In [146]:
# List every node of the graph whose id does not start with "FF".
for node in graph2.nodes():
    if node.startswith("FF"):
        continue
    print(node)

UBERON:0004852
UBERON:0004809
UBERON:0004802
UBERON:0000119
UBERON:0011215
UBERON:0001621
UBERON:0010409
UBERON:0001456
UBERON:0000475
UBERON:0003914
UBERON:0004638
UBERON:0003928
CL:0000058
CL:0000775
UBERON:0004198
CL:0002304
UBERON:0002367
CL:0000147
CL:0000790
UBERON:0000949
UBERON:0000473
CL:0002503
UBERON:0001911
CL:0000226
CL:0002433
UBERON:0002513
UBERON:0001768
UBERON:0000353
UBERON:0001895
EFO:0001461
DOID:450
CL:0000557
UBERON:0006915
CL:0000569
CL:0000056
UBERON:0008816
UBERON:0002371
CL:0000151
UBERON:0005095
UBERON:0002048
UBERON:0001529
UBERON:0004573
UBERON:0005749
UBERON:2005260
UBERON:0004732
UBERON:0002100
UBERON:0010532
CL:0005012
CL:0002579
DOID:934
UBERON:0001803
CL:0000239
CL:0000650
UBERON:0011095
CL:0000223
DOID:0080000
UBERON:0004175
CL:0000186
UBERON:0007503
UBERON:0005908
UBERON:0002330
UBERON:0001981
DOID:0050557
CL:1000494
UBERON:0011137
UBERON:0005498
CL:0002077
UBERON:0006444
CL:0002242
UBERON:0008780
DOID:0050560
CL:0002494
CL:0002584
UBERON:0013702
CL:

In [147]:
# Remove the generic "human sample" node if the graph contains it; a missing
# node is not an error.
if graph2.has_node(HUMAN_SAMPLE):
    graph2.remove_node(HUMAN_SAMPLE)

# Annotate the graph using col_vars, then relabel its nodes using obo2.
graph2_ = relabel_nodes(obo2, annotate_graph(graph2, col_vars))

In [148]:
# Export the annotated, relabelled graph as GraphML.
# NOTE(review): the ontology_network/ directory is not created in any visible
# cell — presumably it has to exist beforehand; confirm.
nx.write_graphml(graph2_, "ontology_network/human_primary_bottom_up_all_rels.graphml")

In [26]:
# Root terms of obo2.
# NOTE(review): this cell's execution count (26) is lower than that of the cell
# creating obo2 (142) — re-run to make sure the shown output is current.
obo2.root_terms()

[OBOObject(id='BFO:0000040', name=material entity, ...),
 OBOObject(id='DOID:4', name=disease, ...),
 OBOObject(id='EFO:0000001', name=experimental factor, ...),
 OBOObject(id='FF:0000001', name=sample, ...),
 OBOObject(id='GO:0005575', name=cellular_component, ...),
 OBOObject(id='NCBITaxon:1', name=root, ...),
 OBOObject(id='UBERON:0001062', name=anatomical entity, ...)]

relevant are CL: ( GO:0005575 )
FF
UBERON 

make supertrees for the three entities
* FF is known