In [2]:
import sys
sys.path.append("..")

%matplotlib inline
%load_ext autoreload
%autoreload 1
%aimport pyfantom.network_tools

from orangecontrib.bio.ontology import OBOOntology, OBOObject
import networkx as nx
from pylab import * 
from itertools import repeat
import pandas as pd
import re
from pprint import pprint

from pyfantom.network_tools import *
pd.set_option('display.max_colwidth', -1)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Improving the Ontology Network

Based on our findings in `explore_ontology.ipynb`, this notebook deals with improving the network built from the ontology.
In particular, we will
* Add the missing entries
* deal with the singletons

Load Ontology and Annotation:

In [3]:
# Reference copy of the ontology; this notebook only reads from it.
obo = OBOOntology()
with open("../data/ff-phase2-140729.obo") as obo_file:
    obo.load(obo_file)

## Obo instance to which we will write the corrections
obo_corr = OBOOntology()
with open("../data/ff-phase2-140729.obo") as obo_file:
    obo_corr.load(obo_file)

# Per-sample annotation table; the first CSV column becomes the index.
col_vars = pd.read_csv("../data/column_vars.processed.csv", index_col=0)

Define ontology constants:

In [4]:
# Value of the `subset` tag attached to the missing samples and to the manually
# created meta nodes, so the corrections can be identified later.
UPDATE = "correction_gs01"
# (id, middle slot, name) of the root term. The tuple is unpacked as the trailing
# arguments of add_tag(IS_A, ...); HUMAN_SAMPLE[0] alone is used as the root id.
# NOTE(review): the None presumably fills add_tag's modifier slot — confirm.
HUMAN_SAMPLE = ("FF:0000210", None, "human sample")
# Tag names used when reading or writing ontology terms.
IS_A = "is_a"
RELATIONSHIP = "relationship"
SUBSET = "subset"

## Add the missing entries
(They will show up as singletons in the next step, where we add them to their corresponding groups.)

In [5]:
# Read the file with plain Python instead of `!cat`: a missing file now raises
# instead of handing the shell's error text on as if it were sample lines.
with open("../data/missing_samples.txt") as samples_file:
    missing_samples = samples_file.read().splitlines()

# The id is the first whitespace-separated token of each line minus its last
# character (presumably a trailing ':' — TODO confirm against the file).
# Blank lines are skipped; they would otherwise raise an IndexError.
missing_sample_ids = [line.split()[0][:-1] for line in missing_samples if line.strip()]

In [6]:
# Attach every missing sample directly to the 'human sample' root and mark it
# with the correction subset.
# NOTE(review): re-running this cell presumably adds the same tags again —
# confirm add_tag semantics before relying on repeated execution.
for obo_id in missing_sample_ids:
    term = obo_corr.term(obo_id)
    term.add_tag(SUBSET, UPDATE)
    # Expands to add_tag("is_a", "FF:0000210", None, "human sample").
    term.add_tag(IS_A, *HUMAN_SAMPLE)

In [7]:
# Write through a context manager so the file is flushed and closed
# deterministically instead of whenever the handle gets garbage-collected.
with open("../data/tmp/ff-phase2-140729.corr.2.obo", 'w') as corr_file:
    obo_corr.write(corr_file)

## Inspect Singletons
What is a singleton? 

Every sample (FF:#####-#####) that is a direct descendant of human sample appears in the network as a singleton. 

Get these samples:

In [8]:
# Ids of the terms returned by child_terms for the 'human sample' root ...
human_sample_child_ids = [child.id for child in obo_corr.child_terms(HUMAN_SAMPLE[0])]
# ... and the annotation rows that belong to them.
singletons_df = col_vars[col_vars.obo_id.isin(human_sample_child_ids)]

In [9]:
len(singletons_df)

568

#### Most of the singletons do actually have a superclass
These superclasses are just not associated with 'human sample'; thus, these classes appear as singletons in the network.

In [10]:
# List the is_a parents of every singleton, prefixed with the row number of the sample.
for idx, sample_id in enumerate(singletons_df.obo_id.values):
    is_a_tags = (entry for entry in obo.term(sample_id).tags() if entry[0] == IS_A)
    for tag, value, _, comment in is_a_tags:
        print(idx, tag, value, comment)

0 is_a FF:0000210 human sample
0 is_a FF:0000357 0 hr sample
0 is_a FF:0200008 293SLAM rinderpest infection sample
1 is_a FF:0000210 human sample
1 is_a FF:0000357 0 hr sample
1 is_a FF:0200008 293SLAM rinderpest infection sample
2 is_a FF:0000210 human sample
2 is_a FF:0000357 0 hr sample
2 is_a FF:0200008 293SLAM rinderpest infection sample
3 is_a FF:0000210 human sample
3 is_a FF:0000363 6 hr sample
3 is_a FF:0200008 293SLAM rinderpest infection sample
4 is_a FF:0000210 human sample
4 is_a FF:0000363 6 hr sample
4 is_a FF:0200008 293SLAM rinderpest infection sample
5 is_a FF:0000210 human sample
5 is_a FF:0000363 6 hr sample
5 is_a FF:0200008 293SLAM rinderpest infection sample
6 is_a FF:0000210 human sample
6 is_a FF:0000367 12 hr sample
6 is_a FF:0200008 293SLAM rinderpest infection sample
7 is_a FF:0000210 human sample
7 is_a FF:0000367 12 hr sample
7 is_a FF:0200008 293SLAM rinderpest infection sample
8 is_a FF:0000210 human sample
8 is_a FF:0000367 12 hr sample
8 is_a FF:020000

If we integrate the human subnetworks of these superclasses into our network, we get rid of most of the singletons. 
We do that by adding all meta nodes that are *between* the three 'root nodes' *cell line samples*, *tissue samples* and *in vivo cell samples* and the actual sample. 

We must not add nodes that are 'beyond' the 'root nodes', because otherwise we end up with a blown-up network including all terms down to the taxa. 

In [11]:
graph = build_tree(obo_corr, HUMAN_SAMPLE[0], graph=nx.Graph())

In [12]:
# Iterate over a snapshot of the nodes: add_superelements_to_graph is expected to
# add meta nodes to `graph`, and changing a graph while looping over its live
# node view raises a RuntimeError on networkx >= 2.0.
for node in list(graph.nodes()):
    if is_sample_id(node):
        add_superelements_to_graph(obo_corr, node, graph, delimiter_nodes=("FF:0000002", "FF:0000004", "FF:0000003"))

Fill unknown sample types with 'unknown' for better node coloring in cytoscape

In [13]:
# Assign the filled column back explicitly. `inplace=True` on a column reached via
# attribute access operates on an intermediate object and is not guaranteed to
# update `col_vars` (chained assignment / copy-on-write in recent pandas).
col_vars["sample_type"] = col_vars["sample_type"].fillna("unknown")

#### Export the new network for inspection in cytoscape

In [14]:
# Drop the root so the samples are not all connected through 'human sample'.
# The membership check keeps the cell re-runnable: remove_node raises
# NetworkXError when the node has already been removed.
if HUMAN_SAMPLE[0] in graph:
    graph.remove_node(HUMAN_SAMPLE[0])
graph_ = annotate_graph(graph, col_vars)
graph_ = relabel_nodes(obo, graph_)

In [15]:
nx.write_graphml(graph_, "../networks/improve_network/human_samples_extended.graphml")

## Singletons revisited (I)

We found that there is another annotation, `relationship`,
that we could use to improve our network hierarchy.

In [16]:
def is_singleton(graph, node_id):
    """Return True when `node_id` has no neighbours in `graph`, False otherwise."""
    neighbours = graph[node_id]
    return len(neighbours) == 0

In [17]:
# Nodes without any edge after the first extension of the network.
singleton_ids = [node for node in graph.nodes() if is_singleton(graph, node)]
singletons_df = col_vars[col_vars.obo_id.isin(singleton_ids)]
len(singletons_df)

212

In [18]:
# Collect (value, comment) of every is_a / relationship tag of the remaining singletons.
superclasses = [
    (value, comment)
    for obo_id in singletons_df.obo_id.values
    for tag, value, _, comment in obo.term(obo_id).tags()
    if tag in (IS_A, RELATIONSHIP)
]
# Tally how often each superclass occurs, sorted by count.
sorted((superclasses.count(entry), *entry) for entry in set(superclasses))

[(1, 'FF:0000380', '0 days sample'),
 (1, 'derives_from CL:0000134', 'mesenchymal cell'),
 (1, 'derives_from UBERON:0000014', 'zone of skin'),
 (1, 'derives_from UBERON:0000055', 'vessel'),
 (1, 'derives_from UBERON:0001764', 'maxillary sinus'),
 (1, 'derives_from UBERON:0001872', 'parietal lobe'),
 (1, 'derives_from UBERON:0001873', 'caudate nucleus'),
 (1, 'derives_from UBERON:0001874', 'putamen'),
 (1, 'derives_from UBERON:0001875', 'globus pallidus'),
 (1, 'derives_from UBERON:0001876', 'amygdala'),
 (1, 'derives_from UBERON:0001896', 'medulla oblongata'),
 (1, 'derives_from UBERON:0001897', 'dorsal plus ventral thalamus'),
 (1, 'derives_from UBERON:0001954', "Ammon's horn"),
 (1, 'derives_from UBERON:0002021', 'occipital lobe'),
 (1, 'derives_from UBERON:0002037', 'cerebellum'),
 (1, 'derives_from UBERON:0002038', 'substantia nigra'),
 (1, 'derives_from UBERON:0002148', 'locus ceruleus'),
 (1, 'derives_from UBERON:0002240', 'spinal cord'),
 (1, 'derives_from UBERON:0002771', 'midd

There are still superclasses, e.g. `iPS differentiation to neuron, down-syndrome donor sample` or `rinderpest infection sample`. Why are these not captured?

=> These classes are not in an `is_a` relationship with any sample, but in a `develops_from`/`derives_from` relationship with GO terms.
We add these to the network.

In [19]:
!grep relationship ../data/ff-phase2-140729.obo | cut -d" " -f1,2 | sort -u

relationship: derives_from
relationship: develops_from
relationship: immediate_transformation_of
relationship: is_model_for
relationship: part_of
relationship: treated_with


### To capture most of it, we just add some manually

In [20]:
[(term.id, term.name) for term in obo_corr.parent_terms("FF:0200009")]

[('FF:0200010', 'rinderpest infection sample')]

In [21]:
[(term.id, term.name) for term in obo_corr.parent_terms("FF:0200008")]

[('FF:0200010', 'rinderpest infection sample')]

In [22]:
[(term.id, term.name) for term in obo_corr.parent_terms("FF:0200006")]

[('EFO:0001461', 'control'),
 ('EFO:0001746', 'development or differentiation design'),
 ('CL:0000056', 'myoblast'),
 ('CL:0002372', 'myotube')]

In [23]:
[(term.id, term.name) for term in obo_corr.parent_terms("FF:0200003")]

[('EFO:0001461', 'control'),
 ('FF:0200005', 'iPS differentiation to neuron sample')]

In [24]:
[(term.id, term.name) for term in obo_corr.parent_terms("FF:0200004")]

[('FF:0200005', 'iPS differentiation to neuron sample'),
 ('DOID:14250', 'Down syndrome')]

In [27]:
graph2 = graph.copy()

In [28]:
# Additional classes that act as (inclusive) upper bounds when climbing the ontology.
delimiter_nodes = ["FF:0200009", "FF:0200006", "FF:0200005", "FF:0200008", "FF:0300101"]
# Iterate over a snapshot: the helper is expected to add nodes to `graph2`, and
# mutating a graph while looping over its live node view fails on networkx >= 2.0.
for node in list(graph2.nodes()):
    if is_sample_id(node):
        add_superelements_to_graph(obo_corr, node, graph2, delimiter_nodes=delimiter_nodes, inclusive=True)

## Singletons revisited (II)
However there are some where the proper superclass is really missing (lack of annotation). 
We will create these superclasses manually. 

In [29]:
# Nodes that are still unconnected after the second extension.
singleton_ids = [node for node in graph2.nodes() if is_singleton(graph2, node)]
singletons_df = col_vars[col_vars.obo_id.isin(singleton_ids)]
len(singletons_df)

66

In [30]:
# Collect (value, comment) of every is_a / relationship tag of the remaining singletons.
superclasses = [
    (value, comment)
    for obo_id in singletons_df.obo_id.values
    for tag, value, _, comment in obo.term(obo_id).tags()
    if tag in (IS_A, RELATIONSHIP)
]
# Tally how often each superclass occurs, sorted by count.
sorted((superclasses.count(entry), *entry) for entry in set(superclasses))

[(1, 'derives_from CL:0000134', 'mesenchymal cell'),
 (1, 'derives_from UBERON:0000014', 'zone of skin'),
 (1, 'derives_from UBERON:0000055', 'vessel'),
 (1, 'derives_from UBERON:0001764', 'maxillary sinus'),
 (1, 'derives_from UBERON:0001872', 'parietal lobe'),
 (1, 'derives_from UBERON:0001873', 'caudate nucleus'),
 (1, 'derives_from UBERON:0001874', 'putamen'),
 (1, 'derives_from UBERON:0001875', 'globus pallidus'),
 (1, 'derives_from UBERON:0001876', 'amygdala'),
 (1, 'derives_from UBERON:0001896', 'medulla oblongata'),
 (1, 'derives_from UBERON:0001897', 'dorsal plus ventral thalamus'),
 (1, 'derives_from UBERON:0001954', "Ammon's horn"),
 (1, 'derives_from UBERON:0002021', 'occipital lobe'),
 (1, 'derives_from UBERON:0002037', 'cerebellum'),
 (1, 'derives_from UBERON:0002038', 'substantia nigra'),
 (1, 'derives_from UBERON:0002148', 'locus ceruleus'),
 (1, 'derives_from UBERON:0002240', 'spinal cord'),
 (1, 'derives_from UBERON:0002771', 'middle temporal gyrus'),
 (1, 'derives_fr

In [31]:
# The root may already be missing from graph2, so only remove it when present.
if HUMAN_SAMPLE[0] in graph2:
    graph2.remove_node(HUMAN_SAMPLE[0])
graph2_annotated = annotate_graph(graph2, col_vars)
graph2_ = relabel_nodes(obo, graph2_annotated)

In [32]:
nx.write_graphml(graph2_, "../networks/improve_network/human_samples_extended.2.graphml")

* The CD4 cells: we would need to classify them manually.
* The smooth muscle cells: can be annotated using the derives_from ontology
* `derives_from` could be interesting anyway

In [33]:
singletons_df.to_csv("../data/tmp/singletons.tsv", sep="\t", na_rep="na")

### Create Meta Nodes manually
There are some samples, where the proper meta-nodes are missing in the ontology (mainly T-Cell samples).
Some of them are even annotated with a comment that the annotation is not complete. 

I will manually create these meta-nodes here and add them.

In [34]:
# NOTE(review): this copy is never read — graph3 is rebuilt from scratch with
# build_tree(...) further down before its first use.
graph3 = graph2.copy()

In [35]:
def create_meta_node(name, obo, id_cntr, parent_ids=()):
    """Create a manually curated meta node and register it in `obo`.

    The new term gets the id ``GS:<id_cntr zero-padded to four digits>``, a
    ``subset`` tag with the value of ``UPDATE`` and one ``is_a`` tag per entry
    of `parent_ids`, each commented with the parent's name.

    Parameters
    ----------
    name : str
        Name of the new term.
    obo : ontology object
        Ontology the term is added to; also used to look up the parent names.
    id_cntr : int
        Counter used to build the ``GS:`` id.
    parent_ids : iterable of str, optional
        Ids of terms in `obo` that become ``is_a`` parents of the new term.

    Returns
    -------
    The newly created term, or, if registering it fails because the id is
    already taken, the term stored in `obo` under that id (a warning is printed).
    """
    new_id = "GS:{:04d}".format(id_cntr)
    term = OBOObject("Term", id=new_id, name=name)
    term.add_tag(SUBSET, UPDATE)
    for p_id in parent_ids:
        term.add_tag(IS_A, p_id, comment=obo.term(p_id).name)
    # Guard only the registration: a ValueError raised while building the term
    # (e.g. by the parent look-up) must not be reported as a duplicate id.
    # add_object is assumed to raise ValueError for an already registered id,
    # which is what the warning below has always claimed.
    try:
        obo.add_object(term)
    except ValueError:
        print("WARNING: Node with id already exists in ontology.")
        return obo.term(new_id)
    return term

In [36]:
# Meta node for the naive regulatory T cells; parents are 'human sample'
# and FF:0000031 (CD 4 positive T cell sample).
cd4_1 = create_meta_node(
    "human CD4-positiveCD25-positiveCD45RA-positive naive regulatory T cells sample",
    obo_corr,
    1,
    [HUMAN_SAMPLE[0], "FF:0000031"],
)
# Attach the samples of this group to the new meta node.
for sample_id in ("FF:11793-124C2", "FF:11796-124C5", "FF:11907-125F8"):
    obo_corr.term(sample_id).add_tag(IS_A, cd4_1.id, comment=cd4_1.name)

In [37]:
# Meta node for the memory regulatory T cells; parents are 'human sample'
# and FF:0000031 (CD 4 positive T cell sample).
cd4_2 = create_meta_node(
    "human CD4-positiveCD25-positiveCD45RA- memory regulatory T cells sample",
    obo_corr,
    2,
    [HUMAN_SAMPLE[0], "FF:0000031"],
)
# Attach the samples of this group to the new meta node.
for sample_id in ("FF:11794-124C3", "FF:11916-125G8", "FF:11920-125H3", "FF:11797-124C6", "FF:11908-125F9"):
    obo_corr.term(sample_id).add_tag(IS_A, cd4_2.id, comment=cd4_2.name)

In [38]:
# Meta node for the naive conventional T cells; parents are 'human sample'
# and FF:0000031 (CD 4 positive T cell sample).
cd4_3 = create_meta_node(
    "human CD4-positiveCD25-CD45RA-positive naive conventional T cells sample",
    obo_corr,
    3,
    [HUMAN_SAMPLE[0], "FF:0000031"],
)
# Attach the samples of this group to the new meta node.
for sample_id in (
    "FF:11791-124B9",
    "FF:11913-125G5",
    "FF:11917-125G9",
    "FF:11784-124B2",
    "FF:11795-124C4",
    "FF:11906-125F7",
):
    obo_corr.term(sample_id).add_tag(IS_A, cd4_3.id, comment=cd4_3.name)

In [39]:
# Meta node for the memory conventional T cells; parents are 'human sample'
# and FF:0000031 (CD 4 positive T cell sample).
cd4_4 = create_meta_node(
    "human CD4-positiveCD25-CD45RA- memory conventional T cells sample",
    obo_corr,
    4,
    [HUMAN_SAMPLE[0], "FF:0000031"],
)
# Attach the samples of this group to the new meta node.
for sample_id in ("FF:11792-124C1", "FF:11798-124C7", "FF:11909-125G1"):
    obo_corr.term(sample_id).add_tag(IS_A, cd4_4.id, comment=cd4_4.name)


In [40]:
# Meta node for the airway smooth muscle cells; parents are 'human sample'
# and FF:0000167 (smooth muscle cell sample).
smc_1 = create_meta_node(
    "human airway smooth muscle cell sample",
    obo_corr,
    5,
    [HUMAN_SAMPLE[0], "FF:0000167"],
)
# Attach the samples of this group to the new meta node.
for sample_id in (
    "FF:11960-126C7",
    "FF:11961-126C8",
    "FF:11962-126C9",
    "FF:11963-126D1",
    "FF:11964-126D2",
    "FF:11965-126D3",
    "FF:11966-126D4",
    "FF:11967-126D5",
    "FF:11968-126D6",
    "FF:11969-126D7",
):
    obo_corr.term(sample_id).add_tag(IS_A, smc_1.id, comment=smc_1.name)



Replicates, where the annotation has been forgotten: 

In [41]:
# human frontal lobe - adult sample
frontal_lobe = obo_corr.term("FF:0010040")
# medial frontal gyrus, adult, donor10252
replicate = obo_corr.term("FF:10150-102I6")
replicate.add_tag(IS_A, frontal_lobe.id, comment=frontal_lobe.name)

### Build the network (from scratch, again) 

In [42]:
graph3 = build_tree(obo_corr, HUMAN_SAMPLE[0], graph=nx.Graph())

In [43]:
# Root classes that bound the climb through the ontology
# (the original line assigned the same tuple twice: `delimiter_nodes = delimiter_nodes=(...)`).
delimiter_nodes = ("FF:0000002", "FF:0000004", "FF:0000003")
# Iterate over a snapshot: the helper is expected to add nodes to `graph3`, and
# mutating a graph while looping over its live node view fails on networkx >= 2.0.
for node in list(graph3.nodes()):
    if is_sample_id(node):
        add_superelements_to_graph(obo_corr, node, graph3, delimiter_nodes)

In [44]:
# Additional classes that act as (inclusive) upper bounds when climbing the ontology.
delimiter_nodes = ["FF:0200009", "FF:0200006", "FF:0200005", "FF:0200008", "FF:0300101"]
# Iterate over a snapshot: the helper is expected to add nodes to `graph3`, and
# mutating a graph while looping over its live node view fails on networkx >= 2.0.
for node in list(graph3.nodes()):
    if is_sample_id(node):
        add_superelements_to_graph(obo_corr, node, graph3, delimiter_nodes=delimiter_nodes, inclusive=True)

In [45]:
# Remove the root only when it is present, so the cell can be executed repeatedly.
if HUMAN_SAMPLE[0] in graph3:
    graph3.remove_node(HUMAN_SAMPLE[0])
graph3_annotated = annotate_graph(graph3, col_vars)
# Relabel with the corrected ontology, which also contains the manually created meta nodes.
graph3_ = relabel_nodes(obo_corr, graph3_annotated)

In [46]:
nx.write_graphml(graph3_, "../networks/improve_network/human_samples_extended.3.graphml")

In [47]:
# Write through a context manager so the corrected ontology is flushed and the
# file closed deterministically.
with open("../data/ff-phase2-140729.corr.obo", 'w') as corr_file:
    obo_corr.write(corr_file)

## Conclusion
We tried our very best to fix the bugs in the annotation; however, it is unlikely that our analysis revealed and corrected all mistakes. So, in a next step, we have to do some computational outlier detection.