In [1]:
import math
import os
import random
import tempfile

import numpy as np
import pandas as pd
from ddot import Ontology

In [2]:
def load_hierarchy_network(network_name='../data/NeST/NeST'):
    """Load the node and edge tables of a hierarchy network.

    Parameters
    ----------
    network_name : str, optional
        Path prefix of the network files. ``<network_name>_node.csv`` and
        ``<network_name>_edge.sif`` are read. Defaults to the NeST files
        under ``../data/NeST``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``nodes_df`` restricted to the ``name`` and ``Genes`` columns, and
        ``edges_df`` with the three columns ``S``, ``M`` and ``T`` assigned
        positionally to the tab-separated, header-less edge file.
    """
    nodes_df = pd.read_csv(network_name + '_node.csv')[['name', 'Genes']]

    # The .sif file carries no header row, so column names are supplied here.
    edges_df = pd.read_csv(
        network_name + '_edge.sif',
        sep='\t',
        header=None,
        names=['S', 'M', 'T'],
    )

    return nodes_df, edges_df

In [3]:
def filter_ontology(ont, ont_file, min_genes=3):
    """Normalise an ontology, drop small terms and write it to disk.

    Parameters
    ----------
    ont : ddot.Ontology
        Ontology to filter. It is modified in place by the ``inplace=True``
        calls below.
    ont_file : str
        Destination path; the result is written with
        ``to_table(..., clixo_format=True)``.
    min_genes : int, optional
        Terms whose ``term_2_gene`` entry holds fewer than this many genes
        are deleted. Defaults to 3.

    Returns
    -------
    ddot.Ontology
        The object returned by ``collapse_ontology`` after the small terms
        have been deleted from it.
    """
    roots = ont.get_roots()
    if len(roots) > 1:
        # Join the roots under one super-root, then delete the original
        # roots themselves.
        # NOTE(review): presumably delete() re-attaches their children to
        # NEST_ROOT -- confirm against the ddot documentation.
        ont.add_root("NEST_ROOT", inplace=True)
        ont.delete(to_delete=roots, inplace=True)

    # NOTE(review): 'reverse' propagation followed by collapse is presumably
    # meant to remove redundant annotations/terms -- confirm against ddot.
    ont.propagate(direction='reverse', inplace=True)
    ont = ont.collapse_ontology(method='python')

    # Collect first, delete afterwards, so term_2_gene is never mutated
    # while it is being iterated.
    small_terms = [
        term for term, genes in ont.term_2_gene.items()
        if len(genes) < min_genes
    ]
    ont.delete(to_delete=small_terms, inplace=True)

    ont.to_table(ont_file, clixo_format=True)

    return ont

In [4]:
def convert_to_clixo_format(hierarchy_edges_df, hierarchy_nodes_df, gene_list, ont_file):
    """Build an ontology from hierarchy tables and write it in CliXO format.

    Parameters
    ----------
    hierarchy_edges_df : pandas.DataFrame
        Edge table; the ``S`` and ``T`` columns are emitted as
        Source/Target rows with mapping ``'default'``.
    hierarchy_nodes_df : pandas.DataFrame
        Node table with a ``name`` column and a whitespace-separated
        ``Genes`` column. Rows whose ``Genes`` value is missing are skipped.
    gene_list : iterable of str
        Genes to keep; any gene not in this collection is dropped.
    ont_file : str
        Output path handed to ``filter_ontology``.

    Returns
    -------
    ddot.Ontology
        The filtered ontology returned by ``filter_ontology``.
    """
    # Set membership is O(1); the incoming list would be rescanned per gene.
    allowed_genes = set(gene_list)

    # Accumulate plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic.
    records = [
        {'Source': source, 'Target': target, 'Mapping': 'default'}
        for source, target in zip(hierarchy_edges_df['S'], hierarchy_edges_df['T'])
    ]
    for term, genes in zip(hierarchy_nodes_df['name'], hierarchy_nodes_df['Genes']):
        if pd.isna(genes):
            continue
        records.extend(
            {'Source': term, 'Target': gene, 'Mapping': 'gene'}
            for gene in genes.split()
            if gene in allowed_genes
        )

    # Explicit columns fix the column order and keep an empty input valid.
    table = pd.DataFrame(records, columns=['Source', 'Target', 'Mapping'])

    # Round-trip through a unique temporary file and always clean it up,
    # even when parsing fails.
    fd, temp_file = tempfile.mkstemp(suffix='.txt')
    os.close(fd)
    try:
        table.to_csv(temp_file, sep='\t', header=False, index=False)
        ont = Ontology.from_table(temp_file, clixo_format=True)
    finally:
        os.remove(temp_file)

    return filter_ontology(ont, ont_file)

In [12]:
n_type = 'nci'

# Gene vocabulary: the 'G' column of the tab-separated, header-less gene2ind file.
gene2ind_df = pd.read_csv(
    '../data/gene2ind_' + n_type + '.txt',
    sep='\t',
    header=None,
    names=['I', 'G'],
)
gene_list = list(gene2ind_df['G'])

# Node and edge tables of the hierarchy network.
h_nodes_df, h_edges_df = load_hierarchy_network()

In [13]:
ont_file = '../data/ontology_' + n_type + '.txt'

# Build the filtered ontology and write it to ont_file.
ont = convert_to_clixo_format(
    hierarchy_edges_df=h_edges_df,
    hierarchy_nodes_df=h_nodes_df,
    gene_list=gene_list,
    ont_file=ont_file,
)

# Ontology summary on the first line(s), root terms on the last.
print(ont, ont.get_roots(), sep='\n')

Unifying 2 roots into one super-root
718 genes, 164 terms, 1482 gene-term relations, 200 term-term relations
node_attributes: []
edge_attributes: []
['NEST_ROOT']


In [11]:
# Randomize the genes
#
# shuffle_genes() is called without a seed, so pin the global RNG state first
# to make the randomized ontology reproducible across kernel restarts.
# NOTE(review): which generator ddot draws from is not visible here; both the
# stdlib and NumPy global generators are seeded -- confirm against ddot.
SHUFFLE_SEED = 0
random.seed(SHUFFLE_SEED)
np.random.seed(SHUFFLE_SEED)

# Reload the ontology written by the previous step.
ont = Ontology.from_table(ont_file, clixo_format=True)
ont2_file = '../data/ontology_' + n_type + '_bb_e.txt'

ont2 = ont.shuffle_genes()

# Re-filter the shuffled ontology and write it to ont2_file.
ont2 = filter_ontology(ont2, ont2_file)

print(ont2)
print(ont2.get_roots())

718 genes, 164 terms, 1405 gene-term relations, 200 term-term relations
node_attributes: []
edge_attributes: []
['NEST_ROOT']
