In [1]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ddot import Ontology
import networkx as nx
from sklearn.preprocessing import StandardScaler

In [2]:
def load_interaction_network(n_name):
    """Load the node names and edge list of an interaction network.

    Files are read from ``../data/<n_name>/<n_name>_node.csv`` and
    ``../data/<n_name>/<n_name>_edge.sif`` (relative to the working directory).

    Parameters
    ----------
    n_name : str
        Network name; used both as the directory and as the file prefix.

    Returns
    -------
    nodes : pandas.Series
        The ``name`` column of the node CSV.
    edges_df : pandas.DataFrame
        Tab-separated SIF edges with columns ``S`` and ``T``; the middle
        column (``M``) is dropped.
    """
    network_name = '../data/' + n_name + '/' + n_name

    nodes = pd.read_csv(network_name + '_node.csv')['name']

    edges_df = pd.read_csv(
        network_name + '_edge.sif', sep='\t', header=None, names=['S', 'M', 'T']
    )
    # Use the `columns` keyword: passing the axis positionally (`drop('M', 1)`)
    # is no longer accepted by recent pandas releases.
    edges_df = edges_df.drop(columns='M')

    return nodes, edges_df

In [3]:
def load_hierarchy_network(n_name, n_type):
    """Read the node table and edge list of a hierarchy network.

    Files are read from ``../data/<n_name>/<n_name>_<n_type>_node.csv`` and
    ``../data/<n_name>/<n_name>_<n_type>_edge.sif``.

    Returns
    -------
    nodes_df : pandas.DataFrame
        Columns ``name``, ``CD_CommunityName`` and ``CD_MemberList``, without
        the rows whose ``name`` is ``'ROOT'``.
    edges_df : pandas.DataFrame
        Tab-separated SIF edges with columns ``S``, ``M`` and ``T``.
    """
    file_prefix = '../data/' + n_name + '/' + n_name + '_' + n_type

    kept_columns = ['name', 'CD_CommunityName', 'CD_MemberList']
    all_nodes = pd.read_csv(file_prefix + '_node.csv')[kept_columns]

    # Remove the ROOT row(s) by index label; the remaining labels are unchanged.
    root_labels = all_nodes.index[all_nodes['name'] == 'ROOT']
    nodes_df = all_nodes.drop(root_labels)

    edges_df = pd.read_csv(
        file_prefix + '_edge.sif',
        sep='\t',
        header=None,
        names=['S', 'M', 'T'],
    )

    return nodes_df, edges_df

In [4]:
def convert_to_clixo_format(hierarchy_nodes_df, hierarchy_edges_df, gene_list, n_name, n_type):
    """Write the hierarchy as a three-column table and load it as a ddot Ontology.

    The table has one ``default`` row per hierarchy edge (``S`` -> ``T``) and
    one ``gene`` row per (subsystem, member gene) pair, restricted to genes
    present in ``gene_list``. It is written tab-separated, without header, to
    ``../temp/<n_name>_<n_type>.txt`` and read back with
    ``Ontology.from_table(..., clixo_format=True)``. The ontology is then
    passed through ``propagate(direction='reverse')`` and
    ``collapse_ontology(method='python')`` and saved to
    ``../data/<n_name>/<n_name>_<n_type>_ont.txt``.

    Parameters
    ----------
    hierarchy_nodes_df : pandas.DataFrame
        Needs columns ``name`` and ``CD_MemberList`` (whitespace-separated
        gene names).
    hierarchy_edges_df : pandas.DataFrame
        Needs columns ``S`` and ``T``.
    gene_list : iterable of str
        Genes allowed to appear in the ontology.
    n_name, n_type : str
        Used to build the temp and output file names.

    Returns
    -------
    The Ontology object returned by ``collapse_ontology``.
    """
    temp_file = '../temp/' + n_name + '_' + n_type + '.txt'
    ont_file = '../data/' + n_name + '/' + n_name + '_' + n_type + '_ont.txt'

    # Set membership keeps the per-gene filter O(1).
    allowed_genes = set(gene_list)

    # Collect all rows first and build the frame once: growing a DataFrame
    # with `.append` in a loop is quadratic and `.append` no longer exists in
    # recent pandas releases.
    rows = [
        (source, target, 'default')
        for source, target in zip(hierarchy_edges_df['S'], hierarchy_edges_df['T'])
    ]
    for system_name, members in zip(hierarchy_nodes_df['name'], hierarchy_nodes_df['CD_MemberList']):
        for gene in members.split():
            if gene in allowed_genes:
                rows.append((system_name, gene, 'gene'))

    # Explicit columns keep the frame well-formed even when there are no rows.
    table = pd.DataFrame(rows, columns=['Source', 'Target', 'Mapping'])
    table.to_csv(temp_file, sep='\t', header=False, index=False)

    ont = Ontology.from_table(temp_file, clixo_format=True)
    ont.propagate(direction='reverse', inplace=True)
    ont = ont.collapse_ontology(method='python')
    ont.to_table(ont_file, clixo_format=True)

    return ont

In [5]:
def create_gene_interaction_feature(nodes, interaction_edges_df):
    """Compute, per gene, the fraction of edges it starts relative to the node count.

    For every gene in ``nodes`` the number of edges whose ``S`` column equals
    that gene is divided by ``len(nodes)``. Only the source column is counted.

    Parameters
    ----------
    nodes : iterable of str
        Gene names of the interaction network.
    interaction_edges_df : pandas.DataFrame
        Edge list with a column ``S`` holding the source gene of each edge.

    Returns
    -------
    dict
        Maps gene name to ``count_as_source / len(nodes)``.

    Raises
    ------
    KeyError
        If an edge source is not listed in ``nodes``.
    """
    gene_list = list(nodes)
    known_genes = set(gene_list)

    # Count sources in one vectorized pass instead of iterating rows.
    if len(interaction_edges_df) == 0:
        source_counts = {}
    else:
        source_counts = interaction_edges_df['S'].value_counts(dropna=False).to_dict()

    unknown_sources = [src for src in source_counts if src not in known_genes]
    if unknown_sources:
        raise KeyError(
            'edge sources missing from nodes: ' + repr(unknown_sources[:5])
        )

    # Normalize exactly once per gene; dividing in a loop over `gene_list`
    # would divide a repeated gene name several times.
    total = float(len(gene_list))
    return {gene: source_counts.get(gene, 0) / total for gene in gene_list}

In [6]:
def create_gene_mutation_feature(gene_list, cell_mutation_matrix):
    """Return, for each gene, the mean of its row in the transposed mutation matrix.

    The gene at position ``i`` of ``gene_list`` is paired with row ``i`` of
    ``cell_mutation_matrix.transpose()``; the value is that row's sum divided
    by its length.
    """
    per_gene_rows = cell_mutation_matrix.transpose()
    return {
        gene: np.sum(per_gene_rows[idx]) / float(len(per_gene_rows[idx]))
        for idx, gene in enumerate(gene_list)
    }

In [7]:
def get_subsystem_count(gene, hierarchy_nodes_df):
    """Count the subsystems whose member list contains ``gene``.

    ``CD_MemberList`` is a whitespace-separated string of gene names, so the
    gene is compared against whole tokens. A plain ``gene in string`` test
    would also match substrings (e.g. ``BRCA1`` inside ``BRCA10``).

    Parameters
    ----------
    gene : str
        Gene name to look for.
    hierarchy_nodes_df : pandas.DataFrame
        Needs a ``CD_MemberList`` column.

    Returns
    -------
    int
        Number of rows listing ``gene`` as a member; rows whose member list
        is not a string (e.g. missing values) are ignored.
    """
    return sum(
        1
        for members in hierarchy_nodes_df['CD_MemberList']
        if isinstance(members, str) and gene in members.split()
    )

In [8]:
def create_gene_subsystem_feature(hierarchy_nodes_df, hierarchy_edges_df, gene_list):
    """Compute per-gene subsystem membership fractions and the hierarchy's leaf nodes.

    Parameters
    ----------
    hierarchy_nodes_df : pandas.DataFrame
        Needs a ``CD_MemberList`` column (whitespace-separated gene names).
    hierarchy_edges_df : pandas.DataFrame
        Needs columns ``S`` and ``T``; each row becomes a directed edge
        ``S -> T``.
    gene_list : iterable of str
        Genes to compute the feature for.

    Returns
    -------
    g_s : dict
        Maps each gene to (number of subsystems listing it) divided by the
        number of rows in ``hierarchy_nodes_df``. Membership is decided on
        whole gene tokens, not substrings. All values are 0.0 when the node
        table is empty.
    leaf_nodes : list
        Nodes of the directed graph with no outgoing edge.
    """
    dG = nx.DiGraph()
    dG.add_edges_from(zip(hierarchy_edges_df['S'], hierarchy_edges_df['T']))
    leaf_nodes = [n for n in dG.nodes() if dG.out_degree(n) == 0]

    # One pass over the hierarchy builds the count for every gene, instead of
    # rescanning all subsystems once per gene.
    membership_counts = {}
    for members in hierarchy_nodes_df['CD_MemberList']:
        if not isinstance(members, str):
            continue
        # set(): a gene repeated inside one member list counts once.
        for member in set(members.split()):
            membership_counts[member] = membership_counts.get(member, 0) + 1

    n_subsystems = float(len(hierarchy_nodes_df))
    g_s = {
        gene: (membership_counts.get(gene, 0) / n_subsystems) if n_subsystems else 0.0
        for gene in gene_list
    }

    return g_s, leaf_nodes

In [9]:
def create_feature_matrix(gene_list, g_m, g_i, g_s):
    """Assemble the per-gene feature dictionaries into one DataFrame.

    Parameters
    ----------
    gene_list : iterable of str
        Genes to include, one row each, in this order.
    g_m, g_i, g_s : dict
        Per-gene values stored in columns ``M``, ``I`` and ``S`` respectively.

    Returns
    -------
    pandas.DataFrame
        Columns ``G`` (gene name), ``M``, ``I``, ``S``.

    Raises
    ------
    KeyError
        If a gene in ``gene_list`` is missing from any of the dictionaries.
    """
    # Build all records first and construct the frame once; appending row by
    # row is quadratic and `DataFrame.append` is gone from recent pandas.
    records = [
        {'G': g, 'M': g_m[g], 'I': g_i[g], 'S': g_s[g]}
        for g in gene_list
    ]
    return pd.DataFrame(records, columns=['G', 'M', 'I', 'S'])

In [15]:
def score_subsystems(hierarchy_nodes_df, gene_list, feature_df):
    """Score each subsystem as an S-weighted mean of ``sqrt(M * I)`` over its genes.

    For a subsystem with member genes ``g`` that are present in ``gene_list``::

        score = sum(S_g * sqrt(M_g * I_g)) / sum(S_g)

    The score is ``0`` when the numerator is zero (including when no member
    gene is in ``gene_list``).

    Parameters
    ----------
    hierarchy_nodes_df : pandas.DataFrame
        Needs columns ``name`` and ``CD_MemberList`` (whitespace-separated
        gene names).
    gene_list : iterable of str
        Genes that contribute to the score; other members are skipped.
    feature_df : pandas.DataFrame
        Needs columns ``G``, ``M``, ``I`` and ``S``.

    Returns
    -------
    dict
        Maps subsystem name to its score.

    Raises
    ------
    KeyError
        If a member gene is in ``gene_list`` but has no row in ``feature_df``.
    """
    scored_genes = set(gene_list)

    # Index the features once. Filtering the DataFrame three times per gene
    # is a full scan each time and relies on float(Series), which is
    # deprecated in recent pandas.
    if scored_genes:
        features = {
            g: (float(m), float(i), float(s))
            for g, m, i, s in zip(
                feature_df['G'], feature_df['M'], feature_df['I'], feature_df['S']
            )
        }
    else:
        features = {}

    s_score = {}
    for system_name, members in zip(hierarchy_nodes_df['name'], hierarchy_nodes_df['CD_MemberList']):
        num = 0
        den = 0
        for g in members.split():
            if g not in scored_genes:
                continue
            gm, gi, gs = features[g]
            num += gs * math.sqrt(gm * gi)
            den += gs
        # num != 0 implies at least one positive S weight, so den is non-zero.
        s_score[system_name] = 0 if num == 0 else num / den

    return s_score

In [131]:
# Dataset configuration: Reactome, medium hierarchy.
# NOTE(review): the following cell assigns the same variables for PCNet, so
# running both cells in order leaves only the PCNet data in scope.
n_name = 'Reactome'
n_type = 'medium'

# Derive the file locations from n_name so the paths cannot drift from it.
data_dir = '../data/' + n_name + '/'
gene_list = list(pd.read_csv(data_dir + 'gene2ind.txt', sep='\t', header=None, names=['I', 'G'])['G'])
cell_mutation = np.loadtxt(data_dir + 'cell2mutation.txt', delimiter=',')

i_nodes, i_edges_df = load_interaction_network(n_name)
h_nodes_df, h_edges_df = load_hierarchy_network(n_name, n_type)

In [33]:
# Dataset configuration: PCNet, large hierarchy.
n_name = 'PCNet'
n_type = 'large'

# Derive the file locations from n_name so the paths cannot drift from it.
data_dir = '../data/' + n_name + '/'
gene_list = list(pd.read_csv(data_dir + 'gene2ind.txt', sep='\t', header=None, names=['I', 'G'])['G'])
cell_mutation = np.loadtxt(data_dir + 'cell2mutation.txt', delimiter=',')

i_nodes, i_edges_df = load_interaction_network(n_name)
h_nodes_df, h_edges_df = load_hierarchy_network(n_name, n_type)

In [34]:
# Per-gene features, each a dict keyed by gene name:
#   g_m - from the cell mutation matrix,
#   g_i - from the interaction network edges,
#   g_s - from subsystem membership in the hierarchy.
# leaf_subsystems holds the hierarchy nodes that have no outgoing edge.
g_m = create_gene_mutation_feature(gene_list, cell_mutation)
g_i = create_gene_interaction_feature(i_nodes, i_edges_df)
g_s, leaf_subsystems = create_gene_subsystem_feature(h_nodes_df, h_edges_df, gene_list)

In [35]:
# Combine the three per-gene dictionaries into one table (one row per gene).
feature_df = create_feature_matrix(gene_list, g_m, g_i, g_s)

In [36]:
# Score every subsystem in the hierarchy from the features of its member genes.
s_score = score_subsystems(h_nodes_df, gene_list, feature_df)

In [37]:
# Pair every leaf subsystem with its score, ordered by ascending score
# (the sort is stable, so ties keep the order of leaf_subsystems).
leaf__subsystem_score = dict(
    sorted(
        ((leaf, s_score[leaf]) for leaf in leaf_subsystems),
        key=lambda pair: pair[1],
    )
)

In [38]:
# Show the leaf scores in ascending order of score.
# NOTE(review): this renders the entire dict; consider previewing a slice instead.
leaf__subsystem_score

{'C290312': 0,
 'C290161': 0,
 'C290249': 0,
 'C290316': 0,
 'C290379': 0,
 'C290108': 0,
 'C290094': 0,
 'C290210': 0,
 'C290096': 0,
 'C290023': 0,
 'C290370': 0,
 'C290213': 0,
 'C290025': 0,
 'C290194': 0,
 'C290362': 0,
 'C290254': 0,
 'C290185': 0,
 'C290321': 0,
 'C290092': 0,
 'C290288': 0,
 'C290085': 0,
 'C290276': 0,
 'C290037': 0,
 'C290219': 0,
 'C290089': 0,
 'C289948': 0,
 'C290029': 0,
 'C290013': 0,
 'C290068': 0,
 'C290183': 0,
 'C289899': 0,
 'C290124': 0,
 'C290327': 0,
 'C289990': 0,
 'C289964': 0,
 'C289921': 0,
 'C289956': 0,
 'C289909': 0,
 'C289862': 0,
 'C290168': 0,
 'C289900': 0,
 'C289888': 0,
 'C289918': 0,
 'C290059': 0,
 'C290058': 0,
 'C290040': 0,
 'C290123': 0,
 'C289978': 0,
 'C289867': 0,
 'C289987': 0,
 'C289816': 0,
 'C289831': 0,
 'C289880': 0,
 'C289793': 0,
 'C290283': 0,
 'C289801': 0,
 'C289690': 0,
 'C289863': 0,
 'C289806': 0,
 'C289830': 0,
 'C289955': 0,
 'C289952': 0,
 'C290107': 0,
 'C289803': 0,
 'C289868': 0,
 'C289748': 0,
 'C289725'

In [39]:
# Leaf subsystems scoring below this threshold are candidates for pruning.
LOW_SCORE_THRESHOLD = 0.0001
# Candidates with more than this many member genes are kept in the hierarchy.
MAX_PRUNED_GENE_COUNT = 10

low_score_candidates = [
    system
    for system, score in leaf__subsystem_score.items()
    if score < LOW_SCORE_THRESHOLD
]
low_score_df = h_nodes_df[h_nodes_df['name'].isin(low_score_candidates)]

# Collect the oversized candidates in a set and filter once, rather than
# calling list.remove() (a linear scan) for each of them.
oversized_systems = {
    system
    for system, members in zip(low_score_df['name'], low_score_df['CD_MemberList'])
    if len(members.split()) > MAX_PRUNED_GENE_COUNT
}
low_score_systems = [
    system for system in low_score_candidates if system not in oversized_systems
]

low_score_systems

['C290312',
 'C290161',
 'C290249',
 'C290316',
 'C290379',
 'C290108',
 'C290094',
 'C290210',
 'C290096',
 'C290023',
 'C290370',
 'C290213',
 'C290025',
 'C290194',
 'C290362',
 'C290254',
 'C290185',
 'C290321',
 'C290092',
 'C290288',
 'C290085',
 'C290276',
 'C290037',
 'C290219',
 'C290089',
 'C289948',
 'C290029',
 'C290013',
 'C290068',
 'C290183',
 'C289899',
 'C290124',
 'C290327',
 'C289990',
 'C289964',
 'C289921',
 'C289956',
 'C289909',
 'C289862',
 'C290168',
 'C289900',
 'C289888',
 'C289918',
 'C290059',
 'C290058',
 'C290040',
 'C290123',
 'C289978',
 'C289867',
 'C289987',
 'C289816',
 'C289831',
 'C289880',
 'C290283',
 'C289863',
 'C289830',
 'C289955',
 'C289952',
 'C290107',
 'C289868',
 'C290298',
 'C290221',
 'C290259',
 'C290331',
 'C290229',
 'C289837',
 'C290320',
 'C290341',
 'C290199',
 'C290082',
 'C290066',
 'C289962',
 'C290290',
 'C290114',
 'C290062',
 'C290291',
 'C290310',
 'C290190',
 'C290084',
 'C289897',
 'C290182',
 'C290364',
 'C290227',
 'C2

In [42]:
# Remove the pruned systems from the node table, and drop every edge whose
# target is one of them.
is_pruned_node = h_nodes_df['name'].isin(low_score_systems)
h_nodes_df = h_nodes_df[~is_pruned_node]

targets_pruned_node = h_edges_df['T'].isin(low_score_systems)
h_edges_df = h_edges_df[~targets_pruned_node]

In [43]:
# Build the ontology from the pruned hierarchy; this also writes the temp
# table and the *_ont.txt output file.
ont = convert_to_clixo_format(h_nodes_df, h_edges_df, gene_list, n_name, n_type)

In [41]:
# Number of leaf subsystems that received a score.
len(leaf__subsystem_score)

758