In [1]:
# this is the pipeline of phenotype analysis
import sys
sys.path.append('..')
import os
import pandas as pd
import GCN
import copy
import numpy as np
import tree_util
from pyseat.SEAT import SEAT

In [2]:
# Seed NumPy's global RNG so any NumPy-based randomness further down is repeatable.
np.random.seed(0)

# Every file written by this notebook goes under this directory.
outer = '../result/GCN_fix_tree'
# exist_ok=True creates the directory when it is missing and is a no-op when it
# already exists, without a separate (racy) existence check.
os.makedirs(outer, exist_ok=True)

ori_GCN = GCN.input_GCN('../data/gcn2008.tsv', transfer=True)
# Tab-separated table; first row is the header, first column is the index.
# NOTE(review): presumably a pairwise species distance matrix -- confirm.
d_df = pd.read_csv('../data/sp_d.tsv', sep='\t', header=0, index_col=0)
# Element-wise complement (1 - value) of d_df; this is the input to the tree building below.
fr_df = pd.DataFrame(1 - d_df)


In [3]:
# Build the tree from fr_df and keep both its newick text and the ids of the
# nodes SEAT reports as optimal subpopulations.
seat = tree_util.make_tree(fr_df)
newick_tree = seat.newick
subroot_nodes = seat.se_tree.optimal_subpopulation_node_ids
json_tree = tree_util.parse(newick_tree)

# child name -> parent name, as filled in by tree_util.parents
parent_dict = {}
tree_util.parents(json_tree, parent_dict)
# Same mapping keyed by integer id: the first character of every node name is
# dropped before converting. Entries whose parent is '' are left out.
parent_dict_int = {
    int(child_name[1:]): int(parent_name[1:])
    for child_name, parent_name in parent_dict.items()
    if parent_name != ''
}

# Compact tree: 'root' -> subpopulation nodes -> direct parents of leaves -> leaves.
# A leaf whose direct parent is itself a subpopulation node hangs one level higher.
conpact_children_dict = {'root': subroot_nodes}
for leaf_id in seat.leaves_list:
    conpact_children_dict[leaf_id] = []
    direct_parent = parent_dict_int[leaf_id]
    conpact_children_dict.setdefault(direct_parent, []).append(leaf_id)
    if direct_parent not in subroot_nodes:
        # find subpopulation: climb until an ancestor is one of the subpopulation nodes
        ancestor = direct_parent
        while ancestor not in subroot_nodes:
            ancestor = parent_dict_int[ancestor]
        subpopulation_children = conpact_children_dict.setdefault(ancestor, [])
        if direct_parent not in subpopulation_children:
            subpopulation_children.append(direct_parent)

compact_json = tree_util.call_tree(conpact_children_dict, 'root')

# rename leaves as species name
name_dict, reverse_dict = tree_util.name_reflection(d_df)
tree_util.rename_node(compact_json, reverse_dict)
# Every '_' in the newick text (node names included) is written as '-'.
newick_tree = tree_util.call_newick(compact_json).replace('_', '-')

compact_tree_path = os.path.join(outer, 'GCN_tree.newick')
with open(compact_tree_path, 'w') as out_file:
    out_file.write(newick_tree)

In [4]:
# Re-parse the newick text currently held in newick_tree.
json_tree = tree_util.parse(newick_tree)
# recu_compute receives this dict; its 'largest' entry is read back below as the layer count.
largest = {'largest': 0}
leaf_list, l = tree_util.recu_compute(json_tree, 0, largest)
nlayer = largest_level = largest['largest']
layer_leaves_dict = tree_util.make_layer_dict(nlayer)
tree_util.recu_layer(json_tree, layer_leaves_dict)
tree_util.to_layer_leaves(layer_leaves_dict, nlayer)

# child name -> parent name for the re-parsed tree
parent_dict = {}
tree_util.parents(json_tree, parent_dict)

# node -> list of leaves; when a node shows up in several layers the first layer wins.
node_leaves = {}
for layer_nodes in layer_leaves_dict.values():
    for node, sp_list in layer_nodes.items():
        if node not in node_leaves:
            node_leaves[node] = copy.deepcopy(sp_list)

# parent name -> list of direct children, inverted from parent_dict
direct_children_dict = {}
for node, parent in parent_dict.items():
    direct_children_dict.setdefault(parent, []).append(node)

# Number of leaves under every node, largest first.
leaves_num = pd.DataFrame(index=list(node_leaves), columns=['num'])
for node, members in node_leaves.items():
    leaves_num.loc[node, 'num'] = len(members)
leaves_num = leaves_num.sort_values(by='num', ascending=False)

In [5]:
# Alias templates:
#   'cluster_C{}'      - cluster that is a direct child of the root
#   'cluster_S{}-C{}'  - cluster inside a supercluster (supercluster number, cluster number)
#   'supercluster_S{}' - supercluster
c_tmp_short = 'cluster_C{}'
c_tmp = 'cluster_S{}-C{}'
sc_tmp = 'supercluster_S{}'
# Literal text that precedes the number in a supercluster alias ('supercluster_S').
# Its length is used to cut the number back out of an alias instead of a hard-coded offset.
sc_prefix = sc_tmp.format('')

# sort and rename supercluster and cluster
rename_df = pd.DataFrame(columns=['class', 'leaves n', 'parent_name', 'alias', 'parent_alias'])  # name is the index
rename_df.loc['nroot', ] = ['root', len(reverse_dict), 'NA', 'root', 'NA']
internal_rename_dict = {'nroot': 'root'}
supclusters = []
leaf_set = set(leaf_list)

# Pass 1: direct children of the root, largest first. A child whose own children
# are all leaves is a cluster; any other child is a supercluster. Both kinds share
# one counter, so the number is the child's size rank among all root children.
children = direct_children_dict['nroot']
selected_num = leaves_num.loc[children, ].sort_values(by='num', ascending=False)
for i, c in enumerate(selected_num.index, start=1):
    cchildren = direct_children_dict[c]
    parent = parent_dict[c]
    if set(cchildren).issubset(leaf_set):
        node_class = 'cluster'
        alias = c_tmp_short.format(i)
    else:
        node_class = 'supercluster'
        supclusters.append(c)
        alias = sc_tmp.format(i)
    internal_rename_dict[c] = alias
    rename_df.loc[c, ] = [node_class, leaves_num.loc[c, 'num'], parent, alias, internal_rename_dict[parent]]

# Pass 2: inside every supercluster, number its child nodes from 1, largest first.
for node in supclusters:
    children = direct_children_dict[node]
    selected_num = leaves_num.loc[children, ].sort_values(by='num', ascending=False)
    i = 0
    for c in selected_num.index:
        # Only names starting with 'n' are renamed; other children are skipped and
        # do not consume a number. NOTE(review): presumably those are species
        # leaves attached directly to the supercluster -- confirm.
        if not c.startswith('n'):
            continue
        i += 1

        parent = parent_dict[c]
        parent_alias = internal_rename_dict[parent]
        # Supercluster number = whatever follows the 'supercluster_S' prefix.
        digit = parent_alias[len(sc_prefix):]
        internal_rename_dict[c] = c_tmp.format(digit, i)
        rename_df.loc[c, ] = ['cluster', leaves_num.loc[c, 'num'], parent, internal_rename_dict[c], parent_alias]

rename_df = rename_df.sort_values(by='leaves n', ascending=False)
rename_df.to_csv(os.path.join(outer, 'node_rename.tsv'), sep='\t')

In [6]:
# sort rule
def sort_rule(x):
    """Sort key for a tree node dict, derived from ``x['name']``.

    Names containing ``'s--'`` or ``'s__'`` are ordered alphabetically by the
    full name. Every other name is ordered by the integer that follows its last
    ``'C'`` or, when the name has no ``'C'``, its last ``'S'``
    (e.g. ``'cluster_S1-C12'`` -> 12, ``'supercluster_S3'`` -> 3).

    The key is always a ``(group, number, name)`` tuple, so the two kinds of
    node can be siblings without ``sorted`` comparing an int with a str:
    numbered nodes (group 0) come first, then the alphabetical ones (group 1).
    Within one kind the order is the same as sorting by the bare number or name.

    Raises:
        ValueError: if a numbered name does not end in an integer.
    """
    name = x['name']
    if ('s--' in name) or ('s__' in name):
        return (1, 0, name)

    marker = 'C' if 'C' in name else 'S'
    return (0, int(name.split(marker)[-1]), '')

def sort_children(node, sort_rule):
    """Recursively reorder ``node['children']`` in place using ``sort_rule`` as the sort key.

    A node with no children is left untouched. Returns None.
    """
    kids = node['children']
    if len(kids) == 0:
        return
    node['children'] = sorted(kids, key=sort_rule)
    for child in node['children']:
        sort_children(child, sort_rule)


In [7]:
# rename the cluster and supercluster
# Work on a deep copy so json_tree keeps its original node names.
renamed_json_tree = copy.deepcopy(json_tree)
tree_util.rename_node(renamed_json_tree, internal_rename_dict)
# Order the children of every node with sort_rule as the key.
tree_util.sort_children(renamed_json_tree, sort_rule)
newick = tree_util.call_newick(renamed_json_tree)

renamed_tree_path = os.path.join(outer, 'renamed_GCN_tree.newick')
with open(renamed_tree_path, 'w') as out_file:
    out_file.write(newick)

In [8]:
# Newick text produced by call_internal_tree from the renamed tree and the set of leaf names.
leaf_names = set(leaf_list)
internal_newick = tree_util.call_internal_tree(renamed_json_tree, leaf_names)

internal_tree_path = os.path.join(outer, 'internal_tree.newick')
with open(internal_tree_path, 'w') as out_file:
    out_file.write(internal_newick)

In [9]:
# Display the node -> list-of-leaf-names mapping collected earlier (the whole dict is printed).
node_leaves

{'n3702': ['s--Haemophilus-aegyptius',
  's--Haemophilus-paraphrohaemolyticus',
  's--Haemophilus-sputorum',
  's--Haemophilus-parahaemolyticus',
  's--Haemophilus-influenzae',
  's--Aggregatibacter-sp-oral-taxon-458',
  's--Haemophilus-haemolyticus',
  's--Aggregatibacter-segnis',
  's--Haemophilus-sp-C1',
  's--Haemophilus-quentini',
  's--Haemophilus-sp-HMSC71H05',
  's--Aggregatibacter-actinomycetemcomitans',
  's--Haemophilus-parainfluenzae',
  's--Haemophilus-pittmaniae',
  's--Pasteurella-dagmatis',
  's--Aggregatibacter-aphrophilus',
  's--Pasteurella-bettyae',
  's--Rodentibacter-heylii',
  's--Haemophilus-haemoglobinophilus',
  's--Pasteurella-multocida',
  's--Avibacterium-paragallinarum',
  's--Pasteurella-canis',
  's--Gilliamella-apicola',
  's--Frischella-perrara'],
 'n3866': ['s--Capnocytophaga-canimorsus',
  's--Capnocytophaga-canis',
  's--Capnocytophaga-cynodegmi',
  's--Capnocytophaga-sp-oral-taxon-332',
  's--Capnocytophaga-sputigena',
  's--Capnocytophaga-sp-oral-

In [10]:
# One row per leaf. Row labels are the leaf names with '-' turned back into '_';
# every cell starts out as the string 'NA'.
df = pd.DataFrame(columns=['species', 'cluster', 'supercluster'], index=[x.replace('-', '_') for x in leaf_list])
df = df.fillna('NA')

for node_name, leaves in node_leaves.items():
    alias = rename_df.loc[node_name, 'alias']
    # Text after the last '_' of the alias, e.g. 'C3', 'S2' or 'S2-C1'.
    cluster_id = alias.split('_')[-1]
    contents = cluster_id.split('-')
    first_tag = contents[0][0]
    last_tag = contents[-1][0]
    for leaf in leaves:
        row = leaf.replace('-', '_')
        if last_tag == 'C':
            # cluster alias: store the whole id, '-' written as '_'
            df.loc[row, 'cluster'] = cluster_id.replace('-', '_')
        if last_tag == 'S':
            # supercluster alias
            df.loc[row, 'supercluster'] = contents[-1].replace('-', '_')
        if first_tag == 'S':
            current = df.loc[row, 'supercluster']
            # Report a leaf that already carries a different supercluster before overwriting it.
            if (current != 'NA') and (current != contents[0]):
                print(row, current)
            df.loc[row, 'supercluster'] = contents[0].replace('-', '_')

df['species'] = df.index

In [11]:
# Write the leaf table ordered by supercluster, then cluster; the index is not written.
leaves_cluster_path = os.path.join(outer, 'leaves_cluster.tsv')
leaves_cluster_sorted = df.sort_values(by=['supercluster', 'cluster'])
leaves_cluster_sorted.to_csv(leaves_cluster_path, sep='\t', index=False)