In [4]:
import numpy as np
from ete3 import Tree, TreeStyle
from bionlp.util import io, plot, func, bintree

# Inputs for the selected label id: the stored tree object plus the sample
# feature / label frames (all paths are keyed by ``lbid``).
lbid = 0
t = io.read_obj(
    '../data/gesgnext/cnsrnn0.5_hrc_tree_%i.pkl' % lbid
)
gsm_X = io.read_df(
    '../data/gesgnext/gsm_X_%i.npz' % lbid,
    sparse_fmt='csr',
    with_idx=True,
)
gsm_y = io.read_df(
    '../data/gesgnext/gsm_y_%i.npz' % lbid,
    with_idx=True,
)

# Layout design
def layout(node):
    """Layout callback that styles ``node`` in place via ``node.img_style``.

    Nodes whose ``data`` mapping has a ``'clt'`` entry are drawn with line
    type 0 on the vertical branch; leaves get size 10 / width 5 / blue
    branches, internal nodes size 15 / width 3 / cyan branches.  The marker
    colour compares the column sums of ``gsm_y['ctrl']`` and ``gsm_y['pert']``
    over the rows selected positionally by ``node.data['clt']``: green when
    ``ctrl`` is larger, red when ``pert`` is larger, grey on a tie.

    Nodes without a ``'clt'`` entry are drawn as size-8 grey squares with
    line type 1 and light-grey branches of width 2.

    Args:
        node: object exposing ``data`` (mapping), ``img_style`` (mapping) and
            ``is_leaf()``.

    Note:
        Reads the module-level ``gsm_y`` frame.
    """
    style = node.img_style
    # ``in`` works on both Python 2 and 3; ``dict.has_key`` was removed in 3.
    if 'clt' in node.data:
        if node.is_leaf():
            size, width, colour = 10, 5, '#0000FF'
        else:
            size, width, colour = 15, 3, '#00FFFF'
        style['size'] = size
        style['vt_line_type'] = 0
        style['vt_line_width'] = width
        style['hz_line_width'] = width
        style['vt_line_color'] = colour
        style['hz_line_color'] = colour
        # Build the positional index once and reuse it for both columns.
        members = np.array(node.data['clt'])
        ctrl_num = gsm_y['ctrl'].iloc[members].sum()
        pert_num = gsm_y['pert'].iloc[members].sum()
        if ctrl_num > pert_num:
            style['fgcolor'] = 'green'
        elif pert_num > ctrl_num:
            style['fgcolor'] = 'red'
        else:
            style['fgcolor'] = 'grey'
    else:
        style['size'] = 8
        style['shape'] = 'square'
        style['fgcolor'] = 'grey'
        style['vt_line_type'] = 1
        style['hz_line_type'] = 1
        style['vt_line_width'] = 2
        style['hz_line_width'] = 2
        style['vt_line_color'] = '#C6C6C6'
        style['hz_line_color'] = '#C6C6C6'


# Name each node returned by the pre-order listing after the index labels of
# the gsm_X rows referenced by its 'clt' entry, joined with '|'.
vertices = bintree.preorder_getnode(t)
for v in vertices:
    sample_rows = np.array(v.data['clt'])
    sample_ids = gsm_X.index[sample_rows]
    v.name = '|'.join(sample_ids)
# Add level info
def in_lev(node, val):
    """Stamp the current counter ``val['lev']`` on ``node`` and then bump it.

    Args:
        node: object with a ``data`` mapping; receives ``data['level']``.
        val: mutable mapping holding the running counter under ``'lev'``.
    """
    current = val['lev']
    node.data['level'] = current
    val['lev'] = current + 1
def out_lev(node, val):
    """Step the shared counter ``val['lev']`` back down by one; ``node`` is unused."""
    val['lev'] = val['lev'] - 1
# Run both callbacks over the tree with a shared counter dict that starts at 0.
# NOTE(review): presumably preorder_modify calls in_lev when entering a node and
# out_lev when leaving it, so data['level'] ends up as the node depth -- confirm in bintree.
bintree.preorder_modify(t, in_lev, out_lev, val=dict(lev=0))
# Prune the tree
# Keep the nodes whose data['cond'] exceeds 100.
# NOTE(review): stop_found=True presumably stops the search from descending
# below a matching node -- confirm in bintree.preorder_search.
remain_nodes = bintree.preorder_search(t, lambda x: x.data['cond'] > 100, stop_found=True)
levs = [node.data['level'] for node in remain_nodes]
# Bucket the kept nodes by the level recorded on them.
nodes_inlev = {}
for k, v in zip(levs, remain_nodes):
    nodes_inlev.setdefault(k, []).append(v)
# Replace each bucket with a placeholder parent holding the bucket as children.
# Iterate over a snapshot of the items: the dict is reassigned inside the loop,
# and items() (unlike the Python-2-only iteritems()) exists on both 2 and 3.
for k, v in list(nodes_inlev.items()):
    node = bintree.ETENode(data=dict(data={}))
    node.children.extend(v)
    nodes_inlev[k] = node
# (level, placeholder) pairs, reversed after sorting on the level key.
lev_nodes = func.sorted_tuples(list(nodes_inlev.items()), key_idx=0)[::-1]
# Rebuild a single tree by repeatedly pairing neighbouring nodes: each pass
# joins elements (0,1), (2,3), ... under a fresh parent until one root remains.
cur_lev, next_lev = [v for k, v in lev_nodes], []
if not cur_lev:
    # With no surviving nodes the original ``while len(...) != 1`` loop never ended.
    raise ValueError('no nodes left after pruning; cannot rebuild the tree')
while len(cur_lev) > 1:
    for i in range(0, len(cur_lev), 2):
        if i + 1 < len(cur_lev):
            node = bintree.ETENode()
            node.left, node.right = cur_lev[i], cur_lev[i + 1]
            # Errors from post_build() now propagate instead of being
            # swallowed (which used to drop cur_lev[i + 1] silently).
            node.post_build()
            next_lev.append(node)
        else:
            # Odd element out: carry it up to the next pass unchanged.
            next_lev.append(cur_lev[i])
    cur_lev = next_lev
    next_lev = []
t = cur_lev[0]
t.name = 'root'


# Tree style: circular mode over a 0-360 arc, leaf names shown, per-node
# styling delegated to layout(); then render to a PDF named after lbid.
ts = TreeStyle()
ts.mode = "c"
ts.arc_start, ts.arc_span = 0, 360
ts.show_leaf_name = True
ts.layout_fn = layout
# ts.branch_vertical_margin = 10
t.render("circular_tree_%i.pdf" % lbid, tree_style=ts, w=600, dpi=400)

{'faces': [[327.4434585102459,
   93.86936570618487,
   334.6975894387412,
   136.1629473689877,
   2556,
   'GSM39849|GSM39848|GSM39845|GSM39844|GSM39847|GSM39846|GSM39841|GSM39840|GSM39843|GSM39842|GSM39870|GSM39871|GSM39872|GSM39852|GSM39851|GSM39829|GSM39828|GSM39823|GSM39822|GSM39821|GSM39820|GSM39827|GSM39826|GSM39825|GSM39824|GSM39858|GSM39859|GSM39850|GSM39854|GSM39855|GSM39838|GSM39839|GSM39830|GSM39831|GSM39832|GSM39833|GSM39834|GSM39835|GSM39836|GSM39837|GSM39867|GSM39866|GSM39865|GSM39864|GSM39863|GSM39862|GSM39861|GSM39860|GSM39869|GSM39868|GSM39819|GSM39853|GSM39856|GSM39857'],
  [351.3261671019667,
   335.3884771877531,
   374.9155507119988,
   351.73609821384986,
   318,
   'GSM11472|GSM11470|GSM11471|GSM11447|GSM11446|GSM11445|GSM11444|GSM11443|GSM11442|GSM11441|GSM11440|GSM11449|GSM11448|GSM11438|GSM11439|GSM11437|GSM11469|GSM11468|GSM11465|GSM11464|GSM11467|GSM11466|GSM11461|GSM11460|GSM11463|GSM11462|GSM11454|GSM11455|GSM11456|GSM11457|GSM11450|GSM11451|GSM11452|GSM

## Find the samples missed in the clustering

In [None]:
from bionlp.util import io, func

# Report which row positions of gsm_X_0 never appear in the cluster model.
# NOTE(review): assumes clt_mdl_0 flattens to integer row positions of gsm_X_0 -- confirm.
clt_mdl_0 = io.read_obj('../../../workspace/gesgnext/kallima_clusters_0.pkl')
gsm_X_0 = io.read_df('../data/gesgnext/udt200/gsm_X_0.npz', with_idx=True, sparse_fmt='csr')
# Flatten once and reuse the set for both the count and the difference.
clustered_0 = set(func.flatten_list(clt_mdl_0))
# Single-argument print(...) behaves the same on Python 2 and 3.
print('%s %s' % (gsm_X_0.shape[0], len(clustered_0)))
# Sorted so the listing is deterministic rather than in set order.
ndp = sorted(set(range(gsm_X_0.shape[0])) - clustered_0)
print(gsm_X_0.index[ndp])

In [None]:
from bionlp.util import io, func

# Report which row positions of gsm_X_1 never appear in the cluster model.
# NOTE(review): assumes clt_mdl_1 flattens to integer row positions of gsm_X_1 -- confirm.
clt_mdl_1 = io.read_obj('../../../workspace/gesgnext/kallima_clusters_1.pkl')
gsm_X_1 = io.read_df('../data/gesgnext/udt200/gsm_X_1.npz', with_idx=True, sparse_fmt='csr')
# Flatten once and reuse the set for both the count and the difference.
clustered_1 = set(func.flatten_list(clt_mdl_1))
# Single-argument print(...) behaves the same on Python 2 and 3.
print('%s %s' % (gsm_X_1.shape[0], len(clustered_1)))
# Sorted so the listing is deterministic rather than in set order.
ndp = sorted(set(range(gsm_X_1.shape[0])) - clustered_1)
print(gsm_X_1.index[ndp])

In [None]:
from bionlp.util import io, func

# Report which row positions of gsm_X_2 never appear in the cluster model.
# NOTE(review): assumes clt_mdl_2 flattens to integer row positions of gsm_X_2 -- confirm.
clt_mdl_2 = io.read_obj('../../../workspace/gesgnext/kallima_clusters_2.pkl')
gsm_X_2 = io.read_df('../data/gesgnext/udt200/gsm_X_2.npz', with_idx=True, sparse_fmt='csr')
# Flatten once and reuse the set for both the count and the difference.
clustered_2 = set(func.flatten_list(clt_mdl_2))
# Single-argument print(...) behaves the same on Python 2 and 3.
print('%s %s' % (gsm_X_2.shape[0], len(clustered_2)))
# Sorted so the listing is deterministic rather than in set order.
ndp = sorted(set(range(gsm_X_2.shape[0])) - clustered_2)
print(gsm_X_2.index[ndp])