In [1]:
import glob
import os
import operator
import tqdm
import pandas as pd
from ete3 import Tree as eTree
import numpy as np
import dendropy as dpy
from collections import OrderedDict


from tree_distance import PhyloTree
from Bio import SeqIO

import dendropy
from dendropy import Tree
from dendropy.calculate import treecompare

from scipy.spatial import distance
from matplotlib import rc


%matplotlib inline

# The following %config line changes the inline figures to have a higher DPI.
# You can comment out (#) this line if you don't have a high-DPI (~220) display.
%config InlineBackend.figure_format = 'retina'
# Set the global font to be DejaVu Sans, size 10 (or any other sans-serif font of your choice!)
rc('font',**{'family':'sans-serif','sans-serif':['DejaVu Sans'],'size':12})
# method_translate = {'pac':'PacBIO', 'ill':'Illumina', 'ont':'ONT'}


def get_p_dist_vec(tree, species):
    pd_vec = {}
    pd_vec[species] = 0.0
    ref_sp = tree.taxon_namespace.get_taxon(species)
    pdm = tree.phylogenetic_distance_matrix()
    dist_to_ref = []
    for leaf in tree.leaf_nodes():
        if leaf.taxon is not ref_sp:
#             pd_vec.append(pdm.patristic_distance(ref_sp, leaf.taxon))
            pd_vec[leaf.taxon.label] = pdm.patristic_distance(ref_sp, leaf.taxon)
    #out = [x[1][0] for x in sorted(pd_vec.items())]
    return pd_vec

In [2]:
REF_FOLDER = "/Users/daviddylus/projects/r2t/ref_trees"
REF_TREE = "/Users/daviddylus/projects/r2t/ref_trees/arath_ref_0.phy.contree"

tree = Tree()
tns = dendropy.TaxonNamespace()
tree_ref = tree.get_from_path(REF_TREE,"newick",taxon_namespace=tns)
ref_tree = tree_ref
ref_dict_pd = OrderedDict(sorted(get_p_dist_vec(ref_tree, 'ARATH').items(), key=lambda t: t[1]))
#     print(ref_dict_pd)
ref_neighbour = list(ref_dict_pd.items())[1]
#     print(tmp_tree)
pd_ref_dist = [x[1] for x in sorted(get_p_dist_vec(ref_tree, 'ARATH').items(), key=operator.itemgetter(1))]
pd_vec_ref = np.array(pd_ref_dist)
print(ref_neighbour[0])

arath_closest_neighbour = {}

tns = dendropy.TaxonNamespace()
for f in glob.glob(os.path.join(REF_FOLDER, 'arath*contree')):
    idist = os.path.basename(f).split('.')[0].split('_')[2]
    tree = Tree()
    tree_ref = tree.get_from_path(f,"newick",taxon_namespace=tns)
    ref_tree = tree_ref
    ref_dict_pd = OrderedDict(sorted(get_p_dist_vec(ref_tree, 'ARATH').items(), key=lambda t: t[1]))
    #     print(ref_dict_pd)
    ref_neighbour = list(ref_dict_pd.items())[1]
    arath_closest_neighbour[int(idist)] = ref_neighbour[0]
arath_closest_neighbour

ARALY


{0: 'ARALY',
 1: 'ARAAL',
 2: 'BRANA',
 3: 'VITVI',
 4: 'THECC',
 5: 'VITVI',
 6: 'SOLTU'}

In [39]:
assembly_data = pd.read_csv('/Users/daviddylus/Downloads/benchmark_reads - like_shen (2).csv')
yeast_ass_data = assembly_data.loc[assembly_data.species=='yeast']
# yeast_ass_data = yeast_ass_data.dropna()
arath_ass_data = assembly_data.loc[assembly_data.species=='arath']
mouse_ass_data = assembly_data.loc[assembly_data.species=='mouse']

# arath_ass_data.tree = [t.replace('_*','') for t in arath_ass_data.tree]
arath_ass_data.head()

Unnamed: 0,species,coverage,dist,assembly_time,technology,assembly tree,r2t tree shen,r2t tree,ref trees
0,arath,0.2,0,,Illumina,,"(CHLVA:0.3296090866,((SELML:0.1971417168,(((((...","(BRARP:0.00539097,((((((((((((((TRIUA:0.019465...","(VITVI:0.09177319336826902,(((((((ORYLO:0.0175..."
1,arath,0.2,1,,Illumina,,"(SELML:0.2002209391,((CHLVA:0.3438467287,KLEFL...","(ARATH:0.0345819,((BRARP:0.00537901,(BRANA:0.0...","(VITVI:0.09177761921655042,(((((((ORYLO:0.0175..."
2,arath,0.2,2,,Illumina,,"(SORBI:0.0139614912,(((((((((((BRARP:0.0041331...","(TRIUA:0.0194671,(((((((((((((BRARP:0.00533048...","(VITVI:0.09179352450169354,(((((((ORYLO:0.0175..."
3,arath,0.2,3,,Illumina,,"(3_ill:0.0704052773,((((CHLVA:0.3267546679,KLE...","(ARATH:0.100639,((((((((((((TRIUA:0.0194609,AE...","(VITVI:0.09141184952100921,(((((((ORYLO:0.0175..."
4,arath,0.2,4,,Illumina,,"(4_ill:0.0654262303,((((CHLVA:0.3459067888,KLE...","(ARATH:0.0983816,(((((((((((TRIUA:0.019467,AEG...","(VITVI:0.08986284816204641,(((((((ORYLO:0.0175..."


# ARATH

In [73]:
arath_ass_data.fillna('',inplace=True)
rf,euc,normalized_euc1,wrf = [],[],[],[]
dist = {'species':[],'rf':[], 'nrf':[], 'euc':[], 'method':[],'technology':[], 'dist':[], 'coverage':[], 'neuc':[], 'same_neighbour':[]}
for i,r in arath_ass_data.iterrows():
    if r['assembly tree'] != '':
#         print(r['assembly tree'])
        if 'ARATH' in r['assembly tree']:
#             arath_ref_row = arath_ref_data[(arath_ref_data.internode_dist==r.dist) & (arath_ref_data.gcov==r.coverage) & (arath_ref_data.method==r.technology)]
        #     print(r['assembly tree'].replace('_'+r.technology[0:3].lower(),''))
            ass_etree = eTree(r['assembly tree'].replace('_'+r.technology[0:3].lower(),'').replace('HORVV','HORVD'))
#             ref_etree = eTree(arath_ref_row.ref_trees.values[0])
            ref_etree = eTree(r['ref trees'])
            common_leaves = list(set(ass_etree.get_leaf_names()) & set(ref_etree.get_leaf_names()))

            ass_etree.prune(common_leaves)
            ref_etree.prune(common_leaves)

            ass_tree = Tree
            ass_tree = ass_tree.get_from_string(ass_etree.write(),"newick",taxon_namespace=tns)
            ref_tree = Tree()
            ref_tree = ref_tree.get_from_string(ref_etree.write(),"newick",taxon_namespace=tns)

            ref_dict_pd = OrderedDict(sorted(get_p_dist_vec(ref_tree, 'ARATH').items(), key=lambda t: t[1]))
            ref_neighbour = list(ref_dict_pd.items())[1]

            euc.append(treecompare.euclidean_distance(ass_tree, ref_tree))
            normalized_euc1.append(treecompare.euclidean_distance(ass_tree, ref_tree)/(ass_tree.length()+ref_tree.length()))
            rf.append(treecompare.symmetric_difference(ass_tree, ref_tree))
            wrf.append(treecompare.weighted_robinson_foulds_distance(ass_tree, ref_tree))

            test_dict_pd = OrderedDict(sorted(get_p_dist_vec(ass_tree, 'ARATH').items(), key=lambda t: t[1]))
            test_neighbour = list(test_dict_pd.items())[1]
    #         print(test_dict_pd)
    #         print(arath_closest_neighbour[r.dist], test_neighbour[0])
            if ref_neighbour[0] in test_neighbour[0]:
                dist['same_neighbour'].append('same')
            elif rf[-1] == 0:
                dist['same_neighbour'].append('same')
            else:
                dist['same_neighbour'].append('different')


            # normalize tree edge lengths such that the maximum distance between root and leaf is 1
            max_dist_to_root = np.array(ass_tree.calc_node_root_distances()).max()
            for edge in ass_tree.postorder_edge_iter():
                if edge.length is None:
                    edge.length = 0
                else:
                    edge.length = float(edge.length)/max_dist_to_root

            max_dist_to_root = np.array(ref_tree.calc_node_root_distances()).max()
            for edge in ref_tree.postorder_edge_iter():
                if edge.length is None:
                    edge.length = 0
                else:
                    edge.length = float(edge.length)/max_dist_to_root

            dist['neuc'].append(treecompare.euclidean_distance(ass_tree, ref_tree))

            dist['species'].append(r.species)
            dist['rf'].append(treecompare.symmetric_difference(ass_tree, ref_tree))
            dist['nrf'].append(ass_etree.compare(ref_etree,unrooted=True)["norm_rf"])
            dist['euc'].append(euc[-1])
            dist['method'].append('assembly')
            dist['technology'].append(r.technology)
            dist['coverage'].append(r.coverage)
            dist['dist'].append(r.dist)
        else:
            dist['same_neighbour'].append('different')
            dist['neuc'].append(np.nan)
            dist['species'].append(r.species)
            dist['rf'].append(np.nan)
            dist['nrf'].append(1)
            dist['euc'].append(np.nan)
            dist['method'].append('assembly')
            dist['technology'].append(r.technology)
            dist['coverage'].append(r.coverage)
            dist['dist'].append(r.dist)
    else:
        dist['neuc'].append(np.nan)
        dist['same_neighbour'].append('different')
        dist['species'].append(r.species)
        dist['rf'].append(np.nan)
        dist['nrf'].append(1)
        dist['euc'].append(np.nan)
        dist['method'].append('assembly')
        dist['technology'].append(r.technology)
        dist['coverage'].append(r.coverage)
        dist['dist'].append(r.dist)

        
for i,r in arath_ass_data.iterrows():
    if 'ont' in r.technology[0:3].lower():
        techh = 'nan'
    else:
        techh = r.technology[0:3].lower()
    if techh in r['r2t tree shen']:
#         arath_ref_row = arath_ref_data[(arath_ref_data.internode_dist==r.dist) & (arath_ref_data.gcov==r.coverage) & (arath_ref_data.method==r.technology)]
        #     print(r['assembly tree'].replace('_'+r.technology[0:3].lower(),''))
        ass_etree = eTree(r['r2t tree shen'].replace(str(r.dist)+'_'+techh.lower(),'ARATH').replace('HORVV','HORVD'))
#         ref_etree = eTree(arath_ref_row.ref_trees.values[0])
        ref_etree = eTree(r['ref trees'])
        common_leaves = list(set(ass_etree.get_leaf_names()) & set(ref_etree.get_leaf_names()))

        ass_etree.prune(common_leaves)
        ref_etree.prune(common_leaves)

        ass_tree = Tree
        ass_tree = ass_tree.get_from_string(ass_etree.write(),"newick",taxon_namespace=tns)
        ref_tree = Tree()
        ref_tree = ref_tree.get_from_string(ref_etree.write(),"newick",taxon_namespace=tns)
        ref_dict_pd = OrderedDict(sorted(get_p_dist_vec(ref_tree, 'ARATH').items(), key=lambda t: t[1]))
        ref_neighbour = list(ref_dict_pd.items())[1]

        euc.append(treecompare.euclidean_distance(ass_tree, ref_tree))
        normalized_euc1.append(treecompare.euclidean_distance(ass_tree, ref_tree)/(ass_tree.length()+ref_tree.length()))
        rf.append(treecompare.symmetric_difference(ass_tree, ref_tree))
        wrf.append(treecompare.weighted_robinson_foulds_distance(ass_tree, ref_tree))

        test_dict_pd = OrderedDict(sorted(get_p_dist_vec(ass_tree, 'ARATH').items(), key=lambda t: t[1]))
        test_neighbour = list(test_dict_pd.items())[1]
#         print(test_dict_pd)
#         print(arath_closest_neighbour[r.dist], test_neighbour[0])
        if ref_neighbour[0] in test_neighbour[0]:
            dist['same_neighbour'].append('same')
        elif rf[-1] == 0:
            dist['same_neighbour'].append('same')
        else:
            dist['same_neighbour'].append('different')


        # normalize tree edge lengths such that the maximum distance between root and leaf is 1
        max_dist_to_root = np.array(ass_tree.calc_node_root_distances()).max()
        for edge in ass_tree.postorder_edge_iter():
            if edge.length is None:
                edge.length = 0
            else:
                edge.length = float(edge.length)/max_dist_to_root

        max_dist_to_root = np.array(ref_tree.calc_node_root_distances()).max()
        for edge in ref_tree.postorder_edge_iter():
            if edge.length is None:
                edge.length = 0
            else:
                edge.length = float(edge.length)/max_dist_to_root

        dist['neuc'].append(treecompare.euclidean_distance(ass_tree, ref_tree))

        dist['species'].append(r.species)
        dist['rf'].append(treecompare.symmetric_difference(ass_tree, ref_tree))
        dist['nrf'].append(ass_etree.compare(ref_etree,unrooted=True)["norm_rf"])
        dist['euc'].append(euc[-1])
        dist['method'].append('r2t')
        dist['technology'].append(r.technology)
        dist['coverage'].append(r.coverage)
        dist['dist'].append(r.dist)
    else:
        dist['neuc'].append(np.nan)
        dist['same_neighbour'].append('different')
        dist['species'].append(r.species)
        dist['rf'].append(np.nan)
        dist['nrf'].append(np.nan)
        dist['euc'].append(np.nan)
        dist['method'].append('r2t')
        dist['technology'].append(r.technology)
        dist['coverage'].append(r.coverage)
        dist['dist'].append(r.dist)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [74]:
df = pd.DataFrame(dist)
df.to_csv('/Users/daviddylus/projects/r2t/benchmark/like_shen/arath_tree_compare_assembly-like_shen_all.csv')
df[df.coverage == 10]

Unnamed: 0,coverage,dist,euc,method,neuc,nrf,rf,same_neighbour,species,technology
28,10.0,0,0.281532,assembly,0.260578,0.032258,2.0,same,arath,Illumina
29,10.0,1,0.276043,assembly,0.206596,0.066667,4.0,same,arath,Illumina
30,10.0,2,0.226962,assembly,0.161238,0.034483,2.0,same,arath,Illumina
31,10.0,3,0.200533,assembly,0.165699,0.0,0.0,same,arath,Illumina
32,10.0,4,0.197474,assembly,0.275238,0.0,0.0,same,arath,Illumina
33,10.0,5,0.226565,assembly,0.423443,0.0,0.0,same,arath,Illumina
34,10.0,6,0.149877,assembly,0.38576,0.0,0.0,same,arath,Illumina
70,10.0,0,0.321342,assembly,0.332999,0.129032,8.0,same,arath,ONT
71,10.0,1,0.270236,assembly,0.218354,0.1,6.0,different,arath,ONT
72,10.0,2,0.311499,assembly,0.278261,0.034483,2.0,different,arath,ONT


# YEAST

In [75]:
# REF_TREE = "/Users/daviddylus/projects/r2t/ref_trees/yeast_ref_0.phy.contree"

# tree = Tree()
# tns = dendropy.TaxonNamespace()
# tree_ref = tree.get_from_path(REF_TREE,"newick",taxon_namespace=tns)
# ref_tree = tree_ref
# ref_dict_pd = OrderedDict(sorted(get_p_dist_vec(ref_tree, 'YEAST').items(), key=lambda t: t[1]))
# #     print(ref_dict_pd)
# ref_neighbour = list(ref_dict_pd.items())[1]
# #     print(tmp_tree)
# pd_ref_dist = [x[1] for x in sorted(get_p_dist_vec(ref_tree, 'YEAST').items(), key=operator.itemgetter(1))]
# pd_vec_ref = np.array(pd_ref_dist)
yeast_closest_neighbour = {}
tns = dendropy.TaxonNamespace()
for f in glob.glob(os.path.join(REF_FOLDER, 'yeast*contree')):
    idist = os.path.basename(f).split('.')[0].split('_')[2]
    tree = Tree()
    tree_ref = tree.get_from_path(f,"newick",taxon_namespace=tns)
    ref_tree = tree_ref
    ref_dict_pd = OrderedDict(sorted(get_p_dist_vec(ref_tree, 'YEAST').items(), key=lambda t: t[1]))
    #     print(ref_dict_pd)
    ref_neighbour = list(ref_dict_pd.items())[1]
    yeast_closest_neighbour[int(idist)] = ref_neighbour[0]
yeast_closest_neighbour

{0: 'CANGA',
 1: 'DEBHA',
 2: 'DEKBR',
 3: 'SCHPO',
 4: 'SCHPO',
 5: 'SPIPN',
 6: 'CAPO3'}

In [76]:
yeast_ass_data.fillna('',inplace=True)
rf,euc,normalized_euc1,wrf = [],[],[],[]
dist = {'species':[],'rf':[], 'nrf':[], 'euc':[], 'method':[],'technology':[], 'dist':[], 'coverage':[], 'neuc':[], 'same_neighbour':[]}

for i,r in yeast_ass_data.iterrows():
    if r['assembly tree'] != '':
        if 'YEAST' in r['assembly tree']:
    #         yeast_ref_row = yeast_ref_data[(yeast_ref_data.internode_dist==r.dist) & (yeast_ref_data.gcov==r.coverage) & (yeast_ref_data.method==r.technology)]
            ass_etree = eTree(r['assembly tree'].replace('_'+r.technology[0:3].lower(),''))
    #         ref_etree = eTree(yeast_ref_row.ref_trees.values[0])
            ref_etree = eTree(r['ref trees'])
            common_leaves = list(set(ass_etree.get_leaf_names()) & set(ref_etree.get_leaf_names()))

            ass_etree.prune(common_leaves)
            ref_etree.prune(common_leaves)

            ass_tree = Tree
            ass_tree = ass_tree.get_from_string(ass_etree.write(),"newick",taxon_namespace=tns)
            ref_tree = Tree()
            ref_tree = ref_tree.get_from_string(ref_etree.write(),"newick",taxon_namespace=tns)

            ref_dict_pd = OrderedDict(sorted(get_p_dist_vec(ref_tree, 'YEAST').items(), key=lambda t: t[1]))
            ref_neighbour = list(ref_dict_pd.items())[1]

            euc.append(treecompare.euclidean_distance(ass_tree, ref_tree))
            normalized_euc1.append(treecompare.euclidean_distance(ass_tree, ref_tree)/(ass_tree.length()+ref_tree.length()))
            rf.append(treecompare.symmetric_difference(ass_tree, ref_tree))
            wrf.append(treecompare.weighted_robinson_foulds_distance(ass_tree, ref_tree))


            test_dict_pd = OrderedDict(sorted(get_p_dist_vec(ass_tree, 'YEAST').items(), key=lambda t: t[1]))
            test_neighbour = list(test_dict_pd.items())[1]
    #         print(r.coverage, r.technology, r.dist)
    #         print(ref_neighbour[0], test_neighbour[0], rf[-1])
            if yeast_closest_neighbour[r.dist] in test_neighbour[0]:
                dist['same_neighbour'].append('same')
            elif rf[-1] == 0:
                dist['same_neighbour'].append('same')
            else:
                dist['same_neighbour'].append('different')

                # normalize tree edge lengths such that the maximum distance between root and leaf is 1
            max_dist_to_root = np.array(ass_tree.calc_node_root_distances()).max()
            for edge in ass_tree.postorder_edge_iter():
                if edge.length is None:
                    edge.length = 0
                else:
                    edge.length = float(edge.length)/max_dist_to_root

            max_dist_to_root = np.array(ref_tree.calc_node_root_distances()).max()
            for edge in ref_tree.postorder_edge_iter():
                if edge.length is None:
                    edge.length = 0
                else:
                    edge.length = float(edge.length)/max_dist_to_root

            dist['neuc'].append(treecompare.euclidean_distance(ass_tree, ref_tree))

            dist['species'].append(r.species)
            dist['rf'].append(treecompare.symmetric_difference(ass_tree, ref_tree))
            dist['nrf'].append(ass_etree.compare(ref_etree,unrooted=True)["norm_rf"])
            dist['euc'].append(euc[-1])
            dist['method'].append('assembly')
            dist['technology'].append(r.technology)
            dist['coverage'].append(r.coverage)
            dist['dist'].append(r.dist)
        else:
            dist['same_neighbour'].append('different')
            dist['neuc'].append(np.nan)
            dist['species'].append(r.species)
            dist['rf'].append(np.nan)
            dist['nrf'].append(1)
            dist['euc'].append(np.nan)
            dist['method'].append('assembly')
            dist['technology'].append(r.technology)
            dist['coverage'].append(r.coverage)
            dist['dist'].append(r.dist)
    else:
        dist['same_neighbour'].append('different')
        dist['neuc'].append(np.nan)
        dist['species'].append(r.species)
        dist['rf'].append(np.nan)
        dist['nrf'].append(1)
        dist['euc'].append(np.nan)
        dist['method'].append('assembly')
        dist['technology'].append(r.technology)
        dist['coverage'].append(r.coverage)
        dist['dist'].append(r.dist)
    

for i,r in yeast_ass_data.iterrows():
    if 'ont' in r.technology[0:3].lower():
        techh = 'nan'
    else:
        techh = r.technology[0:3].lower()
    if techh in r['r2t tree shen']:
#         yeast_ref_row = yeast_ref_data[(yeast_ref_data.internode_dist==r.dist) & (yeast_ref_data.gcov==r.coverage) & (yeast_ref_data.method==r.technology)]
        ass_etree = eTree(r['r2t tree shen'].replace(str(r.dist)+'_'+techh,'YEAST'))
#         ref_etree = eTree(yeast_ref_row.ref_trees.values[0])
        ref_etree = eTree(r['ref trees'])
        common_leaves = list(set(ass_etree.get_leaf_names()) & set(ref_etree.get_leaf_names()))

        ass_etree.prune(common_leaves)
        ref_etree.prune(common_leaves)

        ass_tree = Tree
        ass_tree = ass_tree.get_from_string(ass_etree.write(),"newick",taxon_namespace=tns)
        ref_tree = Tree()
        ref_tree = ref_tree.get_from_string(ref_etree.write(),"newick",taxon_namespace=tns)
        
        ref_dict_pd = OrderedDict(sorted(get_p_dist_vec(ref_tree, 'YEAST').items(), key=lambda t: t[1]))
        ref_neighbour = list(ref_dict_pd.items())[1]

        euc.append(treecompare.euclidean_distance(ass_tree, ref_tree))
        normalized_euc1.append(treecompare.euclidean_distance(ass_tree, ref_tree)/(ass_tree.length()+ref_tree.length()))
        rf.append(treecompare.symmetric_difference(ass_tree, ref_tree))
        wrf.append(treecompare.weighted_robinson_foulds_distance(ass_tree, ref_tree))


        test_dict_pd = OrderedDict(sorted(get_p_dist_vec(ass_tree, 'YEAST').items(), key=lambda t: t[1]))
        test_neighbour = list(test_dict_pd.items())[1]
#         print(r.coverage, r.technology, r.dist)
#         print(ref_neighbour[0], test_neighbour[0], rf[-1])
        if yeast_closest_neighbour[r.dist] in test_neighbour[0]:
            dist['same_neighbour'].append('same')
        elif rf[-1] == 0:
            dist['same_neighbour'].append('same')
        else:
            dist['same_neighbour'].append('different')

            # normalize tree edge lengths such that the maximum distance between root and leaf is 1
        max_dist_to_root = np.array(ass_tree.calc_node_root_distances()).max()
        for edge in ass_tree.postorder_edge_iter():
            if edge.length is None:
                edge.length = 0
            else:
                edge.length = float(edge.length)/max_dist_to_root

        max_dist_to_root = np.array(ref_tree.calc_node_root_distances()).max()
        for edge in ref_tree.postorder_edge_iter():
            if edge.length is None:
                edge.length = 0
            else:
                edge.length = float(edge.length)/max_dist_to_root

        dist['neuc'].append(treecompare.euclidean_distance(ass_tree, ref_tree))

        dist['species'].append(r.species)
        dist['rf'].append(treecompare.symmetric_difference(ass_tree, ref_tree))
        dist['nrf'].append(ass_etree.compare(ref_etree,unrooted=True)["norm_rf"])
        dist['euc'].append(euc[-1])
        dist['method'].append('r2t')
        dist['technology'].append(r.technology)
        dist['coverage'].append(r.coverage)
        dist['dist'].append(r.dist)
    else:
        dist['same_neighbour'].append('different')
        dist['neuc'].append(np.nan)
        dist['species'].append(r.species)
        dist['rf'].append(np.nan)
        dist['nrf'].append(np.nan)
        dist['euc'].append(np.nan)
        dist['method'].append('r2t')
        dist['technology'].append(r.technology)
        dist['coverage'].append(r.coverage)
        dist['dist'].append(r.dist)

df = pd.DataFrame(dist)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


Unnamed: 0,coverage,dist,euc,method,neuc,nrf,rf,same_neighbour,species,technology
0,0.2,0,,assembly,,1,,different,yeast,Illumina
1,0.2,1,,assembly,,1,,different,yeast,Illumina
2,0.2,2,,assembly,,1,,different,yeast,Illumina
3,0.2,3,,assembly,,1,,different,yeast,Illumina
4,0.2,4,,assembly,,1,,different,yeast,Illumina
5,0.2,5,,assembly,,1,,different,yeast,Illumina
6,0.2,6,,assembly,,1,,different,yeast,Illumina
7,0.5,0,,assembly,,1,,different,yeast,Illumina
8,0.5,1,,assembly,,1,,different,yeast,Illumina
9,0.5,2,,assembly,,1,,different,yeast,Illumina


In [77]:
df[df.coverage == 20]
df.to_csv('/Users/daviddylus/projects/r2t/benchmark/like_shen/yeast_tree_compare_assembly-like_shen_all.csv')

# MOUSE

In [78]:
# REF_TREE = "/Users/daviddylus/projects/r2t/ref_trees/mouse_ref_0.phy.contree"

# tree = Tree()
# tns = dendropy.TaxonNamespace()
# tree_ref = tree.get_from_path(REF_TREE,"newick",taxon_namespace=tns)
# ref_tree = tree_ref
# ref_dict_pd = OrderedDict(sorted(get_p_dist_vec(ref_tree, 'YEAST').items(), key=lambda t: t[1]))
# #     print(ref_dict_pd)
# ref_neighbour = list(ref_dict_pd.items())[1]
# #     print(tmp_tree)
# pd_ref_dist = [x[1] for x in sorted(get_p_dist_vec(ref_tree, 'YEAST').items(), key=operator.itemgetter(1))]
# pd_vec_ref = np.array(pd_ref_dist)
mouse_closest_neighbour = {}
tns = dendropy.TaxonNamespace()
for f in glob.glob(os.path.join(REF_FOLDER, 'mouse*contree')):
    idist = os.path.basename(f).split('.')[0].split('_')[2]
    tree = Tree()
    tree_ref = tree.get_from_path(f,"newick",taxon_namespace=tns)
    ref_tree = tree_ref
    ref_dict_pd = OrderedDict(sorted(get_p_dist_vec(ref_tree, 'MOUSE').items(), key=lambda t: t[1]))
    #     print(ref_dict_pd)
    ref_neighbour = list(ref_dict_pd.items())[1]
    mouse_closest_neighbour[int(idist)] = ref_neighbour[0]
mouse_closest_neighbour

{0: 'RATNO',
 1: 'HUMAN',
 2: 'CHICK',
 3: 'XENTR',
 4: 'PETMA',
 5: 'BRAFL',
 6: 'CIOIN'}

In [79]:
mouse_ass_data.fillna('',inplace=True)
rf,euc,normalized_euc1,wrf = [],[],[],[]
dist = {'species':[],'rf':[], 'nrf':[], 'euc':[], 'method':[],'technology':[], 'dist':[], 'coverage':[], 'neuc':[], 'same_neighbour':[]}

for i,r in mouse_ass_data.iterrows():
    if r['assembly tree'] != '':
        if 'MOUSE' in r['assembly tree']:
#             mouse_ref_row = mouse_ref_data[(mouse_ref_data.internode_dist==r.dist) & (mouse_ref_data.gcov==r.coverage) & (mouse_ref_data.method==r.technology)]
            ass_etree = eTree(r['assembly tree'].replace('_'+r.technology[0:3].lower(),''))
#             ref_etree = eTree(mouse_ref_row.ref_trees.values[0])
            ref_etree = eTree(r['ref trees'])
            common_leaves = list(set(ass_etree.get_leaf_names()) & set(ref_etree.get_leaf_names()))

            ass_etree.prune(common_leaves)
            ref_etree.prune(common_leaves)

            ass_tree = Tree
            ass_tree = ass_tree.get_from_string(ass_etree.write(),"newick",taxon_namespace=tns)
            ref_tree = Tree()
            ref_tree = ref_tree.get_from_string(ref_etree.write(),"newick",taxon_namespace=tns)

            ref_dict_pd = OrderedDict(sorted(get_p_dist_vec(ref_tree, 'MOUSE').items(), key=lambda t: t[1]))
            ref_neighbour = list(ref_dict_pd.items())[1]

            euc.append(treecompare.euclidean_distance(ass_tree, ref_tree))
            normalized_euc1.append(treecompare.euclidean_distance(ass_tree, ref_tree)/(ass_tree.length()+ref_tree.length()))
            rf.append(treecompare.symmetric_difference(ass_tree, ref_tree))
            wrf.append(treecompare.weighted_robinson_foulds_distance(ass_tree, ref_tree))


            test_dict_pd = OrderedDict(sorted(get_p_dist_vec(ass_tree, 'MOUSE').items(), key=lambda t: t[1]))
            test_neighbour = list(test_dict_pd.items())[1]
#             print(r.coverage, r.technology, r.dist)
#             print(ref_neighbour[0], test_neighbour[0], rf[-1])
            if mouse_closest_neighbour[r.dist] in test_neighbour[0]:
                dist['same_neighbour'].append('same')
            elif rf[-1] == 0:
                dist['same_neighbour'].append('same')
            else:
                dist['same_neighbour'].append('different')

                # normalize tree edge lengths such that the maximum distance between root and leaf is 1
            max_dist_to_root = np.array(ass_tree.calc_node_root_distances()).max()
            for edge in ass_tree.postorder_edge_iter():
                if edge.length is None:
                    edge.length = 0
                else:
                    edge.length = float(edge.length)/max_dist_to_root

            max_dist_to_root = np.array(ref_tree.calc_node_root_distances()).max()
            for edge in ref_tree.postorder_edge_iter():
                if edge.length is None:
                    edge.length = 0
                else:
                    edge.length = float(edge.length)/max_dist_to_root

            dist['neuc'].append(treecompare.euclidean_distance(ass_tree, ref_tree))

            dist['species'].append(r.species)
            dist['rf'].append(treecompare.symmetric_difference(ass_tree, ref_tree))
            dist['nrf'].append(ass_etree.compare(ref_etree,unrooted=True)["norm_rf"])
            dist['euc'].append(euc[-1])
            dist['method'].append('assembly')
            dist['technology'].append(r.technology)
            dist['coverage'].append(r.coverage)
            dist['dist'].append(r.dist)
    else:
        dist['same_neighbour'].append('different')
        dist['neuc'].append(np.nan)
        dist['species'].append(r.species)
        dist['rf'].append(np.nan)
        dist['nrf'].append(1)
        dist['euc'].append(np.nan)
        dist['method'].append('assembly')
        dist['technology'].append(r.technology)
        dist['coverage'].append(r.coverage)
        dist['dist'].append(r.dist)

for i,r in mouse_ass_data.iterrows():
    if 'ont' in r.technology[0:3].lower():
        techh = 'nan'
    else:
        techh = r.technology[0:3].lower()
    if techh in r['r2t tree shen']:
#         mouse_ref_row = mouse_ref_data[(mouse_ref_data.internode_dist==r.dist) & (mouse_ref_data.gcov==r.coverage) & (mouse_ref_data.method==r.technology)]
        ass_etree = eTree(r['r2t tree shen'].replace(str(r.dist)+'_'+techh,'MOUSE'))
#         ref_etree = eTree(mouse_ref_row.ref_trees.values[0])
        ref_etree = eTree(r['ref trees'])
        common_leaves = list(set(ass_etree.get_leaf_names()) & set(ref_etree.get_leaf_names()))

        ass_etree.prune(common_leaves)
        ref_etree.prune(common_leaves)

        ass_tree = Tree
        ass_tree = ass_tree.get_from_string(ass_etree.write(),"newick",taxon_namespace=tns)
        ref_tree = Tree()
        ref_tree = ref_tree.get_from_string(ref_etree.write(),"newick",taxon_namespace=tns)
        
        ref_dict_pd = OrderedDict(sorted(get_p_dist_vec(ref_tree, 'MOUSE').items(), key=lambda t: t[1]))
        ref_neighbour = list(ref_dict_pd.items())[1]

        euc.append(treecompare.euclidean_distance(ass_tree, ref_tree))
        normalized_euc1.append(treecompare.euclidean_distance(ass_tree, ref_tree)/(ass_tree.length()+ref_tree.length()))
        rf.append(treecompare.symmetric_difference(ass_tree, ref_tree))
        wrf.append(treecompare.weighted_robinson_foulds_distance(ass_tree, ref_tree))


        test_dict_pd = OrderedDict(sorted(get_p_dist_vec(ass_tree, 'MOUSE').items(), key=lambda t: t[1]))
        test_neighbour = list(test_dict_pd.items())[1]
#         print(r.coverage, r.technology, r.dist)
#         print(ref_neighbour[0], test_neighbour[0], rf[-1])
        if mouse_closest_neighbour[r.dist] in test_neighbour[0]:
            dist['same_neighbour'].append('same')
        elif rf[-1] == 0:
            dist['same_neighbour'].append('same')
        else:
            dist['same_neighbour'].append('different')

            # normalize tree edge lengths such that the maximum distance between root and leaf is 1
        max_dist_to_root = np.array(ass_tree.calc_node_root_distances()).max()
        for edge in ass_tree.postorder_edge_iter():
            if edge.length is None:
                edge.length = 0
            else:
                edge.length = float(edge.length)/max_dist_to_root

        max_dist_to_root = np.array(ref_tree.calc_node_root_distances()).max()
        for edge in ref_tree.postorder_edge_iter():
            if edge.length is None:
                edge.length = 0
            else:
                edge.length = float(edge.length)/max_dist_to_root

        dist['neuc'].append(treecompare.euclidean_distance(ass_tree, ref_tree))

        dist['species'].append(r.species)
        dist['rf'].append(treecompare.symmetric_difference(ass_tree, ref_tree))
        dist['nrf'].append(ass_etree.compare(ref_etree,unrooted=True)["norm_rf"])
        dist['euc'].append(euc[-1])
        dist['method'].append('r2t')
        dist['technology'].append(r.technology)
        dist['coverage'].append(r.coverage)
        dist['dist'].append(r.dist)
    
df = pd.DataFrame(dist)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


Unnamed: 0,coverage,dist,euc,method,neuc,nrf,rf,same_neighbour,species,technology
0,0.2,0,,assembly,,1.000000,,different,mouse,Illumina
1,0.2,1,,assembly,,1.000000,,different,mouse,Illumina
2,0.2,2,,assembly,,1.000000,,different,mouse,Illumina
3,0.2,3,,assembly,,1.000000,,different,mouse,Illumina
4,0.2,4,,assembly,,1.000000,,different,mouse,Illumina
5,0.2,5,,assembly,,1.000000,,different,mouse,Illumina
6,0.2,6,,assembly,,1.000000,,different,mouse,Illumina
7,0.5,0,,assembly,,1.000000,,different,mouse,Illumina
8,0.5,1,,assembly,,1.000000,,different,mouse,Illumina
9,0.5,2,,assembly,,1.000000,,different,mouse,Illumina


In [80]:
df.to_csv('/Users/daviddylus/projects/r2t/benchmark/like_shen/mouse_tree_compare_assembly-like_shen_all.csv')

In [49]:
df

Unnamed: 0,coverage,dist,euc,method,neuc,nrf,rf,same_neighbour,species,technology
0,0.2,0,,assembly,,,,,mouse,Illumina
1,0.2,1,,assembly,,,,,mouse,Illumina
2,0.2,2,,assembly,,,,,mouse,Illumina
3,0.2,3,,assembly,,,,,mouse,Illumina
4,0.2,4,,assembly,,,,,mouse,Illumina
5,0.2,5,,assembly,,,,,mouse,Illumina
6,0.2,6,,assembly,,,,,mouse,Illumina
7,0.5,0,,assembly,,,,,mouse,Illumina
8,0.5,1,,assembly,,,,,mouse,Illumina
9,0.5,2,,assembly,,,,,mouse,Illumina
