In [6]:
import glob
import os
import operator
import tqdm
import pandas as pd
from ete3 import Tree as eTree
import numpy as np
import dendropy as dpy

from tree_distance import PhyloTree
from Bio import SeqIO

import dendropy
from dendropy import Tree
from dendropy.calculate import treecompare

from scipy.spatial import distance

OUTPUT_FOLDER_REF = "/Volumes/D2c/wally/r2t/r2t_aling_sp_removal/ref"
OUTPUT_FOLDER = "/Volumes/D2c/yeast/r2t/rm_species/"


# arath
t_neighbour = ['ARATH','ARALY','ARAAL','BRAOL','LOTJA','GOSHI','VITVI','SOLTU']

# yeast
#t_neighbour = ['YEAST','CANGA','DEBHA','DEKBR','TUBMM','SCHPO','SPIPN']

# mouse
#t_neighbour = ['MOUSE','RATNO','CHICK','XENTR','PETMA','BRAFL','STRPU']

tns = dendropy.TaxonNamespace()
out_dict = {}
translate_names = {'pac20X':20, 'pac10X':10, 'pac02X':0.2, 'pac1X':1, 'pac5X':5, 'pac05X':0.5,'ill20X':20, 'ill10X':10, 'ill02X':0.2, 'ill1X':1, 'ill2X':2, 'ill5X':5, 'ill05X':0.5, 'nan20X':20, 'nan10X':10, 'nan02X':0.2, 'nan1X':1, 'nan5X':5, 'nan05X':0.5, 'nan50X':50, 'nan100X':100}

In [7]:
df_trees = pd.read_csv('/Users/daviddylus/Downloads/benchmark_reads - like_shen.csv')
df_trees

Unnamed: 0,species,coverage,dist,assembly_time,technology,assembly tree,r2t tree
0,arath,5,0,1737941.0,ONT,"(CHLVA:0.3080690824,((((((((((((BRAOL:0.003424...","(KLEFL:0.1932431691,(((((((((((TRIUA:0.0123911..."
1,arath,5,1,1737941.0,ONT,"(BRARP:0.0057111899,((ARATH:0.1825372512,(((((...","(TRIUA:0.0123842930,(((((((((((KLEFL:0.1954737..."
2,arath,5,2,1737941.0,ONT,"(ARATH:0.1936335677,(((((((((CHLVA:0.305819149...","(TRIUA:0.0129611552,(((((((((((KLEFL:0.2017310..."
3,arath,5,3,1737941.0,ONT,"(TRIUA:0.0157587712,AEGTA:0.0112201570,(((((((...","(3_ONT:0.1185682305,((((((((CHLVA:0.3229969294..."
4,arath,5,4,1737941.0,ONT,"(VITVI:0.0787176945,(ARATH:0.2905347123,((((((...","(AMBTC:0.1204692866,(((((4_ONT:0.1219879740,(G..."
5,arath,5,5,1737941.0,ONT,"(ARATH:0.2937085301,(((((CHLVA:0.3685693057,KL...","(AMBTC:0.1203091905,((5_ONT:0.1116193523,((SOL..."
6,arath,5,6,1737941.0,ONT,"(SELML:0.2085547550,((CHLVA:0.3610055246,KLEFL...","(TRIUA:0.0132110030,((((((((6_ONT:0.1041204469..."
7,arath,5,0,9600.0,Illumina,"(KLEFL:0.1987601737,CHLVA:0.3127645175,(((((((...","(LOTJA:0.0543836062,(((BRARP:0.0041778261,(BRA..."
8,arath,5,1,9600.0,Illumina,"(KLEFL:0.2031197160,CHLVA:0.3302112639,(((((((...","(BRARP:0.0042874961,((((((((AMBTC:0.1179213347..."
9,arath,5,2,9600.0,Illumina,"(KLEFL:0.2027736855,(((((((((ARATH:0.045389567...","(LOTJA:0.0555186546,((((((AMBTC:0.1185441968,(..."


In [8]:
def get_p_dist_vec(tree, species):
    pd_vec = {}
    pd_vec[species] = 0.0
    ref_sp = tree.taxon_namespace.get_taxon(species)
    pdm = tree.phylogenetic_distance_matrix()
    dist_to_ref = []
    for leaf in tree.leaf_nodes():
        if leaf.taxon is not ref_sp:
#             pd_vec.append(pdm.patristic_distance(ref_sp, leaf.taxon))
            pd_vec[leaf.taxon.label] = pdm.patristic_distance(ref_sp, leaf.taxon)
    #out = [x[1][0] for x in sorted(pd_vec.items())]
    return pd_vec

def get_ref_tree_dic_ete(folder,suffix):
    out_dict = {}
    for file in glob.glob(os.path.join(folder,suffix+'*.contree')):
        basename = int(os.path.basename(file).split("_")[2].rsplit('.phy.contree')[0])
        t_tmp = eTree(file)
        out_dict[str(basename)] = t_tmp
    return out_dict

def get_ref_tree_dic_dendro(folder,suffix):
    out_dict = {}
    for file in glob.glob(os.path.join(folder,suffix+'*.contree')):
        basename = int(os.path.basename(file).split("_")[2].rsplit('.phy.contree')[0])
        t_tmp = eTree(file)
        tree = Tree()
        tree1 = tree.get_from_string(t_tmp.write(),"newick",taxon_namespace=tns)
        out_dict[str(basename)] = tree1
    return out_dict

def get_dist_closest_neighbour(tree, species):
    my_dict = get_p_dist_vec(tree, species)
    sorted_x = sorted(my_dict.items(), key=operator.itemgetter(1))
    species = [x[0] for x in sorted_x]
    pd_dist = [x[1] for x in sorted_x]
    out_dict = {'species':species, 'pd_dist':pd_dist}
    df = pd.DataFrame(data=out_dict)
    return df

In [9]:
from collections import OrderedDict
tree_dic = get_ref_tree_dic_dendro(OUTPUT_FOLDER_REF,"arath")
tree_dic_ete = get_ref_tree_dic_ete(OUTPUT_FOLDER_REF,"arath")
print(tree_dic['0'])

(VITVI:0.0891786,(((((((ORYLO:0.017083,(ORYNI:0.00294788,ORYRU:0.00313224)100:0.00363969)100:0.00853521,ORYPU:0.0128507)100:0.0497024,((((TRIUA:0.0194657,AEGTA:0.0157179)100:0.00295538,WHEAT:0.0024146)100:0.0093652,HORVD:0.0130745)100:0.0383621,BRADI:0.0353621)100:0.0411451)100:0.0123307,(ERATE:0.0540072,((SORBI:0.016609,MAIZE:0.0278379)100:0.0205462,SETIT:0.0280751)100:0.0180213)100:0.0198332)100:0.141934,(MUSAM:8.545e-07,MUSAC:8.545e-07)100:0.120173)100:0.0612213,(AMBTC:0.145541,(SELML:0.241401,((KLEFL:0.266807,CHLVA:0.503713)100:0.12726,PHYPA:0.221224)100:0.0484907)100:0.169497)100:0.0452054)100:0.055932,(SOLLC:0.0120982,SOLTU:0.0105958)100:0.134611)100:0.0216301,(((LOTJA:0.0668104,MEDTR:0.0796151)100:0.0790018,((ARAAL:0.0441382,(ARALY:0.0123033,ARATH:0.0166304)100:0.0267635)87:0.0100966,(BRARP:0.00539793,(BRANA:0.0019589,BRAOL:0.00601349)69:0.0026334)100:0.0374468)100:0.148639)93:0.0213875,((MANES:0.0708263,POPTR:0.0864117)100:0.0321686,(THECC:0.0395627,GOSHI:0.0554196)100:0.057670

In [10]:
my_dict_pd = OrderedDict(sorted(get_p_dist_vec(tree_dic['0'], 'ARATH').items(), key=lambda t: t[1]))
my_dict_pd

OrderedDict([('ARATH', 0.0),
             ('ARALY', 0.0289337),
             ('ARAAL', 0.0875321),
             ('BRANA', 0.09552959999999999),
             ('BRARP', 0.09633523),
             ('BRAOL', 0.09958419),
             ('VITVI', 0.33406299999999994),
             ('THECC', 0.3361359),
             ('MANES', 0.34189759999999997),
             ('LOTJA', 0.3479417),
             ('GOSHI', 0.3519928),
             ('POPTR', 0.357483),
             ('MEDTR', 0.3607464),
             ('SOLTU', 0.4117213),
             ('SOLLC', 0.4132237),
             ('MUSAC', 0.5038416545000001),
             ('MUSAM', 0.5038416545000001),
             ('AMBTC', 0.5131929000000001),
             ('SETIT', 0.5915313999999999),
             ('ERATE', 0.5994422),
             ('ORYPU', 0.6004856),
             ('SORBI', 0.6006115),
             ('ORYNI', 0.60275768),
             ('ORYRU', 0.60294204),
             ('MAIZE', 0.6118404),
             ('ORYLO', 0.61325311),
             ('BRADI', 0.6

## Tree distances vs reference Tree

### Change the species name to something that is possible to compare with reference

e.g. nan or ill containig leaf name to CANVA or ARATH

In [120]:
trees = []
same_neighbour = []
dist_to_neighbour = []
ref_trees = []
name = []
mean = []
stdev = []
gcov = []
num_ogs = []
species_freq = []
species = []
method = []
euc = []
normalized_euc = []
normalized_euc1 = []
rf = []
nrf = []
wrf = []
gcov = []
names = []
pd_dif = []
branch_diff = []
pd_cor = []
pd_euc = []
internode_dist = []
closest_neighbour_dist = []

In [121]:
for idist, ref_tree in tqdm.tqdm(tree_dic.items()):
    ref_dict_pd = OrderedDict(sorted(get_p_dist_vec(ref_tree, 'ARATH').items(), key=lambda t: t[1]))
    tmp_tree = tree_dic_ete[idist]
    ref_neighbour = list(ref_dict_pd.items())[1]
    pd_ref_dist = [x[1] for x in sorted(get_p_dist_vec(ref_tree, 'ARATH').items(), key=operator.itemgetter(1))]
    pd_vec_ref = np.array(pd_ref_dist)
    df_trees_sub = df_trees[(df_trees['dist']==int(idist)) & (df_trees['species'] == 'ARATH'.lower())]
    df_trees_sub = df_trees_sub.dropna()
    for i, r in df_trees_sub.iterrows():
#         print(r['assembly tree'])
        if r['assembly tree']:
            if 'ARATH' in r['assembly tree']:
                t_tmp = eTree(r['assembly tree']) 
                print(ref_tree)
                common_leaves = list(set(t_tmp.get_leaf_names()) & set(tmp_tree.get_leaf_names()))
    #             print(common_leaves)
                t_tmp.prune(common_leaves)
                tmp_tree.prune(common_leaves)
                

    #             prune_t = [l for l in tree1.get_leaf_names() if l not in common_leaves]
    #             prune_ref = [l for l in ete_ref_tree.get_leaf_names() if l not in common_leaves]
                meth = r['technology']
                do_it = True
                tree = Tree()
                if do_it:
                    tree1 = tree.get_from_string(t_tmp.write(),"newick",taxon_namespace=tns)
                    ref_tree = tree.get_from_string(tmp_tree.write(),"newick",taxon_namespace=tns)
                    print(ref_tree)
                    test_dict_pd = OrderedDict(sorted(get_p_dist_vec(tree1, 'ARATH').items(), key=lambda t: t[1]))
        #             print(test_dict_pd)
                    test_neighbour = list(test_dict_pd.items())[1]
                    if ref_neighbour[0] in test_neighbour[0]:
                        same_neighbour.append('same')
                    else:
                        same_neighbour.append('different')
                    if ref_neighbour[0] in test_dict_pd.keys():
                        dist_to_neighbour.append(ref_neighbour[1]-test_dict_pd[ref_neighbour[0]])
                    else:
                        dist_to_neighbour.append(None)
                    trees.append(str(tree1)+";")
                    ref_trees.append(str(ref_tree)+";")
                    gcov.append(r['coverage']) # gcov = given coverage
                    species.append('ARATH')
                    method.append(meth)
                    internode_dist.append(int(idist))
                    closest_neighbour_dist.append(my_dict_pd[t_neighbour[int(idist)]])
                    tree1.encode_bipartitions()
                    euc.append(treecompare.euclidean_distance(tree1, ref_tree))
                    normalized_euc1.append(treecompare.euclidean_distance(tree1, ref_tree)/(tree1.length()+ref_tree.length()))
                    rf.append(treecompare.symmetric_difference(tree1, ref_tree))
                    wrf.append(treecompare.weighted_robinson_foulds_distance(tree1, ref_tree))
                    pd_dist = [x[1] for x in sorted(get_p_dist_vec(tree1, 'ARATH').items(), key=operator.itemgetter(1))]
                    pd_sp = np.array(pd_dist)
        #                     pd_cor.append(np.corrcoef(pd_vec_ref,pd_sp)[0][1])
                    pd_cor.append(0)
        #                     pd_euc.append(distance.euclidean(pd_vec_ref,pd_sp))
                    pd_euc.append(0)
                    et = eTree(str(ref_tree)+";")
                    et2 = eTree(str(tree1)+";")
                    res = et.compare(et2,unrooted=True)
                    nrf.append(res["norm_rf"])

                    # normalize tree edge lengths such that the maximum distance between root and leaf is 1
                    max_dist_to_root = np.array(tree1.calc_node_root_distances()).max()
                    for edge in tree1.postorder_edge_iter():
                        if edge.length is None:
                            edge.length = 0
                        else:
                            edge.length = float(edge.length)/max_dist_to_root

                    max_dist_to_root = np.array(ref_tree.calc_node_root_distances()).max()
                    for edge in ref_tree.postorder_edge_iter():
                        if edge.length is None:
                            edge.length = 0
                        else:
                            edge.length = float(edge.length)/max_dist_to_root

                    normalized_euc.append(treecompare.euclidean_distance(tree1, ref_tree))


# out_dict = {'species':species, 'closest_neighbour_dist':closest_neighbour_dist, 'internode_dist':internode_dist, 'method':method, 'gcov':gcov, 'trees':trees,'rf':rf, 'nrf':nrf, 'wrf':wrf, 'euc':euc, 'neuc1':normalized_euc1, 'neuc':normalized_euc, 'pd_cor':pd_cor, 'pd_euc':pd_euc}
out_dict = {'same_neighbour':same_neighbour, 'dist_to_neighbour':dist_to_neighbour, 'species':species, 'closest_neighbour_dist':closest_neighbour_dist, 'internode_dist':internode_dist, 'method':method, 'gcov':gcov, 'trees':trees, 'ref_trees':ref_trees, 'rf':rf, 'nrf':nrf, 'wrf':wrf, 'euc':euc, 'neuc1':normalized_euc1, 'neuc':normalized_euc}

df = pd.DataFrame(data=out_dict)
df
# df.sort_values(by=['gcov'])

  0%|          | 0/7 [00:00<?, ?it/s]

(VITVI:0.0891786,(((((((ORYLO:0.017083,(ORYNI:0.00294788,ORYRU:0.00313224)100:0.00363969)100:0.00853521,ORYPU:0.0128507)100:0.0497024,((((TRIUA:0.0194657,AEGTA:0.0157179)100:0.00295538,WHEAT:0.0024146)100:0.0093652,HORVD:0.0130745)100:0.0383621,BRADI:0.0353621)100:0.0411451)100:0.0123307,(ERATE:0.0540072,((SORBI:0.016609,MAIZE:0.0278379)100:0.0205462,SETIT:0.0280751)100:0.0180213)100:0.0198332)100:0.141934,(MUSAM:8.545e-07,MUSAC:8.545e-07)100:0.120173)100:0.0612213,(AMBTC:0.145541,(SELML:0.241401,((KLEFL:0.266807,CHLVA:0.503713)100:0.12726,PHYPA:0.221224)100:0.0484907)100:0.169497)100:0.0452054)100:0.055932,(SOLLC:0.0120982,SOLTU:0.0105958)100:0.134611)100:0.0216301,(((LOTJA:0.0668104,MEDTR:0.0796151)100:0.0790018,((ARAAL:0.0441382,(ARALY:0.0123033,ARATH:0.0166304)100:0.0267635)87:0.0100966,(BRARP:0.00539793,(BRANA:0.0019589,BRAOL:0.00601349)69:0.0026334)100:0.0374468)100:0.148639)93:0.0213875,((MANES:0.0708263,POPTR:0.0864117)100:0.0321686,(THECC:0.0395627,GOSHI:0.0554196)100:0.057670

 14%|█▍        | 1/7 [00:00<00:04,  1.48it/s]

(VITVI:0.0891786,(((((((ORYLO:0.017083,(ORYNI:0.00294788,ORYRU:0.00313224)100:0.00363969)100:0.00853521,ORYPU:0.0128507)100:0.0497024,(BRADI:0.0353621,((TRIUA:0.0194657,AEGTA:0.0157179)100:0.00295538,WHEAT:0.0024146)100:0.0093652)100:0.0411451)100:0.0123307,(ERATE:0.0540072,((SORBI:0.016609,MAIZE:0.0278379)100:0.0205462,SETIT:0.0280751)100:0.0180213)100:0.0198332)100:0.141934,MUSAM:8.545e-07)100:0.0612213,(AMBTC:0.145541,(SELML:0.241401,((KLEFL:0.266807,CHLVA:0.503713)100:0.12726,PHYPA:0.221224)100:0.0484907)100:0.169497)100:0.0452054)100:0.055932,(SOLLC:0.0120982,SOLTU:0.0105958)100:0.134611)100:0.0216301,(((LOTJA:0.0668104,MEDTR:0.0796151)100:0.0790018,((ARAAL:0.0441382,(ARALY:0.0123033,ARATH:0.0166304)100:0.0267635)87:0.0100966,(BRARP:0.00539793,(BRANA:0.0019589,BRAOL:0.00601349)69:0.0026334)100:0.0374468)100:0.148639)93:0.0213875,((MANES:0.0708263,POPTR:0.0864117)100:0.0321686,(THECC:0.0395627,GOSHI:0.0554196)100:0.0576705)93:0.0153857)100:0.0213674)
(VITVI:0.09177319336826902,((((

(VITVI:0.0891761,(((((((ORYLO:0.017083,(ORYNI:0.00294788,ORYRU:0.0031323)100:0.00363974)100:0.00853543,ORYPU:0.0128507)100:0.0497035,(BRADI:0.0353618,((TRIUA:0.0194663,AEGTA:0.0157178)100:0.00295555,WHEAT:0.00241456)100:0.00936501)100:0.0411442)100:0.0123313,(ERATE:0.0540071,((SORBI:0.0166092,MAIZE:0.0278368)100:0.0205458,SETIT:0.0280754)100:0.018021)100:0.0198309)100:0.14193,MUSAM:8.545e-07)100:0.0612171,(AMBTC:0.145529,(SELML:0.24138,((KLEFL:0.266818,CHLVA:0.503699)100:0.127256,PHYPA:0.221225)100:0.0484933)100:0.169485)100:0.0451964)100:0.0559115,(SOLLC:0.0120914,SOLTU:0.0106032)100:0.134618)100:0.0216129,(((LOTJA:0.0668313,MEDTR:0.0795915)100:0.0789381,((ARAAL:0.0434508,ARATH:0.0408414)78:0.0107411,(BRARP:0.00540555,(BRANA:0.00195096,BRAOL:0.00601995)68:0.0026219)100:0.0373243)100:0.148528)92:0.0213777,((MANES:0.0708534,POPTR:0.0863839)100:0.0321488,(THECC:0.0395612,GOSHI:0.0554242)100:0.0576867)92:0.0153904)100:0.0213645)
(VITVI:0.09177761921655041,(((((((ORYLO:0.017581359457033114

 29%|██▊       | 2/7 [00:01<00:03,  1.46it/s]

(VITVI:0.09177761921655041,(((((((ORYLO:0.017581359457033114,(ORYNI:0.0030338780024702205,ORYRU:0.0032236780558019566)100:0.003745921516720816)100:0.008784432649437695,ORYPU:0.013225591288093159)100:0.05115349176214045,(BRADI:0.036393403784330244,((TRIUA:0.020034187063071106,AEGTA:0.016176332709345844)100:0.0030417717580772823,WHEAT:0.0024849995487077137)100:0.009638213845853168)100:0.042344492757247665)100:0.012691038920125998,(ERATE:0.055582639953868355,((SORBI:0.017093737370119676,MAIZE:0.02864887823763621)100:0.02114517913319153,SETIT:0.028894438874904144)100:0.018546723571690786)100:0.020409423476934845)100:0.14607049977970554,MUSAM:8.794281833421997e-07)100:0.06300297605907287,(AMBTC:0.14977449279532704,(SELML:0.24842173773568185,((KLEFL:0.2746018361884131,CHLVA:0.5183933253613605)100:0.13096841767044465,PHYPA:0.22767875934450335)100:0.04990798680312263)100:0.1744293571138124)100:0.04651490689948202)100:0.05754259669155927,(SOLLC:0.012444140358178903,SOLTU:0.010912525352386203)10

(VITVI:0.09179352450169352,(((((((ORYLO:0.017583878303764223,(ORYNI:0.0030342022598388546,ORYRU:0.003223946049111053)100:0.003746570067582715)100:0.008784064897851883,ORYPU:0.013229467290760483)100:0.05117174046541061,(BRADI:0.03640056628752905,((TRIUA:0.020036837098461935,AEGTA:0.01617804089794905)100:0.003041911823588712,WHEAT:0.0024853719005031904)100:0.00964110282375896)100:0.042346091271765106)100:0.012691650593924615,(ERATE:0.05558729274500201,((SORBI:0.017097630250969456,MAIZE:0.028652197390358985)100:0.021152716679399282,SETIT:0.02889202761114895)100:0.01854834575818567)100:0.02040872833782423)100:0.14605351651833218,MUSAM:8.795490286052606e-07)100:0.06299794363598672,(AMBTC:0.14980330833952465,(SELML:0.24845587422438598,((KLEFL:0.2746364822320803,CHLVA:0.5184686768350563)100:0.13094730055132128,PHYPA:0.227694605755143)100:0.04992379986590094)100:0.17440186034140706)100:0.04650441791970237)100:0.05757500149765223,(SOLLC:0.012466436635517558,SOLTU:0.010891688760897092)100:0.1385

 43%|████▎     | 3/7 [00:02<00:02,  1.41it/s]

(VITVI:0.09179352450169352,(((((((ORYLO:0.017583878303764223,(ORYNI:0.0030342022598388546,ORYRU:0.003223946049111053)100:0.003746570067582715)100:0.008784064897851883,ORYPU:0.013229467290760483)100:0.05117174046541061,(BRADI:0.03640056628752905,((TRIUA:0.020036837098461935,AEGTA:0.01617804089794905)100:0.003041911823588712,WHEAT:0.0024853719005031904)100:0.00964110282375896)100:0.042346091271765106)100:0.012691650593924615,(ERATE:0.05558729274500201,((SORBI:0.017097630250969456,MAIZE:0.028652197390358985)100:0.021152716679399282,SETIT:0.02889202761114895)100:0.01854834575818567)100:0.02040872833782423)100:0.14605351651833218,MUSAM:8.795490286052606e-07)100:0.06299794363598672,(AMBTC:0.14980330833952465,(SELML:0.24845587422438598,((KLEFL:0.2746364822320803,CHLVA:0.5184686768350563)100:0.13094730055132128,PHYPA:0.227694605755143)100:0.04992379986590094)100:0.17440186034140706)100:0.04650441791970237)100:0.05757500149765223,(SOLLC:0.012466436635517558,SOLTU:0.010891688760897092)100:0.1385

(VITVI:0.09141184952100921,(((((((ORYLO:0.017575400759800866,(ORYNI:0.003032820832004905,ORYRU:0.0032265440075784487)100:0.0037502286680857386)100:0.008779680911637503,ORYPU:0.013229137209190042)100:0.05114547140421301,(BRADI:0.03640509165202179,((TRIUA:0.020034052371142638,AEGTA:0.016177707298879407)100:0.003040510874100422,WHEAT:0.002486220463727039)100:0.00963767136466128)100:0.04234886622322473)100:0.01267601008497033,(ERATE:0.05559406295807609,((SORBI:0.01709999762195485,MAIZE:0.028655444009338817)100:0.02116038102665871,SETIT:0.02888954247955576)100:0.018548958030179762)100:0.02039775944910062)100:0.1457679907093585,MUSAM:8.796708126665597e-07)100:0.06286583676665747,(AMBTC:0.14975095896472063,(SELML:0.2484398325238543,((KLEFL:0.274618918265353,CHLVA:0.5184992867408746)100:0.13093248915132716,PHYPA:0.22776525214845053)100:0.04993019768349578)100:0.1744136548793523)100:0.046517445534787864)100:0.05745861506562633,(SOLLC:0.012451279671389162,SOLTU:0.010906888620292177)100:0.1383950

 57%|█████▋    | 4/7 [00:02<00:02,  1.44it/s]

(VITVI:0.0887962,(((((((ORYLO:0.0170725,(ORYNI:0.00294604,ORYRU:0.00313422)100:0.00364292)100:0.00852846,ORYPU:0.0128506)100:0.049682,(BRADI:0.0353634,((TRIUA:0.0194608,AEGTA:0.0157148)100:0.00295351,WHEAT:0.00241508)100:0.0093619)100:0.0411371)100:0.0123133,(ERATE:0.0540033,((SORBI:0.0166107,MAIZE:0.0278355)100:0.0205549,SETIT:0.0280629)100:0.0180182)100:0.0198141)100:0.141597,MUSAM:8.545e-07)100:0.061067,(AMBTC:0.145466,(SELML:0.241331,((KLEFL:0.266761,CHLVA:0.503663)100:0.127186,PHYPA:0.221248)100:0.0485015)100:0.169423)100:0.0451864)100:0.0558145,(SOLLC:0.012095,SOLTU:0.0105948)100:0.134435)100:0.0216117,(((LOTJA:0.0668137,MEDTR:0.0794586)100:0.0780396,ARATH:0.17624)93:0.021721,((MANES:0.0707488,POPTR:0.0864561)100:0.0322172,(THECC:0.0395282,GOSHI:0.0553785)100:0.0575019)93:0.0156124)100:0.0208854)
(VITVI:0.0919103,((((((ORYLO:0.0170657,(ORYNI:0.00294717,ORYRU:0.00313382)100:0.00363663)100:0.00851994,ORYPU:0.0128558)100:0.0496185,((((TRIUA:0.019475,AEGTA:0.0156835)100:0.00295336,WH

 71%|███████▏  | 5/7 [00:03<00:01,  1.55it/s]

(VITVI:0.09732647966263369,((((((ORYLO:0.018071364188546962,(ORYNI:0.00312084370377775,ORYRU:0.0033184927967415483)100:0.003850932874068778)100:0.009022011321221445,ORYPU:0.013613379101655488)100:0.05254246728756614,(BRADI:0.037415561003627466,((TRIUA:0.0206226417651753,AEGTA:0.01660771256093077)100:0.0031273984741257057,WHEAT:0.002557535846863622)100:0.009893139241535768)100:0.043477505807166486)100:0.012997611903378238,(ERATE:0.05714584332256332,((SORBI:0.017577585608053958,MAIZE:0.029465757625559325)100:0.02176151987652041,SETIT:0.029688132710061856)100:0.019042084378849206)100:0.020873819717765776)100:0.14775235971732525,MUSAM:9.048548081305414e-07)100:0.0635581877235399,(AMBTC:0.15444161404495618,(SELML:0.2554592024316398,((KLEFL:0.28261225917837274,CHLVA:0.5330870829302343)100:0.1345697529222204,PHYPA:0.2338221067095434)100:0.051370656485135176)100:0.1780631426639942)100:0.04683452243997567)100:0.056074842558440176,((SOLLC:0.012558897629523957,SOLTU:0.011450516672624906)100:0.133

(ORYLO:0.017071,(((BRADI:0.0353691,((TRIUA:0.0194737,AEGTA:0.0156819)100:0.00295326,WHEAT:0.00241727)100:0.00933084)100:0.0410786,((ERATE:0.0538918,((SORBI:0.0165856,MAIZE:0.0278325)100:0.0205806,SETIT:0.0280198)100:0.018031)100:0.0195572,((((SOLLC:0.0118815,SOLTU:0.010788)100:0.13029,ARATH:0.173036)100:0.0544225,(AMBTC:0.149973,(SELML:0.241461,((KLEFL:0.266958,CHLVA:0.503444)100:0.126497,PHYPA:0.221493)100:0.0479955)100:0.16399)100:0.0422115)100:0.058869,MUSAM:8.545e-07)100:0.135888)100:0.0124004)100:0.0496171,ORYPU:0.0128584)100:0.00851456,(ORYNI:0.0029463,ORYRU:0.00313336)100:0.00363654)
(ORYLO:0.014851747095635632,(((BRADI:0.030771069544856554,((TRIUA:0.016942092871904374,AEGTA:0.013643231959407674)100:0.0025693322375758237,WHEAT:0.0021030216567200014)100:0.008117818280700649)100:0.03573832688435229,((ERATE:0.04688579369272897,((SORBI:0.014429449746902601,MAIZE:0.024214237656802688)100:0.01790509438676344,SETIT:0.024377188405500044)100:0.015686945807592178)100:0.017014737759871428,

 86%|████████▌ | 6/7 [00:03<00:00,  1.65it/s]

(ORYLO:0.017071,(((BRADI:0.0353691,((TRIUA:0.0194737,AEGTA:0.0156819)100:0.00295326,WHEAT:0.00241727)100:0.00933084)100:0.0410786,((ERATE:0.0538918,((SORBI:0.0165856,MAIZE:0.0278325)100:0.0205806,SETIT:0.0280198)100:0.018031)100:0.0195572,((((SOLLC:0.0118815,SOLTU:0.010788)100:0.13029,ARATH:0.173036)100:0.0544225,(AMBTC:0.149973,(SELML:0.241461,((KLEFL:0.266958,CHLVA:0.503444)100:0.126497,PHYPA:0.221493)100:0.0479955)100:0.16399)100:0.0422115)100:0.058869,MUSAM:8.545e-07)100:0.135888)100:0.0124004)100:0.0496171,ORYPU:0.0128584)100:0.00851456,(ORYNI:0.0029463,ORYRU:0.00313336)100:0.00363654)
(ORYLO:0.014851747095635632,(((BRADI:0.030771069544856554,((TRIUA:0.016942092871904374,AEGTA:0.013643231959407674)100:0.0025693322375758237,WHEAT:0.0021030216567200014)100:0.008117818280700649)100:0.03573832688435229,((ERATE:0.04688579369272897,((SORBI:0.014429449746902601,MAIZE:0.024214237656802688)100:0.01790509438676344,SETIT:0.024377188405500044)100:0.015686945807592178)100:0.017014737759871428,

100%|██████████| 7/7 [00:04<00:00,  1.66it/s]

(VITVI:0.08986284816204641,(((((((ORYLO:0.017559398833091405,(ORYNI:0.0030331844454580897,ORYRU:0.003224499997376138)100:0.003746597082533012)100:0.008765551141179311,ORYPU:0.013223441191895991)100:0.05107033503836343,(BRADI:0.03639955100062263,((TRIUA:0.02003035653962613,AEGTA:0.0161576391580634)100:0.0030393170797259946,WHEAT:0.002486556823229595)100:0.009630324600537593)100:0.04232978775837709)100:0.012741988243246204,(ERATE:0.05554088136449878,((SORBI:0.017087823953061066,MAIZE:0.028629215272852317)100:0.02112795401841375,SETIT:0.028877298850367734)100:0.01851860045465869)100:0.02028070384120017)100:0.1452209852570855,MUSAM:8.792510036786545e-07)100:0.06254865077415761,(AMBTC:0.14957762510445285,(SELML:0.2482955444045423,((KLEFL:0.2744631655567387,CHLVA:0.5181232204685107)100:0.1307629912808552,PHYPA:0.2276359754731672)100:0.04991593866501401)100:0.17428205734239413)100:0.046412722787747364)100:0.057248346683988954,(SOLLC:0.012438031840102056,SOLTU:0.01090075741120105)100:0.1378607




Unnamed: 0,closest_neighbour_dist,dist_to_neighbour,euc,gcov,internode_dist,method,neuc,neuc1,nrf,ref_trees,rf,same_neighbour,species,trees,wrf
0,0.0,-0.014622,0.262258,5,0,Illumina,0.247921,0.040601,0.033333,"(VITVI:0.0891786,(((((((ORYLO:0.017083,(ORYNI:...",2,same,ARATH,"(KLEFL:0.19876,CHLVA:0.312765,((((((((((ARATH:...",1.088974
1,0.0,-0.165553,0.304212,10,0,ONT,0.332921,0.044995,0.133333,"(VITVI:0.0891786,(((((((ORYLO:0.017083,(ORYNI:...",8,same,ARATH,"(PHYPA:0.164619,((((((((((((BRARP:0.00546963,B...",1.155554
2,0.0,0.003541,0.261306,10,0,Illumina,0.260259,0.039362,0.033333,"(VITVI:0.0891786,(((((((ORYLO:0.017083,(ORYNI:...",2,same,ARATH,"(KLEFL:0.201628,((((((((BRADI:0.0304908,((AEGT...",0.889884
3,0.0,-0.074435,0.258984,10,0,PacBIO,0.247173,0.038515,0.1,"(VITVI:0.0891786,(((((((ORYLO:0.017083,(ORYNI:...",6,same,ARATH,"(BRARP:0.00418417,(((ARATH:0.0946952,ARALY:0.0...",1.017268
4,0.0,-0.026144,0.276644,20,0,ONT,0.243203,0.041704,0.1,"(VITVI:0.0891786,(((((((ORYLO:0.017083,(ORYNI:...",6,same,ARATH,"(CHLVA:0.282105,(((((((((MANES:0.0572856,POPTR...",1.007628
5,0.0,0.004042,0.265912,20,0,Illumina,0.254564,0.039914,0.066667,"(VITVI:0.0891786,(((((((ORYLO:0.017083,(ORYNI:...",4,same,ARATH,"(KLEFL:0.20131,CHLVA:0.289678,((((((SOLLC:0.00...",0.910975
6,0.0,-0.026143,0.276644,20,0,PacBIO,0.209927,0.041703,0.1,"(VITVI:0.0891786,(((((((ORYLO:0.017083,(ORYNI:...",6,same,ARATH,"(MANES:0.057286,((((((((ERATE:0.0454437,((MAIZ...",1.007629
7,0.028934,-0.133453,0.284463,5,1,ONT,0.267449,0.04258,0.068966,"(VITVI:0.0891761,(((((((ORYLO:0.017083,(ORYNI:...",4,same,ARATH,"(BRARP:0.00571119,((ARATH:0.182537,((((((((((B...",1.082115
8,0.028934,0.015868,0.247284,5,1,Illumina,0.259738,0.038362,0.034483,"(VITVI:0.0891761,(((((((ORYLO:0.017083,(ORYNI:...",2,same,ARATH,"(KLEFL:0.20312,CHLVA:0.330211,((((((((((ARAAL:...",1.025925
9,0.028934,-0.064583,0.249836,10,1,ONT,0.218222,0.037962,0.103448,"(VITVI:0.0891761,(((((((ORYLO:0.017083,(ORYNI:...",6,different,ARATH,"(KLEFL:0.205768,((((((((BRADI:0.0289713,((AEGT...",1.04006


In [122]:
df.to_csv("/Users/daviddylus/Desktop/assembly_align_arath_sp_removal-like_shen.csv", index = False)

In [None]:
t_yeast_neighbour

In [None]:
for key, val in out_dict.items():
    print(key)
    print(len(val))