In [1]:
#Load all our trees
import pickle

def extract_spcode(nodename):
    """Return the portion of ``nodename`` that precedes its first '.'.

    For a leaf name such as ``933801.Ahos_0739`` this yields ``933801``.
    A name without any '.' is returned unchanged.
    """
    spcode, _sep, _rest = nodename.partition('.')
    return spcode

# Load the pickled inputs used by the cells below: `all_trees` is indexed by
# tree name and `sptree_markers` is iterated to obtain those names.
# Each file is opened in a `with` block so its handle is closed as soon as the
# pickle has been read, instead of being left open for the kernel's lifetime.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('data/alltrees.pkl', 'rb') as alltrees_file:
    all_trees = pickle.load(alltrees_file)
with open('data/sptree_markers.pkl', 'rb') as markers_file:
    sptree_markers = pickle.load(markers_file)


# Compare gene trees vs reference species tree

In [2]:
# TASK: Write every marker tree to the file "data/marker_newicks.txt", one
# newick string per line, exporting the "species" attribute inside each newick.

with open("data/marker_newicks.txt", "w") as NEWICKS:
    for tname in sptree_markers:
        t = all_trees[tname]
        # Serialise the tree, including the per-node "species" feature.
        newick = t.write(features=["species"])
        NEWICKS.write(f"{newick}\n")

In [3]:
# Show the first line of the exported file to check how the newick was written.
!head -n1  data/marker_newicks.txt

(((933801.Ahos_0739:0.150697[&&NHX:species=933801],273063.STK_02680:0.118573[&&NHX:species=273063])0.994:0.421795[&&NHX:species=],(673860.AciM339_0977:0.0577864[&&NHX:species=673860],263820.PTO0416:0.206158[&&NHX:species=263820])0.791:0.110055[&&NHX:species=])1:0.396335[&&NHX:species=],(1051632.TPY_0322:0.160071[&&NHX:species=1051632],((224324999.sul008:0[&&NHX:species=224324999],224324.aq_008:0[&&NHX:species=224324])1:0.545592[&&NHX:species=],(525897.Dbac_2775:0.252446[&&NHX:species=525897],(((743299.Acife_2711:0.0103803[&&NHX:species=743299],637389.Acaty_c0617:0.00868782[&&NHX:species=637389])0.957:0.0998053[&&NHX:species=],(555778.Hneap_0322:0.138856[&&NHX:species=555778],((1255043.TVNIR_2575:0[&&NHX:species=1255043],713587.THITH_14295:0[&&NHX:species=713587])1:0.0164901[&&NHX:species=],1158165.KB898880_gene1474:0.0147795[&&NHX:species=1158165])0.857:0.0596475[&&NHX:species=])0.985:0.149629[&&NHX:species=])0.945:0.124834[&&NHX:species=],1121405.dsmv_3590:0.188083[&&NHX:species=11214

In [4]:
# TASK: Run ete3 compare to measure the Robinson-Foulds distance of each marker gene tree to the reference species tree
# Source trees come from the newick list file written earlier; the reference
# tree is read from data/sptree.nw. "species" is passed as the source tree
# attribute (presumably so leaves are matched by species code rather than by
# gene name -- confirm against the ete3 compare documentation).

!ete3 compare --src_tree_list "data/marker_newicks.txt" -r "data/sptree.nw" --src_tree_attr "species"


source          | ref             | E.size  | nRF     | RF      | maxRF   | src-br+ | ref-br+ | subtre+ | treekoD
(..):0.150697[+ | data/sptree.nw  | 15      | 0.15    | 4.00    | 26.00   | 0.85    | 0.85    | 1       | NA     
(..):0.0499937+ | data/sptree.nw  | 15      | 0.31    | 8.00    | 26.00   | 0.69    | 0.69    | 1       | NA     
(..):0.125104[+ | data/sptree.nw  | 15      | 0.15    | 4.00    | 26.00   | 0.85    | 0.85    | 1       | NA     
(..):0.0473113+ | data/sptree.nw  | 15      | 0.38    | 10.00   | 26.00   | 0.62    | 0.62    | 1       | NA     
(..):0.199027[+ | data/sptree.nw  | 15      | 0.08    | 2.00    | 26.00   | 0.92    | 0.92    | 1       | NA     
(..):0.230297[+ | data/sptree.nw  | 15      | 0.31    | 8.00    | 26.00   | 0.69    | 0.69    | 1       | NA     
(..)0:0[&&NHX:+ | data/sptree.nw  | 15      | 0.38    | 10.00   | 26.00   | 0.62    | 0.62    | 1       | NA     
(..):0.209872[+ | data/sptree.nw  | 15      | 0.08    | 2.00    | 26.00   | 0.92

## Compare trees programmatically 

In [5]:
# TASK: How many gene trees are identical to the species tree? Can you locate them?
from ete3 import PhyloTree

# Reference species tree that every marker gene tree is compared against.
sptree = PhyloTree('data/sptree.nw')

for tname in sptree_markers:
    genetree = all_trees[tname]
    c = genetree.compare(sptree, source_tree_attr="species")
    # Skip any tree whose reported 'rf' value is non-zero; only trees with
    # an 'rf' of exactly 0 are printed.
    if c['rf'] != 0:
        continue
    print(tname)

224324999.sul072
273063.STK_12580
224324999.sul1945
224324999.sul1939


In [6]:
      
# TASK: Is any clade fully supported? Which clade is the least supported?
from collections import Counter
from pprint import pprint

# For each edge, count how many marker gene trees report it among the
# "common_edges" of their comparison with the species tree.
edges = Counter()
for tname in sptree_markers:
    genetree = all_trees[tname]
    c = genetree.compare(sptree, source_tree_attr="species")
    for edge in c["common_edges"]:
        edges[edge] += 1

pprint(edges)

Counter({('637389', '743299'): 33,
         ('1051632', '1121405', '1158165', '1255043', '224324', '224324999', '263820', '273063', '525897', '555778', '637389', '673860', '713587', '743299', '933801'): 33,
         ('224324', '224324999'): 33,
         ('273063', '933801'): 33,
         ('1255043', '713587'): 32,
         ('1158165', '1255043', '713587'): 31,
         ('1158165', '1255043', '555778', '637389', '713587', '743299'): 30,
         ('263820', '273063', '673860', '933801'): 28,
         ('1051632', '1121405', '1158165', '1255043', '224324', '224324999', '525897', '555778', '637389', '713587', '743299'): 28,
         ('263820', '673860'): 27,
         ('1158165', '1255043', '555778', '713587'): 25,
         ('1051632', '1121405', '1158165', '1255043', '525897', '555778', '637389', '713587', '743299'): 20,
         ('1121405', '525897'): 17,
         ('1121405', '1158165', '1255043', '525897', '555778', '637389', '713587', '743299'): 11})
