In [14]:
from ete3 import Tree, TreeStyle, TextFace, RectFace, NodeStyle, CircleFace

In [49]:
# Schematic tree for Figure 1: a bare topology without labels or node markers.
# NOTE(review): this comment says "Figure 1" but the file written below is
# 'tree_fig2.svg' -- confirm which figure this cell belongs to.
import ngesh  # no longer used in this cell; import kept in case other cells rely on it

# The topology is pinned as a Newick string so the figure is identical on every
# run. The cell used to draw an unseeded random tree first, with
# ngesh.gen_tree(1.0, 0.5, max_time=3.0, labels="human"), and overwrote it on the
# very next line, so that call only cost time and never reached the output.
tree = Tree("(((Sapgepu:0.76,Elu:0.85)1:0.372265,((((Sabihu:0.790596,Zine:0.790596)1:0.213875,Hegtigu:0.567503)1:0.165539,(Uipu:0.132467,LoL:0.25))1:0.105096,(((Kefih:0.123651,Mupe:0.19)1:0.8,(Batif:0.05,Hunu:0.138753)1:1.02295)1:0.0855855)1:0.0278151)1:0.288353)1:0.1);")

# Blank every node name and shrink every node marker to size 0.
for node in tree.traverse():
    node.name = ""
    nstyle = NodeStyle()
    nstyle["size"] = 0
    node.set_style(nstyle)


# `ts` is reused by a later cell (t.show(tree_style=ts)), so the name must stay.
ts = TreeStyle()
ts.show_scale = False
ts.min_leaf_separation = 35
#tree.show(tree_style=ts)
# the trailing ";" keeps the notebook from echoing render()'s return value
tree.render('tree_fig2.svg', tree_style=ts, h=400, w=100);


In [15]:
# read taxonomy file and save it in a dict
crass_taxonomy = dict()

tax_file = "resources/crass_taxonomy.txt"
# Tab-separated table: column 0 is used as the genome id and columns 2, 3 and 4 as
# family, subfamily and genus. The context manager closes the handle as soon as the
# rows are loaded, and blank lines are skipped so they cannot raise an IndexError
# when the columns are indexed below.
with open(tax_file) as tax_handle:
    lines = [row.strip().split("\t") for row in tax_handle if row.strip()]

# Unique taxa per rank. All three are sorted so they come out in the same order on
# every run; a bare list(set(...)) of strings is not stable between sessions.
families = sorted({line[2] for line in lines})
subfamilies = sorted({line[3] for line in lines})
genera = sorted({line[4] for line in lines})


# genome id -> its three taxonomic ranks
for line in lines:
    crass_taxonomy[line[0]] = {
                               "family":line[2],
                               "subfamily":line[3],
                               "genus":line[4]
                              }

In [16]:
# Newick file with the TerL phylogeny that the following cells annotate
tree_file = "results/5_phylogenies/2_trees/TerL_trimmed.nwk"
#tree_file = "results/5_phylogenies/tree/MCP_trimmed.nwk"
#tree_file = "/home/danielc/TerL_trimmed.nwk"
# format=1 is passed to the ete3 parser; a later cell reads the internal node
# names of this tree as support values, so those names need to be kept
t = Tree(tree_file, format=1)

In [4]:
# assign taxonomy
for leaf in t.iter_leaves():
    # the genome id is whatever precedes the first "|" in the leaf name
    genome = leaf.name.split("|")[0]
    reference = crass_taxonomy.get(genome)
    if reference is None:
        # not part of the reference set: every rank gets the "new" placeholder
        ranks = {"family": "new", "subfamily": "new", "genus": "new"}
    else:
        # reference genome: copy its three ranks from the taxonomy table
        ranks = {rank: reference[rank] for rank in ("family", "subfamily", "genus")}
    leaf.add_features(**ranks, genome=genome)

In [5]:
# find the LCA of the two outgroup species
# (the leaves whose `family` feature was set to "outgroup" from the taxonomy table)
outgs_leaves = t.search_nodes(family="outgroup")
outgs_lca = t.get_common_ancestor(outgs_leaves)
# reroot the tree on that ancestor; `t` is modified in place
t.set_outgroup(outgs_lca)

In [40]:
# check the monophyly of the families in the tree
for family in families:
    print(family)
    # clade spanned by every leaf annotated with this family
    fam_lca = t.get_common_ancestor(t.search_nodes(family=family))

    # inside that clade only the family itself and unclassified ("new") leaves are expected
    tolerated = (family, "new")
    for leaf in fam_lca.iter_leaves():
        if leaf.family in tolerated:
            continue
        print(f"WATCH OUT! Monophyly of the {family} family is not guaranteed. Check {leaf.name} leaf.")
            

#for subfam in subfamilies:
#    check = False
#    subfam_leaves = t.search_nodes(subfamily=subfam)
#    if len(subfam_leaves) > 1:
#        subfam_lca = t.get_common_ancestor(subfam_leaves)
#        #print(genus)
#        #print(gen_lca)
#        #gen_lca.show(tree_style=ts)
#    
#    
#        for leaf in subfam_lca.iter_leaves():
#            if leaf.subfamily not in [subfam, "new"]:
#                check = True
#                print(f"WATCH OUT! Monophyly of the {subfam} subfamily is not guaranteed. Check {leaf.name} leaf.")
#        if check:
#            subfam_lca.show(tree_style=ts)
#            break
#   
            
# check the monophyly of the genera in the tree
#for genus in genera:
#    check = False
#    gen_leaves = t.search_nodes(genus=genus)
#    #print(gen_leaves)
#    if len(gen_leaves) > 1:
#        gen_lca = t.get_common_ancestor(gen_leaves)
#        #print(genus)
#        #print(gen_lca)
#        #gen_lca.show(tree_style=ts)
#    
#    
#        for leaf in gen_lca.iter_leaves():
#            if leaf.genus not in [genus, "new"]:
#                check = True
#                print(f"WATCH OUT! Monophyly of the {genus} family is not guaranteed. Check {leaf.name} leaf.")
#        if check:
#            gen_lca.show(tree_style=ts)
#

Crevaviridae
Intestiviridae
Jelitoviridae
Steigviridae
Suoliviridae
Tinaiviridae
outgroup


In [None]:
# display the tree `t` with ete3's default style (no TreeStyle is passed here)
t.show()

**Classify the found genomes**

In [None]:
# approach 1: a new genome that falls inside the clade spanned by the reference
# members of a family / subfamily inherits that taxon

# identify which genomes were found by crAssUS and need to be classified
crassus_terl_classification = dict()
for leaf in t.iter_leaves():
    # check if it is a new contig found by crAssUS
    if leaf.genome not in crass_taxonomy:
        # if so, create an entry in the terL classification
        crassus_terl_classification[leaf.genome] = {"family":str(), "subfamily":str(), "genus":str()}


families = ["Intestiviridae", "Jelitoviridae", "Crevaviridae", "Tinaiviridae", "Suoliviridae", "Steigviridae"]
subfamilies = ["Asinivirinae", "Bearivirinae", "Boorivirinae", "Churivirinae", "Coarsevirinae", "Crudevirinae", "Doltivirinae", "Grossvirinae", "Loutivirinae", "Lumpivirinae", "Oafivirinae", "Uncouvirinae"]


# iterate families and subfamilies looking for their LCA
for rank, taxon_names in (("family", families), ("subfamily", subfamilies)):
    for taxon in taxon_names:
        nodes = t.search_nodes(**{rank: taxon})
        # A clade needs at least two reference leaves: with none there is nothing to
        # look up, and a lone leaf cannot enclose any other genome.
        if len(nodes) < 2:
            continue
        lca = t.get_common_ancestor(nodes)
        for leaf in lca.iter_leaves():
            # New genomes are exactly the keys of the classification dict. Their tree
            # label is the "new" placeholder, not "", so the earlier `== ""` test never
            # matched: nothing was classified and every new leaf raised a false warning.
            entry = crassus_terl_classification.get(leaf.genome)
            if entry is not None:
                entry[rank] = taxon
            elif getattr(leaf, rank) != taxon:
                # a reference genome from another taxon sits inside this clade
                print(f"check monophyly of {taxon}")




for genome, taxa in crassus_terl_classification.items():
    print(genome, taxa)

In [None]:
# approach 2: classify each new genome from the labelled leaves found in its
# immediate neighbourhood of the tree
crassus_terl_classification = dict()

# Labels that carry no taxonomy. "new" is the placeholder given to crAssUS contigs
# when the leaves were annotated; "" is the value this cell originally looked for.
unlabelled = {"new", ""}

for leaf in t.iter_leaves():
    # check if it is a new contig found by crAssUS
    if leaf.genome not in crass_taxonomy:
        # if so, create an entry in the terL classification
        crassus_terl_classification[leaf.genome] = {"family":str(), "subfamily":str(), "genus":str()}

        # family and subfamily: inspect the upper node
        node = leaf.up
        # Set difference drops every placeholder at once. list.remove("") raised
        # ValueError when "" was absent and removed only one occurrence otherwise.
        fam = list({neighbour.family for neighbour in node.iter_leaves()} - unlabelled)
        subfam = list({neighbour.subfamily for neighbour in node.iter_leaves()} - unlabelled)

        # assign a rank only when the neighbourhood agrees on a single taxon
        if len(fam) == 1:
            crassus_terl_classification[leaf.genome]["family"] = fam[0]
        if len(subfam) == 1:
            crassus_terl_classification[leaf.genome]["subfamily"] = subfam[0]

        # genus: inspect the two upper nodes
        # (when the parent is already the root there is no grandparent, so stay on the parent)
        node = leaf.up.up if leaf.up.up is not None else leaf.up
        genus = list({neighbour.genus for neighbour in node.iter_leaves()} - unlabelled)
        if len(genus) == 1:
            crassus_terl_classification[leaf.genome]["genus"] = genus[0]




for genome, taxa in crassus_terl_classification.items():
    print(genome, taxa)

In [None]:
# ask ete3 for the midpoint outgroup node of the current tree
R = t.get_midpoint_outgroup()
# and set it as tree outgroup
# (this replaces whatever rooting `t` had before this cell ran)
t.set_outgroup(R)

In [None]:
# Reading the notebook top to bottom, `ts` is the TreeStyle created in the
# Figure 1 cell (show_scale = False, min_leaf_separation = 35); this cell does
# not define it, so it depends on that cell having been run.
t.show(tree_style=ts)

## LAB MEETING SLIDES

In [10]:
from ete3 import Tree, TreeStyle, TextFace, RectFace, NodeStyle, CircleFace
import pandas as pd
from matplotlib import colors


In [21]:
def collapse_families(families_list, tree):
    """Collapse the clade of each family in ``tree`` into one labelled tip.

    For every name in ``families_list`` the leaves whose ``family`` feature equals
    that name are looked up, the node spanning them is renamed to the family, a
    ``TextFace`` with the family name is attached to it, and everything below that
    node is removed from the tree.

    Parameters
    ----------
    families_list : iterable of str
        Family names to collapse.
    tree : ete3 tree node
        Tree whose leaves carry a ``family`` feature. It is modified in place.

    Returns
    -------
    None

    Notes
    -----
    Families with no leaf in the tree are skipped. A family with a single leaf is
    labelled on that leaf; it is not sent through ``get_common_ancestor``, which
    is only meaningful for two or more nodes.
    """
    for family in families_list:
        fam_leaves = tree.search_nodes(family=family)
        if not fam_leaves:
            # nothing in this tree belongs to the family
            continue
        if len(fam_leaves) == 1:
            # a lone leaf is its own clade
            fam_lca = fam_leaves[0]
        else:
            fam_lca = tree.get_common_ancestor(fam_leaves)
        fam_lca.name = family
        fam_lca.add_face(TextFace(family), column=0, position = "branch-right")
        # Detaching the direct children is enough to turn the node into a tip.
        # Iterating over a snapshot avoids editing the child list while walking it.
        for child in list(fam_lca.get_children()):
            child.detach()
            
            
# read taxonomy file and save it in a dict
crass_taxonomy = dict()

tax_file = "resources/crass_taxonomy.txt"
# Tab-separated table: column 0 is used as the genome id and columns 2, 3 and 4 as
# family, subfamily and genus. The context manager closes the handle as soon as the
# rows are loaded, and blank lines are skipped so they cannot raise an IndexError
# when the columns are indexed below.
with open(tax_file) as tax_handle:
    lines = [row.strip().split("\t") for row in tax_handle if row.strip()]

# unique family names, in alphabetical order
families = sorted({line[2] for line in lines})


# genome id -> its three taxonomic ranks
for line in lines:
    crass_taxonomy[line[0]] = {
                               "family":line[2],
                               "subfamily":line[3],
                               "genus":line[4]
                              }
    
    
    
##
# sample / run identifiers, grouped by where the sample comes from
mex = ["SRR12557708", "SRR12557709", "SRR12557710", "SRR12557712", "SRR12557714", "SRR12557715", "SRR12557716", "SRR12557717", "SRR12557718", "SRR12557719", "SRR12557720", "SRR12557721", "SRR12557723", "SRR12557724","SRR12557725", "SRR12557728", "SRR12557729", "SRR12557730", "SRR12557731"]
ancient = ["SRR12557704", "SRR12557705", "SRR12557706", "SRR12557707", "SRR12557711", "SRR12557722", "SRR12557733", "SRR12557734"]
apes = ["Chimp25", "Chimp20", "Baboon19", "Baboon22", "Baboon36B", "Lemur1", "Lemur8", "Lemur9", "Howler1", "Howler2", "Howler3", "Dunia-s-4", "Dunia-s-5", "Pinga-s-2", "Pinga-s-3", "Serafuli-s-6", "Serafuli-s-7"]
iceman = ["ERR1094777","ERR1094778","ERR1094779","ERR1094780","ERR1094781","ERR1094782","ERR1094783","ERR1094784","ERR1094785","ERR1094786","ERR1094787","ERR1094788","ERR1094789","ERR1094790","ERR1094791","ERR1094792","ERR1094793","ERR1094794"]

# sample id -> sample type, filled group by group in the order listed here
samples_type = dict()
for type_label, members in (
    ("modern", mex),
    ("ancient", ancient),
    ("NH_primates", apes),
    ("iceman", iceman),
):
    for sample in members:
        samples_type[sample] = type_label

# colour per sample type ("soil" has a colour but no sample list in this cell)
samples_type_colors = {
    "NH_primates": "#8b4513",
    "iceman": "#20b2aa",
    "soil": "#228b22",
    "ancient": "magenta",
    "modern": "#ffa500",
}


#colors.to_hex("orange")


## read sp_gen_file, save to df
#sp_gen_file = "results/7_ANI/assigned_genus_species.txt"
#sp_gen_df = pd.read_csv(sp_gen_file, header =0, sep="\t", names=["genome", "genus", "species"])
#sp_gen_df = sp_gen_df.set_index("genome")
#sp_gen_df

In [28]:
##    

# reload the TerL tree so this cell always starts from an unmodified copy
tree_file = "results/5_phylogenies/2_trees/TerL_trimmed.nwk"
t = Tree(tree_file, format=1)


# assign taxonomy
for leaf in t.iter_leaves():
    # genome id: text before the first "|"; sample id: text before the first "_" of that
    genome = leaf.name.split("|")[0]
    sample = genome.split("_")[0]
    reference = crass_taxonomy.get(genome)
    if reference is not None:
        # reference genome: ranks come from the taxonomy table
        features = {
            "family": reference["family"],
            "subfamily": reference["subfamily"],
            "genus": reference["genus"],
            "genome": genome,
        }
    else:
        # new genome: placeholder ranks plus the sample it comes from
        # (samples_type[sample] raises KeyError for a sample missing from the lists)
        features = {
            "family": "new",
            "subfamily": "new",
            "genus": "new",
            "genome": genome,
            "sample_type": samples_type[sample],
            "sample": sample,
        }
    leaf.add_features(**features)

# find the LCA of the two outgroup species and root the tree on it
outgs_leaves = t.search_nodes(family="outgroup")
outgs_lca = t.get_common_ancestor(outgs_leaves)
t.set_outgroup(outgs_lca)





# lines of the iTOL annotation file; the first entry is its fixed header
to_write_ranges = ["TREE_COLORS\nSEPARATOR TAB\nDATA\n"]

# one colour per family, paired by position with the sorted `families` list
# NOTE(review): this name shadows the `colors` module imported from matplotlib
# in the imports cell of this section.
colors = ["red","cyan", "green", "orange", "violet", "brown", "grey"]
fams_cols = dict(zip(families, colors))
print(fams_cols)

# tree style for this section; leaf names are hidden
ts = TreeStyle()
ts.show_leaf_name = False
#ts.force_topology = False


# Internal nodes fewer than this many branches (topology only) away from the
# outgroup LCA get a support-coloured marker; deeper ones are drawn without one.
MAX_MARKER_DEPTH = 7

for node in t.traverse():
    if node.is_leaf():
        sample = node.name.split("_")[0]
        if node.genome in crass_taxonomy:# and node.genome not in ["NC_021803", "TARA_MK892509"]:
            # reference genome: no marker, labelled "<subfamily>,<genus>"
            nstyle = NodeStyle()
            nstyle["fgcolor"] = fams_cols[node.family]
            nstyle["size"] = 0
            node.set_style(nstyle)
            node.add_face(TextFace(f"{crass_taxonomy[node.genome]['subfamily']},{crass_taxonomy[node.genome]['genus']}"), column=0, position = "branch-right")
        else:
            # new genome: labelled with its full name, background coloured by sample type
            node.add_face(TextFace(node.name), column=0, position = "branch-right")
            node.img_style["size"] = 0
            node.img_style["bgcolor"] = samples_type_colors[node.sample_type]

            # write range
            if sample in samples_type:
                itol_range = f"{node.name}\trange\t{samples_type_colors[node.sample_type]}\n"
                to_write_ranges.append(itol_range)

        # add genus, species annot
        #node.add_face(TextFace(sp_gen_df.loc[node.genome, "genus"], fsize=9, fgcolor="red"), column=2, position = "branch-right")
        #node.add_face(TextFace(sp_gen_df.loc[node.genome, "species"], fsize=9, fgcolor="blue"), column=4, position = "branch-right")


    else:
        # support values are in the name of the node, support is a mock 1 by default.
        # Put the name into the support object
        if node.name != "":
            node.support = round(float(node.name), 2)
        if node.get_distance(outgs_lca, topology_only=True) < MAX_MARKER_DEPTH:
            node.img_style['size'] = 10
            # Bands: below 0.5 red, 0.5 up to (not including) 0.7 orange, 0.7 to 1 green.
            # The orange band used to be written `0.5 < support < 0.7`, so a support of
            # exactly 0.5 matched no branch and the node kept the default colour.
            if node.support < 0.5:
                node.img_style["fgcolor"] = "red"
            elif node.support < 0.7:
                node.img_style["fgcolor"] = "orange"
            elif node.support <= 1:
                node.img_style["fgcolor"] = "green"


        else:
            nstyle = NodeStyle()
            nstyle["size"] = 0
            node.set_style(nstyle)
        
        
for family in families:
    # the node spanning every leaf of the family takes the family's name
    # (this replaces whatever name, e.g. a support value, the node had)
    fam_leaves = t.search_nodes(family=family)
    fam_lca = t.get_common_ancestor(fam_leaves)
    fam_lca.name = family
    # color branches according to the family
    branch_style = (
        ("hz_line_color", fams_cols[family]),
        ("vt_line_color", fams_cols[family]),
        ("hz_line_width", 2),
        ("vt_line_width", 2),
        ("size", 0),
    )
    for node in fam_lca.traverse():
        for key, value in branch_style:
            node.img_style[key] = value
            
                
            


#collapse_families(families, t)

# get the ids of the proteins that were not collapsed within a family
#with open("trees_benchmark/terl_outs.txt", "w") as fout:
#    for leaf in t.iter_leaves():
#        if leaf.name not in families:
#            fout.write(f"{leaf.name}\n")
#            
# NOTE(review): these three lists are assigned again here with fewer entries than in
# the cell that builds `samples_type` (no "SRR12557724", "Chimp25" or "Chimp20") and
# nothing below reads them -- confirm which version is intended.
mex = ["SRR12557708", "SRR12557709", "SRR12557710", "SRR12557712", "SRR12557714", "SRR12557715", "SRR12557716", "SRR12557717", "SRR12557718", "SRR12557719", "SRR12557720", "SRR12557721", "SRR12557723", "SRR12557725", "SRR12557728", "SRR12557729", "SRR12557730", "SRR12557731"]
ancient = ["SRR12557704", "SRR12557705", "SRR12557706", "SRR12557707", "SRR12557711", "SRR12557722", "SRR12557733", "SRR12557734"]
apes = ["Baboon19", "Baboon22", "Baboon36B", "Lemur1", "Lemur8", "Lemur9", "Howler1", "Howler2", "Howler3", "Dunia-s-4", "Dunia-s-5", "Pinga-s-2", "Pinga-s-3", "Serafuli-s-6", "Serafuli-s-7"]

# export the tree (format=1) and the collected iTOL range lines
t.write(format=1, outfile="ancient_TerL_for_itol.nw")
with open("ancient_terl_itol_ranges.txt", "w") as fout:
    fout.writelines(to_write_ranges)
    
    

{'Crevaviridae': 'red', 'Intestiviridae': 'cyan', 'Jelitoviridae': 'green', 'Steigviridae': 'orange', 'Suoliviridae': 'violet', 'Tinaiviridae': 'brown', 'outgroup': 'grey'}


In [8]:
# fresh TreeStyle for this cell; it replaces the `ts` defined in earlier cells
ts = TreeStyle()
ts.show_leaf_name = False  # leaf names are hidden in this style
ts.mode = "c"  # "c" is ete3's circular layout mode
#ts.show_branch_support = True
#ts.arc_start = -180 # 0 degrees = 3 o'clock
#ts.arc_span = 180
# display the tree `t` with this style
t.show(tree_style=ts)
#t.render("terl_tree_2.png", tree_style=ts);