# Create a constraint tree joining manual and automated constraints

In this notebook, we will create constraint statements for BEAST2 that will join soft constraints that we had produced before from published Diptera trees (see [create_constraint.ipynb](create_constraint.ipynb)) with hard constraints pulled from several literature sources, summarized in a text file.

## Retrieve previous constraint tree

In [241]:
import dendropy, numpy as np, copy
from pathlib import Path
from collections import defaultdict, Counter

In [242]:
# Read the previously generated constraint tree; tip names keep their underscores.
with open('./constraint_trees/result/partial_iqtree.tre', 'r') as tree_file:
    tr = dendropy.Tree.get(
        file=tree_file,
        schema="newick",
        preserve_underscores=True,
    )

# Wipe any label/taxon read from the Newick file off the internal nodes,
# so that every internal node starts out unlabelled.
for internal_node in tr.internal_nodes():
    internal_node.label = None
    internal_node.taxon = None

tr.print_plot(show_internal_node_labels=True)

/----------------------------------------------------- Deuterophlebia_560719   
|                                                                              
|  /-------------------------------------------------- Nymphomyia_560766       
|  |                                                                           
|  |                                        /--------- Trichocera_52759        
|  |                                        |                                  
|  |  /-------------------------------------@     /--- Dicranota_700836        
|  |  |                                     |  /--@                            
|  |  |                                     |  |  \--- Pedicia_472374          
|  |  |                                     \--@                               
@  |  |                                        |  /--- Liogma_560731           
|  |  |                                        |  |                            
|  |  |                                 

## Retrieve hard monophyly statements

In [243]:
# Parse the hard monophyly statements. Each non-empty line is expected to be
# `<constraint name>=<taxon>,<taxon>,...`.
new_constraints = dict()  # constraint name -> list of taxon labels
counts = Counter()        # constraint name -> number of taxa listed for it
with open(Path('constraint_trees')/'final_constraints_info'/'hard_constraints.txt','r') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines (e.g. at the end of the file) hold no constraint and
            # would otherwise crash the two-way unpacking below.
            continue
        # A line without exactly one '=' is malformed and still raises ValueError.
        k,v = line.split('=')
        v = v.split(',')
        new_constraints[k] = v
        counts[k] = len(v)

# Constraints ordered from most to least inclusive (by number of listed taxa).
counts.most_common()

[('first_split', 500),
 ('Brachycera', 392),
 ('Oestroidea', 76),
 ('Bibionomorpha', 39),
 ('Culicomorpha', 37),
 ('Tephritoidea', 35),
 ('Tachinidae', 33),
 ('Sciaroidea', 30),
 ('Syrphidae', 29),
 ('Tabanomorpha', 20),
 ('Chironomidae', 18),
 ('Ephydroidea', 17),
 ('Muscidae', 17),
 ('Tipulomorpha', 16),
 ('Tipuloidea', 15),
 ('Tabanidae', 14),
 ('Chloropidae', 13),
 ('Anthomyiidae', 13),
 ('Sciomyzidae', 13),
 ('Hippoboscoidea', 11),
 ('Cecidomyiidae', 11),
 ('Mycetophilidae', 11),
 ('Empidoidea', 10),
 ('Lauxanioidea', 8),
 ('Asilidae', 8),
 ('Ceratopogonidae', 8),
 ('Culicoidea', 7),
 ('Empididae', 7),
 ('Acroceridae', 7),
 ('Hippoboscidae', 7),
 ('Sarcophagidae', 7),
 ('Platypezidae', 7),
 ('Ephydridae', 6),
 ('Scathophagidae', 6),
 ('Rhinophoridae', 6),
 ('Bombyliidae', 4),
 ('Milichiidae', 4),
 ('Conopidae', 4),
 ('Micropezidae', 4),
 ('Mesembrinellidae', 4),
 ('Pipunculidae', 4),
 ('Lonchaeidae', 4),
 ('Piophilidae', 4),
 ('Ulidiidae', 4),
 ('Culicidae', 4),
 ('Dolichopodidae'

## Retrieve parent hard constraints

In [244]:
# Parse the parent of each hard constraint. Each non-empty line is expected to be
# `<constraint name>=<parent constraint name>`.
constraint_parent = dict()  # constraint name -> name of its parent hard constraint
with open(Path('constraint_trees')/'final_constraints_info'/'parent_nodes.txt','r') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines hold no mapping and would otherwise crash the unpacking below.
            continue
        # A line without exactly one '=' is malformed and still raises ValueError.
        k,v = line.split('=')
        constraint_parent[k] = v

## Combine constraints

Now we will go through the new constraints, from most to least inclusive

For each new constraint, we will find in our tree the MRCA of all overlapping tips and name it with that constraint. We will also add a property that this is a hard constraint.

If there are no overlapping tips, we will find the parent hard constraint node, and add a child node.

We will then traverse constraints from least to most inclusive, and check if all child taxa are present, adding them if not.

## overlap with tree

In [245]:
# Labels of every taxon currently registered in the tree's namespace.
tree_taxa = {taxon.label for taxon in tr.taxon_namespace}

# One row per constraint: its name, how many tips it lists, and how many of
# those tips are already present in the tree.
print('constraint','n_tips','n_overlap')
for constraint_name, members in new_constraints.items():
    n_shared = len(tree_taxa.intersection(members))
    print(constraint_name, len(members), n_shared)

constraint n_tips n_overlap
first_split 500 190
Tabanomorpha 20 1
Bibionomorpha 39 13
Culicomorpha 37 30
Tipulomorpha 16 6
Empidoidea 10 0
Ephydroidea 17 0
Hippoboscoidea 11 8
Lauxanioidea 8 6
Oestroidea 76 0
Sciaroidea 30 6
Tephritoidea 35 26
Culicoidea 7 1
Tipuloidea 15 5
Asilidae 8 0
Bombyliidae 4 0
Dolichopodidae 3 0
Empididae 7 0
Acroceridae 7 0
Chloropidae 13 13
Milichiidae 4 0
Conopidae 4 0
Psilidae 3 3
Ephydridae 6 0
Hippoboscidae 7 7
Nycteribiidae 3 0
Celyphidae 2 0
Anthomyiidae 13 0
Fanniidae 2 0
Muscidae 17 17
Scathophagidae 6 6
Micropezidae 4 4
Neriidae 2 2
Mesembrinellidae 4 0
Rhiniidae 2 0
Rhinophoridae 6 0
Sarcophagidae 7 0
Tachinidae 33 0
Anthomyzidae 3 3
Clusiidae 3 0
Platypezidae 7 7
Sciomyzidae 13 13
Sepsidae 2 2
Pipunculidae 4 0
Syrphidae 29 0
Lonchaeidae 4 0
Piophilidae 4 4
Platystomatidae 3 0
Ulidiidae 4 4
Tabanidae 14 0
Xylophagidae 3 0
Cecidomyiidae 11 0
Keroplatidae 2 2
Mycetophilidae 11 0
Sciaridae 3 3
Ceratopogonidae 8 8
Chironomidae 18 18
Culicidae 4 0
Brach

### Step 1 add constraints

In [246]:
# Reload the tree so that re-running this cell always starts from the same topology.
with open('./constraint_trees/result/partial_iqtree.tre', 'r') as f:
    tr = dendropy.Tree.get(file=f, schema="newick",preserve_underscores=True)
    for node in tr.internal_nodes():
        node.label = None
        node.taxon = None

# Visit the constraints from most to least inclusive (largest taxon list first).
for c,_ in counts.most_common():
    # Recomputed on every pass because earlier passes may have added taxa.
    tree_taxa = set([t.label for t in tr.taxon_namespace])
    common_taxa = set(new_constraints[c]).intersection(tree_taxa)
    if common_taxa:
        # Some tips of this constraint are already in the tree: their MRCA
        # becomes the hard-constraint node.
        mrca = tr.mrca(taxon_labels=common_taxa)
        if mrca.is_leaf():
            # A single shared tip: a leaf cannot carry the constraint, so put a
            # new internal node between the leaf and its parent.
            new_internal_node = tr.node_factory(label=c)
            new_internal_node.annotations.add_new("constraint_type", "hard")
            parent_node = mrca.parent_node
            parent_node.remove_child(mrca)
            parent_node.add_child(new_internal_node)
            new_internal_node.add_child(mrca)
        else:
            # NOTE(review): if this MRCA was already labelled by a more inclusive
            # constraint, that label is overwritten here - confirm this cannot
            # happen with the constraint files in use.
            mrca.annotations.add_new("constraint_type", "hard")
            mrca.label = c
    else:
        # No tip of this constraint is in the tree yet: hang a new node holding
        # all of its taxa under the node of its parent hard constraint.
        parent_label = constraint_parent.get(c)
        if parent_label is None:
            raise KeyError(f'No parent constraint listed for {c!r}')
        parent_constraint_node = tr.find_node_with_label(parent_label)
        if parent_constraint_node is None:
            raise ValueError(
                f'Parent constraint {parent_label!r} of {c!r} has no labelled node in the tree'
            )
        new_node = tr.node_factory(label=c)
        # These statements also come from the hard-constraint file, so mark the
        # node as hard just like the MRCA-based nodes above; without this
        # annotation it would later be written out as a soft constraint.
        new_node.annotations.add_new("constraint_type", "hard")
        for taxon in new_constraints[c]:
            if taxon not in tree_taxa:
                new_taxon = tr.taxon_namespace.require_taxon(label=taxon)
                child_node = tr.node_factory(taxon=new_taxon)
                new_node.add_child(child_node)
        parent_constraint_node.add_child(new_node)
    # Keep single-child nodes: a freshly inserted constraint node above a lone
    # leaf must survive until its remaining taxa are attached.
    tr.update_bipartitions(suppress_unifurcations=False)
    tr.reconstruct_taxon_namespace()

## Step 2 add missing taxa

In [247]:
# Visit the constraints from least to most inclusive, so that each missing taxon
# is attached to the smallest constraint that lists it.
for c,_ in reversed(counts.most_common()):
    # Recomputed on every pass because earlier passes may have added taxa.
    tree_taxa = set([t.label for t in tr.taxon_namespace])
    missing_taxa = set(new_constraints[c]) - tree_taxa
    if not missing_taxa:
        continue

    constraint_node = tr.find_node_with_label(c)
    if constraint_node is None:
        raise ValueError(f'Constraint {c!r} has no labelled node in the tree')

    # Sorted so the order of the new leaves (and of everything generated from the
    # tree afterwards) does not depend on set iteration order.
    for taxon in sorted(missing_taxa):
        new_taxon = tr.taxon_namespace.require_taxon(label=taxon)
        new_leaf = tr.node_factory(taxon=new_taxon)
        constraint_node.add_child(new_leaf)

    # Update the taxon namespace and tree structure. Single-child nodes are kept,
    # as in the constraint-adding step: collapsing them here could remove a
    # constraint node before its own missing taxa have been attached.
    tr.reconstruct_taxon_namespace()
    tr.update_bipartitions(suppress_unifurcations=False)
    

In [248]:
# Draw the tree as ASCII art, showing internal node labels alongside the tips.
tr.print_plot(show_internal_node_labels=True)

/---------------------------------------------------- Deuterophlebia_560719    
|                                                                              
|  /------------------------------------------------- Nymphomyia_560766        
|  |                                                                           
|  |                                       /--------- Trichocera_52759         
|  |                                       |                                   
|  |                                       |     /--- Dicranota_700836         
|  |                                       |  /--@                             
|  |                                       |  |  \--- Pedicia_472374           
|  |                                       |  |                                
|  |                                       |  |  /--- Liogma_560731            
|  |                                       |  |  |                             
|  |  /---------------------------------

### Save BEAST xml statements

Since we already have an XML file, here we will just generate constraint statements to add to it manually. All internal nodes in our tree (except the root) will be constraints, following these rules:

1. If the node is annotated as a hard constraint, it will be a monophyletic constraint for the children of the node
2. If the node is not annotated as a hard constraint, it will be a soft constraint: we will include all descendant tips of the node as the constraint, and we will walk up through its parent nodes until we find a hard constraint (or reach the root). We will then include all other tips descending from that hard-constraint node as rogues, so they can float freely within the hard constraint (unless a less inclusive hard constraint excludes them).


In [249]:
# Build one BEAST2 prior statement (plus the matching <log> entry) for every
# internal node below the root.
node_lab_idx = 0                 # running index used to name unlabelled nodes
tr.seed_node.label = 'Diptera'
constraint_statements = []       # one multi-line XML <distribution> block per node
log_statements = []              # one <log idref=.../> line per node

# Rogue taxa of the most recently processed soft constraint.
rogues = []

for node in tr.preorder_internal_node_iter():
    if node is tr.seed_node:
        continue
    included_taxa = [leaf.taxon.label for leaf in node.leaf_nodes()]

    # Nodes that did not receive a constraint name get a generated one.
    if not node.label:
        node.label = f'Node_{node_lab_idx}'
        node_lab_idx += 1

    log_statements.append(f'            <log idref="{node.label}.prior"/>')

    # The <taxonset> element is the same for hard and soft constraints.
    taxonset_lines = [f'                    <taxonset id="{node.label}" spec="TaxonSet">']
    for tx in included_taxa:
        taxonset_lines.append(f'                        <taxon idref="{tx}" spec="Taxon"/>')
    taxonset_lines.append(f'                    </taxonset>')

    if node.annotations.get_value("constraint_type") == "hard":
        # Hard constraint: plain monophyly prior on the node's tips.
        constraint_lines = [f'                 <distribution id="{node.label}.prior" spec="beast.base.evolution.tree.MRCAPrior" monophyletic="true" tree="@Tree.t:mito_nonprotcoding">']
        constraint_lines.extend(taxonset_lines)
    else:
        # Soft constraint: climb to the closest hard-constrained ancestor (or the
        # root) and list every other tip below it as a rogue.
        parent = node.parent_node
        while (parent is not tr.seed_node
               and parent.annotations.get_value("constraint_type") != "hard"):
            parent = parent.parent_node

        # Sorted so the generated XML is identical from run to run; a set of
        # strings has no stable iteration order.
        rogues = sorted(
            set(leaf.taxon.label for leaf in parent.leaf_nodes()) - set(included_taxa)
        )

        constraint_lines = [f'                <distribution id="{node.label}.prior" spec="beastlabs.math.distributions.MRCAPriorWithRogues" monophyletic="true" tree="@Tree.t:mito_nonprotcoding">']
        constraint_lines.extend(taxonset_lines)
        constraint_lines.append(f'                    <rogues id="{node.label}_rogues" spec="TaxonSet">')
        for rogue_tx in rogues:
            constraint_lines.append(f'                        <taxon idref="{rogue_tx}" spec="Taxon"/>')
        constraint_lines.append(f'                    </rogues>')

    constraint_lines.append(f'                </distribution>')
    constraint_statements.append('\n'.join(constraint_lines))
            
        
    

        
        

Let's now print these statements so we can copy and paste to the xml file:

### Log statements

In [250]:
# Emit the <log> lines, one per line, ready to paste into the XML file.
print('\n'.join(log_statements))

            <log idref="first_split.prior"/>
            <log idref="Node_0.prior"/>
            <log idref="Tipulomorpha.prior"/>
            <log idref="Tipuloidea.prior"/>
            <log idref="Node_1.prior"/>
            <log idref="Node_2.prior"/>
            <log idref="Node_3.prior"/>
            <log idref="Node_4.prior"/>
            <log idref="Node_5.prior"/>
            <log idref="Node_6.prior"/>
            <log idref="Node_7.prior"/>
            <log idref="Culicomorpha.prior"/>
            <log idref="Node_8.prior"/>
            <log idref="Node_9.prior"/>
            <log idref="Ceratopogonidae.prior"/>
            <log idref="Chironomidae.prior"/>
            <log idref="Node_10.prior"/>
            <log idref="Culicoidea.prior"/>
            <log idref="Culicidae.prior"/>
            <log idref="Brachycera.prior"/>
            <log idref="Bibionomorpha.prior"/>
            <log idref="Node_11.prior"/>
            <log idref="Node_12.prior"/>
            <log idref=

### Constraint statements

In [251]:
# Emit the <distribution> blocks, one after another, ready to paste into the XML file.
print('\n'.join(constraint_statements))

                 <distribution id="first_split.prior" spec="beast.base.evolution.tree.MRCAPrior" monophyletic="true" tree="@Tree.t:mito_nonprotcoding">
                    <taxonset id="first_split" spec="TaxonSet">
                        <taxon idref="Nymphomyia_560766" spec="Taxon"/>
                        <taxon idref="Trichocera_52759" spec="Taxon"/>
                        <taxon idref="Dicranota_700836" spec="Taxon"/>
                        <taxon idref="Pedicia_472374" spec="Taxon"/>
                        <taxon idref="Liogma_560731" spec="Taxon"/>
                        <taxon idref="Phalacrocera_2203737" spec="Taxon"/>
                        <taxon idref="Cylindrotoma_700834" spec="Taxon"/>
                        <taxon idref="Tipula_41043" spec="Taxon"/>
                        <taxon idref="Dictenidia_2108369" spec="Taxon"/>
                        <taxon idref="Limonia_52743" spec="Taxon"/>
                        <taxon idref="Prionocera_1690388" spec="Taxon"/>
   