In [1]:
# Load Trees

import pickle
def extract_spcode(nodename):
    """Return the species code of a node name.

    The species code is everything that precedes the first '.' in
    ``nodename`` (e.g. ``'224324.aq_075'`` -> ``'224324'``). A name without
    any '.' is returned unchanged.
    """
    fields = nodename.split('.')
    return fields[0]

# Load the pickled collection of trees.
# The context manager closes the file handle as soon as loading finishes
# (the previous bare open() left it open until garbage collection).
# NOTE(security): unpickling can execute arbitrary code -- only load
# 'data/alltrees.pkl' if it comes from a trusted source.
with open('data/alltrees.pkl', 'rb') as trees_file:
    all_trees = pickle.load(trees_file)
print("Trees loaded:", len(all_trees))

Trees loaded: 2601


# Identify speciation events (orthology relationships)

## Relevant documentation: 

- [Detecting evolutionary events](http://etetoolkit.org/docs/latest/tutorial/tutorial_phylogeny.html#detecting-evolutionary-events)
- [Tree rooting](http://etetoolkit.org/docs/latest/tutorial/tutorial_trees.html#tree-rooting)
- [Standardizing topology](http://etetoolkit.org/docs/latest/reference/reference_tree.html?highlight=standardize#ete3.TreeNode.standardize)



In [2]:
# Root every tree at its midpoint, then label its nodes as duplication or
# speciation events. The tree objects stored in `all_trees` are modified
# in place; the call order below (standardize -> root -> label) matters.
for tname, t in all_trees.items():
    # Artificially resolve all multifurcations (as 0-distance bifurcations).
    t.standardize()
    # Midpoint rooting: ask the tree for its midpoint outgroup node and
    # re-root the tree on it.
    out = t.get_midpoint_outgroup()
    t.set_outgroup(out)    
    
    # Label all nodes in the tree as "S"peciation or "D"uplication.
    # This method is based on a simple species-overlap algorithm in which
    # a split is considered a duplication if the two sides of the event
    # share at least one species.
    # NOTE: `events` is rebound on every iteration, so after the loop it
    # only holds the value returned for the last tree; later cells read the
    # labels from each node's `evoltype` attribute instead.
    events = t.get_descendant_evol_events()    


### TASK: List all fine-grained orthology relationships between the reference Aquifex and your strain

- Are all of them one-to-one relationships?

In [7]:
# List fine-grained orthologs between the two genomes.
#
# Requires `all_trees` whose nodes were already labelled by the rooting /
# event-detection cell (speciation nodes carry evoltype == "S").
#
# For every speciation node that separates the two species, one line is
# printed: the `query_sp` sequences found on one side of the split and the
# `target_sp` sequences found on the other side.
target_sp = "224324"      # species code seen on e.g. '224324.aq_075'
query_sp = "224324999"    # species code seen on e.g. '224324999.sul075'

# Accumulators are (re)created here so the cell runs on a fresh kernel and
# gives the same result when re-executed.
seqs = set()                    # every `query_sp` sequence found in any tree
aquifex_with_orthologs = set()  # `query_sp` sequences with >= 1 ortholog in `target_sp`

for tname, t in all_trees.items():
    for n in t.traverse():
        if n.species == query_sp:
            seqs.add(n.name)

        # Only speciation nodes define orthology relationships.
        if getattr(n, "evoltype", None) != "S":
            continue

        # Branch order is arbitrary, so both orientations must be checked.
        side1 = n.children[0]
        side2 = n.children[1]
        species1 = side1.get_species()  # computed once per side, reused below
        species2 = side2.get_species()

        if query_sp in species1 and target_sp in species2:
            sides = [side1, side2]
        elif query_sp in species2 and target_sp in species1:
            sides = [side2, side1]
        else:
            sides = None

        if sides:
            aquifex = [leaf.name for leaf in sides[0].get_leaves() if leaf.species == query_sp]
            othersp = [leaf.name for leaf in sides[1].get_leaves() if leaf.species == target_sp]
            aquifex_with_orthologs.update(aquifex)
            print (aquifex, othersp)
                
                

['224324999.sul075'] ['224324.aq_075']
['224324999.sul1171s'] ['224324.aq_1171a']
['224324999.sul1192s'] ['224324.aq_1192a']
['224324999.sul012'] ['224324.aq_012']
['224324999.sul124s'] ['224324.aq_124a']
['224324999.sul1102'] ['224324.aq_1102']
['224324999.2984354'] ['224324.2984354']
['224324999.sul176'] ['224324.aq_176']
['224324999.sul2067'] ['224324.aq_2067']
['224324999.sul792'] ['224324.aq_792']
['224324999.sul1550'] ['224324.aq_1550']
['224324999.sul384'] ['224324.aq_384']
['224324999.sul2049'] ['224324.aq_2049']
['224324999.sul862'] ['224324.aq_862']
['224324999.sul177'] ['224324.aq_177']
['224324999.sul1579'] ['224324.aq_1579']
['224324999.sul1860'] ['224324.aq_1860']
['224324999.sul616'] ['224324.aq_616']
['224324999.sul328'] ['224324.aq_328']
['224324999.sul1050'] ['224324.aq_1050']
['224324999.sul238'] ['224324.aq_238']
['224324999.sul2087'] ['224324.aq_2087']
['224324999.sul1446'] ['224324.aq_1446']
['224324999.sul1899s'] ['224324.aq_1899a']
['224324999.sul064b'] ['224324

['224324999.sul574'] ['224324.aq_574']
['224324999.sul327'] ['224324.aq_327']
['224324999.sul975'] ['224324.aq_975']
['224324999.sul1669'] ['224324.aq_1669']
['224324999.sul1343'] ['224324.aq_1343']
['224324999.sul705'] ['224324.aq_705']
['224324999.sul1706'] ['224324.aq_1706']
['224324999.sul832'] ['224324.aq_832']
['224324999.sul250'] ['224324.aq_250']
['224324999.sul2095'] ['224324.aq_2095']
['224324999.sul2009'] ['224324.aq_2009']
['224324999.sul1333'] ['224324.aq_1333']
['224324999.sul1990'] ['224324.aq_1990']
['224324999.sul1744'] ['224324.aq_1744']
['224324999.sul390'] ['224324.aq_390']
['224324999.sul401'] ['224324.aq_401']
['224324999.sul1656'] ['224324.aq_1656']
['224324999.sul1613'] ['224324.aq_1613']
['224324999.sul505'] ['224324.aq_505']
['224324999.sul504'] ['224324.aq_504']
['224324999.sul1355'] ['224324.aq_1355']
['224324999.sul1820'] ['224324.aq_1820']
['224324999.sul017'] ['224324.aq_017']
['224324999.sul1101'] ['224324.aq_1101']
['224324999.sul1067'] ['224324.aq_1067

['224324999.sul556'] ['224324.aq_556']
['224324999.sul1189'] ['224324.aq_1189']
['224324999.sul1769'] ['224324.aq_1769']
['224324999.sul023'] ['224324.aq_023']
['224324999.sul170'] ['224324.aq_170']
['224324999.sul2186'] ['224324.aq_2186']
['224324999.sul232'] ['224324.aq_232']
['224324999.sul235'] ['224324.aq_235']
['224324999.sul1069'] ['224324.aq_1069']
['224324999.sul242'] ['224324.aq_242']
['224324999.sul1008'] ['224324.aq_1008']
['224324999.sul1945'] ['224324.aq_1945']
['224324999.sul1784'] ['224324.aq_1784']
['224324999.sul940'] ['224324.aq_940']
['224324999.sul094'] ['224324.aq_094']
['224324999.sul1372'] ['224324.aq_1372']
['224324999.sul597'] ['224324.aq_597']
['224324999.sul1939'] ['224324.aq_1939']
['224324999.sul319'] ['224324.aq_319']
['224324999.sul624'] ['224324.aq_624']
['224324999.sul1441'] ['224324.aq_1441']
['224324999.sul186'] ['224324.aq_186']
['224324999.sul1639'] ['224324.aq_1639']
['224324999.sul2038'] ['224324.aq_2038']
['224324999.sul1595'] ['224324.aq_1595']

### TASKS: Is there any gene in our strain without an ortholog in the reference Aquifex genome?

             

In [None]:


                
# Which genes of species "224324999" have no ortholog in species "224324"?
#
# This cell is self-contained: it rebuilds both sets from `all_trees`, so it
# does not depend on variables left over from other cells. It still requires
# the trees to have been labelled by the rooting / event-detection cell
# (speciation nodes carry evoltype == "S").
target_sp = "224324"
query_sp = "224324999"

seqs = set()                    # every `query_sp` sequence found in any tree
aquifex_with_orthologs = set()  # `query_sp` sequences with >= 1 ortholog in `target_sp`

for tname, t in all_trees.items():
    for n in t.traverse():
        if n.species == query_sp:
            seqs.add(n.name)

        # Only speciation nodes define orthology relationships.
        if getattr(n, "evoltype", None) != "S":
            continue

        side1 = n.children[0]
        side2 = n.children[1]
        # Branch order is arbitrary: try (query, target) in both orientations
        # and keep the first one that matches.
        for query_side, target_side in ((side1, side2), (side2, side1)):
            if query_sp in query_side.get_species() and target_sp in target_side.get_species():
                aquifex = [leaf.name for leaf in query_side.get_leaves() if leaf.species == query_sp]
                aquifex_with_orthologs.update(aquifex)
                break

# Number of `query_sp` genes with an ortholog, then total `query_sp` genes.
print(len(aquifex_with_orthologs))
print(len(seqs))

# Genes without any detected ortholog in `target_sp`.
print(seqs-aquifex_with_orthologs)



# Present in other bacteria? (NOTE: manipulate a gene family to introduce a duplication and an HGT event in the new Aquifex)

## Duplication profiles


## Pruning trees