-
-
Notifications
You must be signed in to change notification settings - Fork 216
/
evoltree.py
570 lines (507 loc) · 23.1 KB
/
evoltree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
# #START_LICENSE###########################################################
#
#
# This file is part of the Environment for Tree Exploration program
# (ETE). http://etetoolkit.org
#
# ETE is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# ETE is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
# License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ETE. If not, see <http://www.gnu.org/licenses/>.
#
#
# ABOUT THE ETE PACKAGE
# =====================
#
# ETE is distributed under the GPL copyleft license (2008-2015).
#
# If you make use of ETE in published work, please cite:
#
# Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon.
# ETE: a python Environment for Tree Exploration. Jaime BMC
# Bioinformatics 2010,:24doi:10.1186/1471-2105-11-24
#
# Note that extra references to the specific methods implemented in
# the toolkit may be available in the documentation.
#
# More info at http://etetoolkit.org. Contact: huerta@embl.de
#
#
# #END_LICENSE#############################################################
#!/usr/bin/python
"""
this module defines the EvolNode dataytype to manage evolutionary
variables and integrate them within phylogenetic trees. It inheritates
the coretype PhyloNode and add some speciall features to the the node
instances.
"""
from __future__ import absolute_import
from six.moves import map
__author__ = "Francois-Jose Serra"
__email__ = "francois@barrabin.org"
__licence__ = "GPLv3"
__version__ = "0.0"
__references__ = '''
Yang, Z., Nielsen, R., Goldman, N., & Pedersen, A. M. 2000.
Codon-substitution models for heterogeneous selection pressure at amino acid sites.
Genetics 155: 431-49.
Retrieved from http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=1461088&tool=pmcentrez&rendertype=abstract
Yang, Z., & Nielsen, R. 2002.
Codon-substitution models for detecting molecular adaptation at individual sites along specific lineages.
Molecular biology and evolution 19: 908-17.
Retrieved from http://www.ncbi.nlm.nih.gov/pubmed/12032247
Bielawski, J. P., & Yang, Z. 2004.
A maximum likelihood method for detecting functional divergence at individual codon sites, with application to gene family evolution.
Journal of molecular evolution 59: 121-32.
Retrieved from http://www.ncbi.nlm.nih.gov/pubmed/15383915
Zhang, J., Nielsen, R., & Yang, Z. 2005.
Evaluation of an improved branch-site likelihood method for detecting positive selection at the molecular level.
Molecular biology and evolution 22: 2472-9.
Retrieved from http://www.ncbi.nlm.nih.gov/pubmed/16107592
Yang, Z. 2007.
PAML 4: phylogenetic analysis by maximum likelihood.
Molecular biology and evolution 24: 1586-91.
Retrieved from http://www.ncbi.nlm.nih.gov/pubmed/17483113
'''
import os
import sys
from warnings import warn
from .. import PhyloNode, SeqGroup
from ..parser.newick import write_newick
from .model import Model, PARAMS, AVAIL
from .utils import translate
from ..tools.utils import which
try:
from scipy.stats import chi2
chi_high = lambda x, y: 1 - chi2.cdf(x, y)
except ImportError:
from .utils import chi_high
try:
from ..treeview import TreeStyle
except ImportError:
TREEVIEW = False
else:
TREEVIEW = True
__all__ = ["EvolNode", "EvolTree"]
def _parse_species(name):
'''
just to return specie name from fasta description
'''
return name[:3]
class EvolNode(PhyloNode):
""" Re-implementation of the standart TreeNode instance. It adds
attributes and methods to work with phylogentic trees.
:argument newick: path to tree in newick format, can also be a string
:argument alignment: path to alignment, can also be a string.
:argument fasta alg_format: alignment format.
:argument sp_naming_function: function to infer species name.
:argument format: type of newick format
:argument binpath: path to binaries, in case codeml or SLR are not in global path.
"""
def __init__(self, newick=None, alignment=None, alg_format="fasta",
sp_naming_function=_parse_species, format=0,
binpath='', **kwargs):
'''
freebranch: path to find codeml output of freebranch model.
'''
# _update names?
self.workdir = '/tmp/ete3-tmp/'
if not binpath:
ete3_path = which("ete3")
binpath = os.path.split(ete3_path)[0]
self.execpath = binpath
self._models = {}
self.__gui_mark_mode = False
PhyloNode.__init__(self, newick=newick, format=format,
sp_naming_function=sp_naming_function, **kwargs)
if newick:
self._label_as_paml()
# initialize node marks
self.mark_tree([])
def _set_mark_mode(self, val):
self.__gui_mark_mode = val
def _is_mark_mode(self):
return self.__gui_mark_mode
def _label_internal_nodes(self, nid=None):
"""
nid needs to be a list in order to keep count through recursivity
"""
for node in self.get_children():
if node.is_leaf():
continue
nid[0] += 1
node.add_feature('node_id', nid[0])
node._label_internal_nodes(nid)
def _label_as_paml(self):
'''
to label tree as paml, nearly walking man over the tree algorithm
WARNING: sorted names in same order that sequence
WARNING: depends on tree topology conformation, not the same after a swap
activates the function get_descendants_by_pamlid
'''
nid = 1
# check we do not have dupplicated names in tree
if (len(self)) != len(set(self.get_leaf_names())):
duplis = [n for n in self.get_leaf_names() if self.get_leaf_names().count(n)>1]
raise Exception('EvolTree require unique names for leaves', duplis)
# put ids
for leaf in sorted(self, key=lambda x: x.name):
leaf.add_feature ('node_id', nid)
nid += 1
self.add_feature('node_id', nid)
self._label_internal_nodes([nid])
def get_descendant_by_node_id(self, idname):
'''
returns node list corresponding to a given idname
'''
try:
for n in self.iter_descendants():
if n.node_id == idname:
return n
if self.node_id == idname:
return self
except AttributeError:
warn('Should be first labelled as paml ' + \
'(automatically done when alignemnt is loaded)')
def _write_algn(self, fullpath):
"""
to write algn in paml format
"""
seq_group = SeqGroup()
for n in self:
seq_group.id2seq [n.node_id] = n.nt_sequence
seq_group.id2name [n.node_id] = n.name
seq_group.name2id [n.name ] = n.node_id
seq_group.write(outfile=fullpath, format='paml')
def run_model(self, model_name, ctrl_string='', keep=True, **kwargs):
'''
To compute evolutionnary models. e.g.: b_free_lala.vs.lele, will launch one free branch model, and store
it in "WORK_DIR/b_free_lala.vs.lele" directory
WARNING: this functionality needs to create a working directory in "rep"
WARNING: you need to have codeml and/or SLR in your path
The models available are:
=========== ============================= ==================
Model name Description Model kind
=========== ============================= ==================\n%s
=========== ============================= ==================\n
**Note that M1 and M2 models are making reference to the new versions
of these models, with continuous omega rates (namely M1a and M2a in the
PAML user guide).**
:argument model_name: a string like "model-name[.some-secondary-name]" (e.g.: "fb.my_first_try", or just "fb")
* model-name is compulsory, is the name of the model (see table above for the full list)
* the second part is accessory, it is to avoid over-writing models with the same name.
:argument ctrl_string: list of parameters that can be used as control file.
:argument kwargs: extra parameters should be one of: %s.
'''
from subprocess import Popen, PIPE
model_obj = Model(model_name, self, **kwargs)
fullpath = os.path.join (self.workdir, model_obj.name)
os.system("mkdir -p %s" %fullpath)
# write tree file
self._write_algn(fullpath + '/algn')
if model_obj.properties['exec'] == 'Slr':
self.write(outfile=fullpath+'/tree', format = (11))
else:
self.write(outfile=fullpath+'/tree',
format = (10 if model_obj.properties['allow_mark'] else 9))
# write algn file
## MODEL MODEL MDE
if ctrl_string == '':
ctrl_string = model_obj.get_ctrl_string(fullpath+'/tmp.ctl')
else:
open(fullpath+'/tmp.ctl', 'w').write(ctrl_string)
hlddir = os.getcwd()
os.chdir(fullpath)
bin_ = os.path.join(self.execpath, model_obj.properties['exec'])
try:
proc = Popen([bin_, 'tmp.ctl'], stdout=PIPE, stdin=PIPE)
except OSError:
raise Exception(('ERROR: {} not installed, ' +
'or wrong path to binary\n').format(bin_))
run, err = proc.communicate(b'\n') # send \n via stdin in case codeml/slr asks something (note on py3, stdin needs bytes)
run = run.decode(sys.stdout.encoding)
if err is not None:
warn("ERROR: codeml not found!!!\n" +
" define your variable EvolTree.execpath")
return 1
if 'error' in run or 'Error' in run:
warn("ERROR: inside codeml!!\n" + run)
return 1
os.chdir(hlddir)
if keep:
setattr(model_obj, 'run', run)
self.link_to_evol_model(os.path.join(fullpath,'out'), model_obj)
sep = '\n'
run_model.__doc__ = run_model.__doc__ % \
(sep.join([' %-8s %-27s %-15s ' % \
('%s' % (x), AVAIL[x]['evol'], AVAIL[x]['typ']) for x in sorted (sorted (AVAIL.keys()), key=lambda x: \
AVAIL[x]['typ'],
reverse=True)]),
', '.join(list(PARAMS.keys())))
#def test_codon_model(self):
# for c_frq in range(4):
# self.run_model('M0.model_test-'+str(c_frq), CodonFreq=c_frq)
# if self.get_most_likely('M0.model_test-1', 'M0.model_test-0') > 0.05:
#
# self.get_most_likely('M0.model_test-2', 'M0.model_test-0')
# self.get_most_likely('M0.model_test-3', 'M0.model_test-0')
# self.get_most_likely('M0.model_test-2', 'M0.model_test-1')
# self.get_most_likely('M0.model_test-3', 'M0.model_test-1')
# self.get_most_likely('M0.model_test-3', 'M0.model_test-2')
def link_to_alignment(self, alignment, alg_format="paml",
nucleotides=True, **kwargs):
'''
same function as for phyloTree, but translate sequences if nucleotides
nucleotidic sequence is kept under node.nt_sequence
:argument alignment: path to alignment or string
:argument alg_format: one of fasta phylip or paml
:argument True alignment: set to False in case we want to keep it untranslated
'''
super(EvolTree, self).link_to_alignment(alignment,
alg_format=alg_format, **kwargs)
check_len = 0
for leaf in self.iter_leaves():
seq_len = len(str(leaf.sequence))
if check_len and check_len != seq_len:
warn('WARNING: sequences with different lengths found!')
check_len = seq_len
leaf.nt_sequence = str(leaf.sequence)
if nucleotides:
leaf.sequence = translate(leaf.nt_sequence)
def show(self, layout=None, tree_style=None, histfaces=None):
'''
call super show of PhyloTree
histface should be a list of models to be displayes as histfaces
:argument layout: a layout function
:argument None tree_style: tree_style object
:argument Nonehistface: an histogram face function. This is only to plot selective pressure among sites
'''
if TREEVIEW:
if not tree_style:
ts = TreeStyle()
else:
ts = tree_style
if histfaces:
for hist in histfaces:
try:
mdl = self.get_evol_model(hist)
except AttributeError:
warn('model %s not computed' % (hist))
if not 'histface' in mdl.properties:
if len(histfaces)>1 and histfaces.index(hist)!=0:
mdl.set_histface(up=False)
else:
mdl.set_histface()
if mdl.properties ['histface'].up:
ts.aligned_header.add_face(
mdl.properties['histface'], 1)
else:
ts.aligned_foot.add_face(
mdl.properties['histface'], 1)
super(EvolTree, self).show(layout=layout, tree_style=ts)
else:
raise ValueError("Treeview module is disabled")
def render(self, file_name, layout=None, w=None, h=None,
tree_style=None, header=None, histfaces=None):
'''
call super show adding up and down faces
:argument layout: a layout function
:argument None tree_style: tree_style object
:argument Nonehistface: an histogram face function. This is only to plot selective pressure among sites
'''
if TREEVIEW:
if not tree_style:
ts = TreeStyle()
else:
ts = tree_style
if histfaces:
for hist in histfaces:
try:
mdl = self.get_evol_model(hist)
except AttributeError:
warn('model %s not computed' % (hist))
if not 'histface' in mdl.properties:
if len(histfaces)>1 and histfaces.index(hist)!=0:
mdl.set_histface(up=False)
else:
mdl.set_histface()
if mdl.properties ['histface'].up:
ts.aligned_header.add_face(
mdl.properties['histface'], 1)
else:
ts.aligned_foot.add_face(
mdl.properties['histface'], 1)
return super(EvolTree, self).render(file_name, layout=layout,
tree_style=ts,
w=w, h=h)
else:
raise ValueError("Treeview module is disabled")
def mark_tree(self, node_ids, verbose=False, **kargs):
'''
function to mark branches on tree in order that paml could interpret it.
takes a "marks" argument that should be a list of #1,#1,#2
e.g.:
::
t=Tree.mark_tree([2,3], marks=["#1","#2"])
:argument node_ids: list of node ids (have a look to node.node_id)
:argument False verbose: warn if marks do not correspond to codeml standard
:argument kargs: mainly for the marks key-word which needs a list of marks (marks=['#1', '#2'])
'''
from re import match
node_ids = list(map(int , node_ids))
if 'marks' in kargs:
marks = list(kargs['marks'])
else:
marks = ['#1']*len (node_ids)
for node in self.traverse():
if not hasattr(node, 'node_id'):
continue
if node.node_id in node_ids:
if ('.' in marks[node_ids.index(node.node_id)] or
match('#[0-9]+',
marks[node_ids.index(node.node_id)]) is None) and verbose:
warn('WARNING: marks should be "#" sign directly ' +
'followed by integer\n' + self.mark_tree.__doc__)
node.add_feature('mark', ' '+marks[node_ids.index(node.node_id)])
elif not 'mark' in node.features:
node.add_feature('mark', '')
def link_to_evol_model(self, path, model):
'''
link EvolTree to evolutionary model
* free-branch model ("fb") will append evol values to tree
* Site models (M0, M1, M2, M7, M8) will give evol values by site
and likelihood
:argument path: path to outfile containing model computation result
:argument model: either the name of a model, or a Model object (usually empty)
'''
if type(model) == str :
model = Model(model, self, path)
else:
model._load(path)
# new entry in _models dict
while model.name in self._models:
model.name = model.name.split('__')[0] + str(
(int(model.name.split('__')[1]) + 1)
if '__' in model.name else 0)
self._models[model.name] = model
if not os.path.isfile(path):
warn("ERROR: not a file: " + path)
return 1
if len(self._models) == 1 and model.properties['exec']=='codeml':
self.change_dist_to_evol('bL', model, fill=True)
def get_evol_model(self, modelname):
'''
returns one precomputed model
:argument modelname: string of the name of a model object stored
:returns: Model object
'''
try:
return self._models[modelname]
except KeyError:
warn("Model %s not found." % (modelname))
def write(self, features=None, outfile=None, format=10):
"""
Inherits from Tree but adds the tenth format, that allows to display marks for CodeML.
TODO: internal writting format need to be something like 0
"""
from re import sub
if int(format)==11:
nwk = ' %s 1\n' % (len(self))
nwk += sub('\[&&NHX:mark=([ #0-9.]*)\]', r'\1', \
write_newick(self, features=['mark'],format=9))
elif int(format)==10:
nwk = sub('\[&&NHX:mark=([ #0-9.]*)\]', r'\1', \
write_newick(self, features=['mark'],format=9))
else:
nwk = write_newick(self, features=features,format=format)
if outfile is not None:
open(outfile, "w").write(nwk)
return nwk
else:
return nwk
write.__doc__ += super(PhyloNode, PhyloNode()).write.__doc__.replace(
'argument format', 'argument 10 format')
def get_most_likely(self, altn, null):
'''
Returns pvalue of LRT between alternative model and null model.
usual comparison are:
============ ======= ===========================================
Alternative Null Test
============ ======= ===========================================
M2 M1 PS on sites (M2 prone to miss some sites)
(Yang 2000).
M3 M0 test of variability among sites
M8 M7 PS on sites
(Yang 2000)
M8 M8a RX on sites?? think so....
bsA bsA1 PS on sites on specific branch
(Zhang 2005)
bsA M1 RX on sites on specific branch
(Zhang 2005)
bsC M1 different omegas on clades branches sites
ref: Yang Nielsen 2002
bsD M3 different omegas on clades branches sites
(Yang Nielsen 2002, Bielawski 2004)
b_free b_neut foreground branch not neutral (w != 1)
- RX if P<0.05 (means that w on frg=1)
- PS if P>0.05 and wfrg>1
- CN if P>0.05 and wfrg>1
(Yang Nielsen 2002)
b_free M0 different ratio on branches
(Yang Nielsen 2002)
============ ======= ===========================================
**Note that M1 and M2 models are making reference to the new versions
of these models, with continuous omega rates (namely M1a and M2a in the
PAML user guide).**
:argument altn: model with higher number of parameters (np)
:argument null: model with lower number of parameters (np)
'''
altn = self.get_evol_model(altn)
null = self.get_evol_model(null)
if null.np > altn.np:
warn("first model should be the alternative, change the order")
return 1.0
try:
if hasattr(altn, 'lnL') and hasattr(null, 'lnL'):
if null.lnL - altn.lnL < 0:
return chi_high(2 * abs(altn.lnL - null.lnL),
float(altn.np - null.np))
else:
warn("\nWARNING: Likelihood of the alternative model is "
"smaller than null's (%f - %f = %f)" % (
null.lnL, altn.lnL, null.lnL - altn.lnL) +
"\nLarge differences (> 0.1) may indicate mistaken "
"assigantion of null and alternative models")
return 1
except KeyError:
warn("at least one of %s or %s, was not calculated" % (altn.name,
null.name))
exit(self.get_most_likely.__doc__)
def change_dist_to_evol(self, evol, model, fill=False):
'''
change dist/branch length of the tree to a given evolutionary
variable (dN, dS, w or bL), default is bL.
:argument evol: evolutionary variable
:argument model: Model object from which to retrieve evolutionary variables
:argument False fill: do not affects only dist parameter, each node will be annotated with all evolutionary variables (nodel.dN, node.w...).
'''
# branch-site outfiles do not give specific branch info
if not model.branches:
return
for node in self.iter_descendants():
if not evol in model.branches[node.node_id]:
continue
node.dist = model.branches[node.node_id][evol]
if fill:
for e in ['dN', 'dS', 'w', 'bL']:
node.add_feature(e, model.branches [node.node_id][e])
# cosmetic alias
EvolTree = EvolNode