Skip to content

Commit

Permalink
Merge '64-phyml-empirical-freqs' into dev
Browse files Browse the repository at this point in the history
  • Loading branch information
Connor McCoy committed Jan 30, 2014
2 parents 634a153 + 17e8e2c commit 1fcd0dd
Show file tree
Hide file tree
Showing 10 changed files with 139 additions and 45 deletions.
1 change: 1 addition & 0 deletions CHANGES.rst
Expand Up @@ -6,6 +6,7 @@
=========

* Fix GH-63: "empirical_frequencies" now set to false when parsing FastTree AA statistics files
* Close GH-64: "empirical_frequencies" is now available as a flag for PhyML statistics files

0.5.1
=====
Expand Down
47 changes: 32 additions & 15 deletions taxtastic/refpkg.py
Expand Up @@ -20,26 +20,30 @@
#
# You should have received a copy of the GNU General Public License
# along with taxtastic. If not, see <http://www.gnu.org/licenses/>.
import contextlib
from decorator import decorator
import subprocess
import tempfile
import hashlib
import shutil
import errno
import os
import copy
import json
import time
import warnings

import Bio.Phylo
import Bio.SeqIO
import contextlib
import copy
import csv
import errno
import functools
import hashlib
import json
import shutil
import subprocess
import tempfile
import warnings
import zipfile

import Bio.SeqIO
import Bio.Phylo
from decorator import decorator

from taxtastic import utils, taxdb


FORMAT_VERSION = '1.1'

def md5file(fobj):
Expand Down Expand Up @@ -448,8 +452,8 @@ def update_file(self, key, new_path):
md5_value = md5file(open(new_path))
self.contents['md5'][key] = md5_value
self._log('Updated file: %s=%s' % (key,new_path))
if key == 'tree_stats':
self.update_phylo_model(None, new_path)
if key == 'tree_stats' and old_path:
warnings.warn('Updating tree_stats, but not phylo_model.')
return old_path

@transaction
Expand All @@ -473,7 +477,7 @@ def reroot(self, rppr=None, pretend=False):
self.update_file('tree', name)
self._log('Rerooting refpkg')

def update_phylo_model(self, stats_type, stats_file):
def update_phylo_model(self, stats_type, stats_file, frequency_type=None):
"""Parse a stats log and use it to update ``phylo_model``.
``pplacer`` expects its input to include the deatils of the
Expand All @@ -486,8 +490,20 @@ def update_phylo_model(self, stats_type, stats_file):
parameter must be 'RAxML', 'PhyML' or 'FastTree', depending on which
program generated the log. It may also be None to attempt to guess
which program generated the log.
:param stats_type: Statistics file type. One of 'RAxML', 'FastTree', 'PhyML'
:param stats_file: path to statistics/log file
:param frequency_type: For ``stats_type == 'PhyML'``, amino acid
alignments only: was the alignment inferred with ``model`` or
``empirical`` frequencies?
"""

if frequency_type not in (None, 'model', 'empirical'):
raise ValueError('Unknown frequency type: "{0}"'.format(frequency_type))
if frequency_type and stats_type not in (None, 'PhyML'):
raise ValueError('Frequency type should only be specified for '
'PhyML alignments.')

if stats_type is None:
with open(stats_file) as fobj:
for line in fobj:
Expand All @@ -510,7 +526,8 @@ def update_phylo_model(self, stats_type, stats_file):
elif stats_type == 'FastTree':
parser = utils.parse_fasttree
elif stats_type == 'PhyML':
parser = utils.parse_phyml
parser = functools.partial(utils.parse_phyml,
frequency_type=frequency_type)
else:
raise ValueError('invalid log type: %r' % (stats_type,))

Expand Down
6 changes: 5 additions & 1 deletion taxtastic/subcommands/create.py
Expand Up @@ -70,6 +70,9 @@ def build_parser(parser):
parser.add_argument("--stats-type", choices=('PhyML', 'FastTree', 'RAxML'),
help="""stats file type [default: attempt to guess from
file contents]""")
parser.add_argument("--frequency-type", choices=('empirical', 'model'),
help="""Residue frequency type from the model. Required
for PhyML Amino Acid alignments.""")
parser.add_argument("-S", "--aln-sto",
action="store", dest="aln_sto",
help='Multiple alignment in Stockholm format', metavar='FILE')
Expand Down Expand Up @@ -120,7 +123,8 @@ def action(args):
if args.package_version:
r.update_metadata('package_version', args.package_version)
if args.tree_stats:
r.update_phylo_model(args.stats_type, args.tree_stats)
r.update_phylo_model(args.stats_type, args.tree_stats,
frequency_type=args.frequency_type)

for file_name in ['aln_fasta', 'aln_sto', 'mask',
'profile', 'seq_info', 'taxonomy', 'tree', 'tree_stats',
Expand Down
27 changes: 21 additions & 6 deletions taxtastic/subcommands/update.py
Expand Up @@ -41,6 +41,15 @@ def build_parser(parser):
parser.add_argument('--metadata', action='store_const', const=True,
default=False, help='Update metadata instead of files')

stats_group = parser.add_argument_group('Tree inference log file parsing '
'(for updating `tree_stats`)')
stats_group.add_argument("--stats-type", choices=('PhyML', 'FastTree', 'RAxML'),
help="""stats file type [default: attempt to guess from
file contents]""")
stats_group.add_argument("--frequency-type", choices=('empirical', 'model'),
help="""Residue frequency type from the model. Required
for PhyML Amino Acid alignments.""")


def action(args):
"""Updates a Refpkg with new files.
Expand All @@ -52,26 +61,32 @@ def action(args):
"""
log.info('loading reference package')

pairs = [p.split('=',1) for p in args.changes]
pairs = [p.split('=', 1) for p in args.changes]
if args.metadata:
rp = refpkg.Refpkg(args.refpkg, create=False)
rp.start_transaction()
for (key,value) in pairs:
for key, value in pairs:
rp.update_metadata(key, value)
rp.commit_transaction('Updated metadata: ' + \
', '.join(['%s=%s' % (a,b)
for a,b in pairs]))
else:
for (key,filename) in pairs:
for key, filename in pairs:
if not(os.path.exists(filename)):
print "No such file: %s" % filename
exit(1)

rp = refpkg.Refpkg(args.refpkg, create=False)
rp.start_transaction()
for (key,filename) in pairs:
for key, filename in pairs:
rp.update_file(key, os.path.abspath(filename))
rp.commit_transaction('Updates files: ' + \
if key == 'tree_stats':
# Trigger model update
log.info('Updating phylo_model to match tree_stats')
rp.update_phylo_model(args.stats_type, filename,
args.frequency_type)

rp.commit_transaction('Updates files: ' +
', '.join(['%s=%s' % (a,b)
for a,b in pairs]))
for a, b in pairs]))
return 0
16 changes: 10 additions & 6 deletions taxtastic/utils.py
Expand Up @@ -210,7 +210,9 @@ def parse_fasttree(fobj):

return data

def parse_phyml(fobj):
def parse_phyml(fobj, frequency_type=None):
if frequency_type not in (None, 'empirical', 'model'):
raise ValueError("Unknown frequency_type: {0}".format(frequency_type))
s = ''.join(fobj)
result = {'gamma': {}}
try_set_fields(result, r'---\s*(?P<program>PhyML.*?)\s*---', s)
Expand All @@ -234,19 +236,21 @@ def parse_phyml(fobj):
if rates:
result['subs_rates'] = rates

# PhyML doesn't record whether empirical base frequencies were used, or
# ML estimates were made.
# Setting to empirical for now.
result['empirical_frequencies'] = True
elif 'amino acids' in s:
result['datatype'] = 'AA'
if not frequency_type:
raise ValueError("frequency type required for PhyML AA models.")
result['empirical_frequencies'] = frequency_type == 'empirical'
try_set_fields(result,
r'Model of amino acids substitution:\s+(?P<subs_model>\w+)',
s)
else:
raise ValueError('Could not determine if alignment is AA or DNA')

# PhyML doesn't record whether empirical base frequencies were used, or
# ML estimates were made.
# Setting to empirical for now.
result['empirical_frequencies'] = True

return result

def has_rppr(rppr_name='rppr'):
Expand Down
2 changes: 1 addition & 1 deletion testfiles/phyml_aa_stats.txt.json
Expand Up @@ -7,5 +7,5 @@
"alpha": 0.626,
"n_cats": 4
},
"empirical_frequencies": true
"empirical_frequencies": false
}
1 change: 1 addition & 0 deletions testfiles/phyml_aa_stats_empirical.txt
11 changes: 11 additions & 0 deletions testfiles/phyml_aa_stats_empirical.txt.json
@@ -0,0 +1,11 @@
{
"datatype": "AA",
"subs_model": "LG",
"program": "PhyML v3.0_export",
"ras_model": "gamma",
"gamma": {
"alpha": 0.626,
"n_cats": 4
},
"empirical_frequencies": true
}
63 changes: 48 additions & 15 deletions tests/test_subcommands.py
Expand Up @@ -13,16 +13,26 @@
from config import OutputRedirectMixin

class TestUpdate(OutputRedirectMixin, unittest.TestCase):
def setUp(self):
super(TestUpdate, self).setUp()
class _Args(object):
refpkg = None
changes = []
metadata = False
stats_type = None
frequency_type = None
self.args = _Args()

def test_action(self):
with config.tempdir() as scratch:
pkg_path = os.path.join(scratch, 'test.refpkg')
r = refpkg.Refpkg(pkg_path, create=True)
test_file = config.data_path('bv_refdata.csv')
class _Args(object):
refpkg=pkg_path
changes = ['meep='+test_file, 'hilda='+test_file]
metadata = False
update.action(_Args())

self.args.refpkg = pkg_path
self.args.changes = ['meep='+test_file, 'hilda='+test_file]

update.action(self.args)
r._sync_from_disk()
self.assertEqual(r.contents['files']['meep'], 'bv_refdata.csv')

Expand All @@ -38,15 +48,34 @@ def test_metadata_action(self):
with config.tempdir() as scratch:
pkg_path = os.path.join(scratch, 'test.refpkg')
r = refpkg.Refpkg(pkg_path, create=True)
class _Args(object):
refpkg=pkg_path
changes = ['meep=boris', 'hilda=vrrp']
metadata = True
update.action(_Args())
self.args.changes = ['meep=boris', 'hilda=vrrp']
self.args.metadata = True
self.args.refpkg = pkg_path
update.action(self.args)
r._sync_from_disk()
self.assertEqual(r.metadata('meep'), 'boris')
self.assertEqual(r.metadata('hilda'), 'vrrp')

def test_update_stats_action(self):
with config.tempdir() as scratch:
pkg_path = os.path.join(scratch, 'test.refpkg')
r = refpkg.Refpkg(pkg_path, create=True)

args = self.args
stats_path = os.path.join(config.datadir, 'phyml_aa_stats.txt')

args.refpkg = pkg_path
args.changes = ['tree_stats=' + stats_path]
args.frequency_type = 'empirical'

update.action(args)

r._sync_from_disk()

self.assertIn('tree_stats', r.contents['files'])
self.assertIn('phylo_model', r.contents['files'])
self.assertTrue(r.contents['files']['phylo_model'].endswith('.json'))

class TestCreate(OutputRedirectMixin, unittest.TestCase):

def setUp(self):
Expand All @@ -70,6 +99,7 @@ class _Args(object):
taxonomy = None
reroot = False
rppr = 'rppr'
frequency_type = None
def __init__(self, scratch):
self.package_name = os.path.join(scratch, 'test.refpkg')
self._Args = _Args
Expand All @@ -90,24 +120,27 @@ def test_create(self):
args2.clobber = True
self.assertEqual(0, create.action(args2))

def _test_create_phylo_model(self, stats_path, stats_type=None):
def _test_create_phylo_model(self, stats_path, stats_type=None,
frequency_type=None):
with config.tempdir() as scratch:
args = self._Args(scratch)
args.tree_stats = stats_path
args.stats_type = stats_type
args.frequency_type = frequency_type
create.action(args)

r = refpkg.Refpkg(args.package_name, create=False)
self.assertIn('phylo_model', r.contents['files'])

def test_create_phyml_aa(self):
stats_path = os.path.join(config.datadir, 'phyml_aa_stats.txt')
self._test_create_phylo_model(stats_path)
self._test_create_phylo_model(stats_path, 'PhyML')
self._test_create_phylo_model(stats_path, frequency_type='model')
self._test_create_phylo_model(stats_path, 'PhyML', frequency_type='model')
self._test_create_phylo_model(stats_path, 'PhyML', frequency_type='empirical')
self.assertRaises(ValueError, self._test_create_phylo_model,
stats_path, 'FastTree')
stats_path, 'FastTree', frequency_type='empirical')
self.assertRaises(ValueError, self._test_create_phylo_model,
stats_path, 'garli')
stats_path, 'garli', frequency_type='empirical')


class TestStrip(OutputRedirectMixin, unittest.TestCase):
Expand Down
10 changes: 9 additions & 1 deletion tests/test_utils.py
@@ -1,4 +1,5 @@

import functools
import logging
import os
import json
Expand Down Expand Up @@ -131,12 +132,19 @@ class FastTreeAATestCase(FastTreeStatsMixin, unittest.TestCase):
test_file_name = 'V278.updated.pruned.log'

class PhyMLStatsMixIn(StatsFileParsingMixIn):
frequency_type = None
@property
def parse_func(self):
return taxtastic.utils.parse_phyml
return functools.partial(taxtastic.utils.parse_phyml,
frequency_type=self.frequency_type)

class PhyMLAminoAcidTestCase(PhyMLStatsMixIn, unittest.TestCase):
test_file_name = 'phyml_aa_stats.txt'
frequency_type = 'model'

class PhyMLEmpiricalAminoAcidTestCase(PhyMLStatsMixIn, unittest.TestCase):
test_file_name = 'phyml_aa_stats_empirical.txt'
frequency_type = 'empirical'

class PhyMLDNATestCase(PhyMLStatsMixIn, unittest.TestCase):
test_file_name = 'phyml_dna_stats.txt'
Expand Down

0 comments on commit 1fcd0dd

Please sign in to comment.