Permalink
Browse files

subtree conservation for halPhyloP (thanks Melissa)

and halPhyloP mask option fix
  • Loading branch information...
1 parent 8eff8c2 commit 6bfd7707faa802777d61b4219fbd81b403d018c4 @joelarmstrong joelarmstrong committed Oct 31, 2013
Showing with 181 additions and 22 deletions.
  1. +5 −1 phyloP/halPhyloPMP.py
  2. +13 −3 phyloP/halTreePhyloP.py
  3. +67 −17 phyloP/impl/halPhyloP.cpp
  4. +87 −0 phyloP/impl/halPhyloP.h
  5. +9 −1 phyloP/impl/halPhyloPMain.cpp
View
6 phyloP/halPhyloPMP.py
@@ -52,7 +52,8 @@ def getHalPhyloPCmd(options):
opt == 'dupType' or
opt == 'dupMask' or
opt == 'step' or
- opt == 'refBed')):
+ opt == 'refBed' or
+ opt == 'subtree')):
if val is not True:
cmd += ' --%s %s' % (opt, str(val))
else:
@@ -266,6 +267,9 @@ def main(argv=None):
"reference genome to stream from standard "
" input.",
default=None)
+ hppGrp.add_argument("--subtree",
+ help="Subtree root for lineage-specific acceleration/conservation",
+ default=None)
args = parser.parse_args()
View
16 phyloP/halTreePhyloP.py
@@ -54,8 +54,12 @@ def computeTreePhyloP(args):
# Run halPhyloP on the inserts
wigFile = outFileName(args, genome, "wig", "phyloP", False)
- runShellCommand("halPhyloPMP.py %s %s %s %s --numProc %d %s" % (
- args.hal, genome, args.mod, bedFlags, args.numProc, wigFile))
+ cmd = "halPhyloPMP.py %s %s %s %s --numProc %d %s" % (
+ args.hal, genome, args.mod, bedFlags, args.numProc, wigFile)
+ if args.subtree is not None:
+ cmd += " --subtree %s" % args.subtree
+
+ runShellCommand(cmd)
runShellCommand("rm -f %s" % bedInsertsFile)
@@ -110,6 +114,9 @@ def main(argv=None):
parser.add_argument("--bigWig",
help="Run wigToBigWig on each generated wiggle",
action="store_true", default=False)
+ parser.add_argument("--subtree",
+ help="Run clade-specific acceleration/conservation on subtree below this node",
+ default=None)
# need phyloP options here:
args = parser.parse_args()
@@ -126,10 +133,13 @@ def main(argv=None):
args.halGenomes = getHalGenomes(args.hal)
if args.root is None:
args.root = getHalRootName(args.hal)
-
+
if not args.root in args.halGenomes:
raise RuntimeError("Root genome %s not found." % args.root)
+ if args.subtree is not None and args.root not in args.halGenomes:
+ raise RuntimeError("Subtree root %s not found." % args.subtree)
+
# make a little id tag for temporary maf slices
S = string.ascii_uppercase + string.digits
args.tempID = 'halTreePhyloP' + ''.join(random.choice(S) for x in range(5))
View
84 phyloP/impl/halPhyloP.cpp
@@ -46,7 +46,8 @@ void PhyloP::init(AlignmentConstPtr alignment, const string& modFilePath,
ostream* outStream,
bool softMaskDups,
const string& dupType,
- const string& phyloPMode)
+ const string& phyloPMode,
+ const string &subtree)
{
clear();
_alignment = alignment;
@@ -59,11 +60,11 @@ void PhyloP::init(AlignmentConstPtr alignment, const string& modFilePath,
if (dupType == "ambiguous")
{
- _maskAllDups = 1;
+ _maskAllDups = 0;
}
else if (dupType == "all")
{
- _maskAllDups = 0;
+ _maskAllDups = 1;
}
else
{
@@ -97,6 +98,7 @@ void PhyloP::init(AlignmentConstPtr alignment, const string& modFilePath,
_mod = tm_new_from_file(infile, TRUE);
phast_fclose(infile);
+
// make sure all species in the tree are in the alignment, otherwise print
// warning and prune tree; create targetSet from these species.
// Make a hash of species names to species index (using phast's hash
@@ -131,6 +133,7 @@ void PhyloP::init(AlignmentConstPtr alignment, const string& modFilePath,
lst_free(leafNames);
lst_free(pruneNames);
+
//create a dummy alignment with a single column. As we iterate through the
// columns we will fill in the bases and compute the phyloP scores for
// individual columns.
@@ -144,7 +147,28 @@ void PhyloP::init(AlignmentConstPtr alignment, const string& modFilePath,
_msa = msa_new(seqs, names, numspec, 1, NULL);
ss_from_msas(_msa, 1, 0, NULL, NULL, NULL, -1, FALSE);
msa_free_seqs(_msa);
- _colfitdata = col_init_fit_data(_mod, _msa, ALL, _mode, FALSE);
+
+
+ if (subtree != "\"\"") {
+ _mod->subtree_root = tr_get_node(_mod->tree, subtree.c_str());
+ if (_mod->subtree_root == NULL) {
+ tr_name_ancestors(_mod->tree);
+ _mod->subtree_root = tr_get_node(_mod->tree, subtree.c_str());
+ if (_mod->subtree_root == NULL)
+ throw hal_exception("no node named " + subtree);
+ }
+ _modcpy = tm_create_copy(_mod);
+ _modcpy->subtree_root = NULL;
+ _colfitdata = col_init_fit_data(_modcpy, _msa, ALL, NNEUT, FALSE);
+ _colfitdata2 = col_init_fit_data(_mod, _msa, SUBTREE, _mode, FALSE);
+ _colfitdata2->tupleidx = 0;
+ _insideNodes = lst_new_ptr(_mod->tree->nnodes);
+ _outsideNodes = lst_new_ptr(_mod->tree->nnodes);
+ tr_partition_leaves(_mod->tree, _mod->subtree_root,
+ _insideNodes, _outsideNodes);
+ } else {
+ _colfitdata = col_init_fit_data(_mod, _msa, ALL, _mode, FALSE);
+ }
_colfitdata->tupleidx = 0;
}
@@ -267,7 +291,7 @@ void PhyloP::processSequence(const Sequence* sequence,
else
{
/** Reset the iterator to a non-contiguous position */
- colIt->toSite(pos, last);
+ colIt->toSite(pos, last - 1);
}
}
}
@@ -340,18 +364,43 @@ double PhyloP::pval(const ColumnIterator::ColumnMap *cmap)
//finally, compute the score!
double alt_lnl, null_lnl, this_scale, delta_lnl, pval;
int sigfigs=4; //same value used in phyloP code
- _mod->scale = 1;
- tm_set_subst_matrices(_mod);
- null_lnl = col_compute_log_likelihood(_mod, _msa, 0,
- _colfitdata->fels_scratch[0]);
- vec_set(_colfitdata->params, 0, _colfitdata->init_scale);
- opt_newton_1d(col_likelihood_wrapper_1d, &_colfitdata->params->data[0],
- _colfitdata,
- &alt_lnl, sigfigs, _colfitdata->lb->data[0],
- _colfitdata->ub->data[0],
- NULL, NULL, NULL);
- alt_lnl *= -1;
- this_scale = _colfitdata->params->data[0];
+
+ if (_mod->subtree_root == NULL) {
+ _mod->scale = 1;
+ tm_set_subst_matrices(_mod);
+ null_lnl = col_compute_log_likelihood(_mod, _msa, 0,
+ _colfitdata->fels_scratch[0]);
+ vec_set(_colfitdata->params, 0, _colfitdata->init_scale);
+ opt_newton_1d(col_likelihood_wrapper_1d, &_colfitdata->params->data[0],
+ _colfitdata,
+ &alt_lnl, sigfigs, _colfitdata->lb->data[0],
+ _colfitdata->ub->data[0],
+ NULL, NULL, NULL);
+ alt_lnl *= -1;
+ this_scale = _colfitdata->params->data[0];
+ } else { //subtree case
+ if (!col_has_data_sub(_mod, _msa, 0, _insideNodes, _outsideNodes)) {
+ alt_lnl = 0;
+ null_lnl = 0;
+ this_scale = 1;
+ } else {
+ vec_set(_colfitdata->params, 0, _colfitdata->init_scale);
+ opt_newton_1d(col_likelihood_wrapper_1d, &_colfitdata->params->data[0],
+ _colfitdata,
+ &null_lnl, sigfigs, _colfitdata->lb->data[0],
+ _colfitdata->ub->data[0], NULL, NULL, NULL);
+ null_lnl *= -1;
+
+ vec_set(_colfitdata2->params, 0,
+ max(0.05, _colfitdata->params->data[0]));
+ vec_set(_colfitdata2->params, 1, _colfitdata2->init_scale_sub);
+ opt_bfgs(col_likelihood_wrapper, _colfitdata2->params,
+ _colfitdata2, &alt_lnl, _colfitdata2->lb,
+ _colfitdata2->ub, NULL, NULL, OPT_HIGH_PREC, NULL, NULL);
+ alt_lnl *= -1;
+ this_scale = _colfitdata2->params->data[1];
+ }
+ }
delta_lnl = alt_lnl - null_lnl;
if (delta_lnl <= -0.01)
@@ -360,6 +409,7 @@ double PhyloP::pval(const ColumnIterator::ColumnMap *cmap)
cerr << "got delta_lnl = " << delta_lnl << endl;
throw hal_exception(string("ERROR col_lrts: delta_lnl < 0 "));
}
+ if (delta_lnl < 0) delta_lnl=0;
if (_mode == NNEUT || _mode == CONACC)
{
pval = chisq_cdf(2*delta_lnl, 1, FALSE);
View
87 phyloP/impl/halPhyloP.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2013 by Glenn Hickey (hickey@soe.ucsc.edu) and
+ * Melissa Jane Hubisz (Cornell University)
+ *
+ * Released under the MIT license, see LICENSE.txt
+ */
+
+#ifndef _HALPHYLOP_H
+#define _HALPHYLOP_H
+
+#include <cstdlib>
+#include <string>
+#include "hal.h"
+
+extern "C"{
+#include "tree_model.h"
+#include "fit_column.h"
+#include "msa.h"
+#include "sufficient_stats.h"
+#include "hashtable.h"
+}
+
+namespace hal {
+
+/** Use the Phast library methods to compute a PhyloP score for a HAL
+ * aligment, column by column. Thanks to Melissa Jane Hubisz. */
+class PhyloP
+{
+public:
+
+ PhyloP();
+ virtual ~PhyloP();
+
+ /** @param dupHardMask true for hard duplication mask or false for
+ * soft duplication mask
+ * @param dupType ambiguous or all
+ * @param phyloPMode "CONACC", "CON", "ACC", "NNEUT" are choices,
+ * though I think we are mainly interested in CONACC
+ * (conservation/acceleration- negative p-values indicate acceleration)
+ * @param subtree If equal to empty string, perform phyloP test on
+ * entire tree. Otherwise, subtree names a branch to perform test on
+ * subtree relative to rest of tree. The subtree includes all children
+ * of the named node as well as the branch leading to the node.
+ */
+ void init(AlignmentConstPtr alignment, const std::string& modFilePath,
+ std::ostream* outStream,
+ bool softMaskDups = true,
+ const std::string& dupType = "ambiguous",
+ const std::string& phyloPMode = "CONACC",
+ const std::string& subtree = "\"\"");
+
+ void processSequence(const Sequence* sequence,
+ hal_index_t start,
+ hal_size_t length,
+ hal_size_t step);
+
+protected:
+
+ // return phyloP score
+ double pval(const ColumnIterator::ColumnMap *cmap);
+
+ void clear();
+
+protected:
+
+ hal::AlignmentConstPtr _alignment;
+ TreeModel* _mod;
+ TreeModel* _modcpy;
+ std::set<const Genome*> _targetSet;
+ std::ostream* _outStream;
+
+ // 1 default = soft mask, if 0 use hard mask (mask entire column)
+ int _softMaskDups;
+ int _maskAllDups;
+
+ // 0 default = mask only ambiguous bases in dups; if 1 mask any duplication
+ hash_table* _seqnameHash;
+ ColFitData* _colfitdata;
+ ColFitData *_colfitdata2;
+ List *_insideNodes;
+ List *_outsideNodes;
+ mode_type _mode;
+ MSA* _msa;
+};
+
+}
+#endif
View
10 phyloP/impl/halPhyloPMain.cpp
@@ -65,6 +65,11 @@ static CLParserPtr initParser()
optionsParser->addOption("refBed", "Bed file with coordinates to "
"annotate in the reference genome (or \"stdin\") "
"to stream from standard input", "\"\"");
+ optionsParser->addOption("subtree", "Partition the tree into the subtree "
+ "beneath the node given (including the branch "
+ "leading up to this node), and test for "
+ "conservation/acceleration in this subtree "
+ "relative to the rest of the tree", "\"\"");
optionsParser->setDescription("Make PhyloP wiggle plot for a genome.");
return optionsParser;
@@ -80,6 +85,7 @@ int main(int argc, char** argv)
string refSequenceName;
string dupType;
string dupMask;
+ string subtree;
hal_size_t start;
hal_size_t length;
hal_size_t step;
@@ -97,6 +103,7 @@ int main(int argc, char** argv)
step = optionsParser->getOption<hal_size_t>("step");
dupType = optionsParser->getOption<string>("dupType");
dupMask = optionsParser->getOption<string>("dupMask");
+ subtree = optionsParser->getOption<string>("subtree");
std::transform(dupMask.begin(), dupMask.end(), dupMask.begin(), ::tolower);
refBedPath = optionsParser->getOption<string>("refBed");
}
@@ -163,7 +170,8 @@ int main(int argc, char** argv)
}
PhyloP phyloP;
- phyloP.init(alignment, modPath, &outStream, dupMask == "soft" , dupType);
+ phyloP.init(alignment, modPath, &outStream, dupMask == "soft" , dupType,
+ "CONACC", subtree=subtree);
ifstream refBedStream;
if (refBedPath != "\"\"")

0 comments on commit 6bfd770

Please sign in to comment.