Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

subtree conservation for halPhyloP (thanks Melissa)

and halPhyloP mask option fix
  • Loading branch information...
commit 6bfd7707faa802777d61b4219fbd81b403d018c4 1 parent 8eff8c2
Joel Armstrong joelarmstrong authored
6 phyloP/halPhyloPMP.py
View
@@ -52,7 +52,8 @@ def getHalPhyloPCmd(options):
opt == 'dupType' or
opt == 'dupMask' or
opt == 'step' or
- opt == 'refBed')):
+ opt == 'refBed' or
+ opt == 'subtree')):
if val is not True:
cmd += ' --%s %s' % (opt, str(val))
else:
@@ -266,6 +267,9 @@ def main(argv=None):
"reference genome to stream from standard "
" input.",
default=None)
+ hppGrp.add_argument("--subtree",
+ help="Subtree root for lineage-specific acceleration/conservation",
+ default=None)
args = parser.parse_args()
16 phyloP/halTreePhyloP.py
View
@@ -54,8 +54,12 @@ def computeTreePhyloP(args):
# Run halPhyloP on the inserts
wigFile = outFileName(args, genome, "wig", "phyloP", False)
- runShellCommand("halPhyloPMP.py %s %s %s %s --numProc %d %s" % (
- args.hal, genome, args.mod, bedFlags, args.numProc, wigFile))
+ cmd = "halPhyloPMP.py %s %s %s %s --numProc %d %s" % (
+ args.hal, genome, args.mod, bedFlags, args.numProc, wigFile)
+ if args.subtree is not None:
+ cmd += " --subtree %s" % args.subtree
+
+ runShellCommand(cmd)
runShellCommand("rm -f %s" % bedInsertsFile)
@@ -110,6 +114,9 @@ def main(argv=None):
parser.add_argument("--bigWig",
help="Run wigToBigWig on each generated wiggle",
action="store_true", default=False)
+ parser.add_argument("--subtree",
+ help="Run clade-specific acceleration/conservation on subtree below this node",
+ default=None)
# need phyloP options here:
args = parser.parse_args()
@@ -126,10 +133,13 @@ def main(argv=None):
args.halGenomes = getHalGenomes(args.hal)
if args.root is None:
args.root = getHalRootName(args.hal)
-
+
if not args.root in args.halGenomes:
raise RuntimeError("Root genome %s not found." % args.root)
+ if args.subtree is not None and args.root not in args.halGenomes:
+ raise RuntimeError("Subtree root %s not found." % args.subtree)
+
# make a little id tag for temporary maf slices
S = string.ascii_uppercase + string.digits
args.tempID = 'halTreePhyloP' + ''.join(random.choice(S) for x in range(5))
84 phyloP/impl/halPhyloP.cpp
View
@@ -46,7 +46,8 @@ void PhyloP::init(AlignmentConstPtr alignment, const string& modFilePath,
ostream* outStream,
bool softMaskDups,
const string& dupType,
- const string& phyloPMode)
+ const string& phyloPMode,
+ const string &subtree)
{
clear();
_alignment = alignment;
@@ -59,11 +60,11 @@ void PhyloP::init(AlignmentConstPtr alignment, const string& modFilePath,
if (dupType == "ambiguous")
{
- _maskAllDups = 1;
+ _maskAllDups = 0;
}
else if (dupType == "all")
{
- _maskAllDups = 0;
+ _maskAllDups = 1;
}
else
{
@@ -97,6 +98,7 @@ void PhyloP::init(AlignmentConstPtr alignment, const string& modFilePath,
_mod = tm_new_from_file(infile, TRUE);
phast_fclose(infile);
+
// make sure all species in the tree are in the alignment, otherwise print
// warning and prune tree; create targetSet from these species.
// Make a hash of species names to species index (using phast's hash
@@ -131,6 +133,7 @@ void PhyloP::init(AlignmentConstPtr alignment, const string& modFilePath,
lst_free(leafNames);
lst_free(pruneNames);
+
//create a dummy alignment with a single column. As we iterate through the
// columns we will fill in the bases and compute the phyloP scores for
// individual columns.
@@ -144,7 +147,28 @@ void PhyloP::init(AlignmentConstPtr alignment, const string& modFilePath,
_msa = msa_new(seqs, names, numspec, 1, NULL);
ss_from_msas(_msa, 1, 0, NULL, NULL, NULL, -1, FALSE);
msa_free_seqs(_msa);
- _colfitdata = col_init_fit_data(_mod, _msa, ALL, _mode, FALSE);
+
+
+ if (subtree != "\"\"") {
+ _mod->subtree_root = tr_get_node(_mod->tree, subtree.c_str());
+ if (_mod->subtree_root == NULL) {
+ tr_name_ancestors(_mod->tree);
+ _mod->subtree_root = tr_get_node(_mod->tree, subtree.c_str());
+ if (_mod->subtree_root == NULL)
+ throw hal_exception("no node named " + subtree);
+ }
+ _modcpy = tm_create_copy(_mod);
+ _modcpy->subtree_root = NULL;
+ _colfitdata = col_init_fit_data(_modcpy, _msa, ALL, NNEUT, FALSE);
+ _colfitdata2 = col_init_fit_data(_mod, _msa, SUBTREE, _mode, FALSE);
+ _colfitdata2->tupleidx = 0;
+ _insideNodes = lst_new_ptr(_mod->tree->nnodes);
+ _outsideNodes = lst_new_ptr(_mod->tree->nnodes);
+ tr_partition_leaves(_mod->tree, _mod->subtree_root,
+ _insideNodes, _outsideNodes);
+ } else {
+ _colfitdata = col_init_fit_data(_mod, _msa, ALL, _mode, FALSE);
+ }
_colfitdata->tupleidx = 0;
}
@@ -267,7 +291,7 @@ void PhyloP::processSequence(const Sequence* sequence,
else
{
/** Reset the iterator to a non-contiguous position */
- colIt->toSite(pos, last);
+ colIt->toSite(pos, last - 1);
}
}
}
@@ -340,18 +364,43 @@ double PhyloP::pval(const ColumnIterator::ColumnMap *cmap)
//finally, compute the score!
double alt_lnl, null_lnl, this_scale, delta_lnl, pval;
int sigfigs=4; //same value used in phyloP code
- _mod->scale = 1;
- tm_set_subst_matrices(_mod);
- null_lnl = col_compute_log_likelihood(_mod, _msa, 0,
- _colfitdata->fels_scratch[0]);
- vec_set(_colfitdata->params, 0, _colfitdata->init_scale);
- opt_newton_1d(col_likelihood_wrapper_1d, &_colfitdata->params->data[0],
- _colfitdata,
- &alt_lnl, sigfigs, _colfitdata->lb->data[0],
- _colfitdata->ub->data[0],
- NULL, NULL, NULL);
- alt_lnl *= -1;
- this_scale = _colfitdata->params->data[0];
+
+ if (_mod->subtree_root == NULL) {
+ _mod->scale = 1;
+ tm_set_subst_matrices(_mod);
+ null_lnl = col_compute_log_likelihood(_mod, _msa, 0,
+ _colfitdata->fels_scratch[0]);
+ vec_set(_colfitdata->params, 0, _colfitdata->init_scale);
+ opt_newton_1d(col_likelihood_wrapper_1d, &_colfitdata->params->data[0],
+ _colfitdata,
+ &alt_lnl, sigfigs, _colfitdata->lb->data[0],
+ _colfitdata->ub->data[0],
+ NULL, NULL, NULL);
+ alt_lnl *= -1;
+ this_scale = _colfitdata->params->data[0];
+ } else { //subtree case
+ if (!col_has_data_sub(_mod, _msa, 0, _insideNodes, _outsideNodes)) {
+ alt_lnl = 0;
+ null_lnl = 0;
+ this_scale = 1;
+ } else {
+ vec_set(_colfitdata->params, 0, _colfitdata->init_scale);
+ opt_newton_1d(col_likelihood_wrapper_1d, &_colfitdata->params->data[0],
+ _colfitdata,
+ &null_lnl, sigfigs, _colfitdata->lb->data[0],
+ _colfitdata->ub->data[0], NULL, NULL, NULL);
+ null_lnl *= -1;
+
+ vec_set(_colfitdata2->params, 0,
+ max(0.05, _colfitdata->params->data[0]));
+ vec_set(_colfitdata2->params, 1, _colfitdata2->init_scale_sub);
+ opt_bfgs(col_likelihood_wrapper, _colfitdata2->params,
+ _colfitdata2, &alt_lnl, _colfitdata2->lb,
+ _colfitdata2->ub, NULL, NULL, OPT_HIGH_PREC, NULL, NULL);
+ alt_lnl *= -1;
+ this_scale = _colfitdata2->params->data[1];
+ }
+ }
delta_lnl = alt_lnl - null_lnl;
if (delta_lnl <= -0.01)
@@ -360,6 +409,7 @@ double PhyloP::pval(const ColumnIterator::ColumnMap *cmap)
cerr << "got delta_lnl = " << delta_lnl << endl;
throw hal_exception(string("ERROR col_lrts: delta_lnl < 0 "));
}
+ if (delta_lnl < 0) delta_lnl=0;
if (_mode == NNEUT || _mode == CONACC)
{
pval = chisq_cdf(2*delta_lnl, 1, FALSE);
87 phyloP/impl/halPhyloP.h
View
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2013 by Glenn Hickey (hickey@soe.ucsc.edu) and
+ * Melissa Jane Hubisz (Cornell University)
+ *
+ * Released under the MIT license, see LICENSE.txt
+ */
+
+#ifndef _HALPHYLOP_H
+#define _HALPHYLOP_H
+
+#include <cstdlib>
+#include <string>
+#include "hal.h"
+
+extern "C"{
+#include "tree_model.h"
+#include "fit_column.h"
+#include "msa.h"
+#include "sufficient_stats.h"
+#include "hashtable.h"
+}
+
+namespace hal {
+
+/** Use the Phast library methods to compute a PhyloP score for a HAL
+ * aligment, column by column. Thanks to Melissa Jane Hubisz. */
+class PhyloP
+{
+public:
+
+ PhyloP();
+ virtual ~PhyloP();
+
+ /** @param dupHardMask true for hard duplication mask or false for
+ * soft duplication mask
+ * @param dupType ambiguous or all
+ * @param phyloPMode "CONACC", "CON", "ACC", "NNEUT" are choices,
+ * though I think we are mainly interested in CONACC
+ * (conservation/acceleration- negative p-values indicate acceleration)
+ * @param subtree If equal to empty string, perform phyloP test on
+ * entire tree. Otherwise, subtree names a branch to perform test on
+ * subtree relative to rest of tree. The subtree includes all children
+ * of the named node as well as the branch leading to the node.
+ */
+ void init(AlignmentConstPtr alignment, const std::string& modFilePath,
+ std::ostream* outStream,
+ bool softMaskDups = true,
+ const std::string& dupType = "ambiguous",
+ const std::string& phyloPMode = "CONACC",
+ const std::string& subtree = "\"\"");
+
+ void processSequence(const Sequence* sequence,
+ hal_index_t start,
+ hal_size_t length,
+ hal_size_t step);
+
+protected:
+
+ // return phyloP score
+ double pval(const ColumnIterator::ColumnMap *cmap);
+
+ void clear();
+
+protected:
+
+ hal::AlignmentConstPtr _alignment;
+ TreeModel* _mod;
+ TreeModel* _modcpy;
+ std::set<const Genome*> _targetSet;
+ std::ostream* _outStream;
+
+ // 1 default = soft mask, if 0 use hard mask (mask entire column)
+ int _softMaskDups;
+ int _maskAllDups;
+
+ // 0 default = mask only ambiguous bases in dups; if 1 mask any duplication
+ hash_table* _seqnameHash;
+ ColFitData* _colfitdata;
+ ColFitData *_colfitdata2;
+ List *_insideNodes;
+ List *_outsideNodes;
+ mode_type _mode;
+ MSA* _msa;
+};
+
+}
+#endif
10 phyloP/impl/halPhyloPMain.cpp
View
@@ -65,6 +65,11 @@ static CLParserPtr initParser()
optionsParser->addOption("refBed", "Bed file with coordinates to "
"annotate in the reference genome (or \"stdin\") "
"to stream from standard input", "\"\"");
+ optionsParser->addOption("subtree", "Partition the tree into the subtree "
+ "beneath the node given (including the branch "
+ "leading up to this node), and test for "
+ "conservation/acceleration in this subtree "
+ "relative to the rest of the tree", "\"\"");
optionsParser->setDescription("Make PhyloP wiggle plot for a genome.");
return optionsParser;
@@ -80,6 +85,7 @@ int main(int argc, char** argv)
string refSequenceName;
string dupType;
string dupMask;
+ string subtree;
hal_size_t start;
hal_size_t length;
hal_size_t step;
@@ -97,6 +103,7 @@ int main(int argc, char** argv)
step = optionsParser->getOption<hal_size_t>("step");
dupType = optionsParser->getOption<string>("dupType");
dupMask = optionsParser->getOption<string>("dupMask");
+ subtree = optionsParser->getOption<string>("subtree");
std::transform(dupMask.begin(), dupMask.end(), dupMask.begin(), ::tolower);
refBedPath = optionsParser->getOption<string>("refBed");
}
@@ -163,7 +170,8 @@ int main(int argc, char** argv)
}
PhyloP phyloP;
- phyloP.init(alignment, modPath, &outStream, dupMask == "soft" , dupType);
+ phyloP.init(alignment, modPath, &outStream, dupMask == "soft" , dupType,
+ "CONACC", subtree=subtree);
ifstream refBedStream;
if (refBedPath != "\"\"")
Please sign in to comment.
Something went wrong with that request. Please try again.