Browse files

Merge branch 'development' of github.com:glennhickey/hal into develop…

…ment
  • Loading branch information...
2 parents 560f1a7 + d306819 commit 287e5d40b2e62eb158aa6d1243096f46762fb159 @glennhickey committed Nov 2, 2013
View
6 phyloP/halPhyloPMP.py
@@ -53,7 +53,8 @@ def getHalPhyloPCmd(options):
opt == 'dupMask' or
opt == 'step' or
opt == 'refBed' or
- opt == 'subtree')):
+ opt == 'subtree' or
+ opt == 'prec')):
if val is not True:
cmd += ' --%s %s' % (opt, str(val))
else:
@@ -270,6 +271,9 @@ def main(argv=None):
hppGrp.add_argument("--subtree",
help="Subtree root for lineage-specific acceleration/conservation",
default=None)
+ hppGrp.add_argument("--prec",
+ help="Number of decimal places in wig output", type=int,
+ default=None)
args = parser.parse_args()
View
20 phyloP/halPhyloPTrain.py
@@ -142,8 +142,8 @@ def computeFit(options):
tree = options.tree
else:
tree = getHalTree(options.hal)
- runShellCommand("phyloFit --tree \"%s\" --subst-mod SSREV --sym-freqs %s --out-root %s" % (
- tree, options.outMafSS,
+ runShellCommand("phyloFit --tree \"%s\" --subst-mod %s --sym-freqs %s --out-root %s" % (
+ tree, options.substMod, options.outMafSS,
os.path.splitext(options.outMod)[0]))
runShellCommand("rm -f %s" % options.outMafSS)
@@ -164,7 +164,8 @@ def computeModel(options):
extractGeneMAFs(options)
computeAgMAFStats(options)
computeFit(options)
- modFreqs(options)
+ if not options.noModFreqs:
+ modFreqs(options)
runShellCommand("rm -f %s" % options.outMafAllPaths)
def main(argv=None):
@@ -210,6 +211,17 @@ def main(argv=None):
" in the alignment. Note that it is best to enclose"
" this string in quotes",
default=None)
+ parser.add_argument("--substMod", help="Substitution model for phyloFit"
+ ": valid options are JC69|F81|HKY85|HKY85+Gap|REV|"
+ "SSREV|UNREST|R2|R2S|U2|U2S|R3|R3S|U3|U3S",
+ default = "SSREV")
+ parser.add_argument("--noModFreqs", help="By default, equilibrium "
+ "frequencies for the nucleotides of the trained model"
+ " are corrected with the observed frequencies of "
+ "the reference genome (using the PHAST modFreqs"
+ " tool. This flag disables this step, and keeps the"
+ " trained frequencies", action="store_true",
+ default=False)
args = parser.parse_args()
@@ -231,6 +243,8 @@ def main(argv=None):
raise RuntimeError("Reference genome %s not found." % args.refGenome)
if os.path.splitext(args.outMod)[1] != ".mod":
raise RuntimeError("Output model must have .mod extension")
+ if not args.substMod in "JC69|F81|HKY85|HKY85+Gap|REV|SSREV|UNREST|R2|R2S|U2|U2S|R3|R3S|U3|U3S".split("|"):
+ raise RuntimeError("Invalid substitution model: %s" % args.substMod)
args.outDir = os.path.dirname(args.outMod)
args.outName = os.path.splitext(os.path.basename(args.outMod))[0]
View
19 phyloP/halTreePhyloP.py
@@ -42,14 +42,20 @@ def computeTreePhyloP(args):
bigwigCmds = []
while len(visitQueue) > 0:
genome = visitQueue.pop()
+ children = getHalChildrenNames(args.hal, genome)
bedFlags = ""
# Generate a bed file of all regions of
# genome that dont align to parent
bedInsertsFile = outFileName(args, genome, "bed", "inserts", True)
if genome != args.root:
- runShellCommand(
- "halAlignedExtract %s %s --alignedFile %s --complement" % (
- args.hal, genome, bedInsertsFile))
+ if True or len(children) > 0:
+ runShellCommand(
+ "halAlignedExtract %s %s --alignedFile %s --complement" % (
+ args.hal, genome, bedInsertsFile))
+ else:
+ #empty file for leaves (ie we dont want to phyloP anything
+ #-- it all gets lifted down)
+ runShellCommand("rm -f %s && touch %s" % (bedInsertsFile, bedInsertsFile))
bedFlags = "--refBed %s" % bedInsertsFile
# Run halPhyloP on the inserts
@@ -58,6 +64,8 @@ def computeTreePhyloP(args):
args.hal, genome, args.mod, bedFlags, args.numProc, wigFile)
if args.subtree is not None:
cmd += " --subtree %s" % args.subtree
+ if args.prec is not None:
+ cmd += " --prec %d" % args.prec
runShellCommand(cmd)
@@ -83,7 +91,6 @@ def computeTreePhyloP(args):
bigwigCmds.append(bwCmd)
# Recurse on children.
- children = getHalChildrenNames(args.hal, genome)
for child in children:
visitQueue.append(child)
@@ -117,6 +124,10 @@ def main(argv=None):
parser.add_argument("--subtree",
help="Run clade-specific acceleration/conservation on subtree below this node",
default=None)
+ parser.add_argument("--prec",
+ help="Number of decimal places in wig output", type=int,
+ default=None)
+
# need phyloP options here:
args = parser.parse_args()
View
4 phyloP/impl/halPhyloP.cpp
@@ -54,10 +54,6 @@ void PhyloP::init(AlignmentConstPtr alignment, const string& modFilePath,
_softMaskDups = (int)softMaskDups;
_outStream = outStream;
- // set float precision to 3 places to be consistent with non-hal phyloP
- _outStream->setf(ios::fixed, ios::floatfield);
- _outStream->precision(3);
-
if (dupType == "ambiguous")
{
_maskAllDups = 0;
View
87 phyloP/impl/halPhyloP.h
@@ -1,87 +0,0 @@
-/*
- * Copyright (C) 2013 by Glenn Hickey (hickey@soe.ucsc.edu) and
- * Melissa Jane Hubisz (Cornell University)
- *
- * Released under the MIT license, see LICENSE.txt
- */
-
-#ifndef _HALPHYLOP_H
-#define _HALPHYLOP_H
-
-#include <cstdlib>
-#include <string>
-#include "hal.h"
-
-extern "C"{
-#include "tree_model.h"
-#include "fit_column.h"
-#include "msa.h"
-#include "sufficient_stats.h"
-#include "hashtable.h"
-}
-
-namespace hal {
-
-/** Use the Phast library methods to compute a PhyloP score for a HAL
- * aligment, column by column. Thanks to Melissa Jane Hubisz. */
-class PhyloP
-{
-public:
-
- PhyloP();
- virtual ~PhyloP();
-
- /** @param dupHardMask true for hard duplication mask or false for
- * soft duplication mask
- * @param dupType ambiguous or all
- * @param phyloPMode "CONACC", "CON", "ACC", "NNEUT" are choices,
- * though I think we are mainly interested in CONACC
- * (conservation/acceleration- negative p-values indicate acceleration)
- * @param subtree If equal to empty string, perform phyloP test on
- * entire tree. Otherwise, subtree names a branch to perform test on
- * subtree relative to rest of tree. The subtree includes all children
- * of the named node as well as the branch leading to the node.
- */
- void init(AlignmentConstPtr alignment, const std::string& modFilePath,
- std::ostream* outStream,
- bool softMaskDups = true,
- const std::string& dupType = "ambiguous",
- const std::string& phyloPMode = "CONACC",
- const std::string& subtree = "\"\"");
-
- void processSequence(const Sequence* sequence,
- hal_index_t start,
- hal_size_t length,
- hal_size_t step);
-
-protected:
-
- // return phyloP score
- double pval(const ColumnIterator::ColumnMap *cmap);
-
- void clear();
-
-protected:
-
- hal::AlignmentConstPtr _alignment;
- TreeModel* _mod;
- TreeModel* _modcpy;
- std::set<const Genome*> _targetSet;
- std::ostream* _outStream;
-
- // 1 default = soft mask, if 0 use hard mask (mask entire column)
- int _softMaskDups;
- int _maskAllDups;
-
- // 0 default = mask only ambiguous bases in dups; if 1 mask any duplication
- hash_table* _seqnameHash;
- ColFitData* _colfitdata;
- ColFitData *_colfitdata2;
- List *_insideNodes;
- List *_outsideNodes;
- mode_type _mode;
- MSA* _msa;
-};
-
-}
-#endif
View
9 phyloP/impl/halPhyloPMain.cpp
@@ -70,6 +70,7 @@ static CLParserPtr initParser()
"leading up to this node), and test for "
"conservation/acceleration in this subtree "
"relative to the rest of the tree", "\"\"");
+ optionsParser->addOption("prec", "Number of decimal places in wig output", 3);
optionsParser->setDescription("Make PhyloP wiggle plot for a genome.");
return optionsParser;
@@ -90,6 +91,7 @@ int main(int argc, char** argv)
hal_size_t length;
hal_size_t step;
string refBedPath;
+ hal_size_t prec;
try
{
optionsParser->parseOptions(argc, argv);
@@ -106,6 +108,7 @@ int main(int argc, char** argv)
subtree = optionsParser->getOption<string>("subtree");
std::transform(dupMask.begin(), dupMask.end(), dupMask.begin(), ::tolower);
refBedPath = optionsParser->getOption<string>("refBed");
+ prec = optionsParser->getOption<hal_size_t>("prec");
}
catch(exception& e)
{
@@ -169,9 +172,13 @@ int main(int argc, char** argv)
}
}
+ // set the precision of floating point output
+ outStream.setf(ios::fixed, ios::floatfield);
+ outStream.precision(prec);
+
PhyloP phyloP;
phyloP.init(alignment, modPath, &outStream, dupMask == "soft" , dupType,
- "CONACC", subtree=subtree);
+ "CONACC", subtree);
ifstream refBedStream;
if (refBedPath != "\"\"")
View
11 phyloP/inc/halPhyloP.h
@@ -37,12 +37,17 @@ class PhyloP
* @param phyloPMode "CONACC", "CON", "ACC", "NNEUT" are choices,
* though I think we are mainly interested in CONACC
* (conservation/acceleration- negative p-values indicate acceleration)
+ * @param subtree If equal to empty string, perform phyloP test on
+ * entire tree. Otherwise, subtree names a branch to perform test on
+ * subtree relative to rest of tree. The subtree includes all children
+ * of the named node as well as the branch leading to the node.
*/
void init(AlignmentConstPtr alignment, const std::string& modFilePath,
std::ostream* outStream,
bool softMaskDups = true,
const std::string& dupType = "ambiguous",
- const std::string& phyloPMode = "CONACC");
+ const std::string& phyloPMode = "CONACC",
+ const std::string& subtree = "\"\"");
void processSequence(const Sequence* sequence,
hal_index_t start,
@@ -60,6 +65,7 @@ class PhyloP
hal::AlignmentConstPtr _alignment;
TreeModel* _mod;
+ TreeModel* _modcpy;
std::set<const Genome*> _targetSet;
std::ostream* _outStream;
@@ -70,6 +76,9 @@ class PhyloP
// 0 default = mask only ambiguous bases in dups; if 1 mask any duplication
hash_table* _seqnameHash;
ColFitData* _colfitdata;
+ ColFitData *_colfitdata2;
+ List *_insideNodes;
+ List *_outsideNodes;
mode_type _mode;
MSA* _msa;
};

0 comments on commit 287e5d4

Please sign in to comment.