Permalink
Browse files

Merge branch 'blockViz' into chains

  • Loading branch information...
2 parents b62c591 + 61d2f32 commit eb069698e23768c9da09002880580c9b7830d765 @joelarmstrong joelarmstrong committed Nov 6, 2013
View
39 chain/impl/halBlockViz.cpp
@@ -182,7 +182,8 @@ extern "C" void halFreeBlocks(struct hal_block_t* head)
while (head != NULL)
{
hal_block_t* next = head->next;
- free(head->sequence);
+ free(head->qSequence);
+ free(head->tSequence);
free(head->qChrom);
free(head);
head = next;
@@ -707,7 +708,8 @@ void readBlock(AlignmentConstPtr seqAlignment,
cur->next = NULL;
string seqBuffer = qSequence->getName();
- string dnaBuffer;
+ string qDnaBuffer;
+ string tDnaBuffer;
size_t prefix =
seqBuffer.find(genomeName + '.') != 0 ? 0 : genomeName.length() + 1;
cur->qChrom = (char*)malloc(seqBuffer.length() + 1 - prefix);
@@ -737,7 +739,8 @@ void readBlock(AlignmentConstPtr seqAlignment,
assert(firstRefSeg->getLength() == firstQuerySeg->getLength());
cur->size = 1 + tEnd - cur->tStart;
cur->strand = firstQuerySeg->getReversed() ? '-' : '+';
- cur->sequence = NULL;
+ cur->tSequence = NULL;
+ cur->qSequence = NULL;
if (getSequenceString != 0)
{
const Genome* qSeqGenome =
@@ -757,14 +760,36 @@ void readBlock(AlignmentConstPtr seqAlignment,
<< " for DNA sequence extraction";
throw hal_exception(ss.str());
}
+
+ const Genome* tSeqGenome =
+ seqAlignment->openGenome(tSequence->getGenome()->getName());
+ if (tSeqGenome == NULL)
+ {
+ stringstream ss;
+ ss << "Unable to open genome " << tSequence->getGenome()->getName()
+ << " for DNA sequence extraction";
+ throw hal_exception(ss.str());
+ }
+
+ const Sequence* tSeqSequence = tSeqGenome->getSequence(tSequence->getName());
+ if (tSeqSequence == NULL)
+ {
+ stringstream ss;
+ ss << "Unable to open sequence " << tSequence->getName()
+ << " for DNA sequence extraction";
+ throw hal_exception(ss.str());
+ }
- qSeqSequence->getSubString(dnaBuffer, cur->qStart, cur->size);
+ qSeqSequence->getSubString(qDnaBuffer, cur->qStart, cur->size);
+ tSeqSequence->getSubString(tDnaBuffer, cur->tStart, cur->size);
if (cur->strand == '-')
{
- reverseComplement(dnaBuffer);
+ reverseComplement(qDnaBuffer);
}
- cur->sequence = (char*)malloc(dnaBuffer.length() * sizeof(char) + 1);
- strcpy(cur->sequence, dnaBuffer.c_str());
+ cur->qSequence = (char*)malloc(qDnaBuffer.length() * sizeof(char) + 1);
+ cur->tSequence = (char*)malloc(tDnaBuffer.length() * sizeof(char) + 1);
+ strcpy(cur->qSequence, qDnaBuffer.c_str());
+ strcpy(cur->tSequence, tDnaBuffer.c_str());
}
}
View
3 chain/inc/halBlockViz.h
@@ -57,7 +57,8 @@ struct hal_block_t
hal_int_t qStart;
hal_int_t size;
char strand;
- char *sequence;
+ char *qSequence; // query DNA, if requested
+ char *tSequence; // target DNA, if requested
};
/** Some information about a genome */
View
4 chain/test/blockVizTest.c
@@ -70,8 +70,8 @@ static int parseArgs(int argc, char** argv, bv_args_t* args)
static void printBlock(FILE* file, struct hal_block_t* b)
{
- fprintf(file, "chr:%s, tSt:%ld, qSt:%ld, size:%ld, strand:%c: %s\n",
- b->qChrom, b->tStart, b->qStart, b->size, b->strand, b->sequence);
+ fprintf(file, "chr:%s, tSt:%ld, qSt:%ld, size:%ld, strand:%c: tgt : %s query: %s\n",
+ b->qChrom, b->tStart, b->qStart, b->size, b->strand, b->tSequence, b->qSequence);
}
static void printDupeList(FILE* file, struct hal_target_dupe_list_t* d)
View
16 extract/hal4dExtract.cpp
@@ -1,16 +0,0 @@
-/*
- * Copyright (C) 2013 by Glenn Hickey (hickey@soe.ucsc.edu)
- *
- * Released under the MIT license, see LICENSE.txt
- */
-
-#include <cstdlib>
-#include <iostream>
-#include <fstream>
-#include "hal.h"
-#include "halBedScanner.h"
-
-using namespace std;
-using namespace hal;
-
-
View
5 include.mk
@@ -26,6 +26,11 @@ HDF5_CCLINKER = ${cxx}
cpp = h5c++ ${h5prefix}
cxx = h5cc ${h5prefix}
+ifeq (${SYS},Darwin) #This is to deal with the Mavericks replacing gcc with clang fully and changing libraries
+ cppflags += -stdlib=libstdc++
+ cflags += -stdlib=libstdc++
+endif
+
# add compiler flag and kent paths if udc is enabled
# relies on KENTSRC containing path to top level kent/ dir
# and MACHTYPE being specified
View
1 liftover/impl/halBedLine.cpp
@@ -30,6 +30,7 @@ istream& BedLine::read(istream& is, int version, string& lineBuffer)
_version = version;
std::getline(is, lineBuffer);
stringstream ss(lineBuffer);
+ ss.imbue(is.getloc());
ss >> _chrName;
if (ss.bad() || ss.fail())
{
View
40 liftover/impl/halBedScanner.cpp
@@ -25,10 +25,15 @@ BedScanner::~BedScanner()
}
-void BedScanner::scan(const string& bedPath, int bedVersion)
+void BedScanner::scan(const string& bedPath, int bedVersion,
+ const locale* inLocale)
{
assert(_bedStream == NULL);
_bedStream = new ifstream(bedPath.c_str());
+ if (inLocale != NULL)
+ {
+ _bedStream->imbue(*inLocale);
+ }
try {
scan(_bedStream, bedVersion);
@@ -46,11 +51,15 @@ void BedScanner::scan(const string& bedPath, int bedVersion)
_bedStream = NULL;
}
-void BedScanner::scan(istream* is, int bedVersion)
+void BedScanner::scan(istream* is, int bedVersion, const locale* inLocale)
{
visitBegin();
_bedStream = is;
_bedVersion = bedVersion;
+ if (inLocale != NULL)
+ {
+ _bedStream->imbue(*inLocale);
+ }
if (_bedVersion == -1)
{
@@ -65,13 +74,13 @@ void BedScanner::scan(istream* is, int bedVersion)
_lineNumber = 0;
try
{
- skipWhiteSpaces(_bedStream);
+ skipWhiteSpaces(_bedStream, inLocale);
while (_bedStream->good())
{
++_lineNumber;
_bedLine.read(*_bedStream, _bedVersion, lineBuffer);
visitLine();
- skipWhiteSpaces(_bedStream);
+ skipWhiteSpaces(_bedStream, inLocale);
}
}
catch(hal_exception e)
@@ -84,13 +93,18 @@ void BedScanner::scan(istream* is, int bedVersion)
_bedStream = NULL;
}
-int BedScanner::getBedVersion(istream* bedStream)
+int BedScanner::getBedVersion(istream* bedStream, const locale* inLocale)
{
assert(bedStream != &cin);
if (bedStream->bad())
{
throw hal_exception("Error reading bed input stream");
}
+ if (inLocale != NULL)
+ {
+ bedStream->imbue(*inLocale);
+ }
+
string lineBuffer;
BedLine bedLine;
int version = 12;
@@ -101,7 +115,7 @@ int BedScanner::getBedVersion(istream* bedStream)
{
bedStream->clear();
bedStream->seekg(pos);
- skipWhiteSpaces(bedStream);
+ skipWhiteSpaces(bedStream, inLocale);
*bedStream >> std::skipws;
bedLine.read(*bedStream, version, lineBuffer);
break;
@@ -120,9 +134,14 @@ int BedScanner::getBedVersion(istream* bedStream)
return version;
}
-size_t BedScanner::getNumColumns(const string& bedLine)
+size_t BedScanner::getNumColumns(const string& bedLine,
+ const locale* inLocale)
{
stringstream ss(bedLine);
+ if (inLocale != NULL)
+ {
+ ss.imbue(*inLocale);
+ }
size_t c = 0;
string buffer;
while (ss.good())
@@ -148,9 +167,12 @@ void BedScanner::visitEOF()
{
}
-void BedScanner::skipWhiteSpaces(istream* bedStream)
+void BedScanner::skipWhiteSpaces(istream* bedStream,
+ const locale* inLocale)
{
- while (bedStream->good() && std::isspace(bedStream->peek()))
+ locale defaultLocale;
+ const locale& myLocale = inLocale == NULL ? defaultLocale : *inLocale;
+ while (bedStream->good() && std::isspace((char)bedStream->peek(), myLocale))
{
bedStream->get();
}
View
14 liftover/impl/halLiftover.cpp
@@ -33,7 +33,8 @@ void Liftover::convert(AlignmentConstPtr alignment,
int outBedVersion,
bool addExtraColumns,
bool traverseDupes,
- bool outPSL)
+ bool outPSL,
+ const locale* inLocale)
{
_srcGenome = srcGenome;
_tgtGenome = tgtGenome;
@@ -43,6 +44,7 @@ void Liftover::convert(AlignmentConstPtr alignment,
_outBedVersion = outBedVersion;
_traverseDupes = traverseDupes;
_outPSL = outPSL;
+ _inLocale = inLocale;
_missedSet.clear();
_tgtSet.clear();
assert(_srcGenome && inBedStream && tgtGenome && outBedStream);
@@ -56,12 +58,12 @@ void Liftover::convert(AlignmentConstPtr alignment,
stringstream* firstLineStream = NULL;
if (_inBedVersion <= 0)
{
- skipWhiteSpaces(inBedStream);
+ skipWhiteSpaces(inBedStream, _inLocale);
std::getline(*inBedStream, firstLineBuffer);
firstLineStream = new stringstream(firstLineBuffer);
- _inBedVersion = BedScanner::getBedVersion(firstLineStream);
+ _inBedVersion = BedScanner::getBedVersion(firstLineStream, inLocale);
assert(inBedStream->eof() || _inBedVersion >= 3);
- size_t numCols = BedScanner::getNumColumns(firstLineBuffer);
+ size_t numCols = BedScanner::getNumColumns(firstLineBuffer, inLocale);
if ((int)numCols > _inBedVersion)
{
cerr << "Warning: auto-detecting input BED version " << _inBedVersion
@@ -75,10 +77,10 @@ void Liftover::convert(AlignmentConstPtr alignment,
if (firstLineStream != NULL)
{
- scan(firstLineStream, _inBedVersion);
+ scan(firstLineStream, _inBedVersion, inLocale);
delete firstLineStream;
}
- scan(inBedStream, _inBedVersion);
+ scan(inBedStream, _inBedVersion, inLocale);
}
void Liftover::visitBegin()
View
20 liftover/impl/halLiftoverMain.cpp
@@ -9,6 +9,7 @@
#include <fstream>
#include "halColumnLiftover.h"
#include "halBlockLiftover.h"
+#include "halTabFacet.h"
using namespace std;
using namespace hal;
@@ -43,7 +44,10 @@ static CLParserPtr initParser()
"columns in the input beyond the specified or "
"detected bed version, and which are cut by "
"default.", false);
-
+ optionsParser->addOptionFlag("tab", "input is tab-separated. this allows"
+ " column entries to contain spaces. if this"
+ " flag is not set, both spaces and tabs are"
+ " used to separate input columns.", false);
optionsParser->setDescription("Map BED genome interval coordinates between "
"two genomes.");
return optionsParser;
@@ -64,6 +68,7 @@ int main(int argc, char** argv)
int outBedVersion;
bool keepExtra;
bool outPSL;
+ bool tab;
try
{
optionsParser->parseOptions(argc, argv);
@@ -78,6 +83,7 @@ int main(int argc, char** argv)
outBedVersion = optionsParser->getOption<int>("outBedVersion");
keepExtra = optionsParser->getFlag("keepExtra");
outPSL = optionsParser->getFlag("outPSL");
+ tab = optionsParser->getFlag("tab");
}
catch(exception& e)
{
@@ -145,11 +151,21 @@ int main(int argc, char** argv)
throw hal_exception("Error opening tgtBed, " + tgtBedPath);
}
}
+
+ locale* inLocale = NULL;
+ if (tab == true)
+ {
+ inLocale = new locale(cin.getloc(), new TabSepFacet(cin.getloc()));
+ assert(std::isspace('\t', *inLocale) == true);
+ assert(std::isspace(' ', *inLocale) == false);
+ }
BlockLiftover liftover;
liftover.convert(alignment, srcGenome, srcBedPtr, tgtGenome, tgtBedPtr,
inBedVersion, outBedVersion, keepExtra, !noDupes,
- outPSL);
+ outPSL, inLocale);
+
+ delete inLocale;
}
catch(hal_exception& e)
View
26 liftover/impl/halWiggleLiftover.cpp
@@ -8,6 +8,7 @@
#include <cassert>
#include "halWiggleLiftover.h"
#include "halBlockMapper.h"
+#include "halWiggleLoader.h"
using namespace std;
using namespace hal;
@@ -25,6 +26,15 @@ WiggleLiftover::~WiggleLiftover()
}
+void WiggleLiftover::preloadOutput(AlignmentConstPtr alignment,
+ const Genome* tgtGenome,
+ istream* inputFile)
+{
+ WiggleLoader loader;
+ _outVals.init(tgtGenome->getSequenceLength(), DefaultValue, DefaultTileSize);
+ loader.load(alignment, tgtGenome, inputFile, &_outVals);
+}
+
void WiggleLiftover::convert(AlignmentConstPtr alignment,
const Genome* srcGenome,
istream* inputFile,
@@ -56,7 +66,12 @@ void WiggleLiftover::convert(AlignmentConstPtr alignment,
inputSet.insert(_srcGenome);
inputSet.insert(_tgtGenome);
getGenomesInSpanningTree(inputSet, _tgtSet);
- _outVals.init(tgtGenome->getSequenceLength(), DefaultValue, DefaultTileSize);
+ // if not init'd by preload()...
+ if (_outVals.getGenomeSize() == 0)
+ {
+ _outVals.init(tgtGenome->getSequenceLength(), DefaultValue,
+ DefaultTileSize);
+ }
scan(inputFile);
write();
_outVals.clear();
@@ -97,11 +112,8 @@ void WiggleLiftover::visitLine()
{
throw hal_exception("Coordinate out of order");
}
- if (_value != DefaultValue)
- {
- CoordVal cv = {absFirst, absLast, _value};
- _cvals.push_back(cv);
- }
+ CoordVal cv = {absFirst, absLast, _value};
+ _cvals.push_back(cv);
}
void WiggleLiftover::visitEOF()
@@ -194,7 +206,7 @@ void WiggleLiftover::mapFragments(vector<MappedSegmentConstPtr>& fragments)
}
else
{
- mpos = seg->getEndPosition() - j;
+ mpos = seg->getStartPosition() - j;
}
if (_cvIdx < _cvals.size() && _cvals[_cvIdx]._first <= pos &&
_cvals[_cvIdx]._last >= pos)
View
25 liftover/impl/halWiggleLiftoverMain.cpp
@@ -24,7 +24,10 @@ static CLParserPtr initParser()
" to stream to standard output.");
optionsParser->addOptionFlag("noDupes", "do not map between duplications in"
" graph.", false);
- optionsParser->addOptionFlag("append", "append results to tgtWig", false);
+ optionsParser->addOptionFlag("append", "append/merge results into tgtWig. "
+ "Note that the entire tgtWig file will be loaded into"
+ " memory then overwritten, so this data can be lost "
+ "in event of a crash", false);
/* optionsParser->addOptionFlag("unique",
"only map block if its left-most paralog is in"
"the input. this "
@@ -107,8 +110,19 @@ int main(int argc, char** argv)
throw hal_exception("Error opening srcWig, " + srcWigPath);
}
}
-
- ios_base::openmode mode = append ? ios::out | ios::app : ios_base::out;
+
+ WiggleLiftover liftover;
+ if (append == true && tgtWigPath != "stdout")
+ {
+ // load the wig data into memory so that it can be properly merged
+ // with the new data from the liftover.
+ ifstream tgtWig(tgtWigPath.c_str());
+ if (tgtWig)
+ {
+ liftover.preloadOutput(alignment, tgtGenome, &tgtWig);
+ }
+ }
+
ofstream tgtWig;
ostream* tgtWigPtr;
if (tgtWigPath == "stdout")
@@ -117,15 +131,14 @@ int main(int argc, char** argv)
}
else
{
- tgtWig.open(tgtWigPath.c_str(), mode);
+ tgtWig.open(tgtWigPath.c_str());
tgtWigPtr = &tgtWig;
if (!tgtWig)
{
throw hal_exception("Error opening tgtWig, " + tgtWigPath);
}
}
-
- WiggleLiftover liftover;
+
liftover.convert(alignment, srcGenome, srcWigPtr, tgtGenome, tgtWigPtr,
!noDupes, unique);
}
View
64 liftover/impl/halWiggleLoader.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 2013 by Glenn Hickey (hickey@soe.ucsc.edu)
+ *
+ * Released under the MIT license, see LICENSE.txt
+ */
+
+#include <deque>
+#include <cassert>
+#include "halWiggleLoader.h"
+#include "halWiggleLiftover.h"
+
+using namespace std;
+using namespace hal;
+
+
+WiggleLoader::WiggleLoader()
+{
+
+}
+
+WiggleLoader::~WiggleLoader()
+{
+
+}
+
+void WiggleLoader::load(AlignmentConstPtr alignment,
+ const Genome* genome,
+ istream* inputFile,
+ WiggleTiles<double>* vals)
+{
+ _alignment = alignment;
+ _srcGenome = genome;
+ _srcSequence = NULL;
+ _vals = vals;
+ scan(inputFile);
+}
+
+void WiggleLoader::visitHeader()
+{
+ _srcSequence = _srcGenome->getSequence(_sequenceName);
+ if (_srcSequence == NULL)
+ {
+ stringstream ss;
+ ss << "Sequence " << _sequenceName << " not found in genome "
+ << _srcGenome->getName();
+ throw hal_exception(ss.str());
+ }
+}
+
+void WiggleLoader::visitLine()
+{
+ if (_srcSequence == NULL)
+ {
+ throw hal_exception("Missing Wig header");
+ }
+
+ hal_index_t absFirst = _first + _srcSequence->getStartPosition();
+ hal_index_t absLast = _last + _srcSequence->getStartPosition();
+
+ for (hal_index_t absPos = absFirst; absPos <= absLast; ++absPos)
+ {
+ _vals->set(absPos, _value);
+ }
+}
View
16 liftover/inc/halBedScanner.h
@@ -13,6 +13,7 @@
#include <vector>
#include <string>
#include <fstream>
+#include <locale>
#include "hal.h"
#include "halBedLine.h"
@@ -26,19 +27,24 @@ class BedScanner
public:
BedScanner();
virtual ~BedScanner();
- virtual void scan(const std::string& bedPath, int bedVersion = -1);
- virtual void scan(std::istream* bedStream, int bedVersion = -1);
+ virtual void scan(const std::string& bedPath, int bedVersion = -1,
+ const std::locale* inLocale = NULL);
+ virtual void scan(std::istream* bedStream, int bedVersion = -1,
+ const std::locale* inLocale = NULL);
- static int getBedVersion(std::istream* bedStream);
- static size_t getNumColumns(const std::string& bedLine);
+ static int getBedVersion(std::istream* bedStream,
+ const std::locale* inLocale = NULL);
+ static size_t getNumColumns(const std::string& bedLine,
+ const std::locale* inLocale = NULL);
protected:
virtual void visitBegin();
virtual void visitLine();
virtual void visitEOF();
- static void skipWhiteSpaces(std::istream* bedStream);
+ static void skipWhiteSpaces(std::istream* bedStream,
+ const std::locale* inLocale = NULL);
protected:
View
5 liftover/inc/halLiftover.h
@@ -11,6 +11,7 @@
#include <string>
#include <fstream>
#include <iostream>
+#include <locale>
#include "halBedScanner.h"
namespace hal {
@@ -31,7 +32,8 @@ class Liftover : public BedScanner
int outBedVersion = -1,
bool addExtraColumns = false,
bool traverseDupes = true,
- bool outPSL = false);
+ bool outPSL = false,
+ const std::locale* inLocale = NULL);
protected:
@@ -60,6 +62,7 @@ class Liftover : public BedScanner
int _inBedVersion;
int _outBedVersion;
bool _outPSL;
+ const std::locale* _inLocale;
BedList _mappedBlocks;
View
49 liftover/inc/halTabFacet.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2013 by Glenn Hickey (hickey@soe.ucsc.edu)
+ *
+ * Released under the MIT license, see LICENSE.txt
+ */
+
+// based on:
+//http://stackoverflow.com/questions/9085471/is-there-a-flag-to-make-istream-treat-only-tabs-as-delimiters
+
+#include <locale>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <sstream>
+
+// This is my facet:
+// It is designed to treat only <tab> as whitespace
+class TabSepFacet: public std::ctype<char>
+{
+public:
+ typedef std::ctype<char> base;
+ typedef base::char_type char_type;
+
+ TabSepFacet(std::locale const& l) : base(table)
+ {
+ // Get the ctype facet of the current locale
+ std::ctype<char> const& defaultCType =
+ std::use_facet<std::ctype<char> >(l);
+
+ // Copy the default flags for each character from the current facet
+ static char data[256];
+ for(int loop = 0; loop < 256; ++loop)
+ {
+ data[loop] = loop;
+ }
+ defaultCType.is(data, data+256, table);
+
+ // Remove the space character
+ if (table[' '] & base::space)
+ {
+ table[' '] ^= base::space;
+ }
+
+ // Add the tab character
+ table['\t'] |= base::space;
+ }
+private:
+ base::mask table[256];
+};
View
6 liftover/inc/halWiggleLiftover.h
@@ -22,7 +22,11 @@ class WiggleLiftover : public WiggleScanner
WiggleLiftover();
virtual ~WiggleLiftover();
-
+
+ void preloadOutput(AlignmentConstPtr alignment,
+ const Genome* tgtGenome,
+ std::istream* inputFile);
+
void convert(AlignmentConstPtr alignment,
const Genome* srcGenome,
std::istream* inputFile,
View
47 liftover/inc/halWiggleLoader.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2013 by Glenn Hickey (hickey@soe.ucsc.edu)
+ *
+ * Released under the MIT license, see LICENSE.txt
+ */
+
+#ifndef _HALWIGGLELOADER_H
+#define _HALWIGGLELOADER_H
+
+#include <vector>
+#include <string>
+#include <fstream>
+#include <iostream>
+#include "halWiggleScanner.h"
+#include "halWiggleTiles.h"
+
+namespace hal {
+
+/** Quick hack to load a wiggle into memory, in order to have wiggleLiftover
+ * --append option work better. Ideally would be base class of WiggleLiftover
+ * but don't have time to refactor right now */
+class WiggleLoader : public WiggleScanner
+{
+public:
+
+ WiggleLoader();
+ virtual ~WiggleLoader();
+
+ void load(AlignmentConstPtr alignment,
+ const Genome* genome,
+ std::istream* inputFile,
+ WiggleTiles<double>* vals);
+
+
+protected:
+
+ virtual void visitLine();
+ virtual void visitHeader();
+
+ AlignmentConstPtr _alignment;
+ const Genome* _srcGenome;
+ const Sequence* _srcSequence;
+ WiggleTiles<double>* _vals;
+};
+
+}
+#endif
View
10 phyloP/halPhyloPMP.py
@@ -52,7 +52,9 @@ def getHalPhyloPCmd(options):
opt == 'dupType' or
opt == 'dupMask' or
opt == 'step' or
- opt == 'refBed')):
+ opt == 'refBed' or
+ opt == 'subtree' or
+ opt == 'prec')):
if val is not True:
cmd += ' --%s %s' % (opt, str(val))
else:
@@ -266,6 +268,12 @@ def main(argv=None):
"reference genome to stream from standard "
" input.",
default=None)
+ hppGrp.add_argument("--subtree",
+ help="Subtree root for lineage-specific acceleration/conservation",
+ default=None)
+ hppGrp.add_argument("--prec",
+ help="Number of decimal places in wig output", type=int,
+ default=None)
args = parser.parse_args()
View
20 phyloP/halPhyloPTrain.py
@@ -142,8 +142,8 @@ def computeFit(options):
tree = options.tree
else:
tree = getHalTree(options.hal)
- runShellCommand("phyloFit --tree \"%s\" --subst-mod SSREV --sym-freqs %s --out-root %s" % (
- tree, options.outMafSS,
+ runShellCommand("phyloFit --tree \"%s\" --subst-mod %s --sym-freqs %s --out-root %s" % (
+ tree, options.substMod, options.outMafSS,
os.path.splitext(options.outMod)[0]))
runShellCommand("rm -f %s" % options.outMafSS)
@@ -164,7 +164,8 @@ def computeModel(options):
extractGeneMAFs(options)
computeAgMAFStats(options)
computeFit(options)
- modFreqs(options)
+ if not options.noModFreqs:
+ modFreqs(options)
runShellCommand("rm -f %s" % options.outMafAllPaths)
def main(argv=None):
@@ -210,6 +211,17 @@ def main(argv=None):
" in the alignment. Note that it is best to enclose"
" this string in quotes",
default=None)
+ parser.add_argument("--substMod", help="Substitution model for phyloFit"
+ ": valid options are JC69|F81|HKY85|HKY85+Gap|REV|"
+ "SSREV|UNREST|R2|R2S|U2|U2S|R3|R3S|U3|U3S",
+ default = "SSREV")
+ parser.add_argument("--noModFreqs", help="By default, equilibrium "
+ "frequencies for the nucleotides of the trained model"
+ " are corrected with the observed frequencies of "
+ "the reference genome (using the PHAST modFreqs"
+ " tool. This flag disables this step, and keeps the"
+ " trained frequencies", action="store_true",
+ default=False)
args = parser.parse_args()
@@ -231,6 +243,8 @@ def main(argv=None):
raise RuntimeError("Reference genome %s not found." % args.refGenome)
if os.path.splitext(args.outMod)[1] != ".mod":
raise RuntimeError("Output model must have .mod extension")
+ if not args.substMod in "JC69|F81|HKY85|HKY85+Gap|REV|SSREV|UNREST|R2|R2S|U2|U2S|R3|R3S|U3|U3S".split("|"):
+ raise RuntimeError("Invalid substitution model: %s" % args.substMod)
args.outDir = os.path.dirname(args.outMod)
args.outName = os.path.splitext(os.path.basename(args.outMod))[0]
View
22 phyloP/halTreePhyloP.py
@@ -54,8 +54,14 @@ def computeTreePhyloP(args):
# Run halPhyloP on the inserts
wigFile = outFileName(args, genome, "wig", "phyloP", False)
- runShellCommand("halPhyloPMP.py %s %s %s %s --numProc %d %s" % (
- args.hal, genome, args.mod, bedFlags, args.numProc, wigFile))
+ cmd = "halPhyloPMP.py %s %s %s %s --numProc %d %s" % (
+ args.hal, genome, args.mod, bedFlags, args.numProc, wigFile)
+ if args.subtree is not None:
+ cmd += " --subtree %s" % args.subtree
+ if args.prec is not None:
+ cmd += " --prec %d" % args.prec
+
+ runShellCommand(cmd)
runShellCommand("rm -f %s" % bedInsertsFile)
@@ -110,6 +116,13 @@ def main(argv=None):
parser.add_argument("--bigWig",
help="Run wigToBigWig on each generated wiggle",
action="store_true", default=False)
+ parser.add_argument("--subtree",
+ help="Run clade-specific acceleration/conservation on subtree below this node",
+ default=None)
+ parser.add_argument("--prec",
+ help="Number of decimal places in wig output", type=int,
+ default=None)
+
# need phyloP options here:
args = parser.parse_args()
@@ -126,10 +139,13 @@ def main(argv=None):
args.halGenomes = getHalGenomes(args.hal)
if args.root is None:
args.root = getHalRootName(args.hal)
-
+
if not args.root in args.halGenomes:
raise RuntimeError("Root genome %s not found." % args.root)
+ if args.subtree is not None and args.root not in args.halGenomes:
+ raise RuntimeError("Subtree root %s not found." % args.subtree)
+
# make a little id tag for temporary maf slices
S = string.ascii_uppercase + string.digits
args.tempID = 'halTreePhyloP' + ''.join(random.choice(S) for x in range(5))
View
88 phyloP/impl/halPhyloP.cpp
@@ -46,24 +46,21 @@ void PhyloP::init(AlignmentConstPtr alignment, const string& modFilePath,
ostream* outStream,
bool softMaskDups,
const string& dupType,
- const string& phyloPMode)
+ const string& phyloPMode,
+ const string &subtree)
{
clear();
_alignment = alignment;
_softMaskDups = (int)softMaskDups;
_outStream = outStream;
- // set float precision to 3 places to be consistent with non-hal phyloP
- _outStream->setf(ios::fixed, ios::floatfield);
- _outStream->precision(3);
-
if (dupType == "ambiguous")
{
- _maskAllDups = 1;
+ _maskAllDups = 0;
}
else if (dupType == "all")
{
- _maskAllDups = 0;
+ _maskAllDups = 1;
}
else
{
@@ -97,6 +94,7 @@ void PhyloP::init(AlignmentConstPtr alignment, const string& modFilePath,
_mod = tm_new_from_file(infile, TRUE);
phast_fclose(infile);
+
// make sure all species in the tree are in the alignment, otherwise print
// warning and prune tree; create targetSet from these species.
// Make a hash of species names to species index (using phast's hash
@@ -131,6 +129,7 @@ void PhyloP::init(AlignmentConstPtr alignment, const string& modFilePath,
lst_free(leafNames);
lst_free(pruneNames);
+
//create a dummy alignment with a single column. As we iterate through the
// columns we will fill in the bases and compute the phyloP scores for
// individual columns.
@@ -144,7 +143,28 @@ void PhyloP::init(AlignmentConstPtr alignment, const string& modFilePath,
_msa = msa_new(seqs, names, numspec, 1, NULL);
ss_from_msas(_msa, 1, 0, NULL, NULL, NULL, -1, FALSE);
msa_free_seqs(_msa);
- _colfitdata = col_init_fit_data(_mod, _msa, ALL, _mode, FALSE);
+
+
+ if (subtree != "\"\"") {
+ _mod->subtree_root = tr_get_node(_mod->tree, subtree.c_str());
+ if (_mod->subtree_root == NULL) {
+ tr_name_ancestors(_mod->tree);
+ _mod->subtree_root = tr_get_node(_mod->tree, subtree.c_str());
+ if (_mod->subtree_root == NULL)
+ throw hal_exception("no node named " + subtree);
+ }
+ _modcpy = tm_create_copy(_mod);
+ _modcpy->subtree_root = NULL;
+ _colfitdata = col_init_fit_data(_modcpy, _msa, ALL, NNEUT, FALSE);
+ _colfitdata2 = col_init_fit_data(_mod, _msa, SUBTREE, _mode, FALSE);
+ _colfitdata2->tupleidx = 0;
+ _insideNodes = lst_new_ptr(_mod->tree->nnodes);
+ _outsideNodes = lst_new_ptr(_mod->tree->nnodes);
+ tr_partition_leaves(_mod->tree, _mod->subtree_root,
+ _insideNodes, _outsideNodes);
+ } else {
+ _colfitdata = col_init_fit_data(_mod, _msa, ALL, _mode, FALSE);
+ }
_colfitdata->tupleidx = 0;
}
@@ -267,7 +287,7 @@ void PhyloP::processSequence(const Sequence* sequence,
else
{
/** Reset the iterator to a non-contiguous position */
- colIt->toSite(pos, last);
+ colIt->toSite(pos, last - 1);
}
}
}
@@ -340,18 +360,43 @@ double PhyloP::pval(const ColumnIterator::ColumnMap *cmap)
//finally, compute the score!
double alt_lnl, null_lnl, this_scale, delta_lnl, pval;
int sigfigs=4; //same value used in phyloP code
- _mod->scale = 1;
- tm_set_subst_matrices(_mod);
- null_lnl = col_compute_log_likelihood(_mod, _msa, 0,
- _colfitdata->fels_scratch[0]);
- vec_set(_colfitdata->params, 0, _colfitdata->init_scale);
- opt_newton_1d(col_likelihood_wrapper_1d, &_colfitdata->params->data[0],
- _colfitdata,
- &alt_lnl, sigfigs, _colfitdata->lb->data[0],
- _colfitdata->ub->data[0],
- NULL, NULL, NULL);
- alt_lnl *= -1;
- this_scale = _colfitdata->params->data[0];
+
+ if (_mod->subtree_root == NULL) {
+ _mod->scale = 1;
+ tm_set_subst_matrices(_mod);
+ null_lnl = col_compute_log_likelihood(_mod, _msa, 0,
+ _colfitdata->fels_scratch[0]);
+ vec_set(_colfitdata->params, 0, _colfitdata->init_scale);
+ opt_newton_1d(col_likelihood_wrapper_1d, &_colfitdata->params->data[0],
+ _colfitdata,
+ &alt_lnl, sigfigs, _colfitdata->lb->data[0],
+ _colfitdata->ub->data[0],
+ NULL, NULL, NULL);
+ alt_lnl *= -1;
+ this_scale = _colfitdata->params->data[0];
+ } else { //subtree case
+ if (!col_has_data_sub(_mod, _msa, 0, _insideNodes, _outsideNodes)) {
+ alt_lnl = 0;
+ null_lnl = 0;
+ this_scale = 1;
+ } else {
+ vec_set(_colfitdata->params, 0, _colfitdata->init_scale);
+ opt_newton_1d(col_likelihood_wrapper_1d, &_colfitdata->params->data[0],
+ _colfitdata,
+ &null_lnl, sigfigs, _colfitdata->lb->data[0],
+ _colfitdata->ub->data[0], NULL, NULL, NULL);
+ null_lnl *= -1;
+
+ vec_set(_colfitdata2->params, 0,
+ max(0.05, _colfitdata->params->data[0]));
+ vec_set(_colfitdata2->params, 1, _colfitdata2->init_scale_sub);
+ opt_bfgs(col_likelihood_wrapper, _colfitdata2->params,
+ _colfitdata2, &alt_lnl, _colfitdata2->lb,
+ _colfitdata2->ub, NULL, NULL, OPT_HIGH_PREC, NULL, NULL);
+ alt_lnl *= -1;
+ this_scale = _colfitdata2->params->data[1];
+ }
+ }
delta_lnl = alt_lnl - null_lnl;
if (delta_lnl <= -0.01)
@@ -360,6 +405,7 @@ double PhyloP::pval(const ColumnIterator::ColumnMap *cmap)
cerr << "got delta_lnl = " << delta_lnl << endl;
throw hal_exception(string("ERROR col_lrts: delta_lnl < 0 "));
}
+ if (delta_lnl < 0) delta_lnl=0;
if (_mode == NNEUT || _mode == CONACC)
{
pval = chisq_cdf(2*delta_lnl, 1, FALSE);
View
17 phyloP/impl/halPhyloPMain.cpp
@@ -65,6 +65,12 @@ static CLParserPtr initParser()
optionsParser->addOption("refBed", "Bed file with coordinates to "
"annotate in the reference genome (or \"stdin\") "
"to stream from standard input", "\"\"");
+ optionsParser->addOption("subtree", "Partition the tree into the subtree "
+ "beneath the node given (including the branch "
+ "leading up to this node), and test for "
+ "conservation/acceleration in this subtree "
+ "relative to the rest of the tree", "\"\"");
+ optionsParser->addOption("prec", "Number of decimal places in wig output", 3);
optionsParser->setDescription("Make PhyloP wiggle plot for a genome.");
return optionsParser;
@@ -80,10 +86,12 @@ int main(int argc, char** argv)
string refSequenceName;
string dupType;
string dupMask;
+ string subtree;
hal_size_t start;
hal_size_t length;
hal_size_t step;
string refBedPath;
+ hal_size_t prec;
try
{
optionsParser->parseOptions(argc, argv);
@@ -97,8 +105,10 @@ int main(int argc, char** argv)
step = optionsParser->getOption<hal_size_t>("step");
dupType = optionsParser->getOption<string>("dupType");
dupMask = optionsParser->getOption<string>("dupMask");
+ subtree = optionsParser->getOption<string>("subtree");
std::transform(dupMask.begin(), dupMask.end(), dupMask.begin(), ::tolower);
refBedPath = optionsParser->getOption<string>("refBed");
+ prec = optionsParser->getOption<hal_size_t>("prec");
}
catch(exception& e)
{
@@ -162,8 +172,13 @@ int main(int argc, char** argv)
}
}
+ // set the precision of floating point output
+ outStream.setf(ios::fixed, ios::floatfield);
+ outStream.precision(prec);
+
PhyloP phyloP;
- phyloP.init(alignment, modPath, &outStream, dupMask == "soft" , dupType);
+ phyloP.init(alignment, modPath, &outStream, dupMask == "soft" , dupType,
+ "CONACC", subtree);
ifstream refBedStream;
if (refBedPath != "\"\"")
View
11 phyloP/inc/halPhyloP.h
@@ -37,12 +37,17 @@ class PhyloP
* @param phyloPMode "CONACC", "CON", "ACC", "NNEUT" are choices,
* though I think we are mainly interested in CONACC
* (conservation/acceleration- negative p-values indicate acceleration)
+ * @param subtree If equal to empty string, perform phyloP test on
+ * entire tree. Otherwise, subtree names a branch to perform test on
+ * subtree relative to rest of tree. The subtree includes all children
+ * of the named node as well as the branch leading to the node.
*/
void init(AlignmentConstPtr alignment, const std::string& modFilePath,
std::ostream* outStream,
bool softMaskDups = true,
const std::string& dupType = "ambiguous",
- const std::string& phyloPMode = "CONACC");
+ const std::string& phyloPMode = "CONACC",
+ const std::string& subtree = "\"\"");
void processSequence(const Sequence* sequence,
hal_index_t start,
@@ -60,6 +65,7 @@ class PhyloP
hal::AlignmentConstPtr _alignment;
TreeModel* _mod;
+ TreeModel* _modcpy;
std::set<const Genome*> _targetSet;
std::ostream* _outStream;
@@ -70,6 +76,9 @@ class PhyloP
// 0 default = mask only ambiguous bases in dups; if 1 mask any duplication
hash_table* _seqnameHash;
ColFitData* _colfitdata;
+ ColFitData *_colfitdata2;
+ List *_insideNodes;
+ List *_outsideNodes;
mode_type _mode;
MSA* _msa;
};

0 comments on commit eb06969

Please sign in to comment.