Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Merge branch 'development' of https://github.com/glennhickey/hal into…

… development
  • Loading branch information...
commit f4231fb24733bd089e351a293fe3d59f852a97bf 2 parents 9766826 + 9ce3753
Joel Armstrong joelarmstrong authored
2  Makefile
View
@@ -1,5 +1,5 @@
# order is important, libraries first
-modules = api stats randgen validate mutations fasta alignability lod chain liftover maf extract analysis phyloP assemblyHub
+modules = api stats randgen validate mutations fasta alignability liftover lod maf chain extract analysis phyloP assemblyHub
.PHONY: all %.all clean %.clean doxy %.doxy
8 README.md
View
@@ -13,6 +13,14 @@ Glenn Hickey, Benedict Paten, Dent Earl, Daniel Zerbino, and David
Haussler. HAL: A Hierarchical Format for Storing and Analyzing
Multiple Genome Alignments. Bioinformatics. 2013. [Advance Online Access](http://bioinformatics.oxfordjournals.org/content/early/2013/03/16/bioinformatics.btt128.abstract)
+Code Contributors
+-----
+* Glenn Hickey (UCSC)
+* Joel Armstrong (UCSC)
+* Ngan Nguyen (UCSC)
+* Benedict Paten (UCSC)
+* Melissa Jane Hubisz (Cornell)
+
Installation
-----
2  api/impl/defaultGappedBottomSegmentIterator.cpp
View
@@ -530,7 +530,7 @@ bool DefaultGappedBottomSegmentIterator::hasChild() const
if (_left->equals(_right))
{
- assert(_temp->equals(_left) && _temp2->equals(_right));
+ //assert(_temp->equals(_left) && _temp2->equals(_right));
}
// to do: verify edge cases
// assert(_temp->hasChild(_childIndex) == _temp2->hasChild(_childIndex));
30 chain/Makefile
View
@@ -12,12 +12,12 @@ libTestsHeaders = $(wildcard test/*.h)
libHalTestsAll := $(wildcard ../api/tests/*.cpp)
libHalTests = $(subst ../api/tests/allTests.cpp,,${libHalTestsAll})
-all : ${libPath}/halChain.a ${binPath}/hal2chain test/blockVizTest test/blockVizBed test/blockInterpolateTest ${binPath}/halChainTests
+all : ${libPath}/halChain.a ${binPath}/hal2chain test/blockVizTest test/blockVizBed test/blockVizTime test/blockVizMaf ${binPath}/halChainTests
clean :
- rm -f ${libPath}/halChain.a ${libPath}/*.h ${binPath}/hal2chain test/blockVizTest test/BlockVizBed test/blockInterpolateTest
+ rm -f ${libPath}/halChain.a ${libPath}/*.h ${binPath}/hal2chain test/blockVizTest test/BlockVizBed test/blockVizTime test/blockVizMaf
-${libPath}/halChain.a : ${libSources} ${libHeaders} ${libPath}/halLib.a ${basicLibsDependencies}
+${libPath}/halChain.a : ${libSources} ${libHeaders} ${libPath}/halLib.a ${libPath}/halMaf.a ${libPath}/halLiftover.a ${basicLibsDependencies}
cp ${libHeaders} ${libPath}/
rm -f *.o
${cpp} ${cppflags} -I inc -I hdf5_impl -I impl -I ${libPath}/ -c ${libSources}
@@ -26,21 +26,25 @@ ${libPath}/halChain.a : ${libSources} ${libHeaders} ${libPath}/halLib.a ${basicL
rm *.o
mv halChain.a ${libPath}/
-${binPath}/hal2chain : impl/hal2chain.cpp ${libPath}/halChain.a ${libPath}/halLib.a ${basicLibsDependencies}
- ${cpp} ${cppflags} -I inc -I impl -I ${libPath} -I impl -I tests -o ${binPath}/hal2chain impl/hal2chain.cpp ${libPath}/halChain.a ${libPath}/halLib.a ${basicLibs}
+${binPath}/hal2chain : impl/hal2chain.cpp ${libPath}/halMaf.a ${libPath}/halChain.a ${libPath}/halLiftover.a ${libPath}/halLib.a ${basicLibsDependencies}
+ ${cpp} ${cppflags} -I inc -I impl -I ${libPath} -I impl -I tests -o ${binPath}/hal2chain impl/hal2chain.cpp ${libPath}/halMaf.a ${libPath}/halChain.a ${libPath}/halLiftover.a ${libPath}/halLib.a ${basicLibs}
-test/blockVizTest: test/blockVizTest.c ${libPath}/halChain.a ${libPath}/halLod.a ${libPath}/halLib.a ${basicLib\
+test/blockVizTest: test/blockVizTest.c ${libPath}/halChain.a ${libPath}/halLod.a ${libPath}/halMaf.a ${libPath}/halLiftover.a ${libPath}/halLib.a ${basicLib\
sDependencies}
- ${cpp} ${cppflags} -I inc -I impl -I ${libPath} -I impl -I tests -o test/blockVizTest test/blockVizTest.c ${libPath}/halChain.a ${libPath}/halLod.a ${libPath}/halLib.a ${basicLibs}
+ ${cpp} ${cppflags} -I inc -I impl -I ${libPath} -I impl -I tests -o test/blockVizTest test/blockVizTest.c ${libPath}/halChain.a ${libPath}/halLod.a ${libPath}/halMaf.a ${libPath}/halLiftover.a ${libPath}/halLib.a ${basicLibs}
-test/blockVizBed: test/blockVizBed.c ${libPath}/halChain.a ${libPath}/halLod.a ${libPath}/halLib.a ${basicLib\
+test/blockVizMaf: test/blockVizMaf.c ${libPath}/halChain.a ${libPath}/halLod.a ${libPath}/halMaf.a ${libPath}/halLiftover.a ${libPath}/halLib.a ${basicLib\
sDependencies}
- ${cpp} ${cppflags} -I inc -I impl -I ${libPath} -I impl -I tests -o test/blockVizBed test/blockVizBed.c ${libPath}/halChain.a ${libPath}/halLod.a ${libPath}/halLib.a ${basicLibs}
-
-test/blockInterpolateTest: test/blockInterpolateTest.cpp ${libPath}/halChain.a ${libPath}/halLib.a ${basicLib\
+ ${cpp} ${cppflags} -I inc -I impl -I ${libPath} -I impl -I tests -o test/blockVizMaf test/blockVizMaf.c ${libPath}/halChain.a ${libPath}/halLod.a ${libPath}/halMaf.a ${libPath}/halLiftover.a ${libPath}/halLib.a ${basicLibs}
+
+test/blockVizBed: test/blockVizBed.c ${libPath}/halChain.a ${libPath}/halLod.a ${libPath}/halMaf.a ${libPath}/halLiftover.a ${libPath}/halLib.a ${basicLib\
+sDependencies}
+ ${cpp} ${cppflags} -I inc -I impl -I ${libPath} -I impl -I tests -o test/blockVizBed test/blockVizBed.c ${libPath}/halChain.a ${libPath}/halLod.a ${libPath}/halMaf.a ${libPath}/halLiftover.a ${libPath}/halLib.a ${basicLibs}
+
+test/blockVizTime: test/blockVizTime.c ${libPath}/halChain.a ${libPath}/halLod.a ${libPath}/halMaf.a ${libPath}/halLiftover.a ${libPath}/halLib.a ${basicLib\
sDependencies}
- ${cpp} ${cppflags} -I inc -I impl -I ${libPath} -I impl -I tests -o test/blockInterpolateTest test/blockInterpolateTest.cpp ${libPath}/halChain.a ${libPath}/halLib.a ${basicLibs}
+ ${cpp} ${cppflags} -I inc -I impl -I ${libPath} -I impl -I tests -o test/blockVizTime test/blockVizTime.c ${libPath}/halChain.a ${libPath}/halLod.a ${libPath}/halMaf.a ${libPath}/halLiftover.a ${libPath}/halLib.a ${basicLibs}
${binPath}/halChainTests : ${libTests} ${libTestsHeaders} ${libTestsCommon} ${libTestsHeadersCommon} ${libSources} ${libHeaders} ${libInternalHeaders} ${libPath}/halLib.a ${basicLibsDependencies}
- ${cpp} ${cppflags} -I inc -I impl -I ${libPath} -I tests -I ../api/tests -o ${binPath}/halChainTests ${libHalTests} ${libTests} ${libPath}/halLib.a ${libPath}/halChain.a ${basicLibs}
+ ${cpp} ${cppflags} -I inc -I impl -I ${libPath} -I tests -I ../api/tests -o ${binPath}/halChainTests ${libHalTests} ${libTests} ${libPath}/halLib.a ${libPath}/halMaf.a ${libPath}/halChain.a ${libPath}/halLiftover.a ${basicLibs}
99 chain/impl/halBlockInterpolate.cpp
View
@@ -1,99 +0,0 @@
-/*
- * Copyright (C) 2013 by Glenn Hickey (hickey@soe.ucsc.edu)
- *
- * Released under the MIT license, see LICENSE.txt
- */
-
-#include <deque>
-#include <cassert>
-#include <map>
-#include <sstream>
-#include "hal.h"
-#include "halChain.h"
-#include "halBlockViz.h"
-#include "halBlockInterpolate.h"
-
-using namespace std;
-using namespace hal;
-
-
-void hal::blockInterpolateSequence(const Sequence* sequence,
- const Genome* target,
- hal_size_t start, hal_size_t length,
- hal_size_t step)
-{
- if (length == 0)
- {
- length = sequence->getSequenceLength() - start;
- }
-
- set<const Genome*> tgtSet;
- tgtSet.insert(target);
- ColumnIteratorConstPtr colIt = sequence->getColumnIterator(&tgtSet, 0, start,
- start + length-1,
- true);
-
- hal_index_t absStart = start + sequence->getStartPosition();
- hal_index_t absLast = start + length + sequence->getStartPosition() - 1;
-
- hal_size_t count = 0;
- for (hal_size_t i = 0; i < length; i+= step)
- {
- colIt->toSite(absStart + i, absLast);
- /* colIt->toRight();
- colIt->toRight();
- colIt->toRight();
- colIt->toRight();
- colIt->toRight();*/
- ++count;
- }
- cout << sequence->getName() << ": " << count << "\n";
-}
-
-void hal::blockInterpolateGenome(const Genome* genome, const Sequence* sequence,
- const Genome* target,
- hal_size_t start, hal_size_t length,
- hal_size_t step)
-{
- if (sequence != NULL)
- {
- blockInterpolateSequence(sequence, target, start, length, step);
- }
- else
- {
- if (start + length > genome->getSequenceLength())
- {
- stringstream ss;
- ss << "Specified range [" << start << "," << length << "] is"
- << "out of range for genome " << genome->getName()
- << ", which has length " << genome->getSequenceLength();
- throw (hal_exception(ss.str()));
- }
- if (length == 0)
- {
- length = genome->getSequenceLength() - start;
- }
-
- SequenceIteratorConstPtr seqIt = genome->getSequenceIterator();
- SequenceIteratorConstPtr seqEndIt = genome->getSequenceEndIterator();
- hal_size_t runningLength = 0;
- for (; seqIt != seqEndIt; seqIt->toNext())
- {
- const Sequence* sequence = seqIt->getSequence();
- hal_size_t seqLen = sequence->getSequenceLength();
- hal_size_t seqStart = (hal_size_t)sequence->getStartPosition();
-
- if (start + length >= seqStart &&
- start < seqStart + seqLen &&
- runningLength < length)
- {
- hal_size_t readStart = seqStart >= start ? 0 : seqStart - start;
- hal_size_t readLen = std::min(seqLen - start, length - runningLength);
-
- blockInterpolateSequence(sequence, target, readStart, readLen, step);
- runningLength += readLen;
- }
- }
- }
-}
-
277 chain/impl/halBlockViz.cpp
View
@@ -12,12 +12,14 @@
#include <limits>
#include <algorithm>
#include <stdlib.h>
+#include <stdio.h>
#include <string.h>
#include "hal.h"
#include "halChain.h"
#include "halBlockViz.h"
#include "halBlockMapper.h"
#include "halLodManager.h"
+#include "halMafExport.h"
#ifdef ENABLE_UDC
#include <pthread.h>
@@ -48,10 +50,13 @@ static char* copyCString(const string& inString);
static hal_block_results_t* readBlocks(AlignmentConstPtr seqAlignment,
const Sequence* tSequence,
- hal_index_t absStart, hal_index_t absEnd,
+ hal_index_t absStart,
+ hal_index_t absEnd,
+ bool tReversed,
const Genome* qGenome,
bool getSequenceString,
- bool doDupes, double flipStrandPct);
+ bool doDupes, bool doTargetDupes,
+ bool doAdjes);
static void readBlock(AlignmentConstPtr seqAlignment,
hal_block_t* cur,
@@ -65,8 +70,7 @@ static void readTargetRange(hal_target_dupe_list_t* cur,
static void cleanTargetDupesList(vector<hal_target_dupe_list_t*>& dupeList);
-static void flipStrand(hal_block_t* firstBlock);
-
+static void mergeCompatibleDupes(vector<hal_target_dupe_list_t*>& dupeList);
extern "C" int halOpenLOD(char *lodFilePath)
{
@@ -178,7 +182,8 @@ extern "C" void halFreeBlocks(struct hal_block_t* head)
while (head != NULL)
{
hal_block_t* next = head->next;
- free(head->sequence);
+ free(head->qSequence);
+ free(head->tSequence);
free(head->qChrom);
free(head);
head = next;
@@ -209,8 +214,10 @@ struct hal_block_results_t *halGetBlocksInTargetRange(int halHandle,
char* tChrom,
hal_int_t tStart,
hal_int_t tEnd,
+ hal_int_t tReversed,
int getSequenceString,
- int doDupes)
+ hal_dup_type_t dupMode,
+ int mapBackAdjacencies)
{
HAL_LOCK
hal_block_results_t* results = NULL;
@@ -223,6 +230,16 @@ struct hal_block_results_t *halGetBlocksInTargetRange(int halHandle,
ss << "Invalid query range [" << tStart << "," << tEnd << ").";
throw hal_exception(ss.str());
}
+ if (tReversed != 0 && mapBackAdjacencies != 0)
+ {
+ throw hal_exception("tReversed can only be set when"
+ "mapBackAdjacencies is 0");
+ }
+ if (tReversed != 0 && dupMode == HAL_QUERY_AND_TARGET_DUPS)
+ {
+ throw hal_exception("tReversed cannot be set in conjunction with"
+ " dupMode=HAL_QUERY_AND_TARGET_DUPS");
+ }
AlignmentConstPtr alignment =
getExistingAlignment(halHandle, hal_size_t(rangeLength), false);
checkGenomes(halHandle, alignment, qSpecies, tSpecies, tChrom);
@@ -260,8 +277,12 @@ struct hal_block_results_t *halGetBlocksInTargetRange(int halHandle,
true);
}
- results = readBlocks(seqAlignment, tSequence, absStart, absEnd, qGenome,
- getSequenceString != 0, doDupes != 0, 0.5);
+ results = readBlocks(seqAlignment, tSequence, absStart, absEnd,
+ tReversed != 0,
+ qGenome,
+ getSequenceString != 0, dupMode != HAL_NO_DUPS,
+ dupMode == HAL_QUERY_AND_TARGET_DUPS,
+ mapBackAdjacencies != 0);
}
catch(exception& e)
{
@@ -277,6 +298,86 @@ struct hal_block_results_t *halGetBlocksInTargetRange(int halHandle,
return results;
}
+extern "C" hal_int_t halGetMAF(FILE* outFile,
+ int halHandle,
+ hal_species_t* qSpeciesNames,
+ char* tSpecies,
+ char* tChrom,
+ hal_int_t tStart,
+ hal_int_t tEnd,
+ int doDupes)
+{
+ HAL_LOCK
+ hal_int_t numBytes = 0;
+ try
+ {
+ hal_int_t rangeLength = tEnd - tStart;
+ if (rangeLength < 0)
+ {
+ stringstream ss;
+ ss << "Invalid query range [" << tStart << "," << tEnd << ").";
+ throw hal_exception(ss.str());
+ }
+ AlignmentConstPtr alignment =
+ getExistingAlignment(halHandle, hal_size_t(0), true);
+
+ set<const Genome*> qGenomeSet;
+ for (hal_species_t* qSpecies = qSpeciesNames; qSpecies != NULL;
+ qSpecies = qSpecies->next)
+ {
+ checkGenomes(halHandle, alignment, qSpecies->name, tSpecies, tChrom);
+ const Genome* qGenome = alignment->openGenome(qSpecies->name);
+ qGenomeSet.insert(qGenome);
+ }
+
+ const Genome* tGenome = alignment->openGenome(tSpecies);
+ const Sequence* tSequence = tGenome->getSequence(tChrom);
+
+ hal_index_t myEnd = tEnd > 0 ? tEnd : tSequence->getSequenceLength();
+ hal_index_t absStart = tSequence->getStartPosition() + tStart;
+ hal_index_t absEnd = tSequence->getStartPosition() + myEnd - 1;
+ if (absStart > absEnd)
+ {
+ throw hal_exception("Invalid range");
+ }
+ if (absEnd > tSequence->getEndPosition())
+ {
+ throw hal_exception("Target end position outside of target sequence");
+ }
+
+ stringstream mafBuffer;
+ MafExport mafExport;
+ mafExport.setNoDupes(doDupes == 0);
+ mafExport.setUcscNames(true);
+ mafExport.convertSegmentedSequence(mafBuffer, alignment, tGenome,
+ absStart, 1 + absEnd - absStart,
+ qGenomeSet);
+ // if these buffers are very big, it is my intuition that
+ // chopping them up and, saw, only converting and writing a few kb
+ // at a time would be more efficient... but i think that's the least
+ // of our problems right now.
+ string mafStringBuffer = mafBuffer.str();
+ if (mafStringBuffer.empty() == false)
+ {
+ numBytes = (hal_int_t)fwrite(mafStringBuffer.c_str(),
+ mafStringBuffer.length(),
+ sizeof(char), outFile);
+ }
+ }
+ catch(exception& e)
+ {
+ cerr << "Exception caught: " << e.what() << endl;
+ numBytes = -1;
+ }
+ catch(...)
+ {
+ cerr << "Error in hal maf query";
+ numBytes = -1;
+ }
+ HAL_UNLOCK
+ return numBytes;
+}
+
extern "C" struct hal_species_t *halGetSpecies(int halHandle)
{
HAL_LOCK
@@ -527,14 +628,17 @@ char* copyCString(const string& inString)
hal_block_results_t* readBlocks(AlignmentConstPtr seqAlignment,
const Sequence* tSequence,
hal_index_t absStart, hal_index_t absEnd,
+ bool tReversed,
const Genome* qGenome, bool getSequenceString,
- bool doDupes, double flipStrandPct)
+ bool doDupes, bool doTargetDupes,
+ bool doAdjes)
{
const Genome* tGenome = tSequence->getGenome();
string qGenomeName = qGenome->getName();
hal_block_t* prev = NULL;
BlockMapper blockMapper;
- blockMapper.init(tGenome, qGenome, absStart, absEnd, doDupes, 0, true);
+ blockMapper.init(tGenome, qGenome, absStart, absEnd,
+ tReversed, doDupes, 0, doAdjes);
blockMapper.map();
BlockMapper::MSSet paraSet;
hal_size_t totalLength = 0;
@@ -577,10 +681,10 @@ hal_block_results_t* readBlocks(AlignmentConstPtr seqAlignment,
{
results->targetDupeBlocks = processTargetDupes(blockMapper, paraSet);
}
- if (totalLength > 0 &&
- (double)reversedLength / (double)totalLength > flipStrandPct)
+ if (doTargetDupes == false && results->targetDupeBlocks != NULL)
{
- flipStrand(results->mappedBlocks);
+ halFreeTargetDupeLists(results->targetDupeBlocks);
+ results->targetDupeBlocks = NULL;
}
return results;
}
@@ -604,7 +708,8 @@ void readBlock(AlignmentConstPtr seqAlignment,
cur->next = NULL;
string seqBuffer = qSequence->getName();
- string dnaBuffer;
+ string qDnaBuffer;
+ string tDnaBuffer;
size_t prefix =
seqBuffer.find(genomeName + '.') != 0 ? 0 : genomeName.length() + 1;
cur->qChrom = (char*)malloc(seqBuffer.length() + 1 - prefix);
@@ -634,7 +739,8 @@ void readBlock(AlignmentConstPtr seqAlignment,
assert(firstRefSeg->getLength() == firstQuerySeg->getLength());
cur->size = 1 + tEnd - cur->tStart;
cur->strand = firstQuerySeg->getReversed() ? '-' : '+';
- cur->sequence = NULL;
+ cur->tSequence = NULL;
+ cur->qSequence = NULL;
if (getSequenceString != 0)
{
const Genome* qSeqGenome =
@@ -654,14 +760,36 @@ void readBlock(AlignmentConstPtr seqAlignment,
<< " for DNA sequence extraction";
throw hal_exception(ss.str());
}
+
+ const Genome* tSeqGenome =
+ seqAlignment->openGenome(tSequence->getGenome()->getName());
+ if (tSeqGenome == NULL)
+ {
+ stringstream ss;
+ ss << "Unable to open genome " << tSequence->getGenome()->getName()
+ << " for DNA sequence extraction";
+ throw hal_exception(ss.str());
+ }
+
+ const Sequence* tSeqSequence = tSeqGenome->getSequence(tSequence->getName());
+ if (tSeqSequence == NULL)
+ {
+ stringstream ss;
+ ss << "Unable to open sequence " << tSequence->getName()
+ << " for DNA sequence extraction";
+ throw hal_exception(ss.str());
+ }
- qSeqSequence->getSubString(dnaBuffer, cur->qStart, cur->size);
+ qSeqSequence->getSubString(qDnaBuffer, cur->qStart, cur->size);
+ tSeqSequence->getSubString(tDnaBuffer, cur->tStart, cur->size);
if (cur->strand == '-')
{
- reverseComplement(dnaBuffer);
+ reverseComplement(qDnaBuffer);
}
- cur->sequence = (char*)malloc(dnaBuffer.length() * sizeof(char) + 1);
- strcpy(cur->sequence, dnaBuffer.c_str());
+ cur->qSequence = (char*)malloc(qDnaBuffer.length() * sizeof(char) + 1);
+ cur->tSequence = (char*)malloc(tDnaBuffer.length() * sizeof(char) + 1);
+ strcpy(cur->qSequence, qDnaBuffer.c_str());
+ strcpy(cur->tSequence, tDnaBuffer.c_str());
}
}
@@ -669,6 +797,31 @@ struct CStringLess { bool operator()(const char* s1, const char* s2) const {
return strcmp(s1, s2) < 0;}
};
+struct DupeIdLess { bool operator()(const hal_target_dupe_list_t* d1,
+ const hal_target_dupe_list_t* d2) const {
+ if (d1 != NULL && d2 != NULL)
+ {
+ if (d1->id == d2->id)
+ {
+ // hack to keep ties in reverse sorted order based on tStart
+ // this is so that the prepend in processTargetDupes when the
+ // lists gets merged keeps the tStarts in ascending order
+ return d1->tRange->tStart > d2->tRange->tStart;
+ }
+ return d1->id < d2->id;
+ }
+ return d1 != NULL && d2 == NULL;
+}};
+
+struct DupeStartLess { bool operator()(const hal_target_dupe_list_t* d1,
+ const hal_target_dupe_list_t* d2) const {
+ if (d1 != NULL && d2 != NULL)
+ {
+ return d1->tRange->tStart < d2->tRange->tStart;
+ }
+ return d1 != NULL && d2 == NULL;
+}};
+
hal_target_dupe_list_t* processTargetDupes(BlockMapper& blockMapper,
BlockMapper::MSSet& paraSet)
{
@@ -696,8 +849,6 @@ hal_target_dupe_list_t* processTargetDupes(BlockMapper& blockMapper,
cleanTargetDupesList(tempList);
- hal_target_dupe_list_t* head = NULL;
- hal_target_dupe_list_t* prev = NULL;
map<const char*, hal_int_t, CStringLess> idMap;
pair<map<const char*, hal_int_t, CStringLess>::iterator, bool> idRes;
vector<hal_target_dupe_list_t*>::iterator i;
@@ -721,6 +872,10 @@ hal_target_dupe_list_t* processTargetDupes(BlockMapper& blockMapper,
(*j)->tRange->next = (*i)->tRange;
(*i)->tRange = (*j)->tRange;
(*j)->tRange = NULL;
+ // DupeIdLess sorting comparator should ensure that these
+ // elemens are added in the proper order so that trange list
+ // stays sorted here
+ assert((*i)->tRange->tStart < (*i)->tRange->next->tStart);
halFreeTargetDupeLists(*j);
*j = NULL;
j = k;
@@ -728,17 +883,26 @@ hal_target_dupe_list_t* processTargetDupes(BlockMapper& blockMapper,
idRes = idMap.insert(pair<const char*, hal_int_t>((*i)->qChrom, 0));
(*i)->id = idRes.first->second++;
+ }
+
+ std::sort(tempList.begin(), tempList.end(), DupeStartLess());
+ mergeCompatibleDupes(tempList);
+
+ hal_target_dupe_list_t* head = NULL;
+ hal_target_dupe_list_t* prev = NULL;
+ for (i = tempList.begin(); i != tempList.end() && *i != NULL; ++i)
+ {
if (head == NULL)
{
head = *i;
}
- else
+ else
{
prev->next = *i;
}
prev = *i;
- prev->next = NULL;
}
+
return head;
}
@@ -786,21 +950,6 @@ void readTargetRange(hal_target_dupe_list_t* cur,
strcpy(cur->qChrom, qSeqName.c_str());
}
-struct DupeIdLess { bool operator()(const hal_target_dupe_list_t* d1,
- const hal_target_dupe_list_t* d2) const {
- if (d1 != NULL && d2 != NULL)
- {
- return d1->id < d2->id;
- }
- return d1 != NULL && d2 == NULL;
-}};
-
-struct DupeStartLess { bool operator()(const hal_target_dupe_list_t* d1,
- const hal_target_dupe_list_t* d2) const {
- return d1->tRange->tStart < d2->tRange->tStart;
-}};
-
-
void cleanTargetDupesList(vector<hal_target_dupe_list_t*>& dupeList)
{
// sort based on target start coordinate
@@ -858,14 +1007,50 @@ void cleanTargetDupesList(vector<hal_target_dupe_list_t*>& dupeList)
dupeList.resize(dupeList.size() - deadCount);
}
-static void flipStrand(hal_block_t* firstBlock)
+static bool areRangesCompatible(hal_target_range_t* range1,
+ hal_target_range_t* range2)
{
- // below is stupid and naive and will not work for anything but
- // dense mode (ie where no edges between blocks). put off fixing
- // for now....
-// for (hal_block_t* cur = firstBlock; cur; cur = cur->next)
-// {
-// cur->strand = cur->strand == '-' ? '+' : '-';
-// }
+ for (; range1 != NULL; range1 = range1->next, range2 = range2->next)
+ {
+ if ((range1->next == NULL) != (range2->next == NULL) ||
+ range2->tStart != range1->tStart + range1->size ||
+ (range1->next != NULL &&
+ range1->next->tStart - (range1->tStart + range1->size) <
+ range2->size))
+ {
+ return false;
+ }
+ }
+ return true;
}
+void mergeCompatibleDupes(vector<hal_target_dupe_list_t*>& dupeList)
+{
+ vector<hal_target_dupe_list_t*>::iterator i;
+ vector<hal_target_dupe_list_t*>::iterator j;
+ vector<hal_target_dupe_list_t*>::iterator k;
+
+ for (i = dupeList.begin(); i != dupeList.end() && *i != NULL; i = j)
+ {
+ hal_target_range_t* range1 = (*i)->tRange;
+ j = i;
+ bool compatible = true;
+ for (++j; j != dupeList.end() && *j != NULL && compatible; ++j)
+ {
+ hal_target_range_t* range2 = (*j)->tRange;
+ compatible = areRangesCompatible(range1, range2);
+ if (compatible == true)
+ {
+ range2 = (*j)->tRange;
+ for (range1 = (*i)->tRange; range1 != NULL; range1 = range1->next)
+ {
+ assert(range2 != NULL);
+ range1->size += range2->size;
+ range2 = range2->next;
+ }
+ halFreeTargetDupeLists(*j);
+ *j = NULL;
+ }
+ }
+ }
+}
28 chain/inc/halBlockInterpolate.h
View
@@ -1,28 +0,0 @@
-/*
- * Copyright (C) 2013 by Glenn Hickey (hickey@soe.ucsc.edu)
- *
- * Released under the MIT license, see LICENSE.txt
- */
-
-#ifndef _HALBLOCKINTERPOLATE_H
-#define _HALBLOCKINTERPOLATE_H
-
-#include <iostream>
-#include <string>
-#include <vector>
-#include "hal.h"
-
-namespace hal {
-
-void blockInterpolateGenome(const Genome* genome, const Sequence* sequence,
- const Genome* target,
- hal_size_t start, hal_size_t length,
- hal_size_t step);
-
-void blockInterpolateSequence(const Sequence* sequence, const Genome* target,
- hal_size_t start, hal_size_t length,
- hal_size_t step);
-
-}
-
-#endif
58 chain/inc/halBlockViz.h
View
@@ -57,7 +57,8 @@ struct hal_block_t
hal_int_t qStart;
hal_int_t size;
char strand;
- char *sequence;
+ char *qSequence; // query DNA, if requested
+ char *tSequence; // target DNA, if requested
};
/** Some information about a genome */
@@ -79,6 +80,19 @@ struct hal_chromosome_t
hal_int_t length;
};
+/** Duplication mode toggler.
+ * HAL_NO_DUPS: No duplications computed
+ * HAL_QUERY_DUPS: The same query range can map to multiple places in target
+ * HAL_QUERY_AND_TARGET_DUPS: As above, but target can also map to multiple
+ * places in query. Note that these instances are returned in the special
+ * target dup list structure */
+typedef enum
+{
+ HAL_NO_DUPS = 0,
+ HAL_QUERY_DUPS,
+ HAL_QUERY_AND_TARGET_DUPS,
+} hal_dup_type_t;
+
/** Open a text file created by halLodInterpolate.py for viewing.
* This text file contains a list of paths to progressively coarser
* levels of detail of a source HAL file. For example, it may look like
@@ -138,10 +152,18 @@ void halFreeTargetDupeLists(struct hal_target_dupe_list_t* dupes);
* @param tStart start position in reference
* @param tEnd last + 1 position in reference (if 0, then the size of the
* chromosome is used).
+ * @param tReversed Input region is on the reverse strand (but still
+ * in forward coordinates like BED). can only be used in liftOverMode
+ * otherwise must be set to 0
* @param getSequenceString copy DNA sequence (of query species) into
* output blocks if not 0.
- * @param doDupes create blocks for duplications if not 0. When this
- * option is enabled, the same region can appear in more than one block.
+ * @param dupMode Specifies which types of duplications to compute.
+ * (note that when tReversed != 0, target duplications are not supported,
+ * so when doing liftover use no dupes or query dupes only)
+ * @param mapBackAdjacencies Species if segments adjacent to query segments
+ * in the alignment are mapped back ot the target (producing offscreen
+ * results). Must be set to 0 it tReversed is not 0 (so set to 0 when doing
+ * liftover).
* @return block structure -- must be freed by halFreeBlockResults()
*/
struct hal_block_results_t *halGetBlocksInTargetRange(int halHandle,
@@ -150,9 +172,35 @@ struct hal_block_results_t *halGetBlocksInTargetRange(int halHandle,
char* tChrom,
hal_int_t tStart,
hal_int_t tEnd,
+ hal_int_t tReversed,
int getSequenceString,
- int doDupes);
-
+ hal_dup_type_t dupMode,
+ int mapBackAdjacencies);
+
+/** Read alignment into an output file in MAF format. Interface very
+ * similar to halGetBlocksInTargetRange except multiple query species
+ * can be specified
+ *
+ * @param halHandle handle for the HAL alignment obtained from halOpen
+ * @param qSpeciesNames the names of the query species (no other information
+ * is read from the hal_species_t structure -- just the names).
+ * @param tSpecies the name of the reference species.
+ * @param tChrom name of the chromosome in reference.
+ * @param tStart start position in reference
+ * @param tEnd last + 1 position in reference (if 0, then the size of the
+ * chromosome is used).
+ * @param doDupes create blocks for duplications if not 0. When this
+ * option is enabled, the same region can appear in more than one block.
+ * @return number of bytes written
+ */
+hal_int_t halGetMAF(FILE* outFile,
+ int halHandle,
+ struct hal_species_t* qSpeciesNames,
+ char* tSpecies,
+ char* tChrom,
+ hal_int_t tStart,
+ hal_int_t tEnd,
+ int doDupes);
/** Create a linked list of the species in the hal file.
* @param halHandle handle for the HAL alignment obtained from halOpen
131 chain/test/blockInterpolateTest.cpp
View
@@ -1,131 +0,0 @@
-/*
- * Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu)
- *
- * Released under the MIT license, see LICENSE.txt
- */
-
-#include <cstdlib>
-#include <iostream>
-#include <fstream>
-#include <algorithm>
-#include "hal.h"
-#include "halChain.h"
-#include "halBlockInterpolate.h"
-
-using namespace std;
-using namespace hal;
-
-static CLParserPtr initParser()
-{
- CLParserPtr optionsParser = hdf5CLParserInstance(true);
- optionsParser->addArgument("halPath", "input hal file");
- optionsParser->addArgument("genome", "reference genome to scan");
- optionsParser->addOption("sequence", "sequence name to scan ("
- "all sequences by default)",
- "\"\"");
- optionsParser->addOption("start",
- "coordinate within reference genome (or sequence"
- " if specified) to start at",
- 0);
- optionsParser->addOption("length",
- "length of the reference genome (or sequence"
- " if specified) to convert. If set to 0,"
- " the entire thing is converted",
- 0);
- optionsParser->addOption("step", "step size", 1);
- optionsParser->addOption("targetGenome", "target genome [default: "
- "parent of reference genome", "\"\"");
-
- return optionsParser;
-}
-
-int main(int argc, char** argv)
-{
- CLParserPtr optionsParser = initParser();
-
- string halPath;
- string genomeName;
- string tgtGenomeName;
- string sequenceName;
- hal_size_t start;
- hal_size_t length;
- hal_size_t step;
- try
- {
- optionsParser->parseOptions(argc, argv);
- halPath = optionsParser->getArgument<string>("halPath");
- genomeName = optionsParser->getArgument<string>("genome");
- sequenceName = optionsParser->getOption<string>("sequence");
- start = optionsParser->getOption<hal_size_t>("start");
- length = optionsParser->getOption<hal_size_t>("length");
- step = optionsParser->getOption<hal_size_t>("step");
- tgtGenomeName = optionsParser->getOption<string>("targetGenome");
- }
- catch(exception& e)
- {
- cerr << e.what() << endl;
- optionsParser->printUsage(cerr);
- exit(1);
- }
-
- try
- {
- AlignmentConstPtr alignment = openHalAlignmentReadOnly(halPath,
- optionsParser);
- if (alignment->getNumGenomes() == 0)
- {
- throw hal_exception("input hal alignmenet is empty");
- }
-
- const Genome* genome = alignment->openGenome(genomeName);
- if (genome == NULL)
- {
- throw hal_exception(string("Genome ") + genomeName + " not found");
- }
-
- const Genome* target;
- if (tgtGenomeName != "\"\"")
- {
- target = alignment->openGenome(tgtGenomeName);
- if (target == NULL)
- {
- throw hal_exception(string("Genome ") + tgtGenomeName + " not found");
- }
- }
- else
- {
- target = genome->getParent();
- if (target == NULL)
- {
- throw hal_exception(string("Genome ") + genomeName + " has no parent: "
- "--targetGenome must be specified");
- }
- }
-
- const Sequence* sequence = NULL;
- if (sequenceName != "\"\"")
- {
- sequence = genome->getSequence(sequenceName);
- if (sequence == NULL)
- {
- throw hal_exception(string("Sequence ") + sequenceName + " not found");
- }
- }
-
- blockInterpolateGenome(genome, sequence, target, start, length, step);
-
- }
- catch(hal_exception& e)
- {
- cerr << "hal exception caught: " << e.what() << endl;
- return 1;
- }
- catch(exception& e)
- {
- cerr << "Exception caught: " << e.what() << endl;
- return 1;
- }
-
- return 0;
-}
-
7 chain/test/blockVizBed.c
View
@@ -124,9 +124,12 @@ int main(int argc, char** argv)
args.tSpecies,
args.tChrom,
args.tStart,
- args.tEnd,
+ args.tEnd,
+ 0,
args.doSeq,
- args.doDupes);
+ HAL_QUERY_AND_TARGET_DUPS,
+ 1);
+
if (results == NULL)
{
ret = -1;
170 chain/test/blockVizBenchmark.py
View
@@ -0,0 +1,170 @@
+#!/usr/bin/env python
+
+#Copyright (C) 2013 by Glenn Hickey
+#
+#Released under the MIT license, see LICENSE.txt
+#!/usr/bin/env python
+
+"""Simulate snake track browser queries and time them. Ideally done with hgTracks
+but it's easier to automate this way, as far as I can tell.
+"""
+import argparse
+import os
+import sys
+import copy
+import subprocess
+import time
+import math
+import random
+from datetime import datetime
+from collections import OrderedDict
+
+from hal.stats.halStats import runShellCommand
+from hal.stats.halStats import getHalGenomes
+from hal.stats.halStats import getHalNumSegments
+from hal.stats.halStats import getHalStats
+
+
+# Wrapper for blockVizTime
+def getBlockVizCmd(options, tgtGenome):
+ cmd = "./blockVizTime %s %s %s %s %d %d" % (options.lod, tgtGenome,
+ options.refGenome, options.refSequence,
+ options.refFirst, options.refLast)
+ if options.refLength >= options.maxSnp:
+ cmd += " 1"
+ else:
+ cmd += " 0"
+ if options.doDupes is True:
+ cmd += " 1"
+ else:
+ cmd += " 0"
+ if options.udc is not None:
+ cmd += " %s" % options.udc
+
+ return cmd
+
+def timeCmd(cmd):
+ tstart = datetime.now()
+ runShellCommand(cmd)
+ tend = datetime.now()
+ tdelta = tend - tstart
+ tsecs = tdelta.seconds
+ tsecs += tdelta.microseconds / 1000000.0
+ return tsecs
+
+def simulateLoad(options):
+ cmds = [getBlockVizCmd(options, tgtGenome) for tgtGenome in options.tgtGenomes]
+ elapsedTime = 0.
+ for cmd in cmds:
+ lastExcep = None
+ for trial in xrange(options.retry):
+ if options.udc is not None and options.zapUdc is True:
+ runShellCommand("rm -rf %s" % os.path.join(options.udc, "*"))
+ t = -1
+ try:
+ t = timeCmd(cmd)
+ lastExcep = None
+ break
+ except Exception as e:
+ lastExcep = e
+ time.sleep(2)
+ if lastExcep is None:
+ elapsedTime += t
+ else:
+ raise lastExcep
+ return elapsedTime
+
+def uniformRuns(options):
+ trials = []
+ for i in xrange(options.reps):
+ size = random.uniform(1, options.refLength)
+ start = random.uniform(0, options.refLength - size)
+ trials.append((int(start), int(start + size)))
+ return trials
+
+def runSim(options):
+ trials = uniformRuns(options)
+ elapsedTimes = []
+ for trial in trials:
+ options.refFirst = trial[0]
+ options.refLast = trial[1]
+ elapsedTimes.append((options.refLast - options.refFirst, simulateLoad(options)))
+ return elapsedTimes
+
+def binTimes(options, elapsedTimes):
+ binnedTimes = dict()
+ for (size, time) in elapsedTimes:
+ theBin = int(size / options.binSize) * options.binSize
+ if theBin not in binnedTimes:
+ binnedTimes[theBin] = (1, size, time, time, time)
+ else:
+ curVal = binnedTimes[theBin]
+ binnedTimes[theBin] = (curVal[0] + 1,
+ curVal[1] + size,
+ curVal[2] + time,
+ min(curVal[3], time),
+ max(curVal[4], time))
+ return binnedTimes
+
+def printTable(options, binnedTimes):
+ print "Bin, nElem, totTime, minTime, maxTime, avgTime"
+ for key in sorted(binnedTimes.iterkeys()):
+ theBin = key
+ nElem, totSize, totTime, minTime, maxTime = binnedTimes[theBin]
+ avgTime = float(totTime) / nElem
+ print "%d, %d, %f, %f, %f, %f" % (
+ theBin, nElem, totTime, minTime, maxTime, avgTime)
+
+def main(argv=None):
+ if argv is None:
+ argv = sys.argv
+
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ description="Time some simulated browser queries. ")
+
+ parser.add_argument("lod", help="input lod or hal path")
+ parser.add_argument("refGenome", help="Name of reference genome")
+ parser.add_argument("refSequence", help="Name of reference sequence")
+ parser.add_argument("refLength", help="Length of reference sequence", type=int)
+ parser.add_argument("tgtGenomes", help="Comma-separated list of target genomes")
+
+ parser.add_argument("--reps", help="Number of queries to perform",
+ type=int, default=100)
+
+ parser.add_argument("--udc", help="UDC path", default=None)
+
+ parser.add_argument("--zapUdc", help="If a UDC path is specified with --udc, then"
+ " erase it before each individual query to make sure it doesnt"
+ " get used. this is for comparison only", action="store_true",
+ default=False)
+
+ parser.add_argument("--doDupes", help="Do duplications", action="store_true",
+ default=False)
+
+ parser.add_argument("--maxSnp", help="Max query size to get bases",
+ type=int, default=50000)
+
+ parser.add_argument("--binSize", help="Bin size for output table",
+ type=int, default=1000)
+
+ parser.add_argument("--seed", help="Random seed", type=int, default=None)
+
+ parser.add_argument("--retry", help="Number of retries if a query fails (which"
+ " happens more than you think", type=int, default=10)
+
+ args = parser.parse_args()
+ args.tgtGenomes = args.tgtGenomes.split(",")
+
+ if args.udc is not None and not os.path.exists(args.udc):
+ os.makedirs(args.udc)
+
+ if args.seed is not None:
+ random.seed(args.seed)
+
+ times = runSim(args)
+ binnedTimes = binTimes(args, times)
+ printTable(args, binnedTimes)
+
+if __name__ == "__main__":
+ sys.exit(main())
115 chain/test/blockVizMaf.c
View
@@ -0,0 +1,115 @@
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "halBlockViz.h"
+
+#ifdef ENABLE_UDC
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include <pthread.h>
+#include "common.h"
+#include "udc.h"
+#ifdef __cplusplus
+}
+#endif
+#endif
+
+struct bv_args_t
+{
+ char* path;
+ char* qSpecies;
+ char* tSpecies;
+ char* tChrom;
+ int tStart;
+ int tEnd;
+ int doSeq;
+ int doDupes;
+ char* udcPath;
+};
+
+static int parseArgs(int argc, char** argv, bv_args_t* args)
+{
+ if (argc != 7 && argc != 8 && argc != 9)
+ {
+ return -1;
+ }
+ args->path = argv[1];
+ args->qSpecies = argv[2];
+ args->tSpecies = argv[3];
+ args->tChrom = argv[4];
+ if (sscanf(argv[5], "%d", &args->tStart) != 1 ||
+ sscanf(argv[6], "%d", &args->tEnd) != 1)
+ {
+ return -1;
+ }
+ args->doSeq = 0;
+ args->doDupes = 0;
+ if (argc >= 8)
+ {
+ if (sscanf(argv[7], "%d", &args->doDupes) != 1)
+ {
+ return -1;
+ }
+ }
+ args->udcPath = NULL;
+ if (argc >= 9)
+ {
+ args->udcPath = argv[8];
+ }
+ return 0;
+}
+
+static int openWrapper(char* path)
+{
+ if (strcmp(path + strlen(path) - 3, "hal") == 0)
+ {
+ return halOpen(path);
+ }
+ return halOpenLOD(path);
+}
+
+int main(int argc, char** argv)
+{
+ bv_args_t args;
+
+ if (parseArgs(argc, argv, &args) != 0)
+ {
+ fprintf(stderr, "Usage: %s <halLodPath> <qSpecies> <tSpecies> <tChrom> "
+ "<tStart> <tEnd> [doDupes=0] [udcPath=NULL]\n\n", argv[0]);
+ return -1;
+ }
+#ifdef ENABLE_UDC
+ if (args.udcPath != NULL)
+ {
+ udcSetDefaultDir(args.udcPath);
+ }
+#endif
+
+ int handle = openWrapper(args.path);
+ int ret = -1;
+ if (handle >= 0)
+ {
+ // printStats(stdout, handle);
+ hal_species_t qSpecies;
+ qSpecies.name = args.qSpecies;
+ qSpecies.next = NULL;
+
+ long numBytes = halGetMAF(stdout,
+ handle,
+ &qSpecies,
+ args.tSpecies,
+ args.tChrom,
+ args.tStart,
+ args.tEnd,
+ args.doDupes);
+
+ if (numBytes >= 0)
+ {
+ ret = 0;
+ }
+ printf("\nread %ld bytes\n", numBytes);
+ }
+ return ret;
+}
25 chain/test/blockVizTest.c
View
@@ -70,8 +70,8 @@ static int parseArgs(int argc, char** argv, bv_args_t* args)
static void printBlock(FILE* file, struct hal_block_t* b)
{
- fprintf(file, "chr:%s, tSt:%ld, qSt:%ld, size:%ld, strand:%c: %s\n",
- b->qChrom, b->tStart, b->qStart, b->size, b->strand, b->sequence);
+ fprintf(file, "chr:%s, tSt:%ld, qSt:%ld, size:%ld, strand:%c: tgt : %s query: %s\n",
+ b->qChrom, b->tStart, b->qStart, b->size, b->strand, b->tSequence, b->qSequence);
}
static void printDupeList(FILE* file, struct hal_target_dupe_list_t* d)
@@ -122,13 +122,15 @@ static void* getBlocksWrapper(void* voidArgs)
if (handle >= 0)
{
results = halGetBlocksInTargetRange(handle,
- args->qSpecies,
- args->tSpecies,
- args->tChrom,
- args->tStart,
- args->tEnd,
- args->doSeq,
- 0);
+ args->qSpecies,
+ args->tSpecies,
+ args->tChrom,
+ args->tStart,
+ args->tEnd,
+ 0,
+ args->doSeq,
+ HAL_QUERY_AND_TARGET_DUPS,
+ 1);
halFreeBlockResults(results);
}
pthread_exit(NULL);
@@ -165,8 +167,11 @@ int main(int argc, char** argv)
args.tChrom,
args.tStart,
args.tEnd,
+ 0,
args.doSeq,
- args.doDupes);
+ HAL_QUERY_AND_TARGET_DUPS,
+ 1);
+
if (results == NULL)
{
ret = -1;
124 chain/test/blockVizTime.c
View
@@ -0,0 +1,124 @@
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "halBlockViz.h"
+
+#ifdef ENABLE_UDC
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include <pthread.h>
+#include "common.h"
+#include "udc.h"
+#ifdef __cplusplus
+}
+#endif
+#endif
+
+struct bv_args_t
+{
+ char* path;
+ char* qSpecies;
+ char* tSpecies;
+ char* tChrom;
+ int tStart;
+ int tEnd;
+ int doSeq;
+ int doDupes;
+ char* udcPath;
+};
+
+static int parseArgs(int argc, char** argv, bv_args_t* args)
+{
+ if (argc != 7 && argc != 8 && argc != 9 && argc != 10)
+ {
+ return -1;
+ }
+ args->path = argv[1];
+ args->qSpecies = argv[2];
+ args->tSpecies = argv[3];
+ args->tChrom = argv[4];
+ if (sscanf(argv[5], "%d", &args->tStart) != 1 ||
+ sscanf(argv[6], "%d", &args->tEnd) != 1)
+ {
+ return -1;
+ }
+ args->doSeq = 0;
+ if (argc >= 8)
+ {
+ if (sscanf(argv[7], "%d", &args->doSeq) != 1)
+ {
+ return -1;
+ }
+ }
+ args->doDupes = 0;
+ if (argc >= 9)
+ {
+ if (sscanf(argv[8], "%d", &args->doDupes) != 1)
+ {
+ return -1;
+ }
+ }
+ args->udcPath = NULL;
+ if (argc >= 10)
+ {
+ args->udcPath = argv[9];
+ }
+ return 0;
+}
+
+
+static int openWrapper(char* path)
+{
+ if (strcmp(path + strlen(path) - 3, "hal") == 0)
+ {
+ return halOpen(path);
+ }
+ return halOpenLOD(path);
+}
+
+int main(int argc, char** argv)
+{
+ bv_args_t args;
+
+ if (parseArgs(argc, argv, &args) != 0)
+ {
+ fprintf(stderr, "Usage: %s <halLodPath> <qSpecies> <tSpecies> <tChrom> "
+ "<tStart> <tEnd> [doSeq=0] [doDupes=0] [udcPath=NULL]\n\n", argv[0]);
+ return -1;
+ }
+#ifdef ENABLE_UDC
+ if (args.udcPath != NULL)
+ {
+ udcSetDefaultDir(args.udcPath);
+ }
+#endif
+
+ int handle = openWrapper(args.path);
+ int ret = 0;
+ if (handle >= 0)
+ {
+ struct hal_block_results_t* results =
+ halGetBlocksInTargetRange(handle,
+ args.qSpecies,
+ args.tSpecies,
+ args.tChrom,
+ args.tStart,
+ args.tEnd,
+ 0,
+ args.doSeq,
+ HAL_QUERY_AND_TARGET_DUPS,
+ 1);
+ if (results == NULL)
+ {
+ ret = -1;
+ }
+ halFreeBlockResults(results);
+ }
+ else
+ {
+ ret = -1;
+ }
+ return ret;
+}
57 chain/test/halChainGetBlocksTest.cpp
View
@@ -62,7 +62,7 @@ void ChainGetBlocksSimpleTest::checkCallBack(AlignmentConstPtr alignment)
// BASIC TEST
BlockMapper mapper;
- mapper.init(parent, child, 0, 9, false, 0, true);
+ mapper.init(parent, child, 0, 9, false, false, 0, true);
mapper.map();
const BlockMapper::MSSet& segMap = mapper.getMap();
CuAssertTrue(_testCase, segMap.size() == 1);
@@ -89,7 +89,7 @@ void ChainGetBlocksInversionTest::checkCallBack(AlignmentConstPtr alignment)
// BASIC TEST
BlockMapper mapper;
- mapper.init(parent, child, 0, 9, false, 0, true);
+ mapper.init(parent, child, 0, 9, false, false, 0, true);
mapper.map();
MSRefSet segMap;
segMap.insert(mapper.getMap().begin(), mapper.getMap().end());
@@ -118,7 +118,7 @@ void ChainGetBlocksOffsetTest::checkCallBack(
// BASIC TEST
BlockMapper mapper;
- mapper.init(parent, child, 1, 4, false, 0, true);
+ mapper.init(parent, child, 1, 4, false, false, 0, true);
mapper.map();
MSRefSet segMap;
segMap.insert(mapper.getMap().begin(), mapper.getMap().end());
@@ -177,7 +177,7 @@ void ChainGetBlocksInversionOffsetTest::checkCallBack(
// BASIC TEST
BlockMapper mapper;
- mapper.init(parent, child, 1, 4, false, 0, true);
+ mapper.init(parent, child, 1, 4, false, false, 0, true);
mapper.map();
MSRefSet segMap;
segMap.insert(mapper.getMap().begin(), mapper.getMap().end());
@@ -236,7 +236,7 @@ void ChainGetBlocksOffsetQRefTest::checkCallBack(
// BASIC TEST
BlockMapper mapper;
- mapper.init(child, parent, 1, 4, false, 0, true);
+ mapper.init(child, parent, 1, 4, false, false, 0, true);
mapper.map();
MSRefSet segMap;
segMap.insert(mapper.getMap().begin(), mapper.getMap().end());
@@ -295,7 +295,7 @@ void ChainGetBlocksInversionOffsetQRefTest::checkCallBack(
// BASIC TEST
BlockMapper mapper;
- mapper.init(child, parent, 1, 4, false, 0, true);
+ mapper.init(child, parent, 1, 4, false, false, 0, true);
mapper.map();
MSRefSet segMap;
segMap.insert(mapper.getMap().begin(), mapper.getMap().end());
@@ -355,7 +355,7 @@ void ChainGetBlocksInversionOffsetQSisTest::checkCallBack(
// BASIC TEST
BlockMapper mapper;
- mapper.init(child1, child2, 1, 4, false, 0, true);
+ mapper.init(child1, child2, 1, 4, false, false, 0, true);
mapper.map();
MSRefSet segMap;
segMap.insert(mapper.getMap().begin(), mapper.getMap().end());
@@ -406,6 +406,34 @@ void ChainGetBlocksInversionOffsetQSisTest::checkCallBack(
}
+void ChainGetBlocksSimpleLiftoverTest::checkCallBack(
+ AlignmentConstPtr alignment)
+{
+ const Genome* parent = alignment->openGenome("parent");
+ const Genome* child = alignment->openGenome("child2");
+
+ // BASIC TEST
+ BlockMapper mapper;
+ mapper.init(parent, child, 0, 9, true, false, 0, false);
+ mapper.map();
+ const BlockMapper::MSSet& segMap = mapper.getMap();
+ CuAssertTrue(_testCase, segMap.size() == 1);
+ MSRefSet::const_iterator mapIt = segMap.begin();
+
+ SlicedSegmentConstPtr refSeg = (*mapIt)->getSource();
+ SlicedSegmentConstPtr queSeg = (*mapIt);
+
+ CuAssertTrue(_testCase, refSeg->getGenome() == parent);
+ CuAssertTrue(_testCase, refSeg->getStartPosition() == 9);
+ CuAssertTrue(_testCase, refSeg->getReversed() == true);
+ CuAssertTrue(_testCase, refSeg->getLength() == 10);
+
+ CuAssertTrue(_testCase, queSeg->getGenome() == child);
+ CuAssertTrue(_testCase, queSeg->getStartPosition() == 9);
+ CuAssertTrue(_testCase, queSeg->getReversed() == true);
+ CuAssertTrue(_testCase, queSeg->getLength() == 10);
+}
+
void halChainGetBlocksSimpleTest(CuTest *testCase)
{
try
@@ -497,6 +525,20 @@ void halChainGetBlocksInversionOffsetQSisTest(CuTest *testCase)
}
}
+void halChainGetBlocksSimpleLiftoverTest(CuTest *testCase)
+{
+ try
+ {
+ ChainGetBlocksSimpleLiftoverTest tester;
+ tester.check(testCase);
+ }
+ catch (...)
+ {
+ CuAssertTrue(testCase, false);
+ }
+}
+
+
CuSuite *halChainGetBlocksTestSuite(void)
{
CuSuite* suite = CuSuiteNew();
@@ -507,6 +549,7 @@ CuSuite *halChainGetBlocksTestSuite(void)
SUITE_ADD_TEST(suite, halChainGetBlocksOffsetQRefTest);
SUITE_ADD_TEST(suite, halChainGetBlocksInversionOffsetQRefTest);
SUITE_ADD_TEST(suite, halChainGetBlocksInversionOffsetQSisTest);
+ SUITE_ADD_TEST(suite, halChainGetBlocksSimpleLiftoverTest);
return suite;
}
6 chain/test/halChainGetBlocksTest.h
View
@@ -50,5 +50,11 @@ struct ChainGetBlocksInversionOffsetQSisTest : public ChainGetBlocksSimpleTest
void checkCallBack(hal::AlignmentConstPtr alignment);
};
+struct ChainGetBlocksSimpleLiftoverTest : public ChainGetBlocksSimpleTest
+{
+ void checkCallBack(hal::AlignmentConstPtr alignment);
+};
+
+
#endif
20 chain/test/timing.sh
View
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+binSize=1000000
+reps=1000
+ref=reference
+tgts=EscherichiaColiWUid162011,EscherichiaColiKo11flUid162099,EscherichiaColiKo11flUid52593,ShigellaSonnei53gUid84383
+refChr=referencerefChr1
+len=9808498
+lodPath=http://hgwdev.cse.ucsc.edu/~nknguyen/ecoli/hub/pangenome-dups/lod.txt
+halPath=http://hgwdev.cse.ucsc.edu/~nknguyen/ecoli/hub/pangenome-dups/out.hal
+udcPath=/hive/users/hickey/udcTemp
+outPath=/hive/users/hickey/timing
+seed=2323
+python=/cluster/home/jcarmstr/Python-2.7/python
+
+${python} ./blockVizBenchmark.py ${lodPath} ${ref} ${refChr} ${len} ${tgts} --udc ${udcPath} --reps ${reps} --binSize ${binSize} --seed ${seed} > ${outPath}/ecoli.csv
+
+${python} ./blockVizBenchmark.py ${lodPath} ${ref} ${refChr} ${len} ${tgts} --udc ${udcPath} --reps ${reps} --binSize ${binSize} --zapUdc --seed ${seed} > ${outPath}/ecoli_noudc.csv
+
+${python} ./blockVizBenchmark.py ${halPath} ${ref} ${refChr} ${len} ${tgts} --udc ${udcPath} --reps ${reps} --binSize ${binSize} --seed ${seed} > ${outPath}/ecoli_nolod.csv
6 include.mk
View
@@ -27,9 +27,11 @@ cpp = h5c++ ${h5prefix}
cxx = h5cc ${h5prefix}
ifeq (${SYS},Darwin) #This is to deal with the Mavericks replacing gcc with clang fully and changing libraries
+ifneq ($(wildcard /usr/bin/clang),)
cppflags += -stdlib=libstdc++
cflags += -stdlib=libstdc++
endif
+endif
# add compiler flag and kent paths if udc is enabled
# relies on KENTSRC containing path to top level kent/ dir
@@ -68,11 +70,11 @@ endif
# CLAPACKPATH=/usr/local/software/CLAPACK
ifeq ($(TARGETOS), Darwin)
- phyloPcppflags += -DENABLE_PHYLOP -I${PHAST}/include -framework vecLib -DVECLIB
+ phyloPcppflags += -DENABLE_PHYLOP -I${PHAST}/include -I${PHAST}/src/lib/pcre -framework vecLib -DVECLIB
phyloPlibs += -L${PHAST}/lib -lphast -lc
else
F2CPATH=${CLAPACKPATH}/F2CLIBS
- phyloPcppflags += -DENABLE_PHYLOP -I${PHAST}/include -I${CLAPACKPATH}/INCLUDE -I${F2CPATH}
+ phyloPcppflags += -DENABLE_PHYLOP -I${PHAST}/include -I${PHAST}/src/lib/pcre -I${CLAPACKPATH}/INCLUDE -I${F2CPATH}
phyloPlibs += -L${PHAST}/lib -lphast -L${CLAPACKPATH} -L${F2CPATH} -llapack -ltmg -lblaswr -lf2c
endif
12 liftover/Makefile
View
@@ -24,11 +24,11 @@ ${libPath}/halLiftover.a : ${libSources} ${libHeaders} ${libPath}/halLib.a ${bas
rm *.o
mv halLiftover.a ${libPath}/
-${binPath}/halLiftover : impl/halLiftoverMain.cpp ${libPath}/halLiftover.a ${libPath}/halChain.a ${libPath}/halLib.a ${basicLibsDependencies}
- ${cpp} ${cppflags} -I inc -I impl -I ${libPath} -I impl -I tests -o ${binPath}/halLiftover impl/halLiftoverMain.cpp ${libPath}/halLiftover.a ${libPath}/halChain.a ${libPath}/halLib.a ${basicLibs}
+${binPath}/halLiftover : impl/halLiftoverMain.cpp ${libPath}/halLiftover.a ${libPath}/halLib.a ${basicLibsDependencies}
+ ${cpp} ${cppflags} -I inc -I impl -I ${libPath} -I impl -I tests -o ${binPath}/halLiftover impl/halLiftoverMain.cpp ${libPath}/halLiftover.a ${libPath}/halLib.a ${basicLibs}
-${binPath}/halWiggleLiftover : impl/halWiggleLiftoverMain.cpp ${libPath}/halLiftover.a ${libPath}/halChain.a ${libPath}/halLib.a ${basicLibsDependencies}
- ${cpp} ${cppflags} -I inc -I impl -I ${libPath} -I impl -I tests -o ${binPath}/halWiggleLiftover impl/halWiggleLiftoverMain.cpp ${libPath}/halLiftover.a ${libPath}/halChain.a ${libPath}/halLib.a ${basicLibs}
+${binPath}/halWiggleLiftover : impl/halWiggleLiftoverMain.cpp ${libPath}/halLiftover.a ${libPath}/halLib.a ${basicLibsDependencies}
+ ${cpp} ${cppflags} -I inc -I impl -I ${libPath} -I impl -I tests -o ${binPath}/halWiggleLiftover impl/halWiggleLiftoverMain.cpp ${libPath}/halLiftover.a ${libPath}/halLib.a ${basicLibs}
-${binPath}/halLiftoverTests : ${libTestSources} ${libTestHeaders} ${libTestsCommon} ${libTestsHeadersCommon} ${libSources} ${libHeaders} ${libInternalHeaders} ${libPath}/halChain.a ${libPath}/halLib.a ${basicLibsDependencies}
- ${cpp} ${cppflags} -I inc -I impl -I ${libPath} -I tests -I ../api/tests -o ${binPath}/halLiftoverTests ${libTestSources} ${libTestsCommon} ${libPath}/halChain.a ${libPath}/halLib.a ${libPath}/halLiftover.a ${basicLibs}
+${binPath}/halLiftoverTests : ${libTestSources} ${libTestHeaders} ${libTestsCommon} ${libTestsHeadersCommon} ${libSources} ${libHeaders} ${libInternalHeaders} ${libPath}/halLib.a ${basicLibsDependencies}
+ ${cpp} ${cppflags} -I inc -I impl -I ${libPath} -I tests -I ../api/tests -o ${binPath}/halLiftoverTests ${libTestSources} ${libTestsCommon} ${libPath}/halLib.a ${libPath}/halLiftover.a ${basicLibs}
11 chain/impl/halBlockMapper.cpp → liftover/impl/halBlockMapper.cpp
View
@@ -36,12 +36,14 @@ void BlockMapper::erase()
void BlockMapper::init(const Genome* refGenome, const Genome* queryGenome,
hal_index_t absRefFirst, hal_index_t absRefLast,
+ bool targetReversed,
bool doDupes, hal_size_t minLength,
bool mapTargetAdjacencies)
{
erase();
_absRefFirst = absRefFirst;
_absRefLast = absRefLast;
+ _targetReversed = targetReversed;
_doDupes = doDupes;
_minLength = minLength;
_mapAdj = mapTargetAdjacencies;
@@ -89,13 +91,22 @@ void BlockMapper::map()
while (refSeg->getArrayIndex() < lastIndex &&
refSeg->getStartPosition() <= _absRefLast)
{
+ if (_targetReversed == true)
+ {
+ refSeg->toReverseInPlace();
+ }
refSeg->getMappedSegments(_segSet, _queryGenome, &_spanningTree,
_doDupes, _minLength);
+ if (_targetReversed == true)
+ {
+ refSeg->toReverseInPlace();
+ }
refSeg->toRight(_absRefLast);
}
if (_mapAdj)
{
+ assert(_targetReversed == false);
MSSet::const_iterator i;
for (i = _segSet.begin(); i != _segSet.end(); ++i)
{
2  chain/inc/halBlockMapper.h → liftover/inc/halBlockMapper.h
View
@@ -31,6 +31,7 @@ class BlockMapper
void init(const Genome* refGenome, const Genome* queryGenome,
hal_index_t absRefFirst, hal_index_t absRefLast,
+ bool targetReversed,
bool doDupes, hal_size_t minLength,
bool mapTargetAdjacencies);
void map();
@@ -82,6 +83,7 @@ class BlockMapper
hal_index_t _absRefFirst;
hal_index_t _absRefLast;
bool _mapAdj;
+ bool _targetReversed;
static hal_size_t _maxAdjScan;
};
5 liftover/inc/halWiggleTiles.h
View
@@ -85,7 +85,7 @@ inline void WiggleTiles<T>::init(hal_size_t genomeSize, T defaultValue,
_defaultValue = defaultValue;
_tileSize = std::min(tileSize, genomeSize);
_lastTileSize = genomeSize % _tileSize;
- hal_size_t numTiles = genomeSize / tileSize;
+ hal_size_t numTiles = genomeSize / _tileSize;
if (_lastTileSize > 0)
{
++numTiles;
@@ -108,6 +108,9 @@ inline void WiggleTiles<T>::clear()
{
_tiles.clear();
_bits.clear();
+ _tileSize = 0;
+ _genomeSize = 0;
+ _lastTileSize = 0;
}
template <class T>
14 maf/impl/hal2maf.cpp
View
@@ -238,8 +238,18 @@ int main(int argc, char** argv)
}
else
{
- mafExport.convertSegmentedSequence(mafStream, alignment, ref,
- start, length, targetSet);
+ if (start == 0 && length == 0 && ref->getSequenceLength() == 0)
+ {
+ string refSeqName =
+ refSequence != NULL ? refSequence->getName() : refGenome->getName();
+ cerr << "hal2maf: Warning reference sequence " << refSeqName
+ << " has zero length. MAF output will be empty" << endl;
+ }
+ else
+ {
+ mafExport.convertSegmentedSequence(mafStream, alignment, ref,
+ start, length, targetSet);
+ }
}
if (mafPath != "stdout")
{
8 stats/halStats.py
View
@@ -65,6 +65,14 @@ def getHalStats(halPath):
outList.append(tuple(tokens))
return outList
+def getHalTotalStats(halPath):
+ allStats = getHalStats(halPath)
+ totals = (0, 0, 0, 0, 0)
+ for genome in allStats:
+ assert len(genome) == 6
+ totals = tuple(sum(t) for t in zip(totals, genome)[1:])
+ return totals
+
def getHalSequenceStats(halPath, genomeName):
res = runShellCommand("halStats %s --sequenceStats %s" %
(halPath, genomeName)).split("\n")
Please sign in to comment.
Something went wrong with that request. Please try again.