Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

merge development

  • Loading branch information...
commit 4f814f9b3ea4acb74f84472d94ea5abc75bcfdcf 2 parents 1eb1b9f + 7415d13
@glennhickey authored
Showing with 8,289 additions and 1,486 deletions.
  1. +67 −0 CODING_STYLE.txt
  2. +1 −1  Makefile
  3. +69 −9 README.md
  4. BIN  README.pdf
  5. +7 −9 alignability/halAlignability.cpp
  6. +1 −0  api/Makefile
  7. +21 −0 api/hdf5_impl/hdf5Alignment.cpp
  8. +4 −0 api/hdf5_impl/hdf5Alignment.h
  9. +5 −0 api/hdf5_impl/hdf5BottomSegment.cpp
  10. +1 −0  api/hdf5_impl/hdf5BottomSegment.h
  11. +2 −4 api/hdf5_impl/hdf5ExternalArray.cpp
  12. +278 −66 api/hdf5_impl/hdf5Genome.cpp
  13. +9 −2 api/hdf5_impl/hdf5Genome.h
  14. +1 −1  api/hdf5_impl/hdf5MetaData.cpp
  15. +74 −78 api/hdf5_impl/hdf5Sequence.cpp
  16. +12 −20 api/hdf5_impl/hdf5Sequence.h
  17. +4 −3 api/hdf5_impl/hdf5SequenceIterator.cpp
  18. +5 −0 api/hdf5_impl/hdf5TopSegment.cpp
  19. +1 −0  api/hdf5_impl/hdf5TopSegment.h
  20. +23 −11 api/hdf5_tests/hdf5SequenceTypeTest.cpp
  21. +25 −1 api/impl/defaultBottomSegmentIterator.cpp
  22. +3 −0  api/impl/defaultBottomSegmentIterator.h
  23. +32 −10 api/impl/defaultColumnIterator.cpp
  24. +3 −1 api/impl/defaultColumnIterator.h
  25. +25 −1 api/impl/defaultGappedBottomSegmentIterator.cpp
  26. +1 −0  api/impl/defaultGappedBottomSegmentIterator.h
  27. +23 −0 api/impl/defaultGappedTopSegmentIterator.cpp
  28. +1 −0  api/impl/defaultGappedTopSegmentIterator.h
  29. +58 −37 api/impl/defaultMappedSegment.cpp
  30. +2 −2 api/impl/defaultMappedSegment.h
  31. +23 −0 api/impl/defaultSegmentIterator.cpp
  32. +2 −1  api/impl/defaultSegmentIterator.h
  33. +20 −1 api/impl/defaultTopSegmentIterator.cpp
  34. +3 −0  api/impl/defaultTopSegmentIterator.h
  35. +48 −24 api/impl/halCommon.cpp
  36. +8 −1 api/inc/halAlignment.h
  37. +0 −34 api/inc/halBottomSegmentIterator.h
  38. +12 −21 api/inc/halColumnIterator.h
  39. +16 −0 api/inc/halCommon.h
  40. +6 −0 api/inc/halCountedPtr.h
  41. +2 −1  api/inc/halDefs.h
  42. +0 −12 api/inc/halGappedBottomSegmentIterator.h
  43. +0 −14 api/inc/halGappedTopSegmentIterator.h
  44. +5 −0 api/inc/halGenome.h
  45. +10 −0 api/inc/halSegment.h
  46. +1 −1  api/inc/halSegmentedSequence.h
  47. +2 −2 api/inc/halSequence.h
  48. +0 −31 api/inc/halTopSegmentIterator.h
  49. +5 −1 api/tests/halAlignmentTest.cpp
  50. +2 −2 api/tests/halMappedSegmentTest.cpp
  51. +11 −0 assemblyHub/Makefile
  52. 0  {mask → assemblyHub}/__init__.py
  53. +54 −0 assemblyHub/alignabilityTrack.py
  54. +99 −0 assemblyHub/assemblyHubCommon.py
  55. +266 −0 assemblyHub/bedTrack.py
  56. +101 −0 assemblyHub/conservationTrack.py
  57. +56 −0 assemblyHub/gcPercentTrack.py
  58. +347 −0 assemblyHub/hal2assemblyHub.py
  59. +147 −0 assemblyHub/prepareHubFiles.py
  60. +92 −0 assemblyHub/prepareLodFiles.py
  61. +57 −0 assemblyHub/rmskTrack.py
  62. +31 −0 assemblyHub/snakeTrack.py
  63. +154 −0 assemblyHub/wigTrack.py
  64. +18 −16 chain/Makefile
  65. +0 −383 chain/hal2assemblyHub.py
  66. +0 −99 chain/impl/halBlockInterpolate.cpp
  67. +241 −33 chain/impl/halBlockViz.cpp
  68. +0 −28 chain/inc/halBlockInterpolate.h
  69. +53 −5 chain/inc/halBlockViz.h
  70. +0 −131 chain/test/blockInterpolateTest.cpp
  71. +5 −2 chain/test/blockVizBed.c
  72. +170 −0 chain/test/blockVizBenchmark.py
  73. +115 −0 chain/test/blockVizMaf.c
  74. +15 −10 chain/test/blockVizTest.c
  75. +124 −0 chain/test/blockVizTime.c
  76. +50 −7 chain/test/halChainGetBlocksTest.cpp
  77. +6 −0 chain/test/halChainGetBlocksTest.h
  78. +20 −0 chain/test/timing.sh
  79. +19 −8 extract/Makefile
  80. 0  extract/__init__.py
  81. +428 −0 extract/impl/hal4dExtract.cpp
  82. +139 −0 extract/impl/hal4dExtractMain.cpp
  83. +17 −7 extract/{ → impl}/halAlignedExtract.cpp
  84. 0  extract/{ → impl}/halExtract.cpp
  85. 0  {mask → extract}/impl/halMaskExtractMain.cpp
  86. 0  {mask → extract}/impl/halMaskExtractor.cpp
  87. +57 −0 extract/inc/hal4dExtract.h
  88. 0  {mask → extract}/inc/halMaskExtractor.h
  89. +284 −0 extract/tests/hal4dExtractTest.cpp
  90. +32 −0 extract/tests/hal4dExtractTest.h
  91. +41 −1 include.mk
  92. +12 −7 liftover/Makefile
  93. +175 −10 liftover/impl/halBedLine.cpp
  94. +36 −12 liftover/impl/halBedScanner.cpp
  95. +67 −0 liftover/impl/halBlockLiftover.cpp
  96. +12 −1 {chain → liftover}/impl/halBlockMapper.cpp
  97. +281 −161 liftover/impl/halLiftover.cpp
  98. +29 −2 liftover/impl/halLiftoverMain.cpp
  99. +264 −0 liftover/impl/halWiggleLiftover.cpp
  100. +157 −0 liftover/impl/halWiggleLiftoverMain.cpp
  101. +64 −0 liftover/impl/halWiggleLoader.cpp
  102. +241 −0 liftover/impl/halWiggleScanner.cpp
  103. +30 −0 liftover/inc/halBedLine.h
  104. +12 −6 liftover/inc/halBedScanner.h
  105. +3 −0  liftover/inc/halBlockLiftover.h
  106. +2 −0  {chain → liftover}/inc/halBlockMapper.h
  107. +9 −2 liftover/inc/halLiftover.h
  108. +49 −0 liftover/inc/halTabFacet.h
  109. +77 −0 liftover/inc/halWiggleLiftover.h
  110. +47 −0 liftover/inc/halWiggleLoader.h
  111. +62 −0 liftover/inc/halWiggleScanner.h
  112. +199 −0 liftover/inc/halWiggleTiles.h
  113. +1 −0  lod/Makefile
  114. +2 −1  lod/halLodBenchmark.py
  115. +16 −7 lod/halLodInterpolate.py
  116. +34 −3 lod/impl/halLodExtract.cpp
  117. +11 −9 lod/impl/halLodExtractMain.cpp
  118. +18 −4 lod/impl/halLodGraph.cpp
  119. +50 −28 lod/impl/halLodManager.cpp
  120. +5 −2 lod/inc/halLodExtract.h
  121. +5 −1 lod/inc/halLodManager.h
  122. +4 −3 maf/Makefile
  123. +22 −16 maf/hal2mafMP.py
  124. +46 −52 maf/impl/hal2maf.cpp
  125. +119 −0 maf/impl/halMafBed.cpp
  126. +52 −0 maf/inc/halMafBed.h
  127. +0 −23 mask/Makefile
  128. +1 −0  mutations/Makefile
  129. +40 −0 phyloP/Makefile
  130. +18 −0 phyloP/README.txt
  131. +306 −0 phyloP/halPhyloPMP.py
  132. +260 −0 phyloP/halPhyloPTrain.py
  133. +156 −0 phyloP/halTreePhyloP.py
  134. +438 −0 phyloP/impl/halPhyloP.cpp
  135. +116 −0 phyloP/impl/halPhyloPBed.cpp
  136. +273 −0 phyloP/impl/halPhyloPMain.cpp
  137. +87 −0 phyloP/inc/halPhyloP.h
  138. +51 −0 phyloP/inc/halPhyloPBed.h
  139. BIN  phyloP/test/blanchette.hal
  140. +10 −0 phyloP/test/blanchette.mod
  141. +3 −0  phyloP/test/test.sh
  142. +1 −0  stats/Makefile
  143. +18 −2 stats/halStats.py
  144. +126 −2 stats/impl/halStatsMain.cpp
View
67 CODING_STYLE.txt
@@ -0,0 +1,67 @@
+Please adhere to these style convetions for source code you will push (or send pull requests for) into the HAL repository. They should be pretty obvious by looking any of the existing source code...
+
+
+indent: 2 spaces (no tab characters)
+
+curly braces: on their own lines, no indent. ex:
+
+if (x > y)
+{
+ return true;
+}
+
+class names, struct names and other types: first character capitalized
+
+function names and variable names: first character lower case (ex variableNameOne)
+
+member variables: begin with _
+
+multiword names: capitalize subsequent words (ie no underscore) (ex ClassNameOne, variableNameOne)
+
+inline functions: not defined in class definition. ex:
+
+class X
+{
+ void f();
+};
+
+inline X::voidf()
+{
+
+}
+
+maximum line width: 80 characters
+
+never use "using namespace" in a header file
+
+--Glenn
+
+
+.emacs for the above:
+
+(add-to-list 'auto-mode-alist '("\\.h\\'" . c++-mode))
+
+(c-add-style "mycodingstyle"
+ '((c-comment-only-line-offset . 0)
+ (c-hanging-braces-alist . ((substatement-open beforeafter)))
+ (c-offsets-alist . ((topmost-intro . 0)
+ (topmost-intro-cont . 0)
+ (substatement . 3)
+ (substatement-open . 0)
+ (statement-case-open . 3)
+ (statement-cont . 3)
+ (access-label . -3)
+ (inclass . 3)
+ (inline-open . 3)
+ (innamespace . 0)
+ ))))
+
+;; c/c++ mode
+(add-hook 'c-mode-common-hook
+ '(lambda()
+ (c-set-style "mycodingstyle")
+ (setq tab-width 2)
+ (setq c-basic-offset tab-width)
+ (setq tab-width 8
+ ;; this will make sure spaces are used instead of tabs
+ indent-tabs-mode nil)))
View
2  Makefile
@@ -1,5 +1,5 @@
# order is important, libraries first
-modules = api stats randgen validate mutations maf extract fasta alignability lod chain liftover mask analysis
+modules = api stats randgen validate mutations fasta alignability liftover lod maf chain extract analysis phyloP assemblyHub
.PHONY: all %.all clean %.clean doxy %.doxy
View
78 README.md
@@ -1,6 +1,6 @@
-Hierarchical Alignment (HAL) Format API (v1.4)
+Hierarchical Alignment (HAL) Format API (v2.1)
=====
-Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu)
+Copyright (C) 2012 - 2013 by Glenn Hickey (hickey@soe.ucsc.edu)
Released under the MIT license, see LICENSE.txt
HAL is a structure to efficiently store and index multiple genome alignments and ancestral reconstructions. HAL is a graph-based representation which provides several advantages over matrix/block-based formats such as MAF, such as improved scalability and the ability to perform queries with respect to an arbitrary reference or subtree.
@@ -13,6 +13,14 @@ Glenn Hickey, Benedict Paten, Dent Earl, Daniel Zerbino, and David
Haussler. HAL: A Hierarchical Format for Storing and Analyzing
Multiple Genome Alignments. Bioinformatics. 2013. [Advance Online Access](http://bioinformatics.oxfordjournals.org/content/early/2013/03/16/bioinformatics.btt128.abstract)
+Code Contributors
+-----
+* Glenn Hickey (UCSC)
+* Joel Armstrong (UCSC)
+* Ngan Nguyen (UCSC)
+* Benedict Paten (UCSC)
+* Melissa Jane Hubisz (Cornell)
+
Installation
-----
@@ -26,6 +34,10 @@ From the parent directory of where you want HAL installed:
git clone git://github.com/glennhickey/hal.git
+#### Progressive Cactus Package
+
+Note that HAL can also be downloaded and installed (automatically along with all its dependencies) as part of the [Progressive Cactus installation package](https://github.com/glennhickey/progressiveCactus)
+
### Installing Dependencies
#### HDF5 1.8 with C++ API enabled
@@ -73,24 +85,54 @@ to reflect the directory where you installed sonLib
Define ENABLE_UDC before making, and specify the path of the Kent source tree using KENTSRC. When built with this enabled, all HAL files opened read-only will be accessed using UDC which supports both local files and URLs.
- `export ENABLE_UDC=1`
- `export KENTSRC=<path to top level of Kent source tree>`
+ export ENABLE_UDC=1
+ export KENTSRC=<path to top level of Kent source tree>
Those without the UCSC genome browser already installed locally will probably find it simpler to first mount URLs with [HTTPFS](http://httpfs.sourceforge.net/) before opening with HAL.
+#### Optional support of PhyloP evolutionary constraint annotation
+
+PhyloP is part of the [Phast Package](http://compgen.bscb.cornell.edu/phast/), and can be used to test for genomic positions that are under selective pressure. We are working on prototype support for running PhyloP on HAL files. In order to enable this support, Phast must be installed. We recommend downloading the latest source using Subversion.
+
+From the same parent directory where you downloaded HAL:
+
+* First install CLAPACK (Linux only)
+
+ `wget http://www.netlib.org/clapack/clapack.tgz`
+ `tar -xvzf clapack.tgz`
+ `mv CLAPACK-3.2.1 clapack`
+ `cd clapack`
+ `cp make.inc.example make.inc && make f2clib && make blaslib && make lib`
+ ``export CLAPACKPATH=`pwd` ``
+ `cd ..`
+
+* Install Phast (Mac or Linux)
+
+ `svn co http://compgen.bscb.cornell.edu/svnrepo/phast/trunk phast/`
+ `cd phast`
+ ``export PHAST=`pwd` ``
+ `cd src && make`
+ `cd ../..`
+
+* Before building HAL
+
+ `export ENABLE_PHYLOP=1`
+
+Special thanks to Melissa Jane Hubiz and Adam Siepel from Cornell University for their work on extending their tools to work with HAL.
+
### Building HAL
From the hal/ directory:
- `make`
+ make
Before using HAL, add it to your path:
- `export PATH=<path to hal>/bin:${PATH}`
+ export PATH=<path to hal>/bin:${PATH}
The parent directory of hal/ should be in your PYTHONPATH in order to use any of the Python functionality. This includes running `make test`
- `export PYTHONPATH=<parent of hal>:${PYTHONPATH}`
+ export PYTHONPATH=<parent of hal>:${PYTHONPATH}
HAL Tools
-----
@@ -233,7 +275,7 @@ Annotations in [BED](http://genome.ucsc.edu/FAQ/FAQformat.html#format1), ie tab-
SequenceName StartPosition LastPosition+1
-can be lifted over between genomes using `halLiftover`. halLiftover does a base-by-base mapping between any two sequences in the alignment (following paralogy relations as well).
+can be lifted over between genomes using `halLiftover`. halLiftover does a base-by-base mapping between any two sequences in the alignment (following paralogy relations as well). The output is written in BED (default) or PSL format.
halLiftover mammals.hal human human_annotation.bed dog dog_annotation.bed
@@ -241,6 +283,10 @@ will map all annotations in human_annotation.bed, which must refer to sequences
halLiftover attempts to autodetect the BED version of the input. This can be overried with the `--inVedVersion` option. Columns that are not described in the official BED specs can be optionally mapped as-is using the `--keepExtra` option.
+By default, halLiftover uses spaces and/or tabs to separate columns. To use only tabs (ie to allow spaces within names), use the `--tab` option.
+
+Annotations in [Wiggle](http://genome.ucsc.edu/goldenPath/help/wiggle.html) format can likewise be mapped using `halWiggleLiftover`
+
#### Alignability
The number of distinct genomes different bases of a set of target genomes align to can be computed using the `halAlignability` tool. The output is in `.wig` format.
@@ -269,8 +315,22 @@ Two bed files must be specified because the coordinates of inserted (and by conv
Point mutations can optionally be written using the `--snpFile <file>` option. The '--maxGap' and '--maxNFraction' options can specify the gap indel threshold and missing data threshold, respectively, as described above in the *halSummarizeMtuations* section.
+### Constrained Element Prediction
+
+(Under development)
+
+PhyloP is part of the [Phast Package](http://compgen.bscb.cornell.edu/phast/), and can be used to test for genomic positions that are under selective pressure. We are working on prototype support for running PhyloP on HAL files.
+
+* Train a neutral model
+
+ See `halPhyloPTrain.py`
+
+* Detect constrained elements
+
+ See `halPhyloPMP.py`
+
+Special thanks to Melissa Jane Hubiz and Adam Siepel from Cornell University for their work on extending their tools to work with HAL.
-### Importing from other formats
Example of HAL Genome Representation
-----
View
BIN  README.pdf
Binary file not shown
View
16 alignability/halAlignability.cpp
@@ -240,6 +240,10 @@ void printSequence(ostream& outStream, const Sequence* sequence,
hal_size_t start, hal_size_t length, hal_size_t step)
{
hal_size_t seqLen = sequence->getSequenceLength();
+ if (seqLen == 0)
+ {
+ return;
+ }
/** If the length is 0, we do from the start position until the end
* of the sequence */
if (length == 0)
@@ -259,12 +263,6 @@ void printSequence(ostream& outStream, const Sequence* sequence,
const Genome* genome = sequence->getGenome();
string sequenceName = sequence->getName();
string genomeName = genome->getName();
- /** A hack to take the genome name out of the chromosome name. Should
- * be largely unnecessary now that our naming conventions are better */
- if (sequenceName.find(genomeName + '.') == 0)
- {
- sequenceName = sequenceName.substr(genomeName.length() + 1);
- }
/** The ColumnIterator is fundamental structure used in this example to
* traverse the alignment. It essientially generates the multiple alignment
@@ -388,9 +386,9 @@ void printGenome(ostream& outStream,
start < seqStart + seqLen &&
runningLength < length)
{
- hal_size_t readStart = seqStart >= start ? 0 : seqStart - start;
- hal_size_t readLen = std::min(seqLen - start, length - runningLength);
-
+ hal_size_t readStart = seqStart >= start ? 0 : start - seqStart;
+ hal_size_t readLen = min(seqLen - readStart, length);
+ readLen = min(readLen, length - runningLength);
printSequence(outStream, sequence, targetSet, readStart, readLen, step);
runningLength += readLen;
}
View
1  api/Makefile
@@ -19,6 +19,7 @@ doxy :
${libPath}/halLib.a : ${libSources} ${libHeaders} ${libInternalHeaders} ${basicLibsDependencies}
cp ${libHeaders} ${libPath}/
+ rm -f *.o
${cpp} ${cppflags} -I inc -I hdf5_impl -I impl -I ${libPath}/ -c ${libSources}
ar rc halLib.a *.o
ranlib halLib.a
View
21 api/hdf5_impl/hdf5Alignment.cpp
@@ -402,6 +402,27 @@ string HDF5Alignment::getParentName(const string& name) const
return stTree_getLabel(parent);
}
+
+void HDF5Alignment::updateBranchLength(const string& parentName,
+ const string& childName,
+ double length)
+{
+ map<string, stTree*>::iterator findIt = _nodeMap.find(childName);
+ if (findIt == _nodeMap.end())
+ {
+ throw hal_exception(string("node ") + childName + " not found");
+ }
+ stTree* node = findIt->second;
+ stTree* parent = stTree_getParent(node);
+ if (parent == NULL || parentName != stTree_getLabel(parent))
+ {
+ throw hal_exception(string("edge ") + parentName + "--" + childName +
+ " not found");
+ }
+ stTree_setBranchLength(node, length);
+ _dirty = true;
+}
+
double HDF5Alignment::getBranchLength(const string& parentName,
const string& childName) const
{
View
4 api/hdf5_impl/hdf5Alignment.h
@@ -55,6 +55,10 @@ class HDF5Alignment : public Alignment
std::string getParentName(const std::string& name) const;
+ void updateBranchLength(const std::string& parentName,
+ const std::string& childName,
+ double length);
+
double getBranchLength(const std::string& parentName,
const std::string& childName) const;
View
5 api/hdf5_impl/hdf5BottomSegment.cpp
@@ -110,6 +110,11 @@ bool HDF5BottomSegment::isMissingData(double nThreshold) const
return false;
}
+void HDF5BottomSegment::print(std::ostream& os) const
+{
+ os << "HDF5 Bottom Segment";
+}
+
// HDF5 SPECIFIC
H5::CompType HDF5BottomSegment::dataType(hal_size_t numChildren)
{
View
1  api/hdf5_impl/hdf5BottomSegment.h
@@ -62,6 +62,7 @@ class HDF5BottomSegment : public BottomSegment
const std::set<const Genome*>* genomesOnPath,
bool doDupes,
hal_size_t minLength) const;
+ void print(std::ostream& os) const;
// BOTTOM SEGMENT INTERFACE
hal_size_t getNumChildren() const;
View
6 api/hdf5_impl/hdf5ExternalArray.cpp
@@ -128,14 +128,12 @@ void HDF5ExternalArray::load(CommonFG* file, const H5std_string& path,
// create the internal data buffer
_bufSize = _chunkSize > 1 ? _chunkSize : _size;
- _bufStart = 0;
_bufEnd = _bufStart + _bufSize - 1;
+ // set out of range to ensure page happens
+ _bufStart = _bufEnd + 1;
delete [] _buf;
_buf = new char[_bufSize * _dataSize];
- // fill buffer from disk
- page(0);
-
assert(_bufSize > 0 || _size == 0);
}
View
344 api/hdf5_impl/hdf5Genome.cpp
@@ -27,7 +27,8 @@ using namespace H5;
const string HDF5Genome::dnaArrayName = "DNA_ARRAY";
const string HDF5Genome::topArrayName = "TOP_ARRAY";
const string HDF5Genome::bottomArrayName = "BOTTOM_ARRAY";
-const string HDF5Genome::sequenceArrayName = "SEQUENCE_ARRAY";
+const string HDF5Genome::sequenceIdxArrayName = "SEQIDX_ARRAY";
+const string HDF5Genome::sequenceNameArrayName = "SEQNAME_ARRAY";
const string HDF5Genome::metaGroupName = "Meta";
const string HDF5Genome::rupGroupName = "Rup";
const double HDF5Genome::dnaChunkScale = 10.;
@@ -62,6 +63,18 @@ HDF5Genome::HDF5Genome(const string& name,
_metaData = new HDF5MetaData(&_group, metaGroupName);
_rup = new HDF5MetaData(&_group, rupGroupName);
+ _totalSequenceLength = _dnaArray.getSize() * 2;
+ if (_totalSequenceLength > 0 && _rup->get(rupGroupName) == "1")
+ {
+ _totalSequenceLength -= 1;
+ }
+ else if (_totalSequenceLength == 0 && _sequenceIdxArray.getSize() > 0)
+ {
+ HDF5Sequence lastSeq(this, &_sequenceIdxArray, &_sequenceNameArray,
+ _sequenceNameArray.getSize() - 1);
+ _totalSequenceLength = lastSeq.getEndPosition() + 1;
+ }
+
hsize_t chunk;
_dcprops.getChunk(1, &chunk);
}
@@ -116,10 +129,17 @@ void HDF5Genome::setDimensions(
catch (H5::Exception){}
try
{
- DataSet d = _group.openDataSet(sequenceArrayName);
- _group.unlink(sequenceArrayName);
+ DataSet d = _group.openDataSet(sequenceIdxArrayName);
+ _group.unlink(sequenceIdxArrayName);
+ }
+ catch (H5::Exception){}
+ try
+ {
+ DataSet d = _group.openDataSet(sequenceNameArrayName);
+ _group.unlink(sequenceNameArrayName);
}
catch (H5::Exception){}
+
if (_totalSequenceLength > 0 && storeDNAArrays == true)
{
hal_size_t arrayLength = _totalSequenceLength / 2;
@@ -146,12 +166,15 @@ void HDF5Genome::setDimensions(
}
if (totalSeq > 0)
{
- _sequenceArray.create(&_group, sequenceArrayName,
- // pad names a bit to allow renaming
- HDF5Sequence::dataType(maxName + 32),
- totalSeq, NULL, _numChunksInArrayBuffer);
- writeSequences(sequenceDimensions);
-
+ _sequenceIdxArray.create(&_group, sequenceIdxArrayName,
+ HDF5Sequence::idxDataType(),
+ totalSeq + 1, &_dcprops, _numChunksInArrayBuffer);
+
+ _sequenceNameArray.create(&_group, sequenceNameArrayName,
+ HDF5Sequence::nameDataType(maxName + 1),
+ totalSeq, &_dcprops, _numChunksInArrayBuffer);
+
+ writeSequences(sequenceDimensions);
}
// Do the same as above for the segments.
@@ -165,9 +188,12 @@ void HDF5Genome::setDimensions(
void HDF5Genome::updateTopDimensions(
const vector<Sequence::UpdateInfo>& topDimensions)
{
+ loadSequencePosCache();
+ loadSequenceNameCache();
vector<Sequence::UpdateInfo>::const_iterator i;
map<string, HDF5Sequence*>::iterator cacheIt;
map<string, const Sequence::UpdateInfo*> inputMap;
+ map<string, hal_size_t> currentTopD;
// copy input into map, checking everything is already present
for (i = topDimensions.begin(); i != topDimensions.end(); ++i)
{
@@ -181,10 +207,34 @@ void HDF5Genome::updateTopDimensions(
}
inputMap.insert(pair<string, const Sequence::UpdateInfo*>(name, &*i));
}
- // scan through existing sequences, updating as necessary
- // build summary of all new and unchanged dimensions in newDimensions
+ // keep a record of the number of segments in each existing
+ // segment (these can get muddled as we add the new ones in the next
+ // loop to be sure by getting them in one shot)
+ // Note to self: iterating the map in this way skips zero-length
+ // sequences (which are in the separate vector). This is fine
+ // here since we will never update them, but seems like it could be
+ // dangerous if something were to change
map<hal_size_t, HDF5Sequence*>::iterator posCacheIt;
map<string, const Sequence::UpdateInfo*>::iterator inputIt;
+ for (posCacheIt = _sequencePosCache.begin();
+ posCacheIt != _sequencePosCache.end(); ++posCacheIt)
+ {
+ HDF5Sequence* sequence = posCacheIt->second;
+ inputIt = inputMap.find(sequence->getName());
+ if (inputIt == inputMap.end())
+ {
+ currentTopD.insert(pair<string, hal_size_t>(
+ sequence->getName(),
+ sequence->getNumTopSegments()));
+ }
+ }
+ // scan through existing sequences, updating as necessary
+ // build summary of all new and unchanged dimensions in newDimensions
+ // Note to self: iterating the map in this way skips zero-length
+ // sequences (which are in the separate vector). This is fine
+ // here since we will never update them, but seems like it could be
+ // dangerous if something were to change
+ map<string, hal_size_t>::iterator currentIt;
vector<Sequence::UpdateInfo> newDimensions;
Sequence::UpdateInfo newInfo;
hal_size_t topArrayIndex = 0;
@@ -197,15 +247,17 @@ void HDF5Genome::updateTopDimensions(
if (inputIt != inputMap.end())
{
const Sequence::UpdateInfo* updateInfo = inputIt->second;
- sequence->setNumTopSegments(updateInfo->_numSegments);
newDimensions.push_back(*updateInfo);
}
else
{
+ currentIt = currentTopD.find(sequence->getName());
+ assert(currentIt != currentTopD.end());
newInfo._name = posCacheIt->first;
- newInfo._numSegments = sequence->getNumTopSegments();
+ newInfo._numSegments = currentIt->second;
newDimensions.push_back(newInfo);
}
+ sequence->setNumTopSegments(newDimensions.back()._numSegments);
topArrayIndex += newDimensions.back()._numSegments;
}
setGenomeTopDimensions(newDimensions);
@@ -214,9 +266,12 @@ void HDF5Genome::updateTopDimensions(
void HDF5Genome::updateBottomDimensions(
const vector<Sequence::UpdateInfo>& bottomDimensions)
{
+ loadSequencePosCache();
+ loadSequenceNameCache();
vector<Sequence::UpdateInfo>::const_iterator i;
map<string, HDF5Sequence*>::iterator cacheIt;
map<string, const Sequence::UpdateInfo*> inputMap;
+ map<string, hal_size_t> currentBottomD;
// copy input into map, checking everything is already present
for (i = bottomDimensions.begin(); i != bottomDimensions.end(); ++i)
{
@@ -230,10 +285,34 @@ void HDF5Genome::updateBottomDimensions(
}
inputMap.insert(pair<string, const Sequence::UpdateInfo*>(name, &*i));
}
- // scan through existing sequences, updating as necessary
- // build summary of all new and unchanged dimensions in newDimensions
+ // keep a record of the number of segments in each existing
+ // segment (these can get muddled as we add the new ones in the next
+ // loop to be sure by getting them in one shot)
+ // Note to self: iterating the map in this way skips zero-length
+ // sequences (which are in the separate vector). This is fine
+ // here since we will never update them, but seems like it could be
+ // dangerous if something were to change
map<hal_size_t, HDF5Sequence*>::iterator posCacheIt;
map<string, const Sequence::UpdateInfo*>::iterator inputIt;
+ for (posCacheIt = _sequencePosCache.begin();
+ posCacheIt != _sequencePosCache.end(); ++posCacheIt)
+ {
+ HDF5Sequence* sequence = posCacheIt->second;
+ inputIt = inputMap.find(sequence->getName());
+ if (inputIt == inputMap.end())
+ {
+ currentBottomD.insert(pair<string, hal_size_t>(
+ sequence->getName(),
+ sequence->getNumBottomSegments()));
+ }
+ }
+ // scan through existing sequences, updating as necessary
+ // build summary of all new and unchanged dimensions in newDimensions
+ // Note to self: iterating the map in this way skips zero-length
+ // sequences (which are in the separate vector). This is fine
+ // here since we will never update them, but seems like it could be
+ // dangerous if something were to change
+ map<string, hal_size_t>::iterator currentIt;
vector<Sequence::UpdateInfo> newDimensions;
Sequence::UpdateInfo newInfo;
hal_size_t bottomArrayIndex = 0;
@@ -246,15 +325,17 @@ void HDF5Genome::updateBottomDimensions(
if (inputIt != inputMap.end())
{
const Sequence::UpdateInfo* updateInfo = inputIt->second;
- sequence->setNumBottomSegments(updateInfo->_numSegments);
newDimensions.push_back(*updateInfo);
}
else
{
+ currentIt = currentBottomD.find(sequence->getName());
+ assert(currentIt != currentBottomD.end());
newInfo._name = posCacheIt->first;
- newInfo._numSegments = sequence->getNumBottomSegments();
+ newInfo._numSegments = currentIt->second;
newDimensions.push_back(newInfo);
}
+ sequence->setNumBottomSegments(newDimensions.back()._numSegments);
bottomArrayIndex += newDimensions.back()._numSegments;
}
setGenomeBottomDimensions(newDimensions);
@@ -320,11 +401,13 @@ void HDF5Genome::setGenomeBottomDimensions(
hal_size_t HDF5Genome::getNumSequences() const
{
- return _sequenceArray.getSize();
+ assert(_sequenceIdxArray.getSize() == _sequenceNameArray.getSize() + 1);
+ return _sequenceNameArray.getSize();
}
Sequence* HDF5Genome::getSequence(const string& name)
{
+ loadSequenceNameCache();
Sequence* sequence = NULL;
map<string, HDF5Sequence*>::iterator mapIt = _sequenceNameCache.find(name);
if (mapIt != _sequenceNameCache.end())
@@ -336,6 +419,7 @@ Sequence* HDF5Genome::getSequence(const string& name)
const Sequence* HDF5Genome::getSequence(const string& name) const
{
+ loadSequenceNameCache();
const Sequence* sequence = NULL;
map<string, HDF5Sequence*>::const_iterator mapIt =
_sequenceNameCache.find(name);
@@ -348,12 +432,15 @@ const Sequence* HDF5Genome::getSequence(const string& name) const
Sequence* HDF5Genome::getSequenceBySite(hal_size_t position)
{
+ loadSequencePosCache();
map<hal_size_t, HDF5Sequence*>::iterator i;
i = _sequencePosCache.upper_bound(position);
if (i != _sequencePosCache.end())
{
if (position >= (hal_size_t)i->second->getStartPosition())
{
+ assert(position < i->second->getStartPosition() +
+ i->second->getSequenceLength());
return i->second;
}
}
@@ -362,12 +449,15 @@ Sequence* HDF5Genome::getSequenceBySite(hal_size_t position)
const Sequence* HDF5Genome::getSequenceBySite(hal_size_t position) const
{
+ loadSequencePosCache();
map<hal_size_t, HDF5Sequence*>::const_iterator i;
i = _sequencePosCache.upper_bound(position);
if (i != _sequencePosCache.end())
{
if (position >= (hal_size_t)i->second->getStartPosition())
{
+ assert(position < i->second->getStartPosition() +
+ i->second->getSequenceLength());
return i->second;
}
}
@@ -377,7 +467,7 @@ const Sequence* HDF5Genome::getSequenceBySite(hal_size_t position) const
SequenceIteratorPtr HDF5Genome::getSequenceIterator(
hal_index_t position)
{
- assert(position <= (hal_index_t)_sequenceArray.getSize());
+ assert(position <= (hal_index_t)_sequenceNameArray.getSize());
HDF5SequenceIterator* newIt = new HDF5SequenceIterator(this, position);
return SequenceIteratorPtr(newIt);
}
@@ -385,7 +475,7 @@ SequenceIteratorPtr HDF5Genome::getSequenceIterator(
SequenceIteratorConstPtr HDF5Genome::getSequenceIterator(
hal_index_t position) const
{
- assert(position <= (hal_index_t)_sequenceArray.getSize());
+ assert(position <= (hal_index_t)_sequenceNameArray.getSize());
// genome effectively gets re-consted when returned in the
// const iterator. just save doubling up code.
HDF5SequenceIterator* newIt = new HDF5SequenceIterator(
@@ -437,15 +527,15 @@ const Genome* HDF5Genome::getParent() const
Genome* HDF5Genome::getChild(hal_size_t childIdx)
{
assert(childIdx < _numChildrenInBottomArray);
- if (_childCache.size() <= childIdx ||
- _childCache[childIdx] == NULL)
+ if (_childCache.size() <= childIdx)
+ {
+ _childCache.assign(_numChildrenInBottomArray, NULL);
+ }
+ if (_childCache[childIdx] == NULL)
{
vector<string> childNames = _alignment->getChildNames(_name);
- _childCache.resize(childNames.size());
- for (size_t i = 0; i < _childCache.size(); ++i)
- {
- _childCache[i] = _alignment->openGenome(childNames.at(i));
- }
+ assert(childNames.size() > childIdx);
+ _childCache[childIdx] = _alignment->openGenome(childNames.at(childIdx));
}
return _childCache[childIdx];
}
@@ -453,15 +543,15 @@ Genome* HDF5Genome::getChild(hal_size_t childIdx)
const Genome* HDF5Genome::getChild(hal_size_t childIdx) const
{
assert(childIdx < _numChildrenInBottomArray);
- if (_childCache.size() <= childIdx ||
- _childCache[childIdx] == NULL)
+ if (_childCache.size() <= childIdx)
+ {
+ _childCache.assign(_numChildrenInBottomArray, NULL);
+ }
+ if (_childCache[childIdx] == NULL)
{
vector<string> childNames = _alignment->getChildNames(_name);
- _childCache.resize(childNames.size());
- for (size_t i = 0; i < _childCache.size(); ++i)
- {
- _childCache[i] = _alignment->openGenome(childNames.at(i));
- }
+ assert(childNames.size() > childIdx);
+ _childCache[childIdx] = _alignment->openGenome(childNames.at(childIdx));
}
return _childCache[childIdx];
}
@@ -473,9 +563,11 @@ hal_size_t HDF5Genome::getNumChildren() const
hal_index_t HDF5Genome::getChildIndex(const Genome* child) const
{
- for (hal_size_t i = 0; i < _numChildrenInBottomArray; ++i)
+ string childName = child->getName();
+ vector<string> childNames = _alignment->getChildNames(_name);
+ for (hal_size_t i = 0; i < childNames.size(); ++i)
{
- if (getChild(i) == child)
+ if (childNames[i] == childName)
{
return i;
}
@@ -488,6 +580,11 @@ bool HDF5Genome::containsDNAArray() const
return _dnaArray.getSize() > 0;
}
+const Alignment* HDF5Genome::getAlignment() const
+{
+ return _alignment;
+}
+
// SEGMENTED SEQUENCE INTERFACE
const string& HDF5Genome::getName() const
@@ -692,7 +789,8 @@ void HDF5Genome::write()
_bottomArray.write();
_metaData->write();
_rup->write();
- _sequenceArray.write();
+ _sequenceIdxArray.write();
+ _sequenceNameArray.write();
}
void HDF5Genome::read()
@@ -704,6 +802,7 @@ void HDF5Genome::read()
_dnaArray.load(&_group, dnaArrayName, _numChunksInArrayBuffer);
}
catch (H5::Exception){}
+
try
{
_group.openDataSet(topArrayName);
@@ -718,55 +817,160 @@ void HDF5Genome::read()
HDF5BottomSegment::numChildrenFromDataType(_bottomArray.getDataType());
}
catch (H5::Exception){}
+
deleteSequenceCache();
try
{
- _group.openDataSet(sequenceArrayName);
- _sequenceArray.load(&_group, sequenceArrayName, _numChunksInArrayBuffer);
- readSequences();
+ _group.openDataSet(sequenceIdxArrayName);
+ _sequenceIdxArray.load(&_group, sequenceIdxArrayName,
+ _numChunksInArrayBuffer);
+ }
+ catch (H5::Exception){}
+ try
+ {
+ _group.openDataSet(sequenceNameArrayName);
+ _sequenceNameArray.load(&_group, sequenceNameArrayName,
+ _numChunksInArrayBuffer);
}
catch (H5::Exception){}
+
+ readSequences();
}
void HDF5Genome::readSequences()
{
deleteSequenceCache();
- _totalSequenceLength = 0;
- hal_size_t numSequences = _sequenceArray.getSize();
- for (hal_size_t i = 0; i < numSequences; ++i)
+}
+
+void HDF5Genome::deleteSequenceCache()
+{
+ if (_sequencePosCache.size() > 0 || _zeroLenPosCache.size() > 0)
+ {
+ map<hal_size_t, HDF5Sequence*>::iterator i;
+ for (i = _sequencePosCache.begin(); i != _sequencePosCache.end(); ++i)
+ {
+ delete i->second;
+ }
+ vector<HDF5Sequence*>::iterator z;
+ for (z = _zeroLenPosCache.begin(); z != _zeroLenPosCache.end(); ++z)
+ {
+ delete *z;
+ }
+ }
+ else if (_sequenceNameCache.size() > 0)
+ {
+ map<string, HDF5Sequence*>::iterator i;
+ for (i = _sequenceNameCache.begin(); i != _sequenceNameCache.end(); ++i)
+ {
+ delete i->second;
+ }
+ }
+ _sequencePosCache.clear();
+ _zeroLenPosCache.clear();
+ _sequenceNameCache.clear(); // I share my pointers with above.
+}
+
+void HDF5Genome::loadSequencePosCache() const
+{
+ if (_sequencePosCache.size() > 0 || _zeroLenPosCache.size() > 0)
+ {
+ return;
+ }
+ hal_size_t totalReadLen = 0;
+ hal_size_t numSequences = _sequenceNameArray.getSize();
+
+ if (_sequenceNameCache.size() > 0)
+ {
+ assert(_sequenceNameCache.size() == numSequences);
+ map<std::string, HDF5Sequence*>::iterator i;
+ for (i = _sequenceNameCache.begin(); i != _sequenceNameCache.end(); ++i)
+ {
+ if (i->second->getSequenceLength() > 0)
+ {
+ _sequencePosCache.insert(pair<hal_size_t, HDF5Sequence*>(
+ i->second->getStartPosition() +
+ i->second->getSequenceLength(), i->second));
+ totalReadLen += i->second->getSequenceLength();
+ }
+ else
+ {
+ _zeroLenPosCache.push_back(i->second);
+ }
+ }
+ }
+ else
{
- HDF5Sequence* seq = new HDF5Sequence(this, &_sequenceArray, i);
- _sequencePosCache.insert(
- pair<hal_size_t, HDF5Sequence*>(seq->getStartPosition() +
- seq->getSequenceLength(), seq));
- _sequenceNameCache.insert(
- pair<string, HDF5Sequence*>(seq->getName(), seq));
- _totalSequenceLength += seq->getSequenceLength();
+ for (hal_size_t i = 0; i < numSequences; ++i)
+ {
+ HDF5Sequence* seq =
+ new HDF5Sequence(const_cast<HDF5Genome*>(this),
+ const_cast<HDF5ExternalArray*>(&_sequenceIdxArray),
+ const_cast<HDF5ExternalArray*>(&_sequenceNameArray),
+ i);
+ if (seq->getSequenceLength() > 0)
+ {
+ _sequencePosCache.insert(
+ pair<hal_size_t, HDF5Sequence*>(seq->getStartPosition() +
+ seq->getSequenceLength(), seq));
+ totalReadLen += seq->getSequenceLength();
+ }
+ else
+ {
+ _zeroLenPosCache.push_back(seq);
+ }
+ }
}
- hal_size_t seqLenFromArray = _dnaArray.getSize() * 2;
- if (seqLenFromArray > 0 && (seqLenFromArray != _totalSequenceLength &&
- seqLenFromArray-1 != _totalSequenceLength))
+ if (_totalSequenceLength > 0 && totalReadLen != _totalSequenceLength)
{
stringstream ss;
ss << "Sequences for genome " << getName() << " have total length "
- << _totalSequenceLength << " but the (non-zero) DNA array contains "
- << seqLenFromArray << " elements. This is an internal error or the "
- << "file is corrupt.";
+ << totalReadLen << " but the (non-zero) DNA array contains "
+ << _totalSequenceLength << " elements. This is an internal error "
+ << "or the file is corrupt.";
throw hal_exception(ss.str());
}
}
-void HDF5Genome::deleteSequenceCache()
+void HDF5Genome::loadSequenceNameCache() const
{
- map<hal_size_t, HDF5Sequence*>::iterator i;
- for (i = _sequencePosCache.begin(); i != _sequencePosCache.end(); ++i)
+ if (_sequenceNameCache.size() > 0)
{
- delete i->second;
+ return;
+ }
+ hal_size_t numSequences = _sequenceNameArray.getSize();
+
+ if (_sequencePosCache.size() > 0 || _zeroLenPosCache.size() > 0)
+ {
+ assert(_sequencePosCache.size() + _zeroLenPosCache.size() == numSequences);
+ map<hal_size_t, HDF5Sequence*>::iterator i;
+ for (i = _sequencePosCache.begin(); i != _sequencePosCache.end(); ++i)
+ {
+ _sequenceNameCache.insert(pair<string, HDF5Sequence*>(
+ i->second->getName(), i->second));
+ }
+ vector<HDF5Sequence*>::iterator z;
+ for (z = _zeroLenPosCache.begin(); z != _zeroLenPosCache.end(); ++z)
+ {
+ _sequenceNameCache.insert(pair<string, HDF5Sequence*>(
+ (*z)->getName(), (*z)));
+ }
+ }
+ else
+ {
+ for (hal_size_t i = 0; i < numSequences; ++i)
+ {
+ HDF5Sequence* seq =
+ new HDF5Sequence(const_cast<HDF5Genome*>(this),
+ const_cast<HDF5ExternalArray*>(&_sequenceIdxArray),
+ const_cast<HDF5ExternalArray*>(&_sequenceNameArray),
+ i);
+
+ _sequenceNameCache.insert(
+ pair<string, HDF5Sequence*>(seq->getName(), seq));
+ }
}
- _sequencePosCache.clear();
- _sequenceNameCache.clear(); // I share my pointers with above.
}
-
+
void HDF5Genome::writeSequences(const vector<Sequence::Info>&
sequenceDimensions)
{
@@ -778,13 +982,21 @@ void HDF5Genome::writeSequences(const vector<Sequence::Info>&
for (i = sequenceDimensions.begin(); i != sequenceDimensions.end(); ++i)
{
// Copy segment into HDF5 array
- HDF5Sequence* seq = new HDF5Sequence(this, &_sequenceArray,
+ HDF5Sequence* seq = new HDF5Sequence(this, &_sequenceIdxArray,
+ &_sequenceNameArray,
i - sequenceDimensions.begin());
// write all the Sequence::Info into the hdf5 sequence record
seq->set(startPosition, *i, topArrayIndex, bottomArrayIndex);
// Keep the object pointer in our caches
- _sequencePosCache.insert(
- pair<hal_size_t, HDF5Sequence*>(startPosition + i->_length, seq));
+ if (seq->getSequenceLength() > 0)
+ {
+ _sequencePosCache.insert(
+ pair<hal_size_t, HDF5Sequence*>(startPosition + i->_length, seq));
+ }
+ else
+ {
+ _zeroLenPosCache.push_back(seq);
+ }
_sequenceNameCache.insert(pair<string, HDF5Sequence*>(i->_name, seq));
startPosition += i->_length;
topArrayIndex += i->_numTopSegments;
View
11 api/hdf5_impl/hdf5Genome.h
@@ -94,6 +94,8 @@ class HDF5Genome : public Genome
bool containsDNAArray() const;
+ const Alignment* getAlignment() const;
+
// SEGMENTED SEQUENCE INTERFACE
hal_size_t getSequenceLength() const;
@@ -169,6 +171,8 @@ class HDF5Genome : public Genome
void writeSequences(const std::vector<hal::Sequence::Info>&
sequenceDimensions);
void deleteSequenceCache();
+ void loadSequencePosCache() const;
+ void loadSequenceNameCache() const;
void setGenomeTopDimensions(
const std::vector<hal::Sequence::UpdateInfo>& sequenceDimensions);
@@ -187,7 +191,8 @@ class HDF5Genome : public Genome
HDF5ExternalArray _dnaArray;
HDF5ExternalArray _topArray;
HDF5ExternalArray _bottomArray;
- HDF5ExternalArray _sequenceArray;
+ HDF5ExternalArray _sequenceIdxArray;
+ HDF5ExternalArray _sequenceNameArray;
H5::Group _group;
H5::DSetCreatPropList _dcprops;
hal_size_t _numChildrenInBottomArray;
@@ -197,12 +202,14 @@ class HDF5Genome : public Genome
mutable Genome* _parentCache;
mutable std::vector<Genome*> _childCache;
mutable std::map<hal_size_t, HDF5Sequence*> _sequencePosCache;
+ mutable std::vector<HDF5Sequence*> _zeroLenPosCache;
mutable std::map<std::string, HDF5Sequence*> _sequenceNameCache;
static const std::string dnaArrayName;
static const std::string topArrayName;
static const std::string bottomArrayName;
- static const std::string sequenceArrayName;
+ static const std::string sequenceIdxArrayName;
+ static const std::string sequenceNameArrayName;
static const std::string metaGroupName;
static const std::string rupGroupName;
View
2  api/hdf5_impl/hdf5MetaData.cpp
@@ -109,7 +109,7 @@ void HDF5MetaData::write()
// we delve into C api for this test
if (H5Aexists(_group.getId(), i->first.c_str()) == true)
{
- _group.removeAttr(_name);
+ _group.removeAttr(i->first.c_str());
}
attr = _group.createAttribute(i->first, vlsType, attSpace);
attr.write(vlsType, i->second);
View
152 api/hdf5_impl/hdf5Sequence.cpp
@@ -22,22 +22,18 @@ using namespace H5;
using namespace hal;
const size_t HDF5Sequence::startOffset = 0;
-const size_t HDF5Sequence::lengthOffset = sizeof(hal_size_t);
-const size_t HDF5Sequence::numTopSegmentsOffset = lengthOffset + sizeof(hal_size_t);
-const size_t HDF5Sequence::numBottomSegmentsOffset = numTopSegmentsOffset + sizeof(hal_size_t);
-const size_t HDF5Sequence::topSegmentArrayIndexOffset = numBottomSegmentsOffset + sizeof(hal_size_t);
+const size_t HDF5Sequence::topSegmentArrayIndexOffset = sizeof(hal_size_t);
const size_t HDF5Sequence::bottomSegmentArrayIndexOffset = topSegmentArrayIndexOffset + sizeof(hal_size_t);
-const size_t HDF5Sequence::nameOffset = bottomSegmentArrayIndexOffset + sizeof(hal_size_t);
+const size_t HDF5Sequence::totalSize = bottomSegmentArrayIndexOffset + sizeof(hal_size_t);
HDF5Sequence::HDF5Sequence(HDF5Genome* genome,
- HDF5ExternalArray* array,
+ HDF5ExternalArray* idxArray,
+ HDF5ExternalArray* nameArray,
hal_index_t index) :
- _array(array),
+ _idxArray(idxArray),
+ _nameArray(nameArray),
_index(index),
- _genome(genome),
- _cacheIndex(NULL_INDEX),
- _startCache(NULL_INDEX),
- _lengthCache(0)
+ _genome(genome)
{
}
@@ -47,31 +43,24 @@ HDF5Sequence::~HDF5Sequence()
}
-// Some tools (ie maf export) access the same sequence
-// info over and over again deep inside a loop. So I hack
-// in a little cache, mostly to avoid copying the same string
-// (name) out of hdf5.
-inline void HDF5Sequence::refreshCache() const
+H5::CompType HDF5Sequence::idxDataType()
{
- if (_index != _cacheIndex)
- {
- _nameCache = _array->get(_index) + nameOffset;
- // genome can be null in unit tests so we hack to not crash.
- if (_genome != NULL)
- {
- _fullNameCache = _genome->getName() + '.' + _nameCache;
- }
- else
- {
- _fullNameCache = _nameCache;
- }
- _startCache = _array->getValue<hal_size_t>(_index, startOffset);
- _lengthCache = _array->getValue<hal_size_t>(_index, lengthOffset);
- _cacheIndex = _index;
- }
+ // the in-memory representations and hdf5 representations
+ // don't necessarily have to be the same, but it simplifies
+ // testing for now.
+ assert(PredType::NATIVE_HSIZE.getSize() == sizeof(hal_size_t));
+ CompType dataType(totalSize);
+ dataType.insertMember("start", startOffset, PredType::NATIVE_HSIZE);
+ dataType.insertMember("topSegmentArrayIndexOffset",
+ topSegmentArrayIndexOffset,
+ PredType::NATIVE_HSIZE);
+ dataType.insertMember("bottomSegmentArrayIndexOffset",
+ bottomSegmentArrayIndexOffset,
+ PredType::NATIVE_HSIZE);
+ return dataType;
}
-H5::CompType HDF5Sequence::dataType(hal_size_t maxNameLength)
+H5::StrType HDF5Sequence::nameDataType(hal_size_t maxNameLength)
{
// the in-memory representations and hdf5 representations
// don't necessarily have to be the same, but it simplifies
@@ -79,32 +68,22 @@ H5::CompType HDF5Sequence::dataType(hal_size_t maxNameLength)
assert(PredType::NATIVE_HSIZE.getSize() == sizeof(hal_size_t));
assert(PredType::NATIVE_CHAR.getSize() == sizeof(char));
StrType strType(PredType::NATIVE_CHAR, (maxNameLength + 1) * sizeof(char));
- CompType dataType(nameOffset + strType.getSize());
- dataType.insertMember("start", startOffset, PredType::NATIVE_HSIZE);
- dataType.insertMember("length", lengthOffset, PredType::NATIVE_HSIZE);
- dataType.insertMember("numSequences", numTopSegmentsOffset,
- PredType::NATIVE_HSIZE);
- dataType.insertMember("numBottomSegments", numBottomSegmentsOffset,
- PredType::NATIVE_HSIZE);
- dataType.insertMember("topSegmentArrayIndexOffset", topSegmentArrayIndexOffset,
- PredType::NATIVE_HSIZE);
- dataType.insertMember("bottomSegmentArrayIndexOffset", bottomSegmentArrayIndexOffset,
- PredType::NATIVE_HSIZE);
- dataType.insertMember("name", nameOffset, strType);
- return dataType;
+// CompType dataType(nameOffset + strType.getSize());
+// dataType.insertMember("name", nameOffset, strType);
+ return strType;
+// return dataType;
}
// SEQUENCE INTERFACE
-const string& HDF5Sequence::getName() const
+string HDF5Sequence::getName() const
{
- refreshCache();
- return _nameCache;
+ return _nameArray->get(_index);
}
-const string& HDF5Sequence::getFullName() const
+string HDF5Sequence::getFullName() const
{
- refreshCache();
- return _fullNameCache;
+ assert(_genome != NULL);
+ return _genome->getName() + '.' + getName();
}
const Genome* HDF5Sequence::getGenome() const
@@ -119,14 +98,12 @@ Genome* HDF5Sequence::getGenome()
hal_index_t HDF5Sequence::getStartPosition() const
{
- refreshCache();
- return _startCache;
+ return _idxArray->getValue<hal_size_t>(_index, startOffset);
}
hal_index_t HDF5Sequence::getEndPosition() const
{
- refreshCache();
- return _startCache + (hal_index_t)(_lengthCache - 1);
+ return _idxArray->getValue<hal_size_t>(_index + 1 , startOffset) - 1;
}
hal_index_t HDF5Sequence::getArrayIndex() const
@@ -137,31 +114,43 @@ hal_index_t HDF5Sequence::getArrayIndex() const
hal_index_t HDF5Sequence::getTopSegmentArrayIndex() const
{
return (hal_index_t)
- _array->getValue<hal_size_t>(_index, topSegmentArrayIndexOffset);
+ _idxArray->getValue<hal_size_t>(_index, topSegmentArrayIndexOffset);
}
hal_index_t HDF5Sequence::getBottomSegmentArrayIndex() const
{
return (hal_index_t)
- _array->getValue<hal_size_t>(_index, bottomSegmentArrayIndexOffset);
+ _idxArray->getValue<hal_size_t>(_index, bottomSegmentArrayIndexOffset);
}
// SEGMENTED SEQUENCE INTERFACE
hal_size_t HDF5Sequence::getSequenceLength() const
{
- refreshCache();
- return _lengthCache;
+ hal_size_t len = _idxArray->getValue<hal_size_t>(_index, startOffset);
+ hal_size_t nlen = _idxArray->getValue<hal_size_t>(_index + 1, startOffset);
+ assert(nlen >= len);
+ return (nlen - len);
}
hal_size_t HDF5Sequence::getNumTopSegments() const
{
- return _array->getValue<hal_size_t>(_index, numTopSegmentsOffset);
+ hal_index_t idx =
+ _idxArray->getValue<hal_size_t>(_index, topSegmentArrayIndexOffset);
+ hal_index_t nextIdx =
+ _idxArray->getValue<hal_size_t>(_index + 1, topSegmentArrayIndexOffset);
+ assert(nextIdx >= idx);
+ return nextIdx - idx;
}
hal_size_t HDF5Sequence::getNumBottomSegments() const
{
- return _array->getValue<hal_size_t>(_index, numBottomSegmentsOffset);
+ hal_index_t idx =
+ _idxArray->getValue<hal_size_t>(_index, bottomSegmentArrayIndexOffset);
+ hal_index_t nextIdx =
+ _idxArray->getValue<hal_size_t>(_index + 1, bottomSegmentArrayIndexOffset);
+ assert(nextIdx >= idx);
+ return nextIdx - idx;
}
TopSegmentIteratorPtr HDF5Sequence::getTopSegmentIterator(
@@ -329,37 +318,44 @@ void HDF5Sequence::set(hal_size_t startPosition,
hal_size_t topSegmentStartIndex,
hal_size_t bottomSegmentStartIndex)
{
- _array->setValue(_index, startOffset, startPosition);
- _array->setValue(_index, lengthOffset, sequenceInfo._length);
- _array->setValue(_index, numTopSegmentsOffset, sequenceInfo._numTopSegments);
- _array->setValue(_index, numBottomSegmentsOffset,
- sequenceInfo._numBottomSegments);
- _array->setValue(_index, topSegmentArrayIndexOffset, topSegmentStartIndex);
- _array->setValue(_index, bottomSegmentArrayIndexOffset,
- bottomSegmentStartIndex);
- char* arrayBuffer = _array->getUpdate(_index) + nameOffset;
+ _idxArray->setValue(_index, startOffset, startPosition);
+ _idxArray->setValue(_index + 1, startOffset,
+ startPosition + sequenceInfo._length);
+ _idxArray->setValue(_index, topSegmentArrayIndexOffset, topSegmentStartIndex);
+ _idxArray->setValue(_index, bottomSegmentArrayIndexOffset,
+ bottomSegmentStartIndex);
+ _idxArray->setValue(_index + 1, topSegmentArrayIndexOffset,
+ topSegmentStartIndex + sequenceInfo._numTopSegments);
+ _idxArray->setValue(_index + 1, bottomSegmentArrayIndexOffset,
+ bottomSegmentStartIndex + sequenceInfo._numBottomSegments);
+ char* arrayBuffer = _nameArray->getUpdate(_index);
strcpy(arrayBuffer, sequenceInfo._name.c_str());
- // keep cache for frequently used queries.
- _cacheIndex = NULL_INDEX;
- refreshCache();
+
+ assert(getStartPosition() == (hal_index_t)startPosition);
+ assert(getNumTopSegments() == sequenceInfo._numTopSegments);
+ assert(getNumBottomSegments() == sequenceInfo._numBottomSegments);
+ assert(getSequenceLength() == sequenceInfo._length);
}
+// These functions look dangerous. Dont think they're used.
void HDF5Sequence::setNumTopSegments(hal_size_t numTopSegments)
{
- _array->setValue(_index, numTopSegmentsOffset, numTopSegments);
+ _idxArray->setValue(_index + 1, topSegmentArrayIndexOffset,
+ getTopSegmentArrayIndex() + numTopSegments);
}
void HDF5Sequence::setNumBottomSegments(hal_size_t numBottomSegments)
{
- _array->setValue(_index, numBottomSegmentsOffset, numBottomSegments);
+ _idxArray->setValue(_index + 1, bottomSegmentArrayIndexOffset,
+ getBottomSegmentArrayIndex() + numBottomSegments);
}
void HDF5Sequence::setTopSegmentArrayIndex(hal_size_t topIndex)
{
- _array->setValue(_index, topSegmentArrayIndexOffset, topIndex);
+ _idxArray->setValue(_index, topSegmentArrayIndexOffset, topIndex);
}
void HDF5Sequence::setBottomSegmentArrayIndex(hal_size_t bottomIndex)
{
- _array->setValue(_index, bottomSegmentArrayIndexOffset, bottomIndex);
+ _idxArray->setValue(_index, bottomSegmentArrayIndexOffset, bottomIndex);
}
View
32 api/hdf5_impl/hdf5Sequence.h
@@ -22,21 +22,18 @@ class HDF5Sequence : public Sequence
public:
- /** Constructor
- * @param genome Smart pointer to genome to which segment belongs
- * @param array HDF5 array containg segment
- * @param index Index of segment in the array */
HDF5Sequence(HDF5Genome* genome,
- HDF5ExternalArray* array,
+ HDF5ExternalArray* idxArray,
+ HDF5ExternalArray* nameArray,
hal_index_t index);
/** Destructor */
~HDF5Sequence();
// SEQUENCE INTERFACE
- const std::string& getName() const;
+ std::string getName() const;
- const std::string& getFullName() const;
+ std::string getFullName() const;
const Genome* getGenome() const;
@@ -116,7 +113,8 @@ class HDF5Sequence : public Sequence
// LOCAL NON-INTERFACE METHODS
- static H5::CompType dataType(hal_size_t maxNameLength);
+ static H5::CompType idxDataType();
+ static H5::StrType nameDataType(hal_size_t maxNameLength);
void set(hal_size_t startPosition, const Sequence::Info& sequenceInfo,
hal_size_t topSegmentStartIndex,
@@ -132,25 +130,19 @@ class HDF5Sequence : public Sequence
protected:
- void refreshCache() const;
+ void refreshNameCache() const;
+ void refreshFullNameCache() const;
+ void refreshIdxCache() const;
static const size_t startOffset;
- static const size_t lengthOffset;
- static const size_t numTopSegmentsOffset;
- static const size_t numBottomSegmentsOffset;
static const size_t topSegmentArrayIndexOffset;
static const size_t bottomSegmentArrayIndexOffset;
- static const size_t nameOffset;
+ static const size_t totalSize;
- mutable HDF5ExternalArray* _array;
+ mutable HDF5ExternalArray* _idxArray;
+ mutable HDF5ExternalArray* _nameArray;
mutable hal_index_t _index;
mutable HDF5Genome* _genome;
-
- mutable hal_index_t _cacheIndex;
- mutable std::string _nameCache;
- mutable std::string _fullNameCache;
- mutable hal_index_t _startCache;
- mutable hal_size_t _lengthCache;
};
}
View
7 api/hdf5_impl/hdf5SequenceIterator.cpp
@@ -15,7 +15,8 @@ using namespace hal;
HDF5SequenceIterator::HDF5SequenceIterator(HDF5Genome* genome,
hal_index_t index) :
- _sequence(genome, &genome->_sequenceArray, index)
+_sequence(genome, &genome->_sequenceIdxArray,
+ &genome->_sequenceNameArray, index)
{
}
@@ -52,7 +53,7 @@ void HDF5SequenceIterator::toPrev() const
Sequence* HDF5SequenceIterator::getSequence()
{
assert(_sequence._index >= 0 && _sequence._index <
- (hal_index_t)_sequence._genome->_sequenceArray.getSize());
+ (hal_index_t)_sequence._genome->_sequenceNameArray.getSize());
// don't return local sequence pointer. give cached pointer from
// genome instead (so it will not expire when iterator moves!)
return _sequence._genome->getSequence(_sequence.getName());
@@ -61,7 +62,7 @@ Sequence* HDF5SequenceIterator::getSequence()
const Sequence* HDF5SequenceIterator::getSequence() const
{
assert(_sequence._index >= 0 && _sequence._index <
- (hal_index_t)_sequence._genome->_sequenceArray.getSize());
+ (hal_index_t)_sequence._genome->_sequenceNameArray.getSize());
// don't return local sequence pointer. give cached pointer from
// genome instead (so it will not expire when iterator moves!)
return _sequence._genome->getSequence(_sequence.getName());
View
5 api/hdf5_impl/hdf5TopSegment.cpp
@@ -119,6 +119,11 @@ bool HDF5TopSegment::isCanonicalParalog() const
return isCanon;
}
+void HDF5TopSegment::print(std::ostream& os) const
+{
+ os << "HDF5 Top Segment";
+}
+
// HDF5 SPECIFIC
H5::CompType HDF5TopSegment::dataType()
{
View
1  api/hdf5_impl/hdf5TopSegment.h
@@ -61,6 +61,7 @@ class HDF5TopSegment : public TopSegment
const std::set<const Genome*>* genomesOnPath,
bool doDupes,
hal_size_t minLength) const;
+ void print(std::ostream& os) const;
// TOP SEGMENT INTERFACE
hal_index_t getParentIndex() const;
View
34 api/hdf5_tests/hdf5SequenceTypeTest.cpp
@@ -66,47 +66,59 @@ void hdf5SequenceTypeTest(CuTest *testCase)
setup();
try
{
- CompType datatype = HDF5Sequence::dataType(length);
+ CompType idxDatatype = HDF5Sequence::idxDataType();
+ StrType nameDatatype = HDF5Sequence::nameDataType(length);
H5File file(H5std_string(fileName), H5F_ACC_TRUNC);
- HDF5ExternalArray myArray;
+ HDF5ExternalArray myIdxArray;
+ HDF5ExternalArray myNameArray;
DSetCreatPropList cparms;
if (chunkSize > 0)
{
cparms.setChunk(1, &chunkSize);
}
- myArray.create(&file, datasetName, datatype, N, &cparms);
+ myIdxArray.create(&file, datasetName, idxDatatype, N + 1, &cparms);
+ myNameArray.create(&file, datasetName + "_n", nameDatatype, N,
+ &cparms);
hal_size_t totalTopSegments = 0;
hal_size_t totalBottomSegments = 0;
+ hal_index_t startPosition = 0;
for (hsize_t i = 0; i < N; ++i)
{
- HDF5Sequence sequence(NULL, &myArray, i);
+ HDF5Sequence sequence(NULL, &myIdxArray, &myNameArray, i);
Sequence::Info seqInfo(genName(i, length), i * 2, i * 3, i * 4);
- sequence.set(i, seqInfo, totalTopSegments, totalBottomSegments);
+ sequence.set(startPosition, seqInfo, totalTopSegments,
+ totalBottomSegments);
+ startPosition += i * 2;
totalTopSegments += seqInfo._numTopSegments;
totalBottomSegments += seqInfo._numBottomSegments;
}
- myArray.write();
+ myIdxArray.write();
+ myNameArray.write();
file.flush(H5F_SCOPE_LOCAL);
file.close();
H5File rfile(H5std_string(fileName), H5F_ACC_RDONLY);
- HDF5ExternalArray readArray;
- readArray.load(&rfile, datasetName);
-
+ HDF5ExternalArray readIdxArray;
+ HDF5ExternalArray readNameArray;
+ readIdxArray.load(&rfile, datasetName);
+ readNameArray.load(&rfile, datasetName + "_n");
+
+ startPosition = 0;
for (hsize_t i = 0; i < N; ++i)
{
- HDF5Sequence sequence(NULL, &readArray, i);
+ HDF5Sequence sequence(NULL, &readIdxArray, &readNameArray, i);
CuAssertTrue(testCase,
sequence.getName() == genName(i, length));
CuAssertTrue(testCase,
- sequence.getStartPosition() == (hal_index_t)i);
+ sequence.getStartPosition() == startPosition);
CuAssertTrue(testCase,
sequence.getSequenceLength() == i * 2);
CuAssertTrue(testCase,
sequence.getNumTopSegments() == i * 3);
CuAssertTrue(testCase,
sequence.getNumBottomSegments() == i * 4);
+ startPosition += i * 2;
}
}
catch(Exception& exception)
View
26 api/impl/defaultBottomSegmentIterator.cpp
@@ -45,7 +45,31 @@ hal_size_t DefaultBottomSegmentIterator::getNumSegmentsInGenome() const
{
return getGenome()->getNumBottomSegments();
}
-
+
+//////////////////////////////////////////////////////////////////////////////
+// SEGMENT INTERFACE OVERRIDE
+//////////////////////////////////////////////////////////////////////////////
+void DefaultBottomSegmentIterator::print(ostream& os) const
+{
+ os << "BotSegIt: ";
+ DefaultSegmentIterator::print(os);
+
+ hal_index_t ai = getArrayIndex();
+ bool offRight =
+ isTop() ? ai >= (hal_index_t)getGenome()->getNumTopSegments() :
+ ai >= (hal_index_t)getGenome()->getNumBottomSegments();
+
+ if (ai != NULL_INDEX && !offRight)
+ {
+ os << " numChilds=" << getNumChildren();
+ for (hal_size_t i = 0; i < getNumChildren(); ++i)
+ {
+ os << " cI[" << i << "]=" << getChildIndex(i);
+ os << " cR[" << i << "]=" << getChildReversed(i);
+ }
+ }
+}
+
//////////////////////////////////////////////////////////////////////////////
// BOTTOM SEGMENT INTERFACE
//////////////////////////////////////////////////////////////////////////////
View
3  api/impl/defaultBottomSegmentIterator.h
@@ -23,6 +23,9 @@ class DefaultBottomSegmentIterator : public DefaultSegmentIterator,
bool inverted = false);
virtual ~DefaultBottomSegmentIterator();
+ // SEGMENT INTERFACE OVERRIDE
+ virtual void print(std::ostream& os) const;
+
// BOTTOM SEGMENT INTERFACE
virtual hal_size_t getNumChildren() const;
virtual hal_index_t getChildIndex(hal_size_t i) const;
View
42 api/impl/defaultColumnIterator.cpp
@@ -66,12 +66,6 @@ DefaultColumnIterator::DefaultColumnIterator(const Genome* reference,
{
_targets = *targets;
_targets.insert(reference);
- const Genome* root = reference;
- while (root->getParent() != NULL)
- {
- // stupid!
- root = root->getParent();
- }
getGenomesInSpanningTree(_targets, _scope);
}
@@ -178,9 +172,10 @@ void DefaultColumnIterator::toRight() const
}
void DefaultColumnIterator::toSite(hal_index_t columnIndex,
- hal_index_t lastColumnIndex) const
+ hal_index_t lastColumnIndex,
+ bool clearCache) const
{
- const Genome* reference = _top->getGenome();
+ const Genome* reference = getReferenceGenome();
assert (columnIndex >= 0 && lastColumnIndex >= columnIndex &&
lastColumnIndex < (hal_index_t)reference->getSequenceLength());
@@ -189,10 +184,21 @@ void DefaultColumnIterator::toSite(hal_index_t columnIndex,
_ref =sequence;
_stack.clear();
_indelStack.clear();
+ if (clearCache == true)
+ {
+ for (VisitCache::iterator i = _visitCache.begin();
+ i != _visitCache.end(); ++i)
+ {
+ delete i->second;
+ }
+ _visitCache.clear();
+ }
defragment();
// note columnIndex in genome (not sequence) coordinates
_stack.push(sequence, columnIndex, lastColumnIndex);
toRight();
+ assert(getReferenceSequencePosition() + sequence->getStartPosition() ==
+ columnIndex);
}
bool DefaultColumnIterator::lastColumn() const
@@ -256,6 +262,21 @@ bool DefaultColumnIterator::isCanonicalOnRef() const
_leftmostRefPos <= _stack[0]->_lastIndex;
}
+void DefaultColumnIterator::print(ostream& os) const
+{
+ const ColumnIterator::ColumnMap* cmap = getColumnMap();
+ for (ColumnIterator::ColumnMap::const_iterator i = cmap->begin();
+ i != cmap->end(); ++i)
+ {
+ os << i->first->getName() << ": ";
+ for (size_t j = 0; j < i->second->size(); ++j)
+ {
+ os << i->second->at(j)->getArrayIndex() << ", ";
+ }
+ os << "\n";
+ }
+}
+
// Starting from the reference sequence which is determined
// from the stack, we start recursing over the entire column.
// if init is specified, all the initial iterators are created
@@ -612,15 +633,16 @@ void DefaultColumnIterator::updateChild(LinkedBottomIterator* bottomIt,
void DefaultColumnIterator::updateNextTopDup(LinkedTopIterator* topIt) const
{
assert (topIt->_it.get() != NULL);
+ const Genome* genome = topIt->_it->getTopSegment()->getGenome();
if (_break || _noDupes == true ||
- topIt->_it->getTopSegment()->getNextParalogyIndex() == NULL_INDEX)
+ topIt->_it->getTopSegment()->getNextParalogyIndex() == NULL_INDEX ||
+ genome->getParent() == NULL || parentInScope(genome) == false)
{
return;
}
hal_index_t firstIndex = topIt->_it->getTopSegment()->getArrayIndex();
LinkedTopIterator* currentTopIt = topIt;
- const Genome* genome = topIt->_it->getTopSegment()->getGenome();
do
{
View
4 api/impl/defaultColumnIterator.h
@@ -35,7 +35,8 @@ class DefaultColumnIterator : public ColumnIterator
// COLUMN ITERATOR INTERFACE
virtual void toRight() const;
- virtual void toSite(hal_index_t columnIndex, hal_index_t lastIndex) const;
+ virtual void toSite(hal_index_t columnIndex, hal_index_t lastIndex,
+ bool clearCache) const;
virtual bool lastColumn() const;
virtual const hal::Genome* getReferenceGenome() const;
virtual const hal::Sequence* getReferenceSequence() const;
@@ -44,6 +45,7 @@ class DefaultColumnIterator : public ColumnIterator
virtual hal_index_t getArrayIndex() const;
virtual void defragment() const;
virtual bool isCanonicalOnRef() const;
+ virtual void print(std::ostream& os) const;
protected:
View
26 api/impl/defaultGappedBottomSegmentIterator.cpp
@@ -194,6 +194,30 @@ hal_size_t DefaultGappedBottomSegmentIterator::getMappedSegments(
"DefaultGappedTopSegmentIterator");
}
+void DefaultGappedBottomSegmentIterator::print(std::ostream& os) const
+{
+ os << "Gapped Bottom Segment: (thresh=" << getGapThreshold()
+ << " ci=" << getChildIndex() << ")\n";
+ os << "Left: ";
+ if (_left.get() == NULL)
+ {
+ os << "NULL";
+ }
+ else
+ {
+ os << *_left;
+ }
+ os << "\nRight: ";
+ if (_right.get() == NULL)
+ {
+ os << "NULL";
+ }
+ else
+ {
+ os << *_right;
+ }
+}
+
//////////////////////////////////////////////////////////////////////////////
// SEGMENT ITERATOR INTERFACE
//////////////////////////////////////////////////////////////////////////////
@@ -506,7 +530,7 @@ bool DefaultGappedBottomSegmentIterator::hasChild() const
if (_left->equals(_right))
{
- assert(_temp->equals(_left) && _temp2->equals(_right));
+ //assert(_temp->equals(_left) && _temp2->equals(_right));
}
// to do: verify edge cases
// assert(_temp->hasChild(_childIndex) == _temp2->hasChild(_childIndex));
View
1  api/impl/defaultGappedBottomSegmentIterator.h
@@ -55,6 +55,7 @@ class DefaultGappedBottomSegmentIterator :
const std::set<const Genome*>* genomesOnPath,
bool doDupes,
hal_size_t minLength) const;
+ virtual void print(std::ostream& os) const;
// SEGMENT ITERATOR IrNTERFACE
virtual void toLeft(hal_index_t leftCutoff = NULL_INDEX) const;
View
23 api/impl/defaultGappedTopSegmentIterator.cpp
@@ -185,6 +185,29 @@ hal_size_t DefaultGappedTopSegmentIterator::getMappedSegments(
"DefaultGappedTopSegmentIterator");
}
+void DefaultGappedTopSegmentIterator::print(std::ostream& os) const
+{
+ os << "Gapped Top Segment: (thresh=" << getGapThreshold() << ")\n";
+ os << "Left: ";
+ if (_left.get() == NULL)
+ {
+ os << "NULL";
+ }
+ else
+ {
+ os << *_left;
+ }
+ os << "\nRight: ";
+ if (_right.get() == NULL)
+ {
+ os << "NULL";
+ }
+ else
+ {
+ os << *_right;
+ }
+}
+
//////////////////////////////////////////////////////////////////////////////
// SEGMENT ITERATOR INTERFACE
//////////////////////////////////////////////////////////////////////////////
View
1  api/impl/defaultGappedTopSegmentIterator.h
@@ -53,6 +53,7 @@ class DefaultGappedTopSegmentIterator : public virtual GappedTopSegmentIterator
const std::set<const Genome*>* genomesOnPath,
bool doDupes,
hal_size_t minLength) const;
+ virtual void print(std::ostream& os) const;
// SEGMENT ITERATOR INTERFACE
virtual void toLeft(hal_index_t leftCutoff = NULL_INDEX) const;
View
95 api/impl/defaultMappedSegment.cpp
@@ -170,9 +170,9 @@ bool DefaultMappedSegment::canMergeRightWith(
else if (this->getReversed() == true && ref->getReversed() == true)
{
qdelta = next->getEndPosition() - this->getStartPosition();
- rdelta = nextRef->getStartPosition() - ref->getEndPosition();
+ rdelta = nextRef->getEndPosition() - ref->getStartPosition();
cut = this->getStartPosition();
- sourceCut = ref->getEndPosition();
+ sourceCut = ref->getStartPosition();
}
else if (this->getReversed() == false && ref->getReversed() == true)
{
@@ -209,20 +209,6 @@ bool DefaultMappedSegment::canMergeRightWith(
return ret;
}
-void DefaultMappedSegment::print(ostream& os) const
-{
- os << "src: " << _source->getGenome()->getName() << "."
- << _source->getSequence()->getName() << " ai="
- << _source->getArrayIndex() << "[" << _source->getStartPosition()
- << "," << _source->getEndPosition() << "] rev="
- << _source->getReversed() << "\n"
- << "tgt: " << _target->getGenome()->getName() << "."
- << _target->getSequence()->getName() << " ai="
- << _target->getArrayIndex() << "[" << _target->getStartPosition()
- << "," << _target->getEndPosition() << "] rev="
- << _target->getReversed() << "\n";
-}
-
//////////////////////////////////////////////////////////////////////////////
// INTERNAL FUNCTIONS
//////////////////////////////////////////////////////////////////////////////
@@ -423,7 +409,15 @@ hal_size_t DefaultMappedSegment::map(const DefaultSegmentIterator* source,
list<DefaultMappedSegmentConstPtr> input;
input.push_back(newMappedSeg);
list<DefaultMappedSegmentConstPtr> output;
- mapRecursive(NULL, input, output, tgtGenome, genomesOnPath, doDupes,
+
+ set<string> namesOnPath;
+ assert(genomesOnPath != NULL);
+ for (set<const Genome*>::const_iterator i = genomesOnPath->begin();
+ i != genomesOnPath->end(); ++i)
+ {
+ namesOnPath.insert((*i)->getName());
+ }
+ mapRecursive(NULL, input, output, tgtGenome, namesOnPath, doDupes,
minLength);
list<DefaultMappedSegmentConstPtr>::iterator outIt = output.begin();
@@ -440,7 +434,7 @@ hal_size_t DefaultMappedSegment::mapRecursive(
list<DefaultMappedSegmentConstPtr>& input,
list<DefaultMappedSegmentConstPtr>& results,
const Genome* tgtGenome,
- const set<const Genome*>* genomesOnPath,
+ const set<string>& namesOnPath,
bool doDupes,
hal_size_t minLength)
{
@@ -457,33 +451,37 @@ hal_size_t DefaultMappedSegment::mapRecursive(
srcGenome = (*inputPtr->begin())->getSource()->getGenome();
genome = (*inputPtr->begin())->getGenome();
- const Genome* parentGenome = genome->getParent();
- if (parentGenome != NULL &&
- parentGenome != prevGenome && (
- parentGenome == tgtGenome ||
- genomesOnPath->find(parentGenome) != genomesOnPath->end()))
+ const Alignment* alignment = genome->getAlignment();
+ string parentName = alignment->getParentName(genome->getName());
+ if (parentName == tgtGenome->getName() ||
+ namesOnPath.find(parentName) != namesOnPath.end())
{
- nextGenome = parentGenome;
+ const Genome* parentGenome = genome->getParent();
+ if (parentGenome != NULL &&
+ parentGenome != prevGenome)
+ {
+ nextGenome = parentGenome;
+ }
}
+ vector<string> childNames = alignment->getChildNames(genome->getName());
for (hal_size_t child = 0;
- nextGenome == NULL && child < genome->getNumChildren(); ++child)
+ nextGenome == NULL && child < childNames.size(); ++child)
{
- // note that this code is potentially unfriendly to the
- // inmemory option where entire child genomes get loaded
- // when they may not be needed. but since the column iterator
- // does the same thing, we don't worry for now
- const Genome* childGenome = genome->getChild(child);
- if (childGenome != prevGenome && (
- childGenome == tgtGenome ||
- genomesOnPath->find(childGenome) != genomesOnPath->end()))
+ if (childNames[child] == tgtGenome->getName() ||
+ namesOnPath.find(childNames[child]) != namesOnPath.end())
{
- nextGenome = childGenome;
- nextChildIndex = child;
+ const Genome* childGenome = genome->getChild(child);
+ if (childGenome != prevGenome)
+ {
+ nextGenome = childGenome;
+ nextChildIndex = child;
+ }
}
}
if (doDupes == true && genome->getParent() != NULL &&
- (!nextGenome || nextGenome != genome->getParent()))
+ (!nextGenome || nextGenome != genome->getParent())
+ && namesOnPath.find(parentName) != namesOnPath.end())
{
outputPtr->clear();
list<DefaultMappedSegmentConstPtr>::iterator i = inputPtr->begin();
@@ -521,7 +519,7 @@ hal_size_t DefaultMappedSegment::mapRecursive(
swap(inputPtr, outputPtr);
assert(genome != NULL);
- mapRecursive(genome, *inputPtr, *outputPtr, tgtGenome, genomesOnPath,
+ mapRecursive(genome, *inputPtr, *outputPtr, tgtGenome, namesOnPath,
doDupes, minLength);
}
else
@@ -1153,6 +1151,29 @@ hal_size_t DefaultMappedSegment::getMappedSegments(
doDupes, minLength);
}
+void DefaultMappedSegment::print(ostream& os) const
+{
+ os << "Mapped Segment:\n";
+ os << "Source: ";
+ if (_source.get() == NULL)
+ {
+ os << "NULL";
+ }
+ else
+ {
+ os << *_source;
+ }
+ os << "\nTarget: ";
+ if (_target.get() == NULL)
+ {
+ os << "NULL";
+ }
+ else
+ {
+ os << *_target;
+ }
+}
+
//////////////////////////////////////////////////////////////////////////////
// SLICED SEGMENT INTERFACE
//////////////////////////////////////////////////////////////////////////////
View
4 api/impl/defaultMappedSegment.h
@@ -54,6 +54,7 @@ class DefaultMappedSegment : virtual public MappedSegment
const std::set<const Genome*>* genomesOnPath,
bool doDupes,
hal_size_t minLength) const;
+ virtual void print(std::ostream& os) const;
// SLICED SEGMENT INTERFACE
virtual void toReverse() const;
@@ -76,7 +77,6 @@ class DefaultMappedSegment : virtual public MappedSegment
const MappedSegmentConstPtr& next,
const std::set<hal_index_t>* cutSet,
const std::set<hal_index_t>* sourceCutSet) const;
- virtual void print(std::ostream& os) const;
// INTERNAL METHODS
@@ -136,7 +136,7 @@ class DefaultMappedSegment : virtual public MappedSegment
std::list<DefaultMappedSegmentConstPtr>& input,
std::list<DefaultMappedSegmentConstPtr>& results,
const Genome* tgtGenome,
- const std::set<const Genome*>* genomesOnPath,
+ const std::set<std::string>& namesOnPath,
bool doDupes,
hal_size_t minLength);
static
View
23 api/impl/defaultSegmentIterator.cpp
@@ -197,6 +197,29 @@ hal_size_t DefaultSegmentIterator::getMappedSegments(
return numResults;
}
+void DefaultSegmentIterator::print(ostream& os) const
+{
+ hal_index_t ai = getArrayIndex();
+
+ os << "gen=" << getGenome()->getName()
+ << " seq=" << getSequence()->getName()
+ << " idx=" << ai;
+
+ bool offRight =
+ isTop() ? ai >= (hal_index_t)getGenome()->getNumTopSegments() :
+ ai >= (hal_index_t)getGenome()->getNumBottomSegments();
+
+ if (ai != NULL_INDEX && !offRight)
+ {
+ os
+ << " start=" << getStartPosition()
+ << " end=" << getEndPosition()
+ << " len=" << getLength()
+ << " off=[" << getStartOffset() << "," << getEndOffset() << "]"
+ << " rev=" << getReversed();
+ }
+}
+
//////////////////////////////////////////////////////////////////////////////
// SLICED SEGMENT INTERFACE
//////////////////////////////////////////////////////////////////////////////
View
3  api/impl/defaultSegmentIterator.h
@@ -20,7 +20,7 @@ class DefaultSegmentIterator : virtual public SegmentIterator
bool inverted = false);
virtual ~DefaultSegmentIterator();
- // SEGMENT INTERFACE
+ // SEGMENT INTERFACE
virtual void setArrayIndex(Genome* genome,
hal_index_t arrayIndex);
virtual void setArrayIndex(const Genome* genome,
@@ -48,6 +48,7 @@ class DefaultSegmentIterator : virtual public SegmentIterator
const std::set<const Genome*>* genomesOnPath,
bool doDupes,
hal_size_t minLength) const;
+ virtual void print(std::ostream& os) const;
// SLICED SEGMENT INTERFACE
virtual void toReverse() const;
View
21 api/impl/defaultTopSegmentIterator.cpp
@@ -44,7 +44,26 @@ hal_size_t DefaultTopSegmentIterator::getNumSegmentsInGenome() const