diff --git a/Source/Bio.Core/Extensions/BAMParserExtensions.cs b/Source/Bio.Core/Extensions/BAMParserExtensions.cs
index 864c30c..adc8f70 100644
--- a/Source/Bio.Core/Extensions/BAMParserExtensions.cs
+++ b/Source/Bio.Core/Extensions/BAMParserExtensions.cs
@@ -28,6 +28,7 @@ namespace Bio.IO.BAM
throw new ArgumentNullException("filename");
}
+ parser.SetFilePath(filename);
using (var fs = File.OpenRead(filename))
{
foreach (var item in parser.Parse(fs))
@@ -61,7 +62,7 @@ namespace Bio.IO.BAM
{
throw new ArgumentNullException("refSeqName");
}
-
+ parser.SetFilePath(fileName);
using (var bamStream = File.OpenRead(fileName))
{
string bamIndexFileName = GetBAMIndexFileName(fileName);
@@ -131,7 +132,7 @@ namespace Bio.IO.BAM
{
throw new ArgumentNullException("refSeqName");
}
-
+ parser.SetFilePath(fileName);
using (FileStream bamStream = File.OpenRead(fileName))
{
string bamIndexFileName = GetBAMIndexFileName(fileName);
@@ -165,6 +166,7 @@ namespace Bio.IO.BAM
throw new ArgumentNullException("fileName");
}
+ parser.SetFilePath(fileName);
using (FileStream bamStream = File.OpenRead(fileName))
{
string bamIndexFileName = GetBAMIndexFileName(fileName);
@@ -200,7 +202,7 @@ namespace Bio.IO.BAM
{
throw new ArgumentNullException("fileName");
}
-
+ parser.SetFilePath(fileName);
using (FileStream bamStream = File.OpenRead(fileName))
{
string bamIndexFileName = GetBAMIndexFileName(fileName);
@@ -243,7 +245,7 @@ namespace Bio.IO.BAM
int start = range.Start >= int.MaxValue ? int.MaxValue : (int)range.Start;
int end = range.End >= int.MaxValue ? int.MaxValue : (int)range.End;
-
+ parser.SetFilePath(fileName);
using (FileStream bamStream = File.OpenRead(fileName))
{
string bamIndexFileName = GetBAMIndexFileName(fileName);
diff --git a/Source/Bio.Core/Extensions/ParserExtensions.cs b/Source/Bio.Core/Extensions/ParserExtensions.cs
index d31d238..dbce8c5 100644
--- a/Source/Bio.Core/Extensions/ParserExtensions.cs
+++ b/Source/Bio.Core/Extensions/ParserExtensions.cs
@@ -1,7 +1,6 @@
using System;
using System.Collections.Generic;
using System.IO;
-
using Bio.IO;
namespace Bio
@@ -19,6 +18,7 @@ namespace Bio
/// Disposable object to close the stream.
public static IDisposable Open(this IParser parser, string filename)
{
+ if (parser is Bio.IO.BAM.BAMParser) ((Bio.IO.BAM.BAMParser)parser).SetFilePath(filename);
return ParserFormatterExtensions>.Open(parser, filename);
}
@@ -61,6 +61,7 @@ namespace Bio
/// Set of parsed data elements.
public static IEnumerable Parse(this IParser parser, string fileName)
{
+ if (parser is Bio.IO.BAM.BAMParser) ((Bio.IO.BAM.BAMParser)parser).SetFilePath(fileName);
using (FileStream fs = File.OpenRead(fileName))
{
foreach (var item in parser.Parse(fs))
@@ -77,6 +78,7 @@ namespace Bio
/// Set of parsed data elements.
public static T ParseOne(this IParser parser, string fileName)
{
+ if (parser is Bio.IO.BAM.BAMParser) ((Bio.IO.BAM.BAMParser)parser).SetFilePath(fileName);
using (FileStream fs = File.OpenRead(fileName))
{
return parser.ParseOne(fs);
diff --git a/Source/Bio.Core/Extensions/ParserFormatterExtensions.cs b/Source/Bio.Core/Extensions/ParserFormatterExtensions.cs
index 228b269..e0a8fe0 100644
--- a/Source/Bio.Core/Extensions/ParserFormatterExtensions.cs
+++ b/Source/Bio.Core/Extensions/ParserFormatterExtensions.cs
@@ -55,6 +55,23 @@ namespace Bio
return new DisposableParser(parser);
}
+ public static string GetFilePath(T parser)
+ {
+ if (parser == null)
+ {
+ throw new ArgumentNullException("parser");
+ }
+ lock (fileData)
+ {
+ Tuple data;
+ if (fileData.TryGetValue(parser, out data))
+ {
+ return data.Item1;
+ }
+ }
+ return null;
+ }
+
///
/// Parses the sequences from the open file.
///
diff --git a/Source/Bio.Core/Extensions/SequenceAlignmentParserExtensions.cs b/Source/Bio.Core/Extensions/SequenceAlignmentParserExtensions.cs
index 346273a..eb05e7a 100644
--- a/Source/Bio.Core/Extensions/SequenceAlignmentParserExtensions.cs
+++ b/Source/Bio.Core/Extensions/SequenceAlignmentParserExtensions.cs
@@ -17,11 +17,11 @@ namespace Bio
/// Opens a sequence file using the parser.
///
/// Parser
- /// File to open
+ /// File to open
/// Disposable object to close the stream.
- public static IDisposable Open(this ISequenceAlignmentParser parser, string filename)
+ public static IDisposable Open(this ISequenceAlignmentParser parser, string filePath)
{
- return ParserFormatterExtensions.Open(parser, filename);
+ return ParserFormatterExtensions.Open(parser, filePath);
}
///
@@ -53,11 +53,12 @@ namespace Bio
/// Parses the sequences from the open file.
///
/// Sequence Parser
- /// File to parse
+ /// File to parse
/// Set of parsed sequences.
- public static ISequenceAlignment ParseOne(this ISequenceAlignmentParser parser, string filename)
+ public static ISequenceAlignment ParseOne(this ISequenceAlignmentParser parser, string filePath)
{
- using (var fs = File.OpenRead(filename))
+ if (parser is Bio.IO.BAM.BAMParser) ((Bio.IO.BAM.BAMParser)parser).SetFilePath(filePath);
+ using (var fs = File.OpenRead(filePath))
return parser.ParseOne(fs);
}
@@ -65,12 +66,13 @@ namespace Bio
/// Parses the sequences from the open file.
///
/// Sequence Parser
- /// File to parse
+ /// File to parse
/// Set of parsed sequences.
- public static T ParseOne(this ISequenceAlignmentParser parser, string filename)
+ public static T ParseOne(this ISequenceAlignmentParser parser, string filePath)
where T : ISequenceAlignment
{
- using (var fs = File.OpenRead(filename))
+ if (parser is Bio.IO.BAM.BAMParser) ((Bio.IO.BAM.BAMParser)parser).SetFilePath(filePath);
+ using (var fs = File.OpenRead(filePath))
return (T) parser.ParseOne(fs);
}
@@ -78,11 +80,12 @@ namespace Bio
/// Parses the sequences from the given filename.
///
/// Sequence Parser
- /// Filename to open/close
+ /// Filename to open/close
/// Set of parsed sequences.
- public static IEnumerable Parse(this ISequenceAlignmentParser parser, string fileName)
+ public static IEnumerable Parse(this ISequenceAlignmentParser parser, string filePath)
{
- using (FileStream fs = File.OpenRead(fileName))
+ if (parser is Bio.IO.BAM.BAMParser) ((Bio.IO.BAM.BAMParser)parser).SetFilePath(filePath);
+ using (FileStream fs = File.OpenRead(filePath))
{
foreach (var item in parser.Parse(fs))
yield return item;
diff --git a/Source/Bio.Core/IO/BAM/BAMParser.cs b/Source/Bio.Core/IO/BAM/BAMParser.cs
index ae98edd..d34191f 100644
--- a/Source/Bio.Core/IO/BAM/BAMParser.cs
+++ b/Source/Bio.Core/IO/BAM/BAMParser.cs
@@ -28,7 +28,7 @@ namespace Bio.IO.BAM
private const string BAMAlphabet = "=ACMGRSVTWYHKDBN";
private static readonly byte[] BAMAlphabetAsBytes = BAMAlphabet.ToByteArray();
-
+
///
/// Holds the BAM file stream.
///
@@ -70,6 +70,18 @@ namespace Bio.IO.BAM
///
private bool createBamIndex = false;
+ ///
+ /// Holds the header parsed from the bam file.
+ ///
+ SAMAlignmentHeader header = null;
+
+ ///
+ /// Cached copy of the parsed index
+ ///
+ private BAMIndex cachedBamIndex = null;
+
+ public string cacheBamPath = null;
+
///
/// The default constructor which chooses the default encoding based on the alphabet.
///
@@ -80,6 +92,19 @@ namespace Bio.IO.BAM
refSeqLengths = new List();
}
+ public void ClearCache()
+ {
+ header = null;
+ cachedBamIndex = null;
+ cacheBamPath = null;
+ }
+
+ public void SetFilePath(string filePath)
+ {
+ if (filePath != cacheBamPath) ClearCache();
+ cacheBamPath = filePath;
+ }
+
///
/// Gets the name of the sequence alignment parser being
/// implemented. This is intended to give the
@@ -137,8 +162,8 @@ namespace Bio.IO.BAM
/// Returns true if the specified byte array indicates that the BAM file is compressed else returns false.
private static bool IsCompressedBAMFile(byte[] array)
{
- return array[0] == 31
- && array[1] == 139
+ return array[0] == 31
+ && array[1] == 139
&& array[2] == 8;
}
@@ -149,9 +174,9 @@ namespace Bio.IO.BAM
/// Returns true if the specified byte array indicates a valid uncompressed BAM file else returns false.
private static bool IsUnCompressedBAMFile(byte[] array)
{
- return array[0] == 66
- && array[1] == 65
- && array[2] == 77
+ return array[0] == 66
+ && array[1] == 65
+ && array[2] == 77
&& array[3] == 1;
}
@@ -253,12 +278,12 @@ namespace Bio.IO.BAM
return i + 1 - startIndex;
}
-
+
///
/// Gets equivalent sequence char for the specified encoded value.
///
/// Encoded value.
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
private static byte GetSeqCharAsByte(int encodedValue)
{
// Note that encoded value must be between 0 and 15, but we don't
@@ -313,7 +338,7 @@ namespace Bio.IO.BAM
//get all bins that overlap
IList binnumbers = Reg2Bins((uint)start, (uint)end);
//now only get those that match
- List chunks = refIndex.Bins.Where(B => binnumbers.Contains(B.BinNumber)).SelectMany(x=>x.Chunks).ToList();
+ List chunks = refIndex.Bins.Where(B => binnumbers.Contains(B.BinNumber)).SelectMany(x => x.Chunks).ToList();
//now use linear indexing to filter any chunks that end before the first start
if (refIndex.LinearIndex.Count > 0)
{
@@ -329,7 +354,7 @@ namespace Bio.IO.BAM
}
chunks = chunks.Where(x => x.ChunkEnd >= minStart).ToList();
}
- return SortAndMergeChunks(chunks);
+ return SortAndMergeChunks(chunks);
}
///
@@ -412,7 +437,7 @@ namespace Bio.IO.BAM
{
throw new ArgumentNullException("stream");
}
-
+
return this.GetAlignmentMapIterator(stream);
}
@@ -427,7 +452,7 @@ namespace Bio.IO.BAM
///
///
internal SequenceAlignmentMap GetAlignmentMap(Stream reader, BAMIndexStorage bamIndexStorage = null,
- string refSeqName = null, int? refSeqIndex = null, int start = 0, int end = int.MaxValue)
+ string refSeqName = null, int? refSeqIndex = null, int? start = null, int? end = null)
{
if (reader == null || reader.Length == 0)
{
@@ -436,8 +461,13 @@ namespace Bio.IO.BAM
readStream = reader;
ValidateReader();
-
- SAMAlignmentHeader header = this.GetHeader();
+ bool parsedHeader = false;
+ if (header == null)
+ {
+ this.header = this.GetHeader();
+ parsedHeader = true;
+ }
+
SequenceAlignmentMap seqMap = null;
if (refSeqIndex.HasValue && refSeqName == null)
{
@@ -473,6 +503,8 @@ namespace Bio.IO.BAM
}
else
{
+ // Special case: If there's no index, then we must re-parse the header to get to the right place
+ if (!parsedHeader) header = this.GetHeader();
GetAlignmentWithoutIndex(header, ref seqMap);
}
return seqMap;
@@ -497,7 +529,12 @@ namespace Bio.IO.BAM
}
readStream = reader;
ValidateReader();
- SAMAlignmentHeader header = this.GetHeader();
+ bool parsedHeader = false;
+ if (header == null)
+ {
+ this.header = this.GetHeader();
+ parsedHeader = true;
+ }
if (refSeq.HasValue && refSeqName == null)
{
@@ -537,6 +574,7 @@ namespace Bio.IO.BAM
}
else
{
+ if (!parsedHeader) header = this.GetHeader(); // Force reading the header, to get to correct file position
foreach (SAMAlignedSequence seq in GetAlignmentWithoutIndexYield())
{
yield return seq;
@@ -575,13 +613,17 @@ namespace Bio.IO.BAM
readStream = null;
}
- private void GetAlignmentWithIndex(BAMIndexStorage bamIndexStorage, int refSeqIndex, int start, int end,
+ private void GetAlignmentWithIndex(BAMIndexStorage bamIndexStorage, int refSeqIndex, int? start, int? end,
SAMAlignmentHeader header, ref SequenceAlignmentMap seqMap)
{
IList chunks;
seqMap = new SequenceAlignmentMap(header);
- BAMIndex bamIndexInfo = bamIndexStorage.Read();
+ if (cachedBamIndex == null)
+ {
+ cachedBamIndex = bamIndexStorage.Read();
+ }
+ BAMIndex bamIndexInfo = cachedBamIndex;
if (refSeqIndex != -1 && bamIndexInfo.RefIndexes.Count <= refSeqIndex)
{
@@ -590,13 +632,13 @@ namespace Bio.IO.BAM
BAMReferenceIndexes refIndex = bamIndexInfo.RefIndexes[refSeqIndex];
- if (start == 0 && end == int.MaxValue)
+ if (start == null && end == null)
{
chunks = GetChunks(refIndex);
}
else
{
- chunks = GetChunks(refIndex, start, end);
+ chunks = GetChunks(refIndex, (int)start, (int)end);
}
IList alignedSeqs = GetAlignedSequences(chunks, start, end);
@@ -611,12 +653,12 @@ namespace Bio.IO.BAM
private void GetAlignmentWithoutIndex(SAMAlignmentHeader header, ref SequenceAlignmentMap seqMap)
{
Chunk lastChunk = null;
- FileOffset lastOffSet = new FileOffset(0,0);
+ FileOffset lastOffSet = new FileOffset(0, 0);
BAMReferenceIndexes refIndices = null;
int lastBin = int.MaxValue;
int lastRefSeqIndex = 0;
int lastRefPos = Int32.MinValue;
-
+
if (createBamIndex)
{
bamIndex = new BAMIndex();
@@ -626,7 +668,7 @@ namespace Bio.IO.BAM
}
refIndices = bamIndex.RefIndexes[0];
}
-
+
if (!createBamIndex && seqMap == null)
{
seqMap = new SequenceAlignmentMap(header);
@@ -636,9 +678,9 @@ namespace Bio.IO.BAM
{
if (createBamIndex)
{
- lastOffSet=new FileOffset((ulong)currentCompressedBlockStartPos,(ushort)deCompressedStream.Position);
+ lastOffSet = new FileOffset((ulong)currentCompressedBlockStartPos, (ushort)deCompressedStream.Position);
}
- SAMAlignedSequence alignedSeq = GetAlignedSequence(0, int.MaxValue);
+ SAMAlignedSequence alignedSeq = GetAlignedSequence(null, null);
// BAM indexing
if (createBamIndex)
@@ -659,9 +701,9 @@ namespace Bio.IO.BAM
}
if (lastRefPos > alignedSeq.Pos)
{
- throw new InvalidDataException("The BAM file is not sorted. " + alignedSeq.QName + " appears after a later sequence");
+ throw new InvalidDataException("The BAM file is not sorted. " + alignedSeq.QName + " appears after a later sequence");
}
-
+
lastRefPos = alignedSeq.Pos;
//update Bins when we switch over
if (lastBin != alignedSeq.Bin)
@@ -688,7 +730,7 @@ namespace Bio.IO.BAM
lastBin = alignedSeq.Bin;
}
//UPDATE LINEAR INDEX AND PROCESS READ FOR META-DATA
- refIndices.AddReadToIndexInformation(alignedSeq,lastOffSet);
+ refIndices.AddReadToIndexInformation(alignedSeq, lastOffSet);
}
if (!createBamIndex && alignedSeq != null)
@@ -700,14 +742,15 @@ namespace Bio.IO.BAM
// BAM indexing
if (createBamIndex)
{
- ulong compressedOff=(ulong)readStream.Position;
- ushort uncompressedEnd=0;
+ ulong compressedOff = (ulong)readStream.Position;
+ ushort uncompressedEnd = 0;
//TODO: Shouldn't this always be true? Or go to max value?
- if (deCompressedStream != null) {
+ if (deCompressedStream != null)
+ {
uncompressedEnd = (ushort)deCompressedStream.Position;
}
- FileOffset veryLast=new FileOffset(compressedOff,uncompressedEnd);
- lastChunk.ChunkEnd=veryLast;
+ FileOffset veryLast = new FileOffset(compressedOff, uncompressedEnd);
+ lastChunk.ChunkEnd = veryLast;
foreach (var ri in bamIndex.RefIndexes)
{
ri.Freeze();
@@ -725,7 +768,7 @@ namespace Bio.IO.BAM
while (!IsEOF())
{
- SAMAlignedSequence alignedSeq = GetAlignedSequence(0, int.MaxValue);
+ SAMAlignedSequence alignedSeq = GetAlignedSequence(null, null);
yield return alignedSeq;
}
}
@@ -770,8 +813,8 @@ namespace Bio.IO.BAM
ValidateReader();
return GetHeader();
- }
-
+ }
+
///
/// Returns an aligned sequence by parses the BAM file.
///
@@ -782,7 +825,7 @@ namespace Bio.IO.BAM
///
public SAMAlignedSequence GetAlignedSequence(bool isReadOnly)
{
- return GetAlignedSequence(0, int.MaxValue);
+ return GetAlignedSequence(null, null);
}
///
@@ -943,11 +986,11 @@ namespace Bio.IO.BAM
}
}
-
+
///
/// Returns an aligned sequence by parses the BAM file.
///
- private SAMAlignedSequence GetAlignedSequence(int start, int end)
+ private SAMAlignedSequence GetAlignedSequence(int? start, int? end)
{
byte[] array = new byte[4];
@@ -969,7 +1012,7 @@ namespace Bio.IO.BAM
// if there is no overlap no need to parse further.
// BAMPos > closedEnd
// => (alignedSeq.Pos - 1) > end -1
- if (alignedSeq.Pos > end)
+ if (end != null && alignedSeq.Pos > end)
{
return null;
}
@@ -1066,7 +1109,7 @@ namespace Bio.IO.BAM
// if there is no overlap no need to parse further.
// ZeroBasedRefEnd < start
// => (alignedSeq.RefEndPos -1) < start
- if (alignedSeq.RefEndPos - 1 < start && alignedSeq.RName!="*")
+ if (start != null && alignedSeq.RefEndPos - 1 < start && alignedSeq.RName != "*")
{
return null;
}
@@ -1107,7 +1150,7 @@ namespace Bio.IO.BAM
{
sequence = new byte[] { SAMParser.AsteriskAsByte };
}
-
+
byte[] qualValues;
if (alignmentBlock[startIndex] != 0xFF && readLen != 0)
@@ -1134,20 +1177,20 @@ namespace Bio.IO.BAM
qualValues = new byte[] { SAMParser.AsteriskAsByte };
}
//Values have already been validated when first parsed at this point so no need to again
- SAMParser.ParseQualityNSequence(alignedSeq, Alphabet, sequence, qualValues,false);
+ SAMParser.ParseQualityNSequence(alignedSeq, Alphabet, sequence, qualValues, false);
startIndex += readLen;
if (alignmentBlock.Length > startIndex + 4 && alignmentBlock[startIndex] != 0x0 && alignmentBlock[startIndex + 1] != 0x0)
{
- for (index = startIndex; index < alignmentBlock.Length; )
+ for (index = startIndex; index < alignmentBlock.Length;)
{
SAMOptionalField optionalField = new SAMOptionalField
- {
- Tag = Encoding.UTF8.GetString(alignmentBlock, index, 2)
- };
+ {
+ Tag = Encoding.UTF8.GetString(alignmentBlock, index, 2)
+ };
index += 2;
char vType = (char)alignmentBlock[index++];
-
+
// SAM format supports [AifZH] for value type.
// In BAM, an integer may be stored as a signed 8-bit integer (c), unsigned 8-bit integer (C), signed short (s), unsigned
// short (S), signed 32-bit (i) or unsigned 32-bit integer (I), depending on the signed magnitude of the integer. However,
@@ -1170,7 +1213,7 @@ namespace Bio.IO.BAM
return alignedSeq;
}
-
+
///
@@ -1204,7 +1247,7 @@ namespace Bio.IO.BAM
if (bytesToRead < count)
{
GetNextBlock();
- ReadUnCompressedData(array, offset+bytesToRead, count - bytesToRead);
+ ReadUnCompressedData(array, offset + bytesToRead, count - bytesToRead);
}
}
@@ -1352,7 +1395,7 @@ namespace Bio.IO.BAM
}
// Gets aligned sequence from the specified chunks of the BAM file which overlaps with the specified start and end co-ordinates.
- internal IList GetAlignedSequences(IList chunks, int start, int end)
+ internal IList GetAlignedSequences(IList chunks, int? start, int? end)
{
List alignedSeqs = new List();
foreach (Chunk chunk in chunks)
diff --git a/Source/Bio.Core/IO/IParser.cs b/Source/Bio.Core/IO/IParser.cs
index 3bf01d7..240659c 100644
--- a/Source/Bio.Core/IO/IParser.cs
+++ b/Source/Bio.Core/IO/IParser.cs
@@ -29,6 +29,7 @@ namespace Bio.IO
/// will return a string containing all extensions with a ',' delimited.
///
string SupportedFileTypes { get; }
+
}
///