diff --git a/Source/Bio.Core/Extensions/BAMParserExtensions.cs b/Source/Bio.Core/Extensions/BAMParserExtensions.cs index 864c30c..adc8f70 100644 --- a/Source/Bio.Core/Extensions/BAMParserExtensions.cs +++ b/Source/Bio.Core/Extensions/BAMParserExtensions.cs @@ -28,6 +28,7 @@ namespace Bio.IO.BAM throw new ArgumentNullException("filename"); } + parser.SetFilePath(filename); using (var fs = File.OpenRead(filename)) { foreach (var item in parser.Parse(fs)) @@ -61,7 +62,7 @@ namespace Bio.IO.BAM { throw new ArgumentNullException("refSeqName"); } - + parser.SetFilePath(fileName); using (var bamStream = File.OpenRead(fileName)) { string bamIndexFileName = GetBAMIndexFileName(fileName); @@ -131,7 +132,7 @@ namespace Bio.IO.BAM { throw new ArgumentNullException("refSeqName"); } - + parser.SetFilePath(fileName); using (FileStream bamStream = File.OpenRead(fileName)) { string bamIndexFileName = GetBAMIndexFileName(fileName); @@ -165,6 +166,7 @@ namespace Bio.IO.BAM throw new ArgumentNullException("fileName"); } + parser.SetFilePath(fileName); using (FileStream bamStream = File.OpenRead(fileName)) { string bamIndexFileName = GetBAMIndexFileName(fileName); @@ -200,7 +202,7 @@ namespace Bio.IO.BAM { throw new ArgumentNullException("fileName"); } - + parser.SetFilePath(fileName); using (FileStream bamStream = File.OpenRead(fileName)) { string bamIndexFileName = GetBAMIndexFileName(fileName); @@ -243,7 +245,7 @@ namespace Bio.IO.BAM int start = range.Start >= int.MaxValue ? int.MaxValue : (int)range.Start; int end = range.End >= int.MaxValue ? int.MaxValue : (int)range.End; - + parser.SetFilePath(fileName); using (FileStream bamStream = File.OpenRead(fileName)) { string bamIndexFileName = GetBAMIndexFileName(fileName); diff --git a/Source/Bio.Core/Extensions/ParserExtensions.cs b/Source/Bio.Core/Extensions/ParserExtensions.cs index d31d238..dbce8c5 100644 --- a/Source/Bio.Core/Extensions/ParserExtensions.cs +++ b/Source/Bio.Core/Extensions/ParserExtensions.cs @@ -1,7 +1,6 @@ using System; using System.Collections.Generic; using System.IO; - using Bio.IO; namespace Bio @@ -19,6 +18,7 @@ namespace Bio /// Disposable object to close the stream. public static IDisposable Open(this IParser parser, string filename) { + if (parser is Bio.IO.BAM.BAMParser) ((Bio.IO.BAM.BAMParser)parser).SetFilePath(filename); return ParserFormatterExtensions>.Open(parser, filename); } @@ -61,6 +61,7 @@ namespace Bio /// Set of parsed data elements. public static IEnumerable Parse(this IParser parser, string fileName) { + if (parser is Bio.IO.BAM.BAMParser) ((Bio.IO.BAM.BAMParser)parser).SetFilePath(fileName); using (FileStream fs = File.OpenRead(fileName)) { foreach (var item in parser.Parse(fs)) @@ -77,6 +78,7 @@ namespace Bio /// Set of parsed data elements. public static T ParseOne(this IParser parser, string fileName) { + if (parser is Bio.IO.BAM.BAMParser) ((Bio.IO.BAM.BAMParser)parser).SetFilePath(fileName); using (FileStream fs = File.OpenRead(fileName)) { return parser.ParseOne(fs); diff --git a/Source/Bio.Core/Extensions/ParserFormatterExtensions.cs b/Source/Bio.Core/Extensions/ParserFormatterExtensions.cs index 228b269..e0a8fe0 100644 --- a/Source/Bio.Core/Extensions/ParserFormatterExtensions.cs +++ b/Source/Bio.Core/Extensions/ParserFormatterExtensions.cs @@ -55,6 +55,23 @@ namespace Bio return new DisposableParser(parser); } + public static string GetFilePath(T parser) + { + if (parser == null) + { + throw new ArgumentNullException("parser"); + } + lock (fileData) + { + Tuple data; + if (fileData.TryGetValue(parser, out data)) + { + return data.Item1; + } + } + return null; + } + /// /// Parses the sequences from the open file. /// diff --git a/Source/Bio.Core/Extensions/SequenceAlignmentParserExtensions.cs b/Source/Bio.Core/Extensions/SequenceAlignmentParserExtensions.cs index 346273a..eb05e7a 100644 --- a/Source/Bio.Core/Extensions/SequenceAlignmentParserExtensions.cs +++ b/Source/Bio.Core/Extensions/SequenceAlignmentParserExtensions.cs @@ -17,11 +17,11 @@ namespace Bio /// Opens a sequence file using the parser. /// /// Parser - /// File to open + /// File to open /// Disposable object to close the stream. - public static IDisposable Open(this ISequenceAlignmentParser parser, string filename) + public static IDisposable Open(this ISequenceAlignmentParser parser, string filePath) { - return ParserFormatterExtensions.Open(parser, filename); + return ParserFormatterExtensions.Open(parser, filePath); } /// @@ -53,11 +53,12 @@ namespace Bio /// Parses the sequences from the open file. /// /// Sequence Parser - /// File to parse + /// File to parse /// Set of parsed sequences. - public static ISequenceAlignment ParseOne(this ISequenceAlignmentParser parser, string filename) + public static ISequenceAlignment ParseOne(this ISequenceAlignmentParser parser, string filePath) { - using (var fs = File.OpenRead(filename)) + if (parser is Bio.IO.BAM.BAMParser) ((Bio.IO.BAM.BAMParser)parser).SetFilePath(filePath); + using (var fs = File.OpenRead(filePath)) return parser.ParseOne(fs); } @@ -65,12 +66,13 @@ namespace Bio /// Parses the sequences from the open file. /// /// Sequence Parser - /// File to parse + /// File to parse /// Set of parsed sequences. - public static T ParseOne(this ISequenceAlignmentParser parser, string filename) + public static T ParseOne(this ISequenceAlignmentParser parser, string filePath) where T : ISequenceAlignment { - using (var fs = File.OpenRead(filename)) + if (parser is Bio.IO.BAM.BAMParser) ((Bio.IO.BAM.BAMParser)parser).SetFilePath(filePath); + using (var fs = File.OpenRead(filePath)) return (T) parser.ParseOne(fs); } @@ -78,11 +80,12 @@ namespace Bio /// Parses the sequences from the given filename. /// /// Sequence Parser - /// Filename to open/close + /// Filename to open/close /// Set of parsed sequences. - public static IEnumerable Parse(this ISequenceAlignmentParser parser, string fileName) + public static IEnumerable Parse(this ISequenceAlignmentParser parser, string filePath) { - using (FileStream fs = File.OpenRead(fileName)) + if (parser is Bio.IO.BAM.BAMParser) ((Bio.IO.BAM.BAMParser)parser).SetFilePath(filePath); + using (FileStream fs = File.OpenRead(filePath)) { foreach (var item in parser.Parse(fs)) yield return item; diff --git a/Source/Bio.Core/IO/BAM/BAMParser.cs b/Source/Bio.Core/IO/BAM/BAMParser.cs index ae98edd..d34191f 100644 --- a/Source/Bio.Core/IO/BAM/BAMParser.cs +++ b/Source/Bio.Core/IO/BAM/BAMParser.cs @@ -28,7 +28,7 @@ namespace Bio.IO.BAM private const string BAMAlphabet = "=ACMGRSVTWYHKDBN"; private static readonly byte[] BAMAlphabetAsBytes = BAMAlphabet.ToByteArray(); - + /// /// Holds the BAM file stream. /// @@ -70,6 +70,18 @@ namespace Bio.IO.BAM /// private bool createBamIndex = false; + /// + /// Holds the header parsed from the bam file. + /// + SAMAlignmentHeader header = null; + + /// + /// Cached copy of the parsed index + /// + private BAMIndex cachedBamIndex = null; + + public string cacheBamPath = null; + /// /// The default constructor which chooses the default encoding based on the alphabet. /// @@ -80,6 +92,19 @@ namespace Bio.IO.BAM refSeqLengths = new List(); } + public void ClearCache() + { + header = null; + cachedBamIndex = null; + cacheBamPath = null; + } + + public void SetFilePath(string filePath) + { + if (filePath != cacheBamPath) ClearCache(); + cacheBamPath = filePath; + } + /// /// Gets the name of the sequence alignment parser being /// implemented. This is intended to give the @@ -137,8 +162,8 @@ namespace Bio.IO.BAM /// Returns true if the specified byte array indicates that the BAM file is compressed else returns false. private static bool IsCompressedBAMFile(byte[] array) { - return array[0] == 31 - && array[1] == 139 + return array[0] == 31 + && array[1] == 139 && array[2] == 8; } @@ -149,9 +174,9 @@ namespace Bio.IO.BAM /// Returns true if the specified byte array indicates a valid uncompressed BAM file else returns false. private static bool IsUnCompressedBAMFile(byte[] array) { - return array[0] == 66 - && array[1] == 65 - && array[2] == 77 + return array[0] == 66 + && array[1] == 65 + && array[2] == 77 && array[3] == 1; } @@ -253,12 +278,12 @@ namespace Bio.IO.BAM return i + 1 - startIndex; } - + /// /// Gets equivalent sequence char for the specified encoded value. /// /// Encoded value. - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static byte GetSeqCharAsByte(int encodedValue) { // Note that encoded value must be between 0 and 15, but we don't @@ -313,7 +338,7 @@ namespace Bio.IO.BAM //get all bins that overlap IList binnumbers = Reg2Bins((uint)start, (uint)end); //now only get those that match - List chunks = refIndex.Bins.Where(B => binnumbers.Contains(B.BinNumber)).SelectMany(x=>x.Chunks).ToList(); + List chunks = refIndex.Bins.Where(B => binnumbers.Contains(B.BinNumber)).SelectMany(x => x.Chunks).ToList(); //now use linear indexing to filter any chunks that end before the first start if (refIndex.LinearIndex.Count > 0) { @@ -329,7 +354,7 @@ namespace Bio.IO.BAM } chunks = chunks.Where(x => x.ChunkEnd >= minStart).ToList(); } - return SortAndMergeChunks(chunks); + return SortAndMergeChunks(chunks); } /// @@ -412,7 +437,7 @@ namespace Bio.IO.BAM { throw new ArgumentNullException("stream"); } - + return this.GetAlignmentMapIterator(stream); } @@ -427,7 +452,7 @@ namespace Bio.IO.BAM /// /// internal SequenceAlignmentMap GetAlignmentMap(Stream reader, BAMIndexStorage bamIndexStorage = null, - string refSeqName = null, int? refSeqIndex = null, int start = 0, int end = int.MaxValue) + string refSeqName = null, int? refSeqIndex = null, int? start = null, int? end = null) { if (reader == null || reader.Length == 0) { @@ -436,8 +461,13 @@ namespace Bio.IO.BAM readStream = reader; ValidateReader(); - - SAMAlignmentHeader header = this.GetHeader(); + bool parsedHeader = false; + if (header == null) + { + this.header = this.GetHeader(); + parsedHeader = true; + } + SequenceAlignmentMap seqMap = null; if (refSeqIndex.HasValue && refSeqName == null) { @@ -473,6 +503,8 @@ namespace Bio.IO.BAM } else { + // Special case: If there's no index, then we must re-parse the header to get to the right place + if (!parsedHeader) header = this.GetHeader(); GetAlignmentWithoutIndex(header, ref seqMap); } return seqMap; @@ -497,7 +529,12 @@ namespace Bio.IO.BAM } readStream = reader; ValidateReader(); - SAMAlignmentHeader header = this.GetHeader(); + bool parsedHeader = false; + if (header == null) + { + this.header = this.GetHeader(); + parsedHeader = true; + } if (refSeq.HasValue && refSeqName == null) { @@ -537,6 +574,7 @@ namespace Bio.IO.BAM } else { + if (!parsedHeader) header = this.GetHeader(); // Force reading the header, to get to correct file position foreach (SAMAlignedSequence seq in GetAlignmentWithoutIndexYield()) { yield return seq; @@ -575,13 +613,17 @@ namespace Bio.IO.BAM readStream = null; } - private void GetAlignmentWithIndex(BAMIndexStorage bamIndexStorage, int refSeqIndex, int start, int end, + private void GetAlignmentWithIndex(BAMIndexStorage bamIndexStorage, int refSeqIndex, int? start, int? end, SAMAlignmentHeader header, ref SequenceAlignmentMap seqMap) { IList chunks; seqMap = new SequenceAlignmentMap(header); - BAMIndex bamIndexInfo = bamIndexStorage.Read(); + if (cachedBamIndex == null) + { + cachedBamIndex = bamIndexStorage.Read(); + } + BAMIndex bamIndexInfo = cachedBamIndex; if (refSeqIndex != -1 && bamIndexInfo.RefIndexes.Count <= refSeqIndex) { @@ -590,13 +632,13 @@ namespace Bio.IO.BAM BAMReferenceIndexes refIndex = bamIndexInfo.RefIndexes[refSeqIndex]; - if (start == 0 && end == int.MaxValue) + if (start == null && end == null) { chunks = GetChunks(refIndex); } else { - chunks = GetChunks(refIndex, start, end); + chunks = GetChunks(refIndex, (int)start, (int)end); } IList alignedSeqs = GetAlignedSequences(chunks, start, end); @@ -611,12 +653,12 @@ namespace Bio.IO.BAM private void GetAlignmentWithoutIndex(SAMAlignmentHeader header, ref SequenceAlignmentMap seqMap) { Chunk lastChunk = null; - FileOffset lastOffSet = new FileOffset(0,0); + FileOffset lastOffSet = new FileOffset(0, 0); BAMReferenceIndexes refIndices = null; int lastBin = int.MaxValue; int lastRefSeqIndex = 0; int lastRefPos = Int32.MinValue; - + if (createBamIndex) { bamIndex = new BAMIndex(); @@ -626,7 +668,7 @@ namespace Bio.IO.BAM } refIndices = bamIndex.RefIndexes[0]; } - + if (!createBamIndex && seqMap == null) { seqMap = new SequenceAlignmentMap(header); @@ -636,9 +678,9 @@ namespace Bio.IO.BAM { if (createBamIndex) { - lastOffSet=new FileOffset((ulong)currentCompressedBlockStartPos,(ushort)deCompressedStream.Position); + lastOffSet = new FileOffset((ulong)currentCompressedBlockStartPos, (ushort)deCompressedStream.Position); } - SAMAlignedSequence alignedSeq = GetAlignedSequence(0, int.MaxValue); + SAMAlignedSequence alignedSeq = GetAlignedSequence(null, null); // BAM indexing if (createBamIndex) @@ -659,9 +701,9 @@ namespace Bio.IO.BAM } if (lastRefPos > alignedSeq.Pos) { - throw new InvalidDataException("The BAM file is not sorted. " + alignedSeq.QName + " appears after a later sequence"); + throw new InvalidDataException("The BAM file is not sorted. " + alignedSeq.QName + " appears after a later sequence"); } - + lastRefPos = alignedSeq.Pos; //update Bins when we switch over if (lastBin != alignedSeq.Bin) @@ -688,7 +730,7 @@ namespace Bio.IO.BAM lastBin = alignedSeq.Bin; } //UPDATE LINEAR INDEX AND PROCESS READ FOR META-DATA - refIndices.AddReadToIndexInformation(alignedSeq,lastOffSet); + refIndices.AddReadToIndexInformation(alignedSeq, lastOffSet); } if (!createBamIndex && alignedSeq != null) @@ -700,14 +742,15 @@ namespace Bio.IO.BAM // BAM indexing if (createBamIndex) { - ulong compressedOff=(ulong)readStream.Position; - ushort uncompressedEnd=0; + ulong compressedOff = (ulong)readStream.Position; + ushort uncompressedEnd = 0; //TODO: Shouldn't this always be true? Or go to max value? - if (deCompressedStream != null) { + if (deCompressedStream != null) + { uncompressedEnd = (ushort)deCompressedStream.Position; } - FileOffset veryLast=new FileOffset(compressedOff,uncompressedEnd); - lastChunk.ChunkEnd=veryLast; + FileOffset veryLast = new FileOffset(compressedOff, uncompressedEnd); + lastChunk.ChunkEnd = veryLast; foreach (var ri in bamIndex.RefIndexes) { ri.Freeze(); @@ -725,7 +768,7 @@ namespace Bio.IO.BAM while (!IsEOF()) { - SAMAlignedSequence alignedSeq = GetAlignedSequence(0, int.MaxValue); + SAMAlignedSequence alignedSeq = GetAlignedSequence(null, null); yield return alignedSeq; } } @@ -770,8 +813,8 @@ namespace Bio.IO.BAM ValidateReader(); return GetHeader(); - } - + } + /// /// Returns an aligned sequence by parses the BAM file. /// @@ -782,7 +825,7 @@ namespace Bio.IO.BAM /// public SAMAlignedSequence GetAlignedSequence(bool isReadOnly) { - return GetAlignedSequence(0, int.MaxValue); + return GetAlignedSequence(null, null); } /// @@ -943,11 +986,11 @@ namespace Bio.IO.BAM } } - + /// /// Returns an aligned sequence by parses the BAM file. /// - private SAMAlignedSequence GetAlignedSequence(int start, int end) + private SAMAlignedSequence GetAlignedSequence(int? start, int? end) { byte[] array = new byte[4]; @@ -969,7 +1012,7 @@ namespace Bio.IO.BAM // if there is no overlap no need to parse further. // BAMPos > closedEnd // => (alignedSeq.Pos - 1) > end -1 - if (alignedSeq.Pos > end) + if (end != null && alignedSeq.Pos > end) { return null; } @@ -1066,7 +1109,7 @@ namespace Bio.IO.BAM // if there is no overlap no need to parse further. // ZeroBasedRefEnd < start // => (alignedSeq.RefEndPos -1) < start - if (alignedSeq.RefEndPos - 1 < start && alignedSeq.RName!="*") + if (start != null && alignedSeq.RefEndPos - 1 < start && alignedSeq.RName != "*") { return null; } @@ -1107,7 +1150,7 @@ namespace Bio.IO.BAM { sequence = new byte[] { SAMParser.AsteriskAsByte }; } - + byte[] qualValues; if (alignmentBlock[startIndex] != 0xFF && readLen != 0) @@ -1134,20 +1177,20 @@ namespace Bio.IO.BAM qualValues = new byte[] { SAMParser.AsteriskAsByte }; } //Values have already been validated when first parsed at this point so no need to again - SAMParser.ParseQualityNSequence(alignedSeq, Alphabet, sequence, qualValues,false); + SAMParser.ParseQualityNSequence(alignedSeq, Alphabet, sequence, qualValues, false); startIndex += readLen; if (alignmentBlock.Length > startIndex + 4 && alignmentBlock[startIndex] != 0x0 && alignmentBlock[startIndex + 1] != 0x0) { - for (index = startIndex; index < alignmentBlock.Length; ) + for (index = startIndex; index < alignmentBlock.Length;) { SAMOptionalField optionalField = new SAMOptionalField - { - Tag = Encoding.UTF8.GetString(alignmentBlock, index, 2) - }; + { + Tag = Encoding.UTF8.GetString(alignmentBlock, index, 2) + }; index += 2; char vType = (char)alignmentBlock[index++]; - + // SAM format supports [AifZH] for value type. // In BAM, an integer may be stored as a signed 8-bit integer (c), unsigned 8-bit integer (C), signed short (s), unsigned // short (S), signed 32-bit (i) or unsigned 32-bit integer (I), depending on the signed magnitude of the integer. However, @@ -1170,7 +1213,7 @@ namespace Bio.IO.BAM return alignedSeq; } - + /// @@ -1204,7 +1247,7 @@ namespace Bio.IO.BAM if (bytesToRead < count) { GetNextBlock(); - ReadUnCompressedData(array, offset+bytesToRead, count - bytesToRead); + ReadUnCompressedData(array, offset + bytesToRead, count - bytesToRead); } } @@ -1352,7 +1395,7 @@ namespace Bio.IO.BAM } // Gets aligned sequence from the specified chunks of the BAM file which overlaps with the specified start and end co-ordinates. - internal IList GetAlignedSequences(IList chunks, int start, int end) + internal IList GetAlignedSequences(IList chunks, int? start, int? end) { List alignedSeqs = new List(); foreach (Chunk chunk in chunks) diff --git a/Source/Bio.Core/IO/IParser.cs b/Source/Bio.Core/IO/IParser.cs index 3bf01d7..240659c 100644 --- a/Source/Bio.Core/IO/IParser.cs +++ b/Source/Bio.Core/IO/IParser.cs @@ -29,6 +29,7 @@ namespace Bio.IO /// will return a string containing all extensions with a ',' delimited. /// string SupportedFileTypes { get; } + } ///