From 0916d3d6f797b3473f9cc11c83cb0fe1b17c8499 Mon Sep 17 00:00:00 2001 From: alumi Date: Fri, 28 Jul 2017 17:48:29 +0900 Subject: [PATCH] Refactor BAM I/O. --- src/cljam/algo/bam_indexer.clj | 4 +- src/cljam/algo/convert.clj | 41 ++-- src/cljam/algo/depth.clj | 17 +- src/cljam/algo/pileup/mpileup.clj | 2 +- src/cljam/io/bam/core.clj | 4 +- src/cljam/io/bam/decoder.clj | 297 +++++++++++++++++++--------- src/cljam/io/bam/encoder.clj | 199 ++++++------------- src/cljam/io/bam/reader.clj | 313 ++++++++---------------------- src/cljam/io/bam/writer.clj | 187 ++---------------- src/cljam/io/bam_index/writer.clj | 69 +++---- src/cljam/io/protocols.clj | 9 +- src/cljam/io/sam.clj | 9 +- src/cljam/io/sam/reader.clj | 72 ++++--- src/cljam/io/sam/util.clj | 14 +- src/cljam/io/util/cigar.clj | 43 ++-- src/cljam/io/util/lsb.clj | 143 ++++++++------ src/cljam/tools/cli.clj | 8 +- test/cljam/algo/t_bam_indexer.clj | 8 +- test/cljam/io/t_sam.clj | 72 +++---- test/cljam/t_common.clj | 62 +++--- 20 files changed, 660 insertions(+), 913 deletions(-) diff --git a/src/cljam/algo/bam_indexer.clj b/src/cljam/algo/bam_indexer.clj index 88f876ef..ae8d0155 100644 --- a/src/cljam/algo/bam_indexer.clj +++ b/src/cljam/algo/bam_indexer.clj @@ -9,6 +9,4 @@ [in-bam out-bai & {:keys [n-threads] :or {n-threads 0}}] (with-open [r (sam/bam-reader in-bam)] (binding [*n-threads* n-threads] - (bai-core/create-index out-bai - (sam/read-blocks r {} {:mode :pointer}) - (sam/read-refs r))))) + (bai-core/create-index out-bai (sam/read-blocks r) (sam/read-refs r))))) diff --git a/src/cljam/algo/convert.clj b/src/cljam/algo/convert.clj index 50ead474..913fa12e 100644 --- a/src/cljam/algo/convert.clj +++ b/src/cljam/algo/convert.clj @@ -5,45 +5,42 @@ [cljam.io.sam.util :as sam-util] [cljam.io.util :as io-util] [com.climate.claypoole :as cp]) - (:import [cljam.io.bam.writer BAMWriter])) + (:import [java.nio ByteBuffer] + [cljam.io.bam.writer BAMWriter])) (def ^:private default-num-block 100000) -(def ^:private default-num-write-block 10000) -(defn- sam-write-alignments [rdr wtr hdr num-block num-write-block] +(defn- sam-write-alignments [rdr wtr hdr num-block] (doseq [alns (partition-all num-block (sam/read-alignments rdr {}))] (sam/write-alignments wtr alns hdr))) -(defn- bam-write-alignments [rdr wtr hdr num-block num-write-block] - (let [refs (sam-util/make-refs hdr) - w (.writer ^BAMWriter wtr)] +(defn- bam-write-alignments [rdr wtr hdr num-block] + (let [refs (sam-util/make-refs hdr)] (cp/with-shutdown! [pool (cp/threadpool (cp/ncpus))] - (doseq [alns (partition-all num-block (sam/read-alignments rdr {}))] - (let [blocks (doall (cp/pmap pool - (fn [lalns] - (doall (map #(encoder/encode % refs) - lalns))) - (partition-all num-write-block alns)))] - (doseq [block blocks] - (doseq [e block] - (encoder/write-encoded-alignment w e)))))))) + (doseq [blocks (cp/pmap pool + (fn [chunk] + (mapv #(let [bb (ByteBuffer/allocate (encoder/get-block-size %))] + (encoder/encode-alignment bb % refs) + {:data (.array bb)}) + chunk)) + (partition-all num-block (sam/read-alignments rdr {})))] + (sam/write-blocks wtr blocks))))) (defn- _convert! - [rdr wtr num-block num-write-block write-alignments-fn] + [rdr wtr num-block write-alignments-fn] (let [hdr (sam/read-header rdr)] (sam/write-header wtr hdr) (sam/write-refs wtr hdr) - (write-alignments-fn rdr wtr hdr num-block num-write-block))) + (write-alignments-fn rdr wtr hdr num-block))) (defn convert "Converts file format from input file to output file by the file extension." - [in out & {:keys [num-block num-write-block] - :or {num-block default-num-block - num-write-block default-num-write-block}}] + [in out & {:keys [num-block] + :or {num-block default-num-block}}] (with-open [rdr (sam/reader in) wtr (sam/writer out)] (cond - (io-util/sam-writer? wtr) (_convert! rdr wtr num-block num-write-block sam-write-alignments) - (io-util/bam-writer? wtr) (_convert! rdr wtr num-block num-write-block bam-write-alignments) + (io-util/sam-writer? wtr) (_convert! rdr wtr num-block sam-write-alignments) + (io-util/bam-writer? wtr) (_convert! rdr wtr num-block bam-write-alignments) :else (throw (ex-info (str "Unsupported output file format " out) {})))) nil) diff --git a/src/cljam/algo/depth.clj b/src/cljam/algo/depth.clj index 97db7fc5..a8b619e6 100644 --- a/src/cljam/algo/depth.clj +++ b/src/cljam/algo/depth.clj @@ -4,7 +4,8 @@ [cljam.common :as common] [cljam.util :as util] [cljam.io.sam :as sam] - [cljam.io.sam.util :as sam-util])) + [cljam.io.sam.util :as sam-util]) + (:import [cljam.io.protocols SAMRegionBlock])) (def ^:const default-step 1000000) @@ -17,7 +18,7 @@ (let [pile (long-array (inc (- end beg)))] (doseq [aln alns] (let [left (max (:pos aln) beg) - right (min (sam-util/get-end aln) end) + right (min (:end aln) end) left-index (- left beg)] (dotimes [i (inc (- right left))] (aset-long pile (+ i left-index) (inc (aget pile (+ i left-index))))))) @@ -28,7 +29,7 @@ [rdr rname start end step] (let [n-threads (common/get-exec-n-threads) read-fn (fn [r start end] - (sam/read-alignments r {:chr rname :start start :end end :depth :shallow})) + (sam/read-blocks r {:chr rname :start start :end end} {:mode :region})) count-fn (fn [xs] (if (= n-threads 1) (map (fn [[start end]] @@ -62,9 +63,9 @@ (let [beg (int beg) end (int end) offset (int offset)] - (doseq [aln alns] - (let [left (Math/max ^int (:pos aln) (unchecked-int beg)) - right (unchecked-inc-int (or (:end aln) (sam-util/get-end aln))) + (doseq [^SAMRegionBlock aln alns] + (let [left (Math/max ^int (.pos aln) beg) + right (unchecked-inc-int (.end aln)) left-index (unchecked-add-int (unchecked-subtract-int left beg) offset) right-index (unchecked-add-int (unchecked-subtract-int right beg) offset)] (aset-int pile left-index (unchecked-inc-int (aget pile left-index))) @@ -103,12 +104,12 @@ (let [pile (int-array (inc (- end start))) f (if unchecked? unchecked-aset-depth-in-region! aset-depth-in-region!)] (if (= n-threads 1) - (f (sam/read-alignments rdr region {:depth :shallow}) start end 0 pile) + (f (sam/read-blocks rdr region {:mode :region}) start end 0 pile) (cp/pdoseq n-threads [[s e] (util/divide-region start end step)] (with-open [r (sam/clone-bam-reader rdr)] - (-> (sam/read-alignments r {:chr chr, :start s, :end e} {:depth :shallow}) + (-> (sam/read-blocks r {:chr chr, :start s, :end e} {:mode :region}) (f s e (- s start) pile))))) pile)) diff --git a/src/cljam/algo/pileup/mpileup.clj b/src/cljam/algo/pileup/mpileup.clj index 8ad0da95..6a3189de 100644 --- a/src/cljam/algo/pileup/mpileup.clj +++ b/src/cljam/algo/pileup/mpileup.clj @@ -137,7 +137,7 @@ refseq (if ref-reader (cseq/read-sequence ref-reader {:chr chr :start s :end e}) (repeat \N))] - (->> (sam/read-alignments bam-reader {:chr chr :start s :end e :depth :deep}) + (->> (sam/read-alignments bam-reader {:chr chr :start s :end e}) (sequence (comp (filter basic-mpileup-pred) diff --git a/src/cljam/io/bam/core.clj b/src/cljam/io/bam/core.clj index d71e38b1..39f05159 100644 --- a/src/cljam/io/bam/core.clj +++ b/src/cljam/io/bam/core.clj @@ -2,7 +2,7 @@ "The core of BAM features." (:require [clojure.java.io :as cio] [clojure.string :as cstr] - [cljam.io.bam [common :refer [bam-magic]] + [cljam.io.bam [common :as common] [reader :as reader] [writer :as writer]] [cljam.io.bam-index :as bai] @@ -33,7 +33,7 @@ [f] (let [rdr (BGZFInputStream. (cio/file f)) data-rdr (DataInputStream. rdr)] - (when-not (Arrays/equals ^bytes (lsb/read-bytes data-rdr 4) (.getBytes ^String bam-magic)) + (when-not (Arrays/equals ^bytes (lsb/read-bytes data-rdr 4) (.getBytes ^String common/bam-magic)) (throw (IOException. "Invalid BAM file"))) (let [{:keys [header refs]} (reader/load-headers data-rdr) index-delay (delay (bam-index f))] diff --git a/src/cljam/io/bam/decoder.clj b/src/cljam/io/bam/decoder.clj index 9e7742c2..602d7d37 100644 --- a/src/cljam/io/bam/decoder.clj +++ b/src/cljam/io/bam/decoder.clj @@ -1,14 +1,15 @@ (ns cljam.io.bam.decoder "Decoder of BAM alignment blocks." - (:require [clojure.string :refer [join]] - [proton.core :refer [hex->bytes]] - [cljam.util :refer [ubyte]] + (:require [clojure.string :as cstr] + [proton.core :as proton] + [cljam.util :as util] [cljam.io.sam.util :as sam-util] - [cljam.io.bam.common :refer [fixed-block-size]] - [cljam.io.util.lsb :as lsb]) - (:import java.util.Arrays + [cljam.io.bam.common :as common] + [cljam.io.util.lsb :as lsb] + [cljam.io.util.cigar :as cigar]) + (:import [java.util Arrays] [java.nio ByteBuffer ByteOrder CharBuffer] - [cljam.io.protocols SAMAlignment])) + [cljam.io.protocols SAMAlignment SAMRegionBlock SAMCoordinateBlock SAMQuerynameBlock])) (definline validate-tag-type [t] @@ -31,7 +32,7 @@ ~(long \c) (int (.get ~bb)) ~(long \C) (bit-and (int (.get ~bb)) 0xff) ~(long \f) (.getFloat ~bb) - ~(long \H) (hex->bytes (lsb/read-null-terminated-string ~bb)) + ~(long \H) (proton/hex->bytes (lsb/read-null-terminated-string ~bb)) (throw (Exception. "Unrecognized tag type")))) (defn- parse-tag-array [^ByteBuffer bb] @@ -48,7 +49,7 @@ \f (.getFloat bb) (throw (Exception. (str "Unrecognized tag array type: " typ))))) (cons typ) - (join \,)))) + (cstr/join \,)))) (defn- parse-option [^ByteBuffer bb] (lazy-seq @@ -74,101 +75,205 @@ (defn options-size [^long block-size ^long l-read-name ^long n-cigar-op ^long l-seq] (- block-size - fixed-block-size + common/fixed-block-size l-read-name (* n-cigar-op 4) - (int (/ (inc l-seq) 2)) + (quot (inc l-seq) 2) l-seq)) (defn decode-qual [^bytes b] - (if (Arrays/equals b (byte-array (alength b) (ubyte 0xff))) + (if (Arrays/equals b (byte-array (alength b) (util/ubyte 0xff))) "*" (sam-util/phred-bytes->fastq b))) (defn decode-seq [seq-bytes length] (sam-util/compressed-bases->str length seq-bytes 0)) -(defn decode-next-ref-id [refs n rname] - (cond - (= n -1) "*" - (= (sam-util/ref-name refs n) rname) "=" - :else (sam-util/ref-name refs n))) - -(defn decode-cigar-and-ref-length - [cigar-bytes] - (let [buf (ByteBuffer/wrap cigar-bytes) - sb (StringBuilder.)] - (.order buf ByteOrder/LITTLE_ENDIAN) - (loop [ref-length 0] - (if (.hasRemaining buf) - (let [b (.getInt buf) - op (bit-and b 0xF) - n (bit-shift-right b 4)] - (doto sb - (.append n) - (.append (case op 0 \M 1 \I 2 \D 3 \N 4 \S 5 \H 6 \P 7 \= 8 \X))) - (recur (+ ref-length (case op 0 n 2 n 3 n 7 n 8 n 0)))) - [(.toString sb) ref-length])))) - -(defn deep-decode-alignment-block - [block refs] - (let [buffer (ByteBuffer/wrap (:data block))] - (let [ref-id (lsb/read-int buffer) - rname (or (sam-util/ref-name refs ref-id) "*") - pos (inc (lsb/read-int buffer)) - l-read-name (int (lsb/read-ubyte buffer)) - mapq (lsb/read-ubyte buffer) - _ (lsb/read-ushort buffer) ; bin - n-cigar-op (lsb/read-ushort buffer) - flag (lsb/read-ushort buffer) - l-seq (lsb/read-int buffer) - next-ref-id (lsb/read-int buffer) - rnext (if (= ref-id next-ref-id) "=" (or (sam-util/ref-name refs next-ref-id) "*")) - pnext (inc (lsb/read-int buffer)) - tlen (lsb/read-int buffer) - qname (lsb/read-string buffer (dec l-read-name)) - _ (lsb/skip buffer 1) - [cigar len] (decode-cigar-and-ref-length (lsb/read-bytes buffer (* n-cigar-op 4))) - seq (decode-seq (lsb/read-bytes buffer (quot (inc l-seq) 2)) l-seq) - qual (decode-qual (lsb/read-bytes buffer l-seq)) - rest (lsb/read-bytes buffer (options-size (:size block) l-read-name n-cigar-op l-seq)) - options (decode-options rest)] - (SAMAlignment. qname (int flag) rname (int pos) (if (zero? len) 0 (int (dec (+ pos len)))) (int mapq) - cigar rnext (int pnext) (int tlen) seq qual options)))) - -(defn light-decode-alignment-block - [block refs] - (let [buffer (ByteBuffer/wrap (:data block))] - (let [ref-id (lsb/read-int buffer) - rname (or (sam-util/ref-name refs ref-id) "*") - pos (inc (lsb/read-int buffer)) - l-read-name (int (lsb/read-ubyte buffer)) - _ (lsb/skip buffer 3) - n-cigar-op (lsb/read-ushort buffer) - _ (lsb/skip buffer (+ 18 l-read-name)) - cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4))] - {:rname rname, :pos pos, :meta {:cigar-bytes cigar-bytes}}))) - -(defn pointer-decode-alignment-block - [block _] - (let [buffer (ByteBuffer/wrap (:data block))] - (let [ref-id (lsb/read-int buffer) - pos (inc (lsb/read-int buffer)) - l-read-name (int (lsb/read-ubyte buffer)) - _ (lsb/skip buffer 3) - n-cigar-op (lsb/read-ushort buffer) - flag (lsb/read-ushort buffer) - _ (lsb/skip buffer (+ 16 l-read-name)) - cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4))] - {:flag flag, :ref-id ref-id, :pos pos, - :meta {:chunk {:beg (:pointer-beg block), - :end (:pointer-end block)} - :cigar-bytes cigar-bytes}}))) - -(defn decode-alignment-block - [block refs depth] - (let [f (case depth - :deep deep-decode-alignment-block - :shallow light-decode-alignment-block - :pointer pointer-decode-alignment-block)] - (f block refs))) +(defn decode-next-ref-id [refs ^long ref-id ^long next-ref-id] + (if (= next-ref-id -1) + "*" + (if (= ref-id next-ref-id) + "=" + (sam-util/ref-name refs next-ref-id)))) + +(defrecord BAMRawBlock [data ^long pointer-beg ^long pointer-end]) + +(defn decode-alignment + "Decodes BAM block and creates SAMAlignment instance which is compatible with SAM. + When called with start and end, this function may return nil if any base of the block + is not included in the range." + ([refs block] + (let [buffer (ByteBuffer/wrap (:data block)) + ref-id ^int (lsb/read-int buffer) + rname (or (sam-util/ref-name refs ref-id) "*") + pos (inc ^int (lsb/read-int buffer)) + l-read-name (int ^long (lsb/read-ubyte buffer)) + mapq ^long (lsb/read-ubyte buffer) + _ (lsb/skip buffer 2) ; bin + n-cigar-op ^long (lsb/read-ushort buffer) + flag ^long (lsb/read-ushort buffer) + l-seq ^int (lsb/read-int buffer) + next-ref-id ^int (lsb/read-int buffer) + rnext (decode-next-ref-id refs ref-id next-ref-id) + pnext (inc ^int (lsb/read-int buffer)) + tlen ^int (lsb/read-int buffer) + qname (lsb/read-string buffer (dec l-read-name)) + _ (lsb/skip buffer 1) + cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4)) + [cigar ^long len] (cigar/decode-cigar-and-ref-length cigar-bytes) + ref-end (int ^long (if (zero? len) pos (dec (+ pos len)))) + seq (decode-seq (lsb/read-bytes buffer (quot (inc l-seq) 2)) l-seq) + qual (decode-qual (lsb/read-bytes buffer l-seq)) + rest (lsb/read-bytes buffer (options-size (alength ^bytes (:data block)) l-read-name n-cigar-op l-seq)) + options (decode-options rest)] + (SAMAlignment. qname (int flag) rname (int pos) ref-end (int mapq) + cigar rnext (int pnext) (int tlen) seq qual options))) + ([refs block ^long start ^long end] + (let [buffer (ByteBuffer/wrap (:data block)) + ref-id ^int (lsb/read-int buffer) + pos (inc ^int (lsb/read-int buffer))] + (when (<= pos end) + (let [l-read-name (int ^long (lsb/read-ubyte buffer)) + mapq ^long (lsb/read-ubyte buffer) + _ (lsb/skip buffer 2) ; bin + n-cigar-op ^long (lsb/read-ushort buffer) + flag ^long (lsb/read-ushort buffer) + l-seq ^int (lsb/read-int buffer) + next-ref-id ^int (lsb/read-int buffer) + rnext (decode-next-ref-id refs ref-id next-ref-id) + pnext (inc ^int (lsb/read-int buffer)) + tlen ^int (lsb/read-int buffer) + qname (lsb/read-string buffer (dec l-read-name)) + _ (lsb/skip buffer 1) + cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4)) + [cigar ^long len] (cigar/decode-cigar-and-ref-length cigar-bytes) + ref-end (int ^long (if (zero? len) pos (dec (+ pos len))))] + (when (<= start ref-end) + (let [seq (decode-seq (lsb/read-bytes buffer (quot (inc l-seq) 2)) l-seq) + qual (decode-qual (lsb/read-bytes buffer l-seq)) + rest (lsb/read-bytes buffer (options-size (alength ^bytes (:data block)) l-read-name n-cigar-op l-seq)) + rname (or (sam-util/ref-name refs ref-id) "*") + options (decode-options rest)] + (SAMAlignment. qname (int flag) rname (int pos) ref-end (int mapq) + cigar rnext (int pnext) (int tlen) seq qual options)))))))) + +(defn decode-region-block + "Decodes BAM block and returns a SAMRegionBlock instance containing covering range of the alignment." + ([^BAMRawBlock block] + (let [buffer (ByteBuffer/wrap (.data block)) + ref-id ^int (lsb/read-int buffer) + pos (inc ^int (lsb/read-int buffer)) + l-read-name (int ^long (lsb/read-ubyte buffer)) + _ (lsb/skip buffer 3) ;; MAPQ, bin + n-cigar-op ^long (lsb/read-ushort buffer) + _ (lsb/skip buffer (+ 18 l-read-name)) ;; flag, l_seq, rnext, pnext, tlen, qname + cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4)) + ref-length ^long (cigar/count-ref-bytes cigar-bytes) + ref-end (int ^long (if (zero? ref-length) pos (dec (+ pos ref-length))))] + (SAMRegionBlock. (.data block) ref-id pos ref-end))) + ([^BAMRawBlock block ^long start ^long end] + (let [buffer (ByteBuffer/wrap (.data block)) + ref-id ^int (lsb/read-int buffer) + pos (inc ^int (lsb/read-int buffer))] + (when (<= pos end) + (let [l-read-name (int ^long (lsb/read-ubyte buffer)) + _ (lsb/skip buffer 3) ;; MAPQ, bin + n-cigar-op ^long (lsb/read-ushort buffer) + _ (lsb/skip buffer (+ 18 l-read-name)) ;; flag, l_seq, rnext, pnext, tlen, qname + cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4)) + ref-length ^long (cigar/count-ref-bytes cigar-bytes) + ref-end (int ^long (if (zero? ref-length) pos (dec (+ pos ref-length))))] + (when (<= start ref-end) + (SAMRegionBlock. (.data block) ref-id pos ref-end))))))) + +(defn decode-coordinate-block + "Decodes BAM block and returns a SAMCoordinateBlock instance containing ref-id, pos and flag." + ([^BAMRawBlock block] + (let [buffer (ByteBuffer/wrap (.data block)) + ref-id ^int (lsb/read-int buffer) + pos (inc ^int (lsb/read-int buffer)) + _ (lsb/skip buffer 6) ;; l_read_name, MAPQ, bin, n_cigar_op + flag ^long (lsb/read-ushort buffer)] ;; l_seq, rnext, pnext, tlen, qname + (SAMCoordinateBlock. (.data block) (int ref-id) (int pos) (int flag)))) + ([^BAMRawBlock block ^long start ^long end] + (let [buffer (ByteBuffer/wrap (.data block)) + ref-id ^int (lsb/read-int buffer) + pos (inc ^int (lsb/read-int buffer))] + (when (<= pos end) + (let [l-read-name (int ^long (lsb/read-ubyte buffer)) + _ (lsb/skip buffer 3) ;; MAPQ, bin + n-cigar-op ^long (lsb/read-ushort buffer) + flag ^long (lsb/read-ushort buffer) + _ (lsb/skip buffer (+ 16 l-read-name)) ;; l_seq, rnext, pnext, tlen, qname + cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4)) + ref-length ^long (cigar/count-ref-bytes cigar-bytes) + ref-end (int ^long (if (zero? ref-length) pos (dec (+ pos ref-length))))] + (when (<= start ref-end) + (SAMCoordinateBlock. (.data block) (int ref-id) (int pos) (int flag)))))))) + +(defn decode-queryname-block + "Decodes BAM block and returns a SAMQuerynameBlock instance containing qname and flag." + ([^BAMRawBlock block] + (let [buffer (ByteBuffer/wrap (.data block)) + _ (lsb/skip buffer 8) ;; ref-id, pos + l-read-name ^long (lsb/read-ubyte buffer) + _ (lsb/skip buffer 5) ;; MAPQ, bin, n_cigar_op + flag ^long (lsb/read-ushort buffer) + _ (lsb/skip buffer 16) ;; l_seq, rnext, pnext, tlen + qname (lsb/read-string buffer l-read-name)] + (SAMQuerynameBlock. (.data block) qname (int flag)))) + ([^BAMRawBlock block ^long start ^long end] + (let [buffer (ByteBuffer/wrap (.data block)) + _ (lsb/skip buffer 4) ;; ref-id + pos (inc ^int (lsb/read-int buffer))] + (when (<= pos end) + (let [l-read-name ^long (lsb/read-ubyte buffer) + _ (lsb/skip buffer 3) ;; MAPQ, bin + n-cigar-op ^long (lsb/read-ushort buffer) + flag ^long (lsb/read-ushort buffer) + _ (lsb/skip buffer 16) ;; l_seq, rnext, pnext, tlen + qname (lsb/read-string buffer l-read-name) + cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4)) + ref-length ^long (cigar/count-ref-bytes cigar-bytes) + ref-end (int ^long (if (zero? ref-length) pos (dec (+ pos ref-length))))] + (when (<= start ref-end) + (SAMQuerynameBlock. (.data block) qname (int flag)))))))) + +(defrecord BAMPointerBlock [data ^int ref-id ^int pos ^int end ^int flag ^long pointer-beg ^long pointer-end]) +(defn decode-pointer-block + "Decodes BAM block and returns a BAMPointerBlock instance containing region, flag and block pointers." + ([^BAMRawBlock block] + (let [buffer (ByteBuffer/wrap (.data block)) + ref-id ^int (lsb/read-int buffer) + pos (inc ^int (lsb/read-int buffer)) + l-read-name (int ^long (lsb/read-ubyte buffer)) + _ (lsb/skip buffer 3) ;; MAPQ, bin + n-cigar-op ^long (lsb/read-ushort buffer) + flag ^long (lsb/read-ushort buffer) + _ (lsb/skip buffer (+ 16 l-read-name)) ;; l_seq, rnext, pnext, tlen, qname + cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4)) + ref-length ^long (cigar/count-ref-bytes cigar-bytes) + ref-end (int ^long (if (zero? ref-length) pos (dec (+ pos ref-length))))] + (BAMPointerBlock. (.data block) ref-id pos ref-end (int flag) (.pointer-beg block) (.pointer-end block)))) + ([^BAMRawBlock block ^long start ^long end] + (let [buffer (ByteBuffer/wrap (.data block)) + ref-id ^int (lsb/read-int buffer) + pos (inc ^int (lsb/read-int buffer))] + (when (<= pos end) + (let [l-read-name (int ^long (lsb/read-ubyte buffer)) + _ (lsb/skip buffer 3) ;; MAPQ, bin + n-cigar-op ^long (lsb/read-ushort buffer) + flag ^long (lsb/read-ushort buffer) + _ (lsb/skip buffer (+ 16 l-read-name)) ;; l_seq, rnext, pnext, tlen, qname + cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4)) + ref-length ^long (cigar/count-ref-bytes cigar-bytes) + ref-end (int ^long (if (zero? ref-length) pos (dec (+ pos ref-length))))] + (when (<= start ref-end) + (BAMPointerBlock. (.data block) ref-id pos ref-end (int flag) (.pointer-beg block) (.pointer-end block)))))))) + +(defn raw-block + "Checks the range of BAM block and returns the given block if any base is included." + ([b] b) + ([b ^long s ^long e] + (when (decode-region-block b s e) + b))) diff --git a/src/cljam/io/bam/encoder.clj b/src/cljam/io/bam/encoder.clj index 0a26c332..2484a7fd 100644 --- a/src/cljam/io/bam/encoder.clj +++ b/src/cljam/io/bam/encoder.clj @@ -1,81 +1,32 @@ (ns cljam.io.bam.encoder "Encoder of BAM alignment blocks." - (:require [clojure.string :refer [split]] - [cljam.util :refer [ubyte]] - [cljam.io.sam.util :refer [reg->bin normalize-bases fastq->phred - str->compressed-bases make-refs ref-id - stringify-header]] - [cljam.io.bam.common :refer [bam-magic fixed-block-size]] + (:require [clojure.string :as cstr] + [cljam.util :as util] + [cljam.io.sam.util :as sam-util] [cljam.io.util.cigar :as cgr] - [cljam.io.util.lsb :as lsb]) - (:import [java.util Arrays])) + [cljam.io.util.lsb :as lsb] + [cljam.io.bam.common :as common])) (def ^:private fixed-tag-size 3) (def ^:private fixed-binary-array-tag-size 5) -(defn- get-pos [aln] - (dec (:pos aln))) - -(defn- get-end [aln] - (dec - (+ (:pos aln) - (cgr/count-ref (:cigar aln))))) - -(defn- get-l-seq [sam-alignment] - (count (:seq sam-alignment))) - (defn- get-next-ref-id [sa refs] (condp = (:rnext sa) "*" -1 - "=" (if-let [id (ref-id refs (:rname sa))] id -1) - (if-let [id (ref-id refs (:rnext sa))] id -1))) - -(defn- get-ref-id [aln refs] - (if-let [id (ref-id refs (:rname aln))] id -1)) - -(defn- get-next-pos [sam-alignment] - (dec (:pnext sam-alignment))) - -(defn- get-tlen [sam-alignment] - (:tlen sam-alignment)) - -(defn- get-read-name [sam-alignment] - (:qname sam-alignment)) - -(defn- encode-cigar-op [op] - (case op - \M (byte 0) - \I (byte 1) - \D (byte 2) - \N (byte 3) - \S (byte 4) - \H (byte 5) - \P (byte 6) - \= (byte 7) - \X (byte 8))) - -(defn- encode-cigar [cigar] - (map #(bit-or (bit-shift-left (first %) 4) - (encode-cigar-op (second %))) - (cgr/parse cigar))) - -(defn- get-cigar [aln] - (encode-cigar (:cigar aln))) - -(defn- get-seq [sam-alignment] - (str->compressed-bases (:seq sam-alignment))) + "=" (if-let [id (sam-util/ref-id refs (:rname sa))] id -1) + (if-let [id (sam-util/ref-id refs (:rnext sa))] id -1))) (defn- get-options-size [sam-alignment] (->> (map (fn [op] - (let [[tag value] (first (seq op))] + (let [[_ value] (first (seq op))] (+ fixed-tag-size (case (first (:type value)) \A 1 \i 4 \f 4 \Z (inc (count (:value value))) - \B (let [[array-type & array] (split (:value value) #",")] + \B (let [[array-type & array] (cstr/split (:value value) #",")] (+ fixed-binary-array-tag-size (* (count array) (case (first array-type) @@ -90,116 +41,78 @@ (:options sam-alignment)) (reduce +))) -(defn- get-block-size [aln] - (let [read-length (count (:seq aln)) - cigar-length (cgr/count-op (:cigar aln))] - (+ fixed-block-size (count (:qname aln)) 1 - (* cigar-length 4) - (int (/ (inc read-length) 2)) - read-length - (get-options-size aln)))) - (defn- encode-qual [sam-alignment] (if (= (:qual sam-alignment) "*") - (byte-array (count (:seq sam-alignment)) (ubyte 0xff)) - (fastq->phred (:qual sam-alignment)))) + (byte-array (.length ^String (:seq sam-alignment)) (util/ubyte 0xff)) + (sam-util/fastq->phred (:qual sam-alignment)))) -(defn- encode-tag-value [val-type value] +(defn- encode-tag-value [writer val-type value] (case val-type - \A (let [bb (lsb/gen-byte-buffer)] - (.putChar bb (char value)) - (Arrays/copyOfRange (.array bb) 0 1)) - \i (let [bb (lsb/gen-byte-buffer)] - (.putInt bb (int value)) - (Arrays/copyOfRange (.array bb) 0 4)) - \f (let [bb (lsb/gen-byte-buffer)] - (.putFloat bb (float value)) - (Arrays/copyOfRange (.array bb) 0 4)) - \Z (let [^String text value - text-size (count text) - buf (byte-array (inc text-size) (.getBytes text))] - (aset-byte buf text-size 0) - buf) + \A (lsb/write-char writer (char value)) + \i (lsb/write-int writer (int value)) + \f (lsb/write-float writer (float value)) + \Z (do (lsb/write-string writer value) + (lsb/write-char writer (char 0))) ;; \H nil - \B (let [[array-type & array] (split value #",")] + \B (let [[array-type & array] (cstr/split value #",")] (case (first array-type) \c nil \C nil \s nil - \S (let [total-len (+ 1 4 (* 2 (count array))) - bb (lsb/gen-byte-buffer total-len)] - (.put bb (byte-array 1 (byte \S)) 0 1) - (.putInt bb (count array)) + \S (do + (lsb/write-bytes writer (byte-array 1 (byte \S))) + (lsb/write-int writer (count array)) (doseq [v array] - (.putShort bb (Short/parseShort v))) - (Arrays/copyOfRange (.array bb) 0 total-len)) + (lsb/write-short writer (Short/parseShort v)))) \i nil \I nil \f nil)))) -(defn encode - [aln refs] - [(get-block-size aln) - (get-ref-id aln refs) - (get-pos aln) - (short (inc (count (:qname aln)))) - (short (:mapq aln)) - (reg->bin (:pos aln) (get-end aln)) - (cgr/count-op (:cigar aln)) - (:flag aln) - (get-l-seq aln) - (get-next-ref-id aln refs) - (get-next-pos aln) - (get-tlen aln) - (get-read-name aln) - (byte-array 1 (byte 0)) - (get-cigar aln) - (get-seq aln) - (encode-qual aln) - (doall (map (fn [op] - (let [[tag value] (first (seq op))] - [(short (bit-or (bit-shift-left (byte (second (name tag))) 8) - (byte (first (name tag))))) - (.getBytes ^String (:type value)) - (encode-tag-value (first (:type value)) (:value value))])) - (:options aln)))]) +(defn get-block-size [aln] + (let [read-length (.length ^String (:seq aln)) + cigar-length ^long (cgr/count-op (:cigar aln))] + (+ ^long common/fixed-block-size + (.length ^String (:qname aln)) + 1 ;; null + (* cigar-length 4) + (quot (inc read-length) 2) + read-length + ^long (get-options-size aln)))) -(defn write-encoded-alignment - [w aln] - ;; block_size - (lsb/write-int w (nth aln 0)) +(defn encode-alignment [wrtr aln refs] ;; refID - (lsb/write-int w (nth aln 1)) + (lsb/write-int wrtr (or (sam-util/ref-id refs (:rname aln)) -1)) ;; pos - (lsb/write-int w (nth aln 2)) + (lsb/write-int wrtr (dec (:pos aln))) ;; bin_mq_nl - (lsb/write-ubyte w (nth aln 3)) - (lsb/write-ubyte w (nth aln 4)) - (lsb/write-ushort w (nth aln 5)) + (lsb/write-ubyte wrtr (short (inc (.length ^String (:qname aln))))) + (lsb/write-ubyte wrtr (short (:mapq aln))) + (lsb/write-ushort wrtr (sam-util/reg->bin (:pos aln) (sam-util/get-end aln))) ;; flag_nc - (lsb/write-ushort w (nth aln 6)) - (lsb/write-ushort w (nth aln 7)) + (lsb/write-ushort wrtr (cgr/count-op (:cigar aln))) + (lsb/write-ushort wrtr (:flag aln)) ;; l_seq - (lsb/write-int w (nth aln 8)) + (lsb/write-int wrtr (.length ^String (:seq aln))) ;; next_refID - (lsb/write-int w (nth aln 9)) + (lsb/write-int wrtr (get-next-ref-id aln refs)) ;; next_pos - (lsb/write-int w (nth aln 10)) + (lsb/write-int wrtr (dec (:pnext aln))) ;; tlen - (lsb/write-int w (nth aln 11)) + (lsb/write-int wrtr (:tlen aln)) ;; read_name - (lsb/write-string w (nth aln 12)) - (lsb/write-bytes w (nth aln 13)) + (lsb/write-string wrtr (:qname aln)) + (lsb/write-bytes wrtr (byte-array 1 (byte 0))) ;; cigar - (doseq [cigar (nth aln 14)] - (lsb/write-int w cigar)) + (doseq [cigar (cgr/encode-cigar (:cigar aln))] + (lsb/write-int wrtr cigar)) ;; seq - (lsb/write-bytes w (nth aln 15)) + (lsb/write-bytes wrtr (sam-util/str->compressed-bases (:seq aln))) ;; qual - (lsb/write-bytes w (nth aln 16)) + (lsb/write-bytes wrtr (encode-qual aln)) ;; options - (doseq [v (nth aln 17)] - (lsb/write-short w (nth v 0)) - (lsb/write-bytes w (nth v 1)) - (when-not (nil? (nth v 2)) - (lsb/write-bytes w (nth v 2))))) + (doseq [op (:options aln)] + (let [[tag value] (first (seq op))] + (lsb/write-short wrtr (short (bit-or (bit-shift-left (byte (second (name tag))) 8) + (byte (first (name tag)))))) + (lsb/write-bytes wrtr (.getBytes ^String (:type value))) + (encode-tag-value wrtr (first (:type value)) (:value value))))) diff --git a/src/cljam/io/bam/reader.clj b/src/cljam/io/bam/reader.clj index 28486232..3be7d9ec 100644 --- a/src/cljam/io/bam/reader.clj +++ b/src/cljam/io/bam/reader.clj @@ -1,19 +1,17 @@ (ns cljam.io.bam.reader + "Reader of BAM file format." (:require [cljam.io.protocols :as protocols] - [cljam.io.sam.util :refer [ref-id ref-name parse-header get-end]] - [cljam.io.bam-index :refer [get-spans]] + [cljam.io.sam.util :as sam-util] [cljam.io.bam-index.core :as bai] - [cljam.io.bam.common :refer [fixed-block-size]] [cljam.io.bam.decoder :as decoder] [cljam.io.util.lsb :as lsb]) - (:import [java.io Closeable EOFException] - java.nio.ByteBuffer - bgzf4j.BGZFInputStream)) + (:import [java.io Closeable EOFException IOException FileNotFoundException] + [cljam.io.bam.decoder BAMRawBlock] + [java.nio ByteBuffer] + [bgzf4j BGZFInputStream])) -(declare read-alignments-sequentially* - read-alignments* - read-blocks-sequentially* - read-blocks*) +(declare read-blocks-sequentially* + read-blocks-randomly*) ;; BAMReader ;; --------- @@ -27,249 +25,108 @@ (.f this)) (read [this] (protocols/read this {})) - (read [this option] - (protocols/read-alignments this {} option)) + (read [this region] + (protocols/read-alignments this region)) protocols/IAlignmentReader (read-header [this] (.header this)) (read-refs [this] (.refs this)) (read-alignments [this] - (protocols/read-alignments this {} {})) - (read-alignments [this region] - (protocols/read-alignments this region {})) - (read-alignments [this - {:keys [chr start end] - :or {chr nil - start 1 - end Long/MAX_VALUE}} - {:keys [depth] - :or {depth :deep}}] - (if (nil? chr) - (read-alignments-sequentially* this depth) - (read-alignments* this chr start end depth))) + (protocols/read-alignments this {})) + (read-alignments [this {:keys [chr start end] + :or {start 1 + end Long/MAX_VALUE}}] + (let [decoder (partial decoder/decode-alignment (.refs this))] + (if (nil? chr) + (read-blocks-sequentially* this decoder) + (read-blocks-randomly* this chr start end decoder)))) (read-blocks [this] (protocols/read-blocks this {} {})) (read-blocks [this region] (protocols/read-blocks this region {})) (read-blocks [this {:keys [chr start end] - :or {chr nil - start 1 + :or {start 1 end Long/MAX_VALUE}} {:keys [mode] :or {mode :normal}}] - (if (nil? chr) - (read-blocks-sequentially* this mode) - (read-blocks* this chr start end))) + (let [decoder (if (fn? mode) + mode + (case mode + :normal decoder/raw-block + :region decoder/decode-region-block + :coordinate decoder/decode-coordinate-block + :queryname decoder/decode-queryname-block + :pointer decoder/decode-pointer-block))] + (if (nil? chr) + (read-blocks-sequentially* this decoder) + (read-blocks-randomly* this chr start end decoder)))) protocols/IRegionReader (read-in-region [this region] - (protocols/read-in-region this region {})) - (read-in-region [this region option] - (protocols/read-alignments this region option))) + (protocols/read-in-region this region)) + (read-in-region [this region _] + (protocols/read-alignments this region))) -;; Reading a single block -;; -------------------- - -(defn- read-alignment-block - "Reads an alignment block, returning a map including the block size and the - data as byte array." - [^BAMReader bam-reader refs] - (let [rdr (.data-reader bam-reader) - block-size (lsb/read-int rdr)] - (when (< block-size fixed-block-size) - (throw (Exception. (str "Invalid block size:" block-size)))) - {:size block-size - :data (lsb/read-bytes rdr block-size)})) - -(defn- read-coordinate-alignment-block - "Reads an alignment block, returning a map including the block size, the data - as byte array, rname, flag and pos." - [^BAMReader bam-reader refs] - (let [rdr (.data-reader bam-reader) - block-size (lsb/read-int rdr)] - (when (< block-size fixed-block-size) - (throw (Exception. (str "Invalid block size: " block-size)))) - (let [data (lsb/read-bytes rdr block-size) - bb (ByteBuffer/wrap data) - ref-id (lsb/read-int bb) - rname (or (ref-name refs ref-id) "*") - pos (inc (lsb/read-int bb)) - _ (lsb/skip bb 6) ;; l_qname, MAPQ, bin, n_cigar_op - flag (lsb/read-ushort bb) - ;; l_seq, rnext, pnext, tlen, qname - ] - {:size block-size - :data data - :rname rname - :pos pos - :flag flag - :ref-id ref-id}))) - -(defn- read-queryname-alignment-block - "Reads an alignment block, returning a map including the block size, the data - as byte array, qname and flag." - [^BAMReader bam-reader refs] - (let [rdr (.data-reader bam-reader) - block-size (lsb/read-int rdr)] - (when (< block-size fixed-block-size) - (throw (Exception. (str "Invalid block size: " block-size)))) - (let [data (lsb/read-bytes rdr block-size) - bb (ByteBuffer/wrap data) - _ (lsb/skip bb 8) ;; ref-id, pos - l-read-name (lsb/read-ubyte bb) - _ (lsb/skip bb 5) ;; MAPQ, bin, n_cigar_op - flag (lsb/read-ushort bb) - _ (lsb/skip bb 16) ;; l_seq, rnext, pnext, tlen - qname (lsb/read-string bb l-read-name)] - {:size block-size - :data data - :qname qname - :flag flag}))) - -(defn- read-pointer-alignment-block - "Reads an alignment block, returning a map including the data as byte array - and the file pointers. This function is for indexing a BAM file." - [^BAMReader bam-reader _] - (let [rdr (.data-reader bam-reader) - pointer-beg (.getFilePointer ^BGZFInputStream (.reader bam-reader)) - block-size (lsb/read-int rdr)] - (when (< block-size fixed-block-size) - (throw (Exception. (str "Invalid block size: " block-size)))) - {:data (lsb/read-bytes rdr block-size) - :pointer-beg pointer-beg - :pointer-end (.getFilePointer ^BGZFInputStream (.reader bam-reader))})) - -;; Reading a single alignment -;; -------------------------- - -(defn- read-alignment - "Reads a single alignment and decodes it entirely, returning a map including - full information. See also `cljam.bam.decoder/deep-decode-alignment-block`." - [bam-reader refs] - (-> (read-alignment-block bam-reader refs) - (decoder/deep-decode-alignment-block refs))) - -(defn- light-read-alignment - "Reads a single alignment and decodes it partialy, returning a map including - partial information. See also - `cljam.bam.decoder/light-decode-alignment-block`." - [bam-reader refs] - (-> (read-alignment-block bam-reader refs) - (decoder/light-decode-alignment-block refs))) - -(defn- pointer-read-alignment - "Reads a single alignment and decodes it partialy, returning a map including - partial information and file pointers. See also - `cljam.bam.decoder/pointer-decode-alignment-block`." - [bam-reader refs] - (-> (read-pointer-alignment-block bam-reader refs) - (decoder/pointer-decode-alignment-block refs))) - -;; +(defn- ^"[B" read-a-block! + "Reads a single alignment block from a reader." + [rdr] + (let [block-size (lsb/read-int rdr)] + (lsb/read-bytes rdr block-size))) (defn- read-to-finish - [^BAMReader rdr - ^Long start - ^Long finish - ^clojure.lang.IFn read-fn] - (let [r ^BGZFInputStream (.reader rdr)] - (when (< start finish) - (.seek r start) - (when-not (zero? (.available r)) - (let [result (read-fn rdr (.refs rdr)) - curr (.getFilePointer r)] - (cons result - (lazy-seq (read-to-finish rdr curr finish read-fn)))))))) - -(defn- read-alignments-first-only - "It should be equivalent to [(first (filter window @candidates))]" - [^BAMReader rdr spans window read-fn] - (loop [left-spans spans] - (when-let [span (first left-spans)] - (let [[^Long begin ^Long finish] span] - (or - (loop [left (read-to-finish rdr begin finish read-fn)] - (when-let [one (first left)] - (if (window one) - [one] - (recur (rest left))))) - (recur (rest left-spans))))))) - -(defn- read-alignments* - [^BAMReader rdr - ^String chr ^Long start ^Long end - deep-or-shallow] - (when (nil? @(.index-delay rdr)) - (throw (Exception. "BAM index not found"))) - (let [bai @(.index-delay rdr) - spans (if (= chr "*") - (bai/get-unplaced-spans bai) - (get-spans bai (ref-id (.refs rdr) chr) start end)) - window (fn [a] - (let [left ^long (:pos a) - right ^long (or (:end a) (get-end a))] - (and (= chr (:rname a)) - (if (= chr "*") - true - (and - (<= start right) - (>= end left)))))) - read-fn (case deep-or-shallow - :first-only light-read-alignment - :shallow light-read-alignment - :deep read-alignment - :pointer pointer-read-alignment) - candidates (flatten (keep (fn [[^long begin ^long finish]] - (read-to-finish rdr begin finish read-fn)) spans))] - (if (= deep-or-shallow :first-only) - (read-alignments-first-only rdr spans window read-fn) - (filter window candidates)))) - -(defn- read-alignments-sequentially* - [^BAMReader rdr deep-or-shallow] - (let [read-aln-fn (case deep-or-shallow - :shallow light-read-alignment - :deep read-alignment - :pointer pointer-read-alignment) - read-fn (fn read-fn* - [^BAMReader r ^clojure.lang.PersistentVector refs] - (if-let [a (try (read-aln-fn r refs) - (catch EOFException e nil))] - (cons a (lazy-seq (read-fn* r refs)))))] - (read-fn rdr (.refs rdr)))) + "Reads alignment blocks until reaches to the finish pointer or EOF." + ([^BAMReader rdr] + (let [r ^BGZFInputStream (.reader rdr) + dr (.data-reader rdr) + start (.getFilePointer r)] + (when-not (zero? (.available r)) + (let [data (read-a-block! dr) + curr (.getFilePointer r)] + (cons (BAMRawBlock. data start curr) + (lazy-seq (read-to-finish rdr))))))) + ([^BAMReader rdr + ^long start + ^long finish] + (let [r ^BGZFInputStream (.reader rdr) + dr (.data-reader rdr)] + (when (< start finish) + (.seek r start) + (when-not (zero? (.available r)) + (let [data (read-a-block! dr) + curr (.getFilePointer r)] + (cons (BAMRawBlock. data start curr) + (lazy-seq (read-to-finish rdr curr finish))))))))) (defn- read-blocks-sequentially* - [^BAMReader rdr mode] - (let [read-block-fn (case mode - :normal read-alignment-block - :coordinate read-coordinate-alignment-block - :queryname read-queryname-alignment-block - :pointer read-pointer-alignment-block) - read-fn (fn read-fn* [^BAMReader r ^clojure.lang.PersistentVector refs] - (if-let [b (try (read-block-fn r refs) - (catch EOFException e nil))] - (cons b (lazy-seq (read-fn* r refs)))))] - (read-fn rdr (.refs rdr)))) - -(defn- read-blocks* - [^BAMReader rdr - ^String chr ^Long start ^Long end] - (when (nil? @(.index-delay rdr)) - (throw (Exception. "BAM index not found"))) - (let [bai @(.index-delay rdr) - spans (get-spans bai (ref-id (.refs rdr) chr) start end) - window (fn [^clojure.lang.PersistentHashMap a] - (let [^Long left (:pos a)] - (and (= chr (:rname a)) - (<= start left end)))) - candidates (flatten (map (fn [[^Long begin ^Long finish]] - (read-to-finish rdr begin finish read-coordinate-alignment-block)) spans))] - (filter window candidates))) + "Reads blocks sequentially from current position. + Returns an eduction of decoded blocks." + [^BAMReader rdr decoder] + (eduction + (keep decoder) + (read-to-finish rdr))) + +(defn- read-blocks-randomly* + "Reads blocks crossing the given range using BAM index. + Returns an eduction of decoded blocks." + [^BAMReader rdr chr start end decoder] + (let [bai @(.index-delay rdr)] + (if (= chr "*") + (do (.seek ^BGZFInputStream (.reader rdr) (ffirst (bai/get-unplaced-spans bai))) + (read-blocks-sequentially* rdr decoder)) + (let [refs (.refs rdr)] + (->> (bai/get-spans bai (sam-util/ref-id refs chr) start end) + (eduction + (comp + (mapcat + (fn [[begin finish]] + (read-to-finish rdr begin finish))) + (keep #(decoder % start end))))))))) (defn load-headers + "Reads header section of BAM file and returns it as a map." [rdr] - (let [header (parse-header (lsb/read-string rdr (lsb/read-int rdr))) + (let [header (sam-util/parse-header (lsb/read-string rdr (lsb/read-int rdr))) n-ref (lsb/read-int rdr) refs (loop [i n-ref, ret []] (if (zero? i) diff --git a/src/cljam/io/bam/writer.clj b/src/cljam/io/bam/writer.clj index 4889f061..a9c28abb 100644 --- a/src/cljam/io/bam/writer.clj +++ b/src/cljam/io/bam/writer.clj @@ -1,15 +1,11 @@ (ns cljam.io.bam.writer - (:require [clojure.string :refer [split]] - [cljam.util :refer [ubyte]] - [cljam.io.protocols :as protocols] - [cljam.io.sam.util :refer [reg->bin normalize-bases fastq->phred - str->compressed-bases make-refs ref-id - stringify-header]] - [cljam.io.bam.common :refer [bam-magic fixed-block-size]] - [cljam.io.util.cigar :as cgr] - [cljam.io.util.lsb :as lsb]) - (:import [java.io DataOutputStream Closeable IOException EOFException] - [bgzf4j BGZFOutputStream])) + "Writer of BAM file format." + (:require [cljam.io.protocols :as protocols] + [cljam.io.sam.util :as sam-util] + [cljam.io.util.lsb :as lsb] + [cljam.io.bam.common :as common] + [cljam.io.bam.encoder :as encoder]) + (:import [java.io Closeable])) (declare write-header* write-refs* write-alignments* write-blocks*) @@ -38,132 +34,16 @@ ;; write ;; -(def ^:private fixed-tag-size 3) -(def ^:private fixed-binary-array-tag-size 5) - -(defn- get-pos [aln] - (dec (:pos aln))) - -(defn- get-end [aln] - (dec - (+ (:pos aln) - (cgr/count-ref (:cigar aln))))) - -(defn- get-l-seq [sam-alignment] - (count (:seq sam-alignment))) - -(defn- get-next-ref-id [sa refs] - (condp = (:rnext sa) - "*" -1 - "=" (if-let [id (ref-id refs (:rname sa))] id -1) - (if-let [id (ref-id refs (:rnext sa))] id -1))) - -(defn- get-ref-id [aln refs] - (if-let [id (ref-id refs (:rname aln))] id -1)) - -(defn- get-next-pos [sam-alignment] - (dec (:pnext sam-alignment))) - -(defn- get-tlen [sam-alignment] - (:tlen sam-alignment)) - -(defn- get-read-name [sam-alignment] - (:qname sam-alignment)) - -(defn- encode-cigar-op [op] - (case op - \M (byte 0) - \I (byte 1) - \D (byte 2) - \N (byte 3) - \S (byte 4) - \H (byte 5) - \P (byte 6) - \= (byte 7) - \X (byte 8))) - -(defn- encode-cigar [cigar] - (map #(bit-or (bit-shift-left (first %) 4) - (encode-cigar-op (second %))) - (cgr/parse cigar))) - -(defn- get-cigar [aln] - (encode-cigar (:cigar aln))) - -(defn- get-seq [sam-alignment] - (str->compressed-bases (:seq sam-alignment))) - -(defn- get-options-size [sam-alignment] - (->> (map - (fn [op] - (let [[tag value] (first (seq op))] - (+ fixed-tag-size - (case (first (:type value)) - \A 1 - \i 4 - \f 4 - \Z (inc (count (:value value))) - \B (let [[array-type & array] (split (:value value) #",")] - (+ fixed-binary-array-tag-size - (* (count array) - (case (first array-type) - \c 1 - \C 1 - \s 2 - \S 2 - \i 4 - \I 4 - \f 4 - 0)))))))) - (:options sam-alignment)) - (reduce +))) - -(defn- get-block-size [aln] - (let [read-length (count (:seq aln)) - cigar-length (cgr/count-op (:cigar aln))] - (+ fixed-block-size (count (:qname aln)) 1 - (* cigar-length 4) - (int (/ (inc read-length) 2)) - read-length - (get-options-size aln)))) - -(defn- encode-qual [sam-alignment] - (if (= (:qual sam-alignment) "*") - (byte-array (count (:seq sam-alignment)) (ubyte 0xff)) - (fastq->phred (:qual sam-alignment)))) - -(defn- write-tag-value [writer val-type value] - (case val-type - \A (lsb/write-char writer (char value)) - \i (lsb/write-int writer (int value)) - \f (lsb/write-float writer (float value)) - \Z (do (lsb/write-string writer value) - (lsb/write-char writer (char 0))) - ;; \H nil - \B (let [[array-type & array] (split value #",")] - (case (first array-type) - \c nil - \C nil - \s nil - \S (do - (lsb/write-bytes writer (byte-array 1 (byte \S))) - (lsb/write-int writer (count array)) - (doseq [v array] - (lsb/write-short writer (Short/parseShort v)))) - \i nil - \I nil - \f nil)))) - (defn write-header* [^BAMWriter wtr header] (let [w (.writer wtr) - header-string (str (stringify-header header) \newline)] - (lsb/write-bytes w (.getBytes ^String bam-magic)) ; magic + header-string (str (sam-util/stringify-header header) \newline)] + (lsb/write-bytes w (.getBytes ^String common/bam-magic)) ; magic (lsb/write-int w (count header-string)) (lsb/write-string w header-string))) (defn write-refs* [^BAMWriter wtr header] (let [w (.writer wtr) - refs (make-refs header)] + refs (sam-util/make-refs header)] (lsb/write-int w (count refs)) (doseq [ref refs] (lsb/write-int w (inc (count (:name ref)))) @@ -171,54 +51,15 @@ (lsb/write-bytes w (byte-array 1 (byte 0))) (lsb/write-int w (:len ref))))) -(defn- write-alignment [wrtr aln refs] - ;; block_size - (lsb/write-int wrtr (get-block-size aln)) - ;; refID - (lsb/write-int wrtr (get-ref-id aln refs)) - ;; pos - (lsb/write-int wrtr (get-pos aln)) - ;; bin_mq_nl - (lsb/write-ubyte wrtr (short (inc (count (:qname aln))))) - (lsb/write-ubyte wrtr (short (:mapq aln))) - (lsb/write-ushort wrtr (reg->bin (:pos aln) (get-end aln))) - ;; flag_nc - (lsb/write-ushort wrtr (cgr/count-op (:cigar aln))) - (lsb/write-ushort wrtr (:flag aln)) - ;; l_seq - (lsb/write-int wrtr (get-l-seq aln)) - ;; next_refID - (lsb/write-int wrtr (get-next-ref-id aln refs)) - ;; next_pos - (lsb/write-int wrtr (get-next-pos aln)) - ;; tlen - (lsb/write-int wrtr (get-tlen aln)) - ;; read_name - (lsb/write-string wrtr (get-read-name aln)) - (lsb/write-bytes wrtr (byte-array 1 (byte 0))) - ;; cigar - (doseq [cigar (get-cigar aln)] - (lsb/write-int wrtr cigar)) - ;; seq - (lsb/write-bytes wrtr (get-seq aln)) - ;; qual - (lsb/write-bytes wrtr (encode-qual aln)) - ;; options - (doseq [op (:options aln)] - (let [[tag value] (first (seq op))] - (lsb/write-short wrtr (short (bit-or (bit-shift-left (byte (second (name tag))) 8) - (byte (first (name tag)))))) - (lsb/write-bytes wrtr (.getBytes ^String (:type value))) - (write-tag-value wrtr (first (:type value)) (:value value))))) - (defn write-alignments* [^BAMWriter wtr alns header] (let [w (.writer wtr) - refs (make-refs header)] + refs (sam-util/make-refs header)] (doseq [a alns] - (write-alignment w a refs)))) + (lsb/write-int w (encoder/get-block-size a)) + (encoder/encode-alignment w a refs)))) (defn write-blocks* [^BAMWriter wtr blocks] (let [w (.writer wtr)] (doseq [b blocks] - (lsb/write-int w (:size b)) + (lsb/write-int w (alength ^bytes (:data b))) (lsb/write-bytes w (:data b))))) diff --git a/src/cljam/io/bam_index/writer.clj b/src/cljam/io/bam_index/writer.clj index 2ac7e5e0..46689770 100644 --- a/src/cljam/io/bam_index/writer.clj +++ b/src/cljam/io/bam_index/writer.clj @@ -9,6 +9,7 @@ [cljam.io.bam.decoder :as bam-decoder]) (:import [java.io DataOutputStream Closeable] [java.nio ByteBuffer ByteOrder] + [cljam.io.bam.decoder BAMPointerBlock] [cljam.io.bam_index.chunk Chunk])) (declare make-index-from-blocks) @@ -54,13 +55,14 @@ ;; ### Updating (defn- update-meta-data - [^MetaData meta-data aln] + [^MetaData meta-data ^BAMPointerBlock aln] (let [first-offset (.first-offset meta-data) last-offset (.last-offset meta-data) aligned-alns (.aligned-alns meta-data) unaligned-alns (.unaligned-alns meta-data) - {:keys [beg end]} (:chunk (:meta aln)) - aligned? (zero? (bit-and (:flag aln) 4))] + beg (.pointer-beg aln) + end (.pointer-end aln) + aligned? (zero? (bit-and (.flag aln) 4))] (MetaData. ;; first-offset (if (or (< (bgzf/compare beg first-offset) 1) @@ -75,23 +77,23 @@ (if-not aligned? (inc unaligned-alns) unaligned-alns)))) (defn- update-bin-index - [bin-index aln] - (let [bin (sam-util/compute-bin aln) - meta-chunk (:chunk (:meta aln)) - achunk (Chunk. (:beg meta-chunk) (:end meta-chunk))] + [bin-index ^BAMPointerBlock aln] + (let [bin (sam-util/reg->bin (.pos aln) (inc (.end aln))) + beg (.pointer-beg aln) + end (.pointer-end aln)] (assoc bin-index bin (if-let [chunks (get bin-index bin)] - (if (bgzf/same-or-adjacent-blocks? (.end ^Chunk (peek chunks)) (.beg achunk)) - (let [l (assoc (peek chunks) :end (.end achunk))] - (assoc chunks (dec (count chunks)) l)) - (conj chunks achunk)) - (vector achunk))))) + (let [last-chunk ^Chunk (peek chunks)] + (if (bgzf/same-or-adjacent-blocks? (.end last-chunk) beg) + (conj (pop chunks) (Chunk. (.beg last-chunk) end)) + (conj chunks (Chunk. beg end)))) + [(Chunk. beg end)])))) (defn- update-linear-index - [linear-index aln] - (let [beg (-> aln :meta :chunk :beg) - aln-beg (:pos aln) - aln-end (sam-util/get-end aln) + [linear-index ^BAMPointerBlock aln] + (let [beg (.pointer-beg aln) + aln-beg (.pos aln) + aln-end (.end aln) win-beg (if (zero? aln-end) (pos->lidx-offset (dec aln-beg)) (pos->lidx-offset aln-beg)) @@ -118,17 +120,17 @@ Returned index is still intermidate. It must be passed to finalize function in the final stage." [alns] - (loop [[aln & rest] alns - rid (:ref-id aln) + (loop [[^BAMPointerBlock aln & rest] alns + rid (.ref-id aln) idx-status (init-index-status) no-coordinate-alns 0 indices {}] (if aln - (let [rid' (:ref-id aln) + (let [rid' (.ref-id aln) new-ref? (not= rid' rid) idx-status' (update-index-status (if new-ref? (init-index-status) idx-status) aln) - no-coordinate-alns' (if (zero? (:pos aln)) + no-coordinate-alns' (if (zero? (.pos aln)) (inc no-coordinate-alns) no-coordinate-alns) indices' (if new-ref? @@ -225,25 +227,24 @@ (loop [i 0 index index] (if (< i nrefs) - (let [] - (if (get index i) - (recur (inc i) (-> index - (update-in [i :bin-index] finalize-bin-index) - (update-in [i :linear-index] finalize-linear-index))) - (recur (inc i) index))) + (if (get index i) + (recur (inc i) (-> index + (update-in [i :bin-index] finalize-bin-index) + (update-in [i :linear-index] finalize-linear-index))) + (recur (inc i) index)) index))) ;; Writing index ;; ----------- (defn- write-bin - [w bin chunks] + [w ^long bin chunks] (lsb/write-int w bin) ;; chunks (lsb/write-int w (count chunks)) - (doseq [{:keys [beg end]} chunks] - (lsb/write-long w beg) - (lsb/write-long w end))) + (doseq [^Chunk chunk chunks] + (lsb/write-long w (.beg chunk)) + (lsb/write-long w (.end chunk)))) (defn- write-meta-data [w meta-data] @@ -255,7 +256,7 @@ (lsb/write-long w (:unaligned-alns meta-data))) (defn- write-index*! - [wtr nrefs alns] + [wtr ^long nrefs alns] ;; magic (lsb/write-bytes wtr (.getBytes ^String bai-magic)) ;; n_ref @@ -297,14 +298,14 @@ make-index-fn (fn [blocks] (if (= n-threads 1) (->> blocks - (map #(bam-decoder/decode-alignment-block % nil :pointer)) + (eduction (map bam-decoder/decode-pointer-block)) make-index*) (cp/with-shutdown! [pool (cp/threadpool (dec n-threads))] (->> blocks - (partition-all *alignments-partition-size*) + (eduction (partition-all *alignments-partition-size*)) (cp/upmap pool (fn [sub-blocks] (->> sub-blocks - (map #(bam-decoder/decode-alignment-block % nil :pointer)) + (eduction (map bam-decoder/decode-pointer-block)) make-index*))) (reduce merge-index)))))] (->> blocks diff --git a/src/cljam/io/protocols.clj b/src/cljam/io/protocols.clj index a514e703..5193e089 100644 --- a/src/cljam/io/protocols.clj +++ b/src/cljam/io/protocols.clj @@ -4,6 +4,9 @@ (defrecord SAMAlignment [qname ^int flag rname ^int pos ^int end ^int mapq cigar rnext ^int pnext ^int tlen seq qual options]) +(defrecord SAMRegionBlock [data ^int ref-id ^int pos ^int end]) +(defrecord SAMCoordinateBlock [data ^int ref-id ^int pos ^int flag]) +(defrecord SAMQuerynameBlock [data qname ^int flag]) (defprotocol IReader (reader-path [this] @@ -24,10 +27,10 @@ "Returns header of the SAM/BAM file.") (read-refs [this] "Returns references of the SAM/BAM file.") - (read-alignments [this] [this region] [this region option] - "Reads alignments of the SAM/BAM file, returning the alignments as a lazy sequence.") + (read-alignments [this] [this region] + "Reads alignments of the SAM/BAM file, returning the alignments as an eduction.") (read-blocks [this] [this region] [this region option] - "Reads alignment blocks of the SAM/BAM file, returning the blocks as a lazy sequence.")) + "Reads alignment blocks of the SAM/BAM file, returning the blocks as an eduction.")) (defprotocol IAlignmentWriter (write-header [this header] diff --git a/src/cljam/io/sam.clj b/src/cljam/io/sam.clj index fc45fc8c..17168ac7 100644 --- a/src/cljam/io/sam.clj +++ b/src/cljam/io/sam.clj @@ -58,15 +58,12 @@ (protocols/read-refs rdr)) (defn read-alignments - "Reads alignments of the SAM/BAM file, returning the alignments as a lazy - sequence." + "Reads alignments of the SAM/BAM file, returning the alignments as an eduction." ([rdr] (protocols/read-alignments rdr)) - ([rdr region] (protocols/read-alignments rdr region)) - ([rdr region option] (protocols/read-alignments rdr region option))) + ([rdr region] (protocols/read-alignments rdr region))) (defn read-blocks - "Reads alignment blocks of the SAM/BAM file, returning the blocks as a lazy - sequence." + "Reads alignment blocks of the SAM/BAM file, returning the blocks as an eduction." ([rdr] (protocols/read-blocks rdr)) ([rdr region] (protocols/read-blocks rdr region)) ([rdr region option] (protocols/read-blocks rdr region option))) diff --git a/src/cljam/io/sam/reader.clj b/src/cljam/io/sam/reader.clj index 03de1d67..fd76ea4d 100644 --- a/src/cljam/io/sam/reader.clj +++ b/src/cljam/io/sam/reader.clj @@ -3,7 +3,8 @@ [clojure.tools.logging :as logging] [cljam.io.sam.util :as sam-util] [cljam.io.protocols :as protocols]) - (:import [java.io BufferedReader Closeable])) + (:import [java.io BufferedReader Closeable] + [cljam.io.protocols SAMCoordinateBlock SAMQuerynameBlock])) (declare read-alignments* read-blocks* read-alignments-in-region*) @@ -18,8 +19,8 @@ (.f this)) (read [this] (protocols/read this {})) - (read [this option] - (read-alignments* this)) + (read [this region] + (protocols/read-alignments this region)) protocols/IRegionReader (read-in-region [this region] (protocols/read-in-region this region {})) @@ -31,12 +32,10 @@ (read-refs [this] (vec (sam-util/make-refs (.header this)))) (read-alignments [this] - (protocols/read-alignments this {} {})) - (read-alignments [this region] - (protocols/read-alignments this region {})) - (read-alignments [this {:keys [chr start end] :as region} option] + (protocols/read-alignments this {})) + (read-alignments [this {:keys [chr start end] :as region}] (if (or chr start end) - (read-alignments-in-region* this region option) + (read-alignments-in-region* this region) (read-alignments* this))) (read-blocks [this] (protocols/read-blocks this {})) @@ -47,22 +46,24 @@ (defn- read-alignments* [^SAMReader sam-reader] - (when-let [line (.readLine ^BufferedReader (.reader sam-reader))] - (if-not (= (first line) \@) - (cons (sam-util/parse-alignment line) (lazy-seq (read-alignments* sam-reader))) - (lazy-seq (read-alignments* sam-reader))))) + (eduction + (comp + (drop-while (fn [[f]] (= f \@))) + (map sam-util/parse-alignment)) + (line-seq (.reader sam-reader)))) (defn- read-alignments-in-region* - [^SAMReader sam-reader {:keys [chr start end]} option] + [^SAMReader sam-reader {:keys [chr start end]}] (logging/warn "May cause degradation of performance.") - (filter - (fn [a] (and (if chr (= (:rname a) chr) true) - (if start (<= start (sam-util/get-end a)) true) - (if end (<= (:pos a) end) true))) + (eduction + (filter + (fn [a] (and (if chr (= (:rname a) chr) true) + (if start (<= start (sam-util/get-end a)) true) + (if end (<= (:pos a) end) true)))) (read-alignments* sam-reader))) (defn- parse-coordinate - [^String line] + [rname->ref-id ^String line] (let [t0 (.indexOf line (int \tab) 0) t1 (.indexOf line (int \tab) (unchecked-inc t0)) t2 (.indexOf line (int \tab) (unchecked-inc t1)) @@ -70,10 +71,7 @@ flag (Integer/parseInt (.substring line (unchecked-inc t0) t1)) rname (.substring line (unchecked-inc t1) t2) pos (Integer/parseInt (.substring line (unchecked-inc t2) t3))] - {:data line - :rname rname - :pos pos - :flag flag})) + (SAMCoordinateBlock. line (rname->ref-id rname 0) pos flag))) (defn- parse-qname [^String line] @@ -81,24 +79,24 @@ t1 (.indexOf line (int \tab) (unchecked-inc t0)) qname (.substring line 0 t0) flag (Integer/parseInt (.substring line (unchecked-inc t0) t1))] - {:data line - :qname qname - :flag flag})) - -(defn- read-blocks** - [parse-fn ^BufferedReader rdr] - (when-let [line (.readLine rdr)] - (if-not (= (first line) \@) - (cons (parse-fn line) (lazy-seq (read-blocks** parse-fn rdr))) - (lazy-seq (read-blocks** parse-fn rdr))))) + (SAMQuerynameBlock. line qname flag))) (defn- read-blocks* [^SAMReader sam-reader {:keys [mode] :or {mode :normal}}] - (let [parse-fn (case mode - :normal (fn [line] {:data line}) - :coordinate parse-coordinate - :queryname parse-qname)] - (read-blocks** parse-fn (.reader sam-reader)))) + (let [parse-fn (if (fn? mode) + mode + (case mode + :normal (fn [line] {:data line}) + :coordinate (->> (.header sam-reader) + :SQ + (into {"*" -1} (map-indexed (fn [i {:keys [SN]}] [SN i]))) + (partial parse-coordinate)) + :queryname parse-qname))] + (eduction + (comp + (drop-while (fn [[f]] (= f \@))) + (map parse-fn)) + (line-seq (.reader sam-reader))))) (defn- read-header* [^BufferedReader rdr] (->> (line-seq rdr) diff --git a/src/cljam/io/sam/util.clj b/src/cljam/io/sam/util.clj index f8849250..f311c954 100644 --- a/src/cljam/io/sam/util.clj +++ b/src/cljam/io/sam/util.clj @@ -3,8 +3,7 @@ (:require [clojure.string :as cstr] [proton.core :refer [as-long as-double hex->bytes]] cljam.io.protocols - [cljam.io.util.cigar :refer [count-ref]] - [cljam.util :refer [ubyte]]) + [cljam.io.util.cigar :as cigar]) (:import [java.nio CharBuffer ByteBuffer] [java.nio.charset StandardCharsets] [cljam.io.protocols SAMAlignment])) @@ -109,7 +108,7 @@ [line] (let [[qname flag rname pos-str mapq cigar rnext pnext tlen seq qual & options] (cstr/split line #"\t") pos (Integer/parseInt pos-str) - ref-length (count-ref cigar) + ref-length (cigar/count-ref cigar) end (if (zero? ref-length) 0 (int (dec (+ pos ref-length))))] (SAMAlignment. qname (Integer/parseInt flag) rname pos end (Integer/parseInt mapq) cigar rnext (Integer/parseInt pnext) (Integer/parseInt tlen) (parse-seq-text seq) @@ -183,11 +182,12 @@ :else 0))) (defn get-end + "Returns the end position in reference for the given alignment." [aln] - (dec - (+ (:pos aln) - (count-ref (or (:cigar-bytes (:meta aln)) - (:cigar aln)))))) + (let [ref-length (cigar/count-ref (:cigar aln))] + (if (zero? ref-length) + (:pos aln) + (dec (+ (:pos aln) ref-length))))) (defn compute-bin "Returns indexing bin based on alignment start and end." diff --git a/src/cljam/io/util/cigar.clj b/src/cljam/io/util/cigar.clj index e599d362..b3ef5a35 100644 --- a/src/cljam/io/util/cigar.clj +++ b/src/cljam/io/util/cigar.clj @@ -62,26 +62,43 @@ (def ^:private count-ref-str (memoize count-ref-str*)) -(def ^:private bases-op-bytes - #{(byte 0) - (byte 2) - (byte 3) - (byte 7) - (byte 8)}) - -(defn- count-ref-bytes +(defn count-ref-bytes + "Count covering length in reference from encoded CIGAR byte-array." [cigar-bytes] (let [buf (ByteBuffer/wrap cigar-bytes)] (.order buf ByteOrder/LITTLE_ENDIAN) - (loop [len 0] + (loop [ref-length 0] (if (.hasRemaining buf) (let [b (.getInt buf) op (bit-and b 0xF) n (bit-shift-right b 4)] - (if (bases-op-bytes op) - (recur (+ len n)) - (recur len))) - len)))) + (recur (+ ref-length (case op 0 n 2 n 3 n 7 n 8 n 0)))) + ref-length)))) + +(defn decode-cigar-and-ref-length + "Decode CIGAR string and length of alignment in reference. + Returns a vector of [cigar, ref-length]." + [cigar-bytes] + (let [buf (ByteBuffer/wrap cigar-bytes) + sb (StringBuilder.)] + (.order buf ByteOrder/LITTLE_ENDIAN) + (loop [ref-length 0] + (if (.hasRemaining buf) + (let [b (.getInt buf) + op (bit-and b 0xF) + n (bit-shift-right b 4)] + (doto sb + (.append n) + (.append (case op 0 \M 1 \I 2 \D 3 \N 4 \S 5 \H 6 \P 7 \= 8 \X))) + (recur (+ ref-length (case op 0 n 2 n 3 n 7 n 8 n 0)))) + [(.toString sb) ref-length])))) + +(defn encode-cigar + "Encodes CIGAR string into a sequence of longs." + [cigar] + (mapv #(bit-or (bit-shift-left (first %) 4) + (case (second %) \M 0 \I 1 \D 2 \N 3 \S 4 \H 5 \P 6 \= 7 \X 8)) + (parse cigar))) (defmulti count-ref "Returns length of reference bases." diff --git a/src/cljam/io/util/lsb.clj b/src/cljam/io/util/lsb.clj index d4ddf7f2..a81234a4 100644 --- a/src/cljam/io/util/lsb.clj +++ b/src/cljam/io/util/lsb.clj @@ -184,69 +184,84 @@ ;; Writing ;; ------- -(defn write-char - [^DataOutputStream w b] - (let [bb (gen-byte-buffer)] - (.putChar bb b) - (.write w (.array bb) 0 1) - nil)) +(defprotocol LSBWritable + "Provides feature of writing little-endian values." + (write-ubyte [this n] "Writes 1 byte.") + (write-char [this n] "Writes a 1-byte ascii character.") + (write-short [this n] "Writes a 2-byte short value.") + (write-ushort [this n] "Writes a 2-byte unsigned short value.") + (write-int [this n] "Writes a 4-byte integer value.") + (write-uint [this n] "Writes a 4-byte unsigned integer value.") + (write-long [this n] "Writes a 8-byte long value.") + (write-float [this n] "Writes a 4-byte float value.") + (write-bytes [this b] "Writes a byte-array.") + (write-string [this s] "Writes a string as a sequence of ascii characters.")) -(defn write-bytes - [^DataOutputStream w ^bytes b] - (.write w b 0 (alength b)) - nil) - -(defn write-ubyte - [^DataOutputStream w b] - (let [bb (gen-byte-buffer)] - (.putShort bb b) - (.write w (.array bb) 0 1) - nil)) - -(defn write-short - [^DataOutputStream w n] - (let [bb (gen-byte-buffer)] - (.putShort bb n) - (.write w (.array bb) 0 2) - nil)) - -(defn write-ushort - [^DataOutputStream w n] - (let [bb (gen-byte-buffer)] - (.putInt bb n) - (.write w (.array bb) 0 2) - nil)) - -(defn write-int - [^DataOutputStream w n] - (let [bb (gen-byte-buffer)] - (.putInt bb n) - (.write w (.array bb) 0 4) - nil)) - -(defn write-uint - [^DataOutputStream w n] - (let [bb (gen-byte-buffer)] - (.putInt bb (unchecked-int n)) - (.write w (.array bb) 0 4) - nil)) - -(defn write-long - [^DataOutputStream w n] - (let [bb (gen-byte-buffer)] - (.putLong bb n) - (.write w (.array bb) 0 8) - nil)) - -(defn write-float - [^DataOutputStream w n] - (let [bb (gen-byte-buffer)] - (.putFloat bb n) - (.write w (.array bb) 0 4) - nil)) +(extend-type ByteBuffer + LSBWritable + (write-ubyte [w b] + (.put w (unchecked-byte b))) + (write-char [w b] + (.put w (unchecked-byte (long b)))) + (write-short [w n] + (.order w ByteOrder/LITTLE_ENDIAN) + (.putShort w n)) + (write-ushort [w n] + (.order w ByteOrder/LITTLE_ENDIAN) + (.putShort w (unchecked-short n))) + (write-int [w n] + (.order w ByteOrder/LITTLE_ENDIAN) + (.putInt w n)) + (write-uint [w n] + (.order w ByteOrder/LITTLE_ENDIAN) + (.putInt w (unchecked-int n))) + (write-long [w n] + (.order w ByteOrder/LITTLE_ENDIAN) + (.putLong w n)) + (write-float [w n] + (.order w ByteOrder/LITTLE_ENDIAN) + (.putFloat w n)) + (write-bytes [w ^bytes b] + (.put w b)) + (write-string [w s] + (.put w (string->bytes s)))) -(defn write-string - [^DataOutputStream w s] - (let [data-bytes (string->bytes s)] - (.write w data-bytes 0 (alength data-bytes)) - nil)) +(extend-type DataOutputStream + LSBWritable + (write-ubyte [w b] + (let [bb (gen-byte-buffer)] + (.putShort bb b) + (.write w (.array bb) 0 1))) + (write-char [w b] + (let [bb (gen-byte-buffer)] + (.putChar bb b) + (.write w (.array bb) 0 1))) + (write-short [w n] + (let [bb (gen-byte-buffer)] + (.putShort bb n) + (.write w (.array bb) 0 2))) + (write-ushort [w n] + (let [bb (gen-byte-buffer)] + (.putInt bb n) + (.write w (.array bb) 0 2))) + (write-int [w n] + (let [bb (gen-byte-buffer)] + (.putInt bb n) + (.write w (.array bb) 0 4))) + (write-uint [w n] + (let [bb (gen-byte-buffer)] + (.putInt bb (unchecked-int n)) + (.write w (.array bb) 0 4))) + (write-long [w n] + (let [bb (gen-byte-buffer)] + (.putLong bb n) + (.write w (.array bb) 0 8))) + (write-float [w n] + (let [bb (gen-byte-buffer)] + (.putFloat bb n) + (.write w (.array bb) 0 4))) + (write-bytes [w ^bytes b] + (.write w b 0 (alength b))) + (write-string [w s] + (let [data-bytes (string->bytes s)] + (.write w data-bytes 0 (alength data-bytes))))) diff --git a/src/cljam/tools/cli.clj b/src/cljam/tools/cli.clj index 0debe88e..5a492462 100644 --- a/src/cljam/tools/cli.clj +++ b/src/cljam/tools/cli.clj @@ -5,7 +5,7 @@ [clj-sub-command.core :refer [sub-command candidate-message]] [clojure.tools.cli :refer [parse-opts]] [cljam.io.sam :as sam] - [cljam.io.sam.util :refer [stringify-header stringify-alignment]] + [cljam.io.sam.util :as sam-util] [cljam.io.sequence :as cseq] [cljam.algo.bam-indexer :as bai] [cljam.algo.normal :as normal] @@ -66,9 +66,9 @@ "sam" (sam/sam-reader f) "bam" (sam/bam-reader f))] (when (:header options) - (println (stringify-header (sam/read-header r)))) - (doseq [aln (sam/read-alignments r {})] - (println (stringify-alignment aln)))))) + (println (sam-util/stringify-header (sam/read-header r)))) + (doseq [aln (sam/read-alignments r)] + (println (sam-util/stringify-alignment aln)))))) nil) ;; ### convert command diff --git a/test/cljam/algo/t_bam_indexer.clj b/test/cljam/algo/t_bam_indexer.clj index 1be564ce..f6d71115 100644 --- a/test/cljam/algo/t_bam_indexer.clj +++ b/test/cljam/algo/t_bam_indexer.clj @@ -22,7 +22,7 @@ (is (.isFile (cio/file (str temp-file-sorted ".bai")))) (is (same-file? (str temp-file-sorted ".bai") test-bai-file)) (is (= (with-open [r (sam/bam-reader temp-file-sorted)] - (doall (sam/read-alignments r {:chr "ref" :start 0 :end 1000}))) + (doall (seq (sam/read-alignments r {:chr "ref" :start 0 :end 1000})))) (filter #(= "ref" (:rname %)) (:alignments test-sam-sorted-by-pos)))))) @@ -46,7 +46,7 @@ (is (not-throw? (bai/create-index sorted-f (str sorted-f ".bai")))) (is (.isFile (cio/file (str sorted-f ".bai")))) (is (= (with-open [r (sam/bam-reader sorted-f)] - (doall (sam/read-alignments r {:chr "ref" :start 0 :end 1000}))) + (doall (seq (sam/read-alignments r {:chr "ref" :start 0 :end 1000})))) (filter #(= "ref" (:rname %)) (:alignments test-sam-incomplete-alignments-sorted-by-pos)))) ;; TODO: need more strictly check to .bai files @@ -73,7 +73,7 @@ (is (same-file? temp-file-sorted-bai temp-file-sorted-bai-2)) (with-open [r (sam/bam-reader temp-file-sorted)] ;; Random read with different number of spans. - (are [?param ?counts] (= (count (sam/read-alignments r ?param)) ?counts) + (are [?param ?counts] (= (count (seq (sam/read-alignments r ?param))) ?counts) {:chr "chr1" :start 23000000 :end 23001000 :depth :deep} 46 ;; 1 span {:chr "chr1" :start 24900000 :end 24902000 :depth :deep} 3 ;; 2 spans {:chr "chr1" :start 24000000 :end 24001000 :depth :deep} 6 ;; 3 spans @@ -89,7 +89,7 @@ (is (not-throw? (bai/create-index temp-file-sorted (str temp-file-sorted ".bai")))) (with-open [r (sam/bam-reader temp-file-sorted)] - (is (= (count (sam/read-alignments r {:chr "*"})) 4348)) + (is (= (count (seq (sam/read-alignments r {:chr "*"}))) 4348)) (is (every? (fn [a] (and (= (:mapq a) 0) (= (:pos a) 0) (= (:tlen a) 0) diff --git a/test/cljam/io/t_sam.clj b/test/cljam/io/t_sam.clj index 0ad7dd50..18656042 100644 --- a/test/cljam/io/t_sam.clj +++ b/test/cljam/io/t_sam.clj @@ -97,9 +97,10 @@ (is (= (sam/read-refs rdr) test-sam-refs)) (is (pointer= (sam/read-alignments rdr {:depth :pointer}) (:alignments test-sam)))) - (with-open [rdr (sam/bam-reader temp-bam-file)] - (is (= (sam/read-refs rdr) test-sam-refs)) - (is (= (data->clj (sam/read-blocks rdr)) test-sam-data))) + (doseq [mode [:normal :region :coordinate :queryname :pointer]] + (with-open [rdr (sam/bam-reader temp-bam-file)] + (is (= (sam/read-refs rdr) test-sam-refs)) + (is (= (data->clj (sam/read-blocks rdr {} {:mode mode})) test-sam-data)))) (with-open [rdr (sam/bam-reader temp-bam-file)] (is (= (sam/read-refs rdr) test-sam-refs)) (is (thrown? Exception (data->clj (sam/read-blocks rdr {:chr "ref2"}))))))) @@ -107,37 +108,40 @@ (deftest bam-reader-with-index-test (with-before-after {:before (prepare-cache!) :after (clean-cache!)} - (with-open [rdr (sam/bam-reader test-sorted-bam-file)] - (is (= (sam/read-alignments rdr {:chr "ref2"}) - (drop 6 (:alignments test-sam-sorted-by-pos))))) - (with-open [rdr (sam/bam-reader test-sorted-bam-file)] - (is (= (sam/read-alignments rdr {:chr "ref2" :start 21}) - (drop 7 (:alignments test-sam-sorted-by-pos))))) - (with-open [rdr (sam/bam-reader test-sorted-bam-file)] - (is (= (sam/read-alignments rdr {:chr "ref2" :end 9}) - (take 3 (drop 6 (:alignments test-sam-sorted-by-pos)))))) - (with-open [rdr (sam/bam-reader test-sorted-bam-file)] - (is (= (sam/read-alignments rdr {:chr "ref2" :start 10 :end 12}) - (take 5 (drop 6 (:alignments test-sam-sorted-by-pos)))))) - (with-open [rdr (sam/bam-reader test-sorted-bam-file)] - (is (= (data->clj (sam/read-blocks rdr)) - test-sorted-bam-data))) - (with-open [rdr (sam/bam-reader test-sorted-bam-file)] - (is (= (map #(dissoc % :pos :qname :rname :flag :ref-id) - (data->clj (sam/read-blocks rdr {:chr "ref2"}))) - (drop 6 test-sorted-bam-data)))) - (with-open [rdr (sam/bam-reader test-sorted-bam-file)] - (is (= (map #(dissoc % :pos :qname :rname :flag :ref-id) - (data->clj (sam/read-blocks rdr {:chr "ref2" :start 2}))) - (drop 7 test-sorted-bam-data)))) - (with-open [rdr (sam/bam-reader test-sorted-bam-file)] - (is (= (map #(dissoc % :pos :qname :rname :flag :ref-id) - (data->clj (sam/read-blocks rdr {:chr "ref2" :end 2}))) - (take 2 (drop 6 test-sorted-bam-data))))) - (with-open [rdr (sam/bam-reader test-sorted-bam-file)] - (is (= (map #(dissoc % :pos :qname :rname :flag :ref-id) - (data->clj (sam/read-blocks rdr {:chr "ref2" :start 4 :end 12}))) - (take 3 (drop 8 test-sorted-bam-data))))))) + (testing "read-alignments" + (with-open [rdr (sam/bam-reader test-sorted-bam-file)] + (is (= (sam/read-alignments rdr {:chr "ref2"}) + (drop 6 (:alignments test-sam-sorted-by-pos))))) + (with-open [rdr (sam/bam-reader test-sorted-bam-file)] + (is (= (sam/read-alignments rdr {:chr "ref2" :start 21}) + (drop 7 (:alignments test-sam-sorted-by-pos))))) + (with-open [rdr (sam/bam-reader test-sorted-bam-file)] + (is (= (sam/read-alignments rdr {:chr "ref2" :end 9}) + (take 3 (drop 6 (:alignments test-sam-sorted-by-pos)))))) + (with-open [rdr (sam/bam-reader test-sorted-bam-file)] + (is (= (sam/read-alignments rdr {:chr "ref2" :start 10 :end 12}) + (take 5 (drop 6 (:alignments test-sam-sorted-by-pos))))))) + (testing "read-blocks" + (with-open [rdr (sam/bam-reader test-sorted-bam-file)] + (is (= (data->clj (sam/read-blocks rdr)) + test-sorted-bam-data))) + (doseq [mode [:normal :region :coordinate :queryname :pointer]] + (with-open [rdr (sam/bam-reader test-sorted-bam-file)] + (is (= (map #(dissoc % :pos :qname :rname :flag :ref-id) + (data->clj (sam/read-blocks rdr {:chr "ref2"} {:mode mode}))) + (drop 6 test-sorted-bam-data)))) + (with-open [rdr (sam/bam-reader test-sorted-bam-file)] + (is (= (map #(dissoc % :pos :qname :rname :flag :ref-id) + (data->clj (sam/read-blocks rdr {:chr "ref2" :start 2} {:mode mode}))) + (drop 6 test-sorted-bam-data)))) + (with-open [rdr (sam/bam-reader test-sorted-bam-file)] + (is (= (map #(dissoc % :pos :qname :rname :flag :ref-id) + (data->clj (sam/read-blocks rdr {:chr "ref2" :end 2} {:mode mode}))) + (take 2 (drop 6 test-sorted-bam-data))))) + (with-open [rdr (sam/bam-reader test-sorted-bam-file)] + (is (= (map #(dissoc % :pos :qname :rname :flag :ref-id) + (data->clj (sam/read-blocks rdr {:chr "ref2" :start 4 :end 12} {:mode mode}))) + (take 5 (drop 6 test-sorted-bam-data))))))))) (deftest bam-reader-invalid-test (with-before-after {:before (prepare-cache!) diff --git a/test/cljam/t_common.clj b/test/cljam/t_common.clj index 50535bae..245ba805 100644 --- a/test/cljam/t_common.clj +++ b/test/cljam/t_common.clj @@ -78,12 +78,12 @@ (defn slurp-sam-for-test [f] (with-open [r (sam/sam-reader f)] {:header (sam/read-header r) - :alignments (doall (sam/read-alignments r {}))})) + :alignments (doall (seq (sam/read-alignments r {})))})) (defn slurp-bam-for-test [f] (with-open [r (sam/bam-reader f)] {:header (sam/read-header r) - :alignments (doall (sam/read-alignments r {}))})) + :alignments (doall (seq (sam/read-alignments r {})))})) ;; spit (for test) (defn spit-sam-for-test [f sam] @@ -324,35 +324,35 @@ {:len 57772954 :name "chrY"}]) (defn data->clj [data] - (map #(update % :data vec) data)) + (map (fn [m] {:data (vec (:data m))}) data)) (def test-sam-data - [{:data [0 0 0 0 28 0 0 0 5 30 73 18 2 0 16 0 5 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 114 48 48 51 0 101 0 0 0 80 0 0 0 -127 68 32 -1 -1 -1 -1 -1], :size 53} - {:data [0 0 0 0 6 0 0 0 5 30 73 18 5 0 -93 0 19 0 0 0 0 0 0 0 36 0 0 0 39 0 0 0 114 48 48 49 0 -128 0 0 0 65 0 0 0 64 0 0 0 18 0 0 0 48 0 0 0 -120 20 24 17 20 20 65 -127 40 64 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 88 88 66 83 4 0 0 0 17 49 2 0 20 0 112 0], :size 102} - {:data [0 0 0 0 8 0 0 0 5 30 73 18 9 0 0 0 17 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 114 48 48 50 0 20 0 0 0 33 0 0 0 96 0 0 0 22 0 0 0 17 0 0 0 22 0 0 0 17 0 0 0 64 0 0 0 33 0 0 0 17 17 65 -127 20 68 24 17 16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1], :size 99} - {:data [0 0 0 0 8 0 0 0 5 30 73 18 2 0 0 0 6 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 114 48 48 51 0 85 0 0 0 96 0 0 0 20 40 17 -1 -1 -1 -1 -1 -1], :size 54} - {:data [1 0 0 0 5 0 0 0 3 30 73 18 3 0 0 0 26 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 120 51 0 -112 0 0 0 65 0 0 0 -48 0 0 0 -120 24 17 17 33 17 -127 24 -127 20 -126 -127 33 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30], :size 86} - {:data [0 0 0 0 15 0 0 0 5 30 73 18 4 0 0 0 12 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 114 48 48 52 0 96 0 0 0 -29 0 0 0 17 0 0 0 80 0 0 0 24 20 40 40 33 66 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1], :size 71} - {:data [0 0 0 0 36 0 0 0 5 30 73 18 1 0 83 0 9 0 0 0 0 0 0 0 6 0 0 0 -39 -1 -1 -1 114 48 48 49 0 -112 0 0 0 33 66 66 33 -128 -1 -1 -1 -1 -1 -1 -1 -1 -1], :size 55} - {:data [1 0 0 0 0 0 0 0 3 30 73 18 1 0 0 0 20 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 120 49 0 64 1 0 0 20 72 -120 -127 -127 17 18 17 24 17 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30], :size 69} - {:data [1 0 0 0 1 0 0 0 3 30 73 18 1 0 0 0 21 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 120 50 0 80 1 0 0 68 -120 -120 24 17 17 33 17 -127 24 -128 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30], :size 71} - {:data [1 0 0 0 9 0 0 0 3 30 73 18 1 0 0 0 25 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 120 52 0 -112 1 0 0 33 17 -127 24 -127 20 -126 -127 33 65 66 17 32 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30], :size 77} - {:data [1 0 0 0 13 0 0 0 3 30 73 18 1 0 0 0 23 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 120 54 0 112 1 0 0 -127 24 -127 20 -126 -127 33 65 66 17 40 16 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30], :size 74} - {:data [1 0 0 0 11 0 0 0 3 30 73 18 1 0 0 0 24 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 120 53 0 -128 1 0 0 17 -127 24 -127 20 -126 -127 33 65 66 17 40 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30], :size 75}]) + [{:data [0 0 0 0 28 0 0 0 5 30 73 18 2 0 16 0 5 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 114 48 48 51 0 101 0 0 0 80 0 0 0 -127 68 32 -1 -1 -1 -1 -1]} + {:data [0 0 0 0 6 0 0 0 5 30 73 18 5 0 -93 0 19 0 0 0 0 0 0 0 36 0 0 0 39 0 0 0 114 48 48 49 0 -128 0 0 0 65 0 0 0 64 0 0 0 18 0 0 0 48 0 0 0 -120 20 24 17 20 20 65 -127 40 64 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 88 88 66 83 4 0 0 0 17 49 2 0 20 0 112 0]} + {:data [0 0 0 0 8 0 0 0 5 30 73 18 9 0 0 0 17 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 114 48 48 50 0 20 0 0 0 33 0 0 0 96 0 0 0 22 0 0 0 17 0 0 0 22 0 0 0 17 0 0 0 64 0 0 0 33 0 0 0 17 17 65 -127 20 68 24 17 16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]} + {:data [0 0 0 0 8 0 0 0 5 30 73 18 2 0 0 0 6 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 114 48 48 51 0 85 0 0 0 96 0 0 0 20 40 17 -1 -1 -1 -1 -1 -1]} + {:data [1 0 0 0 5 0 0 0 3 30 73 18 3 0 0 0 26 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 120 51 0 -112 0 0 0 65 0 0 0 -48 0 0 0 -120 24 17 17 33 17 -127 24 -127 20 -126 -127 33 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30]} + {:data [0 0 0 0 15 0 0 0 5 30 73 18 4 0 0 0 12 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 114 48 48 52 0 96 0 0 0 -29 0 0 0 17 0 0 0 80 0 0 0 24 20 40 40 33 66 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]} + {:data [0 0 0 0 36 0 0 0 5 30 73 18 1 0 83 0 9 0 0 0 0 0 0 0 6 0 0 0 -39 -1 -1 -1 114 48 48 49 0 -112 0 0 0 33 66 66 33 -128 -1 -1 -1 -1 -1 -1 -1 -1 -1]} + {:data [1 0 0 0 0 0 0 0 3 30 73 18 1 0 0 0 20 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 120 49 0 64 1 0 0 20 72 -120 -127 -127 17 18 17 24 17 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30]} + {:data [1 0 0 0 1 0 0 0 3 30 73 18 1 0 0 0 21 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 120 50 0 80 1 0 0 68 -120 -120 24 17 17 33 17 -127 24 -128 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30]} + {:data [1 0 0 0 9 0 0 0 3 30 73 18 1 0 0 0 25 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 120 52 0 -112 1 0 0 33 17 -127 24 -127 20 -126 -127 33 65 66 17 32 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30]} + {:data [1 0 0 0 13 0 0 0 3 30 73 18 1 0 0 0 23 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 120 54 0 112 1 0 0 -127 24 -127 20 -126 -127 33 65 66 17 40 16 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30]} + {:data [1 0 0 0 11 0 0 0 3 30 73 18 1 0 0 0 24 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 120 53 0 -128 1 0 0 17 -127 24 -127 20 -126 -127 33 65 66 17 40 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30]}]) (def test-sorted-bam-data - [{:data [0 0 0 0 6 0 0 0 5 30 73 18 5 0 -93 0 19 0 0 0 0 0 0 0 36 0 0 0 39 0 0 0 114 48 48 49 0 -128 0 0 0 65 0 0 0 64 0 0 0 18 0 0 0 48 0 0 0 -120 20 24 17 20 20 65 -127 40 64 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 88 88 66 83 4 0 0 0 17 49 2 0 20 0 112 0], :size 102} - {:data [0 0 0 0 8 0 0 0 5 30 73 18 9 0 0 0 17 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 114 48 48 50 0 20 0 0 0 33 0 0 0 96 0 0 0 22 0 0 0 17 0 0 0 22 0 0 0 17 0 0 0 64 0 0 0 33 0 0 0 17 17 65 -127 20 68 24 17 16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1], :size 99} - {:data [0 0 0 0 8 0 0 0 5 30 73 18 2 0 0 0 6 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 114 48 48 51 0 85 0 0 0 96 0 0 0 20 40 17 -1 -1 -1 -1 -1 -1], :size 54} - {:data [0 0 0 0 15 0 0 0 5 30 73 18 4 0 0 0 12 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 114 48 48 52 0 96 0 0 0 -29 0 0 0 17 0 0 0 80 0 0 0 24 20 40 40 33 66 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1], :size 71} - {:data [0 0 0 0 28 0 0 0 5 30 73 18 2 0 16 0 5 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 114 48 48 51 0 101 0 0 0 80 0 0 0 -127 68 32 -1 -1 -1 -1 -1], :size 53} - {:data [0 0 0 0 36 0 0 0 5 30 73 18 1 0 83 0 9 0 0 0 0 0 0 0 6 0 0 0 -39 -1 -1 -1 114 48 48 49 0 -112 0 0 0 33 66 66 33 -128 -1 -1 -1 -1 -1 -1 -1 -1 -1], :size 55} - {:data [1 0 0 0 0 0 0 0 3 30 73 18 1 0 0 0 20 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 120 49 0 64 1 0 0 20 72 -120 -127 -127 17 18 17 24 17 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30], :size 69} - {:data [1 0 0 0 1 0 0 0 3 30 73 18 1 0 0 0 21 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 120 50 0 80 1 0 0 68 -120 -120 24 17 17 33 17 -127 24 -128 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30], :size 71} - {:data [1 0 0 0 5 0 0 0 3 30 73 18 3 0 0 0 26 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 120 51 0 -112 0 0 0 65 0 0 0 -48 0 0 0 -120 24 17 17 33 17 -127 24 -127 20 -126 -127 33 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30], :size 86} - {:data [1 0 0 0 9 0 0 0 3 30 73 18 1 0 0 0 25 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 120 52 0 -112 1 0 0 33 17 -127 24 -127 20 -126 -127 33 65 66 17 32 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30], :size 77} - {:data [1 0 0 0 11 0 0 0 3 30 73 18 1 0 0 0 24 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 120 53 0 -128 1 0 0 17 -127 24 -127 20 -126 -127 33 65 66 17 40 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30], :size 75} - {:data [1 0 0 0 13 0 0 0 3 30 73 18 1 0 0 0 23 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 120 54 0 112 1 0 0 -127 24 -127 20 -126 -127 33 65 66 17 40 16 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30], :size 74}]) + [{:data [0 0 0 0 6 0 0 0 5 30 73 18 5 0 -93 0 19 0 0 0 0 0 0 0 36 0 0 0 39 0 0 0 114 48 48 49 0 -128 0 0 0 65 0 0 0 64 0 0 0 18 0 0 0 48 0 0 0 -120 20 24 17 20 20 65 -127 40 64 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 88 88 66 83 4 0 0 0 17 49 2 0 20 0 112 0]} + {:data [0 0 0 0 8 0 0 0 5 30 73 18 9 0 0 0 17 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 114 48 48 50 0 20 0 0 0 33 0 0 0 96 0 0 0 22 0 0 0 17 0 0 0 22 0 0 0 17 0 0 0 64 0 0 0 33 0 0 0 17 17 65 -127 20 68 24 17 16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]} + {:data [0 0 0 0 8 0 0 0 5 30 73 18 2 0 0 0 6 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 114 48 48 51 0 85 0 0 0 96 0 0 0 20 40 17 -1 -1 -1 -1 -1 -1]} + {:data [0 0 0 0 15 0 0 0 5 30 73 18 4 0 0 0 12 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 114 48 48 52 0 96 0 0 0 -29 0 0 0 17 0 0 0 80 0 0 0 24 20 40 40 33 66 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]} + {:data [0 0 0 0 28 0 0 0 5 30 73 18 2 0 16 0 5 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 114 48 48 51 0 101 0 0 0 80 0 0 0 -127 68 32 -1 -1 -1 -1 -1]} + {:data [0 0 0 0 36 0 0 0 5 30 73 18 1 0 83 0 9 0 0 0 0 0 0 0 6 0 0 0 -39 -1 -1 -1 114 48 48 49 0 -112 0 0 0 33 66 66 33 -128 -1 -1 -1 -1 -1 -1 -1 -1 -1]} + {:data [1 0 0 0 0 0 0 0 3 30 73 18 1 0 0 0 20 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 120 49 0 64 1 0 0 20 72 -120 -127 -127 17 18 17 24 17 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30]} + {:data [1 0 0 0 1 0 0 0 3 30 73 18 1 0 0 0 21 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 120 50 0 80 1 0 0 68 -120 -120 24 17 17 33 17 -127 24 -128 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30]} + {:data [1 0 0 0 5 0 0 0 3 30 73 18 3 0 0 0 26 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 120 51 0 -112 0 0 0 65 0 0 0 -48 0 0 0 -120 24 17 17 33 17 -127 24 -127 20 -126 -127 33 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30]} + {:data [1 0 0 0 9 0 0 0 3 30 73 18 1 0 0 0 25 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 120 52 0 -112 1 0 0 33 17 -127 24 -127 20 -126 -127 33 65 66 17 32 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30]} + {:data [1 0 0 0 11 0 0 0 3 30 73 18 1 0 0 0 24 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 120 53 0 -128 1 0 0 17 -127 24 -127 20 -126 -127 33 65 66 17 40 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30]} + {:data [1 0 0 0 13 0 0 0 3 30 73 18 1 0 0 0 23 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 120 54 0 112 1 0 0 -127 24 -127 20 -126 -127 33 65 66 17 40 16 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30]}]) (def test-sorted-bam-levels [{:type "i", :value 0} @@ -452,7 +452,7 @@ (and (zero? rf) (zero? ps) (pos? fl))) (reduced nil) x))) - (sam/read-alignments r)))))) + (seq (sam/read-alignments r))))))) (defn qname-sorted? [f] (with-open [r (sam/reader f)] @@ -465,7 +465,7 @@ (and (zero? qn) (pos? fl))) (reduced nil) x))) - (sam/read-alignments r))))) + (seq (sam/read-alignments r)))))) ;; Utilities ;; --------- @@ -482,8 +482,8 @@ r2 (sam/reader f2)] (and (= (sam/read-header r1) (sam/read-header r2)) - (= (sam/read-alignments r1 {}) - (sam/read-alignments r2 {}))))) + (= (seq (sam/read-alignments r1)) + (seq (sam/read-alignments r2)))))) ;;;; FASTA