Skip to content

Commit

Permalink
Merge pull request #22 from chrovis/feature/optimize-decoding-bam
Browse files Browse the repository at this point in the history
Improve performance of reading BAM files.
  • Loading branch information
totakke committed Jan 25, 2017
2 parents 1b4d22d + f93a84a commit dd3f076
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 17 deletions.
14 changes: 7 additions & 7 deletions project.clj
Expand Up @@ -5,19 +5,19 @@
:url "http://www.apache.org/licenses/LICENSE-2.0.html"}
:dependencies [[org.clojure/tools.logging "0.3.1"]
[org.clojure/tools.cli "0.3.5"]
[org.apache.commons/commons-compress "1.12"]
[org.apache.commons/commons-compress "1.13"]
[me.raynes/fs "1.4.6"]
[clj-sub-command "0.2.3"]
[digest "1.4.4"]
[clj-sub-command "0.3.0"]
[digest "1.4.5"]
[bgzf4j "0.1.0"]
[com.climate/claypoole "1.1.3"]
[com.climate/claypoole "1.1.4"]
[camel-snake-kebab "0.4.0"]]
:plugins [[lein-midje "3.2"]]
:plugins [[lein-midje "3.2.1"]]
:profiles {:dev {:dependencies [[org.clojure/clojure "1.8.0"]
[midje "1.8.3" :exclusions [slingshot]]
[cavia "0.2.3"]]
[cavia "0.3.0"]]
:plugins [[lein-bin "0.3.5"]
[lein-codox "0.9.5"]
[lein-codox "0.10.2"]
[lein-marginalia "0.9.0"]]
:main cljam.main
:aot [cljam.main]
Expand Down
7 changes: 3 additions & 4 deletions src/cljam/bam/decoder.clj
Expand Up @@ -2,8 +2,7 @@
"Decoder of BAM alignment blocks."
(:require [clojure.string :refer [join]]
[cljam.util :refer [ubyte hex-string->bytes]]
[cljam.util.sam-util :refer [phred->fastq compressed-bases->chars
ref-name]]
[cljam.util.sam-util :refer [phred->fastq compressed-bases->str ref-name]]
[cljam.lsb :as lsb]
[cljam.bam.common :refer [fixed-block-size]])
(:import java.util.Arrays
Expand Down Expand Up @@ -80,7 +79,7 @@
(phred->fastq b)))

(defn decode-seq [seq-bytes length]
(join (compressed-bases->chars length seq-bytes 0)))
(compressed-bases->str length seq-bytes 0))

(defn decode-next-ref-id [refs n rname]
(cond
Expand Down Expand Up @@ -133,7 +132,7 @@
qname (lsb/read-string buffer (dec l-read-name))
_ (lsb/skip buffer 1)
cigar (decode-cigar (lsb/read-bytes buffer (* n-cigar-op 4)))
seq (decode-seq (lsb/read-bytes buffer (/ (inc l-seq) 2)) l-seq)
seq (decode-seq (lsb/read-bytes buffer (quot (inc l-seq) 2)) l-seq)
qual (decode-qual (lsb/read-bytes buffer l-seq))
rest (lsb/read-bytes buffer (options-size (:size block)
l-read-name
Expand Down
35 changes: 32 additions & 3 deletions src/cljam/util/sam_util.clj
Expand Up @@ -2,7 +2,9 @@
"Utilities related to SAM/BAM format."
(:require [clojure.string :as cstr]
[cljam.cigar :refer [count-ref]]
[cljam.util :refer [ubyte str->int str->float]]))
[cljam.util :refer [ubyte str->int str->float]])
(:import [java.nio CharBuffer ByteBuffer]
[java.nio.charset StandardCharsets]))

;;; parse

Expand Down Expand Up @@ -318,6 +320,26 @@
(aset-byte result i (byte (bit-or h l)))
(recur (inc i)))))))

(def ^:const ^:private seq-byte-table
(let [nibble-table "=ACMGRSVTWYHKDBN"]
(->> (for [i nibble-table j nibble-table] [i j])
(apply concat)
cstr/join)))

(defn compressed-bases->str
"Decode a sequence from byte array to String."
[^long length ^bytes compressed-bases ^long compressed-offset]
(let [cb (CharBuffer/allocate (inc length))
bb (ByteBuffer/wrap compressed-bases)]
(.position bb compressed-offset)
(dotimes [_ (quot (inc length) 2)]
(let [i (-> (.get bb) (bit-and 0xff) (* 2))]
(.put cb (.charAt seq-byte-table i))
(.put cb (.charAt seq-byte-table (inc i)))))
(.limit cb length)
(.flip cb)
(.toString cb)))

(defn compressed-bases->chars [length compressed-bases compressed-offset]
(let [bases (apply concat
(for [i (range 1 length) :when (odd? i)]
Expand Down Expand Up @@ -353,9 +375,16 @@
(defmulti phred->fastq class)

(defmethod phred->fastq (class (byte-array nil))
[b]
[^bytes b]
(when-not (nil? b)
(cstr/join (map #(phred->fastq (int (bit-and % 0xff))) b))))
(let [bb (ByteBuffer/wrap b)
cb (CharBuffer/allocate (alength b))]
(loop []
(when (.hasRemaining bb)
(.put cb (char (+ 33 (.get bb))))
(recur)))
(.flip cb)
(.toString cb))))

(def ^:const max-phred-score 93)

Expand Down
4 changes: 2 additions & 2 deletions test/cljam/t_common.clj
@@ -1,6 +1,6 @@
(ns cljam.t-common
(:use [clojure.java.io :only [file]])
(:require [pandect.core :refer [md5-file]]
(:require [digest]
[cljam.sam :as sam]
[cljam.bam :as bam]
[cljam.io :as io]
Expand Down Expand Up @@ -284,7 +284,7 @@
(defn same-file?
"Returns true if the two files' MD5 hash are same, false if not."
[f1 f2]
(= (md5-file f1) (md5-file f2)))
(= (digest/sha1 (file f1)) (digest/sha1 (file f2))))

;;;; FASTA

Expand Down
26 changes: 25 additions & 1 deletion test/cljam/util/t_sam_util.clj
Expand Up @@ -3,7 +3,8 @@
(:require [midje.sweet :refer :all]
[cljam.t-common :refer :all]
[cljam.util :as util]
[cljam.util.sam-util :as sam-util]))
[cljam.util.sam-util :as sam-util]
[clojure.string :as cstr]))

(tabular
(fact "about char->compressed-base-high"
Expand Down Expand Up @@ -99,6 +100,29 @@
(util/ubyte 0xd0) \D
(util/ubyte 0xe0) \B)

(def nibble-table "=ACMGRSVTWYHKDBN")

(tabular
(fact
"about compressed-bases->str"
(let [ba (byte-array (mapv util/ubyte ?bases))]
(cstr/join (sam-util/compressed-bases->chars ?length ba ?offset)) => ?expected
(sam-util/compressed-bases->str ?length ba ?offset) => ?expected))
?length ?bases ?offset ?expected
1 [0x00] 0 "="
2 [0x00] 0 "=="
1 [0x10] 0 "A"
2 [0x12] 0 "AC"
4 [0x12 0x8F] 0 "ACTN"
1 [0x12 0x8F] 1 "T"
2 [0x12 0x8F] 1 "TN"
16 [0x01 0x23 0x45 0x67 0x89 0xAB 0xCD 0xEF] 0 nibble-table
14 [0x01 0x23 0x45 0x67 0x89 0xAB 0xCD 0xEF] 1 (subs nibble-table 2)
2 [0x01 0x23 0x45 0x67 0x89 0xAB 0xCD 0xEF] 7 "BN"
512 (range 256) 0 (->> (for [i nibble-table j nibble-table] [i j])
(apply concat)
cstr/join))

;; Reference functions
;; -------------------

Expand Down

0 comments on commit dd3f076

Please sign in to comment.