From 9703683a85d0d8e0ff88a5eff5ab001dc889ed3d Mon Sep 17 00:00:00 2001 From: Jun Imura Date: Wed, 25 Jan 2017 13:52:24 +0900 Subject: [PATCH 1/2] Use static table to decode BAM seq. --- src/cljam/bam/decoder.clj | 7 +++---- src/cljam/util/sam_util.clj | 35 +++++++++++++++++++++++++++++++--- test/cljam/util/t_sam_util.clj | 26 ++++++++++++++++++++++++- 3 files changed, 60 insertions(+), 8 deletions(-) diff --git a/src/cljam/bam/decoder.clj b/src/cljam/bam/decoder.clj index f0f2f013..eb9f0ebf 100644 --- a/src/cljam/bam/decoder.clj +++ b/src/cljam/bam/decoder.clj @@ -2,8 +2,7 @@ "Decoder of BAM alignment blocks." (:require [clojure.string :refer [join]] [cljam.util :refer [ubyte hex-string->bytes]] - [cljam.util.sam-util :refer [phred->fastq compressed-bases->chars - ref-name]] + [cljam.util.sam-util :refer [phred->fastq compressed-bases->str ref-name]] [cljam.lsb :as lsb] [cljam.bam.common :refer [fixed-block-size]]) (:import java.util.Arrays @@ -80,7 +79,7 @@ (phred->fastq b))) (defn decode-seq [seq-bytes length] - (join (compressed-bases->chars length seq-bytes 0))) + (compressed-bases->str length seq-bytes 0)) (defn decode-next-ref-id [refs n rname] (cond @@ -133,7 +132,7 @@ qname (lsb/read-string buffer (dec l-read-name)) _ (lsb/skip buffer 1) cigar (decode-cigar (lsb/read-bytes buffer (* n-cigar-op 4))) - seq (decode-seq (lsb/read-bytes buffer (/ (inc l-seq) 2)) l-seq) + seq (decode-seq (lsb/read-bytes buffer (quot (inc l-seq) 2)) l-seq) qual (decode-qual (lsb/read-bytes buffer l-seq)) rest (lsb/read-bytes buffer (options-size (:size block) l-read-name diff --git a/src/cljam/util/sam_util.clj b/src/cljam/util/sam_util.clj index 8bee3b8c..ceed7191 100644 --- a/src/cljam/util/sam_util.clj +++ b/src/cljam/util/sam_util.clj @@ -2,7 +2,9 @@ "Utilities related to SAM/BAM format." (:require [clojure.string :as cstr] [cljam.cigar :refer [count-ref]] - [cljam.util :refer [ubyte str->int str->float]])) + [cljam.util :refer [ubyte str->int str->float]]) + (:import [java.nio CharBuffer ByteBuffer] + [java.nio.charset StandardCharsets])) ;;; parse @@ -318,6 +320,26 @@ (aset-byte result i (byte (bit-or h l))) (recur (inc i))))))) +(def ^:const ^:private seq-byte-table + (let [nibble-table "=ACMGRSVTWYHKDBN"] + (->> (for [i nibble-table j nibble-table] [i j]) + (apply concat) + cstr/join))) + +(defn compressed-bases->str + "Decode a sequence from byte array to String." + [^long length ^bytes compressed-bases ^long compressed-offset] + (let [cb (CharBuffer/allocate (inc length)) + bb (ByteBuffer/wrap compressed-bases)] + (.position bb compressed-offset) + (dotimes [_ (quot (inc length) 2)] + (let [i (-> (.get bb) (bit-and 0xff) (* 2))] + (.put cb (.charAt seq-byte-table i)) + (.put cb (.charAt seq-byte-table (inc i))))) + (.limit cb length) + (.flip cb) + (.toString cb))) + (defn compressed-bases->chars [length compressed-bases compressed-offset] (let [bases (apply concat (for [i (range 1 length) :when (odd? i)] @@ -353,9 +375,16 @@ (defmulti phred->fastq class) (defmethod phred->fastq (class (byte-array nil)) - [b] + [^bytes b] (when-not (nil? b) - (cstr/join (map #(phred->fastq (int (bit-and % 0xff))) b)))) + (let [bb (ByteBuffer/wrap b) + cb (CharBuffer/allocate (alength b))] + (loop [] + (when (.hasRemaining bb) + (.put cb (char (+ 33 (.get bb)))) + (recur))) + (.flip cb) + (.toString cb)))) (def ^:const max-phred-score 93) diff --git a/test/cljam/util/t_sam_util.clj b/test/cljam/util/t_sam_util.clj index 9978b03c..1420a645 100644 --- a/test/cljam/util/t_sam_util.clj +++ b/test/cljam/util/t_sam_util.clj @@ -3,7 +3,8 @@ (:require [midje.sweet :refer :all] [cljam.t-common :refer :all] [cljam.util :as util] - [cljam.util.sam-util :as sam-util])) + [cljam.util.sam-util :as sam-util] + [clojure.string :as cstr])) (tabular (fact "about char->compressed-base-high" @@ -99,6 +100,29 @@ (util/ubyte 0xd0) \D (util/ubyte 0xe0) \B) +(def nibble-table "=ACMGRSVTWYHKDBN") + +(tabular + (fact + "about compressed-bases->str" + (let [ba (byte-array (mapv util/ubyte ?bases))] + (cstr/join (sam-util/compressed-bases->chars ?length ba ?offset)) => ?expected + (sam-util/compressed-bases->str ?length ba ?offset) => ?expected)) + ?length ?bases ?offset ?expected + 1 [0x00] 0 "=" + 2 [0x00] 0 "==" + 1 [0x10] 0 "A" + 2 [0x12] 0 "AC" + 4 [0x12 0x8F] 0 "ACTN" + 1 [0x12 0x8F] 1 "T" + 2 [0x12 0x8F] 1 "TN" + 16 [0x01 0x23 0x45 0x67 0x89 0xAB 0xCD 0xEF] 0 nibble-table + 14 [0x01 0x23 0x45 0x67 0x89 0xAB 0xCD 0xEF] 1 (subs nibble-table 2) + 2 [0x01 0x23 0x45 0x67 0x89 0xAB 0xCD 0xEF] 7 "BN" + 512 (range 256) 0 (->> (for [i nibble-table j nibble-table] [i j]) + (apply concat) + cstr/join)) + ;; Reference functions ;; ------------------- From f93a84a45749b7f0729698a505d1660cb8c96df8 Mon Sep 17 00:00:00 2001 From: Jun Imura Date: Wed, 25 Jan 2017 14:12:42 +0900 Subject: [PATCH 2/2] Bump dependencies version up. --- project.clj | 14 +++++++------- test/cljam/t_common.clj | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/project.clj b/project.clj index 5f21488f..86c13138 100644 --- a/project.clj +++ b/project.clj @@ -5,19 +5,19 @@ :url "http://www.apache.org/licenses/LICENSE-2.0.html"} :dependencies [[org.clojure/tools.logging "0.3.1"] [org.clojure/tools.cli "0.3.5"] - [org.apache.commons/commons-compress "1.12"] + [org.apache.commons/commons-compress "1.13"] [me.raynes/fs "1.4.6"] - [clj-sub-command "0.2.3"] - [digest "1.4.4"] + [clj-sub-command "0.3.0"] + [digest "1.4.5"] [bgzf4j "0.1.0"] - [com.climate/claypoole "1.1.3"] + [com.climate/claypoole "1.1.4"] [camel-snake-kebab "0.4.0"]] - :plugins [[lein-midje "3.2"]] + :plugins [[lein-midje "3.2.1"]] :profiles {:dev {:dependencies [[org.clojure/clojure "1.8.0"] [midje "1.8.3" :exclusions [slingshot]] - [cavia "0.2.3"]] + [cavia "0.3.0"]] :plugins [[lein-bin "0.3.5"] - [lein-codox "0.9.5"] + [lein-codox "0.10.2"] [lein-marginalia "0.9.0"]] :main cljam.main :aot [cljam.main] diff --git a/test/cljam/t_common.clj b/test/cljam/t_common.clj index db33104b..41e20d8a 100644 --- a/test/cljam/t_common.clj +++ b/test/cljam/t_common.clj @@ -1,6 +1,6 @@ (ns cljam.t-common (:use [clojure.java.io :only [file]]) - (:require [pandect.core :refer [md5-file]] + (:require [digest] [cljam.sam :as sam] [cljam.bam :as bam] [cljam.io :as io] @@ -284,7 +284,7 @@ (defn same-file? "Returns true if the two files' MD5 hash are same, false if not." [f1 f2] - (= (md5-file f1) (md5-file f2))) + (= (digest/sha1 (file f1)) (digest/sha1 (file f2)))) ;;;; FASTA