Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor BAI I/O #96

Merged
merged 3 commits into from
Aug 8, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion project.clj
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
[bgzf4j "0.1.0"]
[com.climate/claypoole "1.1.4"]
[camel-snake-kebab "0.4.0"]
[proton "0.1.1"]]
[proton "0.1.2"]]
:profiles {:dev {:dependencies [[org.clojure/clojure "1.8.0"]
[cavia "0.4.1"]]
:plugins [[lein-binplus "0.6.2"]
Expand Down
2 changes: 1 addition & 1 deletion src/cljam/algo/bam_indexer.clj
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
(defn create-index
"Creates a BAM index file from the BAM file."
[in-bam out-bai & {:keys [n-threads] :or {n-threads 0}}]
(with-open [r (sam/bam-reader in-bam :ignore-index true)]
(with-open [r (sam/bam-reader in-bam)]
(binding [*n-threads* n-threads]
(bai-core/create-index out-bai
(sam/read-blocks r {} {:mode :pointer})
Expand Down
2 changes: 1 addition & 1 deletion src/cljam/algo/level.clj
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@
cache))]
;; merge
(doseq [cache xs]
(with-open [r (sam/reader cache :ignore-index true)]
(with-open [r (sam/reader cache)]
(sam/write-blocks wtr (sam/read-blocks r)))
(.delete (cio/file cache)))))))

Expand Down
2 changes: 1 addition & 1 deletion src/cljam/algo/sorter.clj
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@
(defn- merge**
"Merges multiple SAM/BAM files into single SAM/BAM file."
[wtr hdr files key-fn read-fn write-fn]
(let [rdrs (map #(sam/reader % :ignore-index true) files)
(let [rdrs (map sam/reader files)
alns (map read-fn rdrs)]
(sam/write-header wtr hdr)
(sam/write-refs wtr hdr)
Expand Down
28 changes: 17 additions & 11 deletions src/cljam/io/bam/core.clj
Original file line number Diff line number Diff line change
@@ -1,36 +1,42 @@
(ns cljam.io.bam.core
"The core of BAM features."
(:require [clojure.java.io :as cio]
[clojure.string :as cstr]
[cljam.io.bam [common :refer [bam-magic]]
[reader :as reader]
[writer :as writer]]
[cljam.io.bam-index :as bai]
[cljam.io.util.lsb :as lsb])
(:import java.util.Arrays
[java.io DataInputStream DataOutputStream IOException]
[java.io DataInputStream DataOutputStream IOException FileNotFoundException]
[bgzf4j BGZFInputStream BGZFOutputStream]
cljam.io.bam.reader.BAMReader
cljam.io.bam.writer.BAMWriter))

;; Reading
;; -------

(defn- bam-index [f & {:keys [ignore]
:or {ignore false}}]
(if-not ignore
(let [bai-f (str f ".bai")]
(if (.isFile (cio/file bai-f))
(bai/bam-index bai-f)
(throw (IOException. "Could not find BAM Index file"))))))
(defn- bam-index
"Load an index file (BAI) for the given BAM file."
[bam-path]
(if-let [bai-path (->> ["$1.bai" ".bai" "$1.BAI" ".BAI"]
(eduction
(comp
(map #(cstr/replace bam-path #"(?i)(\.bam)$" %))
(filter #(.isFile (cio/file %)))))
first)]
(bai/bam-index bai-path)
(throw (FileNotFoundException. (str "Could not find BAM Index file for " bam-path)))))

(defn ^BAMReader reader [f {:keys [ignore-index]
:or {ignore-index false}}]
(defn ^BAMReader reader
"Creates a `cljam.io.bam.BAMReader` instance for the given path."
[f]
(let [rdr (BGZFInputStream. (cio/file f))
data-rdr (DataInputStream. rdr)]
(when-not (Arrays/equals ^bytes (lsb/read-bytes data-rdr 4) (.getBytes ^String bam-magic))
(throw (IOException. "Invalid BAM file")))
(let [{:keys [header refs]} (reader/load-headers data-rdr)
index-delay (delay (bam-index f :ignore ignore-index))]
index-delay (delay (bam-index f))]
(BAMReader. (.getAbsolutePath (cio/file f))
header refs rdr data-rdr index-delay (.getFilePointer rdr)))))

Expand Down
6 changes: 3 additions & 3 deletions src/cljam/io/bam/decoder.clj
Original file line number Diff line number Diff line change
Expand Up @@ -150,17 +150,17 @@
{:rname rname, :pos pos, :meta {:cigar-bytes cigar-bytes}})))

(defn pointer-decode-alignment-block
[block refs]
[block _]
(let [buffer (ByteBuffer/wrap (:data block))]
(let [rname (or (sam-util/ref-name refs (lsb/read-int buffer)) "*")
(let [ref-id (lsb/read-int buffer)
pos (inc (lsb/read-int buffer))
l-read-name (int (lsb/read-ubyte buffer))
_ (lsb/skip buffer 3)
n-cigar-op (lsb/read-ushort buffer)
flag (lsb/read-ushort buffer)
_ (lsb/skip buffer (+ 16 l-read-name))
cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4))]
{:flag flag, :rname rname, :pos pos,
{:flag flag, :ref-id ref-id, :pos pos,
:meta {:chunk {:beg (:pointer-beg block),
:end (:pointer-end block)}
:cigar-bytes cigar-bytes}})))
Expand Down
54 changes: 28 additions & 26 deletions src/cljam/io/bam_index/chunk.clj
Original file line number Diff line number Diff line change
Expand Up @@ -6,54 +6,56 @@
;;; It consists of a start position and a end position.
;;; A chunk is a map like `{:beg 10, :end 20}`.

(defrecord Chunk [^long beg ^long end])

(defn compare
"Returns a negative if chunk1 is earlier than chunk2, a positive if it is
later, 0 if it is equal."
[chunk1 chunk2]
(let [ret (Long/signum (- (:beg chunk1) (:beg chunk2)))]
[^Chunk chunk1 ^Chunk chunk2]
(let [ret (Long/signum (- (.beg chunk1) (.beg chunk2)))]
(if (zero? ret)
(Long/signum (- (:end chunk1) (:end chunk2)))
(Long/signum (- (.end chunk1) (.end chunk2)))
ret)))

(defn overlap?
"Returns true if the two chunks overlap."
[chunk1 chunk2]
[^Chunk chunk1 ^Chunk chunk2]
(let [comparison (compare chunk1 chunk2)]
(or (zero? comparison)
(let [left (if (neg? comparison) chunk1 chunk2)
right (if (pos? comparison) chunk1 chunk2)
left-fp (bgzf/get-block-address (:end left))
right-fp (bgzf/get-block-address (:beg right))]
left-fp (bgzf/get-block-address (.end left))
right-fp (bgzf/get-block-address (.beg right))]
(or (> left-fp right-fp)
(and (= left-fp right-fp)
(let [left-offset (bgzf/get-block-offset (:end left))
right-offset (bgzf/get-block-offset (:beg right))]
(let [left-offset (bgzf/get-block-offset (.end left))
right-offset (bgzf/get-block-offset (.beg right))]
(> left-offset right-offset))))))))

(defn adjacent?
"Returns true if the two chunks are adjacent."
[chunk1 chunk2]
(or (and (= (bgzf/get-block-address (:end chunk1))
(bgzf/get-block-address (:beg chunk2)))
(= (bgzf/get-block-offset (:end chunk1))
(bgzf/get-block-offset (:beg chunk2))))
(and (= (bgzf/get-block-address (:beg chunk1))
(bgzf/get-block-address (:end chunk2)))
(= (bgzf/get-block-offset (:beg chunk1))
(bgzf/get-block-offset (:end chunk2))))))
[^Chunk chunk1 ^Chunk chunk2]
(or (and (= (bgzf/get-block-address (.end chunk1))
(bgzf/get-block-address (.beg chunk2)))
(= (bgzf/get-block-offset (.end chunk1))
(bgzf/get-block-offset (.beg chunk2))))
(and (= (bgzf/get-block-address (.beg chunk1))
(bgzf/get-block-address (.end chunk2)))
(= (bgzf/get-block-offset (.beg chunk1))
(bgzf/get-block-offset (.end chunk2))))))

(defn optimize-chunks
[chunks min-offset]
[chunks ^long min-offset]
(let [chunks (sort compare chunks)]
(loop [[f & r] chunks
last-chunk nil
ret []]
(loop [[^Chunk f & r] chunks
^Chunk last-chunk nil
ret (transient [])]
(if f
(cond
(<= (:end f) min-offset) (recur r last-chunk ret)
(nil? last-chunk) (recur r f (conj ret f))
(<= (.end f) min-offset) (recur r last-chunk ret)
(nil? last-chunk) (recur r f (conj! ret f))
(and (not (overlap? last-chunk f))
(not (adjacent? last-chunk f))) (recur r f (conj ret f))
(> (:end f) (:end last-chunk)) (let [l (assoc last-chunk :end (:end f))] (recur r l (conj (pop ret) l)))
(not (adjacent? last-chunk f))) (recur r f (conj! ret f))
(> (.end f) (.end last-chunk)) (let [l (assoc last-chunk :end (.end f))] (recur r l (conj! (pop! ret) l)))
:else (recur r last-chunk ret))
ret))))
(persistent! ret)))))
13 changes: 7 additions & 6 deletions src/cljam/io/bam_index/core.clj
Original file line number Diff line number Diff line change
Expand Up @@ -51,21 +51,22 @@
(def ^:private reg->bins (memoize reg->bins*))

(defn get-spans
[^BAMIndex bai ref-idx beg end]
[^BAMIndex bai ^long ref-idx ^long beg ^long end]
(let [bins (reg->bins beg end)
bidx (get (.bidx bai) ref-idx)
lidx (get (.lidx bai) ref-idx)
chunks (flatten (filter (complement nil?) (map #(get bidx %) bins)))
min-offset (or (first lidx) 0)]
chunks (into [] (comp (map bidx) cat) bins)
lin-beg (writer/pos->lidx-offset beg)
min-offset (get lidx lin-beg 0)]
(->> (chunk/optimize-chunks chunks min-offset)
(map vals))))

(defn get-unplaced-spans
[^BAMIndex bai]
(if-let [begin (some->>
(for [[ref-id bins] (.bidx bai)
[bin chunks] bins
{:keys [beg end]} chunks]
(for [[_ bins] (.bidx bai)
[_ chunks] bins
{:keys [end]} chunks]
end)
seq
(reduce max))]
Expand Down
12 changes: 6 additions & 6 deletions src/cljam/io/bam_index/reader.clj
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
(ns cljam.io.bam-index.reader
(:require [clojure.java.io :as cio]
[cljam.io.util.lsb :as lsb]
[cljam.io.bam-index.common :refer [bai-magic]])
[cljam.io.bam-index.common :refer [bai-magic]]
[cljam.io.bam-index.chunk :as chunk])
(:import java.util.Arrays
[java.io DataInputStream FileInputStream Closeable IOException]))
[java.io DataInputStream FileInputStream Closeable IOException]
[cljam.io.bam_index.chunk Chunk]))

(deftype BAIReader [f reader]
Closeable
Expand Down Expand Up @@ -48,8 +50,7 @@
(let [n (lsb/read-int rdr)]
(loop [i 0, chunks []]
(if (< i n)
(recur (inc i) (conj chunks {:beg (lsb/read-long rdr)
:end (lsb/read-long rdr)}))
(recur (inc i) (conj chunks (Chunk. (lsb/read-long rdr) (lsb/read-long rdr))))
chunks))))

(defn- read-bin-index**!
Expand Down Expand Up @@ -100,8 +101,7 @@
(let [rdr (.reader r)
n-ref (lsb/read-int rdr)
refs (range n-ref)
all-idx (map (fn [i] [(read-bin-index**! rdr) (read-linear-index**! rdr)])
refs)
all-idx (map (fn [_] [(read-bin-index**! rdr) (read-linear-index**! rdr)]) refs)
bidx-seq (map first all-idx)
bidx (zipmap
refs
Expand Down
Loading