/
normal.clj
36 lines (31 loc) · 1.26 KB
/
normal.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
(ns cljam.algo.normal
"Functions to normalize the SAM/BAM format."
(:require [cljam.io.sam :as sam]
[cljam.io.util :as io-util]
[cljam.util.chromosome :refer [normalize-chromosome-key]]))
;; Maximum number of items (blocks or alignments) grouped into one batch by
;; `partition-all` before the batch is handed to the writer.
(def ^:private chunk-size 1500000)
(defn- normalize-header
  "Returns `hdr` with the `:SN` value of every entry under `:SQ` passed through
  `normalize-chromosome-key`. The resulting `:SQ` value is always a vector."
  [hdr]
  (let [rename-ref (fn [sq-entry]
                     (update sq-entry :SN normalize-chromosome-key))]
    (assoc hdr :SQ (into [] (map rename-ref) (:SQ hdr)))))
;; TODO: For performance, copy the rest of the stream as-is instead of reading,
;; parsing and re-writing it.
(defn- transfer-blocks
  "Reads blocks from `rdr` with `sam/read-blocks` and writes them to `wtr` with
  `sam/write-blocks`, at most `chunk-size` blocks per write call.
  Returns nil."
  [rdr wtr]
  (->> (sam/read-blocks rdr)
       (partition-all chunk-size)
       (run! (fn [batch] (sam/write-blocks wtr batch)))))
(defn- transfer-alignments
  "Reads alignments from `rdr`, normalizes their reference names with
  `normalize-chromosome-key`, and writes them to `wtr` together with `hdr`,
  at most `chunk-size` alignments per write call. Returns nil.

  Both `:rname` and `:rnext` are normalized so that the mate reference stays
  consistent with the normalized header and with the record's own `:rname`.
  `:rnext` is left untouched when it is not a string or when it is one of the
  SAM placeholders \"=\" (same reference as `:rname`) and \"*\" (unavailable)."
  [rdr wtr hdr]
  (let [placeholder? #{"=" "*"}
        normalize-refs
        (fn [aln]
          (let [rnext (:rnext aln)]
            (cond-> (update aln :rname normalize-chromosome-key)
              ;; Only real reference names are rewritten; placeholders and
              ;; missing values must pass through unchanged.
              (and (string? rnext) (not (placeholder? rnext)))
              (update :rnext normalize-chromosome-key))))]
    (doseq [alns (->> (sam/read-alignments rdr)
                      (map normalize-refs)
                      (partition-all chunk-size))]
      (sam/write-alignments wtr alns hdr))))
(defn normalize
  "Normalizes the reference names of the SAM/BAM data read from `rdr` and
  writes the result to `wtr`.

  The header is read from `rdr`, normalized, and written to `wtr` along with
  the references. When `rdr` is a BAM reader and `wtr` is a BAM writer, the
  remaining data is transferred block by block; otherwise every alignment is
  read, normalized and written individually. Note that performance may degrade
  when either `rdr` or `wtr` (or both) handles the SAM format."
  [rdr wtr]
  (let [hdr (-> rdr sam/read-header normalize-header)]
    (doto wtr
      (sam/write-header hdr)
      (sam/write-refs hdr))
    (if-not (and (io-util/bam-reader? rdr) (io-util/bam-writer? wtr))
      (transfer-alignments rdr wtr hdr)
      (transfer-blocks rdr wtr))))