/
decoder.clj
283 lines (267 loc) · 13.4 KB
/
decoder.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
(ns cljam.io.bam.decoder
"Decoder of BAM alignment blocks."
(:require [clojure.string :as cstr]
[proton.core :as proton]
[cljam.util :as util]
[cljam.io.sam.util.quality :as qual]
[cljam.io.sam.util.sequence :as sam-seq]
[cljam.io.sam.util.refs :as refs]
[cljam.io.sam.util.cigar :as cigar]
[cljam.io.bam.common :as common]
[cljam.io.util.lsb :as lsb])
(:import [java.util Arrays]
[java.nio Buffer ByteBuffer ByteOrder CharBuffer]
[cljam.io.protocols SAMAlignment SAMRegionBlock SAMCoordinateBlock SAMQuerynameBlock]))
(definline validate-tag-type
[t]
`(case (long ~t)
~(long \I) \i
~(long \s) \i
~(long \S) \i
~(long \c) \i
~(long \C) \i
(char ~t)))
(definline parse-tag-single [tag-type ^ByteBuffer bb]
`(case (long ~tag-type)
~(long \Z) (lsb/read-null-terminated-string ~bb)
~(long \A) (char (.get ~bb))
~(long \I) (bit-and (.getInt ~bb) 0xffffffff)
~(long \i) (.getInt ~bb)
~(long \s) (int (.getShort ~bb))
~(long \S) (bit-and (.getShort ~bb) 0xffff)
~(long \c) (int (.get ~bb))
~(long \C) (bit-and (int (.get ~bb)) 0xff)
~(long \f) (.getFloat ~bb)
~(long \H) (proton/hex->bytes (lsb/read-null-terminated-string ~bb))
(throw (Exception. "Unrecognized tag type"))))
(defn- parse-tag-array [^ByteBuffer bb]
(let [typ (char (.get bb))
len (.getInt bb)]
(->> (for [_ (range len)]
(case typ
\c (int (.get bb))
\C (bit-and (int (.get bb)) 0xff)
\s (int (.getShort bb))
\S (bit-and (.getShort bb) 0xffff)
\i (.getInt bb)
\I (bit-and (.getInt bb) 0xffffffff)
\f (.getFloat bb)
(throw (Exception. (str "Unrecognized tag array type: " typ)))))
(cons typ)
(cstr/join \,))))
(defn- parse-option [^ByteBuffer bb]
(lazy-seq
(when (.hasRemaining bb)
(cons
(let [cb (as-> (CharBuffer/allocate 2) cb
(.put cb (unchecked-char (.get bb)))
(.put cb (unchecked-char (.get bb)))
(.flip ^Buffer cb))
typ (.get bb)]
{(keyword (.toString cb))
{:type (str (validate-tag-type typ))
:value (if (= typ 66) ;; (byte \B)
(parse-tag-array bb)
(parse-tag-single typ bb))}})
(parse-option bb)))))
(defn decode-options [rest]
(let [bb (ByteBuffer/wrap rest)]
(.order bb ByteOrder/LITTLE_ENDIAN)
(parse-option bb)))
(defn options-size
[^long block-size ^long l-read-name ^long n-cigar-op ^long l-seq]
(- block-size
common/fixed-block-size
l-read-name
(* n-cigar-op 4)
(quot (inc l-seq) 2)
l-seq))
(defn decode-qual [^bytes b]
(if (Arrays/equals b (byte-array (alength b) (util/ubyte 0xff)))
"*"
(qual/phred-bytes->fastq b)))
(defn decode-seq [seq-bytes length]
(if (zero? length)
"*"
(sam-seq/compressed-bases->str length seq-bytes 0)))
(defn decode-next-ref-id [refs ^long ref-id ^long next-ref-id]
(if (= next-ref-id -1)
"*"
(if (= ref-id next-ref-id)
"="
(refs/ref-name refs next-ref-id))))
(defrecord BAMRawBlock [data ^long pointer-beg ^long pointer-end])
(defn decode-alignment
"Decodes BAM block and creates SAMAlignment instance which is compatible with SAM.
When called with start and end, this function may return nil if any base of the block
is not included in the range."
([refs block]
(let [buffer (ByteBuffer/wrap (:data block))
ref-id ^int (lsb/read-int buffer)
rname (or (refs/ref-name refs ref-id) "*")
pos (inc ^int (lsb/read-int buffer))
l-read-name (int ^long (lsb/read-ubyte buffer))
mapq ^long (lsb/read-ubyte buffer)
_ (lsb/skip buffer 2) ; bin
n-cigar-op ^long (lsb/read-ushort buffer)
flag ^long (lsb/read-ushort buffer)
l-seq ^int (lsb/read-int buffer)
next-ref-id ^int (lsb/read-int buffer)
rnext (decode-next-ref-id refs ref-id next-ref-id)
pnext (inc ^int (lsb/read-int buffer))
tlen ^int (lsb/read-int buffer)
qname (lsb/read-string buffer (dec l-read-name))
_ (lsb/skip buffer 1)
cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4))
[cigar ^long len] (cigar/decode-cigar-and-ref-length cigar-bytes)
ref-end (int ^long (if (zero? len) pos (dec (+ pos len))))
seq (decode-seq (lsb/read-bytes buffer (quot (inc l-seq) 2)) l-seq)
qual (decode-qual (lsb/read-bytes buffer l-seq))
rest (lsb/read-bytes buffer (options-size (alength ^bytes (:data block)) l-read-name n-cigar-op l-seq))
options (decode-options rest)]
(SAMAlignment. qname (int flag) rname (int pos) ref-end (int mapq)
cigar rnext (int pnext) (int tlen) seq qual options)))
([refs block ^long start ^long end]
(let [buffer (ByteBuffer/wrap (:data block))
ref-id ^int (lsb/read-int buffer)
pos (inc ^int (lsb/read-int buffer))]
(when (<= pos end)
(let [l-read-name (int ^long (lsb/read-ubyte buffer))
mapq ^long (lsb/read-ubyte buffer)
_ (lsb/skip buffer 2) ; bin
n-cigar-op ^long (lsb/read-ushort buffer)
flag ^long (lsb/read-ushort buffer)
l-seq ^int (lsb/read-int buffer)
next-ref-id ^int (lsb/read-int buffer)
rnext (decode-next-ref-id refs ref-id next-ref-id)
pnext (inc ^int (lsb/read-int buffer))
tlen ^int (lsb/read-int buffer)
qname (lsb/read-string buffer (dec l-read-name))
_ (lsb/skip buffer 1)
cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4))
[cigar ^long len] (cigar/decode-cigar-and-ref-length cigar-bytes)
ref-end (int ^long (if (zero? len) pos (dec (+ pos len))))]
(when (<= start ref-end)
(let [seq (decode-seq (lsb/read-bytes buffer (quot (inc l-seq) 2)) l-seq)
qual (decode-qual (lsb/read-bytes buffer l-seq))
rest (lsb/read-bytes buffer (options-size (alength ^bytes (:data block)) l-read-name n-cigar-op l-seq))
rname (or (refs/ref-name refs ref-id) "*")
options (decode-options rest)]
(SAMAlignment. qname (int flag) rname (int pos) ref-end (int mapq)
cigar rnext (int pnext) (int tlen) seq qual options))))))))
(defn decode-region-block
"Decodes BAM block and returns a SAMRegionBlock instance containing covering range of the alignment."
([^BAMRawBlock block]
(let [buffer (ByteBuffer/wrap (.data block))
ref-id ^int (lsb/read-int buffer)
pos (inc ^int (lsb/read-int buffer))
l-read-name (int ^long (lsb/read-ubyte buffer))
_ (lsb/skip buffer 3) ;; MAPQ, bin
n-cigar-op ^long (lsb/read-ushort buffer)
_ (lsb/skip buffer (+ 18 l-read-name)) ;; flag, l_seq, rnext, pnext, tlen, qname
cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4))
ref-length ^long (cigar/count-ref-bytes cigar-bytes)
ref-end (int ^long (if (zero? ref-length) pos (dec (+ pos ref-length))))]
(SAMRegionBlock. (.data block) ref-id pos ref-end)))
([^BAMRawBlock block ^long start ^long end]
(let [buffer (ByteBuffer/wrap (.data block))
ref-id ^int (lsb/read-int buffer)
pos (inc ^int (lsb/read-int buffer))]
(when (<= pos end)
(let [l-read-name (int ^long (lsb/read-ubyte buffer))
_ (lsb/skip buffer 3) ;; MAPQ, bin
n-cigar-op ^long (lsb/read-ushort buffer)
_ (lsb/skip buffer (+ 18 l-read-name)) ;; flag, l_seq, rnext, pnext, tlen, qname
cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4))
ref-length ^long (cigar/count-ref-bytes cigar-bytes)
ref-end (int ^long (if (zero? ref-length) pos (dec (+ pos ref-length))))]
(when (<= start ref-end)
(SAMRegionBlock. (.data block) ref-id pos ref-end)))))))
(defn decode-coordinate-block
"Decodes BAM block and returns a SAMCoordinateBlock instance containing ref-id, pos and flag."
([^BAMRawBlock block]
(let [buffer (ByteBuffer/wrap (.data block))
ref-id ^int (lsb/read-int buffer)
pos (inc ^int (lsb/read-int buffer))
_ (lsb/skip buffer 6) ;; l_read_name, MAPQ, bin, n_cigar_op
flag ^long (lsb/read-ushort buffer)] ;; l_seq, rnext, pnext, tlen, qname
(SAMCoordinateBlock. (.data block) (int ref-id) (int pos) (int flag))))
([^BAMRawBlock block ^long start ^long end]
(let [buffer (ByteBuffer/wrap (.data block))
ref-id ^int (lsb/read-int buffer)
pos (inc ^int (lsb/read-int buffer))]
(when (<= pos end)
(let [l-read-name (int ^long (lsb/read-ubyte buffer))
_ (lsb/skip buffer 3) ;; MAPQ, bin
n-cigar-op ^long (lsb/read-ushort buffer)
flag ^long (lsb/read-ushort buffer)
_ (lsb/skip buffer (+ 16 l-read-name)) ;; l_seq, rnext, pnext, tlen, qname
cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4))
ref-length ^long (cigar/count-ref-bytes cigar-bytes)
ref-end (int ^long (if (zero? ref-length) pos (dec (+ pos ref-length))))]
(when (<= start ref-end)
(SAMCoordinateBlock. (.data block) (int ref-id) (int pos) (int flag))))))))
(defn decode-queryname-block
"Decodes BAM block and returns a SAMQuerynameBlock instance containing qname and flag."
([^BAMRawBlock block]
(let [buffer (ByteBuffer/wrap (.data block))
_ (lsb/skip buffer 8) ;; ref-id, pos
l-read-name ^long (lsb/read-ubyte buffer)
_ (lsb/skip buffer 5) ;; MAPQ, bin, n_cigar_op
flag ^long (lsb/read-ushort buffer)
_ (lsb/skip buffer 16) ;; l_seq, rnext, pnext, tlen
qname (lsb/read-string buffer l-read-name)]
(SAMQuerynameBlock. (.data block) qname (int flag))))
([^BAMRawBlock block ^long start ^long end]
(let [buffer (ByteBuffer/wrap (.data block))
_ (lsb/skip buffer 4) ;; ref-id
pos (inc ^int (lsb/read-int buffer))]
(when (<= pos end)
(let [l-read-name ^long (lsb/read-ubyte buffer)
_ (lsb/skip buffer 3) ;; MAPQ, bin
n-cigar-op ^long (lsb/read-ushort buffer)
flag ^long (lsb/read-ushort buffer)
_ (lsb/skip buffer 16) ;; l_seq, rnext, pnext, tlen
qname (lsb/read-string buffer l-read-name)
cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4))
ref-length ^long (cigar/count-ref-bytes cigar-bytes)
ref-end (int ^long (if (zero? ref-length) pos (dec (+ pos ref-length))))]
(when (<= start ref-end)
(SAMQuerynameBlock. (.data block) qname (int flag))))))))
(defrecord BAMPointerBlock [data ^int ref-id ^int pos ^int end ^int flag ^long pointer-beg ^long pointer-end])
(defn decode-pointer-block
"Decodes BAM block and returns a BAMPointerBlock instance containing region, flag and block pointers."
([^BAMRawBlock block]
(let [buffer (ByteBuffer/wrap (.data block))
ref-id ^int (lsb/read-int buffer)
pos (inc ^int (lsb/read-int buffer))
l-read-name (int ^long (lsb/read-ubyte buffer))
_ (lsb/skip buffer 3) ;; MAPQ, bin
n-cigar-op ^long (lsb/read-ushort buffer)
flag ^long (lsb/read-ushort buffer)
_ (lsb/skip buffer (+ 16 l-read-name)) ;; l_seq, rnext, pnext, tlen, qname
cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4))
ref-length ^long (cigar/count-ref-bytes cigar-bytes)
ref-end (int ^long (if (zero? ref-length) pos (dec (+ pos ref-length))))]
(BAMPointerBlock. (.data block) ref-id pos ref-end (int flag) (.pointer-beg block) (.pointer-end block))))
([^BAMRawBlock block ^long start ^long end]
(let [buffer (ByteBuffer/wrap (.data block))
ref-id ^int (lsb/read-int buffer)
pos (inc ^int (lsb/read-int buffer))]
(when (<= pos end)
(let [l-read-name (int ^long (lsb/read-ubyte buffer))
_ (lsb/skip buffer 3) ;; MAPQ, bin
n-cigar-op ^long (lsb/read-ushort buffer)
flag ^long (lsb/read-ushort buffer)
_ (lsb/skip buffer (+ 16 l-read-name)) ;; l_seq, rnext, pnext, tlen, qname
cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4))
ref-length ^long (cigar/count-ref-bytes cigar-bytes)
ref-end (int ^long (if (zero? ref-length) pos (dec (+ pos ref-length))))]
(when (<= start ref-end)
(BAMPointerBlock. (.data block) ref-id pos ref-end (int flag) (.pointer-beg block) (.pointer-end block))))))))
(defn raw-block
"Checks the range of BAM block and returns the given block if any base is included."
([b] b)
([b ^long s ^long e]
(when (decode-region-block b s e)
b)))