-
Notifications
You must be signed in to change notification settings - Fork 12
/
cigar.clj
113 lines (98 loc) · 3.36 KB
/
cigar.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
(ns cljam.io.sam.util.cigar
"Parser of CIGAR strings."
(:import [java.nio ByteBuffer ByteOrder]))
(defn parse
"Parses CIGAR string, returning a sequence of lengths and operations."
[^String s]
(for [[_ n op] (re-seq #"([0-9]*)([MIDNSHP=X])" s)]
[(Integer/parseInt n) (first op)]))
(defn simplify
"Merge contiguous same operations of parsed CIGAR."
[cigs]
(loop [[[^long l op :as x] & xs] cigs result (transient [])]
(if (and l op)
(let [[^long nl nop] (first xs)]
(if (= op nop)
(recur (cons [(+ l nl) op] (next xs)) result)
(recur xs (conj! result x))))
(persistent! result))))
(defn- concat! [v coll]
(reduce conj! v coll))
(defn- update-last! [coll f]
(let [c (dec (count coll))]
(if (neg? c)
coll
(let [[op x] (get coll c)]
(if (= :m op)
(assoc! coll c (f x))
coll)))))
(defn to-index*
"Convert CIGAR string to sequence of indices."
[^String s]
(let [cigs (simplify (remove (comp #{\P \H} second) (parse s)))]
(loop [[[^long l op] & xs] cigs r 0 s 0 idx (transient [])]
(if (and l op)
(condp get op
#{\M \= \X} (recur xs (+ l r) (+ l s) (concat! idx (map (fn [x] [:m x]) (range s (+ l s)))))
#{\D} (recur xs (+ r l) s (concat! (update-last! idx (fn [x] [:d x (range r (+ l r))])) (repeat l [:m \*])))
#{\N} (recur xs (+ r l) s (concat! idx (repeat l [:m \>])))
#{\S} (recur xs r (+ s l) idx)
#{\I} (recur xs r (+ s l) (update-last! idx (fn [x] [:i x (range s (+ l s))]))))
(persistent! idx)))))
(def to-index (memoize to-index*))
(defn count-op
"Returns length of CIGAR operations."
[^String s]
(count (parse s)))
(defn- count-ref-str*
[^String s]
(->> (parse s)
(filter (comp #{\M \D \N \= \X} peek))
(map first)
(reduce +)))
(def ^:private count-ref-str
(memoize count-ref-str*))
(defn count-ref-bytes
"Count covering length in reference from encoded CIGAR byte-array."
[cigar-bytes]
(let [buf (ByteBuffer/wrap cigar-bytes)]
(.order buf ByteOrder/LITTLE_ENDIAN)
(loop [ref-length 0]
(if (.hasRemaining buf)
(let [b (.getInt buf)
op (bit-and b 0xF)
n (bit-shift-right b 4)]
(recur (+ ref-length (case op 0 n 2 n 3 n 7 n 8 n 0))))
ref-length))))
(defn decode-cigar-and-ref-length
"Decode CIGAR string and length of alignment in reference.
Returns a vector of [cigar, ref-length]."
[cigar-bytes]
(let [buf (ByteBuffer/wrap cigar-bytes)
sb (StringBuilder.)]
(.order buf ByteOrder/LITTLE_ENDIAN)
(loop [ref-length 0]
(if (.hasRemaining buf)
(let [b (.getInt buf)
op (bit-and b 0xF)
n (bit-shift-right b 4)]
(doto sb
(.append n)
(.append (case op 0 \M 1 \I 2 \D 3 \N 4 \S 5 \H 6 \P 7 \= 8 \X)))
(recur (+ ref-length (case op 0 n 2 n 3 n 7 n 8 n 0))))
[(.toString sb) ref-length]))))
(defn encode-cigar
"Encodes CIGAR string into a sequence of longs."
[cigar]
(mapv #(bit-or (bit-shift-left (first %) 4)
(case (second %) \M 0 \I 1 \D 2 \N 3 \S 4 \H 5 \P 6 \= 7 \X 8))
(parse cigar)))
(defmulti count-ref
"Returns length of reference bases."
class)
(defmethod count-ref String
[s]
(count-ref-str s))
(defmethod count-ref (Class/forName "[B")
[b]
(count-ref-bytes b))