-
Notifications
You must be signed in to change notification settings - Fork 2
/
common.clj
211 lines (196 loc) · 7.81 KB
/
common.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
(ns varity.vcf-to-hgvs.common
(:require [clojure.string :as string]
[cljam.io.sequence :as cseq]
[varity.ref-gene :as rg]))
(defn- left-common-len
"Returns the number of common characters on the left side of s1 and s2."
[s1 s2]
(->> (map #(if (= %1 %2) 1 0) s1 s2)
(take-while #(= % 1))
(reduce +)))
(defn diff-bases
"Compares VCF-style base(s) s1 and s2, returning a vector of
[bases-only-in-s1 bases-only-in-s2 left-common-len right-common-len]."
[s1 s2]
(let [l (left-common-len s1 s2)
ls1 (subs s1 l)
ls2 (subs s2 l)
r (left-common-len (reverse ls1) (reverse ls2))]
[(subs ls1 0 (- (count ls1) r))
(subs ls2 0 (- (count ls2) r))
l
r]))
(def max-repeat-unit-size 100)
;; AGT => ["AGT"]
;; AGTAGT => ["AGT" "AGTAGT"]
(defn- repeat-units
[s]
(case (count s)
0 []
1 [s]
(let [limit (min (/ (count s) 2) max-repeat-unit-size)
xf (comp (map #(partition-all % s))
(filter #(apply = %))
(map first)
(map #(apply str %)))]
(conj (into [] xf (range 1 (inc limit))) s))))
;; seq* pos bases
;; MVSTSTHQ... 3 ST => 2
(defn forward-shift
[seq* pos bases]
(if (= (subs seq* (dec pos) (+ (dec pos) (count bases))) bases)
(if (string/blank? bases)
0
(let [step (count (first (repeat-units bases)))]
(->> seq*
(drop (dec pos))
(partition (count bases) step)
(take-while #(= (apply str %) bases))
count
dec
(* step))))
(throw (ex-info "The bases is not found on the position."
{:type ::invalid-bases
:sequence seq*
:position pos
:bases bases}))))
;; seq* pos bases
;; MVSTSTHQ... 5 ST => 2
(defn backward-shift
[seq* pos bases]
(if (= (subs seq* (dec pos) (+ (dec pos) (count bases))) bases)
(if (string/blank? bases)
0
(let [rbases (string/reverse bases)
n (count bases)
step (count (first (repeat-units rbases)))
tweak (if (= step 1) #(+ (dec n) %) #(* step %))]
(->> seq*
(take (dec pos))
reverse
(partition (count rbases) step)
(take-while #(= (apply str %) rbases))
count
tweak)))
(throw (ex-info "The bases is not found on the position."
{:type ::invalid-bases
:sequence seq*
:position pos
:bases bases}))))
;; ...CAGTC... 8 AGT :ins => ["AGT" 1 2]
;; ...CAGTC... 8 AGTAGT :ins => ["AGT" 1 3]
;; ...CAGTAGTC... 11 AGTAGT :ins => ["AGT" 2 4]
;; ...CAGTAGTC... 8 AGT :del => ["AGT" 2 1]
(defn repeat-info
[seq* pos alt type]
{:pre [(#{:ins :del} type)]}
(->> (repeat-units alt)
(map (fn [unit]
(let [end (case type
:ins (dec pos)
:del (dec (+ pos (count alt))))]
(when (<= end (count seq*))
(let [ref-repeat (->> (subs seq* 0 end)
(reverse)
(partition (count unit))
(take-while #(= % (reverse unit)))
(count))]
[unit
ref-repeat
(+ ref-repeat (cond-> (/ (count alt) (count unit))
(= type :del) -))])))))
(remove (fn [[_ r a]] (or (zero? r) (zero? a))))
(first)))
;; ("AGT" "AGTCTG" "AGTCTGAAA" ...)
(defn read-sequence-stepwise
[seq-rdr {:keys [chr start end]} step]
(->> (range)
(map (fn [i]
(let [start* (+ start (* i step))
end* (min (dec (+ start* step)) end)]
(if (<= start* end)
(cseq/read-sequence seq-rdr {:chr chr, :start start, :end end*})))))
(take-while some?)))
;; ("AGT" "CTGAGT" "AAACTGAGT" ...)
(defn read-sequence-stepwise-backward
[seq-rdr {:keys [chr start end]} step]
(->> (range)
(map (fn [i]
(let [end* (- end (* i step))
start* (max (inc (- end* step)) start)]
(if (>= end* start)
(cseq/read-sequence seq-rdr {:chr chr, :start start*, :end end})))))
(take-while some?)))
;; + ...CAGTAGTAGTC... 7 T TAGT => 13 T TAGT
;; + ...CAGTAGTAGTC... 7 TAGT T => 10 TAGT T
;; - ...CAGTAGTAGTC... 7 T TAGT => 4 C CAGT
;; - ...CAGTAGTAGTC... 7 TAGT T => 4 CAGT C
(defn- normalize-variant*
[{:keys [pos ref alt] :as var} seq* strand]
(let [[ref-only alt-only offset _] (diff-bases ref alt)
type* (cond
(and (zero? (count ref-only)) (pos? (count alt-only))) :ins
(and (pos? (count ref-only)) (zero? (count alt-only))) :del
:else :other)]
(if (#{:ins :del} type*)
(let [bases (if (string/blank? ref-only) alt-only ref-only)
seq' (if (= type* :ins)
(str (subs seq* 0 (+ pos offset -1)) bases (subs seq* (+ pos offset -1)))
seq*)
move (case strand
:forward (forward-shift seq' (+ pos offset) bases)
:reverse (- (backward-shift seq' (+ pos offset) bases)))
comm (subs seq* (+ pos move -1) (+ pos move offset -1))]
{:pos (+ pos move)
:ref (str comm ref-only)
:alt (str comm alt-only)})
var)))
(def normalization-read-sequence-step 1000)
(defn- normalize-variant-forward
[{:keys [chr pos ref alt]} seq-rdr rg]
(->> (read-sequence-stepwise seq-rdr
{:chr chr
:start pos
:end (+ (:tx-end rg) rg/max-tx-margin)}
normalization-read-sequence-step)
(keep (fn [seq*]
(when (>= (count seq*) (count ref))
(let [nvar (normalize-variant* {:pos 1, :ref ref, :alt alt} seq* :forward)]
(if (<= (dec (:pos nvar)) (- (count seq*) (max (count ref) (count alt))))
(-> nvar
(assoc :chr chr)
(update :pos + pos -1)))))))
(first)))
(defn- normalize-variant-backward
[{:keys [chr pos ref alt]} seq-rdr rg]
(->> (read-sequence-stepwise-backward seq-rdr
{:chr chr
:start (- (:tx-start rg) rg/max-tx-margin)
:end (+ pos (count ref) -1)}
normalization-read-sequence-step)
(keep (fn [seq*]
(let [offset (- (count seq*) (count ref))]
(when (>= offset 0)
(let [nvar (normalize-variant* {:pos (inc offset), :ref ref, :alt alt} seq* :reverse)]
(if (> (:pos nvar) (max (count ref) (count alt)))
(-> nvar
(assoc :chr chr)
(update :pos + (- pos offset) -1))))))))
(first)))
(defn normalize-variant
"Normalizes a VCF-style variant based on surrounding sequence. For example,
{:pos 7, :ref T, :alt TAGT} on ...CAGTAGTAGTC... is equivalent to
{:pos 13, :ref T, :alt TAGT}. The latter is normalized.
e.g. ...CAGTAGTAGTC... 7 T TAGT => 13 T TAGT"
[variant seq-rdr rg]
(case (:strand rg)
:forward (normalize-variant-forward variant seq-rdr rg)
:reverse (normalize-variant-backward variant seq-rdr rg)))
(defn alt-sequence
"Returns sequence a variant applied."
[ref-seq seq-start pos ref alt]
(let [pos* (inc (- pos seq-start))]
(str (subs ref-seq 0 (max 0 (dec pos*)))
alt
(subs ref-seq (min (count ref-seq)
(+ (dec pos*) (count ref)))))))