-
Notifications
You must be signed in to change notification settings - Fork 2
/
hgvs_to_vcf.clj
105 lines (92 loc) · 4.66 KB
/
hgvs_to_vcf.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
(ns varity.hgvs-to-vcf
"Functions to convert HGVS into VCF-style variants."
(:require [clojure.string :as string]
[cljam.io.sequence :as cseq]
[cljam.io.util :as io-util]
[varity.hgvs-to-vcf.coding-dna :as coding-dna]
[varity.hgvs-to-vcf.protein :as prot]
[varity.ref-gene :as rg]))
(defn- dispatch
[ref-seq ref-gene]
(cond
(string? ref-seq) :ref-seq-path
(io-util/sequence-reader? ref-seq)
(cond
(string? ref-gene) :ref-gene-path
(instance? varity.ref_gene.RefGeneIndex ref-gene) :ref-gene-index
(map? ref-gene) :ref-gene-entity)))
(defn- coding-dna-hgvs->vcf-variants
[hgvs seq-rdr rgs]
(distinct (keep #(coding-dna/->vcf-variant hgvs seq-rdr %) rgs)))
(defn- protein-hgvs->vcf-variants
[hgvs seq-rdr rgs]
(distinct (apply concat (keep #(prot/->vcf-variants hgvs seq-rdr %) rgs))))
(defmulti hgvs->vcf-variants
"Converts coding DNA/protein hgvs into possible VCF-style variants. Transcript of
hgvs, such as NM_005228, is used for ref-genes search. Alternatively, gene,
such as EGFR, can be used if transcript does not exist. ref-seq must be a path
to reference or an instance which implements
cljam.io.protocols/ISequenceReader. ref-gene must be a path to
refGene.txt(.gz), ref-gene index, or a ref-gene entity. A returned sequence
consists of {:chr :pos :ref :alt}."
{:arglists '([hgvs ref-seq ref-gene] [hgvs gene ref-seq ref-gene])}
(fn [& args]
(apply dispatch (take-last 2 args))))
(defmethod hgvs->vcf-variants :ref-seq-path
([hgvs ref-seq ref-gene] (hgvs->vcf-variants hgvs nil ref-seq ref-gene))
([hgvs gene ref-seq ref-gene]
(with-open [seq-rdr (cseq/reader ref-seq)]
(doall (hgvs->vcf-variants hgvs gene seq-rdr ref-gene)))))
(defmethod hgvs->vcf-variants :ref-gene-path
([hgvs seq-rdr ref-gene] (hgvs->vcf-variants hgvs nil seq-rdr ref-gene))
([hgvs gene seq-rdr ref-gene]
(let [rgidx (rg/index (rg/load-ref-genes ref-gene))]
(hgvs->vcf-variants hgvs gene seq-rdr rgidx))))
(defmethod hgvs->vcf-variants :ref-gene-index
([hgvs seq-rdr rgidx] (hgvs->vcf-variants hgvs nil seq-rdr rgidx))
([hgvs gene seq-rdr rgidx]
(let [convert (case (:kind hgvs)
:coding-dna coding-dna-hgvs->vcf-variants
:protein protein-hgvs->vcf-variants)
rgs (if-let [[rs] (re-find #"^(NM|NR)_\d+" (str (:transcript hgvs)))]
(rg/ref-genes rs rgidx)
(if-not (string/blank? gene)
(rg/ref-genes gene rgidx)
(throw (ex-info "Transcript (NM_, NR_) or gene must be supplied."
{:type ::ref-gene-clue-not-found}))))]
(convert hgvs seq-rdr rgs))))
(defmethod hgvs->vcf-variants :ref-gene-entity
([hgvs seq-rdr rg] (hgvs->vcf-variants hgvs nil seq-rdr rg))
([hgvs _ seq-rdr rg]
(let [convert (case (:kind hgvs)
:coding-dna coding-dna/->vcf-variant
:protein prot/->vcf-variants)]
(convert hgvs seq-rdr rg))))
(defmulti protein-hgvs->vcf-variants-with-coding-dna-hgvs
"Converts protein HGVS into possible VCF-style variants and coding DNA HGVS.
Transcript of hgvs, such as NM_005228, is used for ref-genes search.
Alternatively, gene, such as EGFR, can be used if transcript does not exist.
ref-seq must be a path to reference or an instance which implements
cljam.io.protocols/ISequenceReader. ref-gene must be a path to
refGene.txt(.gz), ref-gene index, or a ref-gene entity. A returned sequence
consists of {:vcf :coding-dna}."
{:arglists '([hgvs ref-seq ref-gene] [hgvs gene ref-seq ref-gene])}
(fn [& args]
(apply dispatch (take-last 2 args))))
(defmethod protein-hgvs->vcf-variants-with-coding-dna-hgvs :ref-seq-path
([hgvs ref-seq ref-gene] (protein-hgvs->vcf-variants-with-coding-dna-hgvs nil ref-seq ref-gene))
([hgvs gene ref-seq ref-gene]
(with-open [seq-rdr (cseq/reader ref-seq)]
(doall (protein-hgvs->vcf-variants-with-coding-dna-hgvs hgvs gene seq-rdr ref-gene)))))
(defmethod protein-hgvs->vcf-variants-with-coding-dna-hgvs :ref-gene-path
([hgvs seq-rdr ref-gene] (protein-hgvs->vcf-variants-with-coding-dna-hgvs nil seq-rdr ref-gene))
([hgvs gene seq-rdr ref-gene]
(let [rgidx (rg/index (rg/load-ref-genes ref-gene))]
(protein-hgvs->vcf-variants-with-coding-dna-hgvs hgvs gene seq-rdr rgidx))))
(defmethod protein-hgvs->vcf-variants-with-coding-dna-hgvs :ref-gene-index
([hgvs seq-rdr rgidx] (protein-hgvs->vcf-variants-with-coding-dna-hgvs nil seq-rdr rgidx))
([hgvs gene seq-rdr rgidx]
{:pre [(= (:kind hgvs) :protein)]}
(->> (rg/ref-genes gene rgidx)
(keep #(prot/->vcf-variants-with-coding-dna-hgvs hgvs seq-rdr %))
(apply concat))))