Skip to content

Commit

Permalink
Merge pull request #33 from chrovis/feature/find-aliases
Browse files Browse the repository at this point in the history
Add function for finding alternative HGVS expressions
  • Loading branch information
federkasten committed Feb 16, 2020
2 parents 6a8a1f4 + 2fdac05 commit 8f35b27
Show file tree
Hide file tree
Showing 3 changed files with 133 additions and 1 deletion.
73 changes: 73 additions & 0 deletions src/varity/hgvs.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
(ns varity.hgvs
(:require [cljam.io.sequence :as cseq]
[cljam.io.util :as io-util]
[clojure.string :as string]
[varity.hgvs-to-vcf :as h2v]
[varity.ref-gene :as rg]
[varity.vcf-to-hgvs :as v2h]))

(defn- trim-version
[transcript]
(when transcript
(string/replace transcript #"([^.pt]+)[.pt]\d+" "$1")))

(def ^:private option-patterns
[{:prefer-deletion? false}
{:prefer-deletion? true}
{:prefer-insertion? false}
{:prefer-insertion? true}])

(defn- dispatch
[ref-seq ref-gene]
(cond
(string? ref-seq) :ref-seq-path

(io-util/sequence-reader? ref-seq)
(cond
(string? ref-gene) :ref-gene-path
(instance? varity.ref_gene.RefGeneIndex ref-gene) :ref-gene-index
(map? ref-gene) :ref-gene-entity)))

(defmulti find-aliases
"Returns alternative expressions of `hgvs`, including the given expression.
There can be multiple HGVS expressions for the same variant. For example,
\"NM_123.4:c.4_6[1]\" and \"NM_123.4:c.7_9del\" can be equivalent in some
cases. When you give either of the two HGVS, `find-aliases` returns both HGVS.
`hgvs` must be a coding DNA HGVS with a transcript. `ref-seq` must be a path
to reference or an instance which implements
`cljam.io.protocols/ISequenceReader`. `ref-gene` must be a path to
refGene.txt(.gz), a ref-gene index, or a ref-gene entity."
{:arglists '([hgvs ref-seq ref-gene])}
(fn [& args]
(apply dispatch (take-last 2 args))))

(defmethod find-aliases :ref-seq-path
[hgvs ref-seq ref-gene]
(with-open [seq-rdr (cseq/reader ref-seq)]
(doall (find-aliases hgvs seq-rdr ref-gene))))

(defmethod find-aliases :ref-gene-path
[hgvs seq-rdr ref-gene]
(let [rgidx (rg/index (rg/load-ref-genes ref-gene))]
(find-aliases hgvs seq-rdr rgidx)))

(defmethod find-aliases :ref-gene-index
[hgvs seq-rdr rgidx]
{:pre [(= (:kind hgvs) :coding-dna)]}
(letfn [(vcf-variant->coding-dna-hgvs [variant]
(mapcat #(v2h/vcf-variant->coding-dna-hgvs variant seq-rdr rgidx %)
option-patterns))]
(->> (h2v/hgvs->vcf-variants hgvs seq-rdr rgidx)
(mapcat vcf-variant->coding-dna-hgvs)
(filter #(= (:transcript %) (trim-version (:transcript hgvs))))
distinct)))

(defmethod find-aliases :ref-gene-entity
[hgvs seq-rdr rg]
{:pre [(= (:kind hgvs) :coding-dna)]}
(when-let [variant (h2v/hgvs->vcf-variants hgvs seq-rdr rg)]
(->> option-patterns
(map #(v2h/vcf-variant->coding-dna-hgvs variant seq-rdr rg %))
distinct)))
2 changes: 1 addition & 1 deletion src/varity/vcf_to_hgvs.clj
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@
seq-rdr rg)]
(when (:verbose? options)
(print-debug-info nv seq-rdr rg))
(coding-dna/->hgvs (assoc nv :rg rg) seq-rdr rg))
(coding-dna/->hgvs (assoc nv :rg rg) seq-rdr rg options))
(throw (ex-info "ref is not found on the position."
{:type ::invalid-ref
:variant {:chr chr, :pos pos, :ref ref, :alt alt}})))))
Expand Down
59 changes: 59 additions & 0 deletions test/varity/hgvs_test.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
(ns varity.hgvs-test
(:require [clojure.test :refer [are]]
[clj-hgvs.core :as hgvs]
[cljam.io.sequence :as cseq]
[varity.hgvs :as vhgvs]
[varity.t-common :refer [defslowtest cavia-testing test-ref-gene-file
test-ref-seq-file]]
[varity.ref-gene :as rg]))

(defslowtest find-aliases-test
(cavia-testing "find HGVS aliases"
(with-open [rdr (cseq/reader test-ref-seq-file)]
(let [rgs (rg/load-ref-genes test-ref-gene-file)
rgidx (rg/index rgs)
find-rg (fn [transcript]
(first (filter #(= (:name %) transcript) rgs)))]
(are [s xs] (= (->> (vhgvs/find-aliases (hgvs/parse s) rdr rgidx)
(map hgvs/format)
set)
xs)
;; cf. rs11571587 (+)
"NM_000059:c.162CAA[1]" #{"NM_000059:c.162CAA[1]"
"NM_000059:c.165_167del"}
"NM_000059.3:c.165_167del" #{"NM_000059:c.162CAA[1]"
"NM_000059:c.165_167del"}

;; cf. rs727502907 (-)
"NM_004333.6:c.-95GCCTCC[3]" #{"NM_004333:c.-95GCCTCC[3]"
"NM_004333:c.-77_-72del"}
"NM_004333:c.-77_-72del" #{"NM_004333:c.-95GCCTCC[3]"
"NM_004333:c.-77_-72del"}

;; cf. rs2307882 (-)
"NM_144639:c.1510-122AG[3]" #{"NM_144639:c.1510-122AG[3]"
"NM_144639:c.1510-121_1510-120insAGAG"}
"NM_144639:c.1510-121_1510-120insAGAG" #{"NM_144639:c.1510-122AG[3]"
"NM_144639:c.1510-121_1510-120insAGAG"})

(are [s rg xs] (= (->> (vhgvs/find-aliases (hgvs/parse s) rdr rg)
(map hgvs/format)
set)
xs)
;; cf. rs11571587 (+)
"NM_000059:c.162CAA[1]" (find-rg "NM_000059") #{"NM_000059:c.162CAA[1]"
"NM_000059:c.165_167del"}
"NM_000059.3:c.165_167del" (find-rg "NM_000059") #{"NM_000059:c.162CAA[1]"
"NM_000059:c.165_167del"}

;; cf. rs727502907 (-)
"NM_004333.6:c.-95GCCTCC[3]" (find-rg "NM_004333") #{"NM_004333:c.-95GCCTCC[3]"
"NM_004333:c.-77_-72del"}
"NM_004333:c.-77_-72del" (find-rg "NM_004333") #{"NM_004333:c.-95GCCTCC[3]"
"NM_004333:c.-77_-72del"}

;; cf. rs2307882 (-)
"NM_144639:c.1510-122AG[3]" (find-rg "NM_144639") #{"NM_144639:c.1510-122AG[3]"
"NM_144639:c.1510-121_1510-120insAGAG"}
"NM_144639:c.1510-121_1510-120insAGAG" (find-rg "NM_144639") #{"NM_144639:c.1510-122AG[3]"
"NM_144639:c.1510-121_1510-120insAGAG"})))))

0 comments on commit 8f35b27

Please sign in to comment.