-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #33 from chrovis/feature/find-aliases
Add function for finding alternative HGVS expressions
- Loading branch information
Showing
3 changed files
with
133 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
(ns varity.hgvs | ||
(:require [cljam.io.sequence :as cseq] | ||
[cljam.io.util :as io-util] | ||
[clojure.string :as string] | ||
[varity.hgvs-to-vcf :as h2v] | ||
[varity.ref-gene :as rg] | ||
[varity.vcf-to-hgvs :as v2h])) | ||
|
||
(defn- trim-version | ||
[transcript] | ||
(when transcript | ||
(string/replace transcript #"([^.pt]+)[.pt]\d+" "$1"))) | ||
|
||
(def ^:private option-patterns | ||
[{:prefer-deletion? false} | ||
{:prefer-deletion? true} | ||
{:prefer-insertion? false} | ||
{:prefer-insertion? true}]) | ||
|
||
(defn- dispatch | ||
[ref-seq ref-gene] | ||
(cond | ||
(string? ref-seq) :ref-seq-path | ||
|
||
(io-util/sequence-reader? ref-seq) | ||
(cond | ||
(string? ref-gene) :ref-gene-path | ||
(instance? varity.ref_gene.RefGeneIndex ref-gene) :ref-gene-index | ||
(map? ref-gene) :ref-gene-entity))) | ||
|
||
(defmulti find-aliases | ||
"Returns alternative expressions of `hgvs`, including the given expression. | ||
There can be multiple HGVS expressions for the same variant. For example, | ||
\"NM_123.4:c.4_6[1]\" and \"NM_123.4:c.7_9del\" can be equivalent in some | ||
cases. When you give either of the two HGVS, `find-aliases` returns both HGVS. | ||
`hgvs` must be a coding DNA HGVS with a transcript. `ref-seq` must be a path | ||
to reference or an instance which implements | ||
`cljam.io.protocols/ISequenceReader`. `ref-gene` must be a path to | ||
refGene.txt(.gz), a ref-gene index, or a ref-gene entity." | ||
{:arglists '([hgvs ref-seq ref-gene])} | ||
(fn [& args] | ||
(apply dispatch (take-last 2 args)))) | ||
|
||
(defmethod find-aliases :ref-seq-path | ||
[hgvs ref-seq ref-gene] | ||
(with-open [seq-rdr (cseq/reader ref-seq)] | ||
(doall (find-aliases hgvs seq-rdr ref-gene)))) | ||
|
||
(defmethod find-aliases :ref-gene-path | ||
[hgvs seq-rdr ref-gene] | ||
(let [rgidx (rg/index (rg/load-ref-genes ref-gene))] | ||
(find-aliases hgvs seq-rdr rgidx))) | ||
|
||
(defmethod find-aliases :ref-gene-index | ||
[hgvs seq-rdr rgidx] | ||
{:pre [(= (:kind hgvs) :coding-dna)]} | ||
(letfn [(vcf-variant->coding-dna-hgvs [variant] | ||
(mapcat #(v2h/vcf-variant->coding-dna-hgvs variant seq-rdr rgidx %) | ||
option-patterns))] | ||
(->> (h2v/hgvs->vcf-variants hgvs seq-rdr rgidx) | ||
(mapcat vcf-variant->coding-dna-hgvs) | ||
(filter #(= (:transcript %) (trim-version (:transcript hgvs)))) | ||
distinct))) | ||
|
||
(defmethod find-aliases :ref-gene-entity | ||
[hgvs seq-rdr rg] | ||
{:pre [(= (:kind hgvs) :coding-dna)]} | ||
(when-let [variant (h2v/hgvs->vcf-variants hgvs seq-rdr rg)] | ||
(->> option-patterns | ||
(map #(v2h/vcf-variant->coding-dna-hgvs variant seq-rdr rg %)) | ||
distinct))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
(ns varity.hgvs-test | ||
(:require [clojure.test :refer [are]] | ||
[clj-hgvs.core :as hgvs] | ||
[cljam.io.sequence :as cseq] | ||
[varity.hgvs :as vhgvs] | ||
[varity.t-common :refer [defslowtest cavia-testing test-ref-gene-file | ||
test-ref-seq-file]] | ||
[varity.ref-gene :as rg])) | ||
|
||
(defslowtest find-aliases-test | ||
(cavia-testing "find HGVS aliases" | ||
(with-open [rdr (cseq/reader test-ref-seq-file)] | ||
(let [rgs (rg/load-ref-genes test-ref-gene-file) | ||
rgidx (rg/index rgs) | ||
find-rg (fn [transcript] | ||
(first (filter #(= (:name %) transcript) rgs)))] | ||
(are [s xs] (= (->> (vhgvs/find-aliases (hgvs/parse s) rdr rgidx) | ||
(map hgvs/format) | ||
set) | ||
xs) | ||
;; cf. rs11571587 (+) | ||
"NM_000059:c.162CAA[1]" #{"NM_000059:c.162CAA[1]" | ||
"NM_000059:c.165_167del"} | ||
"NM_000059.3:c.165_167del" #{"NM_000059:c.162CAA[1]" | ||
"NM_000059:c.165_167del"} | ||
|
||
;; cf. rs727502907 (-) | ||
"NM_004333.6:c.-95GCCTCC[3]" #{"NM_004333:c.-95GCCTCC[3]" | ||
"NM_004333:c.-77_-72del"} | ||
"NM_004333:c.-77_-72del" #{"NM_004333:c.-95GCCTCC[3]" | ||
"NM_004333:c.-77_-72del"} | ||
|
||
;; cf. rs2307882 (-) | ||
"NM_144639:c.1510-122AG[3]" #{"NM_144639:c.1510-122AG[3]" | ||
"NM_144639:c.1510-121_1510-120insAGAG"} | ||
"NM_144639:c.1510-121_1510-120insAGAG" #{"NM_144639:c.1510-122AG[3]" | ||
"NM_144639:c.1510-121_1510-120insAGAG"}) | ||
|
||
(are [s rg xs] (= (->> (vhgvs/find-aliases (hgvs/parse s) rdr rg) | ||
(map hgvs/format) | ||
set) | ||
xs) | ||
;; cf. rs11571587 (+) | ||
"NM_000059:c.162CAA[1]" (find-rg "NM_000059") #{"NM_000059:c.162CAA[1]" | ||
"NM_000059:c.165_167del"} | ||
"NM_000059.3:c.165_167del" (find-rg "NM_000059") #{"NM_000059:c.162CAA[1]" | ||
"NM_000059:c.165_167del"} | ||
|
||
;; cf. rs727502907 (-) | ||
"NM_004333.6:c.-95GCCTCC[3]" (find-rg "NM_004333") #{"NM_004333:c.-95GCCTCC[3]" | ||
"NM_004333:c.-77_-72del"} | ||
"NM_004333:c.-77_-72del" (find-rg "NM_004333") #{"NM_004333:c.-95GCCTCC[3]" | ||
"NM_004333:c.-77_-72del"} | ||
|
||
;; cf. rs2307882 (-) | ||
"NM_144639:c.1510-122AG[3]" (find-rg "NM_144639") #{"NM_144639:c.1510-122AG[3]" | ||
"NM_144639:c.1510-121_1510-120insAGAG"} | ||
"NM_144639:c.1510-121_1510-120insAGAG" (find-rg "NM_144639") #{"NM_144639:c.1510-122AG[3]" | ||
"NM_144639:c.1510-121_1510-120insAGAG"}))))) |