Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix I/O of variants with empty info/individual fields #167

Merged
merged 4 commits into from Jun 21, 2019
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 3 additions & 3 deletions project.clj
Expand Up @@ -12,7 +12,7 @@
[com.climate/claypoole "1.1.4"]
[camel-snake-kebab "0.4.0"]
[proton "0.1.8"]]
:profiles {:dev {:dependencies [[org.clojure/clojure "1.9.0"]
:profiles {:dev {:dependencies [[org.clojure/clojure "1.10.1"]
[cavia "0.5.1"]
[criterium "0.4.5"]
[net.totakke/libra "0.1.1"]
Expand All @@ -31,8 +31,8 @@
:1.7 {:dependencies [[org.clojure/clojure "1.7.0"]]}
:1.8 {:dependencies [[org.clojure/clojure "1.8.0"]]}
:1.9 {:dependencies [[org.clojure/clojure "1.9.0"]]}
:1.10 {:dependencies [[org.clojure/clojure "1.10.1-RC1"]]}
:uberjar {:dependencies [[org.clojure/clojure "1.9.0"]
:1.10 {:dependencies [[org.clojure/clojure "1.10.1"]]}
:uberjar {:dependencies [[org.clojure/clojure "1.10.1"]
[org.apache.logging.log4j/log4j-api "2.11.2"]
[org.apache.logging.log4j/log4j-core "2.11.2"]]
:resource-paths ["bin-resources"]
Expand Down
8 changes: 4 additions & 4 deletions src/cljam/io/bcf/reader.clj
Expand Up @@ -187,16 +187,16 @@
indiv (map
(fn [i] (into
{}
(keep
(map
(fn [[k vs]]
(let [tag (formats k)
v (nth vs i)]
(when-not (or (nil? v) (= [nil] v))
[(:kw tag)
[(:kw tag)
(when-not (or (nil? v) (= [nil] v))
(cond
(= (:kw tag) :GT) (vcf-util/ints->genotype v)
(and (= (:number tag) 1) (sequential? v)) (first v)
:else v)])))) gts))
:else v))]))) gts))
(range (:n-sample variant)))
v (-> (dissoc variant :genotype)
(dissoc :ref-length)
Expand Down
17 changes: 11 additions & 6 deletions src/cljam/io/vcf/util.clj
Expand Up @@ -39,7 +39,7 @@
(into
{}
(map (fn [ss]
(let [[k vs] (cstr/split ss #"\=")]
(let [[k vs] (cstr/split ss #"\=" 2)]
[(keyword k) ((parser-map k) vs)])))))))))

(defn info-stringifier
Expand All @@ -50,15 +50,16 @@
(let [id-ordered (mapv :id info-meta)
info-type (into {} (map (juxt :id :type)) info-meta)]
(fn [info]
(when (some? info)
(when info
(->> id-ordered
(keep
(fn [k]
(when-let [v (info (keyword k))]
(if (or (= v :exists) (= (info-type k) "Flag"))
k
(str k "=" (if (sequential? v) (cstr/join "," v) v))))))
(cstr/join ";"))))))
(cstr/join ";")
not-empty)))))

(defn parse-filter
"Parses FILTER field and returns a sequence of keywords."
Expand Down Expand Up @@ -184,7 +185,10 @@
(fn [^String format-line sample-line]
(when-not (dot-or-nil? format-line)
(let [ks (cstr/split format-line #":")
vs (cstr/split sample-line #":")]
vs (concat
(when (not-empty sample-line)
(cstr/split sample-line #":"))
(repeat nil))]
(into
{}
(map (fn [[k ^String v]]
Expand All @@ -195,7 +199,7 @@
"Converts sample map into string. formats must be a sequence of keys in sample-map."
[formats sample-map]
(->> formats
(map (fn [k] [k (sample-map k)]))
(map (fn [k] [k (get sample-map k)]))
reverse
(drop-while (fn [[_ v]] (or (nil? v) (= [nil] v))))
(map (fn [[_ v]]
Expand All @@ -204,7 +208,8 @@
(nil? v) "."
:else v)))
reverse
(cstr/join ":")))
(cstr/join ":")
not-empty))

(defn variant-parser
"Returns a parser function to parse :filter, :info, :FORMAT and sample columns of VCF.
Expand Down
2 changes: 1 addition & 1 deletion src/cljam/io/vcf/writer.clj
Expand Up @@ -32,7 +32,7 @@

(defn- nil->dot
[s]
(if (nil? s) "." s))
(or s "."))
r6eve marked this conversation as resolved.
Show resolved Hide resolved

(defn- write-line
[^BufferedWriter bwtr ^String s]
Expand Down
40 changes: 20 additions & 20 deletions test/cljam/io/vcf/util_test.clj
Expand Up @@ -14,7 +14,7 @@
{:id "HOMLEN", :number ".", :type "Integer"}
{:id "CC", :number 1, :type "Character"}])]
(are [?info-str ?expected]
(= (parse-info ?info-str) ?expected)
(= (parse-info ?info-str) ?expected)
"." nil
"NS=3" {:NS 3}
"DP=0" {:DP 0}
Expand All @@ -35,7 +35,7 @@

(deftest about-parse-filter
(are [?filter-str ?expected]
(= (vcf-util/parse-filter ?filter-str) ?expected)
(= (vcf-util/parse-filter ?filter-str) ?expected)
"." nil
"PASS" [:PASS]
"q10" [:q10]
Expand All @@ -50,7 +50,7 @@
{:id "AF", :number 1, :type "Float"}
{:id "CC", :number 1, :type "Character"}])]
(are [?format-str ?sample-str ?expected]
(= (parse-sample ?format-str ?sample-str) ?expected)
(= (parse-sample ?format-str ?sample-str) ?expected)
"." "." nil
"GT" "0/0" {:GT "0/0"}
"GQ" "48" {:GQ 48}
Expand All @@ -62,11 +62,11 @@
"HQ" ".,." {:HQ [nil nil]}
"HQ" "." {:HQ nil}
"GT:GQ:DP:HQ" "2|1:2:0:18,2" {:GT "2|1", :GQ 2, :DP 0, :HQ [18 2]}
"GT:GQ:DP:HQ" "2/2:35:4" {:GT "2/2", :GQ 35, :DP 4})))
"GT:GQ:DP:HQ" "2/2:35:4" {:GT "2/2", :GQ 35, :DP 4, :HQ nil})))

(deftest about-parse-genotype
(are [?gt-str ?expected]
(= (vcf-util/parse-genotype ?gt-str) ?expected)
(= (vcf-util/parse-genotype ?gt-str) ?expected)
"." nil
"0" [[0 true]]
"1" [[1 true]]
Expand All @@ -83,7 +83,7 @@

(deftest about-stringify-genotype
(are [?gt ?expected]
(= (vcf-util/stringify-genotype ?gt) ?expected)
(= (vcf-util/stringify-genotype ?gt) ?expected)
nil nil
[[0 true]] "0"
[[1 true]] "1"
Expand All @@ -102,7 +102,7 @@

(deftest genotype-seq
(are [?ploidy ?n-alt-alleles ?expected]
(= ?expected (vcf-util/genotype-seq ?ploidy ?n-alt-alleles))
(= ?expected (vcf-util/genotype-seq ?ploidy ?n-alt-alleles))
1 1 [[0] [1]]
1 2 [[0] [1] [2]]
2 1 [[0 0] [0 1] [1 1]]
Expand All @@ -114,7 +114,7 @@

(deftest genotype-index
(are [?genotype ?expected]
(= ?expected (vcf-util/genotype-index ?genotype))
(= ?expected (vcf-util/genotype-index ?genotype))
[0] 0
[1] 1
[0 0] 0
Expand All @@ -130,8 +130,8 @@

(deftest about-genotypes
(are [?ploidy ?n-alt-alleles]
(let [x (vcf-util/genotype-seq ?ploidy ?n-alt-alleles)]
(= (range (count x)) (map vcf-util/genotype-index x)))
(let [x (vcf-util/genotype-seq ?ploidy ?n-alt-alleles)]
(= (range (count x)) (map vcf-util/genotype-index x)))
1 0
1 1
1 2
Expand All @@ -154,7 +154,7 @@

(deftest biallelic-genotype
(are [?genotype ?target-allele ?expected]
(= ?expected (vcf-util/biallelic-genotype ?genotype ?target-allele))
(= ?expected (vcf-util/biallelic-genotype ?genotype ?target-allele))
"0" 1 "0"
"1" 1 "1"
"2" 1 "0"
Expand All @@ -178,8 +178,8 @@

(deftest biallelic-coll
(are [?ploidy ?n-alt-alleles ?target-allele ?coll ?expected]
(= ?expected
(vcf-util/biallelic-coll ?ploidy ?n-alt-alleles ?target-allele ?coll))
(= ?expected
(vcf-util/biallelic-coll ?ploidy ?n-alt-alleles ?target-allele ?coll))
2 1 1 [10 20 30] [10 20 30]
2 2 1 [10 20 30 40 50 60] [10 20 30]
2 2 2 [10 20 30 40 50 60] [10 40 60]
Expand All @@ -188,7 +188,7 @@
(deftest about-parse-variant-v4_3
(let [parse-variant (vcf-util/variant-parser test-vcf-v4_3-meta-info test-vcf-v4_3-header)]
(are [?variant ?expected]
(= (parse-variant ?variant) ?expected)
(= (parse-variant ?variant) ?expected)
(nth test-vcf-v4_3-variants 0) (nth test-vcf-v4_3-variants-deep 0)
(nth test-vcf-v4_3-variants 1) (nth test-vcf-v4_3-variants-deep 1)
(nth test-vcf-v4_3-variants 2) (nth test-vcf-v4_3-variants-deep 2)
Expand All @@ -198,7 +198,7 @@
(deftest about-parse-variant-v4_0
(let [parse-variant (vcf-util/variant-parser test-vcf-v4_0-meta-info test-vcf-v4_0-header)]
(are [?variant ?expected]
(= (parse-variant ?variant) ?expected)
(= (parse-variant ?variant) ?expected)
(nth test-vcf-v4_0-variants 0) (nth test-vcf-v4_0-variants-deep 0)
(nth test-vcf-v4_0-variants 1) (nth test-vcf-v4_0-variants-deep 1)
(nth test-vcf-v4_0-variants 2) (nth test-vcf-v4_0-variants-deep 2)
Expand All @@ -215,7 +215,7 @@
(deftest about-stringify-variant-vals-v4_3
(let [stringify-variant-vals (vcf-util/variant-vals-stringifier test-vcf-v4_3-meta-info test-vcf-v4_3-header)]
(are [?expected ?variant]
(= (stringify-variant-vals ?variant) ?expected)
(= (stringify-variant-vals ?variant) ?expected)
(nth test-vcf-v4_3-variants 0) (nth test-vcf-v4_3-variants-deep 0)
(nth test-vcf-v4_3-variants 1) (nth test-vcf-v4_3-variants-deep 1)
(nth test-vcf-v4_3-variants 2) (nth test-vcf-v4_3-variants-deep 2)
Expand All @@ -225,7 +225,7 @@
(deftest about-stringify-variant-vals-v4_0
(let [stringify-variant-vals (vcf-util/variant-vals-stringifier test-vcf-v4_0-meta-info test-vcf-v4_0-header)]
(are [?expected ?variant]
(= (stringify-variant-vals ?variant) ?expected)
(= (stringify-variant-vals ?variant) ?expected)
(nth test-vcf-v4_0-variants 0) (nth test-vcf-v4_0-variants-deep 0)
(nth test-vcf-v4_0-variants 1) (nth test-vcf-v4_0-variants-deep 1)
(nth test-vcf-v4_0-variants 2) (nth test-vcf-v4_0-variants-deep 2)
Expand All @@ -244,7 +244,7 @@

(deftest parse-breakend
(are [?alt ?expected]
(= ?expected (vcf-util/parse-breakend ?alt))
(= ?expected (vcf-util/parse-breakend ?alt))

"]13:123456]T" {:chr "13", :pos 123456, :strand :forward,
:join :before, :bases "T"}
Expand Down Expand Up @@ -300,7 +300,7 @@

(deftest stringify-breakend
(are [?expected ?bnd]
(= ?expected (vcf-util/stringify-breakend ?bnd))
(= ?expected (vcf-util/stringify-breakend ?bnd))
"]13:123456]T" {:chr "13", :pos 123456, :strand :forward,
:join :before, :bases "T"}
"]13:123456]AGTNNNNNCAT" {:chr "13", :pos 123456, :strand :forward,
Expand Down Expand Up @@ -339,7 +339,7 @@

(deftest inspect-allele
(are [?ref ?alt ?expected]
(= ?expected (vcf-util/inspect-allele ?ref ?alt))
(= ?expected (vcf-util/inspect-allele ?ref ?alt))

"A" "" {:type :no-call} ;; malformed
"A" "." {:type :no-call}
Expand Down
35 changes: 34 additions & 1 deletion test/cljam/io/vcf/writer_test.clj
@@ -1,7 +1,9 @@
(ns cljam.io.vcf.writer-test
(:require [clojure.test :refer :all]
[cljam.test-common :refer :all]
[cljam.io.vcf.writer :as vcf-writer]))
[cljam.io.vcf.writer :as vcf-writer])
(:import [java.io StringWriter BufferedWriter]
[cljam.io.vcf.writer VCFWriter]))

(deftest stringify-meta-info-pedigree
(is (= (#'vcf-writer/stringify-structured-line :pedigree
Expand All @@ -28,3 +30,34 @@
:description "test"
:note "extra note"})
"ID=Blood,Genomes=Germline,Mixture=1.,Description=\"test\",Note=\"extra note\"")))

;; Checks the text emitted by vcf-writer/write-variants when the INFO, FORMAT
;; and per-sample values of a variant are absent or empty: every such column
;; is expected to be written as ".".
(deftest empty-info-format
;; meta-info declares exactly one INFO id (XA) and one FORMAT id (XB).
(let [meta-info {:info [{:id "XA", :type "String", :number 1}],
:format [{:id "XB", :type "String", :number 1}]}
;; Eight fixed columns, then FORMAT and two sample columns (11 fields per row).
header ["CHROM" "POS" "ID" "REF" "ALT" "QUAL" "FILTER" "INFO"
"FORMAT" "SAMPLE01" "SAMPLE02"]]
;; Each row below is a pair: the variant map to write, then the exact line expected.
(are [?variant ?str]
(= ?str
;; Write a single variant into an in-memory StringWriter through a BufferedWriter.
;; NOTE(review): the first VCFWriter constructor argument is passed as nil here;
;; its meaning is not visible in this view - confirm against the VCFWriter definition.
(with-open [sw (StringWriter.)
bw (BufferedWriter. sw)
w (VCFWriter. nil bw meta-info header)]
(vcf-writer/write-variants w [?variant])
;; Flush before reading: with-open has not closed the writers yet at this point.
(.flush bw)
(str sw)))
;; No :info, :FORMAT or sample keys at all.
{:chr "1", :pos 1, :ref "N"}
"1\t1\t.\tN\t.\t.\t.\t.\t.\t.\t.\n"

;; Empty :info map.
{:chr "2", :pos 2, :ref "N", :info {}}
"2\t2\t.\tN\t.\t.\t.\t.\t.\t.\t.\n"

;; Empty :info map and empty :FORMAT vector.
{:chr "3", :pos 3, :ref "N", :info {}, :FORMAT []}
"3\t3\t.\tN\t.\t.\t.\t.\t.\t.\t.\n"

;; Additionally an empty map for one of the samples.
{:chr "4", :pos 4, :ref "N", :info {}, :FORMAT [], :SAMPLE01 {}}
"4\t4\t.\tN\t.\t.\t.\t.\t.\t.\t.\n"

;; :info holds only :XX, which is not declared in meta-info; INFO is expected as ".".
{:chr "5", :pos 5, :ref "N", :info {:XX "5"}}
"5\t5\t.\tN\t.\t.\t.\t.\t.\t.\t.\n"

;; Declared INFO (XA) and FORMAT (XB) are written; SAMPLE01 is an empty map and
;; SAMPLE02 is absent, and both sample columns are expected as ".".
{:chr "6", :pos 6, :ref "N", :info {:XA "6"}, :FORMAT [:XB], :SAMPLE01 {}}
"6\t6\t.\tN\t.\t.\t.\tXA=6\tXB\t.\t.\n")))
46 changes: 36 additions & 10 deletions test/cljam/test_common.clj
Expand Up @@ -690,16 +690,42 @@
:FORMAT "GT:GQ:DP", :NA00001 "0/1:35:4", :NA00002 "0/2:17:2", :NA00003 "1/1:40:3"}))

(def test-vcf-v4_3-variants-deep
`({:chr "20", :pos 14370, :id "rs6054257", :ref "G", :alt ["A"], :qual 29.0, :filter [:PASS], :info {:NS 3, :DP 14, :AF [0.5], :DB :exists, :H2 :exists},
:FORMAT [:GT :GQ :DP :HQ], :NA00001 {:GT "0|0", :GQ 48, :DP 1, :HQ [51 51]}, :NA00002 {:GT "1|0", :GQ 48, :DP 8, :HQ [51 51]}, :NA00003 {:GT "1/1", :GQ 43, :DP 5, :HQ [nil nil]}}
{:chr "20", :pos 17330, :id nil, :ref "T", :alt ["A"], :qual 3.0, :filter [:q10], :info {:NS 3, :DP 11, :AF [~(float 0.017)]},
:FORMAT [:GT :GQ :DP :HQ], :NA00001 {:GT "0|0", :GQ 49, :DP 3, :HQ [58 50]}, :NA00002 {:GT "0|1", :GQ 3, :DP 5, :HQ [65 3]}, :NA00003 {:GT "0/0", :GQ 41, :DP 3}}
{:chr "20", :pos 1110696, :id "rs6040355", :ref "A", :alt ["G" "T"], :qual 67.0, :filter [:PASS], :info {:NS 2, :DP 10, :AF [~(float 0.333) ~(float 0.667)], :AA "T", :DB :exists},
:FORMAT [:GT :GQ :DP :HQ], :NA00001 {:GT "1|2", :GQ 21, :DP 6, :HQ [23 27]}, :NA00002 {:GT "2|1", :GQ 2, :DP 0, :HQ [18 2]}, :NA00003 {:GT "2/2", :GQ 35, :DP 4}}
{:chr "20", :pos 1230237, :id nil, :ref "T", :alt nil, :qual 47.0, :filter [:PASS], :info {:NS 3, :DP 13, :AA "T"},
:FORMAT [:GT :GQ :DP :HQ], :NA00001 {:GT "0|0", :GQ 54, :DP 7, :HQ [56 60]}, :NA00002 {:GT "0|0", :GQ 48, :DP 4, :HQ [51 51]}, :NA00003 {:GT "0/0", :GQ 61, :DP 2}}
{:chr "20", :pos 1234567, :id "microsat1", :ref "GTC", :alt ["G" "GTCT"], :qual 50.0, :filter [:PASS], :info {:NS 3, :DP 9, :AA "G"},
:FORMAT [:GT :GQ :DP], :NA00001 {:GT "0/1", :GQ 35, :DP 4}, :NA00002 {:GT "0/2", :GQ 17, :DP 2}, :NA00003 {:GT "1/1", :GQ 40, :DP 3}}))
`({:chr "20", :pos 14370, :id "rs6054257",
:ref "G", :alt ["A"], :qual 29.0, :filter [:PASS],
:info {:NS 3, :DP 14, :AF [0.5], :DB :exists, :H2 :exists},
:FORMAT [:GT :GQ :DP :HQ],
:NA00001 {:GT "0|0", :GQ 48, :DP 1, :HQ [51 51]},
:NA00002 {:GT "1|0", :GQ 48, :DP 8, :HQ [51 51]},
:NA00003 {:GT "1/1", :GQ 43, :DP 5, :HQ [nil nil]}}
{:chr "20", :pos 17330, :id nil,
:ref "T", :alt ["A"], :qual 3.0, :filter [:q10],
:info {:NS 3, :DP 11, :AF [~(float 0.017)]},
:FORMAT [:GT :GQ :DP :HQ],
:NA00001 {:GT "0|0", :GQ 49, :DP 3, :HQ [58 50]},
:NA00002 {:GT "0|1", :GQ 3, :DP 5, :HQ [65 3]},
:NA00003 {:GT "0/0", :GQ 41, :DP 3, :HQ nil}}
{:chr "20", :pos 1110696, :id "rs6040355",
:ref "A", :alt ["G" "T"], :qual 67.0, :filter [:PASS],
:info {:NS 2, :DP 10, :AF [~(float 0.333) ~(float 0.667)],
:AA "T", :DB :exists},
:FORMAT [:GT :GQ :DP :HQ],
:NA00001 {:GT "1|2", :GQ 21, :DP 6, :HQ [23 27]},
:NA00002 {:GT "2|1", :GQ 2, :DP 0, :HQ [18 2]},
:NA00003 {:GT "2/2", :GQ 35, :DP 4, :HQ nil}}
{:chr "20", :pos 1230237, :id nil,
:ref "T", :alt nil, :qual 47.0, :filter [:PASS],
:info {:NS 3, :DP 13, :AA "T"},
:FORMAT [:GT :GQ :DP :HQ],
:NA00001 {:GT "0|0", :GQ 54, :DP 7, :HQ [56 60]},
:NA00002 {:GT "0|0", :GQ 48, :DP 4, :HQ [51 51]},
:NA00003 {:GT "0/0", :GQ 61, :DP 2, :HQ nil}}
{:chr "20", :pos 1234567, :id "microsat1",
:ref "GTC", :alt ["G" "GTCT"], :qual 50.0, :filter [:PASS],
:info {:NS 3, :DP 9, :AA "G"},
:FORMAT [:GT :GQ :DP],
:NA00001 {:GT "0/1", :GQ 35, :DP 4},
:NA00002 {:GT "0/2", :GQ 17, :DP 2},
:NA00003 {:GT "1/1", :GQ 40, :DP 3}}))

;; Test fixture: a vector holding one variant map that has the fixed fields
;; (:chr, :pos, :id, :ref, :alt, :qual, :filter, :info) but no :FORMAT key and
;; no sample keys. :id and :qual are nil.
;; NOTE(review): presumably the parsed ("deep") counterpart of a sample-less VCF
;; record, by analogy with test-vcf-v4_3-variants-deep - confirm against its users.
(def test-vcf-no-samples-variants-deep
[{:chr "1", :pos 10, :id nil, :ref "A", :alt ["T"], :qual nil, :filter [:PASS], :info {:DP 10}}])
Expand Down