Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Read multiple contigs in VCF meta info #25

Merged
merged 3 commits into from Jan 31, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
19 changes: 13 additions & 6 deletions src/cljam/vcf/reader.clj
Expand Up @@ -37,18 +37,25 @@
{(->kebab-case-keyword k) (dot->nil (or v2 v1))}))
(apply merge)))

(defn- parse-meta-info-contig
[m]
(update m :length str->long))

(defn- parse-meta-info-info
[s]
(update (parse-structured-line s) :number str->long))
[m]
(update m :number (fn [s]
(if (#{"A" "R" "G"} s)
s
(str->long s)))))

(defn- parse-meta-info-line
[line]
(let [[_ k* v] (re-find #"^##([\w:/\.\?\-]*)=(.*)$" line)
k (->kebab-case-keyword k*)]
[k (if-let [[_ s] (re-find #"^<(.+)>$" v)]
(if (#{:info :format} k)
(parse-meta-info-info s)
(parse-structured-line s))
(cond-> (parse-structured-line s)
(#{:info :format} k) parse-meta-info-info
(= k :contig) parse-meta-info-contig)
v)]))

(defn load-meta-info
Expand All @@ -57,7 +64,7 @@
(if (meta-line? line)
(let [[k v] (parse-meta-info-line line)]
(recur (.readLine rdr)
(if (#{:info :filter :format :alt :sample :pedigree} k)
(if (#{:contig :info :filter :format :alt :sample :pedigree} k)
(if (get meta-info k)
(update meta-info k conj v)
(assoc meta-info k [v]))
Expand Down
45 changes: 35 additions & 10 deletions src/cljam/vcf/writer.clj
Expand Up @@ -33,6 +33,23 @@
;; Writing meta-information
;; ------------------------

(defn- stringify-key
[k]
(if (#{:info :filter :format :alt :sample :pedigree} k)
(cstr/upper-case (name k))
(->camelCaseString k)))

(defn- stringify-meta-info-contig
[m]
(->> (cond-> [(str "ID=" (:id m))
(str "length=" (:length m))
(str "assembly=" (:assembly m))
(str "md5=" (:md-5 m))]
(:species m) (conj (str "species=\"" (:species m) "\""))
(:taxonomy m) (conj (str "taxonomy=" (:taxonomy m))))
(interpose ",")
(apply str)))

(defn- stringify-meta-info-info
[m]
(->> (cond-> [(str "ID=" (:id m))
Expand Down Expand Up @@ -87,6 +104,7 @@
(defn- stringify-structured-line
[k m]
(let [f (case k
:contig stringify-meta-info-contig
:info stringify-meta-info-info
:filter stringify-meta-info-filter
:format stringify-meta-info-format
Expand All @@ -95,18 +113,25 @@
:pedigree stringify-meta-info-pedigree)]
(f m)))

(defn- write-meta-info1
[^VCFWriter wtr k v]
(if-not (nil? v)
(if (vector? v)
(doseq [x v]
(write-line (.writer wtr) (str meta-info-prefix
(stringify-key k)
"=<" (stringify-structured-line k x) ">")))
(write-line (.writer wtr) (str meta-info-prefix
(stringify-key k)
"=" v)))))

(defn write-meta-info
[^VCFWriter wtr meta-info]
(write-line (.writer wtr) (str meta-info-prefix "fileformat="
(:fileformat meta-info default-fileformat)))
(doseq [[k v] (dissoc meta-info :fileformat :info :filter :format :alt :sample
:pedigree)]
(write-line (.writer wtr) (str meta-info-prefix (->camelCaseString k) "="
v)))
(doseq [k [:info :filter :format :alt :sample :pedigree]
v (get meta-info k)]
(write-line (.writer wtr) (str meta-info-prefix (cstr/upper-case (name k))
"=<" (stringify-structured-line k v) ">"))))
(write-meta-info1 wtr :fileformat (:fileformat meta-info default-fileformat))
(doseq [k [:file-date :source :reference :contig :phasing]]
(write-meta-info1 wtr k (get meta-info k)))
(doseq [k [:info :filter :format :alt :sample :pedigree]]
(write-meta-info1 wtr k (get meta-info k))))

;; Writing header
;; --------------
Expand Down
File renamed without changes.
24 changes: 24 additions & 0 deletions test-resources/vcf/test-v4_3.vcf
@@ -0,0 +1,24 @@
##fileformat=VCFv4.3
##fileDate=20090805
##source=myImputationProgramV3.1
##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta
##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="Homo sapiens",taxonomy=x>
##phasing=partial
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
##FILTER=<ID=q10,Description="Quality below 10">
##FILTER=<ID=s50,Description="Less than 50% of samples have data">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3
20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4
20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2
20 1234567 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
46 changes: 42 additions & 4 deletions test/cljam/t_common.clj
Expand Up @@ -97,7 +97,8 @@

;; ### VCF files

(def test-vcf-file "test-resources/test.vcf")
(def test-vcf-v4_0-file "test-resources/vcf/test-v4_0.vcf")
(def test-vcf-v4_3-file "test-resources/vcf/test-v4_3.vcf")

(def test-sam
{:header {:SQ [{:SN "ref", :LN 45} {:SN "ref2", :LN 40}]}
Expand Down Expand Up @@ -321,7 +322,7 @@

;; ### VCF

(def test-vcf-meta-info
(def test-vcf-v4_0-meta-info
{:fileformat "VCFv4.0"
:file-date "20090805"
:source "myImputationProgramV3.1"
Expand All @@ -344,11 +345,11 @@
:alt [{:id "DEL:ME:ALU", :description "Deletion of ALU element"}
{:id "CNV", :description "Copy number variable region"}]})

(def test-vcf-header
(def test-vcf-v4_0-header
["CHROM" "POS" "ID" "REF" "ALT" "QUAL" "FILTER" "INFO" "FORMAT" "NA00001"
"NA00002" "NA00003"])

(def test-vcf-variants
(def test-vcf-v4_0-variants
'({:chrom "19", :pos 111, :id nil, :ref "A", :alt ["C"], :qual 9.6, :filter nil, :info nil,
:FORMAT "GT:HQ", :NA00001 "0|0:10,10", :NA00002 "0|0:10,10", :NA00003 "0/1:3,3"}
{:chrom "19", :pos 112, :id nil, :ref "A", :alt ["G"], :qual 10.0, :filter nil, :info nil,
Expand All @@ -373,3 +374,40 @@
:FORMAT "GT:DP:GQ", :NA00001 ".:3:10", :NA00002 "./.:.:.", :NA00003 "0|2:3:."}
{:chrom "X", :pos 12, :id nil, :ref "T", :alt ["A"], :qual 13.0, :filter nil, :info nil,
:FORMAT "GT", :NA00001 "0", :NA00002 "1/0", :NA00003 "1/1"}))

(def test-vcf-v4_3-meta-info
{:fileformat "VCFv4.3"
:file-date "20090805"
:source "myImputationProgramV3.1"
:reference "file:///seq/references/1000GenomesPilot-NCBI36.fasta"
:contig [{:id "20", :length 62435964, :assembly "B36",
:md-5 "f126cdf8a6e0c7f379d618ff66beb2da", :species "Homo sapiens", :taxonomy "x"}]
:phasing "partial"
:info [{:id "NS", :number 1, :type "Integer", :description "Number of Samples With Data"}
{:id "DP", :number 1, :type "Integer", :description "Total Depth"}
{:id "AF", :number "A", :type "Float", :description "Allele Frequency"}
{:id "AA", :number 1, :type "String", :description "Ancestral Allele"}
{:id "DB", :number 0, :type "Flag", :description "dbSNP membership, build 129"}
{:id "H2", :number 0, :type "Flag", :description "HapMap2 membership"}]
:filter [{:id "q10", :description "Quality below 10"}
{:id "s50", :description "Less than 50% of samples have data"}]
:format [{:id "GT", :number 1, :type "String", :description "Genotype"}
{:id "GQ", :number 1, :type "Integer", :description "Genotype Quality"}
{:id "DP", :number 1, :type "Integer", :description "Read Depth"}
{:id "HQ", :number 2, :type "Integer", :description "Haplotype Quality"}]})

(def test-vcf-v4_3-header
["CHROM" "POS" "ID" "REF" "ALT" "QUAL" "FILTER" "INFO" "FORMAT" "NA00001"
"NA00002" "NA00003"])

(def test-vcf-v4_3-variants
'({:chrom "20", :pos 14370, :id "rs6054257", :ref "G", :alt ["A"], :qual 29.0, :filter "PASS", :info "NS=3;DP=14;AF=0.5;DB;H2",
:FORMAT "GT:GQ:DP:HQ", :NA00001 "0|0:48:1:51,51", :NA00002 "1|0:48:8:51,51", :NA00003 "1/1:43:5:.,."}
{:chrom "20", :pos 17330, :id nil, :ref "T", :alt ["A"], :qual 3.0, :filter "q10", :info "NS=3;DP=11;AF=0.017",
:FORMAT "GT:GQ:DP:HQ", :NA00001 "0|0:49:3:58,50", :NA00002 "0|1:3:5:65,3", :NA00003 "0/0:41:3"}
{:chrom "20", :pos 1110696, :id "rs6040355", :ref "A", :alt ["G" "T"], :qual 67.0, :filter "PASS", :info "NS=2;DP=10;AF=0.333,0.667;AA=T;DB",
:FORMAT "GT:GQ:DP:HQ",:NA00001 "1|2:21:6:23,27", :NA00002 "2|1:2:0:18,2", :NA00003 "2/2:35:4"}
{:chrom "20", :pos 1230237, :id nil, :ref "T", :alt nil, :qual 47.0, :filter "PASS", :info "NS=3;DP=13;AA=T",
:FORMAT "GT:GQ:DP:HQ", :NA00001 "0|0:54:7:56,60", :NA00002 "0|0:48:4:51,51", :NA00003 "0/0:61:2"}
{:chrom "20", :pos 1234567, :id "microsat1", :ref "GTC", :alt ["G" "GTCT"], :qual 50.0, :filter "PASS", :info "NS=3;DP=9;AA=G",
:FORMAT "GT:GQ:DP", :NA00001 "0/1:35:4", :NA00002 "0/2:17:2", :NA00003 "1/1:40:3"}))
48 changes: 33 additions & 15 deletions test/cljam/t_vcf.clj
Expand Up @@ -17,23 +17,41 @@
(with-open [wtr (vcf/writer f meta-info header)]
(vcf/write-variants wtr variants)))

(fact "`meta-info` returns meta-information of the VCF as a map"
(with-open [rdr (vcf/reader test-vcf-file)]
(vcf/meta-info rdr) => test-vcf-meta-info))
(fact "`meta-info` returns meta-information of the VCF (v4.0) as a map"
(with-open [rdr (vcf/reader test-vcf-v4_0-file)]
(vcf/meta-info rdr) => test-vcf-v4_0-meta-info))

(fact "`header` returns header line of the VCF as a vector"
(with-open [rdr (vcf/reader test-vcf-file)]
(vcf/header rdr) => test-vcf-header))
(fact "`meta-info` returns meta-information of the VCF (v4.3) as a map"
(with-open [rdr (vcf/reader test-vcf-v4_3-file)]
(vcf/meta-info rdr) => test-vcf-v4_3-meta-info))

(fact "`read-variants` returns data lines of the VCF as a lazy sequence"
(with-open [rdr (vcf/reader test-vcf-file)]
(vcf/read-variants rdr) => test-vcf-variants))
(fact "`header` returns header line of the VCF (v4.0) as a vector"
(with-open [rdr (vcf/reader test-vcf-v4_0-file)]
(vcf/header rdr) => test-vcf-v4_0-header))

(fact "`header` returns header line of the VCF (v4.3) as a vector"
(with-open [rdr (vcf/reader test-vcf-v4_3-file)]
(vcf/header rdr) => test-vcf-v4_3-header))

(fact "`read-variants` returns data lines of the VCF (v4.0) as a lazy sequence"
(with-open [rdr (vcf/reader test-vcf-v4_0-file)]
(vcf/read-variants rdr) => test-vcf-v4_0-variants))

(fact "`read-variants` returns data lines of the VCF (v4.3) as a lazy sequence"
(with-open [rdr (vcf/reader test-vcf-v4_3-file)]
(vcf/read-variants rdr) => test-vcf-v4_3-variants))

(with-state-changes [(before :facts (prepare-cache!))
(after :facts (clean-cache!))]
(fact "about writing VCF"
(spit-vcf-for-test temp-file test-vcf-meta-info test-vcf-header
test-vcf-variants) => anything
(slurp-vcf-for-test temp-file) => {:meta-info test-vcf-meta-info
:header test-vcf-header
:variants test-vcf-variants}))
(fact "about writing VCF (v4.0)"
(spit-vcf-for-test temp-file test-vcf-v4_0-meta-info test-vcf-v4_0-header
test-vcf-v4_0-variants) => anything
(slurp-vcf-for-test temp-file) => {:meta-info test-vcf-v4_0-meta-info
:header test-vcf-v4_0-header
:variants test-vcf-v4_0-variants})
(fact "about writing VCF (v4.3)"
(spit-vcf-for-test temp-file test-vcf-v4_3-meta-info test-vcf-v4_3-header
test-vcf-v4_3-variants) => anything
(slurp-vcf-for-test temp-file) => {:meta-info test-vcf-v4_3-meta-info
:header test-vcf-v4_3-header
:variants test-vcf-v4_3-variants}))