Skip to content

Commit

Permalink
Ensure BED files passed to evaluations are converted to reference mat…
Browse files Browse the repository at this point in the history
…erial coordinates when inputs differ (hg19->GRCh37). Thanks to Severine Catreux.
  • Loading branch information
chapmanb committed Jan 4, 2014
1 parent cb7cc75 commit 6dabe0b
Show file tree
Hide file tree
Showing 11 changed files with 228 additions and 20 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Expand Up @@ -54,6 +54,8 @@ test/data/phased/*-cmp*.vcf
test/data/phased/work/
test/data/digrade/*.idx
test/data/digrade/*.csv
test/data/digrade/*-sorted.bed
test/data/digrade/hg19/*-sorted.bed
test/data/digrade/work/
test/data/svs/*.idx
test/data/svs/*-sorted.bed
Expand Down
2 changes: 2 additions & 0 deletions HISTORY.md
Expand Up @@ -3,6 +3,8 @@
- Avoid errors on converting hg19 to GRCh37 where hg19 variants contain hg19 hap
contigs with no equivalent in GRCh37. It now drops these variants instead
of generating an error. Thanks to Severine Catreux.
- Ensure BED files passed to evaluations are converted to reference material
coordinates when inputs differ (hg19->GRCh37). Thanks to Severine Catreux.
- Avoid issues with running LeftAlignVariants on indels with END tags. Thanks to
Justin Johnson.
- Move to external bcbio.run tool to help abstract out some core functionality
Expand Down
18 changes: 18 additions & 0 deletions src/bcbio/align/interval.clj
Expand Up @@ -30,3 +30,21 @@
(map (partial update-contig-name name-map))
(remove nil?)))))))
out-file))

(defn- maybe-remap-bed
[interval int-ref base-ref out-dir]
(when interval
(if (and (not (nil? int-ref)) (not= base-ref int-ref))
(rename-bed interval base-ref :out-dir out-dir)
interval)))

(defn prep-multi
"Prepare multiple input intervals, remapping chromosome names as needed."
([to-prep ref-file orig out-dir]
(->> to-prep
(map (juxt :intervals :ref))
(map (fn [[bed ref]] (maybe-remap-bed bed ref ref-file out-dir)))
(concat (flatten [orig]))
(remove nil?)))
([to-prep ref-file out-dir]
(prep-multi to-prep ref-file [] out-dir)))
12 changes: 6 additions & 6 deletions src/bcbio/variation/compare.clj
Expand Up @@ -19,7 +19,6 @@
[bcbio.variation.evaluate :only [calc-variant-eval-metrics]]
[bcbio.variation.filter :only [variant-filter variant-format-filter
pipeline-recalibration]]
[bcbio.variation.filter.intervals :only [combine-multiple-intervals]]
[bcbio.variation.multiple :only [prep-cmp-name-lookup pipeline-compare-multiple]]
[bcbio.variation.multisample :only [compare-two-vcf-flexible
multiple-samples?]]
Expand All @@ -32,9 +31,11 @@
[clj-yaml.core :as yaml]
[me.raynes.fs :as fs]
[lonocloud.synthread :as ->]
[bcbio.align.interval :as ainterval]
[bcbio.run.fsp :as fsp]
[bcbio.run.itx :as itx]
[bcbio.run.broad :as broad]
[bcbio.variation.filter.intervals :as fintervals]
[bcbio.variation.grade :as grade]
[bcbio.variation.phasing :as phasing]
[bcbio.variation.report :as report]
Expand All @@ -46,8 +47,7 @@
"Variant comparison producing 3 files: concordant and both directions discordant"
[sample call1 call2 ref & {:keys [out-dir intervals]}]
(let [base-dir (if (nil? out-dir) (fs/parent (:file call1)) out-dir)
ready-intervals (remove nil? (flatten [intervals (:intervals call1)
(:intervals call2)]))
ready-intervals (ainterval/prep-multi [call1 call2] ref intervals out-dir)
sample (or sample (-> call1 :file gvc/get-vcf-header .getGenotypeSamples first))]
(if-not (fs/exists? base-dir)
(fs/mkdirs base-dir))
Expand Down Expand Up @@ -116,7 +116,7 @@
(let [out-dir (get-in config [:dir :prep] (get-in config [:dir :out]))
transition (partial do-transition config)
align-bams (prepare-input-bams exp out-dir)
all-intervals (remove nil? (map :intervals (cons exp (:calls exp))))
all-intervals (ainterval/prep-multi (cons exp (:calls exp)) (:ref exp) out-dir)
start-vcfs (vec (map #(gatk-normalize % exp all-intervals out-dir transition)
(:calls exp)))
_ (transition :combine "Creating merged VCF files for all comparisons")
Expand Down Expand Up @@ -148,8 +148,8 @@
(let [out-dir (get-in config [:dir :prep] (get-in config [:dir :out]))
align-bams (remove nil? (map :align [c1 c2]))]
(when (and (:intervals exp) (seq align-bams))
(combine-multiple-intervals (:intervals exp) align-bams (:ref exp)
:out-dir out-dir :name (:sample exp)))))
(fintervals/combine-multiple (:intervals exp) align-bams (:ref exp)
:out-dir out-dir :name (:sample exp)))))
(discordant-name [x]
(format "%s-discordant" (:name x)))
(zipmap-ordered [xs1 xs2]
Expand Down
15 changes: 8 additions & 7 deletions src/bcbio/variation/filter/intervals.clj
Expand Up @@ -80,6 +80,8 @@
(catch UserException$BadInput e []))
(rest intervals)))))

;; ## Merge BED files

(defn- prep-intervals-by-contig
"Intersect and exclude intervals on a contig."
[start-intervals exclude-intervals loc-parser combine-rule]
Expand Down Expand Up @@ -109,7 +111,7 @@
combine-rule)
contigs))))

(defn combine-multiple-intervals
(defn combine-multiple
"Combine intervals from an initial BED and coverage BAM files."
[initial-bed align-bams ref & {:keys [out-dir name exclude-intervals combine-rule
more-beds]}]
Expand Down Expand Up @@ -137,9 +139,8 @@
(let [base-intervals (:intervals exp)
all-aligns (set (remove nil? (map :align (cons exp (:calls exp)))))]
(when (and base-intervals (seq all-aligns))
(combine-multiple-intervals base-intervals all-aligns
(:ref exp)
:exclude-intervals (:exclude-intervals exp)
:name (:sample exp)
:out-dir (get-in config [:dir :prep] (get-in config [:dir :out]))))))

(combine-multiple base-intervals all-aligns
(:ref exp)
:exclude-intervals (:exclude-intervals exp)
:name (:sample exp)
:out-dir (get-in config [:dir :prep] (get-in config [:dir :out]))))))
5 changes: 2 additions & 3 deletions src/bcbio/variation/utils/svmerge.clj
Expand Up @@ -45,9 +45,8 @@
(-> (combine/combine-variants [call-safesv svready-file] ref-file
:merge-type :full)
(fs/rename (:calls out-files)))
(-> (intervals/combine-multiple-intervals region-file [] ref-file
:combine-rule :union
:more-beds [sv-bed])
(-> (intervals/combine-multiple region-file [] ref-file
:combine-rule :union :more-beds [sv-bed])
(fs/rename (:regions out-files)))
))
out-files))
18 changes: 18 additions & 0 deletions test/bcbio/variation/test/grade.clj
Expand Up @@ -67,6 +67,24 @@
(-> cmp :grade-breakdown :discordant :snp :shared :hethom) => 1
(-> cmp :c-files :eval-discordant) => out-file)))

(facts "Comparisons where inputs have alternative reference sequence: hg19->GRCh37"
(let [base-dir (fs/file data-dir "digrade")
c1o {:file (str (fs/file base-dir "NA12878-cmp-r1.vcf"))
:prep true
:name "ref" :type "grading-ref"}
c2o {:file (str (fs/file base-dir "hg19" "NA12878-cmp-r3.vcf"))
:name "eval"
:prep true
:intervals (str (fs/file base-dir "hg19" "NA12878-cmp-r3-intervals.bed"))
:ref (str (fs/file data-dir "hg19.fa"))}
exp {:ref ref-file :sample "NA12878" :approach "grade"
:intervals (str (fs/file base-dir "NA12878-cmp-r1-intervals.bed"))}
config {:dir {:out (str (fs/file base-dir "work"))}}]
(fsp/remove-path (get-in config [:dir :out]))
(fs/mkdirs (get-in config [:dir :out]))
(let [[c1 c2] (#'compare/prepare-vcf-calls (assoc exp :calls [c1o c2o]) config)]
(compare-two-vcf c1 c2 exp config))))

(facts "Normalize input VCFs containing END tags"
(let [base-dir (fs/file data-dir "digrade")
c1 {:file (str (fs/file base-dir "NA12878-cmp-r1.vcf"))
Expand Down
8 changes: 4 additions & 4 deletions test/bcbio/variation/test/validate.clj
Expand Up @@ -4,14 +4,14 @@
[bcbio.variation.haploid :exclude [-main]]
[bcbio.variation.filter.attr]
[bcbio.variation.filter.classify]
[bcbio.variation.filter.intervals]
[bcbio.variation.filter]
[bcbio.variation.validate]
[bcbio.variation.variantcontext :exclude [-main]])
(:require [me.raynes.fs :as fs]
[bcbio.run.fsp :as fsp]
[bcbio.run.itx :as itx]
[bcbio.variation.filter.custom :as cf]))
[bcbio.variation.filter.custom :as cf]
[bcbio.variation.filter.intervals :as fintervals]))

(background
(around :facts
Expand Down Expand Up @@ -95,5 +95,5 @@
(cf/freebayes-filter fb-vcf ref) => fb-filter-out)

(facts "Prepare combined interval lists based on filtering criteria"
(combine-multiple-intervals region-bed [align-bam] ref
:exclude-intervals exclude-bed) => region-multi-out)
(fintervals/combine-multiple region-bed [align-bam] ref
:exclude-intervals exclude-bed) => region-multi-out)
1 change: 1 addition & 0 deletions test/data/digrade/NA12878-cmp-r1-intervals.bed
@@ -0,0 +1 @@
22 1 15000
1 change: 1 addition & 0 deletions test/data/digrade/hg19/NA12878-cmp-r3-intervals.bed
@@ -0,0 +1 @@
chr22 1 15000

0 comments on commit 6dabe0b

Please sign in to comment.