/
vcfsample.clj
62 lines (57 loc) · 2.72 KB
/
vcfsample.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
(ns bcbio.variation.ensemble.vcfsample
"Sort VCF sample columns to have a consistent order between multiple inputs.
Variant callers order called outputs differently and this ensures they are
consistent to feed into ensemble calling."
(:import [htsjdk.samtools.util BlockCompressedInputStream BlockCompressedOutputStream])
(:require [bcbio.run.fsp :as fsp]
[bcbio.variation.ensemble.prep :as eprep]
[bcbio.variation.variantcontext :as vc]
[clojure.java.io :as io]
[clojure.string :as string]))
(defn- sort-sample-line
"Sort samples in a VCF line using reordered indexes from calculate-reorder."
[line reorder]
(let [[keep samples] (split-at 9 (string/split line #"\t"))]
(string/join "\t"
(concat keep
(->> samples
(map-indexed vector)
(sort-by (fn [[i x]] (get reorder i)))
(map second))))))
(defn- calculate-reorder
"Create a dictionary of sample indexes in the original VCF to those desired in the output."
[orig-order want-order]
(let [want-indexes (reduce (fn [coll [i x]]
(assoc coll x i))
{} (map-indexed vector want-order))]
(reduce (fn [coll [i x]]
(assoc coll i (get want-indexes x)))
{} (map-indexed vector orig-order))))
(defn- sort-samples
"Sort samples in a VCF file, moving from orig-order to want-order."
[vcf-file orig-order want-order all-vcfs work-dir]
(let [out-file (eprep/unique-work-file vcf-file "ssort" all-vcfs work-dir)
sample-reorder (calculate-reorder orig-order want-order)]
(with-open [rdr (io/reader (BlockCompressedInputStream. (io/file vcf-file)))
wtr (io/writer (BlockCompressedOutputStream. (io/file out-file)))]
(doseq [line (line-seq rdr)]
(.write wtr (str (if (.startsWith line "##")
line
(sort-sample-line line sample-reorder))
"\n"))))
(eprep/bgzip-index-vcf out-file)
out-file))
(defn- maybe-sort-names
"Extract sample names for the current file and do sorting if needed."
[vcf-file sorder all-vcfs work-dir]
(let [cur-sorder (vc/get-vcf-samples vcf-file)]
(if (not= cur-sorder sorder)
(sort-samples vcf-file cur-sorder sorder all-vcfs work-dir)
vcf-file)))
(defn consistent-order
"Ensure the set of VCF files have a consistent sample order relative to the first."
[vcf-files work-dir]
(fsp/safe-mkdir work-dir)
(let [sorder (vc/get-vcf-samples (first vcf-files))]
(cons (first vcf-files)
(map #(maybe-sort-names % sorder vcf-files work-dir) (rest vcf-files)))))