/
liftover_bgen_beds.wdl
256 lines (237 loc) · 11 KB
/
liftover_bgen_beds.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
###########################################################################################
# +------------------------------------------------------------+
# | liftover_bgen_beds.wdl |
# list of bgen files | +---------------------+ | list of lifted plink sets
# per chromosomes | +---------------------+ | +------------------------+ | per chromosomes
# | ++-------------------+ | +--->| | |
# UCSC chain file -------+->| | +----->| merge_and_split_by_chr +--+-> merged plink file set
# (*.chain) | | picard_liftovervcf +------->| | | of chromosome 1-22
# | | +-+ +------------------------+ |
# target reference build | +--------------------+ | merged plink file set
# (*.fa.gz) | | of all chromosomes
# +------------------------------------------------------------+
#
###########################################################################################
version 1.0
workflow liftover_bgen_beds {
input {
Array[File]+ bgens
Array[File]+ bgens_sample
File ucsc_chain
File reference_fastagz
String split_par_build_code = "hg38"
Boolean output_autosomal = true
}
String reference = basename(basename(basename(reference_fastagz, ".gz"), ".fa"), ".fasta")
scatter(bgen in bgens){
String file_prefix = basename(bgen, ".bgen")
call picard_liftovervcf {
input:
bgen = bgen,
bgens_sample = bgens_sample,
ucsc_chain = ucsc_chain,
file_prefix = file_prefix,
reference_fastagz = reference_fastagz,
split_par_build_code = split_par_build_code,
reference = reference
}
}
call merge_and_split_by_chr {
input:
plink_beds = picard_liftovervcf.lifted_plink_bed,
plink_bims = picard_liftovervcf.lifted_plink_bim,
plink_fams = picard_liftovervcf.lifted_plink_fam,
split_par_build_code = split_par_build_code,
reference = reference,
output_autosomal = output_autosomal
}
output {
Array[File] merged_plinks = merge_and_split_by_chr.merged_plinks
Array[File?] autosomal_plinks = merge_and_split_by_chr.autosomal_plinks
Array[File?] per_chr_plinks = merge_and_split_by_chr.per_chr_plinks
Array[File?] unplaced_plinks = merge_and_split_by_chr.unplaced_plinks
Array[File] lifted_vcf = flatten(picard_liftovervcf.lifted_vcf)
}
parameter_meta {
bgens: {
patterns: ["*.bgen"]
}
bgens_sample: {
patterns: ["*.sample"]
}
ucsc_chain: {
patterns: ["*.chain"]
}
reference_fastagz: {
patterns: ["*.fa*.gz", "*.fa", "*.fasta"]
}
}
}
task picard_liftovervcf {
input {
File bgen
Array[File]+ bgens_sample
File ucsc_chain
File reference_fastagz
String split_par_build_code
String file_prefix
String reference
}
Int disk_space = ceil(size(bgen, "GB") * 30)
command <<<
set -x -e -o pipefail
## Get memory on the machine, use 80% of the memory for each operation
memTotal=$(head -n1 /proc/meminfo|awk '{print $2}')
memXmx_m=$(( ${memTotal}*8/10/1024 ))
memXmx_g=$(( ${memTotal}*8/10/1024/1024 ))
## Convert BGEN to VCF
# chromosome codes for PAR1/PAR2 are all saved as chrX
# `--merge-x` may cause ploidies to be wrong when using directly with exporting VCF,
# thus we used `--merge-x` + `--sort-vars` + `--make-bpgen`, then `--export vcf`.
bgen_path="~{bgen}"
bgen_prefix="${bgen_path%.bgen}"
plink2 --bgen "${bgen_prefix}.bgen" ref-first --sample "${bgen_prefix}.sample" --export vcf bgz --out "~{file_prefix}" \
# plink2 --bfile "${plink_prefix}" --make-bpgen --out "~{file_prefix}" \
--merge-x \
--sort-vars n \
--threads $(nproc) --memory "${memXmx_m}"
# plink2 --bpfile "~{file_prefix}" --export vcf bgz --out "~{file_prefix}"
# rm -rf ~{file_prefix}.pgen
rm -rf ~{bgen}
## Generate reference index, required for picard LiftoverVcf
java -jar /opt/picard/picard.jar CreateSequenceDictionary -R "~{reference_fastagz}"
## Run Liftover
java -Xmx"${memXmx_g}"g -jar /opt/picard/picard.jar LiftoverVcf \
-I "~{file_prefix}.vcf.gz" -O "~{file_prefix}_~{reference}.vcf" \
-R "~{reference_fastagz}" -C "~{ucsc_chain}" \
--REJECT "~{file_prefix}_~{reference}_rejected.vcf" \
--RECOVER_SWAPPED_REF_ALT true --DISABLE_SORT true
rm -rf "~{file_prefix}.vcf.gz"
## Sort VCF by coordinate and generate BGZIP VCF using bcftools
# Sorting in picard Liftover requires large memory at file writing stage
# --var-sort
# This is to avoid error thrown by PLINK with VCF with more than one contigs (i.e. splited chromosome)
# when/after converting VCF -> PLINK
bcftools sort -o "~{file_prefix}_~{reference}.vcf.gz" -O z \
--max-mem "${memXmx_g}g" \
"~{file_prefix}_~{reference}.vcf"
rm -rf "~{file_prefix}_~{reference}.vcf"
## Convert VCF back to plink
# `--indiv-sort` is used to enusre PLINK FAM output order remains as the original FAM, as LiftOverVcf changed sample order
# `--allow-extra-chr` is used to permit unplaced contigs
plink2 --vcf "~{file_prefix}_~{reference}.vcf.gz" --make-bed --out "~{file_prefix}_temp" \
--id-delim "_" --indiv-sort f "${plink_prefix}.fam" \
--ref-allele force "~{file_prefix}_~{reference}.vcf.gz" 4 3 '#' \
--alt1-allele force "~{file_prefix}_~{reference}.vcf.gz" 5 3 '#' \
--allow-extra-chr --threads $(nproc) --memory "${memXmx_m}"
## Collect father/mother/sex information from the original FAM file
# If set to split _par, represent the X chromosome's pseudo-autosomal region (PARs) as PAR1/PAR2
if [ "~split_par_build_code" == "none" ]; then
plink2 --bfile "~{file_prefix}_temp" --make-bed --out "~{file_prefix}_~{reference}" \
--fam "${bgen_prefix}.fam" --allow-extra-chr --threads $(nproc) --memory "${memXmx_m}"
else
plink2 --bfile "~{file_prefix}_temp" --make-bed --out "~{file_prefix}_~{reference}" \
--split-par "~{split_par_build_code}" \
--fam "${bgen_prefix}.fam" --allow-extra-chr --threads $(nproc) --memory "${memXmx_m}"
fi
rm -rf "~{file_prefix}_temp.*"
## update FAM file with the phenotype column (batch info for UKB Array)
awk 'BEGIN{OFS="\t";FS = "[ \t\n]+"}FNR==NR{a[$2]=$6;next}{if ($2 in a) print $1,$2,$3,$4,$5,a[$2]}' \
"${bgen_prefix}.fam" "~{file_prefix}_~{reference}.fam" > "~{file_prefix}_~{reference}_temp.fam"
mv "~{file_prefix}_~{reference}_temp.fam" "~{file_prefix}_~{reference}.fam"
bcftools sort -o "~{file_prefix}_~{reference}_rejected.vcf.gz" -O z \
--max-mem "${memXmx_g}g" \
"~{file_prefix}_~{reference}_rejected.vcf"
>>>
output {
File lifted_plink_bed = "~{file_prefix}_~{reference}.bed"
File lifted_plink_bim = "~{file_prefix}_~{reference}.bim"
File lifted_plink_fam = "~{file_prefix}_~{reference}.fam"
Array[File] lifted_vcf = glob('*.vcf.gz')
}
runtime {
docker: "quay.io/yihchii/liftover_plink_beds:20220104"
memory: "64 GB"
disks: "local-disk ~{disk_space} SSD"
}
}
task merge_and_split_by_chr {
input {
Array[File] plink_beds
Array[File] plink_bims
Array[File] plink_fams
String split_par_build_code
String reference
Boolean output_autosomal = true
}
Int disk_space = ceil(size(flatten([plink_beds, plink_bims, plink_fams]), "GB") * 3)
File fam_file = select_first(plink_fams)
command <<<
set -x -e -o pipefail
mkdir -p autosomal_dir/
cat "~{write_lines(plink_beds)}" | sed -e 's/.bed//g' > files_to_merge.txt
cat files_to_merge.txt
## Get memory on the machine, use 80% of the memory for each operation
memTotal=$(head -n1 /proc/meminfo|awk '{print $2}')
memXmx_m=$(( ${memTotal}*8/10/1024 ))
## Get one of the fam files as correspondance FAM file for all
## Merge all PLINK files into one
plink --merge-list files_to_merge.txt --make-bed \
--indiv-sort f "~{fam_file}" \
--update-sex "~{fam_file}" 3 \
--out "ukb_~{reference}_merged" \
--allow-extra-chr --threads $(nproc) --memory "${memXmx_m}"
# Fix the FAM file to show also the batch numbers
awk 'BEGIN{OFS="\t";FS = "[ \t\n]+"}FNR==NR{a[$2]=$6;next}{if ($2 in a) print $1,$2,$3,$4,$5,a[$2]}' \
"~{fam_file}" "ukb_~{reference}_merged.fam" > "ukb_~{reference}_merged_temp.fam"
mv "ukb_~{reference}_merged_temp.fam" "ukb_~{reference}_merged.fam"
# Remove input files to save space
set +x
for prefix in $(cat "files_to_merge.txt"); do
rm -rf ${prefix}.*
done
set -x
if [ "~{output_autosomal}" == "true" ]; then
## Get autosomal (chr1-22) PLINK file set
plink2 --bfile "ukb_~{reference}_merged" --chr 1-22 --make-bed \
--out "autosomal_dir/ukb_c1-22_~{reference}_merged" \
--allow-extra-chr --threads $(nproc) --memory "${memXmx_m}"
cp "ukb_~{reference}_merged.fam" "autosomal_dir/ukb_c1-22_~{reference}_merged.fam"
fi
## Split into per-chromosome PLINK file set from the merged PLINK
for i in $(cut -f1 ukb_~{reference}_merged.bim|sort -u|grep -v 'PAR'||true); do
plink2 --bfile "ukb_~{reference}_merged" --chr "${i}" --make-bed \
--out "ukb_c${i}_~{reference}" \
--allow-extra-chr --threads $(nproc) --memory "${memXmx_m}"
cp "ukb_~{reference}_merged.fam" "ukb_c${i}_~{reference}.fam"
done
## Gererate PAR plink file set if they exist
par_num=$(cut -f1 ukb_~{reference}_merged.bim|sort -u|grep 'PAR1\|PAR2'|wc -l||true)
if [ "${par_num}" == "2" ]; then
plink2 --bfile "ukb_~{reference}_merged" --chr PAR1,PAR2 --make-bed \
--out "ukb_cPAR_~{reference}" \
--allow-extra-chr --threads $(nproc) --memory "${memXmx_m}"
cp "ukb_~{reference}_merged.fam" "ukb_cPAR_~{reference}.fam"
fi
## Generate plink files containing unplaced contigs
unplaced_num=$(cut -f1 ukb_~{reference}_merged.bim|sort -u|grep -v 'PAR'|grep -v -e '[0-9]$'|wc -l||true)
if [ "${unplaced_num}" != "0" ]; then
plink2 --bfile "ukb_~{reference}_merged" --not-chr 1-22,X,Y,PAR1,PAR2,MT --make-bed \
--out "ukb_unplaced_~{reference}" \
--allow-extra-chr --threads $(nproc) --memory "${memXmx_m}"
cp "ukb_~{reference}_merged.fam" "ukb_unplaced_~{reference}.fam"
fi
>>>
output {
Array[File] merged_plinks = glob("ukb_~{reference}_merged.*")
Array[File?] autosomal_plinks = glob("autosomal_dir/ukb_c1-22_~{reference}_merged.*")
Array[File?] per_chr_plinks = glob("ukb_c*_~{reference}.*")
Array[File?] unplaced_plinks = glob("ukb_unplaced_~{reference}.*")
}
runtime {
docker: "quay.io/yihchii/liftover_plink_beds:20220104"
cpu: 8
memory: "16 GB"
disks: "local-disk ~{disk_space} SSD"
}
}