This repository has been archived by the owner on Jan 25, 2020. It is now read-only.
/
Alignment.wdl
165 lines (149 loc) · 5.95 KB
/
Alignment.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
version 1.0
## Copyright Broad Institute, 2018
##
## This WDL defines tasks used for alignment of human whole-genome or exome sequencing data.
##
## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation.
## For program versions, see docker containers.
##
## LICENSING :
## This script is released under the WDL source code license (BSD-3) (see LICENSE in
## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
## be subject to different licenses. Users are responsible for checking that they are
## authorized to run all programs before running this script. Please see the docker
## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed
## licensing information pertaining to the included programs.
# Local Import
#import "../structs/GermlineStructs.wdl"
# Git URL Import
import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/structs/GermlineStructs.wdl"
# Get version of BWA
task GetBwaVersion {
command {
# not setting set -o pipefail here because /bwa has a rc=1 and we dont want to allow rc=1 to succeed because
# the sed may also fail with that error and that is something we actually want to fail on.
/usr/gitc/bwa 2>&1 | \
grep -e '^Version' | \
sed 's/Version: //'
}
runtime {
docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856"
memory: "1 GiB"
}
output {
String bwa_version = read_string(stdout())
}
}
# Read unmapped BAM, convert on-the-fly to FASTQ and stream to BWA MEM for alignment, then stream to MergeBamAlignment
task SamToFastqAndBwaMemAndMba {
input {
File input_bam
String bwa_commandline
String bwa_version
String output_bam_basename
# reference_fasta.ref_alt is the .alt file from bwa-kit
# (https://github.com/lh3/bwa/tree/master/bwakit),
# listing the reference contigs that are "alternative".
ReferenceFasta reference_fasta
Int compression_level
Int preemptible_tries
}
Float unmapped_bam_size = size(input_bam, "GiB")
Float ref_size = size(reference_fasta.ref_fasta, "GiB") + size(reference_fasta.ref_fasta_index, "GiB") + size(reference_fasta.ref_dict, "GiB")
Float bwa_ref_size = ref_size + size(reference_fasta.ref_alt, "GiB") + size(reference_fasta.ref_amb, "GiB") + size(reference_fasta.ref_ann, "GiB") + size(reference_fasta.ref_bwt, "GiB") + size(reference_fasta.ref_pac, "GiB") + size(reference_fasta.ref_sa, "GiB")
# Sometimes the output is larger than the input, or a task can spill to disk.
# In these cases we need to account for the input (1) and the output (1.5) or the input(1), the output(1), and spillage (.5).
Float disk_multiplier = 2.5
Int disk_size = ceil(unmapped_bam_size + bwa_ref_size + (disk_multiplier * unmapped_bam_size) + 20)
command <<<
set -o pipefail
set -e
# set the bash variable needed for the command-line
bash_ref_fasta=~{reference_fasta.ref_fasta}
# if reference_fasta.ref_alt has data in it,
if [ -s ~{reference_fasta.ref_alt} ]; then
java -Xms1000m -Xmx1000m -jar /usr/gitc/picard.jar \
SamToFastq \
INPUT=~{input_bam} \
FASTQ=/dev/stdout \
INTERLEAVE=true \
NON_PF=true | \
/usr/gitc/~{bwa_commandline} /dev/stdin - 2> >(tee ~{output_bam_basename}.bwa.stderr.log >&2) | \
java -Dsamjdk.compression_level=~{compression_level} -Xms1000m -Xmx1000m -jar /usr/gitc/picard.jar \
MergeBamAlignment \
VALIDATION_STRINGENCY=SILENT \
EXPECTED_ORIENTATIONS=FR \
ATTRIBUTES_TO_RETAIN=X0 \
ATTRIBUTES_TO_REMOVE=NM \
ATTRIBUTES_TO_REMOVE=MD \
ALIGNED_BAM=/dev/stdin \
UNMAPPED_BAM=~{input_bam} \
OUTPUT=~{output_bam_basename}.bam \
REFERENCE_SEQUENCE=~{reference_fasta.ref_fasta} \
PAIRED_RUN=true \
SORT_ORDER="unsorted" \
IS_BISULFITE_SEQUENCE=false \
ALIGNED_READS_ONLY=false \
CLIP_ADAPTERS=false \
MAX_RECORDS_IN_RAM=2000000 \
ADD_MATE_CIGAR=true \
MAX_INSERTIONS_OR_DELETIONS=-1 \
PRIMARY_ALIGNMENT_STRATEGY=MostDistant \
PROGRAM_RECORD_ID="bwamem" \
PROGRAM_GROUP_VERSION="~{bwa_version}" \
PROGRAM_GROUP_COMMAND_LINE="~{bwa_commandline}" \
PROGRAM_GROUP_NAME="bwamem" \
UNMAPPED_READ_STRATEGY=COPY_TO_TAG \
ALIGNER_PROPER_PAIR_FLAGS=true \
UNMAP_CONTAMINANT_READS=true \
ADD_PG_TAG_TO_READS=false
grep -m1 "read .* ALT contigs" ~{output_bam_basename}.bwa.stderr.log | \
grep -v "read 0 ALT contigs"
# else reference_fasta.ref_alt is empty or could not be found
else
exit 1;
fi
>>>
runtime {
docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856"
preemptible: preemptible_tries
memory: "14 GiB"
cpu: "16"
disks: "local-disk " + disk_size + " HDD"
}
output {
File output_bam = "~{output_bam_basename}.bam"
File bwa_stderr_log = "~{output_bam_basename}.bwa.stderr.log"
}
}
task SamSplitter {
input {
File input_bam
Int n_reads
Int preemptible_tries
Int compression_level
}
Float unmapped_bam_size = size(input_bam, "GiB")
# Since the output bams are less compressed than the input bam we need a disk multiplier that's larger than 2.
Float disk_multiplier = 2.5
Int disk_size = ceil(disk_multiplier * unmapped_bam_size + 20)
command {
set -e
mkdir output_dir
total_reads=$(samtools view -c ~{input_bam})
java -Dsamjdk.compression_level=~{compression_level} -Xms3000m -jar /usr/gitc/picard.jar SplitSamByNumberOfReads \
INPUT=~{input_bam} \
OUTPUT=output_dir \
SPLIT_TO_N_READS=~{n_reads} \
TOTAL_READS_IN_INPUT=$total_reads
}
output {
Array[File] split_bams = glob("output_dir/*.bam")
}
runtime {
docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856"
preemptible: preemptible_tries
memory: "3.75 GiB"
disks: "local-disk " + disk_size + " HDD"
}
}