-
Notifications
You must be signed in to change notification settings - Fork 34
/
subread.yml
199 lines (180 loc) · 5.89 KB
/
subread.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# ================
# Aligner - Subread
# ================
---
- slug: alignment-subread
name: Subread
requirements:
expression-engine: jinja
executor:
docker:
image: resolwebio/legacy:latest
data_name: "Alignment ({{ reads.fastq.0.file|basename|default('?') }})"
version: 0.0.7
type: data:alignment:bam:subread
category: analyses:alignment
flow_collection: sample
persistence: CACHED
description: >
An accurate and efficient aligner for mapping both genomic DNA-seq reads and RNA-seq reads (for the purpose of expression analysis).
input:
- name: genome
label: Reference genome
type: data:genome:fasta
- name: reads
label: Reads
type: data:reads:fastq
- name: options
label: Options
group:
- name: indel
label: Number of INDEL bases
type: basic:integer
required: false
default: 5
description: >
Specify the number of INDEL bases allowed in the mapping.
- name: consensus
label: Consensus threshold
type: basic:integer
required: false
default: 3
description: >
Specify the consensus threshold, which is the minimal number of consensus subreads required for reporting a hit.
- name: mis_matched_bp
label: Max number of mis-matched bases
type: basic:integer
required: false
default: 3
description: >
Specify the maximum number of mis-matched bases allowed in the alignment.
- name: cpu_number
label: Number of threads/CPUs
type: basic:integer
required: false
default: 1
description: >
Specify the number of threads/CPUs used for mapping
- name: unique_reads
label: Output uniquely mapped reads
type: basic:boolean
required: false
description: >
Reads that were found to have more than one best mapping location will not be reported.
- name: PE_options
label: Paired end alignment options
group:
- name: reads_orientation
label: reads orientation
type: basic:string
required: false
description: >
Specify the orientation of the two reads from the same pair.
default: "fr"
choices:
- label: ff
value: ff
- label: fr
value: fr
- label: rf
value: rf
- name: consensus_subreads
label: Minimum number of consensus subreads
type: basic:integer
required: false
default: 1
description: >
Specify the minimum number of consensus subreads both reads from the sam pair must have.
output:
- name: bam
label: Alignment file
type: basic:file
description: Position sorted alignment
- name: bai
label: Index BAI
type: basic:file
- name: unmapped
label: Unmapped reads
type: basic:file
required: false
- name: stats
label: Statistics
type: basic:file
run:
runtime: polyglot
language: bash
program: |
GENOME_NAME=`basename {{ genome.fasta.file }} .fasta`
INDEX={{genome.index_subread.dir}}"/${GENOME_NAME}_index"
if [ -d {{ genome.index_subread.dir }} ]; then
echo "Genome index found."
else
re-error "Index not found, provide index file."
fi
re-progress 0.2
FW_READS=()
RW_READS=()
{% for r in reads.fastq %}
FW_READS+=({{ r.file }})
{% endfor %}
cat "${FW_READS[@]}" > fw_reads.fastq.gz
{% if reads|type|subtype('data:reads:fastq:paired:') %}
{% for r in reads.fastq2 %}
READS_NAME=`basename {{ r.file }} .fastq.gz`
RW_READS+=({{ r.file }})
{% endfor %}
cat "${RW_READS[@]}" > rw_reads.fastq.gz
{% endif %}
NAME=`basename {{ reads.fastq.0.file }} .fastq.gz`
re-progress 0.3
UNIQUE_READS="{% if options.unique_reads %}-u {% endif %}"
echo "Align with subread:"
{% if reads|type|subtype('data:reads:fastq:single:') %}
subread-align \
-i "${INDEX}" \
-o "${NAME}_align_unsorted.bam" \
-r "fw_reads.fastq.gz" \
-t 0 \
-I {{ options.indel }} \
-m {{ options.consensus }} \
-M {{ options.mis_matched_bp }} \
-T {{ options.cpu_number }} \
${UNIQUE_READS}
{% else %}
subread-align \
-i "${INDEX}" \
-o "${NAME}_align_unsorted.bam" \
-r "fw_reads.fastq.gz" \
-R "rw_reads.fastq.gz" \
-t 0 \
-I {{ options.indel }} \
-m {{ options.consensus }} \
-M {{ options.mis_matched_bp }} \
-T {{ options.cpu_number }} \
${UNIQUE_READS} \
-S {{ PE_options.reads_orientation }} \
-S {{ PE_options.reads_orientation }}
{% endif %}
re-checkrc
re-progress 0.5
echo "Sorting BAM file by chromosomal coordinates:"
samtools sort -o "${NAME}.bam" "${NAME}_align_unsorted.bam"
re-checkrc
re-progress 0.6
echo -e "\nINDEXING bam:"
samtools index "${NAME}.bam" "${NAME}.bam.bai"
re-checkrc
echo "Calculating statistics"
samtools flagstat "${NAME}.bam" > "${NAME}_report.txt"
re-checkrc
re-progress 0.75
echo "Creating unmapped fastq file"
samtools view -u -f 4 -F 264 "${NAME}.bam" > "${NAME}_unmapped.bam"
bamToFastq -i "${NAME}_unmapped.bam" -fq "${NAME}_unmapped.fastq"
gzip -c "${NAME}_unmapped.fastq" > "${NAME}_unmapped.fastq.gz"
re-save-file bam "${NAME}.bam"
re-save-file bai "${NAME}.bam.bai"
re-save-file stats "${NAME}_report.txt"
if [ -f "${NAME}_unmapped.fastq.gz" ]; then
re-save-file unmapped "${NAME}_unmapped.fastq.gz"
fi