# config.yaml -- forked from sequana/rnaseq
# ============================================================================
# Config file for RNA-seq
#
# ==================[ Sections for the users ]================================
#
# One of input_directory, input_pattern and input_samples must be provided
# If input_directory provided, use it otherwise if input_pattern provided,
# use it, otherwise use input_samples.
# ============================================================================
# Wrapper release tag (kept quoted so it always loads as a string).
sequana_wrappers: 'v0.15.1'
# Empty in this template; see the header above for how the input_* keys
# are prioritised.
input_directory: null
input_readtag: '_R[12]_'
input_pattern: '*fastq.gz'
# See sequana_pipetools.readthedocs.io for details about these 2 options
# common prefixes are removed. Additional prefixes may be removed here
#extra_prefixes_to_strip: []
# in special cases, sample names can be extracted with a pattern
#sample_pattern: '{sample}.fastq.gz'
# Container images, one download URL per tool. The entries must be nested
# under `apptainers`; left at column 0 they become unrelated top-level keys.
apptainers:
  sequana_tools: "https://zenodo.org/record/7102074/files/sequana_tools_0.14.3.img"
  salmon: "https://zenodo.org/record/5708843/files/salmon_1.3.0.img"
  fastqc: "https://zenodo.org/record/7015004/files/fastqc_0.11.9-py3.img"
  fastp: "https://zenodo.org/record/7319782/files/fastp_0.23.2.img"
  igvtools: "https://zenodo.org/record/7022635/files/igvtools_2.12.0.img"
  graphviz: "https://zenodo.org/record/7928262/files/graphviz_7.0.5.img"
# =========================================== Sections for the users
#############################################################################
# Genome section:
#
# :Parameters:
#
# - aligner: either star or bowtie2.
# - genome_directory: directory where all indexes are written.
# - contaminant_file: path to an existing fasta file for ribosomal RNA (to be found in
# genome_directory)
# - rRNA_feature: if contaminant_file is not provided, ribosomal RNA will be extracted
# from the GFF using this feature name. It must be found.
# General settings. Child keys are nested under `general` so they do not
# collide with same-named keys of other sections.
general:
  aligner: bowtie2
  # The two entries below are intentionally empty (null) in this template.
  genome_directory: null
  contaminant_file: null
  rRNA_feature: rRNA
  custom_gff: ''
#################################################################
# FastQC section
#
# :Parameters:
#
# - options: string with any valid FastQC options
#
fastqc:
  skip_fastqc_raw: true
  # Quoted so the leading dashes can never be mistaken for YAML syntax.
  options: '--nogroup'
  threads: 4
  resources:
    mem: 4G
#######################################################################
# Quality trimming and adapter removal
#
# for cutadapt, please fill the fwd and rev fields if required. It can be a
# string, or a filename. If a filename, it must be prefixed with "file:" to
# specify that it is a file and not a string. If the tool is cutadapt, the empty
# fwd and rev fields means that no adapters are to be used.
#
# :Parameters:
#
# - fwd: a string or file (prefixed with *file:*)
# - m: 20 means discard trimmed reads that are shorter than 20.
# must be > 0
# - quality: 0 means no trimming, 30 means keep base with quality
# above 30
# - mode: must be set to one of
# - g for 5'
# - a for 3'
# - b for both 5'/3'
# - rev: a string or file (prefixed with *file:*)
# - tool: only cutadapt supported for now
# - threads: number of threads to use (atropos only)
# - options: See cutadapt documentation for details on
# cutadapt.readthedocs.io. We change the default value
# of -O to 6 (at least 6 bases are required to match before
# trimming of an adapter)
#
# tool_choice__ = ["atropos", "cutadapt"]
#
# trim-n trims Ns at the end of the read
cutadapt:
  tool_choice: cutadapt
  # Empty strings: no forward/reverse adapter is configured.
  fwd: ''
  rev: ''
  m: 20  # {"strict_min": 0}
  mode: b  # {"values": ["b","g","a"]}
  options: '-O 6 --trim-n'
  quality: 30  # {"range": [0,40]}
  threads: 4
#############################################################################
# -Q should disable the quality filter
#
# Quality filtering only limits the N base number (-n, --n_base_limit)
# meaning if 5 Ns are found, the read is discarded,
# -q is the quality value set to Q15 to be qualified; If more than 40% of bases
# are unqualified, the read is discarded.
# You can also filter reads by average quality score using -e QUAL_score
#
# minimum length is set to 15 by default
#
# Adapter trimming is set by default. Can be disabled with -A
# For adapters, this is automatic but you can be specific using
# --adapter_sequence for read1, and --adapter_sequence_r2 for read2.
# The --cut_tail moves a sliding window from tail (3') to front, drop the bases
# in the window if its mean quality is below cut_mean_quality, stop otherwise.
# Use cut_tail_window_size to set the window size (default 4), and
# cut_tail_mean_quality to set the mean quality threshold (default 20)
# Other useful options: --disable_adapter_trimming and --disable_quality_filtering.
# or -n 5 (minimum number of Ns required to discard a read)
fastp:
  # Surrounding spaces are part of the value and are kept on purpose.
  options: ' --cut_tail '
  minimum_length: 20
  adapters: ''
  quality: 15
  threads: 4
  disable_adapter_trimming: false
  disable_quality_filtering: false
#######################################################
# Quality trimming software choice
#
# software_choice__ = ["atropos", "cutadapt", "fastp"]
#
trimming:
  # Must be one of the values listed in software_choice__ above.
  software_choice: fastp
  do: true
#############################################################################
# bowtie1_mapping_rna used to align reads against ribosomal RNA
#
# :Parameters:
#
# - do: if unchecked, this rule is ignored
# - options: any options recognised by bowtie1 tool
# - threads: number of threads to be used
# - nreads: no need to analyse all data to estimate the ribosomal content.
# analyse 100,000 reads by default. Set to -1 to ignore and analyse all data
bowtie1_mapping_rna:
  # remove in v1.20 and set automatically to on/off if rRNA/fasta provided
  # do: true
  options: ''
  threads: 4
  nreads: 100000
#############################################################################
# star_mapping used to align reads against genome file
#
# :Parameters:
#
# - do: if unchecked, this rule is ignored
# - options: any options recognised by rna-star tool. Set limitBAMsortRAM to 30G
# - threads: number of threads to be used
# - legacy: if set to True will use the old 2-pass version from STAR
# used in this pipeline until v0.15.3. If you want to use the
# 2-pass mode available in star, you will need star 2.7 and above
#
star_mapping:
  # Leading/trailing spaces are part of the value and are kept as-is.
  options: " --limitBAMsortRAM 30000000000 --outFilterMismatchNoverLmax 0.05 --seedSearchStartLmax 20 "
  # Canonical lowercase boolean (was `True`).
  legacy: true
  threads: 4
  resources:
    mem: 32G
##############################################################################
# STAR indexing section
#
# :Parameters:
#
# - options: string with any valid STAR options
star_index:
  # Explicit empty string (was a bare, null value): the section documents
  # `options` as a string, and sibling sections already use ''.
  options: ''
  threads: 4
  resources:
    mem: 4G
#############################################################################
# bowtie1_mapping_ref used to align reads against genome file
#
# :Parameters:
#
# - do: if unchecked, this rule is ignored
# - options: any options recognised by bowtie1 tool
# - threads: number of threads to be used
#
bowtie1_mapping_ref:
  options: '--chunkmbs 400 -m 1'
  threads: 4
#############################################################################
# bowtie2_mapping used to align reads against genome file
#
# :Parameters:
#
# - do: if unchecked, this rule is ignored
# - options: any options recognised by bowtie2 tool
# - threads: number of threads to be used
#
bowtie2_mapping:
  # options: "--dovetail --no-mixed --no-discordant " for paired-end data
  options: ''
  threads: 4
  genome_size_larger_than_4gb: false
  resources:
    mem: 20G
# Settings for building the bowtie2 index.
bowtie2_index:
  options: ''
  threads: 4
  resources:
    mem: 20G
# Settings for building the salmon index.
salmon_index:
  threads: 2
  # Explicit empty string (was a bare, null value), matching the other
  # `options` keys in this file.
  options: ''
  resources:
    mem: 4G
# Settings for salmon quantification.
salmon_mapping:
  options: '-l A'
  threads: 4
  resources:
    mem: 4G
#############################################################################
# feature_counts used to count reads against features
#
# :Parameters:
#
# - do: if unchecked, this rule is ignored
# - options: any options recognised by feature_counts tool except -s
# - threads: number of threads to be used
# - strandness: (optional) you should provide the strand parameters, given
# from the experimental design. If not provided, we will guess it (see
# tolerance parameter here below)
# - tolerance: if strandness is not provided, we will guess it from
# the data. The metric used is between 0 and 1. It is a ratio between
# strand + and -. If below tolerance, the strand is reversely stranded. If
# above 1-tolerance, it is (forward) stranded. If around 0.5 +- tolerance,
# it is unstranded. Otherwise, it means our guess would not be very
# reliable. A warning will be raised. Note also that if there is no
# consensus across samples, a warning/error may also be raised. tolerance
# is therefore in the range [0-0.25]
# - feature: this is equivalent to the -t option to specify the feature type in GTF
# annotation. For example gene, exon (default).
# - attribute: this is the -g option to specify the attribute type in GTF annotation.
# (gene_id) by default.
# - extra_attributes: any other
#
feature_counts:
  do: true
  options: ''  # if exon/CDS is used, put -O option
  strandness: ''  # set to 0, 1, 2 to force the type of strandness
  threads: 1
  tolerance: 0.15  # used to figure out the strandness. no need to change
  feature: gene  # could be exon, mRNA, etc
  attribute: ID  # could be ID, gene_id, etc
  # by default, stores only the main attribute, but could add more
  extra_attributes: null
#############################################################################
# bamCoverage write file in bigwig format from BAM files.
# This tool takes an alignment of reads or fragments as input (BAM file) and
# generates a coverage track (bigWig or bedGraph) as output. The coverage is
# calculated as the number of reads per bin, where bins are short consecutive
# counting windows of a defined size. It is possible to extend the length of
# the reads to better reflect the actual fragment length. *bamCoverage* offers
# normalization by scaling factor, Reads Per Kilobase per Million mapped reads
# (RPKM), and 1x depth (reads per genome coverage, RPGC).
#
# :Parameters:
#
# - do: if unchecked, this rule is ignored
# - binSize: Size of the bins, in bases, for the output of the
# bigwig/bedgraph file. (default: 50)
# - genomeSize: Report read coverage normalized to 1x sequencing depth
# (also known as Reads Per Genomic Content (RPGC)).
# Sequencing depth is defined as: (total number of
# mapped reads * fragment length) / effective genome
# size. The scaling factor used is the inverse of the
# sequencing depth computed for the sample to match the
# 1x coverage. To use this option, the effective genome
# size has to be indicated after the option. The
# effective genome size is the portion of the genome
# that is mappable.
# - extendReads: This parameter allows the extension of reads to
# fragment size.
# - minFragmentLength: The minimum fragment length needed for read/pair
# inclusion. Note that a value other than 0 will exclude
# all single-end reads.
# - maxFragmentLength: The maximum fragment length needed for read/pair
# inclusion. A value of 0 disables filtering and is
# needed for including single-end and orphan reads.
# - threads: number of threads to be used
bam_coverage:
  do: false
  # Explicit empty string (was a bare value). The parameters below are
  # siblings of `options`, as listed in the section's parameter description.
  options: ''
  binSize: 10
  # NOTE(review): this value was tagged "mm10", but 2150570000 is the
  # effective genome size older deepTools documentation gives for mm9 --
  # confirm which assembly is intended before enabling this rule.
  genomeSize: 2150570000
  extendReads: 65
  minFragmentLength: 0  # a value other than 0 will exclude all single-end reads
  maxFragmentLength: 0  # 0 disables filtering; needed for single-end and orphan reads
  threads: 4
  resources:
    mem: 20G
###########################################################################
# Creates TDF files using igvtools
#
# :Parameters:
#
# - chrom_sizes_file: path to the index of the reference genome obtained with samtools faidx
igvtools:
  do: false
  # can be a link to the fasta file or an existing chrom.sizes file
  # If none provided, will use the input fasta file
  chrom_sizes_file: ''
#############################################################################
# mark_duplicates (picard-tools) allows to mark PCR duplicate in BAM files
#
# :Parameters:
#
# - do: if unchecked, this rule is ignored. Mandatory for RNA-SeQC tool.
# - remove: If true do not write duplicates to the output file instead of writing them with
# appropriate flags set. Default value: false. This option can be set to 'null' to clear
# the default value. Possible values: {true, false}
# - tmpdir: write temporary files in this directory (default TMP_DIR=/tmp/, but could be "TMP_DIR=/local/scratch/")
#
mark_duplicates:
  do: false
  remove: false  # may be true
  tmpdir: ./tmp/
  threads: 4
  resources:
    mem: 34G
add_read_group:
  # Explicit empty string (was a bare, null value), matching the other
  # `options` keys in this file.
  options: ''
#############################################################################
# RNA-SeQC allows to compute a series of quality control metrics for RNA-seq data
#
# :Parameters:
#
# - do: if unchecked, this rule is ignored
# - ref: Reference Genome in fasta format
# - gtf: GTF File defining transcripts (must end in '.gtf')
# You can use the 'sequana gff-to-gtf input.gff' command
# - options: any options recognised by RNA-seQC tool
rnaseqc:
  do: false
  # Empty (null) in this template.
  gtf_file: null
  options: '--coverage'
# if bed_file not provided, try to create one on the fly
# needs mark_duplicates
rseqc:
  do: false
  # Empty (null) in this template.
  bed_file: null
#############################################################################
# MultiQC aggregates results from bioinformatics analyses across many
# samples into a single report.
#
# :Parameters:
#
# - options: any options recognised by multiqc
# - output-directory: Create report in the specified output directory
# - config_file: by default, we use sequana RNA-seq multiqc_config file.
# If you want your own multiqc, fill this entry
multiqc:
  # Quoted: the value contains `*` glob characters and leading dashes.
  options: '-p -f -x *_init_*'
  modules: ''
  input_directory: '.'
  config_file: multiqc_config.yaml