-
Notifications
You must be signed in to change notification settings - Fork 1
/
mga2.nf
executable file
·156 lines (120 loc) · 5.95 KB
/
mga2.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/env nextflow
// enable DSL 2 syntax
nextflow.enable.dsl = 2
// include processes and sub-workflow modules
include { add_sample_ids } from "./processes"
include { mga2 } from "./workflow"
// -----------------------------------------------------------------------------
// show settings and/or help
// -----------------------------------------------------------------------------
// Echo the configured parameters when the showSettings parameter is set.
if (params.showSettings) printParameterSummary()

// When the help parameter is set, print the usage text and exit with status 0.
if (params.help) {
    helpMessage()
    exit(0)
}
// -----------------------------------------------------------------------------
// check/derive parameters
// -----------------------------------------------------------------------------
// Directory prefix that the workflow prepends to each fastq entry read from
// the sample sheet. Fall back to an empty string when the parameter is unset
// so that a null value is never interpolated into a path as the text "null".
fastqDir = params.fastqDir ?: ""

// Ensure a non-empty prefix ends with a path separator, since it is joined to
// the fastq name/pattern by plain string concatenation.
if (fastqDir && !fastqDir.endsWith("/")) {
    fastqDir = "${fastqDir}/"
}

// Sampling parameters: the sample size has a lower bound and the number of
// reads sampled from must be at least as large as the sample size.
if (params.sampleSize < 1000) {
    exit 1, "Invalid sample size - set to at least 1000 (100000 recommended)"
}

if (params.maxNumberToSampleFrom < params.sampleSize) {
    exit 1, "Invalid maximum number of sequence reads to sample from - must be at least as large as the sample size"
}

// Batch alignment chunk size.
if (params.chunkSize < 100000) {
    exit 1, "Invalid chunk size for batch alignment - set to at least 100000 (1000000 recommended)"
}

// Trimming parameters: start must be positive, length at least 30.
if (params.trimStart <= 0) {
    exit 1, "Invalid trim start - must be a positive integer value"
}

if (params.trimLength < 30) {
    exit 1, "Invalid trim length - trimmed sequences should be sufficiently long to align to reference genomes, i.e. at least 30"
}
// -----------------------------------------------------------------------------
// workflow
// -----------------------------------------------------------------------------
// Entry workflow: reads the sample sheet, resolves the FASTQ file(s) for each
// row and hands both to the mga2 sub-workflow.
workflow {

    // add sample ids to the sample sheet
    sample_sheet = channel.fromPath(params.sampleSheet, checkIfExists: true)
        | add_sample_ids

    // one map per sample sheet row, keyed by the CSV header names
    samples = sample_sheet.splitCsv(header: true, strip: true, quote: '"')

    // fastq channel expected to contain tuples comprising the sample id and a
    // collection of fastq files for each sample
    fastq = samples.map { row ->
        // Validate the fastq column before it is used to build a path. Doing
        // this here, rather than in a separate subscriber on the same channel,
        // guarantees that a missing column/value is reported with this message
        // instead of racing against the file lookup below.
        assert row.fastq != null && !row.fastq.isEmpty(), "Missing fastq column or values in sample sheet"

        // convert FASTQ file name/pattern to file(s)
        def pattern = "${fastqDir}${row.fastq}"
        def files = file(pattern, checkIfExists: true)

        // check that there were matches for the specified FASTQ file name/pattern
        // NOTE(review): file() may yield a single path rather than a collection
        // for a non-glob name - confirm isEmpty() behaves as intended there.
        assert !files.isEmpty(), "No FASTQ files found for ${row.id} matching pattern ${pattern}"

        tuple("${row.id}", files)
    }

    // core workflow
    mga2(sample_sheet, fastq)
}
// -----------------------------------------------------------------------------
// summary of configuration parameters
// -----------------------------------------------------------------------------
/**
 * Logs the pipeline title followed by the current value of each
 * configuration parameter, framed by a blank line before and after.
 */
def printParameterSummary() {
    log.info("")

    // Content lines are kept flush-left so the logged text is unchanged.
    def summary = """
Multi-Genome Alignment (MGA) Contaminant Screen
===============================================
Sample sheet : ${params.sampleSheet}
FASTQ directory : ${params.fastqDir}
Sample size : ${params.sampleSize}
Maximum sampled from : ${params.maxNumberToSampleFrom}
Chunk size : ${params.chunkSize}
Trim start : ${params.trimStart}
Trim length : ${params.trimLength}
Genomes details file : ${params.genomeDetails}
Bowtie index directory : ${params.bowtieIndexDir}
Adapters FASTA file : ${params.adaptersFasta}
Output directory : ${params.outputDir}
Output prefix : ${params.outputPrefix}
"""
    log.info(summary.stripIndent())

    log.info("")
}
// -----------------------------------------------------------------------------
// help/usage
// -----------------------------------------------------------------------------
/**
 * Logs the usage text: the run command, the supported command line options
 * and an example configuration file, followed by a blank line.
 */
def helpMessage() {
    // Content lines are kept flush-left so the logged text keeps its layout.
    log.info """
Usage:
nextflow run crukci-bioinformatics/mga2
Options:
--help Show this message and exit
--sampleSheet CSV file containing details of sample dataset (id, fastq, species and control columns required)
--fastqDir Directory in which FASTQ files are located (optional, can specify absolute or relative paths in sample sheet instead)
--sampleSize Number of sequences to sample for each sample/dataset
--maxNumberToSampleFrom Maximum number of sequences to read/sample from
--chunkSize Number of sequences for each chunk for batch alignment of sampled sequences
--trimStart The position at which the trimmed sequence starts, all bases before this position are trimmed
--trimLength The length of the trimmed sequences
--genomeDetails CSV file containing the species name and synonyms for each reference genome (genome, species and synonym columns required)
--bowtieIndexDir Directory containing bowtie indexes for reference genomes
--adaptersFasta FASTA file containing adapter sequences
--outputDir Directory to which output files are written
--outputPrefix Prefix for output file names
Alternatively, override settings using a configuration file such as the
following:
params {
sampleSheet = "samplesheet.csv"
sampleSize = 100000
trimStart = 11
trimLength = 36
genomeDetails = "genomes.csv"
bowtieIndexDir = "/path_to/bowtie_indexes"
outputDir = "mga"
outputPrefix = ""
}
and run as follows:
nextflow run crukci-bioinformatics/mga2 -c mga2.config
""".stripIndent()
    log.info ""
}