Initial situation: fastq files are already demultiplexed; there is a pair of R1 and R2 files for each sample.
To adapt it to another dataset, you need to change the primer sequences in the initial block of variables, and, if your raw fastq files follow another naming rule, the raw fastq file search pattern and the sample file naming (in the final while loop):
#!/bin/bash
cd "${PWD}"
export LC_ALL=C
## ------------------------------------------------------------ define variables
declare -r PRIMER_F="${1}"
declare -r PRIMER_R="${2}"
declare -ri THREADS="${3:-4}"
declare -r FASTQ_NAME_PATTERN="*_R1_001.fastq.gz" # glob passed to find (note the leading wildcard)
declare -r CUTADAPT_OPTIONS="--minimum-length 32 --cores=${THREADS} --discard-untrimmed"
declare -r CUTADAPT="$(which cutadapt) ${CUTADAPT_OPTIONS}" # cutadapt 4.1 or more recent
declare -r SWARM="$(which swarm)" # swarm 3.0 or more recent
declare -r VSEARCH="$(which vsearch) --quiet" # vsearch 2.21.1 or more recent
declare -ri ENCODING=33
declare -r MIN_F=$(( ${#PRIMER_F} * 2 / 3 )) # match is >= 2/3 of primer length
declare -r MIN_R=$(( ${#PRIMER_R} * 2 / 3 ))
declare -r FIFOS=$(echo fifo_{merged,trimmed}_fastq fifo_filtered_fasta{,_bis})
declare -i TICKER=0
## ------------------------------------------------------------------- functions
revcomp() {
# reverse-complement a DNA/RNA IUPAC string
[[ -z "${1}" ]] && { echo "error: empty string" ; exit 1 ; }
local -r nucleotides="acgturykmbdhvswACGTURYKMBDHVSW"
local -r complements="tgcaayrmkvhdbswTGCAAYRMKVHDBSW"
tr "${nucleotides}" "${complements}" <<< "${1}" | rev
}
merge_fastq_pair() {
${VSEARCH} \
--threads "${THREADS}" \
--fastq_mergepairs "${FORWARD}" \
--reverse "${REVERSE}" \
--fastq_ascii "${ENCODING}" \
--fastq_allowmergestagger \
--fastqout fifo_merged_fastq 2> "${SAMPLE}.log" &
}
trim_primers() {
# search forward primer in both normal and revcomp: now all reads
# are in the same orientation. Matching leftmost is the default.
${CUTADAPT} \
--revcomp \
--front "${PRIMER_F};rightmost" \
--overlap "${MIN_F}" fifo_merged_fastq 2>> "${SAMPLE}.log" | \
${CUTADAPT} \
--adapter "${ANTI_PRIMER_R}" \
--overlap "${MIN_R}" \
--max-n 0 - > fifo_trimmed_fastq 2>> "${SAMPLE}.log" &
}
convert_fastq_to_fasta() {
# use SHA1 values as sequence names,
# compute expected error values (ee)
${VSEARCH} \
--fastq_filter fifo_trimmed_fastq \
--relabel_sha1 \
--fastq_ascii "${ENCODING}" \
--eeout \
--fasta_width 0 \
--fastaout - 2>> "${SAMPLE}.log" | \
tee fifo_filtered_fasta_bis > fifo_filtered_fasta &
}
extract_expected_error_values() {
# extract ee for future quality filtering (keep the lowest
# observed expected error value for each unique sequence)
local -ri length_of_sequence_IDs=40
paste - - < fifo_filtered_fasta_bis | \
awk 'BEGIN {FS = "[>;=\t]"} {print $2, $4, length($NF)}' | \
sort --key=3,3n --key=1,1d --key=2,2n | \
uniq --check-chars=${length_of_sequence_IDs} > "${SAMPLE}.qual" &
}
dereplicate_fasta() {
# dereplicate and discard expected error values (ee)
${VSEARCH} \
--derep_fulllength fifo_filtered_fasta \
--sizeout \
--fasta_width 0 \
--xee \
--output "${SAMPLE}.fas" 2>> "${SAMPLE}.log"
}
list_local_clusters() {
# retain only clusters with more than 2 reads
# (do not use the fastidious option here)
${SWARM} \
--threads "${THREADS}" \
--differences 1 \
--usearch-abundance \
--log /dev/null \
--output-file /dev/null \
--statistics-file - \
"${SAMPLE}.fas" | \
awk 'BEGIN {FS = OFS = "\t"} $2 > 2' > "${SAMPLE}.stats"
}
## ------------------------------------------------------------------------ main
declare -r ANTI_PRIMER_R="$(revcomp "${PRIMER_R}")"
# from raw fastq files to ready-to-use sample files
find . -name "${FASTQ_NAME_PATTERN}" -type f -print0 | \
while IFS= read -r -d '' FORWARD ; do
TICKER=$(( $TICKER + 1 ))
echo -e "${TICKER}\t${FORWARD}"
REVERSE="${FORWARD/_R1_/_R2_}" # adapt to fastq name patterns
SAMPLE="${FORWARD/_L001_R1_*/}"
# clean (remove older files, if any)
rm --force "${SAMPLE}".{fas,qual,log,stats} ${FIFOS}
mkfifo ${FIFOS}
merge_fastq_pair
trim_primers
convert_fastq_to_fasta
extract_expected_error_values
dereplicate_fasta
list_local_clusters
# make sure fifos are done and not reused
wait && rm ${FIFOS}
unset FORWARD REVERSE SAMPLE
done
exit 0
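The primer sequences and the number of threads are passed as positional arguments. Assuming the script is saved as pipeline.sh (the file name is arbitrary), a call could look like this, with the placeholders standing in for your actual primer sequences:

bash ./pipeline.sh "FORWARD_PRIMER_SEQUENCE" "REVERSE_PRIMER_SEQUENCE" 4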
The code above uses named pipes (fifos) to avoid writing intermediate results to mass storage. The goal is to speed up processing, and to make the code more modular and clearer. On the other hand, fifos are tricky to use, as you must remember to launch producers and consumers in the background before running the last consumer.
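As a minimal illustration of that pattern (separate from the pipeline, with a hypothetical fifo name and input file), the producer writes to the fifo in the background and only the last consumer runs in the foreground:

mkfifo my_fifo
gzip --decompress --stdout reads.fastq.gz > my_fifo &  # producer (background)
wc --lines < my_fifo                                   # last consumer (foreground)
wait && rm my_fifo                                     # clean up, as in the loop above

Each stage blocks on its fifo until the other end is opened, so every stage except the final one must run in the background; a foreground stage whose counterpart has not been launched yet would make the script hang.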
Under certain very rare and elusive multithreading conditions,
vsearch --fastq_mergepairs
can hang, interrupting the data flow and
the pipeline. Until that bug can be reproduced and fixed, be cautious.
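Until then, one possible (and untested) precaution, assuming GNU coreutils timeout is available, is to put an upper bound on the merging step; the 10-minute limit below is an arbitrary choice:

merge_fastq_pair() {
    # abort the merger if it runs for more than 10 minutes (arbitrary limit)
    timeout 10m ${VSEARCH} \
        --threads "${THREADS}" \
        --fastq_mergepairs "${FORWARD}" \
        --reverse "${REVERSE}" \
        --fastq_ascii "${ENCODING}" \
        --fastq_allowmergestagger \
        --fastqout fifo_merged_fastq 2> "${SAMPLE}.log" &
}

If the merger is killed, the corresponding sample is incomplete and should be re-run.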
Now that sequence lengths are header attributes, extract_expected_error_values() can be simplified.
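Assuming a vsearch version recent enough to offer the --lengthout option, adding --lengthout to the --fastq_filter command in convert_fastq_to_fasta() writes the sequence length into each header, so the paste step and the length() computation are no longer needed. A possible sketch (annotations are looked up by name, so their order in the header does not matter):

extract_expected_error_values() {
    # read only the headers; take ee= and length= from the annotations
    local -ri length_of_sequence_IDs=40
    grep "^>" fifo_filtered_fasta_bis | \
        awk 'BEGIN {FS = "[>;=]"}
             {for (i = 1; i <= NF; i++) {
                  if ($i == "ee") {ee = $(i + 1)}
                  if ($i == "length") {len = $(i + 1)}
              }
              print $2, ee, len}' | \
        sort --key=3,3n --key=1,1d --key=2,2n | \
        uniq --check-chars=${length_of_sequence_IDs} > "${SAMPLE}.qual" &
}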
The goal is to eliminate the need for manual edits. Observed patterns are: _L001_R1_001.fastq, _L001_R1_002.fastq, _L001_R1.fastq, _R1.fastq, _n_1.fastq (with n a value ranging from 1 to 9), _1.fastq, .1.fastq, and forward.fastq (and reverse.fastq), compressed (.gz, .bz2) or not. The most common is _L001_R1_001.fastq, as produced by Illumina MiSeq single-lane sequencers.
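To handle those patterns without manual edits, one option is a small helper that derives the reverse file and the sample name from the forward file. The sketch below is only a starting point: the function name is made up, the pattern list is not exhaustive, and the forward.fastq/reverse.fastq case would need the parent directory as the sample name.

guess_reverse_and_sample() {
    # hypothetical helper: set REVERSE and SAMPLE from a forward fastq path
    local -r forward="${1}"
    case "${forward}" in
        *_R1*)      REVERSE="${forward/_R1/_R2}" ;;
        *forward*)  REVERSE="${forward/forward/reverse}" ;;
        *_1.fastq*) REVERSE="${forward/_1.fastq/_2.fastq}" ;;
        *.1.fastq*) REVERSE="${forward/.1.fastq/.2.fastq}" ;;
        *) echo "unknown naming pattern: ${forward}" >&2 ; return 1 ;;
    esac
    # strip the read-identifier suffix to obtain the sample name
    SAMPLE="$(basename "${forward}")"
    SAMPLE="${SAMPLE%%[._]*R1*}"
    SAMPLE="${SAMPLE%%[._]1.fastq*}"
}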