In [1]:
# ## A03_trimming
# ## overall commands

# qsub Scripts/A03a_trimming_fastp.sub # †
# qsub Scripts/A03b_check_trimmed.sub # ‡
# qsub Scripts/A03c_fastqc_trimmed.sub


# # * = job array based on "platenum"
# # † = job array based on "batchnum" (two rows at a time)
# # ‡ = fast enough to run interactively

## (A03a) fastp

In [2]:
%%bash
cat > ../Scripts/A03a_adapter_sequences.fa
>P5_pre
AATGATACGGCGACCACCGAGATCTACAC
>P5_post
ACTCTTTCCCTACACGACGCTCT
>P5_pre_rc
GTGTAGATCTCGGTGGTCGCCGTATCATT
>P5_post_rc
AGAGCGTCGTGTAGGGAAAGAGT
>P7_pre
CAAGCAGAAGACGGCATACGAGAT
>P7_post
ACTGGAGTTCAGACGTGTGCTCTT
>P7_pre_rc
ATCTCGTATGCCGTCTTCTGCTTG
>P7_post_rc
AAGAGCACACGTCTGAACTCCAGT
>TruSeq_universal
AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT
>TruSeq_universal_rc
AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT
>IlmnBisulfite_fwd
AGATCGGAAGAGCACACGTCTGAAC
>IlmnBisulfite_rev
AGATCGGAAGAGCGTCGTGTAGGGA
>IlmnBisulfite_fwd_rc
GTTCAGACGTGTGCTCTTCCGATCT
>IlmnBisulfite_rev_rc
TCCCTACACGACGCTCTTCCGATCT
>SmartSeq_fwd
AAGCAGTGGTATCAACGCAGAGT
>SmartSeq_rev
ACTCTGCGTTGATACCACTGCTT
>SmartSeq_fwd_rc
ACTCTGCGTTGATACCACTGCTT
>SmartSeq_rev_rc
AAGCAGTGGTATCAACGCAGAGT

In [3]:
%%bash
cat > ../Scripts/A03a_trimming_fastp.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A03a_trim.$JOB_ID.$TASK_ID
#$ -j y
#$ -l h_rt=8:00:00,h_data=8G
#$ -N A03a_trim
#$ -t 1-512
#$ -hold_jid A02a_demultip
# Grid Engine directives above: run from the submission directory (-cwd), write
# one merged stdout/stderr log per array task (-o, -j y), submit as an array
# job with task IDs 1-512 (-t) and hold until the job named A02a_demultip has
# finished (-hold_jid). Each task's SGE_TASK_ID is matched against the
# "batchnum" column of the well metadata further down in this script.



# log where and when this task started
echo "Job $JOB_ID.$SGE_TASK_ID started on:   " `hostname -s`
echo "Job $JOB_ID.$SGE_TASK_ID started on:   " `date `
echo " "





# environment init -------------------------------------------------------------
# NOTE(review): lines tagged '# <--' look like the site-/project-specific
# settings a user is expected to adapt -- confirm the convention.

. /u/local/Modules/default/init/modules.sh # <--
module load anaconda3 # <--
conda activate snmCTseq # <--

# export every non-comment line of snmCT_parameters.env into the environment.
# The lines are joined by xargs and word-split by the shell, so values must not
# contain whitespace. Presumably this is where metadat_well is defined -- confirm.
export $(cat snmCT_parameters.env | grep -v '^#' | xargs) # <--

# when false, wells whose fastp outputs all exist already are skipped below
overwrite_existing=false # <-- for help with incomplete jobs



# extract target filepaths -----------------------------------------------------

# helper functions
# query_metadat COLNAME
#   Print one column of the well-level metadata CSV ("${metadat_well}"),
#   selected by its header name, one value per line.
#   The header itself is printed first, so when the output is captured into an
#   array, index 0 holds the column name and index N the value of data row N.
#   Returns non-zero (message on stderr, nothing on stdout) when COLNAME is not
#   a header of the file; without this check awk would print whole CSV lines.
query_metadat () {
    awk -F',' -v targetcol="$1" '
        NR == 1 {
            # locate the requested column in the header row
            for (i = 1; i <= NF; i++) {
                if ($i == targetcol) { assayout = i; break }
            }
            if (!assayout) {
                printf "query_metadat: column \"%s\" not found in %s\n", \
                    targetcol, FILENAME > "/dev/stderr"
                exit 1
            }
        }
        { print $assayout }
    ' "${metadat_well}"
}

# extract target wells, print values for log

# Every array filled from query_metadat keeps the CSV header at index 0, so the
# data rows occupy indices 1 .. nwells-1 (nwells counts the header element too).
batchnum=($(query_metadat "batchnum"))
nwells=${#batchnum[@]}

# indices of the metadata rows whose batchnum equals this array task's ID
target_well_rows=()
for ((row=1; row<nwells; row++))
do
    if [[ "${batchnum[$row]}" == "${SGE_TASK_ID}" ]]
    then
        target_well_rows+=("$row")
    fi
done

# filepaths associated with target rows in well-level metadata
# (same indexing as batchnum: element [row] belongs to metadata row 'row')

wellprefix=($(query_metadat "wellprefix"))

# demultiplexed input read pairs
r1in=($(query_metadat "A02a_fqgz_demultip_R1"))
r2in=($(query_metadat "A02a_fqgz_demultip_R2"))

# fastp paired outputs
r1paired=($(query_metadat "A03a_fqgz_paired_R1"))
r2paired=($(query_metadat "A03a_fqgz_paired_R2"))

# fastp unpaired ("singletrim") outputs
r1singletrim=($(query_metadat "A03a_fqgz_singletrim_R1"))
r2singletrim=($(query_metadat "A03a_fqgz_singletrim_R2"))

# fastp json report
jsonout=($(query_metadat "A03a_json_fastp"))

  
  
# run fastp for trimming -----------------------------------------------------

echo "batch number: ${SGE_TASK_ID}"
echo "processing the following rows in well metadata file (${metadat_well}):"

for row in "${target_well_rows[@]}"
do
    echo -e "$row\t${wellprefix[$row]}"
done

# Many array tasks reach this point at the same time. 'mkdir -p' succeeds
# whether or not the directory exists, so there is no test-then-create race.
mkdir -p fastq_trimmed

# Trim each well in the batch. Some wells may have no demultiplexed input;
# fastp is run one well at a time, so such a well only affects its own
# iteration and the loop carries on with the next one.
#
# fastp options used below (see the fastp usage documentation):
#   -i / -I            R1 / R2 input
#   -o / -O            R1 / R2 output for reads that pass as a pair
#   --unpaired1 / 2    output for reads whose mate was filtered out
#   -h / -j / -R       html report, json report, report title
#   -f / -t, -F / -T   bases trimmed from front / tail of R1, of R2
#   -l                 minimum read length kept
#   --cut_right, -q, -u, -y, -Y, -x
#                      sliding-window quality cut, quality threshold, allowed
#                      percentage of unqualified bases, low-complexity filter
#                      and its threshold, polyX trimming
for row in "${target_well_rows[@]}"
do

    echo -e "\n\ntrimming '${wellprefix[$row]}'...\n\n"

    # a well counts as done only if all five outputs exist and are non-empty
    if [[ -s "${r1paired[$row]}" && -s "${r2paired[$row]}" &&
          -s "${r1singletrim[$row]}" && -s "${r2singletrim[$row]}" &&
          -s "${jsonout[$row]}" ]]
    then
        echo -e "output files for '${wellprefix[$row]}' already exist."
        if [[ "$overwrite_existing" == "true" ]]
        then
            echo "overwrite_existing=true. trimming anyway."
        else
            echo "overwrite_existing=false. skipping."
            continue
        fi
    fi

    fastp -i "${r1in[$row]}" -I "${r2in[$row]}" \
        -o "${r1paired[$row]}" -O "${r2paired[$row]}" \
        --unpaired1 "${r1singletrim[$row]}" --unpaired2 "${r2singletrim[$row]}" \
        -h "fastq_trimmed/${wellprefix[$row]}.html" -j "${jsonout[$row]}" -R "${wellprefix[$row]}" \
        --adapter_fasta=Scripts/A03a_adapter_sequences.fa \
        -f 17 -t 10 -F 15 -T 10 -l 30 \
        --cut_right -q 20 -u 50 -y -Y 15 -x

done





# completion marker: A03b greps the sublogs for this exact sentence
printf '%b\n' "\n\n'A03a_trim' completed.\n\n"



# end-of-task stamp: host name first, then the current date
for stamp_cmd in "hostname -s" "date"
do
    echo "Job $JOB_ID.$SGE_TASK_ID ended on:   " $(${stamp_cmd})
done
echo " "

## (A03b) file check

In [4]:
%%bash
cat > ../Scripts/A03b_check_trimmed.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A03b_check_trimmed.$JOB_ID
#$ -j y
#$ -l h_rt=1:00:00,h_data=8G
#$ -N A03b_check_trimmed
#$ -hold_jid A03a_trim
# Grid Engine directives above: run from the submission directory (-cwd), write
# a single merged stdout/stderr log (-o, -j y) and hold until the job named
# A03a_trim has finished (-hold_jid). This is a single job, not an array.



# log where and when the job started
echo "Job $JOB_ID started on:   " `hostname -s`
echo "Job $JOB_ID started on:   " `date `
echo " "





# environment init -------------------------------------------------------------

# export every non-comment line of snmCT_parameters.env into the environment.
# The lines are joined by xargs and word-split by the shell, so values must not
# contain whitespace. Presumably this is where metadat_well and metadat_plate
# are defined -- confirm.
export $(cat snmCT_parameters.env | grep -v '^#' | xargs) # <--

# not referenced anywhere else in this check script
overwrite_existing=false # <-- for help with incomplete jobs



# extract target filepaths -----------------------------------------------------

# query_metadat COLNAME
#   Print one column of the well-level metadata CSV ("${metadat_well}"),
#   selected by its header name, one value per line.
#   The header itself is printed first, so when the output is captured into an
#   array, index 0 holds the column name and index N the value of data row N.
#   Returns non-zero (message on stderr, nothing on stdout) when COLNAME is not
#   a header of the file; without this check awk would print whole CSV lines.
query_metadat () {
    awk -F',' -v targetcol="$1" '
        NR == 1 {
            # locate the requested column in the header row
            for (i = 1; i <= NF; i++) {
                if ($i == targetcol) { assayout = i; break }
            }
            if (!assayout) {
                printf "query_metadat: column \"%s\" not found in %s\n", \
                    targetcol, FILENAME > "/dev/stderr"
                exit 1
            }
        }
        { print $assayout }
    ' "${metadat_well}"
}

# check_filepaths_in_assay FILE...
#   Print "missing '<FILE>'" on its own line for every argument that does not
#   exist or is an empty file; print nothing for files that are present.
#   Arguments are taken verbatim (no word splitting or globbing of paths).
check_filepaths_in_assay() {
    local file
    for file in "$@"
    do
        if [[ ! -s "${file}" ]]
        then
            echo "missing '${file}'"
        fi
    done
}

# check_filepath_by_batch HEADER FILE...
#   Takes one metadata column as produced by query_metadat (first argument is
#   the column header, argument N+1 the path for metadata row N) and, batch by
#   batch, counts how many of the batch's files are missing or empty.
#   Prints "<batch> \t <num_missing>" for each batch with at least one missing
#   file, followed by a summary list of those batches.
#   Reads the globals batchnum, nwells and nbatches; calls
#   check_filepaths_in_assay. Like the rest of the script it leaves its working
#   variables (target_array, batches_to_rerun, ...) in the global scope.
check_filepath_by_batch() {
    target_array=("$@")
    batches_to_rerun=()
    for ((target_batch=1; target_batch<=nbatches; target_batch++))
    do
        # gather this batch's rows and files directly, so that the rows of a
        # batch do not have to be adjacent in the metadata
        target_well_rows=()
        batch_file_list=()
        for ((row=1; row<nwells; row++))
        do
            if [[ "${batchnum[$row]}" == "${target_batch}" ]]
            then
                target_well_rows+=("$row")
                if [[ -n "${target_array[$row]}" ]]
                then
                    batch_file_list+=("${target_array[$row]}")
                fi
            fi
        done

        num_files_missing=$(check_filepaths_in_assay "${batch_file_list[@]}" | wc -l)

        # arithmetic comparison ('>' inside [[ ]] would compare as strings)
        if (( num_files_missing > 0 ))
        then
            batches_to_rerun+=("${target_batch}")
            echo -e "${target_batch} \t ${num_files_missing}"
        fi
    done

    if (( ${#batches_to_rerun[@]} > 0 ))
    then
        echo "batches to re-run:"
        echo "${batches_to_rerun[*]}"
    fi
}

# Each array below holds one column of the well metadata. query_metadat prints
# the column header first, so element 0 is the header and elements 1.. are the
# per-well values; the unquoted $(...) splits the output on whitespace.
batchnum=($(query_metadat "batchnum"))

# nwells counts the header element as well as the data rows
nwells=${#batchnum[@]}
# the number of batches is taken from the last batchnum entry
# NOTE(review): assumes the last metadata row carries the highest batchnum -- confirm
nbatches=${batchnum[-1]}

# expected fastp outputs: paired reads
r1paired=($(query_metadat "A03a_fqgz_paired_R1"))
r2paired=($(query_metadat "A03a_fqgz_paired_R2"))

# expected fastp outputs: unpaired ("singletrim") reads
r1singletrim=($(query_metadat "A03a_fqgz_singletrim_R1"))
r2singletrim=($(query_metadat "A03a_fqgz_singletrim_R2"))

# expected fastp json report
jsonout=($(query_metadat "A03a_json_fastp"))



# apply checks for A03a output -------------------------------------------------

echo "-----------------------------------------------------------------"
echo "A. counting number of .fastq.gz files generated during trimming step... "
echo "-----------------------------------------------------------------"

# plate names = 2nd column of the plate-level metadata, header row dropped.
# The plate count is the length of that list, so both always agree.
platenames=($(cut -f 2 -d ',' "${metadat_plate}" | tail -n +2))
num_platesin=${#platenames[@]}

# expectation printed below: 384 wells per plate, 4 trimmed files per well
nfastq_expected=$((num_platesin*384*4))
nfastq_tot_trimmed=0

# one table per output type: number of matching files for each plate prefix
for fq_suffix in paired_R1 paired_R2 singletrim_R1 singletrim_R2
do
    echo -e "\n${fq_suffix}.fastq.gz files\n"
    for ((i=0; i<num_platesin; i++))
    do
        # the pattern is quoted so that find, not the shell, expands the '*'
        nfastq_plate=$(
            find "fastq_trimmed/" -maxdepth 1 -mindepth 1 \
                -name "${platenames[$i]}*${fq_suffix}.fastq.gz" | wc -l)
        echo -e "$((i + 1)) \t ${platenames[$i]} \t ${nfastq_plate}"
        nfastq_tot_trimmed=$((nfastq_tot_trimmed + nfastq_plate))
    done
done

# folder-level totals, independent of the expected plate prefixes
nfastq_demult_folder=$(find "fastq_demultip/" -maxdepth 1 -mindepth 1 -name '*.fastq.gz' | wc -l)
nfastq_trimmed_folder=$(find "fastq_trimmed/" -maxdepth 1 -mindepth 1 -name '*.fastq.gz' | wc -l)
fastq_missing=$((nfastq_expected - nfastq_tot_trimmed))

echo -e "\n\nchecks based on plate metadata:"
echo "* num trimmed .fastqs expected based on # plates: (${num_platesin} plates)*384*4 = ${nfastq_expected} (R1p, R2p, R1single, R2single)"
echo "* num trimmed .fastq detected, summed from per-plate values above: ${nfastq_tot_trimmed}"
echo "* num .fastqs missing: ${fastq_missing} ($((fastq_missing/4)) wells have no trimmed output?)"

echo -e "\nother checks:"
echo "* num .fastqs in fastq_trimmed: ${nfastq_trimmed_folder} (regardless of expected plate prefix; should be same as above)"
echo "* num demultiplexed .fastqs detected: ${nfastq_demult_folder} (R1 & R2, count*2 = $((nfastq_demult_folder*2)) should equal trimmed .fastq.gz counts)"
echo -e "\n\n\n"



echo "-----------------------------------------------------------------"
echo "B. checking each expected file (from ${metadat_well}, by batch...)"
echo "-----------------------------------------------------------------"

# _report_missing_by_batch TITLE HEADER FILE...
#   Print TITLE and the table header, then hand the metadata column
#   (HEADER FILE...) to check_filepath_by_batch.
_report_missing_by_batch () {
    local title=$1
    shift
    echo -e "\n${title}\n"
    echo -e "batchnum\tnum_missing"
    check_filepath_by_batch "$@"
}

# re-read the A03a output columns of the well metadata
r1paired=($(query_metadat "A03a_fqgz_paired_R1"))
r2paired=($(query_metadat "A03a_fqgz_paired_R2"))
r1singletrim=($(query_metadat "A03a_fqgz_singletrim_R1"))
r2singletrim=($(query_metadat "A03a_fqgz_singletrim_R2"))
jsonout=($(query_metadat "A03a_json_fastp"))

# one report per output column
_report_missing_by_batch "checking Read 1 properly paired .fastq.gz files:" ${r1paired[@]}
_report_missing_by_batch "checking Read 2 properly paired .fastq.gz files:" ${r2paired[@]}
_report_missing_by_batch "checking Read 1 trimming singleton .fastq.gz files:" ${r1singletrim[@]}
_report_missing_by_batch "checking Read 2 trimming singleton .fastq.gz files:" ${r2singletrim[@]}
_report_missing_by_batch "checking .json summary file out:" ${jsonout[@]}

# closing notes on how to read the tables above
printf '%s\n' \
    "" \
    "" \
    "suggest re-running and checking sublog output of above batches." \
    "" \
    "* checks the A03a output columns of 'metadat_well' if the file exists and is non-empty." \
    "* if none missing, will only output target column names above." \
    "* caveat: if some fastq.gz files seem 'missing' but all other checks OK, may just be no/few reads surviving trimming." \
    "  (check 'fastq_demultip/' and associated fastp logs e.g., fastq_trimmed/wellprefix.html report)"






echo -e "\n\n-----------------------------------------------------------------"
echo "C. checking log files for issues."
echo -e "-----------------------------------------------------------------\n"

echo "checking if 'completed' in sublogs/A03a_trim* output."
echo "if any filename is printed, the associated batch may have not completed trimming."

# -L lists the files that do NOT contain the completion marker. Unlike
# 'grep -c | awk', this also works when only a single log file matches the
# glob (grep -c then prints a bare count without the "file:" prefix).
grep -L "'A03a_trim' completed" sublogs/A03a_trim*





# completion marker for this check job
printf '%b\n' "\n\n'A03b_check_trimmed' completed.\n\n"



# end-of-job stamp: host name first, then the current date
echo " "
for stamp_cmd in "hostname -s" "date"
do
    echo "Job $JOB_ID ended on:   " $(${stamp_cmd})
done
echo " "

## (A03c) fastqc

In [5]:
%%bash
cat > ../Scripts/A03c_fastqc_trimmed.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A03c_fastqc_trimmed.$JOB_ID
#$ -j y
#$ -N A03c_fastqc_trimmed
#$ -l h_rt=8:00:00,h_data=8G
#$ -pe shared 4
#$ -hold_jid A03a_trim,A02c_fastqc_demultip
# Grid Engine directives above: run from the submission directory (-cwd), write
# a single merged stdout/stderr log (-o, -j y), request 4 slots in the "shared"
# parallel environment (fastqc is started with -t 4 below) and hold until the
# jobs named A03a_trim and A02c_fastqc_demultip have finished (-hold_jid).



# log where and when the job started
echo "Job $JOB_ID started on:   " `hostname -s`
echo "Job $JOB_ID started on:   " `date `
echo " "





# environment init -------------------------------------------------------------
# NOTE(review): lines tagged '# <--' look like the site-/project-specific
# settings a user is expected to adapt -- confirm the convention.

. /u/local/Modules/default/init/modules.sh # <--
module load anaconda3 # <--
conda activate snmCTseq # <--

# export every non-comment line of snmCT_parameters.env into the environment.
# The lines are joined by xargs and word-split by the shell, so values must not
# contain whitespace. Presumably this is where metadat_plate is defined -- confirm.
export $(cat snmCT_parameters.env | grep -v '^#' | xargs) # <--

# not referenced anywhere else in this script
overwrite_existing=false # <-- for help with incomplete jobs
# list of R1 files (one per line) whose wells get a fastqc report
wells_to_run=Scripts/A02c_random_fastqc_wells.txt # <-- (qc'd wells from A02c by default)
# only used when the list above has to be (re)generated
n_wells=4  # <-- num wells/plate to sample
# true: delete an existing list and draw a new random sample
overwrite_random_wells=false # <-- overwrite target list of wells (false by default)



# choose random wells ----------------------------------------------------------

# Decide which wells get a fastqc report.
# * If "${wells_to_run}" already exists it is reused unchanged.
# * If it does not exist, ${n_wells} R1 files per plate prefix are sampled at
#   random from fastq_trimmed/ and their names are saved to "${wells_to_run}".
# * An existing list is deleted and re-drawn only when overwrite_random_wells=true.

mkdir -p qc_trimmed

if [[ -e "${wells_to_run}" && "${overwrite_random_wells}" == true ]]
then
    echo "warning: ${wells_to_run} already exists."
    echo "deleting and re-randomizing. (overwrite_random_wells=true)"

    rm "${wells_to_run}"
fi

# make random samples (otherwise skip)
if [[ ! -e "${wells_to_run}" ]]
then
    # plate prefixes = 2nd column of the plate metadata, header row dropped
    for prefix in $(cut -f 2 -d ',' "${metadat_plate}" | tail -n +2)
    do
        # piping straight into the list writes nothing (instead of a blank
        # line) when a prefix has no matching files
        ls fastq_trimmed/${prefix}*R1*fastq.gz \
            | shuf -n "${n_wells}" \
            >> "${wells_to_run}"
    done
else
    echo "warning: ${wells_to_run} already exists."
    echo "using existing random wells. (overwrite_random_wells=false)"
fi

# print wells for record in sublog
echo -e "\nnum wells sampled: $(wc -l "${wells_to_run}")\n"
cat "${wells_to_run}"




# apply fastqc -----------------------------------------------------------------

# run fastqc on the randomly selected wells
echo "running fastqc on each well."
while read -r r1file
do
    # a blank line would leave the prefix empty and make the globs below match
    # every file in fastq_trimmed/
    [[ -n "${r1file}" ]] || continue

    # well prefix = file name up to (not including) "_indexed"
    wellprefix=$(basename "${r1file}" | awk -F"_indexed" '{print $1}')

    # record in the sublog which trimmed files are handed to fastqc
    echo $(ls fastq_trimmed/${wellprefix}*fastq.gz)
    fastqc -t 4 fastq_trimmed/${wellprefix}*fastq.gz -o qc_trimmed/
done < "${wells_to_run}"

# compile across wells
echo "aggregating via multiqc."
multiqc -d qc_trimmed -o qc_trimmed -n multiqc_fastqc \
    --cl-config "sp: { fastqc/zip: {fn: '*_fastqc.zip' } }" -m "fastqc"




# completion marker for this fastqc job
printf '%b\n' "\n\n'A03c_fastqc_trimmed' completed.\n\n"


# end-of-job stamp: host name first, then the current date
echo " "
for stamp_cmd in "hostname -s" "date"
do
    echo "Job $JOB_ID ended on:   " $(${stamp_cmd})
done
echo " "