In [1]:
# # A04_mapping_bismark overall cmds ===========================================

# qsub Scripts/A04a_bismark_map_TAURUS.sub # †
# qsub Scripts/A04b_check_bismark.sub 
# qsub Scripts/A04c_coverage.sub # †

# # * = job array based on "platenum"
# # † = job array based on "batchnum" (two rows at a time)

In [2]:
%%bash
cat > ../Scripts/A04a_bismark_map_TAURUS.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A04a_bismark.$JOB_ID.$TASK_ID
#$ -j y
#$ -l h_rt=12:00:00,h_data=8G
#$ -pe shared 4
#$ -N A04a_bismark
#$ -t 1-512
#$ -hold_jid_ad A03a_trim



# log which host this array task landed on and when it started
echo "Job ${JOB_ID}.${SGE_TASK_ID} started on:   " $(hostname -s)
echo "Job ${JOB_ID}.${SGE_TASK_ID} started on:   " $(date)
echo " "





# environment init -------------------------------------------------------------

. /u/local/Modules/default/init/modules.sh # <--
module load anaconda3 # <--
conda activate snm3Cseq_taurus # <--

# export every non-comment line of the parameter file as an environment variable
# (word-splitting of the substitution is intentional: one KEY=VALUE per word)
export $(grep -v '^#' snm3C_parameters.env | xargs) # <--

skip_complete=true # <-- for help with incomplete jobs
overwrite_partial=true # <-- for help with incomplete jobs

# note: estimated time is ~20 min/well, so the requested h_rt=12:00:00 may be more than needed (anticipate ~8hr)
# alternatives: request less time & resubmit if incomplete, or change the number of wells per batch in A01c



# extract target filepaths -----------------------------------------------------

# helper functions
# query_metadat COLNAME
#   Print one column of the well-level metadata .csv, one value per line.
#   The first line printed is the column header itself, so when the output is
#   captured into an array, element 0 is the header and element N is data row N.
# Globals:   metadat_well (read) - path to the comma-separated metadata file
# Arguments: $1 - exact name of the column to extract
# Outputs:   column values on stdout; an error on stderr if the column is absent
# Returns:   0 on success, non-zero if the column (or the file) cannot be found
query_metadat () {
  awk -F',' -v targetcol="$1" '
      NR==1 {
                for (i=1;i<=NF;i++) {
                    if ($i==targetcol) {assayout=i; break} }
                # without this check an unknown column leaves assayout unset,
                # and "print $assayout" would silently emit entire rows ($0)
                if (!assayout) {
                    printf("query_metadat: column \"%s\" not found\n", targetcol) > "/dev/stderr"
                    exit 1
                }
            }
      { print $assayout }' "${metadat_well}"
}

# extract target wells, print values for log
# query_metadat prints the column header first, so batchnum[0] is the header
# and data rows sit at indices 1..(nwells-1); nwells is the array length
# (header included).
batchnum=($(query_metadat "batchnum"))
nwells=${#batchnum[@]}

# collect the metadata rows whose batch number equals this array task's ID.
# the loop stops at the last real element: index "nwells" does not exist, and
# its empty value would match an empty/unset SGE_TASK_ID, queueing a phantom
# row with empty file paths for the mapping loop.
target_well_rows=()
for ((row=1; row<nwells; row++))
do
    if [[ "${batchnum[$row]}" == "$SGE_TASK_ID" ]]
    then
        target_well_rows+=("$row")
    fi
done



# filepaths associated with target rows in well-level metadata -----------------
# (generally not customizeable because output names set by bismark)

# create the top-level output folder if needed. 'mkdir -p' is a no-op when the
# directory already exists, so concurrent array tasks cannot trip over each
# other (the previous '-s' test + plain 'mkdir' could fail with "File exists").
mkdir -p mapping_bismark

# each array below holds one metadata column: element 0 is the column header,
# element N is the value for metadata row N (see query_metadat)
wellprefix=($(query_metadat "wellprefix"))
dir_well=($(query_metadat "A04a_dir_bismark"))

# trimmed .fastqs for input to mapping
fastq_r1p=($(query_metadat "A03a_fqgz_paired_R1"))
fastq_r2p=($(query_metadat "A03a_fqgz_paired_R2"))
fastq_r1singletrim=($(query_metadat "A03a_fqgz_singletrim_R1"))
fastq_r2singletrim=($(query_metadat "A03a_fqgz_singletrim_R2"))

# TAURUS-related files
# (names of the pooled unmapped-read files written inside each well directory)
r1unmap=r1unmap.fq.gz
r2unmap=r2unmap.fq.gz

# final files to check
log_picard=($(query_metadat "A04a_log_picard"))
log_R2P3=($(query_metadat "A04a_bismarktxt_R2p3"))
bam_final=($(query_metadat "A04a_bam_final"))



# print target files -----------------------------------------------------------

# record in the job log which metadata rows / wells this task will process
printf 'batch number: %s\n' "$SGE_TASK_ID"
printf 'processing the following rows in well metadata file (%s):\n' "$metadat_well"
for idx in "${!target_well_rows[@]}"
do
    row=${target_well_rows[$idx]}
    echo -e "$row\t${wellprefix[$row]}"
done
printf '\n\n\n'


# for each well in batch, apply mC map & quant
# (could add check here to skip rows where no trimming output,
# but since done by well doesn't cause catastrophic problems)
# Map every well assigned to this batch. For each target row: skip the well if
# its final outputs already exist (and skip_complete=true); otherwise run the
# two-stage bismark mapping, merge + de-duplicate the alignments, index the
# final .bam, and clear intermediate files.
for row in "${target_well_rows[@]}"
do

    # guard: with an empty prefix/directory the 'rm' calls further down would
    # act on 'mapping_bismark/' itself or on every '*bam' in the directory
    if [[ -z "${wellprefix[$row]}" || -z "${dir_well[$row]}" ]]
    then
        echo "WARNING: no wellprefix / A04a_dir_bismark entry for metadata row '$row'. skipping."
        continue
    fi

    # check for existing mapping output
    # if final outputs exist, skip; else run mapping .bam
    # (the metadata paths tested below are relative to the project directory)
    cd "${dir_proj:?dir_proj is not set}" || exit 1

    # note the spaces around '==': without them the expression is one non-empty
    # string, which is always true regardless of skip_complete
    if [[ -s "${log_R2P3[$row]}" \
        && -s "${log_picard[$row]}" \
        && -s "${bam_final[$row]}" \
        && "$skip_complete" == "true" ]]
    then
        echo -e "final alignments for '${wellprefix[$row]}' already exist. skipping this well.'"
        continue
    fi

    echo -e "\n\napplying bismark to '${wellprefix[$row]}'...\n\n"

    # remove old directory if one exists to deal with incomplete files
    # albeit the only major issues are .bai and .tbi indices
    # (these often are not overwritten by software in the pipeline,
    # resulting in "index is older than file" errors later on)
    if [[ -e "mapping_bismark/${wellprefix[$row]}" && "$overwrite_partial" == "true" ]]
    then
        echo -e "\n\nWARNING: folder for '${wellprefix[$row]}' exists, but not its final allc files."
        echo "because overwrite_partial=true, deleting the directory and re-mapping."
        rm -rf -- "mapping_bismark/${wellprefix[$row]}"
    fi

    # '-p': the directory may legitimately still exist (overwrite_partial != true)
    mkdir -p "$dir_proj/${dir_well[$row]}"
    if ! cd "$dir_proj/${dir_well[$row]}"
    then
        # never run the mapping/cleanup steps from the wrong directory
        echo "cannot enter '$dir_proj/${dir_well[$row]}'. skipping this well."
        continue
    fi

    # (A) run bismark "two-stage" mapping -------------------------------------
    # in: .fastqs from trimming: four .fastqs,
    #     properly paired ($fastq_r2p, $fastq_r1p) and trimming singletons
    #    ($fastq_r1singletrim, $fastq_r2singletrim)
    # out: - single-end alignments (*bismark.bam) for full-length reads and
    #        for the split substrings of reads unmapped in the first pass
    #      - key log files (e.g., mapping rate)
    # -------------------------------------------------------------------------


    # TAURUS mapping step 1 -----------------------------------------------
    # (Ai.) first pass, full length reads [2 to 7 minutes each read]
    bismark "$ref_dir_bowtie1" --multicore 3 --bowtie1 --pbat --un -se \
            "$dir_proj/${fastq_r1p[$row]},$dir_proj/${fastq_r1singletrim[$row]}"
    bismark "$ref_dir_bowtie1" --multicore 3 --bowtie1 --un -se  \
            "$dir_proj/${fastq_r2p[$row]},$dir_proj/${fastq_r2singletrim[$row]}"

    # aggregate unmapped reads [<1 min]
    cat *R1*_unmapped_reads.fq.gz > "$r1unmap"
    cat *R2*_unmapped_reads.fq.gz > "$r2unmap"


    # unmapped read-splitting ---------------------------------------------

    # TAURUS-MH style read-splitting [<1 min per split, usually <1 min tot]
    # read 1 - first 40bp (requires min length of 80)
    seqkit seq -m 80 "$r1unmap" \
          | seqkit subseq -r 1:40 \
          | seqkit replace -p "_1:N:0" -r "_1:P1:N:0" > subseq_R1_1.fq
    # middle (min length needed 30)
    seqkit seq -m 110 "$r1unmap" \
          | seqkit subseq -r 41:-41 \
          | seqkit replace -p "_1:N:0" -r "_1:P2:N:0" > subseq_R1_2.fq
    # last 40bp
    seqkit seq -m 80 "$r1unmap" \
          | seqkit subseq -r -40:-1 \
          | seqkit replace -p "_1:N:0" -r "_1:P3:N:0" > subseq_R1_3.fq

    # read 2
    # first 40bp (requires min length of 80)
    seqkit seq -m 80 "$r2unmap" \
          | seqkit subseq -r 1:40 \
          | seqkit replace -p "_2:N:0" -r "_2:P1:N:0" > subseq_R2_1.fq
    # middle (min length after trim 30)
    seqkit seq -m 110 "$r2unmap" \
          | seqkit subseq -r 41:-41 \
          | seqkit replace -p "_2:N:0" -r "_2:P2:N:0" > subseq_R2_2.fq
    # last 40bp
    seqkit seq -m 80 "$r2unmap" \
          | seqkit subseq -r -40:-1 \
          | seqkit replace -p "_2:N:0" -r "_2:P3:N:0" > subseq_R2_3.fq


    # TAURUS step 2 --------------------------------------------------

    # single-end, R1, [<2 minutes per read substring]
    bismark "$ref_dir_bowtie1" --multicore 3 --bowtie1 --pbat -se \
            subseq_R1_1.fq,subseq_R1_2.fq,subseq_R1_3.fq

    # single-end, R2
    bismark "$ref_dir_bowtie1" --multicore 3 --bowtie1 -se \
            subseq_R2_1.fq,subseq_R2_2.fq,subseq_R2_3.fq


    # merge & dedupe ------------------------------------------------

    # merge & sort all alignments [<3 min]
    samtools merge -f merged.bam *bismark.bam
    samtools sort -o merged_sorted.bam merged.bam

    # deduplication [<1-2 min]
    picard MarkDuplicates I=merged_sorted.bam  \
        OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \
        ADD_PG_TAG_TO_READS=false REMOVE_DUPLICATES=true \
        O=merged_dedupe.bam M=picard.log

    # if fails quickcheck,
    # remove final file to force re-running this well / skip in subsequent steps.
    # (explicit if/else: 'a || b && c' parses as '(a || b) && c', which printed
    # the error message whenever quickcheck *succeeded*)
    if samtools quickcheck merged_dedupe.bam
    then
        samtools index merged_dedupe.bam
    else
        echo "quickcheck error with ${wellprefix[$row]}?"
        rm -f merged_dedupe.bam
    fi

    # optionally cleanup files --------------------------------------
    # (empty-var check to avoid broad deletion of .bam files; it must test this
    # row's prefix -- a bare "$wellprefix" is element 0, i.e. the column header)
    if [[ -n "${wellprefix[$row]}" ]]
    then
        echo "clearing intermediate files for ${wellprefix[$row]}."
        rm subseq_*.fq
        rm *.fq.gz
        rm -- "${wellprefix[$row]}"*bam
        rm subseq*bam
        rm merged.bam
        rm merged_sorted.bam
    fi
done





# completion marker plus host/time stamp; the 'ended on' lines are what the
# A04b check script greps for in the sublogs
printf "\n\n'A04a_bismark' completed.\n\n\n"



echo "Job ${JOB_ID}.${SGE_TASK_ID} ended on:   " $(hostname -s)
echo "Job ${JOB_ID}.${SGE_TASK_ID} ended on:   " $(date)


In [3]:
%%bash
cat > ../Scripts/A04b_check_bismark.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A04b_check_bismark.$JOB_ID
#$ -j y
#$ -l h_rt=2:00:00,h_data=4G
#$ -N A04b_bischeck
#$ -hold_jid A04a_bismark

# NOTE: the scheduler only reads directive lines that begin with '#$'. They were
# previously double-commented ('# #$'), so -cwd / -o / -hold_jid were ignored
# and the relative paths used below (snm3C_parameters.env, sublogs/) would not
# resolve when submitted with a plain 'qsub'.



echo "Job $JOB_ID started on:   " $(hostname -s)
echo "Job $JOB_ID started on:   " $(date)
echo " "





# environment init -------------------------------------------------------------

# export every non-comment line of the parameter file as an environment variable
export $(grep -v '^#' snm3C_parameters.env | xargs) # <--



# extract target filepaths -----------------------------------------------------

# query_metadat COLNAME
#   Print one column of the well-level metadata .csv, one value per line.
#   The first line printed is the column header itself, so when the output is
#   captured into an array, element 0 is the header and element N is data row N.
# Globals:   metadat_well (read) - path to the comma-separated metadata file
# Arguments: $1 - exact name of the column to extract
# Outputs:   column values on stdout; an error on stderr if the column is absent
# Returns:   0 on success, non-zero if the column (or the file) cannot be found
query_metadat () {
  awk -F',' -v targetcol="$1" '
      NR==1 {
                for (i=1;i<=NF;i++) {
                    if ($i==targetcol) {assayout=i; break} }
                # without this check an unknown column leaves assayout unset,
                # and "print $assayout" would silently emit entire rows ($0)
                if (!assayout) {
                    printf("query_metadat: column \"%s\" not found\n", targetcol) > "/dev/stderr"
                    exit 1
                }
            }
      { print $assayout }' "${metadat_well}"
}

# check_filepaths_in_assay PATH...
#   Report every given path that does not exist or is an empty file.
# Arguments: any number of file paths (none is allowed and prints nothing)
# Outputs:   one line "missing '<path>'" on stdout per missing/empty file
check_filepaths_in_assay() {
    local file
    # "$@" keeps each argument intact (no re-splitting or glob expansion)
    for file in "$@"
    do
        if [[ ! -s "$file" ]]
        then
            echo "missing '$file'"
        fi
    done
}

# check_filepath_by_batch PATH...
#   Count, per batch, how many of the given files are missing or empty.
#   The arguments are expected to be a full metadata column as returned by
#   query_metadat: argument index N (0-based, index 0 = column header) is the
#   path belonging to metadata row N.
# Globals:   batchnum (read)  - batch number per metadata row (index 0 = header)
#            nwells   (read)  - length of the batchnum array
#            nbatches (read)  - highest batch number to examine
#            batches_to_rerun (written) - batch numbers with >=1 missing file
# Outputs:   per affected batch a line "<batch> \t <num_missing>", followed by
#            the list of batches to re-run (nothing if no file is missing)
check_filepath_by_batch() {
    local target_array=("$@")
    local target_batch row path num_files_missing
    batches_to_rerun=()

    # an undefined/misspelled array at the call site arrives here as zero
    # arguments; say so instead of silently reporting "nothing missing"
    if (( ${#target_array[@]} == 0 ))
    then
        echo "WARNING: check_filepath_by_batch received no file paths; nothing was checked." >&2
    fi

    for ((target_batch=1; target_batch<=nbatches; target_batch++))
    do
        # look up each row of the batch directly (rather than slicing from the
        # batch's first row), so rows of a batch need not be contiguous
        num_files_missing=0
        for ((row=1; row<nwells; row++))
        do
            [[ "${batchnum[$row]}" == "${target_batch}" ]] || continue
            path=${target_array[$row]}
            # rows without a path entry are not counted
            if [[ -n "$path" && ! -s "$path" ]]
            then
                num_files_missing=$((num_files_missing + 1))
            fi
        done

        # arithmetic comparison; '>' inside [[ ]] compares strings
        if (( num_files_missing > 0 ))
        then
            batches_to_rerun+=("${target_batch}")
            echo -e "${target_batch} \t ${num_files_missing}"
        fi
    done

    if (( ${#batches_to_rerun[@]} > 0 ))
    then
        echo "batches to re-run:"
        echo "${batches_to_rerun[*]}"
    fi
}

# per-row batch assignments; element 0 is the column header, so data rows are
# indices 1..(nwells-1). nbatches is taken from the last row
# (presumably rows are ordered by ascending batch number -- verify in metadata)
batchnum=($(query_metadat "batchnum"))

nwells=${#batchnum[@]}
nbatches=${batchnum[-1]}



# apply checks for A04a output -------------------------------------------------

echo "-----------------------------------------------------------------"
echo "A. printing number of final .bams missing (by batch)... "
echo "-----------------------------------------------------------------"

log_picard=($(query_metadat "A04a_log_picard"))
log_R2P3=($(query_metadat "A04a_bismarktxt_R2p3"))
bam_final=($(query_metadat "A04a_bam_final"))

echo "checking final merged .bam:"
echo -e "batchnum\tnum_missing"
check_filepath_by_batch "${bam_final[@]}"

echo "checking singleton trimming .log:"
check_filepath_by_batch "${log_R2P3[@]}"

# the picard and final-.bam checks must use the arrays defined above
# (log_picard / bam_final); 'picard_log' and 'final_bam' are never set in this
# script, so the checks silently passed on empty input
echo "checking picard .log:"
check_filepath_by_batch "${log_picard[@]}"

# NOTE(review): 'trimming_log' is never defined in this script, so the two
# trimming-log checks (here and in section B) currently inspect nothing.
# It presumably should be loaded via query_metadat from the trimming-log
# column of the metadata -- TODO confirm the column name and define it above.
echo "checking trimming logs:"
check_filepath_by_batch "${trimming_log[@]}"

echo -e "\n\nsuggest re-running and checking sublog output of above batches."



echo -e "\n\n-----------------------------------------------------------------"
echo "B. checking each expected .bam file (from $metadat_well)"
echo -e "-----------------------------------------------------------------\n"

echo -e "\nchecking final .bam file:\n"
check_filepaths_in_assay "${bam_final[@]}"

echo -e "\nchecking R2:P3 singleton trimming file:\n"
check_filepaths_in_assay "${log_R2P3[@]}"

echo -e "\nchecking picard .log:\n"
check_filepaths_in_assay "${log_picard[@]}"

echo -e "\ncompare to the number of trimmed .fastq sets in:\n"
echo -e "(using .json as proxy):\n"
check_filepaths_in_assay "${trimming_log[@]}"

echo -e "\n* checks the A04a output columns of 'metadat_well' if the file exists and is non-empty."
echo "* if none missing, will only output target column names above."
echo "* if some declared 'missing' but all other checks OK, may just be no/few reads surviving trimming."
echo "  (check 'fastq_demultip/' and associated fastp logs e.g., fastq_trimmed/wellprefix.html report)"



echo -e "\n\n-----------------------------------------------------------------"
echo "C. checking log files for issues."
echo -e "-----------------------------------------------------------------\n"

echo "checking if 'completed' in sublogs/A04a_bismark* output."
echo "if any filename is printed, the associated batch may have not completed mapping."

# -H forces the "file:count" format even when only one sublog matches the glob
# (otherwise grep prints the bare count and awk has no filename to report)
grep -Hc 'ended on' sublogs/A04a_bismark* | awk -F ":" '$2==0 {print $1}'





echo -e "\n\n'A04b_bischeck' completed.\n\n"



echo "Job $JOB_ID ended on:   " $(hostname -s)
echo "Job $JOB_ID ended on:   " $(date)



In [4]:
%%bash
cat > ../Scripts/A04c_coverage.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A04c_coverage.$JOB_ID.$TASK_ID
#$ -j y
#$ -l h_rt=6:00:00,h_data=16G
#$ -N A04c_coverage
#$ -t 1-512
#$ -hold_jid_ad A04a_bismark



# log which host this array task landed on and when it started
echo "Job ${JOB_ID}.${SGE_TASK_ID} started on:   " $(hostname -s)
echo "Job ${JOB_ID}.${SGE_TASK_ID} started on:   " $(date)
echo " "





# environment init -------------------------------------------------------------

. /u/local/Modules/default/init/modules.sh # <--
module load anaconda3 # <--
conda activate snm3Cseq_taurus # <--

# export every non-comment line of the parameter file as an environment variable
# (word-splitting of the substitution is intentional: one KEY=VALUE per word)
export $(grep -v '^#' snm3C_parameters.env | xargs) # <--

skip_complete=true # <-- for help with incomplete jobs




# extract target filepaths -----------------------------------------------------

# helper functions
# query_metadat COLNAME
#   Print one column of the well-level metadata .csv, one value per line.
#   The first line printed is the column header itself, so when the output is
#   captured into an array, element 0 is the header and element N is data row N.
# Globals:   metadat_well (read) - path to the comma-separated metadata file
# Arguments: $1 - exact name of the column to extract
# Outputs:   column values on stdout; an error on stderr if the column is absent
# Returns:   0 on success, non-zero if the column (or the file) cannot be found
query_metadat () {
  awk -F',' -v targetcol="$1" '
      NR==1 {
                for (i=1;i<=NF;i++) {
                    if ($i==targetcol) {assayout=i; break} }
                # without this check an unknown column leaves assayout unset,
                # and "print $assayout" would silently emit entire rows ($0)
                if (!assayout) {
                    printf("query_metadat: column \"%s\" not found\n", targetcol) > "/dev/stderr"
                    exit 1
                }
            }
      { print $assayout }' "${metadat_well}"
}

# extract target wells, print values for log

# query_metadat prints the column header first, so batchnum[0] is the header
# and data rows sit at indices 1..(nwells-1); nwells is the array length
# (header included).
batchnum=($(query_metadat "batchnum"))
nwells=${#batchnum[@]}

# collect the metadata rows whose batch number equals this array task's ID.
# the loop stops at the last real element: index "nwells" does not exist, and
# its empty value would match an empty/unset SGE_TASK_ID, queueing a phantom
# row with empty file paths for the coverage loop.
target_well_rows=()
for ((row=1; row<nwells; row++))
do
    if [[ "${batchnum[$row]}" == "$SGE_TASK_ID" ]]
    then
        target_well_rows+=("$row")
    fi
done



# filepaths associated with target rows in well-level metadata -----------------

# Load the metadata columns needed for coverage profiling into arrays.
# Each array holds one column as printed by query_metadat: element 0 is the
# column header and element N is the value for metadata row N, so the arrays
# can be indexed with the row numbers stored in target_well_rows.
# (values are split on whitespace by the unquoted command substitution)
wellprefix=($(query_metadat "wellprefix"))
dir_well=($(query_metadat "A04a_dir_bismark"))

# input: final alignment file per well, used as ${dir_proj}/<path> below
final_bam=($(query_metadat "A04a_bam_final"))

# outputs written per well, used as ${dir_proj}/<path> below
outsamstats=($(query_metadat "A04c_txt_samstats")) # 'samstats.txt' by default
outcovstats=($(query_metadat "A04c_txt_covnsites")) # 'nbases_cov_by_chr.txt'
outcovtot=($(query_metadat "A04c_txt_covtot")) # 'total_cov_by_chr.txt'



# samtools stats on each well in the batch -------------------------------------

# For every well in this batch: write 'samtools stats' summary numbers and
# per-chromosome coverage tables (number of covered sites, summed depth)
# computed from the well's final .bam.
for row in "${target_well_rows[@]}"
do

    cd "${dir_proj:?dir_proj is not set}" || exit 1

    # guard: with an empty path, "${dir_proj}/" is the project directory itself
    # and would pass the '-s' tests below
    if [[ -z "${final_bam[$row]}" || -z "${dir_well[$row]}" ]]
    then
        echo "WARNING: no A04a_bam_final / A04a_dir_bismark entry for metadata row '$row'. skipping."
        continue
    fi

    if [[ -s "${dir_proj}/${outsamstats[$row]}" \
        && -s "${dir_proj}/${outcovstats[$row]}" \
        && -s "${dir_proj}/${outcovtot[$row]}" ]]
    then
        echo -e "coverage output for '${wellprefix[$row]}' already exists."

        if [[ "${skip_complete}" == "true" ]]
        then
            echo "skip_complete == true. skipping this well.'"
            continue
        else
            echo "skip_complete != true. re-running this well.'"
        fi
    fi

    if [[ ! -s "${dir_proj}/${final_bam[$row]}" ]]
    then
        echo -e "input .bam for '${wellprefix[$row]}' seems to be missing. skipping.\n\n"
        # actually skip: without this the steps below ran on a missing file and
        # left empty/partial output tables behind
        continue
    fi

    echo -e "\n\nprofiling .bams from '${wellprefix[$row]}'...\n\n"

    # work inside the well's own directory so the temporary pileup file of
    # concurrently running array tasks cannot collide
    if ! cd "${dir_well[$row]}"
    then
        echo "cannot enter '${dir_well[$row]}'. skipping this well."
        continue
    fi

    # run samtools stats
    # (keep only the summary-number 'SN' lines: name + value columns)
    samtools stats "${dir_proj}/${final_bam[$row]}" | grep '^SN' | cut -f 2,3 > "${dir_proj}/${outsamstats[$row]}"

    # use samtools mpileup for total coverage
    # (columns kept: 1 = reference/chromosome name, 4 = read depth at the site)
    samtools mpileup "${dir_proj}/${final_bam[$row]}" | cut -f 1,4 > tmp_coverage_mpileup

    # aggregate by chromosome
    # (useful for sex-checks)
    # number of pileup rows (covered sites) per chromosome
    cut -f 1 tmp_coverage_mpileup | uniq -c > "${dir_proj}/${outcovstats[$row]}"
    # summed depth per chromosome
    awk '{covsums[$1]+=$2} END {for (key in covsums) printf("%s\t%s\n", key, covsums[key])}' \
        tmp_coverage_mpileup > "${dir_proj}/${outcovtot[$row]}"
    rm tmp_coverage_mpileup

done



# completion marker plus host/time stamp for the job log
printf "\n\n'A04c_coverage' completed.\n\n\n"





echo "Job ${JOB_ID}.${SGE_TASK_ID} ended on:   " $(hostname -s)
echo "Job ${JOB_ID}.${SGE_TASK_ID} ended on:   " $(date)
