In [1]:
# # A05_quantify_mC overall cmds ===============================================

# qsub Scripts/A05a_bam2allc.sub  # † 
# qsub Scripts/A05b_check_allcs.sub # ‡
# qsub Scripts/A05c_global_mC_stats.sub # †
# qsub Scripts/A05d_allc_to_mcds.sub # *

# * = job array based on "platenum"
# † = job array based on "batchnum" (two rows at a time)
# ‡ fast enough to run interactively

In [2]:
%%bash
cat > ../Scripts/A05a_bam2allc.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A05a_bam2allc.$JOB_ID.$TASK_ID
#$ -j y
#$ -l h_rt=4:00:00,h_data=24G
#$ -N A05a_bam2allc
#$ -t 1-512
#$ -hold_jid_ad A04c_coverage



# log the execution host and the start time of this array task
echo "Job ${JOB_ID}.${SGE_TASK_ID} started on:   " $(hostname -s)
echo "Job ${JOB_ID}.${SGE_TASK_ID} started on:   " $(date)
echo " "





# environment init -------------------------------------------------------------
# (lines tagged "<--" are the ones flagged by the author for customization)

source /u/local/Modules/default/init/modules.sh # <--
module load anaconda3 # <--
conda activate snm3Cseq_taurus # <--

# export every non-comment entry of the parameter file into the environment
export $(grep -v '^#' snm3C_parameters.env | xargs) # <--

# switches for help with incomplete jobs
# (overwrite_partial is set here but not read anywhere in this script)
skip_complete=true # <--
overwrite_partial=true # <--



# extract target filepaths -----------------------------------------------------

# helper functions
# query_metadat COLNAME
#   Print one column of the comma-separated well metadata file "$metadat_well",
#   selected by its header name. The header itself is printed first, so when the
#   output is read into an array, index 0 is the column name and index N is
#   data row N.
#   Returns non-zero (printing nothing on stdout) when COLNAME is not a header;
#   previously an unknown column silently yielded whole CSV lines.
query_metadat () {
  awk -F',' -v targetcol="$1" '
      NR == 1 {
          # locate the requested column in the header row
          for (i = 1; i <= NF; i++) {
              if ($i == targetcol) { assayout = i; break }
          }
          if (!assayout) {
              print "query_metadat: column \"" targetcol "\" not found" > "/dev/stderr"
              exit 1
          }
      }
      { print $assayout }
  ' "$metadat_well"
}

# extract target wells, print values for log
batchnum=($(query_metadat "batchnum"))
nwells=${#batchnum[@]}

# collect the metadata row indices whose batchnum equals this array task's ID
# (index 0 of each queried array is the column header, hence the scan from 1)
target_well_rows=()
row=1
while (( row <= nwells ))
do
    [[ "${batchnum[$row]}" == "$SGE_TASK_ID" ]] && target_well_rows+=($row)
    (( row++ ))
done



# filepaths associated with target rows in well-level metadata -----------------
# (generally not customizable because output names set by bismark)

wellprefix=($(query_metadat "wellprefix"))
dir_well=($(query_metadat "A04a_dir_bismark"))

# input alignments
bam_in=($(query_metadat "A04a_bam_final"))

# outputs (allc.tsv.gz by default) and their tabix indices
allc_out=($(query_metadat "A05a_allc"))
allctbi_out=($(query_metadat "A05a_allctbi"))



# print target files -----------------------------------------------------------

echo "batch number: $SGE_TASK_ID"
echo "processing the following rows in well metadata file ($metadat_well):"
for row in "${target_well_rows[@]}"; do
    echo -e "$row\t${wellprefix[$row]}"
done
echo -e "\n\n"

# convert each target well's final .bam into an .allc.tsv.gz (+ .tbi)
for row in "${target_well_rows[@]}"
do

    # check for existing .allc output
    # if final outputs exist (and skip_complete is true), skip; else convert the .bam
    # (paths below are tested relative to dir_proj)
    cd "$dir_proj"

    if [[ -s ${allc_out[$row]} && -s ${allctbi_out[$row]} ]]
    then
        # an .allc of <= 50 bytes is treated as empty and re-generated
        if [ "$(wc -c < "${allc_out[$row]}")" -le 50 ]
        then
            echo ".allc exists, but seems to be empty. re-running '${wellprefix[$row]}'."
        else
            echo -e "final .allc for '${wellprefix[$row]}' already exists."
            if [[ "${skip_complete}" == "true" ]]
            then
                echo "skip_complete == true. skipping this well.'"
                continue
            else
                echo "skip_complete != true. re-running this well.'"
            fi
        fi
    fi

    if [[ ! -e ${bam_in[$row]} ]]
    then
        echo -e "\n\n.bam for '${wellprefix[$row]}' missing. skipping this well."
        continue
    fi

    # .bam --> .allc (generally 3-5min)
    echo -e "\n\ncoverting .bam to .allc for '${wellprefix[$row]}'...\n\n"

    # never clean up temp files from the wrong directory if the well dir is absent
    if ! cd "${dir_proj}/${dir_well[$row]}"
    then
        echo "could not enter '${dir_proj}/${dir_well[$row]}'. skipping this well."
        continue
    fi

    # remove leftovers of an earlier, interrupted conversion.
    # (the former '[ $(ls ... | wc -l) > 0 ]' was a redirection, not a comparison:
    #  it was always true and created a stray file named '0' in the well directory)
    if compgen -G "allc.tsv.gz.temp.*" > /dev/null
    then
        echo "some intermediate .allc files found in directory; removing."
        rm -f allc.tsv.gz.temp.*
    fi

    allcools bam-to-allc -bam "${dir_proj}/${bam_in[$row]}" \
         --reference_fasta "$ref_fasta" --output_path "${dir_proj}/${allc_out[$row]}" \
         --convert_bam_strandness --cpu 1

done





# completion marker, then host and end time of this array task
printf "\n\n'A05a_bam2allc' completed.\n\n\n"



echo "Job ${JOB_ID}.${SGE_TASK_ID} ended on:   " $(hostname -s)
echo "Job ${JOB_ID}.${SGE_TASK_ID} ended on:   " $(date)


In [3]:
%%bash
cat > ../Scripts/A05b_check_allcs.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A05b_check_allcs.$JOB_ID
#$ -j y
#$ -l h_rt=2:00:00,h_data=4G
#$ -N A05b_allccheck
#$ -hold_jid A05a_bam2allc



# log the execution host and the start time of this job
echo "Job ${JOB_ID} started on:   " $(hostname -s)
echo "Job ${JOB_ID} started on:   " $(date)
echo " "





# environment init -------------------------------------------------------------

# export every non-comment entry of the parameter file into the environment
export $(grep -v '^#' snm3C_parameters.env | xargs)



# extract target filepaths -----------------------------------------------------

# query_metadat COLNAME
#   Print one column of the comma-separated well metadata file "$metadat_well",
#   selected by its header name. The header itself is printed first, so when the
#   output is read into an array, index 0 is the column name and index N is
#   data row N.
#   Returns non-zero (printing nothing on stdout) when COLNAME is not a header;
#   previously an unknown column silently yielded whole CSV lines.
query_metadat () {
  awk -F',' -v targetcol="$1" '
      NR == 1 {
          # locate the requested column in the header row
          for (i = 1; i <= NF; i++) {
              if ($i == targetcol) { assayout = i; break }
          }
          if (!assayout) {
              print "query_metadat: column \"" targetcol "\" not found" > "/dev/stderr"
              exit 1
          }
      }
      { print $assayout }
  ' "$metadat_well"
}

# check_filepaths_in_assay PATH...
#   Print "missing '<path>'" on stdout for every argument that does not exist
#   or is an empty file. Output stays on stdout because callers count its lines.
#   Arguments are taken verbatim (quoted), so a path containing whitespace or
#   glob characters is tested as a single path.
check_filepaths_in_assay() {
    local file
    for file in "$@"
    do
        if [[ ! -s "$file" ]]
        then
            echo "missing '$file'"
        fi
    done
}

# check_filepath_by_batch COLUMN_VALUES...
#   COLUMN_VALUES is a whole metadata column as returned by query_metadat
#   (header at position 0, data row N at position N).
#   For each batch 1..nbatches, counts how many of that batch's files are
#   missing/empty (via check_filepaths_in_assay) and prints
#   "<batch> \t <count>" for batches with at least one; finally lists the
#   affected batches. Reads globals: batchnum, nwells, nbatches.
#   Sets globals: target_array, target_well_rows, batch_file_list,
#   batches_to_rerun.
check_filepath_by_batch() {
target_array=("$@")
batches_to_rerun=()
for ((target_batch=1; target_batch<=nbatches; target_batch++))
    do
        target_well_rows=()
        batch_file_list=()
        for ((row=1; row<=nwells; row++))
        do
            if [[ "${batchnum[$row]}" == "${target_batch}" ]]
            then
                target_well_rows+=($row)
                # pick the file by row index, so a batch whose rows are not
                # contiguous in the metadata is still checked correctly
                batch_file_list+=("${target_array[$row]}")
            fi
        done

        num_files_missing=$(check_filepaths_in_assay "${batch_file_list[@]}" | wc -l)

        # numeric comparison ('>' inside [[ ]] compares strings)
        if (( num_files_missing > 0 ))
        then
            batches_to_rerun+=(${target_batch})
            echo -e "${target_batch} \t ${num_files_missing}"
        fi
    done

    if (( ${#batches_to_rerun[@]} > 0 ))
    then
        echo "batches to re-run:"
        echo "${batches_to_rerun[*]}"
    fi
}

batchnum=($(query_metadat "batchnum"))

# nwells counts the header entry too; nbatches is taken from the last row,
# i.e. this relies on the metadata being ordered by batchnum
nwells=${#batchnum[@]}
nbatches=${batchnum[-1]}



# apply checks for A05a output -------------------------------------------------

echo "-----------------------------------------------------------------"
echo "A. printing number of final .allc files missing (by batch)... "
echo "-----------------------------------------------------------------"

wellprefix=($(query_metadat "wellprefix"))
allcfile=($(query_metadat "A05a_allc"))
allctbi=($(query_metadat "A05a_allctbi"))

echo "checking .allc.tsv.gz"
echo -e "batchnum\tnum_missing"
check_filepath_by_batch "${allcfile[@]}"

echo "checking .allc.tsv.gz.tbi"
echo -e "batchnum\tnum_missing"
check_filepath_by_batch "${allctbi[@]}"


echo -e "\n\nsuggest re-running and checking sublog output of above batches."



echo -e "\n\n-----------------------------------------------------------------"
echo "B. checking each expected .allc file (from $metadat_well)"
echo -e "-----------------------------------------------------------------\n"

echo -e "\nchecking .allc.tsv.gz:\n"
check_filepaths_in_assay "${allcfile[@]}"

echo -e "\nchecking .allc.tsv.gz.tbi:\n"
check_filepaths_in_assay "${allctbi[@]}"


echo -e "\n* checks the A05a output columns of 'metadat_well' if the file exists and is non-empty."
echo "* if none missing, will only output target column names above."
echo "* if some declared 'missing' but all other checks OK, may just be no/few reads surviving trimming."
echo "  (check 'fastq_demultip/' and associated fastp logs e.g., fastq_trimmed/wellprefix.html report)"



echo -e "\n\n-----------------------------------------------------------------"
echo "C. checking log files for issues."
echo -e "-----------------------------------------------------------------\n"

echo "checking for 'ended on' in sublogs/A05a_bam2allc* output."
echo "if any filename is printed, the associated batch may have not completed allc gen."

# -L lists the files WITHOUT a match. (the former 'grep -c | awk' pipeline
# printed a bare count instead of a filename when only one sublog matched the
# glob, because grep omits the 'file:' prefix for a single input file)
grep -L 'ended on' sublogs/A05a_bam2allc*





echo -e "\n\n'A05b_allccheck' completed.\n\n"



echo "Job $JOB_ID ended on:   " $(hostname -s)
echo "Job $JOB_ID ended on:   " $(date)


In [4]:
%%bash
cat > ../Scripts/A05c_global_mC_stats.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A05c_global_mC_stats.$JOB_ID.$TASK_ID
#$ -j y
#$ -l h_rt=6:00:00,h_data=4G
#$ -N A05c_global_mC_stats
#$ -t 1-512
#$ -hold_jid_ad A05a_bam2allc



# log the execution host and the start time of this array task
echo "Job ${JOB_ID}.${SGE_TASK_ID} started on:   " $(hostname -s)
echo "Job ${JOB_ID}.${SGE_TASK_ID} started on:   " $(date)
echo " "





# environment init -------------------------------------------------------------
# (lines tagged "<--" are the ones flagged by the author for customization)

source /u/local/Modules/default/init/modules.sh # <--

# export every non-comment entry of the parameter file into the environment
export $(grep -v '^#' snm3C_parameters.env | xargs) # <--

# what to do when the output table already exists: overwrite, append, rename
# (for help with incomplete jobs)
action_metadata_exists="overwrite" # <--
# lambda phage spike-in (check if included in wet lab prep)
check_lambda=true # <--
# exclude mitochondrial, scaffolds, lambda
check_autosomal_only=false # <--

# per-task output table
# (should probably make these explicitly named vs batchnumbered in future)
metadat_out="Metadata/A04d_mCfrac_${SGE_TASK_ID}.tsv"



# extract target filepaths -----------------------------------------------------

# helper functions
# query_metadat COLNAME
#   Print one column of the comma-separated well metadata file "$metadat_well",
#   selected by its header name. The header itself is printed first, so when the
#   output is read into an array, index 0 is the column name and index N is
#   data row N.
#   Returns non-zero (printing nothing on stdout) when COLNAME is not a header;
#   previously an unknown column silently yielded whole CSV lines.
query_metadat () {
  awk -F',' -v targetcol="$1" '
      NR == 1 {
          # locate the requested column in the header row
          for (i = 1; i <= NF; i++) {
              if ($i == targetcol) { assayout = i; break }
          }
          if (!assayout) {
              print "query_metadat: column \"" targetcol "\" not found" > "/dev/stderr"
              exit 1
          }
      }
      { print $assayout }
  ' "$metadat_well"
}

# extract target wells, print values for log
batchnum=($(query_metadat "batchnum"))
nwells=${#batchnum[@]}

# collect the metadata row indices whose batchnum equals this array task's ID
# (index 0 of each queried array is the column header, hence the scan from 1)
target_well_rows=()
row=1
while (( row <= nwells ))
do
    [[ "${batchnum[$row]}" == "$SGE_TASK_ID" ]] && target_well_rows+=($row)
    (( row++ ))
done

# per-well identifiers and the A05a outputs to summarize
wellprefix=($(query_metadat "wellprefix"))
allcfile=($(query_metadat "A05a_allc"))
allctbi=($(query_metadat "A05a_allctbi"))

# for methylation fraction
# calc_mC_frac CONTEXT_REGEX [FILE]
#   Over the rows of FILE (default: "$tmpfile") whose 4th whitespace-separated
#   field matches CONTEXT_REGEX, sum field 5 (mC) and field 6 (tot) and print
#   "mC<TAB>tot<TAB>mC/tot"; prints "0<TAB>0<TAB>NA" when tot is 0.
#   Both arguments are quoted so the regex's brackets are never subject to
#   pathname expansion and a path with spaces stays a single argument.
calc_mC_frac () {
    awk -v context="$1" '
        $4 ~ context { mC += $5; tot += $6 }
        END {
            if (tot == 0) { print "0\t0\tNA" } else { print mC "\t" tot "\t" mC/tot }
        }' "${2:-${tmpfile}}"
}



# print target files -----------------------------------------------------------

echo "batch number: $SGE_TASK_ID"
echo "processing the following rows in well metadata file ($metadat_well):"
for row in "${target_well_rows[@]}"
    do
        echo -e "$row\t${wellprefix[$row]}"
    done
echo -e "\n\n"



# checking for existing metadata file ------------------------------------------

# columns are wellprefix, then for a given sequence context
# num reads supporting methylation, total coverage, then fraction methylated

header="wellprefix\tmLam\tLam\tmLamfrac\tmCCC\tCCC\tmCCCfrac\tmCG\tCG\tmCGfrac\tmCH\tCH\tmCHfrac\n"

if [[ ! -e ${metadat_out} ]]
then
    printf "${header}" > "${metadat_out}"
else
    echo "WARNING: ${metadat_out} seems to already exist."
    if [[ $action_metadata_exists == "overwrite" ]]
        then
        echo "overwriting the existing file. (action_metadata_exists=='overwrite')."
        printf "${header}" > "${metadat_out}"
    elif [[ $action_metadata_exists == "append" ]]
        then
            # keep the existing file (and its header); new rows are appended later
            echo "appending to the existing file. (action_metadata_exists=='append')"
    elif [[ $action_metadata_exists == "rename" ]]
        then
            # write to a timestamped sibling instead. (this branch used to
            # re-assign the very same filename, i.e. it overwrote the file)
            metadat_out=Metadata/A04d_mCfrac_${SGE_TASK_ID}_$(date +%Y%m%d-%H%M%S).tsv
            echo "renaming output file to ${metadat_out} to avoid overwriting (action_metadata_exists=='rename')."
            printf "${header}" > "${metadat_out}"
    else
        echo "exiting. (check 'action_metadata_exists' variable if to change action.)"
        exit 1
    fi
fi



# loop through allc, calculate methylation fracs -------------------------------
# assuming methylation given for CHN or CGN contexts
# (usually <1 minute per file)

# one output row per target well: lambda / CCC / CG / CH methylation summaries
for row in "${target_well_rows[@]}"
do

    # check for existing A05a output
    # (paths below are tested relative to dir_proj)
    cd "${dir_proj}"

    # skip the well if EITHER the .allc or its .tbi is missing/empty.
    # (the former '&&' skipped only when both were missing, so a well lacking
    #  just one of the two files was still processed)
    if [[ ! -s ${allcfile[$row]} || ! -s ${allctbi[$row]} ]]
    then
        echo -e "WARNING: final .allc files for '${wellprefix[$row]}' missing. skipping this well.'"
        continue
    fi

    echo "processing '${wellprefix[$row]}'..."

    # create temporary unzipped .allc file to awk through
    # (if there are "tmp_*" files in '${dir_proj}/Metadata', this script may have failed/timeout)
    tmpfile="Metadata/tmp_${wellprefix[$row]}"
    if [[ "$check_lambda" == "true" ]]
    then
        # use a partially methylated lambda, methylated at all but CAG, CTG
        gunzip -c "${allcfile[$row]}" | awk '$1 == "chrL" && $4 != "CAG" && $4 != "CTG"' \
                > "${tmpfile}_lambda"
        mlambda=$(calc_mC_frac "^[ACTG][ACTG][ACTG]" "${tmpfile}_lambda" )
        rm "${tmpfile}_lambda"
    else
        # placeholder columns; the \t are expanded by 'echo -e' below
        mlambda="0\t0\tNA"
    fi

    if [[ "$check_autosomal_only" == "true" ]]
    then
        # warning: may not capture scaffolds named e.g., chr6_SCAFF
        gunzip -c "${allcfile[$row]}" | awk '$1 ~ "^chr[0-9]"' > "${tmpfile}"
    else
        gunzip -c "${allcfile[$row]}" | awk '$1 != "chrL"' > "${tmpfile}"
    fi

    # calculate CCC, CGN, CHN (calc_mC_frac reads "$tmpfile" by default)
    mccc=$( calc_mC_frac "^CCC" )
    mcg=$( calc_mC_frac "^CG[ACTG]" )
    mch=$( calc_mC_frac "^C[ACT][ACTG]" )

    # record metrics, remove tmp file
    echo -e "${wellprefix[$row]}\t${mlambda}\t${mccc}\t${mcg}\t${mch}" \
        >> "${metadat_out}"
    rm "${tmpfile}"

done





# completion marker, then host and end time of this array task
printf "\n\n'A05c_global_mC_stats' completed.\n\n\n"



echo "Job ${JOB_ID}.${SGE_TASK_ID} ended on:   " $(hostname -s)
echo "Job ${JOB_ID}.${SGE_TASK_ID} ended on:   " $(date)


In [5]:
%%bash
cat > ../Scripts/A05d_allc_to_mcds.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A05d_allc2mcds.$JOB_ID.$TASK_ID
#$ -j y
#$ -l h_rt=24:00:00,h_data=8G,highp
#$ -N A05d_allc2mcds
#$ -t 1-32
#$ -pe shared 8
#$ -hold_jid A05a_bam2allc



# log the execution host and the start time of this array task
echo "Job ${JOB_ID}.${SGE_TASK_ID} started on:    " $(hostname -s)
echo "Job ${JOB_ID}.${SGE_TASK_ID} started on:    " $(date)
echo " "





# environment init -------------------------------------------------------------
# (lines tagged "<--" are the ones flagged by the author for customization)

source /u/local/Modules/default/init/modules.sh # <--
module load anaconda3 # <--
conda activate snm3Cseq_taurus # <--

# export every non-comment entry of the parameter file into the environment
export $(grep -v '^#' snm3C_parameters.env | xargs) # <--



# extract target filepaths -----------------------------------------------------

# helper functions
# query_metadat COLNAME
#   Print one column of the comma-separated well metadata file "$metadat_well",
#   selected by its header name. The header itself is printed first, so when the
#   output is read into an array, index 0 is the column name and index N is
#   data row N.
#   Returns non-zero (printing nothing on stdout) when COLNAME is not a header;
#   previously an unknown column silently yielded whole CSV lines.
query_metadat () {
  awk -F',' -v targetcol="$1" '
      NR == 1 {
          # locate the requested column in the header row
          for (i = 1; i <= NF; i++) {
              if ($i == targetcol) { assayout = i; break }
          }
          if (!assayout) {
              print "query_metadat: column \"" targetcol "\" not found" > "/dev/stderr"
              exit 1
          }
      }
      { print $assayout }
  ' "${metadat_well}"
}

# extract target wells, print values for log
platenum=($(query_metadat "platenum"))
nwells=${#platenum[@]}

# collect the metadata row indices whose platenum equals this array task's ID
# (index 0 of each queried array is the column header, hence the scan from 1)
target_well_rows=()
row=1
while (( row <= nwells ))
do
    [[ "${platenum[$row]}" == "${SGE_TASK_ID}" ]] && target_well_rows+=($row)
    (( row++ ))
done



# filepaths associated with target rows in well-level metadata -----------------
# (generally not customizable because output names set by bismark)

wellprefix=($(query_metadat "wellprefix"))
dir_well=($(query_metadat "A04a_dir_bismark"))

bam_in=($(query_metadat "A04a_bam_mergedsort"))

# A05a outputs that get aggregated into the mcds
allc_out=($(query_metadat "A05a_allc"))
allctbi_out=($(query_metadat "A05a_allctbi"))



# print target files -----------------------------------------------------------

echo "batch number: $SGE_TASK_ID"
echo "processing the following rows in well metadata file ($metadat_well):"

for row in "${target_well_rows[@]}"; do
    echo -e "${row}\t${wellprefix[$row]}"
done



# make .tsv of allcs -----------------------------------------------------------
# will appear as Metadata/A05d_allclist_*

# start from a clean table: rows are appended below
tsv_target_allcs=Metadata/A05d_allclist_${SGE_TASK_ID}.tsv
rm -f "${tsv_target_allcs}"

# list only wells whose .allc and .tbi both exist and are non-empty
# (two tab-separated columns: wellprefix, path to .allc)
for row in "${target_well_rows[@]}"
do
    if [[ -s ${allc_out[$row]} && -s ${allctbi_out[$row]} ]]
    then
        echo -e "${wellprefix[$row]}\t${allc_out[$row]}"  >> "${tsv_target_allcs}"
    fi
done
echo -e "\n\n"



# run mcds generation ----------------------------------------------------------

# -p: no error when a sibling array task creates the directory at the same time
mkdir -p mcds

# .allc files --> aggregated into mcds regions
# note: the mcds is not a single file, but more like a directory with binary compression
# if this step fails may leave a tmp file in the mcds/ folder
if [[ -s ${tsv_target_allcs} ]]
then
    allcools generate-dataset  \
        --allc_table "${tsv_target_allcs}" \
        --output_path "mcds/${SGE_TASK_ID}.mcds" \
        --chrom_size_path "${ref_chromsizes}" \
        --obs_dim cell \
        --cpu 8 \
        --chunk_size 400 \
        --regions chrom100k 100000 \
        --regions genebody "${ref_genebody}" \
        --quantifiers chrom100k count CGN,CHN \
        --quantifiers genebody count CGN,CHN
else
    # e.g. an array task ID with no matching plate, or no finished A05a output
    echo "WARNING: no complete .allc files listed for task ${SGE_TASK_ID}; skipping mcds generation."
fi

# not standard, but consider adding:
#    --regions chrom5k 5000 \
#    --regions geneslop2k $ref_geneslop2k \
# (genebody +/- 2kb to include promoter region + increase cov)  



# completion marker, then host and end time of this array task
printf "\n\n'A05d_allc2mcds' completed.\n\n\n"





echo "Job ${JOB_ID}.${SGE_TASK_ID} ended on:    " $(hostname -s)
echo "Job ${JOB_ID}.${SGE_TASK_ID} ended on:    " $(date)
