In [None]:
# ## overall commands
# ## might be cleaner to wait for STAR mapping to finish 100% (run A05a only --> check output with A05b)
# ## in lieu of submitting all of the below, but technically could run all cmds at once
    
# # * = job array based on "platenum"
# # † = job array based on "batchnum" (two rows at a time)

# qsub Scripts/A05a_star_mapping.sub # † 
# qsub Scripts/A05b_check_star.sub 
# qsub Scripts/A05c_star_filtering.sub # † 
# qsub Scripts/A05d_featurecounts.sub   # *
# qsub Scripts/A05e_star_bam_stats.sub # † 


In [None]:
%%bash
cat > ../Scripts/A05a_star_mapping.sub
#!/bin/bash
#$ -cwd
#$ -o sublogs/A05a_star.$JOB_ID.$TASK_ID
#$ -j y
#$ -l h_rt=8:00:00,h_data=16G,exclusive
#$ -pe shared 4
#$ -N A05a_star
#$ -t 1-256
#$ -hold_jid_ad A03a_trim




echo "Job $JOB_ID.$SGE_TASK_ID started on:   " $(hostname -s)
echo "Job $JOB_ID.$SGE_TASK_ID started on:   " $(date)
echo " "



# environment init ------------------—------------------—-----------------------

. /u/local/Modules/default/init/modules.sh # <--
module load anaconda3 # <--
conda activate snmCTseq # <--

# export every non-comment line of the parameter file as an environment variable
# (relies on word-splitting of the xargs output, so values must not contain whitespace)
export $(grep -v '^#' snmCT_parameters.env | xargs) # <--

skip_complete=true # <-- for help with incomplete jobs
overwrite_partial=true # <-- for help with incomplete jobs

# (most of the) STAR settings
# originally adapted from ENCODE guidelines
# NOTE: every option has to start with two ASCII hyphens ("--"). A typographic
# en dash pasted in from a document is not recognised by STAR as an option prefix.
# This string is expanded unquoted later on so that it word-splits into arguments.
star_params="--runThreadN 4 --genomeDir ${ref_starfolder} --genomeLoad LoadAndKeep \
--alignEndsType EndToEnd --outSAMtype BAM Unsorted \
--outSAMattributes NH HI AS NM MD --outSAMstrandField intronMotif \
--sjdbOverhang 149 --outFilterType BySJout --outFilterMultimapNmax 20 --alignSJoverhangMin 8 \
--alignSJDBoverhangMin 1 --outFilterMismatchNmax 999 --outFilterMismatchNoverLmax 0.04 --alignIntronMin 20 \
--alignIntronMax 1000000 --alignMatesGapMax 1000000 --readFilesCommand zcat "



# extract target filepaths ------------------—------------------—---------------

# helper functions
# query_metadat COLNAME
# Print one column of the well-level metadata .csv ($metadat_well), one value per line.
# The header line is printed first, so when the output is captured into a bash array,
# index 0 holds the column name itself.
# Arguments: $1 - exact name of the column in the header line
# Globals:   metadat_well (read) - path to the comma-separated metadata file
# Returns:   non-zero, with a message on stderr and no output, if the column is absent
#            (previously an unknown column silently printed every full line instead)
query_metadat () {
  awk -F',' -v targetcol="$1" \
      'NR==1 {
                for (i=1;i<=NF;i++) {
                    if ($i==targetcol) {assayout=i; break} }
                if (!assayout) {
                    print "query_metadat: column \"" targetcol "\" not found in metadata header" > "/dev/stderr"
                    exit 1
                }
              }
              {
                print $assayout
              }' "$metadat_well"
}

# extract target wells, print values for log
# (query_metadat prints the column header first, so index 0 of each array below
#  holds the column name and the wells occupy indices 1 .. nwells-1)
batchnum=($(query_metadat "batchnum"))
nwells=${#batchnum[@]} # array length, header entry included

# collect the metadata rows whose batchnum equals this array task's ID
# (bound is row<nwells: index nwells is past the end of the array, and would
#  compare equal to an empty SGE_TASK_ID)
target_well_rows=()
for ((row=1; row<nwells; row++))
do
    if [[ "${batchnum[$row]}" == "$SGE_TASK_ID" ]]
    then
        target_well_rows+=($row)
    fi
done

# filepaths associated with target rows in well-level metadata -----------------

wellprefix=($(query_metadat "wellprefix"))
dir_well=($(query_metadat "A05a_dir_star"))

# .fastqs for input to PE mapping (properly paired read pairs)
fastq_r1p=($(query_metadat "A03a_fqgz_paired_R1"))
fastq_r2p=($(query_metadat "A03a_fqgz_paired_R2"))

# .fastqs for input to SE mapping, including singletons from trimming & unaligned in PE-mapping
fastq_r1singletrim=($(query_metadat "A03a_fqgz_singletrim_R1"))
fastq_r2singletrim=($(query_metadat "A03a_fqgz_singletrim_R2"))

# temporary/intermediate mapping files -----------------------------------------------
# (expected output generated by STAR, given outname prefixes PE., SE1., SE2.)

fastq_pe_unmap1=PE.Unmapped.out.mate1
fastq_pe_unmap2=PE.Unmapped.out.mate2

bam_pe=PE.Aligned.out.bam
bam_se1=SE1.Aligned.out.bam
bam_se2=SE2.Aligned.out.bam



# run STAR mapping ------------------—------------------—-----------------------

cd "$projdir/mapping_star"

# load genome index [5~10 min]
# creates some apparent .log, .sam out despite just loading genome
# so putting in mapping_star to keep these files in one place

STAR --runThreadN 4 --genomeDir "$ref_starfolder" --genomeLoad LoadAndExit

# map each well
for row in "${target_well_rows[@]}"
do

    # check directory/prior mapping ------------------—------------------—----

    cd "$projdir"

    # check for existing mapping output
    # if final outputs exist and skip_complete=true, skip; else run mapping .bam
    # (the spaces around == matter: "$x"=="true" without them is a single
    #  non-empty word, which [[ ]] always treats as true)
    if [[ -s "${dir_well[$row]}/$bam_pe" \
        && -s "${dir_well[$row]}/$bam_se1" \
        && -s "${dir_well[$row]}/$bam_se2" \
        && "$skip_complete" == "true" ]]
    then
        echo -e "final aligned .bams for '${wellprefix[$row]}' already exist. skipping this well.'"
        continue
    fi

    echo -e "\n\napplying STAR to '${wellprefix[$row]}'...\n\n"

    # remove old directory if one exists to deal with incomplete files
    # albeit the only major issues are .bai and .tbi indices
    # (these often are not overwritten by software in the pipeline,
    # resulting in "index is older than file" errors later on)
    if [[ -e "${dir_well[$row]}" && "$overwrite_partial" == "true" ]]
    then
        echo -e "\n\nWARNING: folder for '${wellprefix[$row]}' exists, but not its final .bam alignments."
        echo "because overwrite_partial=true, deleting the directory and re-mapping."
        rm -rf -- "${dir_well[$row]:?}"
    fi

    # -p: do not fail when the folder is kept (overwrite_partial=false)
    mkdir -p "${dir_well[$row]}"
    if ! cd "${dir_well[$row]}"
    then
        # never map (or clean up with rm) in whatever directory we happen to be in
        echo "WARNING: could not enter '${dir_well[$row]}'; skipping '${wellprefix[$row]}'."
        continue
    fi


    # run alignments ------------------—------------------—-------------------
    # in: .fastqs from trimming: four .fastqs,
    #     properly paired ($fastq_r2p, $fastq_r1p) and trimming singletons ($fastq_r1singletrim, $fastq_r2singletrim)
    # out: - paired-end, single-end .bam alignments out ($bam_pe, $bam_se1, $bam_se2)
    #      - key log files (e.g., mapping rate)
    # ---------------------------------------—------------------—-------------
    # $star_params is intentionally unquoted so that it splits into separate options

    # (i) paired-end mapping [<1-3 min]
    # assumptions: pairs that map ambiguously in paired-end mode should be discarded
    STAR $star_params \
        --outFileNamePrefix PE. \
        --readFilesIn "$projdir/${fastq_r1p[$row]}" "$projdir/${fastq_r2p[$row]}" \
        --outReadsUnmapped Fastx

    # .fq --> .fq.gz for future storage/help match STAR's expected input type [<1-2 min]
    # (-f: overwrite a stale .gz left behind by an earlier partial run)
    bgzip -f "$fastq_pe_unmap1"
    bgzip -f "$fastq_pe_unmap2"

    # (ii.) single-end, R1 [<1-3 min]
    # includes Read 1 singletons from trimming and STAR mapping in (i)
    STAR $star_params \
        --outFileNamePrefix SE1. \
        --readFilesIn "$projdir/${fastq_r1singletrim[$row]},$fastq_pe_unmap1.gz"

    # (iii.) single-end, R2 [<1-3 min]
    # includes Read 2 singletons from trimming and STAR mapping in (i)
    STAR $star_params \
        --outFileNamePrefix SE2. \
        --readFilesIn "$projdir/${fastq_r2singletrim[$row]},$fastq_pe_unmap2.gz"

    # (iv.) optional clean-up (comment out as desired)
    # *.Log.final.out contains mapping rate, whereas other *.out have STAR internals
    # *.SJ.out.tab contain splice junctions
    rm -rf *_STARtmp
    rm *Log.progress.out
    rm *Log.out

done


# unload at end
# (uses the same genome folder that was loaded above; run from mapping_star so that
#  any files STAR writes for this call do not end up in the last well's folder)
cd "$projdir/mapping_star"
STAR --genomeDir "$ref_starfolder" --genomeLoad Remove




echo -e "\n\n'A05a_star_mapping' completed.\n\n"


echo "Job $JOB_ID.$SGE_TASK_ID ended on:   " $(hostname -s)
echo "Job $JOB_ID.$SGE_TASK_ID ended on:   " $(date)
echo " "

In [None]:
%%bash
cat > ../Scripts/A05b_check_star.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A05b_check_star.$JOB_ID
#$ -j y
#$ -l h_rt=2:00:00,h_data=4G
#$ -N A05b_starcheck
#$ -hold_jid A05a_star

# NOTE: the scheduler directives above must start with "#$" to be honoured by qsub;
# with an extra leading "# " they are plain comments (no -cwd, no log path, no hold
# on A05a). They stay harmless comments when the script is run directly with bash.

echo "Job $JOB_ID started on:   " $(hostname -s)
echo "Job $JOB_ID started on:   " $(date)
echo " "





# environment init ------------------—------------------—-----------------------

# export every non-comment line of the parameter file (read relative to the
# working directory, hence the -cwd directive above)
export $(grep -v '^#' snmCT_parameters.env | xargs) # <--



# extract target filepaths ------------------—------------------—---------------

cd "$projdir"

# query_metadat COLNAME
# Print one column of the well-level metadata .csv ($metadat_well), one value per line.
# The header line is printed first, so when the output is captured into a bash array,
# index 0 holds the column name itself.
# Arguments: $1 - exact name of the column in the header line
# Globals:   metadat_well (read) - path to the comma-separated metadata file
# Returns:   non-zero, with a message on stderr and no output, if the column is absent
#            (previously an unknown column silently printed every full line instead)
query_metadat () {
  awk -F',' -v targetcol="$1" \
      'NR==1 {
                for (i=1;i<=NF;i++) {
                    if ($i==targetcol) {assayout=i; break} }
                if (!assayout) {
                    print "query_metadat: column \"" targetcol "\" not found in metadata header" > "/dev/stderr"
                    exit 1
                }
              }
              {
                print $assayout
              }' "$metadat_well"
}

# check_filepaths_in_assay [FILE ...]
# Print "missing '<path>'" for every argument that does not exist or is an empty file.
# Prints nothing when all files are present (or when called without arguments).
# Arguments are taken as-is ("$@"), so paths containing spaces are checked as one path.
check_filepaths_in_assay() {
    local file
    for file in "$@"
        do
        if [[ ! -s "$file" ]]
            then
                echo "missing '$file'"
            fi
        done
}

# check_filepath_in_batch HEADER PATH_ROW1 PATH_ROW2 ...
# For every batch 1..$nbatches, count how many of that batch's files are missing or
# empty, and print "<batch> \t <count>" for the batches with at least one such file.
# Arguments: one metadata column exactly as returned by query_metadat, i.e. the
#            column name followed by one path per well (so argument k+1 = well row k)
# Globals:   batchnum (read) - batch number per well row, same indexing as the arguments
#            nwells   (read) - length of batchnum, header entry included
#            nbatches (read) - highest batch number to check
# Uses:      check_filepaths_in_assay
# The paths are taken from the arguments passed in (earlier this function read an
# unrelated, undefined array and therefore never reported anything as missing).
check_filepath_in_batch() {
    local -a target_array=("$@")
    local -a batch_file_list
    local target_batch row num_missing

    for ((target_batch=1; target_batch<=nbatches; target_batch++))
    do
        # gather the paths of all wells assigned to this batch
        batch_file_list=()
        for ((row=1; row<nwells; row++))
        do
            if [[ "${batchnum[$row]}" == "$target_batch" ]]
            then
                batch_file_list+=("${target_array[$row]}")
            fi
        done

        num_missing=$(check_filepaths_in_assay "${batch_file_list[@]}" | wc -l)
        echo -e "${target_batch} \t $num_missing"

    done \
    | awk -F "\t" '($2 > 0) { print $0 }'
}

# batch number of every well; index 0 is the column header (see query_metadat)
batchnum=($(query_metadat "batchnum"))

nwells=${#batchnum[@]}
# taken from the last metadata row, i.e. assumes the rows are ordered by batch
nbatches=${batchnum[-1]}



# apply checks for A05a output -------—------------------—----------------------

echo "-----------------------------------------------------------------"
echo "A. printing number of missing .bam files missing (by batch)... "
echo "-----------------------------------------------------------------"

bam_star_pe=($(query_metadat "A05a_bam_star_PE"))
bam_star_se1=($(query_metadat "A05a_bam_star_SE1"))
bam_star_se2=($(query_metadat "A05a_bam_star_SE2"))

echo "checking PE.Aligned.out.bam"
echo -e "batchnum\tnum_missing"
check_filepath_in_batch "${bam_star_pe[@]}"

echo "checking SE1.Aligned.out.bam"
echo -e "batchnum\tnum_missing"
check_filepath_in_batch "${bam_star_se1[@]}"

echo "checking SE2.Aligned.out.bam"
echo -e "batchnum\tnum_missing"
check_filepath_in_batch "${bam_star_se2[@]}"

echo -e "\n\nsuggest re-running and checking sublog output of above batches."


echo -e "\n\n-----------------------------------------------------------------"
echo "B. checking log files for issues."
echo -e "-----------------------------------------------------------------\n"


echo "checking if 'completed' in sublogs/A05a_star* output."
echo "if any filename is printed, the associated batch may have not completed mapping."

# A05a logs the line "'A05a_star_mapping' completed." (script name in single quotes);
# the optional quote in the pattern matches that line as well as an unquoted variant.
# grep -c prints "file:count" per sublog; awk keeps the files with a count of 0.
grep -cE "A05a_star_mapping'? completed" sublogs/A05a_star* | awk -F ":" '$2==0 {print $1}'




echo -e "\n\n'A05b_starcheck' completed.\n\n"


echo "Job $JOB_ID ended on:   " $(hostname -s)
echo "Job $JOB_ID ended on:   " $(date)
echo " "

In [None]:
%%bash
cat > ../Scripts/A05c_classify_mCT_reads_STAR.pl

#!/usr/bin/perl -w
use strict;


# A05c_classify_mCT_reads_STAR.pl, v0.2 =====================================================
# based loosely on original perl script written by Dr. Chongyuan Luo (@luogenomics)
# modifications by Choo Liu (@chooliu):
# - readability/documentation
# - changes in logic for paired-end mapping (fwd/rev strand) & more efficient parsing of MD:Z flag
#   incl looking at only C/G positions in read vs. ref changes, calculating # cytosines, etc
# inputs: - .sam file from STAR (compatible with single-end or paired-end alignments)
# outputs: - "_annotations" .tsv recording each alignment's:
#             number of cytosines, mC/C fraction, and call (DNA, RNA, ambiguous)
#             this annotation file is subsequently used to filter the .sam/.bam, keeping only RNA reads
# typical usage: perl A05c_classify_mCT_reads_STAR.pl alignments.sam
# =====================================================================================

# reading in .sam file
my ($samfile)=($ARGV[0]);
my @samline;
my $read; my @read; my @ref; my $ref; my @md; my $dir;

# determining read classification
my $filter_num_CH=3;        # minimum number of CH-context cytosines required for a DNA/RNA call
my $filter_frac_mCHmin=0.5; # DNA (mCH/CH <0.5)
my $filter_frac_mCHmax=0.9; # RNA (mCH/CH >=0.9)
my $mch_fraction; my $totalch; my $totcpg; my @call;




# load .sam file, loop through lines in .sam
# exports info on each read to "(input .sam file name)_annotations"

# for each line in .sam file, --------------------------------------------------------
# count unmethylated cytosines in CH context by comparing the read sequence with the
# reference sequence reconstructed from the MD:Z tag

# notes:
# - header rows (starting with @) are not classified; an empty line is written for each
#   so that line N of the annotation file always describes line N of the .sam
# - assumes the read sequence is in column 10 ($samline[9]) and the MD:Z tag is in
#   column 16 ($samline[15])
# - currently ignores indels
# - default settings: keep reads with >=3 CHNs and mCHN/CHN fraction >=0.9 [*]
#   at present, we only retain call="RNA" and discard all others

open(my $sam_in, '<', $samfile) or die "cannot open '$samfile' for reading: $!";
open(my $sam_annotations, '>', "${samfile}_annotations")
    or die "cannot open '${samfile}_annotations' for writing: $!";
while (my $line = <$sam_in>)
{
  chomp $line; # strips the trailing newline only (chop would remove any last character)
  if (substr($line,0,1) eq '@') { print $sam_annotations "\n"; }
  else
  {

    @samline = split(/\t/,$line);
    $read = $samline[9];
    @read = split(//,$read);
    @ref = @read; # start from the read; mismatching positions are patched in below

    @md=split( /(\d+)/ , $samline[15]);

    # check mapping orientation relative to reference genome ----------------------------
    # dir=1 if fwd (99, 163, 0, 73), =0 if reverse (147, 83, 16, 89)
    # flaw in script is that these numbers manually specified--if see strange results,
    # check mapping output for other flags before filtering / add interpreter
    # (any flag value not listed as fwd is treated as reverse)
    $dir = 0 + ( ($samline[1] eq 99) or ($samline[1] eq 163 ) or ($samline[1] eq 0) or ($samline[1] eq 73) );

    # compare read to reference genome --------------------------------------------------
    # split MD:Z: flag by numbers then examine resulting length
    # e.g., MD:Z:10A0B121 --> MD:Z: 10 A 0 B 121 has $num_md_features=5
    # (odd indices of @md hold match-run lengths, even indices hold the ref bases)

    # assume fully methylated if perfect match to genomic ref
    # (if MD stores a single number indicating no changes, $#md = 1)
    my $unmch = 0;
    my $num_md_features=$#md;

    # otherwise, attempt to reconstruct reference sequence,
    # looping through positions in read where the read != ref nucleotide
    if ($num_md_features != 1) {
        my $pos=0;

        for (my $i=1; $i<=$num_md_features; $i += 2) {

            # advance over the run of matching bases
            $pos = $pos + $md[$i];

            # modify @ref sequence if differences;
            # do nothing after the last run, or if entry starts with "^" (deletion)
            if ( ($i+1 <= $num_md_features) and (substr($md[$i+1], 0, 1) ne "^") ) {
                $ref[$pos] = $md[$i + 1];
            }

            # increment base by 1
            $pos += 1;
        }

    # tabulate # unmethylated cytosines ---------------------------------------------------
    # again looping through each position where read != ref
    # (although same loop as above, re-loop because potential cases where adjacent bases differ btwn ref & read)
        $pos=0;
        for (my $i=1; $i<=$num_md_features; $i += 2) {

           $pos = $pos + $md[$i];

           # the final run of matches is not followed by a mismatch, so there is
           # nothing to inspect there (and $pos may lie beyond the end of the read)
           if ($i+1 <= $num_md_features) {

               # if read maps to forward strand of genome
               # check if at a CH-site, and unmethylated cytosine converted to "T"
               if ( ($dir eq 1) and
                    ($ref[$pos] eq "C") and ($ref[$pos + 1] ne "G") and ($read[$pos] eq "T") ) {
                        $unmch += 1;
                    }

               # if read maps to reverse strand of genome
               # STAR rev compliments the $read sequence, so cytosines are represented by "G"
               # check if at a CH-site, and unmethylated cytosine converted to "A"
               # (note: mCH underestimated in niche case where dir=0 &
               #        first base is cytosine, as $ref[$pos - 1] is undefined)
               if ( ($dir eq 0) and
                    ($ref[$pos] eq "G") and ($ref[$pos - 1] ne "C") and ($read[$pos] eq "A") ) {
                    $unmch += 1;
                }
           }

            $pos += 1;
        }
    }

    $ref = join('', @ref);

    # count cytosines in CH-context --------------------------------------------------
    # note: since done via regex, faster to count [# C] and subtract [# CG] vs searching wild flag C[ACT]
    if ($dir eq 1) {
      my @all_c = $ref =~ /C/g;
      $totalch = scalar @all_c;
      my @all_cpg = $ref =~ /CG/g;
      $totcpg = scalar @all_cpg;
    }
    if ($dir eq 0) {
      my @all_g = $ref =~ /G/g;
      $totalch = scalar @all_g;
      my @all_cpg = $ref =~ /CG/g;
      $totcpg = scalar @all_cpg;
    }

    my $num_ch = $totalch - $totcpg;

    # classify each read into modalities ----------------------------------------------
    if ($num_ch==0) { # avoid div by zero error
          $mch_fraction=-999;
          @call="amb";
    } else {
          $mch_fraction = 1 - ($unmch/$num_ch);
          if (($num_ch>=$filter_num_CH) and ($mch_fraction < $filter_frac_mCHmin)) { @call="DNA"; }
          elsif (($num_ch>=$filter_num_CH) and ($mch_fraction >= $filter_frac_mCHmax)) { @call="RNA"; }
          else { @call="amb"; } # exclude (low # CH or ambiguous mCH between 0.5-0.9)
    }

    # columns: number of CH cytosines, mCH/CH fraction, call
    print $sam_annotations "${num_ch}\t${mch_fraction}\t@call\n";

}
}

close $sam_in;
close $sam_annotations or die "error while closing '${samfile}_annotations': $!";

In [None]:
%%bash
cat > ../Scripts/A05c_star_filtering.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A05c_starfilt.$JOB_ID.$TASK_ID
#$ -j y
#$ -l h_rt=8:00:00,h_data=8G
#$ -N A05c_starfilt
#$ -t 1-256
#$ -hold_jid_ad A05a_star



echo "Job $JOB_ID.$SGE_TASK_ID started on:   " $(hostname -s)
echo "Job $JOB_ID.$SGE_TASK_ID started on:   " $(date)
echo " "




# environment init ------------------—------------------—-----------------------
# set up the module system, then switch to the pipeline's conda environment

source /u/local/Modules/default/init/modules.sh # <--
module load anaconda3 # <--
conda activate snmCTseq # <--

# export each non-comment line of the parameter file as an environment variable
export $(grep -v '^#' snmCT_parameters.env | xargs) # <--

# when true, wells whose final outputs already exist are not re-processed
skip_complete=true # <-- for help with incomplete jobs



# extract target filepaths ------------------—------------------—---------------

# helper functions
# query_metadat COLNAME
# Print one column of the well-level metadata .csv ($metadat_well), one value per line.
# The header line is printed first, so when the output is captured into a bash array,
# index 0 holds the column name itself.
# Arguments: $1 - exact name of the column in the header line
# Globals:   metadat_well (read) - path to the comma-separated metadata file
# Returns:   non-zero, with a message on stderr and no output, if the column is absent
#            (previously an unknown column silently printed every full line instead)
query_metadat () {
  awk -F',' -v targetcol="$1" \
      'NR==1 {
                for (i=1;i<=NF;i++) {
                    if ($i==targetcol) {assayout=i; break} }
                if (!assayout) {
                    print "query_metadat: column \"" targetcol "\" not found in metadata header" > "/dev/stderr"
                    exit 1
                }
              }
              {
                print $assayout
              }' "$metadat_well"
}

# extract target wells, print values for log
# (query_metadat prints the column header first, so index 0 of each array below
#  holds the column name and the wells occupy indices 1 .. nwells-1)

batchnum=($(query_metadat "batchnum"))
nwells=${#batchnum[@]} # array length, header entry included

# collect the metadata rows whose batchnum equals this array task's ID
# (bound is row<nwells: index nwells is past the end of the array, and would
#  compare equal to an empty SGE_TASK_ID)
target_well_rows=()
for ((row=1; row<nwells; row++))
do
    if [[ "${batchnum[$row]}" == "$SGE_TASK_ID" ]]
    then
        target_well_rows+=($row)
    fi
done



# filepaths associated with target rows in well-level metadata -----------------

wellprefix=($(query_metadat "wellprefix"))
dir_well=($(query_metadat "A05a_dir_star"))



# set naming convention within each well folder -------------—------------------

# .bam files in (from A05a)
bam_in_pe=PE.Aligned.out.bam
bam_in_se1=SE1.Aligned.out.bam
bam_in_se2=SE2.Aligned.out.bam

# intermediate .sam (MAPQ-filtered alignments; input to the read classifier)
sam_q10_pe=q10_pe.sam
sam_q10_se1=q10_se1.sam
sam_q10_se2=q10_se2.sam

# final .bam
bam_final_pe=PE.Final.bam
bam_final_se1=SE1.Final.bam
bam_final_se2=SE2.Final.bam

# filter_rna_alignments BAM_IN SAM_TMP BAM_OUT [extra "samtools view" options ...]
# Keep only the alignments that the classifier script calls "RNA".
#   1. write alignments with MAPQ >= 10 (plus any extra view options) to SAM_TMP, with header
#   2. run A05c_classify_mCT_reads_STAR.pl, which writes SAM_TMP_annotations
#      (one line per .sam line; empty for header lines, call in column 3 otherwise)
#   3. keep the .sam lines whose annotation line is empty (headers) or has call "RNA",
#      convert to .bam, sort, and index
# Globals: projdir (read) - project folder containing Scripts/
filter_rna_alignments () {
    local bam_in=$1 sam_tmp=$2 bam_out=$3
    shift 3

    samtools view -h -q 10 "$@" "$bam_in" > "$sam_tmp"
    perl "$projdir/Scripts/A05c_classify_mCT_reads_STAR.pl" "$sam_tmp"
    awk 'NR == FNR { if ($0=="" || $3=="RNA") line[NR]; next } (FNR in line)' \
            "${sam_tmp}_annotations" "$sam_tmp" |
            samtools view -b - | samtools sort - > "$bam_out"
    samtools index "$bam_out"
}

# map each well
for row in "${target_well_rows[@]}"
do

    # check directory/prior mapping ------------------—------------------—-----
    # check for existing mapping output
    # if final outputs exist and skip_complete=true, skip; else run filtering of direct STAR align
    # (the spaces around == matter: "$x"=="true" without them is a single
    #  non-empty word, which [[ ]] always treats as true)

    cd "$projdir"

    if [[ -s "${dir_well[$row]}/$bam_final_pe" \
        && -s "${dir_well[$row]}/$bam_final_se1" \
        && -s "${dir_well[$row]}/$bam_final_se2" \
        && "$skip_complete" == "true" ]]
    then
        echo -e "final aligned .bams for '${wellprefix[$row]}' already exist. skipping this well.'"
        continue
    fi

    echo -e "\n\napplying STAR to '${wellprefix[$row]}'...\n\n"

    if ! cd "${dir_well[$row]}"
    then
        echo "WARNING: could not enter '${dir_well[$row]}'; skipping '${wellprefix[$row]}'."
        continue
    fi

    # proceed only if all input files exist (and say so when they do not)
    if [[ ! ( -s $bam_in_pe && -s $bam_in_se1 && -s $bam_in_se2 ) ]]
    then
        echo "WARNING: missing or empty STAR .bam input(s) for '${wellprefix[$row]}'; skipping this well."
        continue
    fi

    # clear previous final files (if not all of them exist)
    # -f: stay silent when a file is absent
    rm -f -- "$bam_final_pe" "$bam_final_se1" "$bam_final_se2"

    # run RNA filtering ------------------—------------------—-----------------
    # in: three .bams from STAR mapping: $bam_in_* (paired-end, single-end read 1, read 2)
    # out: - MAPQ- and RNA-call-filtered, sorted & indexed .bams ($bam_final_*)
    #      - per-read annotation tables (*_annotations)
    # ------------------—------------------—-----------------------------------

    # STAR will output singletons in .bam file
    # (e.g., read 2 maps but read 1 doesn't; samtools view -f 8 $bam_in_pe)
    # hence the "-f 0x0002" flag for "proper pairs" only, and SE alignments merged with...
    # in later sections, "-f 0x0048" (the read is R1; it mapped but R2 mate didn't map)
    #                and "-f 0x0088" (the read is R2; it mapped but R1 mate didn't map)

    # if both R1 & R2 of pair need to pass filtering criteria (AND instead of OR),
    #     needs an added annotations --> wide step like the below
    # sed '$!N;s/\n/ /' ${sam_q10_pe}_annotations \
    #     | awk '{print ( ($3 == "RNA") && ($6 == "RNA") )}' \
    #     | awk '{print $0}1' > ${sam_q10_pe}_annotations_bothpairs
    # then awk filter off of this 'bothpairs' file

    # each step usually ~1-2 min

    # (i) paired-end
    filter_rna_alignments "$bam_in_pe" "$sam_q10_pe" "$bam_final_pe" -f 0x0002

    # (ii) single-end, read 1
    filter_rna_alignments "$bam_in_se1" "$sam_q10_se1" "$bam_final_se1"

    # (iii) single-end, read 2
    filter_rna_alignments "$bam_in_se2" "$sam_q10_se2" "$bam_final_se2"

    # (iv.) optional clean-up (comment out as desired)
    rm "$sam_q10_pe" "$sam_q10_se1" "$sam_q10_se2"
    # rm *annotations

done



echo "completed 'A05c_star_filtering.'"



echo "Job $JOB_ID.$SGE_TASK_ID ended on:   " $(hostname -s)
echo "Job $JOB_ID.$SGE_TASK_ID ended on:   " $(date)
echo " "

In [None]:
%%bash
cat > ../Scripts/A05d_featurecounts.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A05d_featurecounts.$JOB_ID.$TASK_ID
#$ -j y
#$ -l h_rt=16:00:00,h_data=24G
#$ -N A05d_featurecounts
#$ -t 1-16
#$ -hold_jid A05c_starfilt




echo "Job $JOB_ID.$SGE_TASK_ID started on:   " $(hostname -s)
echo "Job $JOB_ID.$SGE_TASK_ID started on:   " $(date)
echo " "



# environment init ------------------—------------------—-----------------------

. /u/local/Modules/default/init/modules.sh # <--
module load anaconda3 # <--
conda activate snmCTseq # <--

# export every non-comment line of the parameter file as an environment variable
export $(grep -v '^#' snmCT_parameters.env | xargs) # <--

# output folders; the trailing "/" is needed because file names are appended directly
dir_out_gene=featurecounts_gene/ # <-- 
dir_out_exon=featurecounts_exon/ # <--



# extract target filepaths ------------------—------------------—---------------

# -p: every task of the job array runs these lines, so the folders usually exist already
mkdir -p "$dir_out_gene"
mkdir -p "$dir_out_exon"

# helper functions
# query_metadat COLNAME
# Print one column of the well-level metadata .csv ($metadat_well), one value per line.
# The header line is printed first, so when the output is captured into a bash array,
# index 0 holds the column name itself.
# Arguments: $1 - exact name of the column in the header line
# Globals:   metadat_well (read) - path to the comma-separated metadata file
# Returns:   non-zero, with a message on stderr and no output, if the column is absent
#            (previously an unknown column silently printed every full line instead)
# (a stray "f" token inside the awk program has been removed)
query_metadat () {
  awk -F',' -v targetcol="$1" \
      'NR==1 {
                for (i=1;i<=NF;i++) {
                    if ($i==targetcol) {assayout=i; break} }
                if (!assayout) {
                    print "query_metadat: column \"" targetcol "\" not found in metadata header" > "/dev/stderr"
                    exit 1
                }
              }
              {
                print $assayout
              }' "$metadat_well"
}

# extract target wells, print values for log
# (query_metadat prints the column header first, so index 0 of each array below
#  holds the column name and the wells occupy indices 1 .. nwells-1)

platenum=($(query_metadat "platenum"))
nwells=${#platenum[@]} # array length, header entry included

# collect the metadata rows whose platenum equals this array task's ID
# (bound is row<nwells: index nwells is past the end of the array, and would
#  compare equal to an empty SGE_TASK_ID)
target_well_rows=()
for ((row=1; row<nwells; row++))
do
    if [[ "${platenum[$row]}" == "$SGE_TASK_ID" ]]
    then
        target_well_rows+=($row)
    fi
done



# filepaths associated with target rows in well-level metadata -----------------

wellprefix=($(query_metadat "wellprefix"))
dir_well=($(query_metadat "A05a_dir_star"))

# filtered alignments written by A05c, one path per well
bam_pe=($(query_metadat "A05c_bam_starfilt_PE"))
bam_se1=($(query_metadat "A05c_bam_starfilt_SE1"))
bam_se2=($(query_metadat "A05c_bam_starfilt_SE2"))



# extract valid .bam filepaths -------------------------------------------------

# checks only if the .bam exists (otherwise featureCounts may terminate with error)
# could also wrap in check that number of alignments in file is > 0
# for greater future compatibility with featureCounts

# list_existing_bams ARRAY_NAME
# Print, one per line, the entries of the named path array that belong to this
# task's wells and that exist as non-empty files.
# Arguments: $1 - name (not contents) of an array indexed by metadata row
# Globals:   target_well_rows (read) - metadata row indices handled by this task
# The rows are looked up one by one, so they do not have to be contiguous.
list_existing_bams () {
    local array_name=$1
    local row element f
    for row in "${target_well_rows[@]}"
    do
        element="${array_name}[${row}]"
        f=${!element} # indirect expansion: value of e.g. bam_pe[12]
        if [[ -s "$f" ]]
        then
            echo "$f"
        fi
    done
}

# newline-separated lists; expanded unquoted later on so that they split into arguments
pe_files=$(list_existing_bams bam_pe)
se1_files=$(list_existing_bams bam_se1)
se2_files=$(list_existing_bams bam_se2)



# run_featurecounts FEATURE_TYPE OUT_FILE LAYOUT [BAM ...]
# Run featureCounts on the given .bam files.
# Arguments: $1 - GTF feature type to count (passed to -t), e.g. gene or exon
#            $2 - output file (passed to -o)
#            $3 - "paired" adds the -p option; any other value omits it
#            $4.. - input .bam files
# Globals:   ref_gtf (read) - annotation passed to -a
#            dir_scratch (read) - directory passed to --tmpDir
# When no input file is given, prints a warning and does not call featureCounts.
run_featurecounts () {
    local feature_type=$1 out_file=$2 layout=$3
    shift 3

    if (( $# == 0 ))
    then
        echo "WARNING: no non-empty .bam input for '${out_file}'; skipping featureCounts."
        return 0
    fi

    local -a paired_opt=()
    if [[ "$layout" == "paired" ]]
    then
        paired_opt=(-p)
    fi

    featureCounts "${paired_opt[@]}" -T 4 -t "$feature_type" -a "$ref_gtf" \
        -o "$out_file" --donotsort --tmpDir "$dir_scratch" "$@"
}

# $pe_files / $se1_files / $se2_files hold only the .bams that exist and are
# non-empty; they are used as computed and expanded unquoted on purpose so that
# each path becomes its own argument. (Earlier the single-end lists were
# re-assigned here without the existence check.)


# featurecounts on genes ------------------—------------------—-----------------
# usually <1 min/well --> ~6 hrs per plate

echo "running gene featureCounts on paired-end alignments (mapping_star/*/PE.Final.bam)."
run_featurecounts gene "${dir_out_gene}PE_${SGE_TASK_ID}" paired $pe_files

echo "running gene featureCounts on single-end R1 alignments (mapping_star/*/SE1.Final.bam)."
run_featurecounts gene "${dir_out_gene}SE1_${SGE_TASK_ID}" single $se1_files

echo "running gene featureCounts on single-end R2 alignments (mapping_star/*/SE2.Final.bam)."
run_featurecounts gene "${dir_out_gene}SE2_${SGE_TASK_ID}" single $se2_files



# featurecounts on exons ------------------—------------------—-----------------

echo "running exon featureCounts on paired-end alignments (mapping_star/*/PE.Final.bam)."
run_featurecounts exon "${dir_out_exon}PE_${SGE_TASK_ID}" paired $pe_files

echo "running exon featureCounts on single-end R1 alignments (mapping_star/*/SE1.Final.bam)."
run_featurecounts exon "${dir_out_exon}SE1_${SGE_TASK_ID}" single $se1_files

echo "running exon featureCounts on single-end R2 alignments (mapping_star/*/SE2.Final.bam)."
run_featurecounts exon "${dir_out_exon}SE2_${SGE_TASK_ID}" single $se2_files




echo -e "\n\n'A05d_featurecounts' completed.\n\n"



echo "Job $JOB_ID.$SGE_TASK_ID ended on:   " $(hostname -s)
echo "Job $JOB_ID.$SGE_TASK_ID ended on:   " $(date)
echo " "




In [None]:
%%bash
cat > ../Scripts/A05e_star_bam_stats.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A05g_samstat_star.$JOB_ID.$TASK_ID
#$ -j y
#$ -l h_rt=8:00:00,h_data=24G
#$ -N A05e_samstat_star
#$ -t 1-256
#$ -hold_jid_ad A05c_starfilt




echo "Job $JOB_ID.$SGE_TASK_ID started on:   " $(hostname -s)
echo "Job $JOB_ID.$SGE_TASK_ID started on:   " $(date)
echo " "



# environment init ------------------—------------------—-----------------------
# set up the module system, then switch to the pipeline's conda environment

source /u/local/Modules/default/init/modules.sh # <--
module load anaconda3 # <--
conda activate snmCTseq # <--

# export each non-comment line of the parameter file as an environment variable
export $(grep -v '^#' snmCT_parameters.env | xargs) # <--

# when true, wells whose summary files already exist are not re-processed
skip_complete=true # <-- for help with incomplete jobs




# extract target filepaths ------------------—------------------—---------------

# helper functions
# query_metadat COLNAME
# Print one column of the well-level metadata .csv ($metadat_well), one value per line.
# The header line is printed first, so when the output is captured into a bash array,
# index 0 holds the column name itself.
# Arguments: $1 - exact name of the column in the header line
# Globals:   metadat_well (read) - path to the comma-separated metadata file
# Returns:   non-zero, with a message on stderr and no output, if the column is absent
#            (previously an unknown column silently printed every full line instead)
query_metadat () {
  awk -F',' -v targetcol="$1" \
      'NR==1 {
                for (i=1;i<=NF;i++) {
                    if ($i==targetcol) {assayout=i; break} }
                if (!assayout) {
                    print "query_metadat: column \"" targetcol "\" not found in metadata header" > "/dev/stderr"
                    exit 1
                }
              }
              {
                print $assayout
              }' "$metadat_well"
}

# extract target wells, print values for log
# (query_metadat prints the column header first, so index 0 of each array below
#  holds the column name and the wells occupy indices 1 .. nwells-1)

batchnum=($(query_metadat "batchnum"))
nwells=${#batchnum[@]} # array length, header entry included

# collect the metadata rows whose batchnum equals this array task's ID
# (bound is row<nwells: index nwells is past the end of the array, and would
#  compare equal to an empty SGE_TASK_ID)
target_well_rows=()
for ((row=1; row<nwells; row++))
do
    if [[ "${batchnum[$row]}" == "$SGE_TASK_ID" ]]
    then
        target_well_rows+=($row)
    fi
done



# filepaths associated with target rows in well-level metadata -----------------

wellprefix=($(query_metadat "wellprefix"))
dir_well=($(query_metadat "A05a_dir_star"))



# samtools stats on each well in the batch --—------------------—---------------

for row in "${target_well_rows[@]}"
do

    cd "$projdir"

    # skip the well if all six summary files exist, are non-empty, and skip_complete=true
    # (the spaces around == matter: "$x"=="true" without them is a single
    #  non-empty word, which [[ ]] always treats as true)
    if [[ -s "${dir_well[$row]}/samstats_PE" \
        && -s "${dir_well[$row]}/samstats_SE1" \
        && -s "${dir_well[$row]}/samstats_SE2" \
        && -s "${dir_well[$row]}/picard_PE" \
        && -s "${dir_well[$row]}/picard_SE1" \
        && -s "${dir_well[$row]}/picard_SE2" \
        && "$skip_complete" == "true" ]]
    then
        echo -e "final aligned .bams for '${wellprefix[$row]}' already exist. skipping this well.'"
    else

        echo -e "\n\nprofiling .bams from '${wellprefix[$row]}'...\n\n"

        if ! cd "${dir_well[$row]}"
        then
            # do not write summary files into whatever directory we happen to be in
            echo "WARNING: could not enter '${dir_well[$row]}'; skipping '${wellprefix[$row]}'."
            continue
        fi

        # run samtools stats
        # (keeps the "SN" summary-number lines, reduced to their name and value columns)
        for aln in PE SE1 SE2
        do
            samtools stats ${aln}.Final.bam | grep '^SN' | cut -f 2,3 > samstats_${aln}
        done

        # run picard collectrnaseqmetrics
        for aln in PE SE1 SE2
        do
            samtools sort ${aln}.Final.bam | \
                picard CollectRnaSeqMetrics -I /dev/stdin -O picard_${aln} \
                --REF_FLAT "$ref_flat" -STRAND "NONE" --RIBOSOMAL_INTERVALS "$ref_rrna"
        done

    fi
done


echo -e "\n\n'A05e_samstat_star' completed.\n\n"



echo "Job $JOB_ID.$SGE_TASK_ID ended on:   " $(hostname -s)
echo "Job $JOB_ID.$SGE_TASK_ID ended on:   " $(date)
echo " "

