# Analyze 2019 Course Data to Counts

## Load Variables and Make Directories

In [None]:
# Load the course configuration. The path variables used below (STAR_OUT,
# GENOME_DIR, MYINFO, TRIMMED, QC) are presumably defined there -- confirm.
source star_2019_course_config.sh
# rm -rf $CUROUT
# Create every working directory up front. Each expansion is quoted so that a
# path containing spaces or glob characters is created as a single directory,
# and an unset variable produces an error instead of being silently dropped.
mkdir -p "$STAR_OUT" "$GENOME_DIR" "$MYINFO" "$TRIMMED" "$QC/R1" "$QC/R2"

In [None]:
# Write the adapter FASTA used for trimming: the two adapter sequences plus
# their reverse complements (the "_rc" records).
# The here-document delimiter is quoted so the body is taken literally, and the
# redirect target is quoted so a path with spaces is not an "ambiguous redirect".
cat > "$ADAPTERS" <<'ADAPTER_FASTA'
>Adapter
AGATCGGAAGAGCACACGTCTGAACTCCAGTCA
>AdapterRead2
AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
>Adapter_rc
TGACTGGAGTTCAGACGTGTGCTCTTCCGATCT
>AdapterRead2_rc
ACACTCTTTCCCTACACGACGCTCTTCCGATCT
ADAPTER_FASTA

## Download Genome and Annotation

In [None]:
# Download the genome FASTA and GTF annotation archives into the genome
# directory. URLs and the target directory are quoted so they reach wget as
# single arguments even if they contain characters the shell would split or glob
# (for example "?" or "*" in a query string).
for CUR in "$FA_URL" "$GTF_URL" ; do
    wget --directory-prefix "${GENOME_DIR}" "${CUR}"
done

In [None]:
# Decompress the downloaded annotation and genome in place. --force overwrites
# an existing decompressed copy, so the cell can be re-run after a fresh download.
# Paths are quoted to keep each one a single argument.
gunzip --force "${GENOME_DIR}/${GTF}.gz"
gunzip --force "${GENOME_DIR}/${FA}.gz"

## Index Genome

In [None]:
# Build the STAR index from the decompressed FASTA and GTF. The index is written
# into GENOME_DIR; STAR's own log files go under STAR_OUT with a "genome_" prefix.
# Exons are grouped by the transcript_id / gene_id attributes of the GTF.
# NOTE(review): --genomeSAindexNbases 6 is far below STAR's default and is only
# appropriate for a very small genome -- confirm it matches the course genome.
# All path expansions are quoted so they are passed to STAR as single arguments.
STAR \
    --runThreadN "$THREADS" \
    --runMode genomeGenerate \
    --genomeDir "$GENOME_DIR" \
    --genomeFastaFiles "${GENOME_DIR}/${FA}" \
    --sjdbGTFfile "${GENOME_DIR}/${GTF}" \
    --outFileNamePrefix "${STAR_OUT}/genome_" \
    --sjdbGTFfeatureExon exon \
    --sjdbGTFtagExonParentTranscript transcript_id \
    --sjdbGTFtagExonParentGene gene_id \
    --genomeSAindexNbases 6

## Read Quality Control

In [None]:
# FastQC reports for every R1 file, written to $QC/R1. The directory parts are
# quoted (the glob itself stays unquoted so it still expands to the file list).
fastqc --quiet --threads "$THREADS" "$RAW_FASTQS"/*R1*.fastq.gz --outdir "$QC/R1"

In [None]:
# FastQC reports for every R2 file, written to $QC/R2. The directory parts are
# quoted (the glob itself stays unquoted so it still expands to the file list).
fastqc --quiet --threads "$THREADS" "$RAW_FASTQS"/*R2*.fastq.gz --outdir "$QC/R2"

## Trim and Map Reads

In [None]:
# For every R1 file in RAW_FASTQS: trim the read pair with fastq-mcf into
# TRIMMED, then align the trimmed pair with STAR, writing a coordinate-sorted
# BAM and per-gene counts under STAR_OUT with the sample name as prefix.
for FASTQ in "$RAW_FASTQS"/*_L00?_R1_001.fastq.gz
    do
        # When nothing matches, bash leaves the pattern as a literal string;
        # report that instead of feeding a non-existent file to the tools.
        if [ ! -e "$FASTQ" ]; then
            echo "No R1 FASTQ file found for: $FASTQ" >&2
            continue
        fi

        # Sample name = file name without directory and without the R1 suffix.
        FASTQ_BASE="$(basename "${FASTQ}" '_R1_001.fastq.gz')"

        echo "---------------- TRIMMING: $FASTQ_BASE ----------------"
        # -q / -x are fastq-mcf's quality and N-content thresholds (see its usage text).
        # If trimming fails, skip mapping: STAR would otherwise run on missing
        # files or on stale output left over from an earlier run.
        if ! fastq-mcf \
            "$ADAPTERS" \
            "$RAW_FASTQS/${FASTQ_BASE}_R1_001.fastq.gz" \
            "$RAW_FASTQS/${FASTQ_BASE}_R2_001.fastq.gz" \
            -q 20 -x 0.5 \
            -o "$TRIMMED/${FASTQ_BASE}_R1_001.trim.fastq.gz" \
            -o "$TRIMMED/${FASTQ_BASE}_R2_001.trim.fastq.gz"
        then
            echo "fastq-mcf failed for $FASTQ_BASE; skipping mapping" >&2
            continue
        fi

        echo "---------------- MAPPING: $FASTQ_BASE ----------------"
        # The trimmed files are gzip-compressed, hence --readFilesCommand gunzip -c.
        STAR \
            --runMode alignReads \
            --twopassMode None \
            --genomeDir "$GENOME_DIR" \
            --readFilesIn "$TRIMMED/${FASTQ_BASE}_R1_001.trim.fastq.gz" "$TRIMMED/${FASTQ_BASE}_R2_001.trim.fastq.gz" \
            --readFilesCommand gunzip -c \
            --outFileNamePrefix "${STAR_OUT}/${FASTQ_BASE}_" \
            --quantMode GeneCounts \
            --outSAMtype BAM SortedByCoordinate \
            --runThreadN "$THREADS" \
            --alignIntronMax 5000 \
            --outSJfilterIntronMaxVsReadN 500 1000 2000

    done

In [None]:
# Job limit for the commented-out GNU parallel call further down in this cell.
MAX_JOBS=1
# Threads given to each STAR process; replaces any THREADS value set earlier
# in the session.
THREADS=10

# trim_and_star_func R1_FASTQ
#
# Trim one read pair with fastq-mcf and align it with STAR in a single pass.
# The trimmed reads are streamed to STAR through two named pipes, so no trimmed
# FASTQ files are written to disk.
#
# Arguments:
#   $1  Path to the R1 file, named <sample>_R1_001.fastq.gz. The R2 file is
#       taken from the same directory as <sample>_R2_001.fastq.gz.
# Environment read: ADAPTERS, THREADS, GENOME_DIR, STAR_OUT
#   (when run through GNU parallel these must be exported, like the function
#   itself -- TODO confirm the config script exports them).
# Returns:
#   0 on success; non-zero if an input file is unreadable, the pipes cannot be
#   created, or STAR / fastq-mcf fails (STAR's status takes precedence).
trim_and_star_func() {
    local r1_fastq="$1"
    # Derive the mate from the R1 path so both inputs keep their directory.
    local r2_fastq="${r1_fastq%_R1_001.fastq.gz}_R2_001.fastq.gz"
    local FASTQ_BASE
    FASTQ_BASE="$(basename "${r1_fastq}" '_R1_001.fastq.gz')"

    echo "$1"
    echo "$FASTQ_BASE"

    # Fail early on a missing input instead of letting both tools start and
    # then block on the pipes.
    local input_fastq
    for input_fastq in "$r1_fastq" "$r2_fastq"; do
        if [ ! -r "$input_fastq" ]; then
            echo "trim_and_star_func: cannot read $input_fastq" >&2
            return 1
        fi
    done

    # Create the pipes inside a private directory from `mktemp -d`; unlike
    # `mktemp --dry-run`, the name cannot be taken by another process first.
    local pipe_dir
    pipe_dir="$(mktemp -d)" || return 1
    local R1_PIPE="${pipe_dir}/${FASTQ_BASE}_R1_pipe.fq"
    local R2_PIPE="${pipe_dir}/${FASTQ_BASE}_R2_pipe.fq"
    if ! mkfifo "$R1_PIPE" "$R2_PIPE"; then
        rm -rf "$pipe_dir"
        return 1
    fi

    # Run fastq-mcf in the background, writing the trimmed mates into the pipes.
    fastq-mcf \
        "$ADAPTERS" \
        "$r1_fastq" \
        "$r2_fastq" \
        -o "$R1_PIPE" \
        -o "$R2_PIPE" \
        -q 20 -x 0.5 &
    local trim_pid=$!

    # STAR consumes the pipes in the foreground.
    local star_status=0
    STAR \
        --runMode alignReads \
        --runThreadN "$THREADS" \
        --genomeDir "$GENOME_DIR" \
        --outSAMtype BAM SortedByCoordinate \
        --limitBAMsortRAM 1280000000 \
        --quantMode GeneCounts \
        --genomeLoad LoadAndKeep \
        --twopassMode None \
        --outFileNamePrefix "${STAR_OUT}/${FASTQ_BASE}_" \
        --alignIntronMax 5000 \
        --outSJfilterIntronMaxVsReadN 500 1000 2000 \
        --readFilesIn "$R1_PIPE" "$R2_PIPE" || star_status=$?

    # If STAR stopped without draining the pipes, the trimmer would stay blocked
    # on them forever, so terminate it before waiting.
    if [ "$star_status" -ne 0 ]; then
        kill "$trim_pid" 2>/dev/null
    fi
    local trim_status=0
    wait "$trim_pid" || trim_status=$?

    rm -rf "$pipe_dir"

    if [ "$star_status" -ne 0 ]; then
        return "$star_status"
    fi
    return "$trim_status"
}
# Export the function so child shells (e.g. those started by GNU parallel) can call it.
export -f trim_and_star_func

# STAR --genomeDir $GENOME_DIR \
#     --outFileNamePrefix ${STAR_OUT}/genomeload_ \
#     --genomeLoad LoadAndExit 

# parallel --jobs $MAX_JOBS trim_and_star_func ::: $RAW_FASTQS/1_A_*R1*.fastq.gz
# # parallel --jobs $MAX_JOBS trim_and_star_func ::: $RAW_FASTQS/A_[1-2]_*.fastq.gz

# STAR --genomeDir $GENOME_DIR \
#     --outFileNamePrefix ${STAR_OUT}/genomeremove_ \
#     --genomeLoad Remove

In [None]:
# File the per-gene count tables: unlock the data directory, make sure the
# destination exists and is writable, move the tables, then remove write
# permission from the whole data tree again.
# (Presumably FINAL_COUNTS lives under DATA_BASE -- confirm in the config.)
# Paths are quoted so each is passed as a single argument; the glob stays
# outside the quotes so it still expands.
chmod u+w "$DATA_BASE"
mkdir -p "$FINAL_COUNTS"
chmod -R u+w "$FINAL_COUNTS"
mv "$STAR_OUT"/*_ReadsPerGene.out.tab "$FINAL_COUNTS"
chmod -R a-w "$DATA_BASE"

In [None]:
# File the coordinate-sorted BAMs: unlock the data directory, make sure the
# destination exists and is writable, move the BAMs, then remove write
# permission from the whole data tree again.
# (Presumably FINAL_BAMS lives under DATA_BASE -- confirm in the config.)
# Paths are quoted so each is passed as a single argument; the glob stays
# outside the quotes so it still expands.
chmod u+w "$DATA_BASE"
mkdir -p "$FINAL_BAMS"
chmod -R u+w "$FINAL_BAMS"
mv "$STAR_OUT"/*_Aligned.sortedByCoord.out.bam "$FINAL_BAMS"
chmod -R a-w "$DATA_BASE"

In [None]:
# File the STAR summary logs: unlock the data directory, make sure the
# destination exists and is writable, move the logs, then remove write
# permission from the whole data tree again.
# (Presumably FINAL_LOG lives under DATA_BASE -- confirm in the config.)
# Paths are quoted so each is passed as a single argument; the glob stays
# outside the quotes so it still expands.
chmod u+w "$DATA_BASE"
mkdir -p "$FINAL_LOG"
chmod -R u+w "$FINAL_LOG"
mv "$STAR_OUT"/*_Log.final.out "$FINAL_LOG"
chmod -R a-w "$DATA_BASE"

In [None]:
# Summarize the R1 FastQC results into one report under STAR_OUT; --force
# overwrites a report left by a previous run. Paths are quoted so each is a
# single argument.
multiqc --force "$QC/R1" --outdir "${STAR_OUT}" --filename multiqc_report_R1.html

In [None]:
# Summarize the R2 FastQC results into one report under STAR_OUT; --force
# overwrites a report left by a previous run. Paths are quoted so each is a
# single argument.
multiqc --force "$QC/R2" --outdir "${STAR_OUT}" --filename multiqc_report_R2.html

In [None]:
# Summarize the mapping results: MultiQC scans the filed BAMs, STAR logs and
# count tables plus whatever remains in STAR_OUT, and writes the report into
# STAR_OUT. Paths are quoted so each is a single argument.
multiqc --force "${FINAL_BAMS}" "${FINAL_LOG}" "${FINAL_COUNTS}" "${STAR_OUT}" --outdir "${STAR_OUT}" --filename multiqc_report_counts.html
# Print where the reports were written.
echo "${STAR_OUT}"