# All-in-One Pipeline

## Load Variables and Make Directories

In [1]:
# Treat any reference to an unset variable as an error from here on.
set -u
# Load the pipeline settings. The variables used in this notebook are expected
# to be defined by this file (its contents are not shown here).
source bioinf_intro_config.sh
# Clean up output from a previous run. "${CUROUT:?...}" makes the command fail
# loudly when CUROUT is unset OR empty, and the quotes keep a path containing
# whitespace from being split into several rm targets.
rm -rf -- "${CUROUT:?CUROUT is unset or empty}"
# (Re)create every directory the later cells write into.
mkdir -p -- "$STAR_OUT" "$GENOME_DIR" "$MYINFO" "$TRIMMED" "$QC_RAW" "$QC_TRIM"

## Make Adapter File

In [2]:
# Write the adapter FASTA used for trimming: the two adapter sequences plus
# their reverse complements (the *_rc records). The quoted 'EOF' delimiter
# keeps the text literal, and quoting "$ADAPTERS" avoids an "ambiguous
# redirect" error if the path contains whitespace.
cat > "$ADAPTERS" <<'EOF'
>Adapter
AGATCGGAAGAGCACACGTCTGAACTCCAGTCA
>AdapterRead2
AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
>Adapter_rc
TGACTGGAGTTCAGACGTGTGCTCTTCCGATCT
>AdapterRead2_rc
ACACTCTTTCCCTACACGACGCTCTTCCGATCT
EOF

## Download Genome and Annotation

In [3]:
# Download the genome FASTA and the GTF annotation into the genome directory.
# The URLs and the target directory are quoted so that characters such as
# '?', '*' or spaces in them are passed to wget unchanged.
for CUR in "$FA_URL" "$GTF_URL"; do
    wget --directory-prefix "${GENOME_DIR}" "${CUR}"
done

--2019-06-26 17:01:07--  ftp://ftp.ensemblgenomes.org/pub/release-39/fungi/fasta/fungi_basidiomycota1_collection/cryptococcus_neoformans_var_grubii_h99/dna/Cryptococcus_neoformans_var_grubii_h99.CNA3.dna.toplevel.fa.gz
           => ‘/home/jovyan/work/scratch/bioinf_intro/genome/Cryptococcus_neoformans_var_grubii_h99.CNA3.dna.toplevel.fa.gz’
Resolving ftp.ensemblgenomes.org (ftp.ensemblgenomes.org)... 193.62.197.94
Connecting to ftp.ensemblgenomes.org (ftp.ensemblgenomes.org)|193.62.197.94|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/release-39/fungi/fasta/fungi_basidiomycota1_collection/cryptococcus_neoformans_var_grubii_h99/dna ... done.
==> SIZE Cryptococcus_neoformans_var_grubii_h99.CNA3.dna.toplevel.fa.gz ... 5922212
==> PASV ... done.    ==> RETR Cryptococcus_neoformans_var_grubii_h99.CNA3.dna.toplevel.fa.gz ... done.
Length: 5922212 (5.6M) (unauthoritative)


2019-06-26 17:01:09 (5.21 MB/s

In [4]:
# Decompress the downloaded annotation and genome in place.
# --force overwrites an existing decompressed copy instead of prompting.
gunzip --force -- "${GENOME_DIR}/${GTF}.gz"
gunzip --force -- "${GENOME_DIR}/${FA}.gz"

## Index Genome

In [5]:
# Build the STAR genome index from the decompressed FASTA, inserting the
# splice junctions described by the GTF annotation. The index is written to
# $GENOME_DIR; STAR's own log for this step goes to ${STAR_OUT}/genome_*.
# NOTE(review): --genomeSAindexNbases 6 is far below STAR's default of 14;
# presumably chosen to keep indexing light for this small genome - confirm
# against the STAR manual's recommendation for the genome length.
STAR \
    --runThreadN "$THREADS" \
    --runMode genomeGenerate \
    --genomeDir "$GENOME_DIR" \
    --genomeFastaFiles "${GENOME_DIR}/${FA}" \
    --sjdbGTFfile "${GENOME_DIR}/${GTF}" \
    --outFileNamePrefix "${STAR_OUT}/genome_" \
    --sjdbGTFfeatureExon exon \
    --sjdbGTFtagExonParentTranscript transcript_id \
    --sjdbGTFtagExonParentGene gene_id \
    --genomeSAindexNbases 6

Jun 26 17:01:12 ..... started STAR run
Jun 26 17:01:12 ... starting to generate Genome files
Jun 26 17:01:13 ... starting to sort Suffix Array. This may take a long time...
Jun 26 17:01:13 ... sorting Suffix Array chunks and saving them to disk...
Jun 26 17:01:34 ... loading chunks from disk, packing SA...
Jun 26 17:01:35 ... finished generating suffix array
Jun 26 17:01:35 ... generating Suffix Array index
Jun 26 17:01:35 ... completed Suffix Array index
Jun 26 17:01:35 ..... processing annotations GTF
Jun 26 17:01:35 ..... inserting junctions into the genome indices
Jun 26 17:01:59 ... writing Genome to disk ...
Jun 26 17:01:59 ... writing Suffix Array to disk ...
Jun 26 17:01:59 ... writing SAindex to disk
Jun 26 17:01:59 ..... finished successfully


## Read Quality Control

In [6]:
# Run FastQC on the raw lane 1 and lane 2 FASTQs; reports go to $QC_RAW.
# Only the directory variable is quoted: the [1-2] glob must stay unquoted so
# the shell expands it to the matching files.
fastqc --quiet --threads "$THREADS" --outdir "$QC_RAW" \
    "$RAW_FASTQS"/21_2019_P_M1_S21_L00[1-2]_R1_001.fastq.gz

With globs and `basename` in our toolbox, we are ready to **conquer the world** — or at least to run multiple FASTQ files through our pipeline without breaking a sweat!

## Trim and Map Reads

In [7]:
# For each raw FASTQ matched by the glob: trim adapters and low-quality bases
# with fastq-mcf, then align the trimmed reads with STAR. Output file names are
# built from the input name with the '_001.fastq.gz' suffix removed.
for FASTQ in "$RAW_FASTQS"/21_2019_P_M1_S21_L00[1-2]_R1_001.fastq.gz; do
    # When nothing matches, the glob is left as a literal pattern; skip it
    # instead of handing a non-existent file to the tools.
    if [ ! -e "$FASTQ" ]; then
        echo "Skipping missing input: $FASTQ" >&2
        continue
    fi

    FASTQ_BASE="$(basename "${FASTQ}" '_001.fastq.gz')"

    echo "---------------- TRIMMING: $FASTQ_BASE ----------------"
    # Use $ADAPTERS (the file written by the "Make Adapter File" cell) rather
    # than repeating its path here, so the two cells cannot drift apart.
    # -q 20 trims bases with quality below 20.
    if ! fastq-mcf \
        "$ADAPTERS" \
        "$FASTQ" \
        -q 20 -x 0.5 \
        -o "$TRIMMED/${FASTQ_BASE}_001.trim.fastq.gz"
    then
        # Do not map a missing or partially written trimmed file.
        echo "Trimming failed for $FASTQ_BASE; skipping mapping" >&2
        continue
    fi

    echo "---------------- MAPPING: $FASTQ_BASE ----------------"
    # --readFilesCommand takes the command and its arguments as separate
    # words, so "gunzip -c" is intentionally left unquoted.
    STAR \
        --runMode alignReads \
        --twopassMode None \
        --genomeDir "$GENOME_DIR" \
        --readFilesIn "$TRIMMED/${FASTQ_BASE}_001.trim.fastq.gz" \
        --readFilesCommand gunzip -c \
        --outFileNamePrefix "${STAR_OUT}/${FASTQ_BASE}_" \
        --quantMode GeneCounts \
        --outSAMtype BAM SortedByCoordinate \
        --runThreadN "$THREADS" \
        --alignIntronMax 5000 \
        --outSJfilterIntronMaxVsReadN 500 1000 2000
done

---------------- TRIMMING: 21_2019_P_M1_S21_L001_R1 ----------------
Command Line: /home/jovyan/work/scratch/bioinf_intro/myinfo/neb_e7600_adapters.fasta /data/hts_2019_data/hts2019_pilot_rawdata/21_2019_P_M1_S21_L001_R1_001.fastq.gz -q 20 -x 0.5 -o /home/jovyan/work/scratch/bioinf_intro/trimmed_fastqs/21_2019_P_M1_S21_L001_R1_001.trim.fastq.gz
Scale used: 2.2
Phred: 33
Threshold used: 751 out of 300000
Adapter Adapter (AGATCGGAAGAGCACACGTCTGAACTCCAGTCA): counted 2504 at the 'end' of '/data/hts_2019_data/hts2019_pilot_rawdata/21_2019_P_M1_S21_L001_R1_001.fastq.gz', clip set to 6
Files: 1
Total reads: 2479050
Too short after clip: 1519
Clipped 'end' reads: Count: 46013, Mean: 15.47, Sd: 8.24
Trimmed 299018 reads by an average of 1.71 bases on quality < 20
---------------- MAPPING: 21_2019_P_M1_S21_L001_R1 ----------------
Jun 26 17:03:00 ..... started STAR run
Jun 26 17:03:00 ..... loading genome
Jun 26 17:03:00 ..... started mapping
Jun 26 17:03:41 ..... started sorting BAM
Jun 26 17:0

In [8]:
# Run FastQC on the trimmed lane 1 and lane 2 FASTQs; reports go to $QC_TRIM.
# Only the directory variable is quoted: the [1-2] glob must stay unquoted so
# the shell expands it to the matching files.
fastqc --quiet --threads "$THREADS" --outdir "$QC_TRIM" \
    "$TRIMMED"/21_2019_P_M1_S21_L00[1-2]_R1_001.trim.fastq.gz

### And let's check the result

In [9]:
# List the files STAR produced (quoted so a path with spaces stays one argument).
ls -- "${STAR_OUT}"

21_2019_P_M1_S21_L001_R1_Aligned.sortedByCoord.out.bam
21_2019_P_M1_S21_L001_R1_Log.final.out
21_2019_P_M1_S21_L001_R1_Log.out
21_2019_P_M1_S21_L001_R1_Log.progress.out
21_2019_P_M1_S21_L001_R1_ReadsPerGene.out.tab
21_2019_P_M1_S21_L001_R1_SJ.out.tab
21_2019_P_M1_S21_L002_R1_Aligned.sortedByCoord.out.bam
21_2019_P_M1_S21_L002_R1_Log.final.out
21_2019_P_M1_S21_L002_R1_Log.out
21_2019_P_M1_S21_L002_R1_Log.progress.out
21_2019_P_M1_S21_L002_R1_ReadsPerGene.out.tab
21_2019_P_M1_S21_L002_R1_SJ.out.tab
genome_Log.out


In [10]:
# Show the first lines of each per-lane gene-count table. The '?' glob stays
# outside the quotes so it still matches the single lane digit.
head -- "${STAR_OUT}"/21_2019_P_M1_S21_L00?_R1_ReadsPerGene.out.tab

==> /home/jovyan/work/scratch/bioinf_intro/star_out/21_2019_P_M1_S21_L001_R1_ReadsPerGene.out.tab <==
N_unmapped	47554	47554	47554
N_multimapping	34395	34395	34395
N_noFeature	12474	2182940	18831
N_ambiguous	206368	813	114
CNAG_04548	0	0	0
CNAG_07303	0	0	0
CNAG_07304	10	0	10
CNAG_00001	0	0	0
CNAG_07305	2	0	2
CNAG_00002	43	0	43

==> /home/jovyan/work/scratch/bioinf_intro/star_out/21_2019_P_M1_S21_L002_R1_ReadsPerGene.out.tab <==
N_unmapped	46059	46059	46059
N_multimapping	33738	33738	33738
N_noFeature	12438	2146527	18736
N_ambiguous	203237	786	143
CNAG_04548	0	0	0
CNAG_07303	0	0	0
CNAG_07304	6	0	6
CNAG_00001	0	0	0
CNAG_07305	1	0	1
CNAG_00002	51	0	51


In [11]:
# Summarise the results with MultiQC: one report each for the STAR output,
# the raw-read FastQC results and the trimmed-read FastQC results. All three
# reports are written into $STAR_OUT; --force overwrites existing reports.
multiqc --force "${STAR_OUT}" --outdir "${STAR_OUT}" --filename multiqc_report_counts.html
multiqc --force "$QC_RAW" --outdir "${STAR_OUT}" --filename multiqc_report_rawfastq.html
multiqc --force "$QC_TRIM" --outdir "${STAR_OUT}" --filename multiqc_report_trimfastq.html

  configs = yaml.load(f)
  sp = yaml.load(f)
[INFO   ]         multiqc : This is MultiQC v1.7
[INFO   ]         multiqc : Template    : default
[INFO   ]         multiqc : Searching '/home/jovyan/work/scratch/bioinf_intro/star_out'
[?25lSearching 13 files..  [####################################]  100%          [?25h
[INFO   ]            star : Found 2 reports and 2 gene count files
[INFO   ]         multiqc : Compressing plot data
[INFO   ]         multiqc : Report      : ../../scratch/bioinf_intro/star_out/multiqc_report_counts.html
[INFO   ]         multiqc : Data        : ../../scratch/bioinf_intro/star_out/multiqc_report_counts_data
[INFO   ]         multiqc : MultiQC complete
  configs = yaml.load(f)
  sp = yaml.load(f)
[INFO   ]         multiqc : This is MultiQC v1.7
[INFO   ]         multiqc : Template    : default
[INFO   ]         multiqc : Searching '/home/jovyan/work/scratch/bioinf_intro/qc_output_raw'
[INFO   ]          fastqc : Found 2 reports
[INFO   ]         multiqc 