# Align Reads (YJF153) to the Reference (S288c)

In [1]:
source config.sh

Check that the raw, trimmed, and filtered FASTQ files are in place.

In [6]:
# List the raw, trimmed and filtered FASTQ directories.
# The variables are not defined in this notebook; presumably config.sh sets them.
# Quoted so that an unset/empty variable makes `ls` fail loudly instead of
# silently listing the current directory; `--` protects against odd path names.
ls -- "$RAW_FASTQS"
ls -- "$TRIMMED"
ls -- "$FILTER"

SRR4841864_1.fastq  SRR4841864_2.fastq  SRR4841864.sra
SRR4841864_1.trim.fastq  SRR4841864_2.trim.fastq
SRR4841864_1.filter.fastq  SRR4841864_2.filter.fastq


In [3]:
# List the reference genome directory (FASTA plus the BWA index files).
# Quoted so that an unset/empty $GENOME is reported by `ls` rather than
# silently listing the current directory.
ls -- "$GENOME"

Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa
Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.amb
Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.ann
Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.bwt
[0m[01;31mSaccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.gz[0m
Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.pac
Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.sa


## Alignment using `BWA mem`

https://github.com/lh3/bwa
```
git clone https://github.com/lh3/bwa.git
cd bwa; make
./bwa index ref.fa
./bwa mem ref.fa read-se.fq.gz | gzip -3 > aln-se.sam.gz
./bwa mem ref.fa read1.fq read2.fq | gzip -3 > aln-pe.sam.gz
```

Parameters
```
-t INT ---> Number of threads [1]
-M     ---> Mark shorter split hits as secondary (for Picard compatibility).
```

**Dry run: loop over the input/output directories and list the FASTQ files that will be aligned**

In [20]:
# Dry run: for each dataset (raw, trimmed, filtered) print the input directory,
# the output directory and the two mate FASTQ files found in the input
# directory. Nothing is aligned or written here.
DIR=( "$RAW_FASTQS" "$TRIMMED"    "$FILTER" )
OUT=( "$ALIGN_RAW"  "$ALIGN_TRIM" "$ALIGN_FILTER" )
# Not used by the dry run itself; kept because the original cell defines it.
REFFA=$GENOME/$FA

# show_pair READ_DIR OUT_DIR
#   Prints READ_DIR, OUT_DIR and the mate-1 / mate-2 FASTQ files in READ_DIR.
#   Sets the globals READ1 and READ2 (as the original cell did) to the first
#   match of each glob.
#   Returns 0 when exactly one existing file matches each mate, 1 otherwise
#   (a warning is written to stderr).
show_pair() {
    local read_dir=$1 out_dir=$2
    local -a mate1 mate2

    # Glob into arrays with the directory part quoted, so paths containing
    # whitespace are not split and the number of matches can be checked.
    mate1=( "$read_dir"/SRR4841864_1*.fastq )
    mate2=( "$read_dir"/SRR4841864_2*.fastq )
    READ1=${mate1[0]}
    READ2=${mate2[0]}

    echo "================================================"
    echo "$read_dir"
    echo "$out_dir"
    ls -- "${mate1[@]}"
    ls -- "${mate2[@]}"
    echo "++++++++++++++++++++++++++++++++++++++++++++++++"

    # An unmatched glob stays literal (one element, not a file); several
    # matches give more than one element. Both would break the alignment.
    if (( ${#mate1[@]} != 1 || ${#mate2[@]} != 1 )) \
        || [[ ! -f "$READ1" || ! -f "$READ2" ]]; then
        echo "show_pair: expected exactly one mate-1 and one mate-2 FASTQ in '$read_dir'" >&2
        return 1
    fi
}

# Keep going after a bad dataset so every directory gets reported.
for ((i=0; i<${#OUT[@]}; ++i)); do
    show_pair "${DIR[i]}" "${OUT[i]}"
done

/home/jovyan/work/Data/SRR4841864/rawdata
/home/jovyan/work/Data/SRR4841864/align_raw
/home/jovyan/work/Data/SRR4841864/rawdata/SRR4841864_1.fastq
/home/jovyan/work/Data/SRR4841864/rawdata/SRR4841864_2.fastq
++++++++++++++++++++++++++++++++++++++++++++++++
/home/jovyan/work/Data/SRR4841864/fastqs_trimmed
/home/jovyan/work/Data/SRR4841864/align_trim
/home/jovyan/work/Data/SRR4841864/fastqs_trimmed/SRR4841864_1.trim.fastq
/home/jovyan/work/Data/SRR4841864/fastqs_trimmed/SRR4841864_2.trim.fastq
++++++++++++++++++++++++++++++++++++++++++++++++
/home/jovyan/work/Data/SRR4841864/fastqs_filter
/home/jovyan/work/Data/SRR4841864/align_filter
/home/jovyan/work/Data/SRR4841864/fastqs_filter/SRR4841864_1.filter.fastq
/home/jovyan/work/Data/SRR4841864/fastqs_filter/SRR4841864_2.filter.fastq
++++++++++++++++++++++++++++++++++++++++++++++++


**Align raw, trimmed, and filtered reads to the reference**

In [21]:
# Align the raw, trimmed and filtered read pairs to the reference with
# `bwa mem`, writing <output dir>/aln_pe.sam for each dataset.
DIR=( "$RAW_FASTQS" "$TRIMMED"    "$FILTER" )
OUT=( "$ALIGN_RAW"  "$ALIGN_TRIM" "$ALIGN_FILTER" )
REFFA=$GENOME/$FA

# align_pair READ_DIR OUT_DIR
#   Prints READ_DIR, OUT_DIR and the mate FASTQ files, then runs
#   `bwa mem` on the pair and writes OUT_DIR/aln_pe.sam.
#   Globals read:    REFFA (reference passed to bwa),
#                    BWA_THREADS (optional thread count, default 10).
#   Globals written: READ1, READ2 (as the original cell did).
#   Returns 0 on success; 1 if the inputs are missing/ambiguous, the output
#   directory cannot be created, or bwa exits non-zero.
#   Deliberately uses `return`, never `exit`, so a failure does not terminate
#   the notebook's shell.
align_pair() {
    local read_dir=$1 out_dir=$2
    local -a mate1 mate2

    # Glob into arrays with the directory part quoted, so paths containing
    # whitespace are not split and the number of matches can be checked.
    mate1=( "$read_dir"/SRR4841864_1*.fastq )
    mate2=( "$read_dir"/SRR4841864_2*.fastq )
    READ1=${mate1[0]}
    READ2=${mate2[0]}

    echo "================================================"
    echo "$read_dir"
    echo "$out_dir"
    ls -- "${mate1[@]}"
    ls -- "${mate2[@]}"
    echo "++++++++++++++++++++++++++++++++++++++++++++++++"

    # bwa mem takes exactly two read files for paired-end input; an extra or
    # missing match would silently change what gets aligned.
    if (( ${#mate1[@]} != 1 || ${#mate2[@]} != 1 )) \
        || [[ ! -f "$READ1" || ! -f "$READ2" ]]; then
        echo "align_pair: expected exactly one mate-1 and one mate-2 FASTQ in '$read_dir'; skipping" >&2
        return 1
    fi

    # The redirection below fails if the output directory does not exist yet.
    if ! mkdir -p -- "$out_dir"; then
        echo "align_pair: cannot create output directory '$out_dir'; skipping" >&2
        return 1
    fi

    if ! bwa mem -t "${BWA_THREADS:-10}" "$REFFA" "$READ1" "$READ2" \
        > "$out_dir/aln_pe.sam"; then
        # The SAM file is left in place for inspection but must not be trusted.
        echo "align_pair: bwa mem failed for '$read_dir'; '$out_dir/aln_pe.sam' is incomplete" >&2
        return 1
    fi
}

# A failing dataset is reported on stderr; the remaining ones are still run.
for ((i=0; i<${#OUT[@]}; ++i)); do
    align_pair "${DIR[i]}" "${OUT[i]}"
done

/home/jovyan/work/Data/SRR4841864/rawdata
/home/jovyan/work/Data/SRR4841864/align_raw
/home/jovyan/work/Data/SRR4841864/rawdata/SRR4841864_1.fastq
/home/jovyan/work/Data/SRR4841864/rawdata/SRR4841864_2.fastq
++++++++++++++++++++++++++++++++++++++++++++++++
[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 990100 sequences (100000100 bp)...
[M::process] read 990100 sequences (100000100 bp)...
[M::mem_pestat] # candidate unique pairs for (FF, FR, RF, RR): (3, 426894, 34, 0)
[M::mem_pestat] skip orientation FF as there are not enough pairs
[M::mem_pestat] analyzing insert size distribution for orientation FR...
[M::mem_pestat] (25, 50, 75) percentile: (222, 290, 352)
[M::mem_pestat] low and high boundaries for computing mean and std.dev: (1, 612)
[M::mem_pestat] mean and std.dev: (289.16, 95.30)
[M::mem_pestat] low and high boundaries for proper pairs: (1, 742)
[M::mem_pestat] analyzing insert size distribution for orientation RF...
[M::mem_pestat] (25, 50, 75) percentile: 