# Sort and Index the alignment results

In [2]:
source config.sh

## [Converting SAM to BAM with samtools](http://quinlanlab.org/tutorials/samtools/samtools.html)

Here I followed the instructions from the [Samtools Tutorial](http://quinlanlab.org/tutorials/samtools/samtools.html) created by Aaron Quinlan.

- Converting SAM to BAM with samtools "view"

```
### convert SAM to BAM
samtools view -S -b sample.sam > sample.bam

### print the results
samtools view sample.bam | head
```

- samtools "sort"

"When you align FASTQ files with all current sequence aligners, the alignments produced are in random order with respect to their position in the reference genome. In other words, the BAM file is in the order that the sequences occurred in the input FASTQ files."

```
### Before sort
samtools view sample.bam | head

### sort
samtools sort sample.bam -o sample.sorted.bam

### After sort
samtools view sample.bam | head
```

In [3]:
DIR=( "$RAW_FASTQS" "$TRIMMED"    "$FILTER" )
OUT=( "$ALIGN_RAW"  "$ALIGN_TRIM" "$ALIGN_FILTER" )
REFFA=$GENOME/$FA

for ((i=0; i<${#OUT[@]}; ++i)); do
    echo "================================================"
    echo "${OUT[i]}"
done

/home/jovyan/work/Data/SRR4841864/align_raw
/home/jovyan/work/Data/SRR4841864/align_trim
/home/jovyan/work/Data/SRR4841864/align_filter


In [4]:
DIR=( "$RAW_FASTQS" "$TRIMMED"    "$FILTER" )
OUT=( "$ALIGN_RAW"  "$ALIGN_TRIM" "$ALIGN_FILTER" )
REFFA=$GENOME/$FA

for ((i=0; i<${#OUT[@]}; ++i)); do
    echo "================================================"
    echo "${OUT[i]}"
    echo "++++++++++++++++++++++++"  
    
    ### SAM to BAM
    samtools view -S -b ${OUT[i]}/aln_pe.sam > ${OUT[i]}/aln_pe.bam
    
    ### Sorting
    samtools sort ${OUT[i]}/aln_pe.bam -o ${OUT[i]}/aln_pe_sorted.bam
done

/home/jovyan/work/Data/SRR4841864/align_raw
++++++++++++++++++++++++
[bam_sort_core] merging from 3 files and 1 in-memory blocks...
/home/jovyan/work/Data/SRR4841864/align_trim
++++++++++++++++++++++++
[bam_sort_core] merging from 3 files and 1 in-memory blocks...
/home/jovyan/work/Data/SRR4841864/align_filter
++++++++++++++++++++++++
[bam_sort_core] merging from 2 files and 1 in-memory blocks...


In [2]:
echo $FA

Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa


In [4]:
ls $ALIGN_RAW

aln_pe.sam


In [5]:
samtools view -S -b $ALIGN_RAW/aln_pe.sam > $ALIGN_RAW/aln_pe.bam

In [10]:
samtools sort $ALIGN_RAW/aln_pe.bam -o $ALIGN_RAW/aln_pe_sorted.bam

[bam_sort_core] merging from 3 files and 1 in-memory blocks...


In [8]:
samtools view $ALIGN_RAW/aln_pe.bam | head

SRR4841864.1	83	VII	224972	60	101M	=	224852	-221	TCATCCTTTATAGCTTTCCCAACAGCGGCAGCAGCAGCAGCATCCTTCTTTCTCGATTTTTGAATACCATTCCCACGCATGCTCTTCCCATTGGTATGCCN	CCCCCCECCCCCCBBABBCCCBCCCCECCFDFDBBCCCGHIIIIIIIIIHCGHFGDDIIIHGHF?BDFGHIFIGHEHGGIGHGGBBGGDHFHDFFFDB:1#	NM:i:3	MD:Z:11G10T77T0	MC:Z:41S50M10S	AS:i:90	XS:i:19
SRR4841864.1	163	VII	224852	60	41S50M10S	=	224972	221	TGTCATTTTCATTTTCGTTGTCGTTGTCGTTGTCATTGTCATTGTCGTTGTCGTTGTCGTTGTTGTTTTCATTGTCGTTGTCGTTGTCTTCATCGTTATTG	==1A:BDDHFHHFIIIGIGHIAFGGCEHI?:?:CFD*:BDFHEH*?DHIBBF6<FAHBBCACEHFFH>CCBED;6@BC@=?AB@BCBCCCCDA@??BBBCC	NM:i:0	MD:Z:50	MC:Z:101M	AS:i:50	XS:i:0	SA:Z:VII,224835,+,38M6D20M43S,51,7;
SRR4841864.1	2211	VII	224835	51	38M6D20M43H	=	224972	238	TGTCATTTTCATTTTCGTTGTCGTTGTCGTTGTCATTGTCATTGTCGTTGTCGTTGTC	==1A:BDDHFHHFIIIGIGHIAFGGCEHI?:?:CFD*:BDFHEH*?DHIBBF6<FAHB	NM:i:7	MD:Z:34G3^TTGTTT20	MC:Z:101M	AS:i:41	XS:i:0	SA:Z:VII,224852,+,41S50M10S,60,0;
SRR4841864.2	83	XI	618277	60	101M	=	618040	-338	TACCTTCCTTGTCTTGGATCTTGGACTTGACATTGTCAATGGTGTCAGAAG

In [11]:
samtools view $ALIGN_RAW/aln_pe_sorted.bam | head

SRR4841864.1882824	163	I	1	12	16M3D8M1I10M1D13M53S	=	244	339	CCACACCACACCCACACCACACACACACACCACACCCACACCACACCCCCACCCCACCCCCACCCCACCCCCACCCCCTCCCAAACCACCCCCCCACCCCC	CCCFFFFFHGHHGJJIJJHIIB8EAEGHIII@BDHBHGB1CF(.6;@A'95>A################################################	NM:i:5	MD:Z:16^CAC18^A13	MC:Z:5S96M	AS:i:24	XS:i:37
SRR4841864.3374304	99	I	1	0	1S31M7I26M36S	=	111	208	ACCACACCACACCCACACCCACACACACCACACCCACACCCACACACCACACCACACCCACACACCCACACACCACAACCACCCCCACCCCCCACACCCCC	@B@FFFFFHHHGHIIGIIGHIIJIJJJIIG=GGII?FHG(;5@CG15;6?;;;=ACB?@BBB?<@8=B0<??@############################	NM:i:9	MD:Z:17A1C37	MC:Z:29M2I37M1D31M2S	AS:i:34	XS:i:47
SRR4841864.4256138	99	I	1	7	17S33M51S	=	111	207	ACACCCACACACCACACCCACACCACACCCACACACCCACACACCACACCCCAACCCCACACCACACCAACACCACACCCACACCACCCCCCACCCACCCA	@???BDDBHDFA?E:EFIEGIEHIGGGGIIE6BF1D;@6;;FE##########################################################	NM:i:0	MD:Z:33	MC:Z:29M2I23M2I45M	AS:i:33	XS:i:47
SRR4841864.1997645	163	I	2	17	3S17M1D12M1I30M38S	=	140	208	ACACACACC

In [13]:
samtools faidx $GENOME/$FA

In [15]:
echo $GENOME/$FA

/home/jovyan/work/Data/SRR4841864/genome/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa


In [16]:
samtools mpileup -g -f $GENOME/$FA $ALIGN_RAW/aln_pe_sorted.bam > $ALIGN_RAW/raw.bcf

[mpileup] 1 samples in 1 input files


In [33]:
bcftools mpileup -f $GENOME/$FA $ALIGN_RAW/aln_pe_sorted.bam > $ALIGN_RAW/raw.bcf

[mpileup] 1 samples in 1 input files


In [21]:
bcftools view $ALIGN_RAW/raw.bcf > $ALIGN_RAW/var.bcf

## filter SNP

In [32]:
bcftools view $ALIGN_RAW/var.bcf | head

##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##samtoolsVersion=1.9+htslib-1.9
##samtoolsCommand=samtools mpileup -g -f /home/jovyan/work/Data/SRR4841864/genome/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa /home/jovyan/work/Data/SRR4841864/align_raw/aln_pe_sorted.bam
##reference=file:///home/jovyan/work/Data/SRR4841864/genome/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa
##contig=<ID=I,length=230218>
##contig=<ID=II,length=813184>
##contig=<ID=III,length=316620>
##contig=<ID=IV,length=1531933>
##contig=<ID=V,length=576874>


In [24]:
bcftools view $ALIGN_RAW/var.bcf | vcfutils varFilter > $ALIGN_RAW/final.vcf

In [20]:
ls $ALIGN_RAW/

aln_pe.bam  aln_pe.sam  aln_pe_sorted.bam  raw.bcf


In [29]:
samtools index $ALIGN_RAW/aln_pe_sorted.bam

In [35]:
# remove header lines and look at top 4 entires
#zcat variants/evolved-6.mpileup.vcf.gz | egrep -v '##' | head -4
cat $ALIGN_RAW/final.vcf | egrep -v '##' | head -4

#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	/home/jovyan/work/Data/SRR4841864/align_raw/aln_pe_sorted.bam
I	1	.	C	<*>	0	.	DP=3;I16=1,0,0,0,33,1089,0,0,12,144,0,0,0,0,0,0;QS=1,0;MQ0F=0.333333	PL	0,3,12
I	12	.	C	<*>	0	.	DP=8;I16=5,0,0,0,178,6444,0,0,96,2350,0,0,47,487,0,0;QS=1,0;MQ0F=0.25	PL	0,15,73
I	23	.	C	<*>	0	.	DP=14;I16=8,0,0,0,271,9393,0,0,132,3516,0,0,119,2145,0,0;QS=1,0;MQ0F=0.214286	PL	0,24,89


In [37]:
bgzip -c $ALIGN_RAW/final.vcf > $ALIGN_RAW/final.vcf.gz

In [38]:
tabix -p vcf $ALIGN_RAW/final.vcf.gz

In [41]:
#bcftools stats -F assembly/spades_final/scaffolds.fasta -s - variants/evolved-6.mpileup.vcf.gz > variants/evolved-6.mpileup.vcf.gz.stats
bcftools stats -F $GENOME/$FA -s - $ALIGN_RAW/final.vcf.gz > $ALIGN_RAW/final.vcf.gz.stats

In [45]:
#plot-vcfstats -p variants/plots/ variants/evolved-6.mpileup.vcf.gz.stats
plot-vcfstats -p $ALIGN_RAW $ALIGN_RAW/final.vcf.gz.stats

Parsing bcftools stats output: /home/jovyan/work/Data/SRR4841864/align_raw/final.vcf.gz.stats
	expected: # PSC	[2]id	[3]sample	[4]nRefHom	[5]nNonRefHom	[6]nHets	[7]nTransitions	[8]nTransversions	[9]nIndels	[10]average depth	[11]nSingletons
	found:    # PSC	[2]id	[3]sample	[4]nRefHom	[5]nNonRefHom	[6]nHets	[7]nTransitions	[8]nTransversions	[9]nIndels	[10]average depth	[11]nSingletons	[12]nHapRef	[13]nHapAlt	[14]nMissing
Plotting graphs: python plot.py
Creating PDF: pdflatex summary.tex >plot-vcfstats.log 2>&1
Finished: /home/jovyan/work/Data/SRR4841864/align_raw/summary.pdf


In [40]:
echo $GENOME/$FA

/home/jovyan/work/Data/SRR4841864/genome/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa


In [30]:
head -200 $ALIGN_RAW/final.vcf

##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##samtoolsVersion=1.9+htslib-1.9
##samtoolsCommand=samtools mpileup -g -f /home/jovyan/work/Data/SRR4841864/genome/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa /home/jovyan/work/Data/SRR4841864/align_raw/aln_pe_sorted.bam
##reference=file:///home/jovyan/work/Data/SRR4841864/genome/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa
##contig=<ID=I,length=230218>
##contig=<ID=II,length=813184>
##contig=<ID=III,length=316620>
##contig=<ID=IV,length=1531933>
##contig=<ID=V,length=576874>
##contig=<ID=VI,length=270161>
##contig=<ID=VII,length=1090940>
##contig=<ID=VIII,length=562643>
##contig=<ID=IX,length=439888>
##contig=<ID=X,length=745751>
##contig=<ID=XI,length=666816>
##contig=<ID=XII,length=1078177>
##contig=<ID=XIII,length=924431>
##contig=<ID=XIV,length=784333>
##contig=<ID=XV,length=1091291>
##contig=<ID=XVI,length=948066>
##contig=<ID=Mito,length=85779>
##ALT=<ID=*,Description="Represents allele(s) other th