## Discard sequences that exceed the maximum expected error threshold.

In [1]:
%%bash
# Quality-filter the merged reads with `usearch -fastq_filter`, one input block
# per parallel job, and collect the passing reads in data/tmp/maxee$maxee.fasta.

# Number of usearch jobs GNU parallel may run at once.
nprocs=20
# Maximum expected errors allowed per read (usearch -fastq_maxee).
# Later cells hard-code "maxee1" file names; update them if this value changes.
maxee=1

# Scratch directories: tmpdir1 holds the split input blocks, tmpdir2 holds the
# per-block usearch output.
tmpdir1=`mktemp -d`
tmpdir2=`mktemp -d`
# Register ONE handler covering both directories. A second `trap` for the same
# signals would replace the first and leave tmpdir1 behind. Exit afterwards so
# the pipeline does not keep running against deleted scratch directories.
trap "rm -r $tmpdir1 $tmpdir2; exit 1" 1 2 3 15

# Split the FASTQ into blocks of 2,000,000 lines (a multiple of 4, so records
# stay intact assuming standard 4-line FASTQ records).
split -d -l 2000000 /var/seq_data/Chazy/515_806_SIP/Pool2/pear_merged_raw-2015-08-15.assembled.dmult.fastq $tmpdir1/Block

# Filter each block; -k keeps the concatenated output in input-block order and
# {#} is the parallel job number used to name the per-block FASTA.
ls $tmpdir1/Block?? | parallel --gnu -j $nprocs -k "usearch -fastq_filter {} \
-fastq_maxee $maxee \
-fastaout $tmpdir2/{#}.fasta >/dev/null 2>&1 && cat $tmpdir2/{#}.fasta" > data/tmp/maxee$maxee.fasta
rm -r $tmpdir2 $tmpdir1

# Report how many reads passed and preview the first records.
grep -c ">" data/tmp/maxee$maxee.fasta
head -n 8 data/tmp/maxee$maxee.fasta

1798673
>13X.PTH.D1.R4.Frac15_0 orig_name=M01032:308:000000000-AHEJ4:1:1101:15377:1346
TACGGAGGGAGCTAGCGTTGTGCTGGATGACTGGGCGTAAAGCGCACGTAGGCGGAACAGAAAGTCAGAGGTGACATCCCAGGGCTCAACCTTGGAACTGCCTTTGAAACTCCTGTGCTTGAGGTCGGGAGAGGTGAGTGGAATTCCGAGGGTAGAGGTGGAATTCGTAGATATTCGGAGGAACACCAGGGGCGAAGGCGGCTCACTGGCTCGATACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG
>13X.PTH.D1.R4.Frac7_1 orig_name=M01032:308:000000000-AHEJ4:1:1101:16052:1355
TACGTAGGTGGCAAGCGTTATCCGGACTTACGGGGCGTAGAGCGCGCGCAGGTGGTTGCTCAAGTCTGATGTGAAAGCCCACGGCTCAACCGTGGAGGGTCATTGGAAACTGGGAGACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTGAAATGCGTAGAGATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGTAACTGACACTGAGGCGCGAAAGCGTGGGGAGCAAACAGG
>13X.NTH.D14.R4.Frac9_2 orig_name=M01032:308:000000000-AHEJ4:1:1101:16446:1361
GACGGAGGGTGCAAGCGGTGTTCGGACTTACTGGGCGTAAAGCGCGGGCAGGTGGTCTTTTAAGTCCTTTGTGAAATCCCGGGGCTCAACTCCGGATGTGCATAGGATACTGGAAGACTGGAGACTGGTAGAGGCGAGCGGAATTCCTGGTGTAGCGGTGGAATGCGTAGATATCAGGAAGAACACCAGTGGCGAAGGCGACTACCTGGCCTGTTCTTGACGCTGAGGCGCGAAAGCTAGGGGAGCAA

In [2]:
%%bash
# Keep only reads whose sequence contains no "N"; each kept record is written
# as ">name comment" followed by the sequence. Then print the read counts
# before and after the filter.
in_fasta=data/tmp/maxee1.fasta
out_fasta=data/tmp/maxee1.0.noN.fasta
bioawk -c fastx '$seq !~ /N/ {print ">" $name " " $comment "\n" $seq}' "$in_fasta" > "$out_fasta"
grep -c ">" "$in_fasta"
grep -c ">" "$out_fasta"

1798673
1798673


## Remove sequences containing N characters.

# Alignment-based QC with Mothur

In [3]:
%%bash
# Dereplicate the N-free reads with mothur's unique.seqs, run at reduced I/O
# priority (ionice class 2, level 3). mothur's console output is discarded.
unique_cmd="#unique.seqs(fasta=data/tmp/maxee1.0.noN.fasta)"
ionice -c 2 -n 3 mothur "$unique_cmd" > /dev/null

In [4]:
%%bash
# Read counts before and after dereplication.
for fasta in data/tmp/maxee1.0.noN.fasta data/tmp/maxee1.0.noN.unique.fasta; do
    grep -c ">" "$fasta"
done

1798673
1719127


##### Use all sequences (not just the unique ones) in the next step to create the group file

In [None]:
%%bash
# Build the group file: column 1 is the full read name, column 2 is the part
# of the name before the first "_" (tab-separated). Preview the first lines.
bioawk -c fastx '{print $name}' data/tmp/maxee1.0.noN.fasta \
    | awk -F "_" -v OFS="\t" '{print $0, $1}' \
    > data/group_file.txt
head -n 10 data/group_file.txt

13X.PTH.D1.R4.Frac15_0	13X.PTH.D1.R4.Frac15
13X.PTH.D1.R4.Frac7_1	13X.PTH.D1.R4.Frac7
13X.NTH.D14.R4.Frac9_2	13X.NTH.D14.R4.Frac9
12C.NTH.D30.R4.Frac4_3	12C.NTH.D30.R4.Frac4
13C.PTH.D30.R4.Frac7_4	13C.PTH.D30.R4.Frac7
13X.NTH.D14.R4.Frac6_6	13X.NTH.D14.R4.Frac6
13X.PTH.D14.R4.Frac12_7	13X.PTH.D14.R4.Frac12
13C.PTH.D30.R4.Frac22_8	13C.PTH.D30.R4.Frac22
12C.NTH.D7.R4.Frac26_9	12C.NTH.D7.R4.Frac26
13X.PTH.D14.R4.Frac10_10	13X.PTH.D14.R4.Frac10


In [None]:
%%bash
# Align the unique reads to the reference alignment with mothur's align.seqs
# at reduced I/O priority; mothur's console output is discarded.
# NOTE(review): the template is an absolute, user-specific path.
candidate=data/tmp/maxee1.0.noN.unique.fasta
template=/home/chantal/RNASIP/data/tmp/db/silva_ref_aln_mothur.filter.fasta
ionice -c 2 -n 3 \
    mothur "#align.seqs(candidate=$candidate, template=$template, processors=15, flip=T)" > /dev/null

#### We can filter out vertical gaps...

In [None]:
%%bash
# Run mothur's filter.seqs with vertical=t on the alignment produced by the
# previous step; mothur's console output is discarded.
aligned=data/tmp/maxee1.0.noN.unique.align
mothur "#filter.seqs(vertical=t, fasta=$aligned, processors=10)" > /dev/null

### Here is what our seqs look like.

In [20]:
%%bash
# Print mothur's summary of the filtered alignment, passing the names file
# alongside the unique sequences.
filtered=data/tmp/maxee1.0.noN.unique.filter.fasta
names=data/tmp/maxee1.0.noN.names
mothur "#summary.seqs(fasta=$filtered, processors=20, name=$names)"

[H[2J





mothur v.1.32.1
Last updated: 10/16/2013

by
Patrick D. Schloss

Department of Microbiology & Immunology
University of Michigan
pschloss@umich.edu
http://www.mothur.org

When using, please cite:
Schloss, P.D., et al., Introducing mothur: Open-source, platform-independent, community-supported software for describing and comparing microbial communities. Appl Environ Microbiol, 2009. 75(23):7537-41.

Distributed under the GNU General Public License

Type 'help()' for information on the commands that are available

Type 'quit()' to exit program



mothur > summary.seqs(fasta=data/tmp/maxee1.0.noN.unique.filter.fasta, processors=20, name=data/tmp/maxee1.0.noN.names)

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20

### Remove sequences with homopolymers longer than 8 and screen out sequences that don't align to the amplicon region

In [21]:
%%bash
# Screen the filtered alignment with mothur's screen.seqs, keeping the names
# and group files in sync. The criteria are alignment start/end positions,
# maximum homopolymer length and minimum sequence length; see mothur's
# screen.seqs documentation for their exact semantics.
# mothur's console output is discarded.
fasta=data/tmp/maxee1.0.noN.unique.filter.fasta
names=data/tmp/maxee1.0.noN.names
groups=data/group_file.txt
criteria="start=115, end=761, maxhomop=8, minlength=253"

mothur "#screen.seqs(fasta=$fasta, processors=10, name=$names, group=$groups, $criteria)" > /dev/null

In [22]:
%%bash
# Re-run mothur's filter.seqs (vertical=T) on the sequences that passed
# screening; mothur's console output is discarded.
screened=data/tmp/maxee1.0.noN.unique.filter.good.fasta
mothur "#filter.seqs(fasta=$screened, processors=10, vertical=T)" > /dev/null

In [23]:
%%bash
# Print mothur's summary of the screened, re-filtered alignment.
# Use the post-screen names file (maxee1.0.noN.good.names): it is the one the
# following deunique.seqs step pairs with this same FASTA, whereas
# maxee1.0.noN.names still lists sequences removed by screen.seqs.

mothur "#summary.seqs(fasta=data/tmp/maxee1.0.noN.unique.filter.good.filter.fasta, \
name=data/tmp/maxee1.0.noN.good.names)"

[H[2J





mothur v.1.32.1
Last updated: 10/16/2013

by
Patrick D. Schloss

Department of Microbiology & Immunology
University of Michigan
pschloss@umich.edu
http://www.mothur.org

When using, please cite:
Schloss, P.D., et al., Introducing mothur: Open-source, platform-independent, community-supported software for describing and comparing microbial communities. Appl Environ Microbiol, 2009. 75(23):7537-41.

Distributed under the GNU General Public License

Type 'help()' for information on the commands that are available

Type 'quit()' to exit program



mothur > summary.seqs(fasta=data/tmp/maxee1.0.noN.unique.filter.good.filter.fasta, name=data/tmp/maxee1.0.noN.names)

Using 1 processors.

		Start	End	NBases	Ambigs	Polymer	NumSeqs
Minimum:	1	645	253	0	3	1
2.5%-tile:	51	645	253	0	4	43811
25%-tile:	51	645	253	0	4	438105
Median: 	51	645	253	0	4	876210
75%-tile:	51	645	253	0	5	1314314
97.5%-tile:	51	645	254	0	6	1708608
Maximum:	51	716	320	0	8	1752418
Mean:	50.9986	645.009	253.096	0	4.64

# Chimera checking with UCHIME (done when assigning OTUs, in another notebook)

#### Expand data, remove gaps, and copy into data directory

In [24]:
%%bash
# Expand the unique sequences back to one record per read with mothur's
# deunique.seqs, using the post-screen names file.
unique_fasta=data/tmp/maxee1.0.noN.unique.filter.good.filter.fasta
good_names=data/tmp/maxee1.0.noN.good.names
mothur "#deunique.seqs(fasta=$unique_fasta, name=$good_names)"

[H[2J





mothur v.1.32.1
Last updated: 10/16/2013

by
Patrick D. Schloss

Department of Microbiology & Immunology
University of Michigan
pschloss@umich.edu
http://www.mothur.org

When using, please cite:
Schloss, P.D., et al., Introducing mothur: Open-source, platform-independent, community-supported software for describing and comparing microbial communities. Appl Environ Microbiol, 2009. 75(23):7537-41.

Distributed under the GNU General Public License

Type 'help()' for information on the commands that are available

Type 'quit()' to exit program



mothur > deunique.seqs(fasta=data/tmp/maxee1.0.noN.unique.filter.good.filter.fasta, name=data/tmp/maxee1.0.noN.good.names)

Output File Names: 
data/tmp/maxee1.0.noN.redundant.fasta


mothur > quit()


In [25]:
# Strip alignment gap characters ("-" and ".") from every line that does not
# contain ">" and write the result to the final QC FASTA.
!sed '/>/! s/[-.]//g' data/tmp/maxee1.0.noN.redundant.fasta > data/finalQC_pool2.fasta

In [26]:
# Preview the first 10 lines of the final QC FASTA.
!head -n 10 data/finalQC_pool2.fasta

>13X.PTH.D1.R4.Frac15_0
TACGGAGGGAGCTAGCGTTGTGCTGGATGACTGGGCGTAAAGCGCACGTAGGCGGAACAGAAAGTCAGAGGTGACATCCCAGGGCTCAACCTTGGAACTGCCTTTGAAACTCCTGTGCTTGAGGTCGGGAGAGGTGAGTGGAATTCCGAGGGTAGAGGTGGAATTCGTAGATATTCGGAGGAACACCAGGGGCGAAGGCGGCTCACTGGCTCGATACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG
>13X.PTH.D1.R4.Frac7_1
TACGTAGGTGGCAAGCGTTATCCGGACTTACGGGGCGTAGAGCGCGCGCAGGTGGTTGCTCAAGTCTGATGTGAAAGCCCACGGCTCAACCGTGGAGGGTCATTGGAAACTGGGAGACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTGAAATGCGTAGAGATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGTAACTGACACTGAGGCGCGAAAGCGTGGGGAGCAAACAGG
>13X.NTH.D14.R4.Frac9_2
GACGGAGGGTGCAAGCGGTGTTCGGACTTACTGGGCGTAAAGCGCGGGCAGGTGGTCTTTTAAGTCCTTTGTGAAATCCCGGGGCTCAACTCCGGATGTGCATAGGATACTGGAAGACTGGAGACTGGTAGAGGCGAGCGGAATTCCTGGTGTAGCGGTGGAATGCGTAGATATCAGGAAGAACACCAGTGGCGAAGGCGACTACCTGGCCTGTTCTTGACGCTGAGGCGCGAAAGCTAGGGGAGCAAACGGG
>12C.NTH.D30.R4.Frac4_3
AACGTCGGGTGCAAGCGTTAATCGGAAGTACTGGGCGTAAAGCGAGCGCAGGCGGTCTTGCAAGACAGATGTGAAATCCCCGGGCTTCGCCTGGGAACTGCATTTGTGACTGCAAGGCTGGAGTGCGGCAGAGGGG

#### Total Number of QC'd sequences

In [27]:
# Count the lines containing ">" (one per record) in the final QC FASTA.
!grep -c '>' data/finalQC_pool2.fasta

1752418
