## Discard sequences that exceed the max expected error threshold.

In [11]:
%%bash
# Filter the demultiplexed, merged reads by expected error with usearch and
# collect the surviving reads, as FASTA, in data/tmp/maxee$maxee.fasta.
# The input is split into blocks so usearch can run on them in parallel.

nprocs=20
# Maximum expected error per read, passed to usearch -fastq_maxee.
# Later cells hard-code the "maxee1" file prefix; update them if this changes.
maxee=1
input_fastq=/var/seq_data/Chazy/515_806_SIP/Pool1/pear_merged_raw-2015-07-02.assembled.dmult.fastq

# The output redirection below fails if this directory is missing.
mkdir -p data/tmp

tmpdir1=$(mktemp -d)   # holds the split FASTQ blocks
tmpdir2=$(mktemp -d)   # holds the per-block FASTA output
# One cleanup handler for both directories: installing a second trap on the
# same signals replaces the first one, which would leak $tmpdir1.
# Signals are turned into a normal exit so the EXIT handler always runs.
trap 'rm -rf "$tmpdir1" "$tmpdir2"' EXIT
trap 'exit 1' 1 2 3 15

# 2,000,000 lines per block (500,000 records, assuming 4-line FASTQ records).
split -d -l 2000000 "$input_fastq" "$tmpdir1/Block"

# -k makes parallel emit results in input (block) order.
# A block's FASTA is only concatenated when usearch succeeded on it.
ls "$tmpdir1"/Block?? | parallel --gnu -j "$nprocs" -k "usearch -fastq_filter {} \
-fastq_maxee $maxee \
-fastaout $tmpdir2/{#}.fasta >/dev/null 2>&1 && cat $tmpdir2/{#}.fasta" > "data/tmp/maxee$maxee.fasta"

# Report how many reads passed and show the first few records.
grep -c ">" "data/tmp/maxee$maxee.fasta"
head -n 8 "data/tmp/maxee$maxee.fasta"

8982380
>12C.NTH.D3.R4_Frac19_0 orig_name=M02465:159:000000000-AGE5C:1:1101:16210:1816
TACGTAGGGGACGAGCGTTGTCCGGATTCATTGGGCGTAAAGCGCGCGTAGGCGGCTCGGAAAGTCGGTCGTGAAATGCCGGGGCTCAACCCCGGGACTGCGTCCGATACTTCCGGGCTGGAGGTAGGTAGGGGAGATCGGAATTCCTGGTGTAGCGGTGAAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCCGATACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGG
>12C.PTH.D1.R4_Frac23_1 orig_name=M02465:159:000000000-AGE5C:1:1101:13282:1830
TACGGAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGCTTTGTAAGTTAGAGGTGAAAGCCCAGGGCTCAACCCTGGAATTGCCTTTAAGACTGCATCGCTTGAACATCGGAGAGGTAAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAAGAACACCAGTGGCGAAGGCGACTTACTGGACGATTGTTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG
>12C.PTH.D1.R4_Frac21_2 orig_name=M02465:159:000000000-AGE5C:1:1101:14760:1836
TACAGAGGGGGCAAGCGTTGTTCGGAATTACTGGGCGTAAAGGGCGCGTAGGCGGCCTCCTAAGTCGGACGTGAAATCCCCGAGCTCAACTCGGGAACTGCGTCCGATACTGGTTGGCTTGAATCCGGGAGAGGGATGCGGAATTCCAGGTGTAGCGGTGAAATGCGTAGATATCTGGAGGAACACCGGTGGCGAAGGCGGCATCCTGGACCGGTATTGACGCTGAGGCGCGAAAGCCAGGGGAGCA

In [12]:
%%bash
# Keep only reads whose sequence contains no "N", re-emitting each record as
# ">name <field 4>" followed by the sequence (field 4 is the comment column in
# bioawk's fastx mode), then report the read counts before and after.
in_fasta=data/tmp/maxee1.fasta
out_fasta=data/tmp/maxee1.0.noN.fasta

bioawk -c fastx '$seq !~ /N/ { print ">" $name " " $4 "\n" $seq }' "$in_fasta" > "$out_fasta"

for fasta in "$in_fasta" "$out_fasta"; do
    grep -c ">" "$fasta"
done

8982380
8982380


## Remove seqs with N characters.

# Alignment-based QC with Mothur

In [13]:
%%bash
# Collapse identical reads with mothur unique.seqs; mothur's console output is
# discarded. Later cells read the resulting .unique.fasta and .names files.
# ionice: best-effort I/O class (-c2), priority level 3 (-n3).
fasta=data/tmp/maxee1.0.noN.fasta
ionice -c2 -n3 mothur "#unique.seqs(fasta=$fasta)" >/dev/null

In [14]:
%%bash
# Number of reads before and after dereplication (one count per line).
for fasta in data/tmp/maxee1.0.noN.fasta data/tmp/maxee1.0.noN.unique.fasta; do
    grep -c ">" "$fasta"
done

8982380
3230660


##### Need to use all sequences (not just the unique sequences) in the next step to create the group file

In [15]:
%%bash
# Build the group file: one line per read, "<read name><TAB><sample>",
# where the sample is the part of the read name before the first underscore.
group_file=data/group_file.txt

bioawk -c fastx '{ print $name }' data/tmp/maxee1.0.noN.fasta \
    | awk -F_ -v OFS='\t' '{ print $0, $1 }' > "$group_file"

# Preview the first lines of the result.
head "$group_file"

12C.NTH.D3.R4_Frac19_0	12C.NTH.D3.R4
12C.PTH.D1.R4_Frac23_1	12C.PTH.D1.R4
12C.PTH.D1.R4_Frac21_2	12C.PTH.D1.R4
13C.NTH.D14.R4_Frac9_3	13C.NTH.D14.R4
13X.NTH.D3.R4_Frac11_4	13X.NTH.D3.R4
12C.NTH.D3.R4_Frac6_5	12C.NTH.D3.R4
13X.PTH.D7.R4_Frac26_6	13X.PTH.D7.R4
13X.PTH.D7.R4_Frac6_7	13X.PTH.D7.R4
12C.PTH.D3.R4_Frac5_8	12C.PTH.D3.R4
12C.NTH.D3.R4_Frac6_9	12C.NTH.D3.R4


In [3]:
# Directory that receives the downloaded SILVA reference archives.
!mkdir -p "/home/chantal/RNASIP/data/tmp/db"

In [8]:
%%bash
# Download and unpack the mothur-formatted SILVA reference alignments
# (bacteria, eukarya, archaea), unless the concatenated reference already exists.
db_dir=/home/chantal/RNASIP/data/tmp/db
ref=/home/chantal/RNASIP/data/silva_ref_aln_mothur.fasta

# Fetch one archive into $db_dir and unpack it there.
#   $1 = local zip file name, $2 = download URL
fetch_silva() {
    # -f: fail on HTTP errors so unzip never receives an error page.
    # -L: follow redirects.
    # unzip reads the same absolute path curl wrote to and extracts into
    # $db_dir (-d), which is where the concatenation cell looks for the files;
    # -o overwrites without prompting so re-runs behave the same.
    curl -fL -o "$db_dir/$1" "$2" && unzip -o -d "$db_dir" "$db_dir/$1"
}

if ! [ -e "$ref" ]; then
    mkdir -p "$db_dir"
    fetch_silva silva_B.zip http://www.mothur.org/w/images/9/98/Silva.bacteria.zip
    fetch_silva silva_E.zip http://www.mothur.org/w/images/1/1a/Silva.eukarya.zip
    fetch_silva silva_A.zip http://www.mothur.org/w/images/3/3c/Silva.archaea.zip
fi

In [11]:
%%bash
# Concatenate the three SILVA alignments into one reference file, skipping
# the work when that file already exists.
db_dir=/home/chantal/RNASIP/data/tmp/db
ref=/home/chantal/RNASIP/data/silva_ref_aln_mothur.fasta

if [ ! -e "$ref" ]; then
    # NOTE(review): the archaea directory is capitalised ("Silva.archaea")
    # unlike the other two - confirm against the unpacked archive.
    cat "$db_dir/silva.bacteria/silva.bacteria.fasta" \
        "$db_dir/silva.eukarya/silva.eukarya.fasta" \
        "$db_dir/Silva.archaea/silva.archaea.fasta" \
        > "$ref"
fi

In [8]:
%%bash
# Run mothur filter.seqs with vertical=t on the combined SILVA reference
# alignment; mothur's console output is discarded.
ref=/home/chantal/RNASIP/data/silva_ref_aln_mothur.fasta
mothur "#filter.seqs(vertical=t, fasta=$ref, processors=10)" >/dev/null

In [None]:
%%bash
# Align the unique reads against the combined SILVA reference with mothur
# align.seqs; mothur's console output is discarded.
# ionice: best-effort I/O class (-c2), priority level 3 (-n3).
#
# The template is the file the earlier cells actually build
# (data/silva_ref_aln_mothur.fasta); the previous data/tmp/db/ location is
# never created by this notebook.
# NOTE(review): a filter.seqs cell earlier in the notebook is run on this same
# reference but its output is not referenced here - confirm whether the
# filtered or the unfiltered reference is the intended template.
ionice -c2 -n3 \
mothur "#align.seqs(candidate=data/tmp/maxee1.0.noN.unique.fasta, \
template=/home/chantal/RNASIP/data/silva_ref_aln_mothur.fasta, \
processors=15, \
flip=T)" > /dev/null

#### We can filter out vertical gaps...

In [None]:
%%bash
# Run mothur filter.seqs with vertical=t on the aligned unique reads;
# the next cell reads the resulting .unique.filter.fasta file.
alignment=data/tmp/maxee1.0.noN.unique.align
mothur "#filter.seqs(vertical=t, fasta=$alignment, processors=10)" >/dev/null

### Here is what our seqs look like.

In [19]:
%%bash
# Summarise the filtered alignment; the names file is passed so mothur can
# account for the reads each unique sequence represents.
prefix=data/tmp/maxee1.0.noN
mothur "#summary.seqs(fasta=${prefix}.unique.filter.fasta, processors=20, name=${prefix}.names)"

[H[2J





mothur v.1.32.1
Last updated: 10/16/2013

by
Patrick D. Schloss

Department of Microbiology & Immunology
University of Michigan
pschloss@umich.edu
http://www.mothur.org

When using, please cite:
Schloss, P.D., et al., Introducing mothur: Open-source, platform-independent, community-supported software for describing and comparing microbial communities. Appl Environ Microbiol, 2009. 75(23):7537-41.

Distributed under the GNU General Public License

Type 'help()' for information on the commands that are available

Type 'quit()' to exit program



mothur > summary.seqs(fasta=data/tmp/maxee1.0.noN.unique.filter.fasta, processors=20, name=data/tmp/maxee1.0.noN.names)

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20

### Remove sequences with homopolymers longer than 8 and screen out sequences that don't align to the amplicon region

In [20]:
%%bash
# Screen the filtered alignment with mothur screen.seqs using the start/end
# positions, the homopolymer limit (maxhomop=8) and the minimum length
# (minlength=253) given below. The names and group files are passed along so
# they are screened together with the sequences. Console output is discarded.
prefix=data/tmp/maxee1.0.noN
groups=data/group_file.txt

mothur "#screen.seqs(fasta=${prefix}.unique.filter.fasta, processors=10, name=${prefix}.names, group=${groups}, start=463, end=1385, maxhomop=8, minlength=253)" >/dev/null

In [21]:
%%bash
# Run mothur filter.seqs with vertical=T again, this time on the sequences
# that passed screening; console output is discarded.
screened=data/tmp/maxee1.0.noN.unique.filter.good.fasta
mothur "#filter.seqs(fasta=$screened, processors=10, vertical=T)" >/dev/null

In [22]:
%%bash
# Summarise the screened and re-filtered alignment, using the names file
# written by unique.seqs.
prefix=data/tmp/maxee1.0.noN
mothur "#summary.seqs(fasta=${prefix}.unique.filter.good.filter.fasta, name=${prefix}.names)"

[H[2J





mothur v.1.32.1
Last updated: 10/16/2013

by
Patrick D. Schloss

Department of Microbiology & Immunology
University of Michigan
pschloss@umich.edu
http://www.mothur.org

When using, please cite:
Schloss, P.D., et al., Introducing mothur: Open-source, platform-independent, community-supported software for describing and comparing microbial communities. Appl Environ Microbiol, 2009. 75(23):7537-41.

Distributed under the GNU General Public License

Type 'help()' for information on the commands that are available

Type 'quit()' to exit program



mothur > summary.seqs(fasta=data/tmp/maxee1.0.noN.unique.filter.good.filter.fasta, name=data/tmp/maxee1.0.noN.names)

Using 1 processors.

		Start	End	NBases	Ambigs	Polymer	NumSeqs
Minimum:	1	1007	253	0	3	1
2.5%-tile:	143	1007	253	0	4	218579
25%-tile:	143	1007	253	0	4	2185782
Median: 	143	1007	253	0	4	4371563
75%-tile:	143	1007	253	0	5	6557344
97.5%-tile:	143	1007	254	0	7	8524547
Maximum:	143	1177	424	0	8	8743125
Mean:	142.998	1007.0

# Chimera checking with UCHIME (done when assigning OTUs, in another notebook)

#### Expand data, remove gaps, and copy into data directory

In [None]:
%%bash
# Expand the unique sequences back to one record per read with mothur
# deunique.seqs, using the names file that survived screening.
prefix=data/tmp/maxee1.0.noN
mothur "#deunique.seqs(fasta=${prefix}.unique.filter.good.filter.fasta, name=${prefix}.good.names)"

[H[2J





mothur v.1.32.1
Last updated: 10/16/2013

by
Patrick D. Schloss

Department of Microbiology & Immunology
University of Michigan
pschloss@umich.edu
http://www.mothur.org

When using, please cite:
Schloss, P.D., et al., Introducing mothur: Open-source, platform-independent, community-supported software for describing and comparing microbial communities. Appl Environ Microbiol, 2009. 75(23):7537-41.

Distributed under the GNU General Public License

Type 'help()' for information on the commands that are available

Type 'quit()' to exit program



mothur > deunique.seqs(fasta=data/tmp/maxee1.0.noN.unique.filter.good.filter.fasta, name=data/tmp/maxee1.0.noN.good.names)

Output File Names: 
data/tmp/maxee1.0.noN.redundant.fasta


mothur > quit()


In [27]:
# Strip alignment gap characters ("-" and ".") from every line that does not
# contain ">" and write the result as the final QC'd FASTA.
!sed '/>/! s/[-.]//g' data/tmp/maxee1.0.noN.redundant.fasta > data/finalQC.fasta

In [None]:
# Preview the first ten lines of the final FASTA.
!head -n 10 data/finalQC.fasta

#### Total Number of QC'd sequences

In [None]:
# Count the header lines, i.e. the number of reads that passed all QC steps.
!grep -c '>' data/finalQC.fasta