## Discard sequences that exceed the maximum expected error threshold

In [1]:
%%bash

# Quality-filter the demultiplexed, merged reads: split the FASTQ into blocks,
# run `usearch -fastq_filter` on the blocks in parallel, and concatenate the
# surviving reads (in block order) into data/tmp/maxee<maxee>.fasta.

nprocs=20
#Changing max expected error to 2 - less conservative
maxee=2
raw_fastq=/var/seq_data/Chazy/SIP_bulk/2_250_bulk_pear_merged_raw-2015-03-16.assembled.dmult.fastq
filtered_fasta=data/tmp/maxee$maxee.fasta

# One scratch root with ONE cleanup handler. Registering a second `trap` for
# the same signals replaces the first, which previously left the block
# directory behind whenever the cell was interrupted.
scratch=$(mktemp -d) || exit 1
trap 'rm -rf -- "$scratch"' EXIT
trap 'exit 1' 1 2 3 15        # turn HUP/INT/QUIT/TERM into an exit so the EXIT trap runs
tmpdir1=$scratch/blocks       # raw FASTQ blocks produced by split
tmpdir2=$scratch/filtered     # per-block FASTA written by usearch
mkdir -- "$tmpdir1" "$tmpdir2" || exit 1

# 2000000 lines per block is a multiple of 4, so four-line FASTQ records are
# never cut in half at a block boundary.
split -d -l 2000000 "$raw_fastq" "$tmpdir1/Block" || exit 1

# {} is the block path and {#} the job sequence number (both substituted by
# parallel); -k emits job output in input order. $maxee and $tmpdir2 are
# expanded by this shell before parallel sees the command string.
# A block whose usearch run fails produces no output, so treat a non-zero
# exit from parallel as fatal instead of silently losing those reads.
parallel --gnu -j "$nprocs" -k "usearch -fastq_filter {} \
-fastq_maxee $maxee \
-fastaout $tmpdir2/{#}.fasta >/dev/null 2>&1 && cat $tmpdir2/{#}.fasta" \
    ::: "$tmpdir1"/Block?? > "$filtered_fasta" \
    || { echo "usearch failed on at least one block; $filtered_fasta is incomplete" >&2; exit 1; }

# Report how many reads survived and show the first few records.
grep -c ">" "$filtered_fasta"
head -n 8 "$filtered_fasta"

3061920
>13X.NTH.Day7.Rep3_0 orig_name=M02465:134:000000000-ACLGB:1:1101:16302:1721
TACGTAGGTGGCAAGCGTTATCCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGCTTTGTAAGTCAGAGGTGAAAGCCTGGAGCTCAACTCCAGAACTGCCTTTGAGACTGCATCGCTTGAATCCAGGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGCTATTCGGAAGAACACCAGTGGCGAAGGCGGCTCACTGGACTGGTATTGCCGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGATAACTAGCTGTCCGGGCACTTGGTGCTTGGGTGGCGCAGCTAACGCATTAAGTTATCCGCCTGGGGAGTACGGTCGCAAGGTTG
>13C.NTH.Day7.Rep1_5 orig_name=M02465:134:000000000-ACLGB:1:1101:18653:1762
TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATATTTAAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCTTTGATACTGGGTATCTCGAGTATGGAAGAGGTGAGTGGAATTCCGGGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGTCCATTACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACGGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATGTTAGCCGTCGGCATGCATGCATGTCGGTGGCGCAGCTAACGCATTAAACATTCCGCCTGGGGAGTACGGTCGCAAGATTG
>13X.PTH.Day3.Rep2_6 orig_name=M02465:134:000000000-ACLGB:1:1101:18013:1767
TACGGAGGGGGCTAGCG

In [2]:
%%bash
# Peek at the raw demultiplexed FASTQ and count its records.
raw_fastq=/var/seq_data/Chazy/SIP_bulk/2_250_bulk_pear_merged_raw-2015-03-16.assembled.dmult.fastq
head "$raw_fastq"
# Count records as (number of lines / 4). Counting lines that contain "+"
# over-counts, because '+' is also a legal Phred+33 quality character and can
# therefore appear on quality lines as well as on the separator line.
# Assumes the four-line-per-record layout shown by `head` above.
awk 'END { print NR / 4 }' "$raw_fastq"

@13X.NTH.Day7.Rep3_0 orig_name=M02465:134:000000000-ACLGB:1:1101:16302:1721
TACGTAGGTGGCAAGCGTTATCCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGCTTTGTAAGTCAGAGGTGAAAGCCTGGAGCTCAACTCCAGAACTGCCTTTGAGACTGCATCGCTTGAATCCAGGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGCTATTCGGAAGAACACCAGTGGCGAAGGCGGCTCACTGGACTGGTATTGCCGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGATAACTAGCTGTCCGGGCACTTGGTGCTTGGGTGGCGCAGCTAACGCATTAAGTTATCCGCCTGGGGAGTACGGTCGCAAGGTTG
+
>1>1AFFAFBA?1B11AAE0A3DAEB0BBD1D2AFA/AAABD1B//AABG0BBA>/>EG1B22BF@2B11>B/BB@B0010?B0BB11B>B1B1FGBCG1FFFGG1G/?BFF0FDGA?0?CG?III>IH<IGIIIIIIIIIIIIIIIIIIIIIII0IIIIIII=IIIIIICIII=GIIIIIIIIIIIIIIIIIIIIIIIIIIIG>GIIIIIEIGIIIIIBIIIII/IIIIIIIIIIIIIIIIIIIIIIIIGDDGD??1GF<A/?1FGF2F?B/CC@/?FGEHHFGFFHFGGGBHHF?>E/CGHHFGFF?BCCCEEGFAEAB>>>BBABGGEEEBHFFFHHFHEEAFGEGEEEFFGGEGFFE@DDCFFCA3>33
@13X.NTH.Day7.Rep4_1 orig_name=M02465:134:000000000-ACLGB:1:1101:14598:1736
TACGTAGGGTGCGAGCGTTATTCGGATTTACTGGGCGTAAAGCGTGCGCAGGCGGTGTTTTAAGTCTGGTGTGAAATCCCCGGGCTTAACCTGGGAAC

In [3]:
%%bash
# Keep only reads whose sequence contains no "N", writing them as two-line
# FASTA records (">name <4th fastx column>" followed by the sequence), then
# print the record count before and after filtering.
# The 4th fastx column is the comment field according to bioawk's docs.
unfiltered=data/tmp/maxee2.fasta
no_n=data/tmp/maxee2.0.noN.fasta
bioawk -c fastx '$seq !~ /N/ { printf(">%s %s\n%s\n", $name, $4, $seq) }' \
  "$unfiltered" > "$no_n"
for fasta in "$unfiltered" "$no_n"; do
  grep -c ">" "$fasta"
done

3061920
3061920


## Remove sequences containing N (ambiguous base) characters

# Alignment-based QC with Mothur

In [5]:
%%bash
# Dereplicate the N-free reads with mothur's unique.seqs; mothur's console
# output is discarded. ionice runs the job at a reduced I/O priority.
dereplicate='#unique.seqs(fasta=data/tmp/maxee2.0.noN.fasta)'
ionice -c2 -n3 mothur "$dereplicate" > /dev/null

In [6]:
# Compare the number of FASTA records before and after dereplication
# (each record header line contains ">").
!grep -c ">" data/tmp/maxee2.0.noN.fasta
!grep -c ">" data/tmp/maxee2.0.noN.unique.fasta

3061920
2783092


##### The next step needs all sequences (not just the unique ones) to create the group file

In [None]:
%%bash
# Build the group file: one line per read, "<read name><TAB><group>", where
# the group is the part of the read name before the first underscore.
# Uses the full (non-dereplicated) read set so every read gets a group.
all_reads=data/tmp/maxee2.0.noN.fasta
group_file=data/group_file.txt
bioawk -c fastx '{ print $name }' "$all_reads" \
  | awk 'BEGIN { FS = "_"; OFS = "\t" } { print $0, $1 }' > "$group_file"
head "$group_file"

13X.NTH.Day7.Rep3_0	13X.NTH.Day7.Rep3
13C.NTH.Day7.Rep1_5	13C.NTH.Day7.Rep1
13X.PTH.Day3.Rep2_6	13X.PTH.Day3.Rep2
12C.PTH.Day7.Rep3_13	12C.PTH.Day7.Rep3
12C.PTH.Day7.Rep2_18	12C.PTH.Day7.Rep2
13C.PTH.Day3.Rep2_23	13C.PTH.Day3.Rep2
12C.PTH.Day7.Rep2_24	12C.PTH.Day7.Rep2
13X.NTH.Day3.Rep4_25	13X.NTH.Day3.Rep4
13X.NTH.Day14.Rep1_26	13X.NTH.Day14.Rep1
13X.PTH.Day1.Rep2_29	13X.PTH.Day1.Rep2


In [3]:
# Create the directory that will hold the downloaded reference archives
# (-p: no error if it already exists, parents created as needed).
!mkdir -p /home/chantal/RNASIP/data/tmp/db

In [8]:
%%bash
# Download and unpack the mothur-formatted SILVA reference archives
# (bacteria, eukarya, archaea), but only if the combined reference alignment
# has not been built yet.
db_dir=/home/chantal/RNASIP/data/tmp/db

# fetch_silva <zip name> <url>
#   Downloads <url> to $db_dir/<zip name> and extracts it inside $db_dir.
#   -f makes curl fail on an HTTP error instead of saving the error page;
#   -L follows redirects. The archive is unzipped from the same absolute
#   path it was downloaded to, and -d extracts into $db_dir (where the
#   concatenation step looks for the files) rather than into whatever the
#   current working directory happens to be.
#   NOTE(review): assumes each archive unpacks into its own sub-folder
#   (e.g. silva.bacteria/) — confirm against the archive contents.
fetch_silva() {
    curl -fL -o "$db_dir/$1" "$2" && unzip -o "$db_dir/$1" -d "$db_dir"
}

if ! [ -e /home/chantal/RNASIP/data/silva_ref_aln_mothur.fasta ]; then
    mkdir -p "$db_dir"
    fetch_silva silva_B.zip http://www.mothur.org/w/images/9/98/Silva.bacteria.zip
    fetch_silva silva_E.zip http://www.mothur.org/w/images/1/1a/Silva.eukarya.zip
    fetch_silva silva_A.zip http://www.mothur.org/w/images/3/3c/Silva.archaea.zip
fi

In [11]:
%%bash
# Concatenate the three SILVA alignments (bacteria, eukarya, archaea) into a
# single reference file; skipped when that file already exists.
silva_db=/home/chantal/RNASIP/data/tmp/db
combined_ref=/home/chantal/RNASIP/data/silva_ref_aln_mothur.fasta
if [ ! -e "$combined_ref" ]; then
    # NOTE(review): the archaea folder is spelled with a capital "S", unlike
    # the other two — presumably that is how the archive unpacks; confirm.
    cat "$silva_db/silva.bacteria/silva.bacteria.fasta" \
        "$silva_db/silva.eukarya/silva.eukarya.fasta" \
        "$silva_db/Silva.archaea/silva.archaea.fasta" \
        > "$combined_ref"
fi

In [8]:
%%bash
# Run mothur's filter.seqs with vertical=t on the combined SILVA reference
# alignment (10 processors); mothur's console output is discarded.
ref_filter='#filter.seqs(vertical=t, '
ref_filter+='fasta=/home/chantal/RNASIP/data/silva_ref_aln_mothur.fasta, '
ref_filter+='processors=10)'
mothur "$ref_filter" > /dev/null

In [None]:
%%bash
# Align the dereplicated reads against the combined SILVA reference with
# mothur's align.seqs (15 processors, flip=T), at reduced I/O priority.
#
# The template path now points at data/silva_ref_aln_mothur.fasta, which is
# where the reference-building cell writes the combined alignment. The
# previous data/tmp/db/ location is never created by this notebook.
# NOTE(review): the vertically filtered copy of the reference
# (silva_ref_aln_mothur.filter.fasta) is produced earlier but not used here —
# confirm which of the two is the intended template.
ionice -c2 -n3 \
mothur "#align.seqs(candidate=data/tmp/maxee2.0.noN.unique.fasta, \
template=/home/chantal/RNASIP/data/silva_ref_aln_mothur.fasta, \
processors=15, \
flip=T)" > /dev/null

#### We can filter out vertical gaps...

In [None]:
%%bash
# Run mothur's filter.seqs with vertical=t on the aligned reads
# (10 processors); mothur's console output is discarded.
aln_filter='#filter.seqs(vertical=t, '
aln_filter+='fasta=data/tmp/maxee2.0.noN.unique.align, '
aln_filter+='processors=10)'
mothur "$aln_filter" > /dev/null

### Here is what our seqs look like.

In [16]:
%%bash
# Summarise the filtered alignment with mothur's summary.seqs. The names
# file from dereplication is passed alongside the unique-sequence FASTA.
# Output is left on the console on purpose so the summary table is visible.
summarise='#summary.seqs(fasta=data/tmp/maxee2.0.noN.unique.filter.fasta, '
summarise+='processors=20, '
summarise+='name=data/tmp/maxee2.0.noN.names)'
mothur "$summarise"

[H[2J





mothur v.1.32.1
Last updated: 10/16/2013

by
Patrick D. Schloss

Department of Microbiology & Immunology
University of Michigan
pschloss@umich.edu
http://www.mothur.org

When using, please cite:
Schloss, P.D., et al., Introducing mothur: Open-source, platform-independent, community-supported software for describing and comparing microbial communities. Appl Environ Microbiol, 2009. 75(23):7537-41.

Distributed under the GNU General Public License

Type 'help()' for information on the commands that are available

Type 'quit()' to exit program



mothur > summary.seqs(fasta=data/tmp/maxee2.0.noN.unique.filter.fasta, processors=20, name=data/tmp/maxee2.0.noN.names)

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20 processors.

Using 20

### Remove sequences with homopolymers longer than 8 or shorter than 370 bases, and screen out sequences that don't align to the amplicon region

In [17]:
%%bash

# Screen the filtered alignment with mothur's screen.seqs, passing the
# names and group files along with the FASTA. Criteria handed to mothur:
# start=236, end=1621, maxhomop=8, minlength=370.
# mothur's console output is discarded.
screen='#screen.seqs(fasta=data/tmp/maxee2.0.noN.unique.filter.fasta, '
screen+='processors=10, '
screen+='name=data/tmp/maxee2.0.noN.names, '
screen+='group=data/group_file.txt, '
screen+='start=236, '
screen+='end=1621, '
screen+='maxhomop=8, minlength=370)'
mothur "$screen" > /dev/null

In [18]:
%%bash

# Run mothur's filter.seqs with vertical=T on the sequences that passed
# screening (10 processors); mothur's console output is discarded.
good_filter='#filter.seqs(fasta=data/tmp/maxee2.0.noN.unique.filter.good.fasta, '
good_filter+='processors=10, '
good_filter+='vertical=T)'
mothur "$good_filter" > /dev/null

In [20]:
%%bash

# Summarise the screened, re-filtered alignment with mothur's summary.seqs.
# Uses the post-screening names file (maxee2.0.noN.good.names), the same one
# the deunique.seqs step pairs with this FASTA, instead of the pre-screening
# names file, which still lists sequences that were screened out.
# Output is left on the console on purpose so the summary table is visible.
mothur "#summary.seqs(fasta=data/tmp/maxee2.0.noN.unique.filter.good.filter.fasta, \
name=data/tmp/maxee2.0.noN.good.names)"

[H[2J





mothur v.1.32.1
Last updated: 10/16/2013

by
Patrick D. Schloss

Department of Microbiology & Immunology
University of Michigan
pschloss@umich.edu
http://www.mothur.org

When using, please cite:
Schloss, P.D., et al., Introducing mothur: Open-source, platform-independent, community-supported software for describing and comparing microbial communities. Appl Environ Microbiol, 2009. 75(23):7537-41.

Distributed under the GNU General Public License

Type 'help()' for information on the commands that are available

Type 'quit()' to exit program



mothur > summary.seqs(fasta=data/tmp/maxee2.0.noN.unique.filter.good.filter.fasta, name=data/tmp/maxee2.0.noN.names)

Using 1 processors.

		Start	End	NBases	Ambigs	Polymer	NumSeqs
Minimum:	1	1193	370	0	3	1
2.5%-tile:	58	1193	370	0	4	74438
25%-tile:	58	1193	372	0	4	744372
Median: 	58	1193	373	0	5	1488743
75%-tile:	58	1193	374	0	6	2233114
97.5%-tile:	58	1193	375	0	8	2903048
Maximum:	58	1271	439	0	8	2977485
Mean:	57.9989	1193.02	373.19

# Chimera checking with UCHIME (done when assigning OTUs, in another notebook)

#### Expand data, remove gaps, and copy into data directory

In [22]:
%%bash
# Expand the unique sequences back to the full redundant read set with
# mothur's deunique.seqs, using the post-screening names file.
# Output is left on the console so the output file name is shown.
expand='#deunique.seqs(fasta=data/tmp/maxee2.0.noN.unique.filter.good.filter.fasta, '
expand+='name=data/tmp/maxee2.0.noN.good.names)'
mothur "$expand"

[H[2J





mothur v.1.32.1
Last updated: 10/16/2013

by
Patrick D. Schloss

Department of Microbiology & Immunology
University of Michigan
pschloss@umich.edu
http://www.mothur.org

When using, please cite:
Schloss, P.D., et al., Introducing mothur: Open-source, platform-independent, community-supported software for describing and comparing microbial communities. Appl Environ Microbiol, 2009. 75(23):7537-41.

Distributed under the GNU General Public License

Type 'help()' for information on the commands that are available

Type 'quit()' to exit program



mothur > deunique.seqs(fasta=data/tmp/maxee2.0.noN.unique.filter.good.filter.fasta, name=data/tmp/maxee2.0.noN.good.names)

Output File Names: 
data/tmp/maxee2.0.noN.redundant.fasta


mothur > quit()


In [23]:
# Strip alignment gap characters ("-" and ".") from every line that does not
# contain ">" (i.e. from sequence lines, leaving header lines untouched).
!sed '/>/!s/[-.]//g' data/tmp/maxee2.0.noN.redundant.fasta > data/finalQC.fasta

In [24]:
# Show the first ten lines of the final, gap-free FASTA as a sanity check.
!head data/finalQC.fasta

>13X.NTH.Day7.Rep3_0
TACGTAGGTGGCAAGCGTTATCCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGCTTTGTAAGTCAGAGGTGAAAGCCTGGAGCTCAACTCCAGAACTGCCTTTGAGACTGCATCGCTTGAATCCAGGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGCTATTCGGAAGAACACCAGTGGCGAAGGCGGCTCACTGGACTGGTATTGCCGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGATAACTAGCTGTCCGGGCACTTGGTGCTTGGGTGGCGCAGCTAACGCATTAAGTTATCCGCCTGGGGAGTACGGTCGCAAGGTTG
>13C.NTH.Day7.Rep1_5
TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATATTTAAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCTTTGATACTGGGTATCTCGAGTATGGAAGAGGTGAGTGGAATTCCGGGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGTCCATTACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACGGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATGTTAGCCGTCGGCATGCATGCATGTCGGTGGCGCAGCTAACGCATTAAACATTCCGCCTGGGGAGTACGGTCGCAAGATTG
>13X.PTH.Day3.Rep2_6
TACGGAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCGCGTAGGCGGCTTTGTAAGTTAGAGGTGAAAGCCCGGAGCTCAACTCCGGAATTGCCTTTAAGACTGCATCGCTAGAATCATGGAGAGGGGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAAGAACA

#### Total Number of QC'd sequences

In [25]:
# Count the records in the final FASTA (lines containing ">").
!grep -c ">" data/finalQC.fasta

2977485
