##Discard sequences that exceed the max expected error threshold.

In [27]:
%%bash

nprocs=20
#Changing max expected error to 2 - less conservative
maxee=1

tmpdir1=`mktemp -d`
trap "rm -r $tmpdir1" 1 2 3 15
split -d -l 2000000 /var/seq_data/Chazy/515_806_SIP/Pool3/pear_merged_raw-2015-08-16.assembled.dmult.fastq $tmpdir1/Block

tmpdir2=`mktemp -d`
trap "rm -r $tmpdir2" 1 2 3 15
ls $tmpdir1/Block?? | parallel --gnu -j $nprocs -k "usearch -fastq_filter {} \
-fastq_maxee $maxee \
-fastaout $tmpdir2/{#}.fasta >/dev/null 2>&1 && cat $tmpdir2/{#}.fasta" > data/tmp/maxee$maxee.fasta
rm -r $tmpdir2 $tmpdir1

grep -c ">" data/tmp/maxee$maxee.fasta
head -n 8 data/tmp/maxee$maxee.fasta

5817405
>13X.NTH.D1.R4_Frac15_0 orig_name=M02465:168:000000000-AHFAK:1:1101:15527:1335
TACGTAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGTTTTGTAAGACAGTGGTGAAATCCCCGGGCTCAACCTGGGAACTGCCATTGTGACTGCAAAGCTAGAGTGCGGCAGAGGGGGATGGAATTCCGCGTGTAGCAGTGAAATGCGTAGATATGCGGAGGAACACCGATGGCGAAGGCAATCCCCTGGGCCTGCACTGACGCTCATGCACGAAAGCGTGGGGAGCAAACAGG
>12C.PTH.D14.R4_Frac6_1 orig_name=M02465:168:000000000-AHFAK:1:1101:15832:1338
TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGCGTAAAGAGCGCGCAGGCGGTCGATTTAGTCGCGTGTGAAAGCCCCCGGCTCAACTGGGGAGGGTCACGCGATACTGATCGACTCGAAGGCAGGAGAGGGTAGTGGAATTCCCGGTGTAGTGGTGAAATGCGTAGATATTGGGTGGAACACCGGTGGCGAAAGCGGCTCACTGGCTCGGTTCTGACGCTGAGGCGCGAGAGCGTGGGGAGCAAACAGG
>12C.PTH.D7.R4_Frac17_2 orig_name=M02465:168:000000000-AHFAK:1:1101:15325:1339
TACGTAGGGTCCGAGCGTTGTCCGGAGTTACTGGGCGTAAAGCGCGCGCAGGCGGCTTTATAAGTCTGGCGTGAAAGCCCCCGGCTCAACCGGGGAGGGTCGTCGGAGACTGTAGAGCTTGAGGGCGGTAGGGGCTGGTGGAATTCCCGGTGTAGTGGTGAAATGCGTAGAGATCCGGAAGAACACCGGTGGCGAAGGCGGCCCGCTGGGCAGGAAGGCTCGCAAGAGCCGGCTGACACTGAGGCGC

In [28]:
%%bash
bioawk -c fastx '{if ($seq !~ /N/){print ">" $name " " $4 "\n" $seq}}' data/tmp/maxee1.fasta > \
data/tmp/maxee1.0.noN.fasta
grep -c ">" data/tmp/maxee1.fasta
grep -c ">" data/tmp/maxee1.0.noN.fasta

5817405
5817405


##Remove seqs with N characters.

# Alignment-based QC with Mothur

In [29]:
%%bash
ionice -c2 -n3 \
mothur "#unique.seqs(fasta=data/tmp/maxee1.0.noN.fasta)" > /dev/null

In [30]:
!grep -c ">" data/tmp/maxee1.0.noN.fasta
!grep -c ">" data/tmp/maxee1.0.noN.unique.fasta

5817405
3086162


##### Need to use all sequences (not just unique sequences) for next step to create the data file

In [None]:
%%bash
bioawk -c fastx '{print $name}' data/tmp/maxee1.0.noN.fasta | \
awk -F"_" 'BEGIN{OFS="\t"}{print $0, $1}' > data/group_file.txt
head data/group_file.txt

13X.NTH.D1.R4_Frac15_0	13X.NTH.D1.R4
12C.PTH.D14.R4_Frac6_1	12C.PTH.D14.R4
12C.PTH.D7.R4_Frac17_2	12C.PTH.D7.R4
12C.NTH.D14.R4_Frac17_3	12C.NTH.D14.R4
12C.PTH.D14.R4_Frac6_4	12C.PTH.D14.R4
12C.NTH.D14.R4_Frac10_5	12C.NTH.D14.R4
13X.NTH.D1.R4_Frac27_6	13X.NTH.D1.R4
12C.NTH.D1.R4_Frac8_7	12C.NTH.D1.R4
12C.NTH.D1.R4_Frac5_8	12C.NTH.D1.R4
13X.NTH.D1.R4_Frac19_9	13X.NTH.D1.R4


In [3]:
!mkdir -p /home/chantal/RNASIP/data/tmp/db

In [8]:
%%bash
if ! [ -e /home/chantal/RNASIP/data/silva_ref_aln_mothur.fasta ]; then
    curl -o /home/chantal/RNASIP/data/tmp/db/silva_B.zip http://www.mothur.org/w/images/9/98/Silva.bacteria.zip && unzip data/tmp/db/silva_B.zip
    curl -o /home/chantal/RNASIP/data/tmp/db/silva_E.zip http://www.mothur.org/w/images/1/1a/Silva.eukarya.zip && unzip data/tmp/db/silva_E.zip
    curl -o /home/chantal/RNASIP/data/tmp/db/silva_A.zip http://www.mothur.org/w/images/3/3c/Silva.archaea.zip && unzip data/tmp/db/silva_A.zip
fi

In [11]:
%%bash
if ! [ -e /home/chantal/RNASIP/data/silva_ref_aln_mothur.fasta ]; then
    cat /home/chantal/RNASIP/data/tmp/db/silva.bacteria/silva.bacteria.fasta \
    /home/chantal/RNASIP/data/tmp/db/silva.eukarya/silva.eukarya.fasta \
    /home/chantal/RNASIP/data/tmp/db/Silva.archaea/silva.archaea.fasta \
    > /home/chantal/RNASIP/data/silva_ref_aln_mothur.fasta
fi

In [None]:
%%bash
#sleep #seconds

In [2]:
%%bash
ionice -c2 -n3 \
mothur "#filter.seqs(vertical=t, \
fasta=/home/chantal/RNASIP/data/silva_ref_aln_mothur.fasta, \
processors=10)" > /dev/null

In [None]:
%%bash
ionice -c2 -n3 \
mothur "#align.seqs(candidate=data/tmp/maxee1.0.noN.unique.fasta, \
template=/home/chantal/RNASIP/data/tmp/db/silva_ref_aln_mothur.fasta, \
processors=15, \
flip=T)" > /dev/null

#### We can filter out vertical gaps...

In [None]:
%%bash
ionice -c2 -n3 \
mothur "#filter.seqs(vertical=t, \
fasta=data/tmp/maxee1.0.noN.unique.align, \
processors=10)" > /dev/null

### Here is what our seqs look like.

In [None]:
%%bash
ionice -c2 -n3 \
mothur "#summary.seqs(fasta=data/tmp/maxee1.0.noN.unique.filter.fasta, \
processors=20, \
name=data/tmp/maxee1.0.noN.names)" 

### Removing homopolymers (larger than 8) and screen out sequences that don't align to amplicon region

In [9]:
%%bash
ionice -c2 -n3 \
mothur "#screen.seqs(fasta=data/tmp/maxee1.0.noN.unique.filter.fasta, \
processors=10, \
name=data/tmp/maxee1.0.noN.names, \
group=data/group_file.txt, \
start=337, \
end=1272, \
maxhomop=8, minlength=252)" > /dev/null

In [10]:
%%bash
ionice -c2 -n3 \
mothur "#filter.seqs(fasta=data/tmp/maxee1.0.noN.unique.filter.good.fasta, \
processors=10, \
vertical=T)" > /dev/null

In [19]:
%%bash
ionice -c2 -n3 \
mothur "#summary.seqs(fasta=data/tmp/maxee1.0.noN.unique.filter.good.filter.fasta, \
name=data/tmp/maxee1.0.noN.names)"

[H[2J





mothur v.1.32.1
Last updated: 10/16/2013

by
Patrick D. Schloss

Department of Microbiology & Immunology
University of Michigan
pschloss@umich.edu
http://www.mothur.org

When using, please cite:
Schloss, P.D., et al., Introducing mothur: Open-source, platform-independent, community-supported software for describing and comparing microbial communities. Appl Environ Microbiol, 2009. 75(23):7537-41.

Distributed under the GNU General Public License

Type 'help()' for information on the commands that are available

Type 'quit()' to exit program



mothur > summary.seqs(fasta=data/tmp/maxee1.0.noN.unique.filter.good.filter.fasta, name=data/tmp/maxee1.0.noN.names)

Using 1 processors.
[ERROR]: '13X.PTH.D1.R4_Frac15_0' is not in your name or count file, please correct.

mothur > quit()


# Chimera Checking w Uchime (when assigning OTUs- in other notebook)

#### Expand data, remove gaps, and copy into data directory

In [18]:
%%bash
ionice -c2 -n3 \
mothur "#deunique.seqs(fasta=data/tmp/maxee1.0.noN.unique.filter.good.filter.fasta, \
name=data/tmp/maxee1.0.noN.good.names)" 

[H[2J





mothur v.1.32.1
Last updated: 10/16/2013

by
Patrick D. Schloss

Department of Microbiology & Immunology
University of Michigan
pschloss@umich.edu
http://www.mothur.org

When using, please cite:
Schloss, P.D., et al., Introducing mothur: Open-source, platform-independent, community-supported software for describing and comparing microbial communities. Appl Environ Microbiol, 2009. 75(23):7537-41.

Distributed under the GNU General Public License

Type 'help()' for information on the commands that are available

Type 'quit()' to exit program



mothur > deunique.seqs(fasta=data/tmp/maxee1.0.noN.unique.filter.good.filter.fasta, name=data/tmp/maxee1.0.noN.good.names)
[ERROR]: data/tmp/maxee1.0.noN.good.names is blank, aborting.
You have no current namefile and the name parameter is required.
[ERROR]: did not complete deunique.seqs.

mothur > quit()


In [13]:
!sed '/>/! s/-//g;/>/! s/\.//g' data/tmp/maxee1.0.noN.redundant.fasta > data/finalQC_pool3.fasta

In [14]:
!head data/finalQC_pool3.fasta

>13X.PTH.D1.R4_Frac15_0
TACGGAGGGAGCTAGCGTTGTGCTGGATGACTGGGCGTAAAGCGCACGTAGGCGGAACAGAAAGTCAGAGGTGACATCCCAGGGCTCAACCTTGGAACTGCCTTTGAAACTCCTGTGCTTGAGGTCGGGAGAGGTGAGTGGAATTCCGAGGGTAGAGGTGGAATTCGTAGATATTCGGAGGAACACCAGGGGCGAAGGCGGCTCACTGGCTCGATACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG
>13X.PTH.D1.R4_Frac7_1
TACGTAGGTGGCAAGCGTTATCCGGACTTACGGGGCGTAGAGCGCGCGCAGGTGGTTGCTCAAGTCTGATGTGAAAGCCCACGGCTCAACCGTGGAGGGTCATTGGAAACTGGGAGACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTGAAATGCGTAGAGATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGTAACTGACACTGAGGCGCGAAAGCGTGGGGAGCAAACAGG
>13X.NTH.D14.R4_Frac9_2
GACGGAGGGTGCAAGCGGTGTTCGGACTTACTGGGCGTAAAGCGCGGGCAGGTGGTCTTTTAAGTCCTTTGTGAAATCCCGGGGCTCAACTCCGGATGTGCATAGGATACTGGAAGACTGGAGACTGGTAGAGGCGAGCGGAATTCCTGGTGTAGCGGTGGAATGCGTAGATATCAGGAAGAACACCAGTGGCGAAGGCGACTACCTGGCCTGTTCTTGACGCTGAGGCGCGAAAGCTAGGGGAGCAAACGGG
>12C.NTH.D30.R4_Frac4_3
AACGTCGGGTGCAAGCGTTAATCGGAAGTACTGGGCGTAAAGCGAGCGCAGGCGGTCTTGCAAGACAGATGTGAAATCCCCGGGCTTCGCCTGGGAACTGCATTTGTGACTGCAAGGCTGGAGTGCGGCAGAGGGG

#### Total Number of QC'd sequences

In [16]:
!grep -c ">" data/finalQC_pool3.fasta

1752418
