In [1]:
# Imports a parser from cogent
from cogent.parse.fasta import MinimalFastaParser as parse

In [2]:
# applies for the whole segment
nprocs = 10

In [3]:
# Checking out data file.
# This file was created using the QC_basic notebook.
!head data/finalQC.fasta

>13X.NTH.Day7.Rep3_0
TACGTAGGTGGCAAGCGTTATCCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGCTTTGTAAGTCAGAGGTGAAAGCCTGGAGCTCAACTCCAGAACTGCCTTTGAGACTGCATCGCTTGAATCCAGGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGCTATTCGGAAGAACACCAGTGGCGAAGGCGGCTCACTGGACTGGTATTGCCGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGATAACTAGCTGTCCGGGCACTTGGTGCTTGGGTGGCGCAGCTAACGCATTAAGTTATCCGCCTGGGGAGTACGGTCGCAAGGTTG
>13C.NTH.Day7.Rep1_5
TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATATTTAAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCTTTGATACTGGGTATCTCGAGTATGGAAGAGGTGAGTGGAATTCCGGGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGTCCATTACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACGGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATGTTAGCCGTCGGCATGCATGCATGTCGGTGGCGCAGCTAACGCATTAAACATTCCGCCTGGGGAGTACGGTCGCAAGATTG
>13X.PTH.Day3.Rep2_6
TACGGAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCGCGTAGGCGGCTTTGTAAGTTAGAGGTGAAAGCCCGGAGCTCAACTCCGGAATTGCCTTTAAGACTGCATCGCTAGAATCATGGAGAGGGGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAAGAACA

In [4]:
%%bash
# Running the seq separation on mothur instead

mothur "#unique.seqs(fasta=data/finalQC.fasta)" > /dev/null

In [5]:
# Making a dictionary of the names files, splitting it into the first (ID) and second (commas list of all seqs in it)
# Then it counts their lengths and saves it in the dictionary

counts = {}

with open("data/finalQC.names") as f:
    for line in f:
        seedID, seqIDs = line.split("\t")
        count = len(seqIDs.split(","))
        counts[seedID] = count        

In [6]:
# Adds the counts from this dictionary to our finalQC.unique file so it looks like a usearch file with "size=XXX"

with open("data/finalQC.unique.usearch_names.fasta", "w") as f:
    for n, s in parse(open("data/finalQC.unique.fasta")):
        f.write(">%s;size=%s;\n%s\n"%(n,counts[n],s))  

In [7]:
!head data/finalQC.unique.usearch_names.fasta

>13X.NTH.Day7.Rep3_0;size=1;
TACGTAGGTGGCAAGCGTTATCCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGCTTTGTAAGTCAGAGGTGAAAGCCTGGAGCTCAACTCCAGAACTGCCTTTGAGACTGCATCGCTTGAATCCAGGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGCTATTCGGAAGAACACCAGTGGCGAAGGCGGCTCACTGGACTGGTATTGCCGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGATAACTAGCTGTCCGGGCACTTGGTGCTTGGGTGGCGCAGCTAACGCATTAAGTTATCCGCCTGGGGAGTACGGTCGCAAGGTTG
>13C.NTH.Day7.Rep1_5;size=1;
TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATATTTAAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCTTTGATACTGGGTATCTCGAGTATGGAAGAGGTGAGTGGAATTCCGGGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGTCCATTACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACGGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATGTTAGCCGTCGGCATGCATGCATGTCGGTGGCGCAGCTAACGCATTAAACATTCCGCCTGGGGAGTACGGTCGCAAGATTG
>13X.PTH.Day3.Rep2_6;size=1;
TACGGAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCGCGTAGGCGGCTTTGTAAGTTAGAGGTGAAAGCCCGGAGCTCAACTCCGGAATTGCCTTTAAGACTGCATCGCTAGAATCATGGAGAGGGGAGTGGAATTCCGAGTGTAGAGGTGA

In [8]:
# Chuck looking to see how many Gb this file is.
!du -h data/finalQC.fasta

1.2G	data/finalQC.fasta


In [9]:
# Sequences are sorted by size
# Here the size of clusters - we are excluding the singletons here
# You would change minsize to 1 if you wanted to include singletons
# Or, you know, just not do this step.
# But you should just get rid of them.
!usearch -sortbysize data/finalQC.unique.usearch_names.fasta -output data/finalQC.unique_sorted.fasta -minsize 2

usearch v7.0.1090_i86linux32, 4.0Gb RAM (132Gb total), 40 cores
(C) Copyright 2013 Robert C. Edgar, all rights reserved.
http://drive5.com/usearch

Licensed to: chuck.peperanney@gmail.com

00:04 1.2Gb  100.0% Reading data/finalQC.unique.usearch_names.fasta
00:04 1.2Gb Getting sizes                                          
00:07 1.2Gb Sorting 62982 sequences
00:07 1.2Gb  100.0% Writing data/finalQC.unique_sorted.fasta


In [10]:
# Checking data
# You can see here, the first two sequences we saw above are now gone.
!head data/finalQC.unique_sorted.fasta

>12C.PTH.Day1.Rep4_4611;size=1821;
TACGTAGGTGGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGCGCGCGCAGGTGGTTCCTTAAGTCTGATGTGAAAGCCC
ACGGCTCAACCGTGGAGGGTCATTGGAAACTGGGGAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCAAGTGTAGCGGTG
AAATGCGTAGAGATTTGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGTAACTGACACTGAGGCGCGAAAGCGTG
GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAGTGCTAAGTGTTAGAGGGTTTCCGCCCTTTA
GTGCTGCAGCTAACGCATTAAGCACTCCGCCTGGGGAGTACGGCCGCAAGGCTG
>13C.PTH.Day14.Rep4_12341;size=1685;
TACGTAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGTGATGTAAGACAGATGTGAAATCCC
CGGGCTCAACCTGGGAACTGCATTTGTGACTGCATCGCTGGAGTGCGGCAGAGGGGGATGGAATTCCGCGTGTAGCAGTG
AAATGCGTAGATATGCGGAGGAACACCGATGGCGAAGGCAATCCCCTGGGCCTGCACTGACGCTCATGCACGAAAGCGTG


In [11]:
# This is the clustering command.
# Default is 97% minimum ID.
# Not recommended to use more than 97%.
# Creates the centroids, or "seeds"
# Then you can take them out
!usearch -cluster_otus data/finalQC.unique_sorted.fasta -otus data/otus.fasta

usearch v7.0.1090_i86linux32, 4.0Gb RAM (132Gb total), 40 cores
(C) Copyright 2013 Robert C. Edgar, all rights reserved.
http://drive5.com/usearch

Licensed to: chuck.peperanney@gmail.com

00:21  67Mb  100.0% 3676 OTUs
                             
Input seqs  62982 (63.0k)
      OTUs  3676
   Members  46975 (47.0k)
  Chimeras  12331 (12.3k)
   Max mem  67Mb
      Time  21.0s
Throughput  2999.1 seqs/sec.



In [12]:
# Making another file
# Figure this out (what is this?)
# This is a script (fasta_number.py) that replaces fasta names with XXX1, XXX2, etc.
# In our case, it is replacing the names with OTU.1, OTU.2, etc., and outputs it into a file called otusn.fasta
!/opt/bioinfo/edgar_python_scripts/fasta_number.py data/otus.fasta OTU. > data/otusn.fasta

In [13]:
!head data/otusn.fasta

>OTU.1
TACGTAGGTGGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGCgcgcgcAGGTGGTTCCTTAAGTCTGATGTGAAAGCCC
ACGGCTCAACCGTGGAGGGTCATTGGAAACTGGGGAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCAAGTGTAGCGGTG
AAATGCGTAGAGATTTGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGTAACTGACACTGAGGCGCGAAAGCGTG
GGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAGTGCTAAGTGTTAGAGGGTTTCCGCCCTTTA
GTGCTGCAGCTAACGCATTAAGCACTCCGCCTGGGGAGTACGGCCGCAAGGCTG
>OTU.2
TACGTAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGTGATGTAAGACAGATGTGAAATCCC
CGGGCTCAACCTGGGAACTGCATTTGTGACTGCATCGCTGGAGTGCGGCAGAGGgggATGGAATTCCGCGTGTAGCAGTG
AAATGCGTAGATATGCGGAGGAACACCGATGGCGAAGGCAATCCCCTGGGCCTGCACTGACGCTCATGCACGAAAGCGTG


### Removing Chloroplast, Eukaryal, and Archaeal sequences

In [20]:
# You do need to assign taxonomy in order to pull out the Euks., etc.
# I could cp these files from the server to wherever I need them.
# Input is your fasta file
# Output is a fasta with taxonomy assinged (still working with unique seqs)
# This didn't work first, because Chuck had to delete a "jobs" folder in the tmp directory.

!parallel_assign_taxonomy_uclust.py \
-r /home/chantal/RNASIP/data/databases/Silva_111_post/no_ambiguous_bases_files/97_Silva_111_rep_set_no_ambig.fasta \
-t /home/chantal/RNASIP/data/databases/Silva_111_post/taxonomy/Silva_111_taxa_map_full.txt \
-O 10 \
-i data/otusn.fasta \
-o data/otusn_tax

In [21]:
# This makes a file of what we want to remove
# Could change this to pull out different groups.
# These primers actually had good Archaeal targets - so, it would be okay to include them.
!egrep "Chloroplast|Eukaryota|Archaea|mitochondria" \
data/otusn_tax/otusn_tax_assignments.txt \
| awk '{print $1}' > data/to_remove_tax.accnos

In [22]:
# wc is number of lines of the taxa that will be removed
!wc -l data/to_remove_tax.accnos

39 data/to_remove_tax.accnos


In [23]:
# Looking at what you're removing
!head data/to_remove_tax.accnos

OTU.151
OTU.335
OTU.293
OTU.36
OTU.329
OTU.175
OTU.58
OTU.50
OTU.256
OTU.227


In [24]:
%%bash
# Remove.seqs command will actually remove these taxa
mothur "#remove.seqs(fasta=data/otusn.fasta, \
accnos=data/to_remove_tax.accnos)" #> /dev/null

[H[2J





mothur v.1.32.1
Last updated: 10/16/2013

by
Patrick D. Schloss

Department of Microbiology & Immunology
University of Michigan
pschloss@umich.edu
http://www.mothur.org

When using, please cite:
Schloss, P.D., et al., Introducing mothur: Open-source, platform-independent, community-supported software for describing and comparing microbial communities. Appl Environ Microbiol, 2009. 75(23):7537-41.

Distributed under the GNU General Public License

Type 'help()' for information on the commands that are available

Type 'quit()' to exit program



mothur > remove.seqs(fasta=data/otusn.fasta, accnos=data/to_remove_tax.accnos)
Removed 39 sequences from your fasta file.

Output File Names: 
data/otusn.pick.fasta


mothur > quit()


### Mapping Reads

In [25]:
# Pulling out the sample identifier.
# It is adding a portion to the finalQC file that has the barcode label.
# Then we can use this later
# Now we will see how these reads map to the defined centroids (after removing EuK, etc.)
# Basically, we cut, cut, refined our fasta to make our OTU centroids.
# THEN, we went back to our original QC'd total fasta file and will throw it all against these nicely defined seeds.
# Anything that doesn't match, we won't keep.
!awk -F"_" \
'BEGIN{OFS=";"}{ if ( substr($1,0,1) == ">"){ print $0,"barcodelabel=",$1 } else { print $0 } }' \
data/finalQC.fasta | \
sed 's/;>//' > data/finalQC_usearchfmt.fasta

In [26]:
!head data/finalQC_usearchfmt.fasta

>13X.NTH.Day7.Rep3_0;barcodelabel=13X.NTH.Day7.Rep3
TACGTAGGTGGCAAGCGTTATCCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGCTTTGTAAGTCAGAGGTGAAAGCCTGGAGCTCAACTCCAGAACTGCCTTTGAGACTGCATCGCTTGAATCCAGGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGCTATTCGGAAGAACACCAGTGGCGAAGGCGGCTCACTGGACTGGTATTGCCGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGATAACTAGCTGTCCGGGCACTTGGTGCTTGGGTGGCGCAGCTAACGCATTAAGTTATCCGCCTGGGGAGTACGGTCGCAAGGTTG
>13C.NTH.Day7.Rep1_5;barcodelabel=13C.NTH.Day7.Rep1
TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATATTTAAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCTTTGATACTGGGTATCTCGAGTATGGAAGAGGTGAGTGGAATTCCGGGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGTCCATTACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACGGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATGTTAGCCGTCGGCATGCATGCATGTCGGTGGCGCAGCTAACGCATTAAACATTCCGCCTGGGGAGTACGGTCGCAAGATTG
>13X.PTH.Day3.Rep2_6;barcodelabel=13X.PTH.Day3.Rep2
TACGGAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCGCGTAGGCGGCTTTGTAAGTTAGAGGTGAAAGCCCGGAGCTCAACTC

In [27]:
# This is where the actual OTUs are being assigned. We choose 97% sequence ID threshold here.
# This might take a while - like 5 minutes
# Depending ont he clustering algorithm, like pairwise... it would take, like, days on the same number of processors.
# This is why usearch (centroid-based) is so much better
# But is it more biologically relevant? ... maybe, maybe not.
# Edgar is showing it's not that bad.

# We take our total QC data (modified above to have the sample ID extracted)
# We compare it to the otusn.pick.fasta database we made above
# We produce a readmap.uc file which tells us how the reads from our finalQC file map to the otusn seed database.

!usearch -usearch_global data/finalQC_usearchfmt.fasta \
-db data/otusn.fasta \
-strand plus -id 0.97 \
-uc data/readmap.uc \
-threads 15

usearch v7.0.1090_i86linux32, 4.0Gb RAM (132Gb total), 40 cores
(C) Copyright 2013 Robert C. Edgar, all rights reserved.
http://drive5.com/usearch

Licensed to: chuck.peperanney@gmail.com

00:00  19Mb Reading data/otusn.fasta, 1.4Mb
00:00  20Mb 3676 seqs, min 370, avg 373, max 410nt
00:00  20Mb  100.0% Masking
00:00  21Mb  100.0% Word stats
00:00  27Mb  100.0% Building slots
00:00  27Mb  100.0% Build index
00:45 175Mb  100.0% Searching, 40.4% matched


In [28]:
# Makes an OTU table
# It will tell me the OTU ID, and then for all the samples, which OTUs it has sequences from.
!python /opt/bioinfo/edgar_python_scripts/uc2otutab.py data/readmap.uc > data/otu_table.txt

data/readmap.uc 100.0%   


In [29]:
# Issues with biom table formatting
!if [ -f data/otu_table.biom ]; then rm data/otu_table.biom; fi #This is to mitigate a biom bug
!biom convert -i data/otu_table.txt -o data/otu_table.biom --table-type "otu table"

In [30]:
# Issues with biom table formatting
!if [ -f data/otu_table_summary.txt ]; then rm data/otu_table_summary.txt; fi #This is to mitigate a biom bug
!biom summarize-table -i data/otu_table.biom -o data/otu_table_summary.txt

In [31]:
# This tells us the overall data info
# Num obs = OTUs
# total count = total seqs
# Chantal had 50% reduction after QC.

!cat data/otu_table_summary.txt



Num samples: 112
Num observations: 3676
Total count: 1201884
Table density (fraction of non-zero values): 0.310
Table md5 (unzipped): 2c08138502078396f94ba12e05b75cc7

Counts/sample summary:
 Min: 5687.0
 Max: 24329.0
 Median: 10519.500
 Mean: 10731.107
 Std. dev.: 2781.967
 Sample Metadata Categories: None provided
 Observation Metadata Categories: None provided

Counts/sample detail:
 13X.PTH.Day1.Rep4: 5687.0
 12C.NTH.Day1.Rep1: 6472.0
 13C.PTH.Day3.Rep4: 6616.0
 13C.PTH.Day14.Rep1: 6941.0
 13C.NTH.Day7.Rep2: 6979.0
 13X.PTH.Day1.Rep3: 7114.0
 13X.NTH.Day1.Rep3: 7244.0
 12C.NTH.Day14.Rep4: 7339.0
 13X.NTH.Day14.Rep4: 7504.0
 13C.PTH.Day3.Rep3: 7522.0
 H2O.PTH.Rep1: 7533.0
 12C.NTH.Day3.Rep1: 7625.0
 13C.NTH.Day14.Rep4: 7781.0
 12C.NTH.Day30.Rep4: 7794.0
 12C.PTH.Day14.Rep4: 7848.0
 13X.NTH.Day3.Rep2: 7876.0
 13C.PTH.Day7.Rep1: 8130.0
 12C.NTH.Day7.Rep1: 8285.0
 13C.PTH.Day14.Rep3: 8306.0
 13C.NTH.Day30.Rep3: 8374.0
 12C.NTH.Day30.Rep2: 8399.0
 12