In [1]:
# Imports a parser from cogent
from cogent.parse.fasta import MinimalFastaParser as parse

In [2]:
# applies for the whole segment
nprocs = 10

In [3]:
# Checking out data file.
# This file was created using the QC_basic notebook.
!head data/finalQC.fasta

>13X.NTH.Day7.Rep3_0
TACGTAGGTGGCAAGCGTTATCCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGCTTTGTAAGTCAGAGGTGAAAGCCTGGAGCTCAACTCCAGAACTGCCTTTGAGACTGCATCGCTTGAATCCAGGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGCTATTCGGAAGAACACCAGTGGCGAAGGCGGCTCACTGGACTGGTATTGCCGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGATAACTAGCTGTCCGGGCACTTGGTGCTTGGGTGGCGCAGCTAACGCATTAAGTTATCCGCCTGGGGAGTACGGTCGCAAGGTTG
>13C.NTH.Day7.Rep1_5
TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATATTTAAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCTTTGATACTGGGTATCTCGAGTATGGAAGAGGTGAGTGGAATTCCGGGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGTCCATTACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACGGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAATGTTAGCCGTCGGCATGCATGCATGTCGGTGGCGCAGCTAACGCATTAAACATTCCGCCTGGGGAGTACGGTCGCAAGATTG
>13X.PTH.Day3.Rep2_6
TACGGAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCGCGTAGGCGGCTTTGTAAGTTAGAGGTGAAAGCCCGGAGCTCAACTCCGGAATTGCCTTTAAGACTGCATCGCTAGAATCATGGAGAGGGGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAAGAACA

In [None]:
%%bash
# Running the seq separation on mothur instead

mothur "#unique.seqs(fasta=data/finalQC.fasta)" > /dev/null

In [None]:
# Making a dictionary of the names files, splitting it into the first (ID) and second (commas list of all seqs in it)
# Then it counts their lengths and saves it in the dictionary

counts = {}

with open("data/finalQC.names") as f:
    for line in f:
        seedID, seqIDs = line.split("\t")
        count = len(seqIDs.split(","))
        counts[seedID] = count        

In [None]:
# Adds the counts from this dictionary to our finalQC.unique file so it looks like a usearch file with "size=XXX"

with open("data/finalQC.unique.usearch_names.fasta", "w") as f:
    for n, s in parse(open("data/finalQC.unique.fasta")):
        f.write(">%s;size=%s;\n%s\n"%(n,counts[n],s))  

In [None]:
!head data/finalQC.unique.usearch_names.fasta

In [None]:
# Chuck looking to see how many Gb this file is.
!du -h data/finalQC.fasta

In [None]:
# Sequences are sorted by size
# Here the size of clusters - we are excluding the singletons here
# You would change minsize to 1 if you wanted to include singletons
# Or, you know, just not do this step.
# But you should just get rid of them.
!usearch -sortbysize data/finalQC.unique.usearch_names.fasta -output data/finalQC.unique_sorted.fasta -minsize 2

In [None]:
# Checking data
# You can see here, the first two sequences we saw above are now gone.
!head data/finalQC.unique_sorted.fasta

In [None]:
# This is the clustering command.
# Default is 97% minimum ID.
# Not recommended to use more than 97%.
# Creates the centroids, or "seeds"
# Then you can take them out
!usearch -cluster_otus data/finalQC.unique_sorted.fasta -otus data/otus.fasta

In [None]:
# Making another file
# Figure this out (what is this?)
# This is a script (fasta_number.py) that replaces fasta names with XXX1, XXX2, etc.
# In our case, it is replacing the names with OTU.1, OTU.2, etc., and outputs it into a file called otusn.fasta
!/opt/bioinfo/edgar_python_scripts/fasta_number.py data/otus.fasta OTU. > data/otusn.fasta

In [None]:
!head data/otusn.fasta

### Removing Chloroplast, Eukaryal, and Archaeal sequences

In [None]:
# You do need to assign taxonomy in order to pull out the Euks., etc.
# I could cp these files from the server to wherever I need them.
# Input is your fasta file
# Output is a fasta with taxonomy assinged (still working with unique seqs)
# This didn't work first, because Chuck had to delete a "jobs" folder in the tmp directory.

!parallel_assign_taxonomy_uclust.py \
-r /home/chantal/RNASIP/RNAdata/databases/Silva_111_post/no_ambiguous_bases_files/97_Silva_111_rep_set_no_ambig.fasta \
-t /home/chantal/RNASIP/RNAdata/databases/Silva_111_post/taxonomy/Silva_111_taxa_map_full.txt \
-O 10 \
-i /home/chantal/Chazy/BulkSIP/data/otusn.fasta \
-o /home/chantal/Chazy/data/otusn_tax

In [None]:
# This makes a file of what we want to remove
# Could change this to pull out different groups.
# These primers actually had good Archaeal targets - so, it would be okay to include them.
!egrep "Chloroplast|Eukaryota|Archaea|mitochondria" \
data/otusn_tax/otusn_tax_assignments.txt \
| awk '{print $1}' > data/to_remove_tax.accnos

In [None]:
# wc is number of lines of the taxa that will be removed
!wc -l data/to_remove_tax.accnos

In [82]:
# Looking at what you're removing
!head data/to_remove_tax.accnos

OTU.1370
OTU.903
OTU.1308
OTU.995
OTU.994
OTU.765
OTU.768
OTU.572
OTU.1334
OTU.1190


In [83]:
%%bash
# Remove.seqs command will actually remove these taxa
mothur "#remove.seqs(fasta=data/otusn.fasta, \
accnos=data/to_remove_tax.accnos)" #> /dev/null

[H[2J





mothur v.1.32.1
Last updated: 10/16/2013

by
Patrick D. Schloss

Department of Microbiology & Immunology
University of Michigan
pschloss@umich.edu
http://www.mothur.org

When using, please cite:
Schloss, P.D., et al., Introducing mothur: Open-source, platform-independent, community-supported software for describing and comparing microbial communities. Appl Environ Microbiol, 2009. 75(23):7537-41.

Distributed under the GNU General Public License

Type 'help()' for information on the commands that are available

Type 'quit()' to exit program



mothur > remove.seqs(fasta=data/otusn.fasta, accnos=data/to_remove_tax.accnos)
Removed 4484 sequences from your fasta file.

Output File Names: 
data/otusn.pick.fasta


mothur > quit()


### Mapping Reads

In [85]:
# Pulling out the sample identifier.
# It is adding a portion to the finalQC file that has the barcode label.
# Then we can use this later
# Now we will see how these reads map to the defined centroids (after removing EuK, etc.)
# Basically, we cut, cut, refined our fasta to make our OTU centroids.
# THEN, we went back to our original QC'd total fasta file and will throw it all against these nicely defined seeds.
# Anything that doesn't match, we won't keep.
!awk -F"_" \
'BEGIN{OFS=";"}{ if ( substr($1,0,1) == ">"){ print $0,"barcodelabel=",$1 } else { print $0 } }' \
data/finalQC.fasta | \
sed 's/;>//' > data/finalQC_usearchfmt.fasta

In [86]:
!head data/finalQC_usearchfmt.fasta

>NTH1_23;barcodelabel=NTH1
TACGAACCGGGCAAACGTTATTCGGAATTACTGGGCTTAAAGGGTGCGTAGGCTGCGCTATAAGTCGGGTGTGAAATCCCTCAGCTCAACTGAGGAACTGCGCCCGATACTGTAGTGCTTGAGGAGGATAGAGGTGAGCGGAACTAGCAGTGGAGCGGTGAAATGCGTTGATATTGCTGGGAACACCCGTGGCGAAAGCGGCTCACTGGGTCCTTTCTGACGCTGAGGCACGAAAGCTAGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCTAGCCCTAAACGATCAGGACTTGACGTGGGCCCCGTCCGGGGTCCGTGCCGTAGCCAACGTGATAAGTCCTGCGCCTGGGGAGTACGGTCGCAAGGCTG
>NTR1_26;barcodelabel=NTR1
TACAGAGGTGGCAAGCGTTGTTCGGAATTACTGGGCGTAAAGGGCGCGTAGGCGGCCATCTAAGTCAGACGTGAAATCCCCCGGCTCAACCTGGGAACTGCGTCTGATACTGGACGGCTTGAGTTTGGGAGAGGGATGTAGAATTCCAGGTGTAGCGGTGAAATGCGTAGATATCTGGAGGAATACCGGTGGCGAAGGCGGCATCCTGGACCAATACTGACGCTGGGGCGCGAAAGCTAGGGGAGCAAACGGGATTAGATACCCTGGTAGTCCACGCCCTAAACGATGGATACTCGACATCAGCGATACACTGTTGGTGTCTGAGCGAAAGCATTAAGTATCCCGCCTGGGAAGTACGATCGCAAGATTA
>NTH3_27;barcodelabel=NTH3
TACAGAGGTGGCAAGCGTTGTTCGGAATTACTGGGCGTAAAGGGCGCGTAGGCGGCCATCTAAGTCAGACGTGAAATCCCCCGGCTTAACCTGGGAACTGCGTCTGATACTGGAAGGCTTGAGTATGGGAGAGGGATGTAGAATTCCAGGTGTAGCGGTGAAATGCGTAG

In [87]:
# This is where the actual OTUs are being assigned. We choose 97% sequence ID threshold here.
# This might take a while - like 5 minutes
# Depending ont he clustering algorithm, like pairwise... it would take, like, days on the same number of processors.
# This is why usearch (centroid-based) is so much better
# But is it more biologically relevant? ... maybe, maybe not.
# Edgar is showing it's not that bad.

# We take our total QC data (modified above to have the sample ID extracted)
# We compare it to the otusn.pick.fasta database we made above
# We produce a readmap.uc file which tells us how the reads from our finalQC file map to the otusn seed database.

!usearch -usearch_global data/finalQC_usearchfmt.fasta \
-db data/otusn.fasta \
-strand plus -id 0.97 \
-uc data/readmap.uc \
-threads 15

usearch v7.0.1090_i86linux32, 4.0Gb RAM (132Gb total), 40 cores
(C) Copyright 2013 Robert C. Edgar, all rights reserved.
http://drive5.com/usearch

Licensed to: chuck.peperanney@gmail.com

00:00  19Mb Reading data/otusn.fasta, 5.3Mb
00:00  24Mb 13729 (13.7k) seqs, min 370, avg 374, max 427nt
00:00  24Mb  100.0% Masking
00:01  25Mb  100.0% Word stats
00:01  45Mb  100.0% Building slots
00:01  45Mb  100.0% Build index
01:34 207Mb  100.0% Searching, 45.6% matched


In [88]:
# Makes an OTU table
# It will tell me the OTU ID, and then for all the samples, which OTUs it has sequences from.
!python /opt/bioinfo/edgar_python_scripts/uc2otutab.py data/readmap.uc > data/otu_table.txt

data/readmap.uc 100.0%   


In [89]:
# Issues with biom table formatting
!if [ -f data/otu_table.biom ]; then rm data/otu_table.biom; fi #This is to mitigate a biom bug
!biom convert -i data/otu_table.txt -o data/otu_table.biom --table-type "otu table"

In [90]:
# Issues with biom table formatting
!if [ -f data/otu_table_summary.txt ]; then rm data/otu_table_summary.txt; fi #This is to mitigate a biom bug
!biom summarize-table -i data/otu_table.biom -o data/otu_table_summary.txt

In [91]:
# This tells us the overall data info
# Num obs = OTUs
# total count = total seqs
# Chantal had 50% reduction after QC.

!cat data/otu_table_summary.txt



Num samples: 32
Num observations: 13729
Total count: 2468002
Table density (fraction of non-zero values): 0.277
Table md5 (unzipped): 7b22cb5d411aaeb81343e9b39dcbaf17

Counts/sample summary:
 Min: 57819.0
 Max: 99704.0
 Median: 77831.500
 Mean: 77125.062
 Std. dev.: 10612.260
 Sample Metadata Categories: None provided
 Observation Metadata Categories: None provided

Counts/sample detail:
 NTH2: 57819.0
 NTR2: 58315.0
 PTR2: 62237.0
 PAS2: 64185.0
 PTR1: 66182.0
 TWE2: 66583.0
 CON2: 67461.0
 NTR1: 68093.0
 TWE1: 69366.0
 PTH1: 70713.0
 CON3: 71164.0
 PTH2: 71939.0
 PTR4: 72814.0
 NTH1: 73658.0
 CON1: 74477.0
 TEN3: 77356.0
 TEN2: 78307.0
 PAS1: 78952.0
 PAS3: 79448.0
 CON4: 80006.0
 PTH4: 80279.0
 TEN1: 81973.0
 NTR3: 82626.0
 NTR4: 85590.0
 TWE3: 86171.0
 PTH3: 86253.0
 NTH3: 88705.0
 TWE4: 88937.0
 PTR3: 90283.0
 NTH4: 90878.0
 PAS4: 97528.0
 TEN4: 99704.0
