In [1]:
!cat diatomPipeline.dms

#!/bin/bash
# Diatom amplicon pipeline: build the BLAST reference database, run amplicon
# QC, pick OTUs and a representative set with QIIME, BLAST the representative
# sequences against the reference and derive taxonomy assignments.
#
# Usage: diatomPipeline.dms <data_dir>
#   <data_dir>  directory passed to ampliconQC.py via --data. The read counts,
#               OTUs, representative set, BLAST hits and taxonomy assignments
#               are written beneath it. The BLAST database itself ("diatoms")
#               is written to the current working directory.
#
# Environment:
#   THREADS     thread count handed to ampliconQC.py and blastn (default: 130).

# Every step consumes the previous step's output, so stop at the first failure
# instead of running the remaining tools on missing or partial files.
set -euo pipefail

if [ "$#" -lt 1 ]; then
    echo "Usage: $0 <data_dir>" >&2
    exit 1
fi

data_dir=$1
threads=${THREADS:-130}

echo " Pipeline started: $(date)"
makeblastdb -in diatoms.sequences.FINAL2017.fasta -out diatoms -dbtype nucl

echo " QC started: $(date)"
python ./ampliconQC.py --data "$data_dir" --forward ATGCGTTGGAGAGARCGTTTC --reverse GATCACCTTCTAATTTACCWACAACTG --threads "$threads" --histograms --qiime

echo " QIIME Pipeline: $(date)"
# One "<file> <read count>" row per QC-passed FASTQ (a FASTQ record is four
# lines; line 2 of each record is the sequence). The file is rewritten on each
# run so repeated runs do not accumulate duplicate rows, and "sum+0" prints 0
# rather than an empty field for a file with no reads.
for file in "$data_dir"/*.passedQC.fastq; do
    awk 'NR%4==2{sum+=1}END{print FILENAME,sum+0}' "$file"
done > "$data_dir/diatomSequenceCounts.txt"

echo $(date)
pick_otus.py -i "$data_dir/readyForQiime.allsamples.fasta" -o "$data_dir/picked_otus_97"

echo $(date)
pick_rep_set.py -i "$data_dir/picked_otus_97/readyForQiime.allsamples_otus.txt" -f "$data_dir/readyForQiime.allsamples.fasta" -o "$data_dir/repset.fasta"

echo $(date)
blastn -db diatoms -query "$data_dir/repset.fasta" -out "$data_dir/repset.diatoms.blastn" -task blastn -max_target_seqs 1 -num_threads "$threads" -outfmt 6 -evalue 0.01

echo $(date)
# -p keeps a re-run from failing when the directory already exists.
mkdir -p "$data_dir/assigned_taxonomy"

echo $(date)
# The BLAST result and the output path must be passed explicitly; without them
# the script is not told which hits to read or where to write the assignments.
python ./create_taxonomy_assignments_from_blast.py --taxonomy diatoms.taxonomy.FINAL2017.txt --percid 95.0 --blast "$data_dir/repset.diatoms.blastn" --output "$data_dir/assigned_taxonomy/repset.taxonomy.txt"

### Create reference database

In [2]:
!makeblastdb -in diatoms.sequences.FINAL2017.fasta -out diatoms -dbtype nucl



Building a new DB, current time: 05/31/2019 14:11:32
New DB name:   /code/diatoms
New DB title:  diatoms.sequences.FINAL2017.fasta
Sequence type: Nucleotide
Keep Linkouts: T
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 2701 sequences in 0.189506 seconds.


#### Run quality control

In [3]:
!python ./ampliconQC.py --data sequences --forward ATGCGTTGGAGAGARCGTTTC --reverse GATCACCTTCTAATTTACCWACAACTG --threads 130 --histograms --qiime

This is cutadapt 1.9.1 with Python 2.7.16
Command line parameters: -e 0.047619047619 -b ATGCGTTGGAGAGARCGTTTC -b GATCACCTTCTAATTTACCWACAACTG -o /code/sequences/101.R2.fastq.gz.trimmed.fastq.gz /code/sequences/101.R2.fastq.gz
Trimming 2 adapters with at most 4.8% errors in single-end mode ...
Finished in 5.53 s (55 us/read; 1.10 M reads/minute).

=== Summary ===

Total reads processed:                 101,443
Reads with adapters:                    98,773 (97.4%)
Reads written (passing filters):       101,443 (100.0%)

Total basepairs processed:    30,534,343 bp
Total written (filtered):     27,867,887 bp (91.3%)

=== Adapter 1 ===

Sequence: ATGCGTTGGAGAGARCGTTTC; Type: variable 5'/3'; Length: 21; Trimmed: 19 times.
2 times, it overlapped the 5' end of a read
17 times, it overlapped the 3' end or was within the read

No. of allowed errors:
0-21 bp: 0

Overview of removed sequences (5')
length	count	expect	max.err	error counts
5	1	99.1	0	1
8	1	1.5	0	1


Over


FastQ paired records kept: 352174 (176087 pairs)
FastQ single records kept: 1532 (from PE1: 1433, from PE2: 99)
FastQ paired records discarded: 170 (85 pairs)
FastQ single records discarded: 1532 (from PE1: 99, from PE2: 1433)



FastQ paired records kept: 272720 (136360 pairs)
FastQ single records kept: 1498 (from PE1: 1385, from PE2: 113)
FastQ paired records discarded: 428 (214 pairs)
FastQ single records discarded: 1498 (from PE1: 113, from PE2: 1385)



FastQ paired records kept: 200810 (100405 pairs)
FastQ single records kept: 988 (from PE1: 926, from PE2: 62)
FastQ paired records discarded: 100 (50 pairs)
FastQ single records discarded: 988 (from PE1: 62, from PE2: 926)


 ____  _____    _    ____ 
|  _ \| ____|  / \  |  _ \
| |_) |  _|   / _ \ | |_) |
|  __/| |___ / ___ \|  _ <
|_|   |_____/_/   \_\_| \_\

PEAR v0.9.11 [Nov 5, 2017]

Citation - PEAR: a fast and accurate Illumina Paired-End reAd mergeR
Zhang et al (2014) Bioinformatics 30(5): 614-620 | doi:10.1093/bioinformatic

In [5]:
# Write one "<file> <read count>" row per QC-passed FASTQ file. The counts file
# is rewritten (not appended to), so re-running this cell does not duplicate
# rows; "sum+0" prints 0 instead of an empty field for a file with no reads.
!for file in sequences/*.passedQC.fastq; \
do \
  awk 'NR%4==2{sum+=1}END{print FILENAME,sum+0}' "$file"; \
done > sequences/diatomSequenceCounts.txt

In [6]:
!cat sequences/diatomSequenceCounts.txt

sequences/100.passedQC.fastq 119431
sequences/101.passedQC.fastq 84186
sequences/102.passedQC.fastq 152834


#### Assign similar sequences to OTUs

In [7]:
!pick_otus.py -i sequences/readyForQiime.allsamples.fasta -o sequences/picked_otus_97



#### Pick a representative set of sequences: one sequence per OTU is used in subsequent analysis

In [10]:
!pick_rep_set.py -i sequences/picked_otus_97/readyForQiime.allsamples_otus.txt \
  -f sequences/readyForQiime.allsamples.fasta \
  -o sequences/repset.fasta

In [11]:
# BLAST the representative sequences against the local "diatoms" database.
# At most one target per query is requested (-max_target_seqs 1) with an
# e-value cutoff of 0.01; hits are written to sequences/repset.diatoms.blastn.
# NOTE(review): -max_target_seqs 1 is reported not to guarantee the
# best-scoring hit in every case - confirm this is acceptable for taxonomy
# assignment.
!blastn -db diatoms -query sequences/repset.fasta \
  -out sequences/repset.diatoms.blastn \
  -task blastn -max_target_seqs 1 -num_threads 130 -outfmt 6 -evalue 0.01

In [12]:
# -p makes the cell safe to re-run when the directory already exists.
!mkdir -p sequences/assigned_taxonomy

In [13]:
# Derive taxonomy assignments for the representative set from the BLAST hits,
# using diatoms.taxonomy.FINAL2017.txt as the taxonomy file.
# NOTE(review): --percid 95.0 is presumably the minimum percent identity a hit
# needs to be assigned - confirm against create_taxonomy_assignments_from_blast.py.
!python ./create_taxonomy_assignments_from_blast.py --taxonomy diatoms.taxonomy.FINAL2017.txt \
  --percid 95.0 --blast sequences/repset.diatoms.blastn --output sequences/assigned_taxonomy/repset.taxonomy.txt 

In [14]:
# Build the BIOM OTU table from the picked-OTU map (-i) and the taxonomy
# assignments written by the previous step (-t).
!make_otu_table.py -i sequences/picked_otus_97/readyForQiime.allsamples_otus.txt \
  -t sequences/assigned_taxonomy/repset.taxonomy.txt \
  -o sequences/otu_table.biom

#### Filter non-diatom taxa out of the OTU table

In [15]:
!filter_taxa_from_otu_table.py -i sequences/otu_table.biom \
  -o sequences/otu_table.diatomsonly.biom \
  -n MARINE,NOT_DIATOM,Yellow_green_Algae,None

#### Sort OTU table

In [16]:
# Input and output paths are identical, so the diatoms-only table is
# overwritten in place with its sorted version.
# NOTE(review): no sort option is given, so the tool's default ordering
# applies - confirm that is the intended order.
!sort_otu_table.py -i sequences/otu_table.diatomsonly.biom \
  -o sequences/otu_table.diatomsonly.biom

#### Summarise taxa

In [17]:
!summarize_taxa.py -L 1 \
  -i sequences/otu_table.diatomsonly.biom \
  -o sequences/visualised_taxonomy -a

#### Produce Diatom reports

In [18]:
!python ./produceDiatomReports.py --folder sequences --lookup lookuptable.txt

Reports completed


In [23]:
import pandas as pd

In [24]:
pd.read_csv('sequences/Abundances.fail.csv')

Unnamed: 0,Taxonomy,100763785,none present,100763824


In [25]:
pd.read_csv('sequences/Abundances.pass.csv')

Unnamed: 0,Taxonomy,100763785,none present,100763824
