# Step 1: User Input

In [1]:
import os, re, numpy as np

# Provide the directory for your index and read files (you can do multiple independently in one go)
growthrate = '/home/cassi/dunlop_popdynamics'
# CW: this is the directory where I want to put my outputs from the pipeline

# Prepare an object with the name of the library, the name of the directory object (created above), and the metadatafile name
#datasets = [['name',directory1,'metadata1','domain of life',raw reads directory],['name',directory2,'metadata2','domain of life']]
datasets = [['growthrate', growthrate,'bacteria']]

# Set # of Processors to Use
processors = "20"
# CW: there are 40 processers total, check with others to see how many are being used, affects speed of pipeline

# Classification Database to Use 
# options: "Silva" [default] | "GreenGenes" 
db = "Silva"
# CW: use Silva for bacteria, need UNITE or ITS1db databases for fungi

## Enter Minimum Support for Keeping QIIME Classification
# Note: Classifications that do not meet this criteria will simply be retained, but labeled 'putative'
min_support = 0.8

# Step 10: Classify Seqs

### Example of creating Silva Classifier DB  (very slow: use a pre-built one if possible)

#### For Silva, remove all the alignment information (unsure of the impact of keeping it) using the following python code:
```python 
import re
from Bio import SeqIO

output = open("silva.nr_v128.fasta", "w")

for record in SeqIO.parse(open("silva.nr_v128.align", "rU"), "fasta") :
    seq = str(record.seq)
    seq = re.sub("\.|-","",seq)  # Remove "." and "-"

    output.write(">"+record.id+"\n"+seq+"\n")```

#### Import fasta sequence file and taxonomy file as .qza
```bash
qiime tools import
  --type 'FeatureData[Sequence]'
  --input-path silva.nr_v128.fasta
  --output-path silva.nr_v128.qza

qiime tools import 
  --type 'FeatureData[Taxonomy]' 
  --source-format HeaderlessTSVTaxonomyFormat 
  --input-path silva.nr_v128.tax 
  --output-path silva.nr_v128.taxonomy.qza```

#### Run QIIME2 'fit-classifier-naive-bayes'
```bash
qiime feature-classifier fit-classifier-naive-bayes 
  --i-reference-reads silva.nr_v128.qza 
  --i-reference-taxonomy silva.nr_v128.taxonomy.qza 
  --o-classifier silva.nr_v128.nb.classifier.qza```

In [3]:
## Note: Different QIIME2 versions can conflict with previously donwloaded databases. This section might have to be updated.
classification_db = "/home/cassi/databases/silva/silva-138-99-515-806-nb-classifier.qza"
        
for dataset in datasets:
    name = dataset[0]
    directory = dataset[1]
    domain = dataset[2]

    # Classify
    os.system(' '.join([
        "qiime feature-classifier classify-sklearn",
        "--i-classifier",
        classification_db,
        "--i-reads "+directory+"/output/"+name+".rep.seqs.final.qza",
        "--o-classification "+directory+"/reclassify_silva138/"+name+".taxonomy.final.qza",
        "--p-n-jobs",
        processors
    ]))

    # Output Summary
    os.system(' '.join([
        "qiime metadata tabulate",
        "--m-input-file "+directory+"/reclassify_silva138/"+name+".taxonomy.final.qza",
        "--o-visualization "+directory+"/reclassify_silva138/"+name+".taxonomy.final.summary.qzv"
    ])) 

Saved FeatureData[Taxonomy] to: /home/cassi/dunlop_popdynamics/reclassify_silva138/growthrate.taxonomy.final.qza
Saved Visualization to: /home/cassi/dunlop_popdynamics/reclassify_silva138/growthrate.taxonomy.final.summary.qzv


In [4]:
## Make Function to Re-Format Taxonomy File to Contain Full Column Information 
# and factor in the certain of the taxonomic assignment

def format_taxonomy(tax_file, classification_db, min_support):
    output = open(re.sub(".tsv",".fixed.tsv",tax_file), "w")

 
    # Silva db lacks species classifications
    if classification_db == "GreenGenes":
        full_rank_length = 7
        output.write("\t".join(["OTU","Domain","Phylum","Class","Order","Family","Genus","Species"])+"\n")
    else:
        full_rank_length = 6  
        output.write("\t".join(["OTU","Domain","Phylum","Class","Order","Family","Genus"])+"\n")
        
    with open(tax_file, "r") as f:
        next(f) #skip header

        for line in f:
            line = line.strip()
            line = line.split("\t")

            read_id = line[0]
            tax_string = line[1]

            ## Remove All Underscore Garbage (I need aesthetics)
            if classification_db == "GreenGenes":
                tax_string = re.sub("k__|p__|c__|o__|f__|g__|s__","",tax_string)
            else:
                tax_string = re.sub("_cl|_or|_fa|_ge","",tax_string)
            
            # Split full rank into ranks
            full_rank = tax_string.split(";")
          
            # Getting trailing empty tab in Silva
            if full_rank[len(full_rank)-1] == "":
                    full_rank = full_rank[:-1]
                    
            ## Identify the Lowest Classified Taxonomic Rank
            # Account for cases when a taxonomic rank contains an empty space (common in GreenGenes output)
            last_classified = full_rank[len(full_rank)-1]            

            count = 1
            while last_classified == " ":
                last_classified = full_rank[len(full_rank)-count]
                count = count + 1

            # Annotate the last classified as 'putative' if it does not meet the minimum support criteria
            # Older versions of this script contain code to designate all taxonomic ranks as 'putative' in this case, but 
            # this seems conservative
            if float(line[2]) < float(min_support):
                    full_rank[full_rank.index(last_classified)] = "putative "+last_classified
                    last_classified = "putative "+last_classified
                    
            # Add in columns containing unclassified taxonomic information
            try: # In Silva, many classifications are a single entry (which breaks from the reliance on lists for full_rank.index)
                for n in range(full_rank.index(last_classified)+1, full_rank_length, 1):               
                    try:
                        full_rank[n] = "unclassified "+last_classified
                    except:
                        full_rank.append("unclassified "+last_classified)
            except:
                for n in range(0, full_rank_length, 1):               
                    try:
                        full_rank[n] = "unclassified "+last_classified
                    except:
                        full_rank.append("unclassified "+last_classified)
                    
            # Clean-up the trailing whitespace introduced in Silva classification 
            if classification_db == "Silva":
                full_rank = [x.strip(' ') for x in full_rank]

            # Write Taxonomy to File
            output.write(read_id+"\t"+'\t'.join(full_rank)+"\n")
            
    return()

In [7]:
#####################
## Export from QIIME2

# CW: had to remove first / in front of every output

for dataset in datasets:
    name = dataset[0]
    directory = dataset[1]
    domain = dataset[2]

    # Export Classifications
    os.system(' '.join([
        "qiime tools export",
        "--input-path "+directory+"/reclassify_silva138/"+name+".taxonomy.final.qza",
        "--output-path "+directory+"/reclassify_silva138/"
    ]))
    
    # Reformat Classifications to meet phyloseq format   
    format_taxonomy(directory+"/reclassify_silva138/taxonomy.tsv", db, min_support)

    # Rename Exported Files
    %mv $directory/output/taxonomy.fixed.tsv $tax_file

Exported /home/cassi/dunlop_popdynamics/reclassify_silva138/growthrate.taxonomy.final.qza as TSVTaxonomyDirectoryFormat to directory /home/cassi/dunlop_popdynamics/reclassify_silva138/
mv: missing destination file operand after '/output/taxonomy.fixed.tsv'
Try 'mv --help' for more information.
