In [1]:
import boto3
import pandas as pd
from ete3 import NCBITaxa
import subprocess
import itertools
import os
import s3fs
import numpy as np
from lca_functions import *

In [2]:
# Bucket holding the assembled contigs and the per-sample quality outputs.
bucket_name = "czbiohub-mosquito"

# Both a resource handle and a low-level client are kept; later cells use `client`.
s3 = boto3.resource('s3')
client = boto3.client('s3')
bucket = s3.Bucket(bucket_name)

# Immediate sub-prefixes ("folders") below contigs/ and contig_quality/.
# A single list_objects call returns at most 1000 entries, so these lists are
# truncated if the bucket ever holds more sample folders than that.
contig_folders = [
    entry["Prefix"]
    for entry in client.list_objects(Bucket=bucket_name, Prefix="contigs/", Delimiter="/")["CommonPrefixes"]
]
contig_quality_folders = [
    entry["Prefix"]
    for entry in client.list_objects(Bucket=bucket_name, Prefix="contig_quality/", Delimiter="/")["CommonPrefixes"]
    if "Mos" not in entry["Prefix"]  # drop any prefix whose name contains "Mos"
]

# Number of CPUs reported by the OS; passed to run_lca_analysis as the job cap.
ncores = os.cpu_count()



In [None]:
%%bash
# download_file RELEASE
#
# Download one NCBI taxdump_archive release (RELEASE.zip) into the current
# directory and repackage it as RELEASE.tar.gz. The unpacked directory and the
# .zip are removed afterwards. Returns non-zero if any step fails.
download_file() {
    local tax_db=$1
    if [ -z "$tax_db" ]; then
        echo "download_file: missing release name" >&2
        return 1
    fi
    # Guard on the final artifact. The .zip is deleted at the end of a
    # successful run, so testing for it would re-download on every execution.
    if [ ! -f "${tax_db}.tar.gz" ]; then
        curl -O "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump_archive/${tax_db}.zip" || return 1
        unzip -d "$tax_db" "${tax_db}.zip" || return 1
        # Archive from inside the directory so members are stored without a
        # leading directory component (same layout as before). The subshell
        # keeps the caller's working directory untouched, and tar only runs
        # when the cd succeeded. Write to a temporary name first so that an
        # interrupted tar never leaves a truncated file that satisfies the guard.
        ( cd "$tax_db" && tar -czvf "../${tax_db}.tar.gz.partial" * ) || return 1
        mv "${tax_db}.tar.gz.partial" "${tax_db}.tar.gz" || return 1
        rm -rf "$tax_db" "${tax_db}.zip"
    fi
}
export -f download_file
# Releases to fetch, in order:
#   new_taxdump_2019-01-01 -- latest taxonomy release that still contains original virus taxonomy
#   new_taxdump_2019-06-01
for release in "new_taxdump_2019-01-01" "new_taxdump_2019-06-01"; do
    download_file "$release"
done


### functions

In [None]:
def run_lca_analysis (input_file_name, output_dir, bucket_name, blast_type, default=False, ncores=8):
    """Build the per-sample ``lca_analysis.py`` command list for one BLAST type.

    For every sample prefix under ``contigs/`` that has at least one key
    starting with ``<sample prefix><input_file_name>``, the output paths are
    derived from the input path and one ``python lca_analysis.py ...`` command
    line is assembled. The commands are printed, written one per line to the
    file ``lca_<blast_type>_commands`` in the working directory, and the
    matching GNU ``parallel`` invocation is printed. Nothing is executed here.

    Parameters
    ----------
    input_file_name : str
        File name looked up inside each sample folder (e.g. ``"blast_nt.m9"``).
    output_dir : str
        Text that replaces ``"contigs"`` in the input path to form output paths.
    bucket_name : str
        S3 bucket to list.
    blast_type : str
        Suffix used in column names, file names and ``--blast_type``.
    default : bool
        When False, a cutoff-specific sub-folder is appended to ``output_dir``.
    ncores : int
        Upper bound for ``parallel -j``; capped at the number of commands.

    Returns
    -------
    None
    """
    # First list the folders (a single list_objects call returns at most 1000 entries)
    sample_prefixes = client.list_objects(Bucket=bucket_name, Prefix="contigs/", Delimiter="/").get("CommonPrefixes", [])
    blast_paths = []
    for sample in sample_prefixes:
        listing = client.list_objects(Bucket=bucket_name, Prefix=sample["Prefix"]+input_file_name)
        # "Contents" is only present when at least one key matched the prefix.
        if "Contents" in listing:
            blast_paths.append("s3://"+bucket_name+"/"+listing["Prefix"])
    if not blast_paths:
        # Nothing to do: avoid writing an empty command file and "-j 0".
        print ("no "+input_file_name+" files found under s3://"+bucket_name+"/contigs/")
        return

    blast_col = "blast_"+blast_type
    filenames = pd.DataFrame(blast_paths, columns=[blast_col])
    output_string = output_dir
    if not default:
        # NOTE(review): ident_cutoff / align_cutoff / bitscore_cutoff are not
        # defined in this function; presumably they are module-level names
        # (possibly from `lca_functions`) -- confirm before calling with default=False.
        output_string += "/ident"+str(ident_cutoff)+"align"+str(align_cutoff)+"bitscore"+str(bitscore_cutoff)

    # regex=False everywhere: the patterns are literal text and contain ".",
    # which older pandas versions would otherwise treat as a regex wildcard.
    filtered_blast = filenames[blast_col]\
        .str.replace("contigs", output_string, regex=False)\
        .str.replace(".m9", "_filtered.m9", regex=False)
    filenames = filenames.assign(filtered_blast=filtered_blast)
    filenames = filenames.assign(excluded_contigs=filenames["filtered_blast"].apply(
        lambda path: os.path.join(os.path.dirname(path), "exclude_contigs_"+blast_type+".txt")))
    filenames = filenames.assign(lca=filenames["filtered_blast"]\
        .str.replace("blast_"+blast_type, "lca_"+blast_type, regex=False)\
        .str.replace("_filtered", "", regex=False))
    # The trailing value-less `.replace("")` was dropped: it never matched a
    # non-empty path and relied on deprecated pad-fill behaviour of Series.replace.
    filenames = filenames.assign(reads=filenames[blast_col]\
        .str.replace("blast_"+blast_type+".m9", "contig_stats.json", regex=False))
    #filenames.loc[~filenames["reads"].str.contains("ater"), "reads"] = filenames.loc[~filenames["reads"].str.contains("ater"), "reads"].str.replace("bowtie", "Mos/bowtie")

    commands = filenames.apply(lambda row: " ".join([
        "python lca_analysis.py",
        "--blast_type", blast_type,
        "--fpath", row[blast_col],
        "--filtered_blast_path", row["filtered_blast"],
        "--excluded_contigs_path", row["excluded_contigs"],
        "--outpath", row["lca"],
        "--read_count_path", row["reads"],
        "--verbose True"]), axis=1)
    print (commands)

    # Plain text, one command per line: `parallel -a` runs every line of the
    # file, so a CSV header row or CSV quoting would turn into bogus commands.
    commands_csv_filename = "lca_"+blast_type+"_commands"
    with open(commands_csv_filename, "w") as handle:
        handle.write("\n".join(commands)+"\n")

    # No point in asking for more jobs than there are commands.
    ncores = min(ncores, len(filenames))
    command_str = "parallel -a "+commands_csv_filename+" -j "+str(ncores)
    print (command_str)
    # Execution is intentionally left disabled; run the printed command in a shell.
#     process = subprocess.Popen(command_str.split(), stdout=subprocess.PIPE)
#     output, error = process.communicate()
#     return (output, error)


### nt hits

In [None]:
# Generate the lca_analysis.py command list for the nt BLAST hits.
run_lca_analysis(
    input_file_name="blast_nt.m9",
    output_dir="contig_quality",
    bucket_name=bucket_name,
    blast_type="nt",
    default=True,
    ncores=ncores,
)

In [3]:
# One lca_nt.m9 path per sample folder currently listed under contig_quality/
# (prefixes containing "Mos" are skipped).
lca_nt_paths = [
    "".join(("s3://", bucket_name, "/", entry["Prefix"], "lca_nt.m9"))
    for entry in client.list_objects(Bucket=bucket_name, Prefix="contig_quality/", Delimiter="/")["CommonPrefixes"]
    if "Mos" not in entry["Prefix"]
]
# Matching filtered-BLAST path for each sample, in the same order.
blast_nt_paths = [lca_path.replace("lca_nt", "blast_nt_filtered") for lca_path in lca_nt_paths]


In [4]:
# Call combine_blast_lca (from lca_functions) once per sample for the nt hits.
# A failing sample is reported and skipped so the remaining samples still run;
# the exception itself is kept in `lca_nt_failures` for later inspection.
lca_nt_failures = {}
for lca_path, blast_path in zip(lca_nt_paths, blast_nt_paths):
    # The sample name is the last folder component of the S3 path.
    sample_name = os.path.basename(os.path.dirname(lca_path))
    outfile = lca_path.replace("lca_nt", "blast_lca_nt_filtered")
    try:
        combine_blast_lca (lca_path, blast_path, outfile, sample_name, "nt")
    except Exception as err:
        # `Exception` rather than a bare except: Ctrl-C / kernel interrupt
        # must still be able to stop the loop.
        lca_nt_failures[sample_name] = err
        print ("error: "+sample_name)



error: CMS001_Water5_RNA_A_S12
error: CMS001_water1_S11
error: CMS001_water5_RNA_A_S12
error: CMS002_016a_Rb_S121_L004
error: CMS002_025d_Rb_S143_L004
error: CMS002_025f_Rb_S145_L004
error: CMS002_0Water8_Rb_S11_L004


### nr hits

Extract the blast_nr hits from the PLAST results and save them to a separate folder for each sample.

In [None]:
%%bash
# Run create_blast_nr.py on every entry of the plast/ listing that matches '.m8' (72 jobs at a time).
aws s3 ls s3://czbiohub-mosquito/plast/ \
    | grep '.m8' \
    | awk 'NF>1{print $NF}' \
    | parallel -j 72 python create_blast_nr.py --fpath s3://czbiohub-mosquito/plast/{}

# Sync every listing entry matching 'CMS00' from the blast_nr/ prefix into ./blast_nr_output/.
aws s3 ls s3://lucymli/skeeters/blast_nr/ \
    | grep 'CMS00' \
    | awk 'NF>1{print $NF}' \
    | parallel aws s3 sync s3://lucymli/skeeters/blast_nr/{} blast_nr_output/{}

# Save the first line of the first .m8 file that find reports as ./header_line.
head -n 1 $(find blast_nr_output -type f -name '*.m8' | head -n 1) > header_line
# Build one blast_nr.m9 per sample: a header line followed by the contents of
# every file found under that sample's blast_nr_output folder, then upload it.
# The header is the first line of the first .m8 file that find reports; it is
# looked up once because it does not depend on the sample.
header_source=$(find blast_nr_output -type f -name '*.m8' | head -n 1)
if [ -z "$header_source" ]; then
    echo "no .m8 files found under blast_nr_output; nothing to merge" >&2
else
    for sample_dir in blast_nr_output/*; do
        [ -e "$sample_dir" ] || continue   # glob matched nothing
        x=$(basename "$sample_dir")
        mkdir -p "blast_nr_output_full/$x"
        head -n 1 "$header_source" > "blast_nr_output_full/$x/blast_nr.m9"
        # Append (>>) so the header written above is kept, and hand NUL-delimited
        # names to `xargs -0` (newline-delimited `ls` output would be read as a
        # single bogus file name). sort -z makes the concatenation order stable.
        # NOTE(review): if every input file starts with its own header line, that
        # line will recur inside the merged file -- confirm the input layout.
        find "blast_nr_output/$x" -type f -print0 | sort -z | xargs -0 cat >> "blast_nr_output_full/$x/blast_nr.m9"
    done
fi
ls blast_nr_output_full | parallel aws s3 cp blast_nr_output_full/{}/blast_nr.m9 s3://czbiohub-mosquito/contigs/{}/blast_nr.m9

# add missing taxids
# Step 1: run the gi2taxid database initialisation script.
python initiate_gi2taxid_database.py
# Step 2: run get_missing_prot_taxa.py on every key under contigs/ whose listing line matches "blast_nr.m9".
aws s3 ls s3://czbiohub-mosquito/contigs/ --recursive \
    | grep "blast_nr.m9" \
    | awk 'NF>1{print $NF}' \
    | parallel python get_missing_prot_taxa.py s3://czbiohub-mosquito/{}






In [None]:
# Generate the lca_analysis.py command list for the nr BLAST hits.
run_lca_analysis(
    input_file_name="blast_nr.m9",
    output_dir="contig_quality",
    bucket_name=bucket_name,
    blast_type="nr",
    default=True,
    ncores=ncores,
)

In [5]:
# One lca_nr.m9 path per sample folder currently listed under contig_quality/
# (prefixes containing "Mos" are skipped).
lca_nr_paths = [
    "".join(("s3://", bucket_name, "/", entry["Prefix"], "lca_nr.m9"))
    for entry in client.list_objects(Bucket=bucket_name, Prefix="contig_quality/", Delimiter="/")["CommonPrefixes"]
    if "Mos" not in entry["Prefix"]
]
# Matching filtered-BLAST path for each sample, in the same order.
blast_nr_paths = [lca_path.replace("lca_nr", "blast_nr_filtered") for lca_path in lca_nr_paths]


In [6]:
# Call combine_blast_lca (from lca_functions) once per sample for the nr hits.
# A failing sample is reported and skipped so the remaining samples still run;
# the exception itself is kept in `lca_nr_failures` for later inspection.
lca_nr_failures = {}
for lca_path, blast_path in zip(lca_nr_paths, blast_nr_paths):
    # The sample name is the last folder component of the S3 path.
    sample_name = os.path.basename(os.path.dirname(lca_path))
    outfile = lca_path.replace("lca_nr", "blast_lca_nr_filtered")
    try:
        combine_blast_lca (lca_path, blast_path, outfile, sample_name, "nr")
    except Exception as err:
        # `Exception` rather than a bare except: Ctrl-C / kernel interrupt
        # must still be able to stop the loop.
        lca_nr_failures[sample_name] = err
        print ("error: "+sample_name)



error: CMS001_012_Ra_S4
error: CMS001_Water5_RNA_A_S12
error: CMS001_water1_S11
error: CMS001_water5_RNA_A_S12
error: CMS002_016a_Rb_S121_L004
error: CMS002_025d_Rb_S143_L004
error: CMS002_025f_Rb_S145_L004
error: CMS002_053a_Rb_S7_L004
error: CMS002_0Water8_Rb_S11_L004
