In [2]:
import boto3
import pandas as pd
from ete3 import NCBITaxa
import subprocess
import itertools
import os
import s3fs
import numpy as np
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from lca_functions import *

In [3]:
s3 = boto3.resource('s3')
client = boto3.client('s3')
bucket_name = "czbiohub-mosquito"
bucket = s3.Bucket(bucket_name)
contig_folders = [x["Prefix"] for x in client.list_objects(Bucket=bucket_name, Prefix="contigs/", Delimiter="/")["CommonPrefixes"]]
contig_quality_folders = [x["Prefix"] for x in client.list_objects(Bucket=bucket_name, Prefix="contig_quality/", Delimiter="/")["CommonPrefixes"]]

ncbi = NCBITaxa()
# ncbi.update_taxonomy_database()



### read counts

In [4]:
read_count_files = [client.list_objects(Bucket=bucket_name, Prefix=x+"bowtie_csp_counts_1000.txt") \
                   for x in contig_quality_folders]
read_count_files = ["s3://"+bucket_name+"/"+x["Prefix"] for x in read_count_files if "Contents" in x.keys()]
read_counts_csp_1000 = pd.concat([pd.read_csv(x, sep="\t", header=None, names=["query", "read_count"]).\
                                  assign(sample=os.path.split(os.path.split(x)[0])[1]) for x in read_count_files])


In [5]:
sum(read_counts_csp_1000["read_count"]>2)/len(read_counts_csp_1000.index)

0.5577723823263779

In [6]:
pd.DataFrame(np.histogram(read_counts_csp_1000["read_count"], 
                          bins=list(range(21))+[50, 100, 1000, 10000])).transpose()


Unnamed: 0,0,1
0,0.0,0.0
1,0.0,1.0
2,339327.0,2.0
3,0.0,3.0
4,143502.0,4.0
5,0.0,5.0
6,72762.0,6.0
7,0.0,7.0
8,43966.0,8.0
9,0.0,9.0


In [7]:
filtered_contigs_by_read_count = read_counts_csp_1000[read_counts_csp_1000["read_count"]>2]

### functions

In [9]:
def run_lca_analysis (input_file_name, output_dir, bucket_name, ident_cutoff, align_cutoff, bitscore_cutoff, \
                      blast_type, default=False, ncores=8):
    # First list the folders (there is a limit of 1000 files output by AWS by default)
    list_of_folders = [client.list_objects(Bucket=bucket_name, Prefix=x["Prefix"]+input_file_name) \
                       for x in client.list_objects(Bucket=bucket_name, Prefix="contigs/", Delimiter="/")["CommonPrefixes"]]
    list_of_folders = ["s3://"+bucket_name+"/"+x["Prefix"] for x in list_of_folders if "Contents" in x.keys()]
    filenames = pd.DataFrame(list_of_folders, columns=["blast_"+blast_type])
    output_string = output_dir
    if not default:
        output_string += "/ident"+str(ident_cutoff)+"align"+str(align_cutoff)+"bitscore"+str(bitscore_cutoff)
    filenames = filenames.assign(filtered_blast=filenames["blast_"+blast_type].str.replace("contigs", output_string).str.replace(".m9", "_filtered.m9"))
    filenames = filenames.assign(lca=filenames["filtered_blast"].str.replace("blast_"+blast_type, "lca_"+blast_type).str.replace("_filtered", ""))
    commands = filenames.apply(lambda x: "python lca_analysis.py"+\
                               " --blast_type "+blast_type+\
                               " --fpath "+x.iloc[0]+\
                               " --filtered_blast_path "+x.iloc[1]+\
                               " --outpath "+x.iloc[2]+\
                               " --ident_cutoff "+str(ident_cutoff)+\
                               " --align_len_cutoff "+str(align_cutoff)+\
                               " --bitscore_cutoff "+str(bitscore_cutoff), axis=1)
    print (commands)
    commands_csv_filename = "lca_"+blast_type+"_commands"
    if not default:
        commands_csv_filename += "_ident"+str(ident_cutoff)+"align"+str(align_cutoff)+"bitscore"+str(bitscore_cutoff)
    commands.to_csv(commands_csv_filename, index=False)
    if (len(filenames) < ncores):
        ncores = len(filenames)
    command_str = "parallel -a "+commands_csv_filename+" -j "+str(ncores)
    process = subprocess.Popen(command_str.split(), stdout=subprocess.PIPE)
    output, error = process.communicate()
    return (output, error)


In [10]:
def combine_blast_lca (lca_file_name, blast_file_name, outfile, sample_name, blast_type, output_file_name=None):
    lca_data = pd.read_csv(lca_file_name, sep="\t", header=0)
    blast_data = pd.read_csv(blast_file_name, sep="\t", header=0)
    blast_data_grouped = blast_data.groupby(["query"], as_index=False).\
    agg({'align_length':["mean"], 'mismatches':["mean"], 'gaps':["mean"],
         'qstart':["min"], 'qend':["max"], 'sstart':["min"], 'send':["max"],
         'bitscore':["mean"]})
    blast_data_grouped.columns = blast_data_grouped.columns.get_level_values(0)
    grouped_df = pd.merge(blast_data_grouped, lca_data, how="left", on="query")
    grouped_df.insert(1, "blast_type", value=blast_type)
    grouped_df.insert(2, "sample", value=sample_name)
    df_to_s3(grouped_df, outfile)
    outfile

In [None]:
%%bash
aws s3 ls s3://czbiohub-mosquito/plast/ | grep '.m8' | awk 'NF>1{print $NF}' | parallel -j 72 python create_blast_nr.py --fpath s3://czbiohub-mosquito/plast/{}
aws s3 ls s3://lucymli/skeeters/ | grep 'CMS00' | awk 'NF>1{print $NF}' | parallel aws s3 sync s3://lucymli/skeeters/{} blast_nr_output/{}
head -n 1 $(find blast_nr_output -type f -name '*.m8' | head -n 1) > header_line 
for x in `ls blast_nr_output`; do 
    mkdir -p blast_nr_output_full/$x
    head -n 1 $(find blast_nr_output -type f -name '*.m8' | head -n 1) > blast_nr_output_full/$x/blast_nr.m9
    ls -d $(find blast_nr_output/$x -type f) | xargs -0 -I file cat file > blast_nr_output_full/$x/blast_nr.m9
done
ls blast_nr_output_full | parallel aws s3 cp blast_nr_output_full/{}/blast_nr.m9 s3://czbiohub-mosquito/contigs/{}/blast_nr.m9


In [11]:
##
## Produce a dataframe from every combination of values
## From: https://pandas.pydata.org/pandas-docs/stable/user_guide/cookbook.html
##
def expand_grid(data_dict):
    rows = itertools.product(*data_dict.values())
    return pd.DataFrame.from_records(rows, columns=data_dict.keys())

In [12]:
combinations = pd.concat([
    expand_grid({
        "ident_cutoff":[x*0.1 for x in range(0, 10)], \
        "align_cutoff":[x*0.1 for x in range(0, 10)], \
        "bitscore_cutoff":[0]
    }),
    expand_grid({"ident_cutoff":[0], "align_cutoff":[0], "bitscore_cutoff":[x*0.1 for x in range(0, 10)]})
], axis=0, ignore_index=True)


In [13]:
ncores = os.cpu_count()

### nt hits
The nt hits of contigs from each sample are filtered with ident_cutoff=0.9 and align_len_cutoff=0.9

In [51]:
run_lca_analysis(input_file_name="blast_nt.m9", output_dir="contig_quality", \
                 bucket_name=bucket_name,\
                 ident_cutoff=0.9, align_cutoff=0.9, bitscore_cutoff=0, \
                 blast_type="nt", default=True, ncores=ncores)

(b'', None)

Sensitivity analysis using different combinations of cutoffs

In [None]:
combinations.apply(lambda x: \
                   run_lca_analysis(input_file_name="blast_nt.m9", output_dir="contig_quality_sensitivity", \
                                    ident_cutoff=x["ident_cutoff"], align_cutoff=x["align_cutoff"], \
                                    bitscore_cutoff=x["bitscore_cutoff"], \
                                    blast_type="nt", default=False, ncores=ncores), \
                   axis=1)

### subset nr hits
The subset nr hits of contigs from each sample are filtered with ident_cutoff=0.9 and align_len_cutoff=0.9

In [52]:
run_lca_analysis(input_file_name="blast_subset_nr.m9", output_dir="contig_quality", \
                 bucket_name=bucket_name,\
                 ident_cutoff=0.9, align_cutoff=0.9, bitscore_cutoff=0, \
                 blast_type="nr", default=True, ncores=ncores)

(b'', None)

In [None]:
combinations.apply(lambda x: \
                   run_lca_analysis(input_file_name="blast_subset_nr.m9", output_dir="contig_quality_sensitivity", \
                                    ident_cutoff=x["ident_cutoff"], align_cutoff=x["align_cutoff"], \
                                    bitscore_cutoff=x["bitscore_cutoff"], \
                                    blast_type="nr", default=False, ncores=ncores), \
                   axis=1)

### nr hits
The nr hits of contigs from each sample are filtered with ident_cutoff=0.9 and align_len_cutoff=0.9

In [None]:
run_lca_analysis(input_file_name="blast_nr.m9", output_dir="contig_quality", \
                 bucket_name=bucket_name,\
                 ident_cutoff=0.9, align_cutoff=0.9, bitscore_cutoff=0, \
                 blast_type="nr", default=True, ncores=ncores)

In [15]:
lca_nr_paths = ["s3://"+bucket_name+"/"+x["Prefix"]+"lca_nr.m9" \
 for x in client.list_objects(Bucket=bucket_name, Prefix="contig_quality/", Delimiter="/")["CommonPrefixes"]]
blast_nr_paths = [x.replace("lca_nr", "blast_nr_filtered") for x in lca_nr_paths]


In [16]:
for i in range(len(lca_nr_paths)):
    sample_name = os.path.basename(os.path.dirname(lca_nr_paths[i]))
    outfile = lca_nr_paths[i].replace("lca_nr", "blast_lca_nr_filtered")
    try:
        combine_blast_lca (lca_nr_paths[i], blast_nr_paths[i], outfile, sample_name, "nr")
    except:
        print ("error: "+sample_name)



error: CMS001_005_Ra_S3
error: CMS001_026_Ra_S18
