In [1]:
import boto3
import pandas as pd
from ete3 import NCBITaxa
import subprocess
import itertools
import os
import s3fs
import numpy as np
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from lca_functions import *

In [22]:
s3 = boto3.resource('s3')
client = boto3.client('s3')
bucket_name = "czbiohub-mosquito"
bucket = s3.Bucket(bucket_name)
contig_folders = [x["Prefix"] for x in client.list_objects(Bucket=bucket_name, Prefix="contigs/", Delimiter="/")["CommonPrefixes"]]
contig_quality_folders = [x["Prefix"] for x in client.list_objects(Bucket=bucket_name, Prefix="contig_quality/", Delimiter="/")["CommonPrefixes"]]

ncbi = NCBITaxa()
# ncbi.update_taxonomy_database()

ncores = os.cpu_count()



In [17]:
ncbi.get_descendant_taxa(6960)


[974533,
 1597517,
 1647087,
 1788287,
 1788289,
 1903043,
 211997,
 89971,
 135968,
 136172,
 212011,
 383857,
 383858,
 438502,
 627589,
 1080977,
 1519098,
 1519099,
 61994,
 61995,
 243523,
 307023,
 1453226,
 1721378,
 2155344,
 2155345,
 2155346,
 2155347,
 2155348,
 2155349,
 2155350,
 2155351,
 2155352,
 2453054,
 2453055,
 2453056,
 2453057,
 109745,
 165470,
 212007,
 212009,
 289466,
 289468,
 911375,
 1519111,
 289470,
 1519141,
 2038688,
 2038689,
 212001,
 1646822,
 61997,
 127174,
 438503,
 62008,
 135969,
 136174,
 296598,
 109748,
 173800,
 178291,
 282107,
 1519110,
 289462,
 289464,
 1136246,
 1519109,
 1519120,
 1519128,
 407253,
 165473,
 212005,
 1236688,
 1255087,
 1255088,
 172548,
 191513,
 1820139,
 1820140,
 194916,
 289450,
 289451,
 510025,
 511698,
 533271,
 533272,
 1079363,
 289453,
 533289,
 2454510,
 2454511,
 2454512,
 2454513,
 2454514,
 2454515,
 2454516,
 165478,
 533275,
 533288,
 61992,
 533270,
 1134126,
 1455656,
 1455657,
 89823,
 1314697,
 13

### read counts

In [19]:
read_count_files = [client.list_objects(Bucket=bucket_name, Prefix=x+"bowtie_csp_counts_1000.txt") \
                   for x in contig_quality_folders]
read_count_files = ["s3://"+bucket_name+"/"+x["Prefix"] for x in read_count_files if "Contents" in x.keys()]
read_counts_csp_1000 = pd.concat([pd.read_csv(x, sep="\t", header=None, names=["query", "read_count"]).\
                                  assign(sample=os.path.split(os.path.split(x)[0])[1]) for x in read_count_files])


In [20]:
sum(read_counts_csp_1000["read_count"]>2)/len(read_counts_csp_1000.index)

0.5577723823263779

In [21]:
pd.DataFrame(np.histogram(read_counts_csp_1000["read_count"], 
                          bins=list(range(21))+[50, 100, 1000, 10000])).transpose()


ValueError: DataFrame constructor not properly called!

In [23]:
filtered_contigs_by_read_count = read_counts_csp_1000[read_counts_csp_1000["read_count"]>2]

### functions

In [24]:
def run_lca_analysis (input_file_name, output_dir, bucket_name, ident_cutoff, align_cutoff, bitscore_cutoff, \
                      blast_type, default=False, ncores=8):
    # First list the folders (there is a limit of 1000 files output by AWS by default)
    list_of_folders = [client.list_objects(Bucket=bucket_name, Prefix=x["Prefix"]+input_file_name) \
                       for x in client.list_objects(Bucket=bucket_name, Prefix="contigs/", Delimiter="/")["CommonPrefixes"]]
    list_of_folders = ["s3://"+bucket_name+"/"+x["Prefix"] for x in list_of_folders if "Contents" in x.keys()]
    filenames = pd.DataFrame(list_of_folders, columns=["blast_"+blast_type])
    output_string = output_dir
    if not default:
        output_string += "/ident"+str(ident_cutoff)+"align"+str(align_cutoff)+"bitscore"+str(bitscore_cutoff)
    filenames = filenames.assign(filtered_blast=filenames["blast_"+blast_type].str.replace("contigs", output_string).str.replace(".m9", "_filtered.m9"))
    filenames = filenames.assign(lca=filenames["filtered_blast"].str.replace("blast_"+blast_type, "lca_"+blast_type).str.replace("_filtered", ""))
    commands = filenames.apply(lambda x: "python lca_analysis.py"+\
                               " --blast_type "+blast_type+\
                               " --fpath "+x.iloc[0]+\
                               " --filtered_blast_path "+x.iloc[1]+\
                               " --outpath "+x.iloc[2]+\
                               " --ident_cutoff "+str(ident_cutoff)+\
                               " --align_len_cutoff "+str(align_cutoff)+\
                               " --bitscore_cutoff "+str(bitscore_cutoff), axis=1)
    print (commands)
    commands_csv_filename = "lca_"+blast_type+"_commands"
    if not default:
        commands_csv_filename += "_ident"+str(ident_cutoff)+"align"+str(align_cutoff)+"bitscore"+str(bitscore_cutoff)
    commands.to_csv(commands_csv_filename, index=False)
    if (len(filenames) < ncores):
        ncores = len(filenames)
    command_str = "parallel -a "+commands_csv_filename+" -j "+str(ncores)
    process = subprocess.Popen(command_str.split(), stdout=subprocess.PIPE)
    output, error = process.communicate()
    return (output, error)


In [25]:
def combine_blast_lca (lca_file_name, blast_file_name, outfile, sample_name, blast_type, output_file_name=None):
    lca_data = pd.read_csv(lca_file_name, sep="\t", header=0)
    blast_data = pd.read_csv(blast_file_name, sep="\t", header=0)
    blast_data_grouped = blast_data.groupby(["query"], as_index=False).\
    agg({'identity':["max"], 'align_length':["max"], 'mismatches':["max"], 'gaps':["max"],
         'qstart':["min"], 'qend':["max"], 'sstart':["min"], 'send':["max"],
         'bitscore':["max"]})
    blast_data_grouped.columns = blast_data_grouped.columns.get_level_values(0)
    grouped_df = pd.merge(blast_data_grouped, lca_data, how="left", on="query")
    grouped_df.insert(1, "blast_type", value=blast_type)
    grouped_df.insert(2, "sample", value=sample_name)
    df_to_s3(grouped_df, outfile)
    outfile

In [None]:
%%bash
aws s3 ls s3://czbiohub-mosquito/plast/ | grep '.m8' | awk 'NF>1{print $NF}' | parallel -j 72 python create_blast_nr.py --fpath s3://czbiohub-mosquito/plast/{}
aws s3 ls s3://lucymli/skeeters/ | grep 'CMS00' | awk 'NF>1{print $NF}' | parallel aws s3 sync s3://lucymli/skeeters/{} blast_nr_output/{}
head -n 1 $(find blast_nr_output -type f -name '*.m8' | head -n 1) > header_line 
for x in `ls blast_nr_output`; do 
    mkdir -p blast_nr_output_full/$x
    head -n 1 $(find blast_nr_output -type f -name '*.m8' | head -n 1) > blast_nr_output_full/$x/blast_nr.m9
    ls -d $(find blast_nr_output/$x -type f) | xargs -0 -I file cat file > blast_nr_output_full/$x/blast_nr.m9
done
ls blast_nr_output_full | parallel aws s3 cp blast_nr_output_full/{}/blast_nr.m9 s3://czbiohub-mosquito/contigs/{}/blast_nr.m9


In [None]:
##
## Produce a dataframe from every combination of values
## From: https://pandas.pydata.org/pandas-docs/stable/user_guide/cookbook.html
##
def expand_grid(data_dict):
    rows = itertools.product(*data_dict.values())
    return pd.DataFrame.from_records(rows, columns=data_dict.keys())

In [12]:
combinations = pd.concat([
    expand_grid({
        "ident_cutoff":[x*0.1 for x in range(0, 10)], \
        "align_cutoff":[x*0.1 for x in range(0, 10)], \
        "bitscore_cutoff":[0]
    }),
    expand_grid({"ident_cutoff":[0], "align_cutoff":[0], "bitscore_cutoff":[x*0.1 for x in range(0, 10)]})
], axis=0, ignore_index=True)


### nt hits
The nt hits of contigs from each sample are filtered with ident_cutoff=0.9 and align_len_cutoff=0.9

In [51]:
run_lca_analysis(input_file_name="blast_nt.m9", output_dir="contig_quality", \
                 bucket_name=bucket_name,\
                 ident_cutoff=0.9, align_cutoff=0.9, bitscore_cutoff=0, \
                 blast_type="nt", default=True, ncores=ncores)

(b'', None)

Sensitivity analysis using different combinations of cutoffs

In [16]:
lca_nt_paths = ["s3://"+bucket_name+"/"+x["Prefix"]+"lca_nt.m9" \
 for x in client.list_objects(Bucket=bucket_name, Prefix="contig_quality/", Delimiter="/")["CommonPrefixes"]]
blast_nt_paths = [x.replace("lca_nt", "blast_nt_filtered") for x in lca_nt_paths]


In [17]:
for i in range(len(lca_nt_paths)):
    sample_name = os.path.basename(os.path.dirname(lca_nt_paths[i]))
    outfile = lca_nt_paths[i].replace("lca_nt", "blast_lca_nt_filtered")
    try:
        combine_blast_lca (lca_nt_paths[i], blast_nt_paths[i], outfile, sample_name, "nt")
    except:
        print ("error: "+sample_name)



error: CMS001_012_Ra_S4


In [None]:
combinations.apply(lambda x: \
                   run_lca_analysis(input_file_name="blast_nt.m9", output_dir="contig_quality_sensitivity", \
                                    ident_cutoff=x["ident_cutoff"], align_cutoff=x["align_cutoff"], \
                                    bitscore_cutoff=x["bitscore_cutoff"], \
                                    blast_type="nt", default=False, ncores=ncores), \
                   axis=1)

### subset nr hits
The subset nr hits of contigs from each sample are filtered with ident_cutoff=0.9 and align_len_cutoff=0.9

In [52]:
run_lca_analysis(input_file_name="blast_subset_nr.m9", output_dir="contig_quality", \
                 bucket_name=bucket_name,\
                 ident_cutoff=0.9, align_cutoff=0.9, bitscore_cutoff=0, \
                 blast_type="nr", default=True, ncores=ncores)

(b'', None)

In [None]:
combinations.apply(lambda x: \
                   run_lca_analysis(input_file_name="blast_subset_nr.m9", output_dir="contig_quality_sensitivity", \
                                    ident_cutoff=x["ident_cutoff"], align_cutoff=x["align_cutoff"], \
                                    bitscore_cutoff=x["bitscore_cutoff"], \
                                    blast_type="nr", default=False, ncores=ncores), \
                   axis=1)

### nr hits
The nr hits of contigs from each sample are filtered with ident_cutoff=0.9 and align_len_cutoff=0.9

In [None]:
run_lca_analysis(input_file_name="blast_nr.m9", output_dir="contig_quality", \
                 bucket_name=bucket_name,\
                 ident_cutoff=0.9, align_cutoff=0.9, bitscore_cutoff=0, \
                 blast_type="nr", default=True, ncores=ncores)

In [7]:
lca_nr_paths = ["s3://"+bucket_name+"/"+x["Prefix"]+"lca_nr.m9" \
 for x in client.list_objects(Bucket=bucket_name, Prefix="contig_quality/", Delimiter="/")["CommonPrefixes"]]
blast_nr_paths = [x.replace("lca_nr", "blast_nr_filtered") for x in lca_nr_paths]


In [15]:
for i in range(len(lca_nr_paths)):
    sample_name = os.path.basename(os.path.dirname(lca_nr_paths[i]))
    outfile = lca_nr_paths[i].replace("lca_nr", "blast_lca_nr_filtered")
    try:
        combine_blast_lca (lca_nr_paths[i], blast_nr_paths[i], outfile, sample_name, "nr")
    except:
        print ("error: "+sample_name)



error: CMS001_005_Ra_S3
error: CMS001_026_Ra_S18


## play

In [26]:
contig_file = pd.read_csv("s3://czbiohub-mosquito/contigs/CMS001_003_Ra_S2/blast_nt.m9", header=None, sep="\t", comment="#")
blast_lca_file = pd.read_csv("s3://czbiohub-mosquito/contig_quality/CMS001_003_Ra_S2/blast_lca_nt_filtered.m9", header=0, sep="\t", comment="#")






In [36]:
[contig_file[contig_file[0]==x][12].isin(ncbi.get_descendant_taxa("7157")).any()\
 for x in blast_lca_file[blast_lca_file["taxid"]==33213]["query"]]





[False, False, False, False, False, False, True, False, False, False]

In [41]:
blast_lca_file[blast_lca_file["taxid"]==33213].iloc[6,:]

query           NODE_5053_length_360_cov_0.664311
blast_type                                     nt
sample                           CMS001_003_Ra_S2
identity                                   95.238
align_length                                   42
mismatches                                      2
gaps                                            0
qstart                                        120
qend                                          164
sstart                                       2619
send                                     20072008
bitscore                                     67.6
taxid                                       33213
Name: 580, dtype: object

In [42]:
contig_file[contig_file[0]=="NODE_5053_length_360_cov_0.664311"]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
461492,NODE_5053_length_360_cov_0.664311,LN010170.1,95.238,42,2,0,120,161,2619,2578,2.94e-07,67.6,99802,Spirometra erinaceieuropaei,Spirometra erinaceieuropaei,Spirometra erinaceieuropaei genome assembly S_...
461493,NODE_5053_length_360_cov_0.664311,XM_005057306.1,100.0,35,0,0,127,161,968,1002,1.06e-06,65.8,59894,Ficedula albicollis,collared flycatcher,PREDICTED: Ficedula albicollis myelin transcri...
461494,NODE_5053_length_360_cov_0.664311,CP026249.1,94.737,38,2,0,127,164,20071971,20072008,4.92e-05,60.2,52904,Scophthalmus maximus,turbot,Scophthalmus maximus chromosome 7
461495,NODE_5053_length_360_cov_0.664311,XM_006970601.2,97.143,35,1,0,127,161,2180,2214,4.92e-05,60.2,230844,Peromyscus maniculatus bairdii,prairie deer mouse,PREDICTED: Peromyscus maniculatus bairdii upst...
461496,NODE_5053_length_360_cov_0.664311,XM_015995521.1,97.143,35,1,0,127,161,3381,3415,4.92e-05,60.2,230844,Peromyscus maniculatus bairdii,prairie deer mouse,PREDICTED: Peromyscus maniculatus bairdii upst...
461497,NODE_5053_length_360_cov_0.664311,XM_015995516.1,97.143,35,1,0,127,161,2201,2235,4.92e-05,60.2,230844,Peromyscus maniculatus bairdii,prairie deer mouse,PREDICTED: Peromyscus maniculatus bairdii upst...
461498,NODE_5053_length_360_cov_0.664311,XM_006970600.1,97.143,35,1,0,127,161,2291,2325,4.92e-05,60.2,230844,Peromyscus maniculatus bairdii,prairie deer mouse,PREDICTED: Peromyscus maniculatus bairdii upst...
461499,NODE_5053_length_360_cov_0.664311,XM_026007050.1,96.97,33,1,0,127,159,1996,2028,0.000637,56.5,9627,Vulpes vulpes,red fox,"PREDICTED: Vulpes vulpes podocan (PODN), trans..."
461500,NODE_5053_length_360_cov_0.664311,XM_026007049.1,96.97,33,1,0,127,159,1996,2028,0.000637,56.5,9627,Vulpes vulpes,red fox,"PREDICTED: Vulpes vulpes podocan (PODN), trans..."
461501,NODE_5053_length_360_cov_0.664311,XM_025427651.1,96.97,33,1,0,127,159,1953,1985,0.000637,56.5,286419,Canis lupus dingo,dingo,"PREDICTED: Canis lupus dingo podocan (PODN), t..."


In [43]:
df = contig_file
db="nucleotide"
return_taxid_only=True
ident_cutoff=0.9
align_len_cutoff=0.9
bitscore_cutoff=0



In [45]:
df[0].value_counts()

NODE_285_length_1096_cov_0.607458     234514
NODE_1136_length_656_cov_0.411054     158521
NODE_15469_length_236_cov_0.918239    142173
NODE_14382_length_241_cov_0.890244    123947
NODE_13150_length_248_cov_0.853801    122442
NODE_7415_length_303_cov_1.088496      72063
NODE_15289_length_237_cov_0.887500     67027
NODE_6460_length_323_cov_1.012195      56131
NODE_1369_length_610_cov_0.474672      44797
NODE_7634_length_299_cov_0.860360      42498
NODE_13451_length_246_cov_0.863905     36120
NODE_15422_length_236_cov_0.918239     34179
NODE_16432_length_231_cov_0.948052     31562
NODE_6654_length_319_cov_0.685950      27936
NODE_8220_length_288_cov_22.085308     22213
NODE_10099_length_268_cov_0.963351     15842
NODE_9043_length_277_cov_0.310000      12949
NODE_11445_length_259_cov_0.802198     10864
NODE_8440_length_285_cov_0.754808      10035
NODE_15589_length_235_cov_0.924051      9619
NODE_10308_length_267_cov_0.768421      8183
NODE_14865_length_239_cov_0.901235      6172
NODE_14885

In [27]:
contig_file

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,NODE_1_length_12379_cov_17.273451,MH430654.1,75.980,204,44,3,4361,4563,8355,8156,1.100000e-15,100.0,2230910,Aedes anphevirus,Aedes anphevirus,Aedes anphevirus isolate Pune CCL-125 nucleopr...
1,NODE_2_length_6551_cov_77.843373,XM_020076725.1,74.597,6271,1407,145,324,6489,6263,74,0.000000e+00,2584.0,7160,Aedes albopictus,Asian tiger mosquito,PREDICTED: Aedes albopictus vitellogenin-A1-li...
2,NODE_2_length_6551_cov_77.843373,XM_019687400.1,74.595,6235,1412,133,324,6466,6253,99,0.000000e+00,2580.0,7160,Aedes albopictus,Asian tiger mosquito,PREDICTED: Aedes albopictus vitellogenin-A1 (L...
3,NODE_2_length_6551_cov_77.843373,XM_020076724.1,74.442,6272,1418,143,324,6489,6264,72,0.000000e+00,2531.0,7160,Aedes albopictus,Asian tiger mosquito,PREDICTED: Aedes albopictus vitellogenin-A1-li...
4,NODE_2_length_6551_cov_77.843373,GU017909.1,74.907,5133,1100,137,1438,6455,8312,3253,0.000000e+00,2170.0,7177,Culex tarsalis,Culex tarsalis,"Culex tarsalis vitellogenin 1a (Vg1a) gene, co..."
5,NODE_2_length_6551_cov_77.843373,AY691322.1,73.003,5019,1158,151,1522,6455,5641,735,0.000000e+00,1580.0,28624,Aedes atropalpus,rock-pool mosquito,Ochlerotatus atropalpus vitellogenin C (Vg-C) ...
6,NODE_2_length_6551_cov_77.843373,LT727660.1,91.489,47,4,0,4990,5036,831079,831125,2.110000e-05,65.8,5850,Plasmodium knowlesi,Plasmodium knowlesi,"Plasmodium knowlesi genome assembly, chromosom..."
7,NODE_2_length_6551_cov_77.843373,XM_002258039.1,91.489,47,4,0,4990,5036,241,287,2.110000e-05,65.8,5851,Plasmodium knowlesi strain H,Plasmodium knowlesi strain H,Plasmodium knowlesi strain H hypothetical prot...
8,NODE_2_length_6551_cov_77.843373,AM910987.1,87.500,56,7,0,4990,5045,330890,330835,2.110000e-05,65.8,5851,Plasmodium knowlesi strain H,Plasmodium knowlesi strain H,"Plasmodium knowlesi strain H chromosome 5, com..."
9,NODE_2_length_6551_cov_77.843373,LT598659.1,95.000,40,2,0,5006,5045,3223078,3223117,7.600000e-05,63.9,5516,Fusarium culmorum,Fusarium culmorum,"Fusarium culmorum genome assembly, chromosome: I"


In [36]:
int(contig_file[0][0].split("_")[3])

12379