In [1]:
import boto3
import pandas as pd
from ete3 import NCBITaxa

In [3]:
# Bucket that every S3 call in this notebook targets.
bucket_name = "czbiohub-mosquito"

# Low-level client (used below for the list_objects calls) and the
# resource-level handle, from which the Bucket object is derived.
client = boto3.client("s3")
s3 = boto3.resource("s3")
bucket = s3.Bucket(bucket_name)

### nt hits
The nt hits of contigs from each sample are filtered with ident_cutoff=0.5 and align_len_cutoff=0.5 (the values passed to `lca_analysis.py` in the command cell below).

In [None]:
# List every sample folder directly under contigs/. A single list call returns
# at most 1000 entries, so walk all result pages instead of silently dropping
# the samples past the first page. `.get(..., [])` tolerates pages (or a whole
# bucket) with no sub-folders rather than raising KeyError.
paginator = client.get_paginator("list_objects_v2")
sample_prefixes_nt = [
    common_prefix["Prefix"]
    for page in paginator.paginate(Bucket=bucket_name, Prefix="contigs/", Delimiter="/")
    for common_prefix in page.get("CommonPrefixes", [])
]

# Keep only the samples that actually have a blast_nt.m9 object: the response
# carries a "Contents" entry only when at least one key matches the prefix, so
# one key (MaxKeys=1) is enough to decide.
blast_keys_nt = [sample_prefix + "blast_nt.m9" for sample_prefix in sample_prefixes_nt]
list_of_folders_nt = [
    "s3://" + bucket_name + "/" + blast_key
    for blast_key in blast_keys_nt
    if "Contents" in client.list_objects_v2(Bucket=bucket_name, Prefix=blast_key, MaxKeys=1)
]


In [234]:
# One row per sample with three S3 paths: the raw BLAST nt hits, the filtered
# hits, and the LCA output. The two derived paths live under contig_quality/
# instead of contigs/.
# Built from list_of_folders_nt (the list produced by the nt listing cell); the
# bare name `list_of_folders` is not defined anywhere in this notebook.
# regex=False: the patterns are literal file-name fragments, so "." must not be
# treated as a regex wildcard.
filenames_nt = pd.DataFrame(list_of_folders_nt, columns=["blast_nt"])
filenames_nt = filenames_nt.assign(
    filtered_blast_nt=filenames_nt["blast_nt"]
    .str.replace("contigs", "contig_quality", regex=False)
    .str.replace("_nt.m9", "_nt_filtered.m9", regex=False)
)
filenames_nt = filenames_nt.assign(
    lca_nt=filenames_nt["filtered_blast_nt"].str.replace(
        "blast_nt_filtered.m9", "lca_nt.m9", regex=False
    )
)

In [235]:
# Build one lca_analysis.py invocation per sample by concatenating the three
# path columns (addressed by name rather than by position) into a shell command.
commands_t = (
    "python lca_analysis.py --blast_type nt --fpath " + filenames_nt["blast_nt"]
    + " --filtered_blast_path " + filenames_nt["filtered_blast_nt"]
    + " --outpath " + filenames_nt["lca_nt"]
    + " --ident_cutoff 0.5 --align_len_cutoff 0.5"
)

# One command per line, consumed by `parallel -a`. header=False is required:
# otherwise current pandas writes the Series name as a first line, which
# parallel would then try to execute as a command.
commands_t.to_csv("lca_nt_commands", index=False, header=False)

Perform LCA analysis on the nt hits

In [187]:
%%bash
# Run each line of lca_nt_commands as its own shell command, at most 8 jobs at a time.
parallel --jobs 8 --arg-file lca_nt_commands


Process is interrupted.


### nr hits
The nr hits of contigs from each sample are filtered with ident_cutoff=0.5 and align_len_cutoff=0.5 (the values passed to `lca_analysis.py` in the command cell below).

In [294]:
# List every sample folder directly under contigs/. A single list call returns
# at most 1000 entries, so walk all result pages instead of silently dropping
# the samples past the first page. `.get(..., [])` tolerates pages (or a whole
# bucket) with no sub-folders rather than raising KeyError.
paginator = client.get_paginator("list_objects_v2")
sample_prefixes_nr = [
    common_prefix["Prefix"]
    for page in paginator.paginate(Bucket=bucket_name, Prefix="contigs/", Delimiter="/")
    for common_prefix in page.get("CommonPrefixes", [])
]

# Keep only the samples that actually have a blast_subset_nr.m9 object: the
# response carries a "Contents" entry only when at least one key matches the
# prefix, so one key (MaxKeys=1) is enough to decide.
blast_keys_nr = [sample_prefix + "blast_subset_nr.m9" for sample_prefix in sample_prefixes_nr]
list_of_folders_nr = [
    "s3://" + bucket_name + "/" + blast_key
    for blast_key in blast_keys_nr
    if "Contents" in client.list_objects_v2(Bucket=bucket_name, Prefix=blast_key, MaxKeys=1)
]


In [313]:
# One row per sample with three S3 paths: the raw BLAST nr hits, the filtered
# hits, and the LCA output. The two derived paths live under contig_quality/
# instead of contigs/.
# regex=False: the patterns are literal file-name fragments, so "." must not be
# treated as a regex wildcard.
filenames_nr = pd.DataFrame(list_of_folders_nr, columns=["blast_nr"])
filenames_nr = filenames_nr.assign(
    filtered_blast_nr=filenames_nr["blast_nr"]
    .str.replace("contigs", "contig_quality", regex=False)
    .str.replace("_nr.m9", "_nr_filtered.m9", regex=False)
)
filenames_nr = filenames_nr.assign(
    lca_nr=filenames_nr["filtered_blast_nr"].str.replace(
        "blast_subset_nr_filtered.m9", "lca_nr.m9", regex=False
    )
)




In [314]:
# Build one lca_analysis.py invocation per sample by concatenating the three
# path columns (addressed by name rather than by position) into a shell command.
commands_nr = (
    "python lca_analysis.py --blast_type nr --fpath " + filenames_nr["blast_nr"]
    + " --filtered_blast_path " + filenames_nr["filtered_blast_nr"]
    + " --outpath " + filenames_nr["lca_nr"]
    + " --ident_cutoff 0.5 --align_len_cutoff 0.5"
)

# One command per line, consumed by `parallel -a`. header=False is required:
# otherwise current pandas writes the Series name as a first line, which
# parallel would then try to execute as a command.
commands_nr.to_csv("lca_nr_commands", index=False, header=False)

In [315]:
# Display the command stored under index label 0 as a sanity check of the generated text.
commands_nr[0]

'python lca_analysis.py --blast_type nr --fpath s3://czbiohub-mosquito/contigs/CMS001_001_Ra_S1/blast_subset_nr.m9 --filtered_blast_path s3://czbiohub-mosquito/contig_quality/CMS001_001_Ra_S1/blast_subset_nr_filtered.m9 --outpath s3://czbiohub-mosquito/contig_quality/CMS001_001_Ra_S1/lca_nr.m9 --ident_cutoff 0.5 --align_len_cutoff 0.5'

Perform LCA analysis on the nr hits

In [316]:
%%bash
# Run each line of lca_nr_commands as its own shell command, at most 8 jobs at a time.
parallel --jobs 8 --arg-file lca_nr_commands

                                     ...
query                                ...
NODE_10_length_2024_cov_2.039034 0 0 ...

[1 rows x 2 columns]
                                                                      query  taxid
query                                                                             
NODE_1258_length_349_cov_2.128676 0 2613  NODE_1258_length_349_cov_2.128676      2
                                                                        query   taxid
query                                                                                
NODE_1231_length_616_cov_26.846011 0 1010  NODE_1231_length_616_cov_26.846011  131567
                                                                        query   taxid
query                                                                                
NODE_1248_length_631_cov_80.900722 0 1803  NODE_1248_length_631_cov_80.900722  131567
                                                                      query   taxid
query 