In [None]:
import os
import boto3
import subprocess
import pandas as pd
import s3fs
import io
import json

In [None]:
# Name of the bucket that every listing/download/upload in this notebook targets.
s3_bucket = "czbiohub-mosquito"

# Low-level boto3 client used for the key listings in the cells below.
s3 = boto3.client("s3")

In [None]:
def _list_common_prefixes(prefix):
    """Return every immediate sub-prefix ("folder") found under `prefix` in s3_bucket.

    A single list call returns at most 1000 entries, so the listing is
    paginated to avoid silently dropping samples. Pages without a
    "CommonPrefixes" entry (nothing under the prefix) contribute nothing
    instead of raising KeyError.

    Parameters
    ----------
    prefix : str
        Key prefix to list, including its trailing "/".

    Returns
    -------
    list of str
        Sub-prefixes in the order S3 returns them, each ending in "/".
    """
    paginator = s3.get_paginator("list_objects_v2")
    pages = paginator.paginate(Bucket=s3_bucket, Prefix=prefix, Delimiter="/")
    return [entry["Prefix"] for page in pages for entry in page.get("CommonPrefixes", [])]


# One "directory" prefix per sample, for the quality outputs and for the raw contig data.
sample_dirs = _list_common_prefixes("contig_quality/")
sample_dirs_rawdata = _list_common_prefixes("contigs/")



In [None]:
# Each prefix ends in "/", so splitting once drops the empty tail and splitting
# again yields the last path component, which is used as the sample name.
sample_names = [
    os.path.split(os.path.split(prefix)[0])[1]
    for prefix in sample_dirs
]
sample_names_rawdata = [
    os.path.split(os.path.split(prefix)[0])[1]
    for prefix in sample_dirs_rawdata
]

In [None]:
def _list_keys(prefix):
    """Return every object key stored under `prefix` in s3_bucket.

    The listing is paginated because one list call returns at most 1000 keys;
    a prefix with no objects yields an empty list instead of a KeyError on
    the missing "Contents" entry.

    Parameters
    ----------
    prefix : str
        Key prefix to list.

    Returns
    -------
    list of str
        Full object keys in the order S3 returns them.
    """
    paginator = s3.get_paginator("list_objects_v2")
    pages = paginator.paginate(Bucket=s3_bucket, Prefix=prefix)
    return [obj["Key"] for page in pages for obj in page.get("Contents", [])]


# Parallel to sample_dirs / sample_dirs_rawdata: entry i holds the keys of sample i.
all_sample_files = [_list_keys(y) for y in sample_dirs]

all_sample_files_rawdata = [_list_keys(y) for y in sample_dirs_rawdata]


In [None]:
# Display the per-sample key listings gathered from the contig_quality/ prefixes.
all_sample_files

In [None]:
# Display the per-sample key listings gathered from the contigs/ prefixes.
all_sample_files_rawdata

In [None]:
# Per-sample file names that get concatenated across samples. The order is
# significant only in that the processing and upload loops iterate over it.
summary_files = (
    # BLAST hit tables (with and without LCA annotation), nr and nt
    ["blast_lca_nr_filtered.m9", "blast_lca_nt_filtered.m9"]
    + ["blast_nr_filtered.m9", "blast_nt_filtered.m9"]
    # LCA tables
    + ["lca_nr.m9", "lca_nt.m9"]
    # contig exclusion lists
    + ["exclude_contigs_nr.txt", "exclude_contigs_nt.txt"]
    # coverage files (looked up under the contigs/ prefixes by the processing loop)
    + ["contig_coverage.json", "contig_coverage_summary.csv"]
)

In [None]:
def split_s3_path(s3_path):
    """Split an S3 location into its bucket name and object key.

    The key is returned exactly as written. S3 keys are opaque strings, so
    filesystem-style normalisation (collapsing "//", resolving "..", dropping
    a trailing "/") must not be applied to them, and the split must not depend
    on the local OS path separator.

    Parameters
    ----------
    s3_path : str
        Either a URI such as "s3://bucket/some/key" or a path with a leading
        slash such as "/bucket/some/key".

    Returns
    -------
    (str, str)
        (bucket_name, key). The key is "" when the path names only a bucket.
    """
    scheme, sep, remainder = s3_path.partition("://")
    if not sep or "/" in scheme:
        # No leading "<scheme>://": treat the input as "/bucket/key".
        remainder = s3_path.lstrip("/")
    bucket_name, _, key = remainder.partition("/")
    return bucket_name, key

In [None]:
# Maps each entry of summary_files to its concatenated DataFrame; filled in by
# the processing loop and read back by the upload loop.
all_dfs = {}

In [None]:
# For every summary file type: find the per-sample copies on S3, read each into
# a DataFrame, tag rows with the sample name and store the concatenation in
# all_dfs[filename].
s3_resource = boto3.resource('s3')  # created once and reused for every JSON download
for filename in summary_files:
    print ("starting to process "+filename+" files.")
    target_name = os.path.basename(filename)

    # Step 1: build a list of single-entry dicts {sample_name: s3_uri}.
    if ("MosRefOnly" in filename):
        summary_files_names = []
        for i in range(len(sample_dirs)):
            # Require one key that matches both substrings; samples without
            # such a key are skipped instead of raising IndexError.
            matches = [s for s in all_sample_files[i] if target_name in s and "MosRefOnly" in s]
            if matches:
                summary_files_names.append({sample_names[i]: "s3://"+s3_bucket+"/"+matches[0]})
    elif ("contig_coverage" in filename):
        # Coverage files are looked up under the contigs/ prefixes, so the sample
        # label must come from the list that is parallel to sample_dirs_rawdata.
        summary_files_names = [
            {sample_names_rawdata[i]: "s3://" + s3_bucket + "/" + x + filename}
            for i, x in enumerate(sample_dirs_rawdata)
            if (x + filename) in all_sample_files_rawdata[i]
        ]
    else:
        summary_files_names = [
            {sample_names[i]: "s3://" + s3_bucket + "/" + x + filename}
            for i, x in enumerate(sample_dirs)
            if (x + filename) in all_sample_files[i]
        ]

    # Step 2: read every located file with the parser matching its format.
    uris = [next(iter(entry.values())) for entry in summary_files_names]
    if ("CoverageSummaryStats" in filename):
        summary_files_dfs = [pd.read_csv(uri) for uri in uris]
    elif (".csv" in filename):
        summary_files_dfs = [pd.read_csv(uri, header=0) for uri in uris]
        # Drop files whose first column header is 'No Contigs'. Compare by
        # value: an identity test against a string literal never matches.
        keep = [df.columns[0] != 'No Contigs' for df in summary_files_dfs]
        summary_files_names = [name for name, ok in zip(summary_files_names, keep) if ok]
        summary_files_dfs = [df for df, ok in zip(summary_files_dfs, keep) if ok]
    elif (any(substring in filename for substring in [".m9", "exclude_contigs"])):
        summary_files_dfs = [pd.read_csv(uri, sep="\t") for uri in uris]
    elif (filename.endswith(".json")):
        summary_files_dfs = []
        for uri in uris:
            s3_bucket_name, s3_path = split_s3_path(uri)
            data_in_bytes = s3_resource.Object(s3_bucket_name, s3_path).get()["Body"].read().decode('utf-8')
            # Each line is parsed as its own JSON document; only the first is used.
            json_data = list(map(json.loads, io.StringIO(data_in_bytes).readlines()))[0]
            # Turn the JSON mapping into two columns: "query" (the keys) and "read_count" (the values).
            outdf = pd.DataFrame(pd.Series(json_data), columns=["read_count"]).reset_index(level=0).rename(columns={"index":"query"})
            summary_files_dfs.append(outdf)
    else:
        summary_files_dfs = [pd.read_csv(uri, sep="\t", header=None) for uri in uris]
    print ("read in "+filename+" files.")

    # Step 3: concatenate, adding a "sample" column unless the files already carry one.
    if not summary_files_dfs:
        # Nothing to concatenate (pd.concat would raise on an empty list).
        print ("no "+filename+" files found; skipping.")
        continue
    if ('sample' in summary_files_dfs[0].columns):
        summary_files_df_all = pd.concat(summary_files_dfs)
    else:
        summary_files_df_all = pd.concat([
            df.assign(sample=next(iter(name)))
            for name, df in zip(summary_files_names, summary_files_dfs)
        ])
    print ("concatenated "+filename+" files.")
    all_dfs[filename] = summary_files_df_all
        
        

In [None]:
# s3fs filesystem handle (anon=False, i.e. not anonymous access) used by the
# upload loop to open S3 objects for writing.
s3_fs = s3fs.S3FileSystem(anon=False)

In [None]:
# Write every concatenated table to s3://<s3_bucket>/contig_quality_concat/.
for filename in summary_files:
    if filename not in all_dfs:
        # Nothing was produced for this file type, so there is nothing to upload.
        print ("no concatenated data for "+filename+"; skipping upload.")
        continue
    # all_dfs is keyed by the input filename. Only the uploaded object is
    # renamed, so the lookup must keep using `filename`, not the output name.
    if filename=="contig_coverage.json":
        output_name = "contig_coverage.tsv"
    else:
        output_name = filename
    upload_fn = s3_bucket+'/contig_quality_concat/'+output_name
    print ("starting to upload "+output_name+" file to s3://"+upload_fn)
    summary_df = all_dfs[filename]
    with s3_fs.open(upload_fn,'w') as f:
        # The output format is chosen from the name the object is written under.
        if ("CoverageSummaryStats" in output_name):
            summary_df.to_csv(f, index=False)
        elif (".csv" in output_name):
            summary_df.to_csv(f, sep=",", header=True, index=False)
        elif (any(substring in output_name for substring in [".m9", "exclude_contigs", "contig_coverage"])):
            summary_df.to_csv(f, sep="\t", index=False)
        else:
            summary_df.to_csv(f, sep="\t", header=False, index=False)
    print ("finished uploading "+output_name+" files.")
        

In [None]:
# Read the tab-separated, header-less bowtie_csp_counts_1000.txt from the
# contig_quality_concat/MosRefOnly/ prefix; columns get integer labels 0, 1, ...
csp_counts = pd.read_csv("s3://czbiohub-mosquito/contig_quality_concat/MosRefOnly/bowtie_csp_counts_1000.txt", sep="\t", header=None)
    

In [None]:
# Show the rows whose first column (label 0) contains the substring "_1_".
csp_counts[csp_counts[0].str.contains("_1_")]

In [None]:
# File inspected by the exploratory cells below; they match S3 keys against
# its basename together with the substring "MosRefOnly".
filename="MosRefOnly/bowtie_csp_counts_1000.txt"

In [None]:
# Locate, per sample, the first key that contains both "MosRefOnly" and the
# basename of `filename`, and pair it with the sample name as {sample: s3_uri}.
_mosref_basename = os.path.basename(filename)
summary_files_names = [
    {
        sample_names[idx]: "s3://" + s3_bucket + "/" + [
            key for key in all_sample_files[idx]
            if _mosref_basename in key and "MosRefOnly" in key
        ][0]
    }
    for idx, _ in enumerate(sample_dirs)
    if "MosRefOnly" in '\t'.join(all_sample_files[idx])
    and _mosref_basename in '\t'.join(all_sample_files[idx])
]

# Read each located file as a header-less, tab-separated table.
summary_files_dfs = [
    pd.read_csv(next(iter(entry.values())), sep="\t", header=None)
    for entry in summary_files_names
]





In [None]:
# Inspect one per-sample table: rows of summary_files_dfs[i] whose first
# column (label 0) contains "_1_".
i=0
summary_files_dfs[i][summary_files_dfs[i][0].str.contains("_1_")]

In [None]:
# Debug loop: print the index of every sample whose key listing mentions
# "MosRefOnly", then build the same {sample_name: s3_uri} dict as the list
# comprehension above.
# NOTE(review): the dict is a bare expression inside the loop, so its value is
# discarded; its only visible effect is the IndexError raised by `[0]` when no
# key matches both substrings — presumably that is how the failing index was
# found; confirm.
for i, x in enumerate(sample_dirs):
    if ("MosRefOnly" in '\t'.join(all_sample_files[i])):
       print (i)
       {sample_names[i]:"s3://"+s3_bucket+"/"+[s for s in all_sample_files[i] if os.path.basename(filename) in s and "MosRefOnly" in s][0]}




In [None]:
# Display the S3 keys listed for the sample at index 61.
# NOTE(review): 61 is presumably an index printed by the debug loop — confirm.
i=61
all_sample_files[i]
#[s for s in all_sample_files[i] if os.path.basename(filename) in s and "MosRefOnly" in s]





In [None]:
# Add a "sample" column to each table (the single key of the summary_files_names
# dict at the same position) and stack all tables into one DataFrame.
summary_files_df_all = pd.concat([x.assign(sample=list(summary_files_names[i].keys())[0]) for i, x in enumerate(summary_files_dfs)])








In [None]:
# Read blast_lca_nt_filtered.m9 from the contig_quality_concat/ prefix as a
# tab-separated table whose first row is the header.
blast_lca = pd.read_csv("s3://czbiohub-mosquito/contig_quality_concat/blast_lca_nt_filtered.m9", sep="\t", header=0)
    

In [None]:
# Rows with taxid == 1 whose "query" has a 4th "_"-separated field (index 3)
# greater than 1000 when read as an integer.
# NOTE(review): for names like "NODE_1_length_10423_cov_..." that field is
# presumably the contig length, and taxid 1 presumably the taxonomy root — confirm.
blast_lca[(blast_lca["taxid"]==1) & (blast_lca["query"].str.split("_").apply(lambda x: int(x[3]))>1000)]


In [None]:
# Boolean Series marking the rows of blast_lca whose taxid equals 1.
(blast_lca["taxid"]==1)

In [None]:
# Read the blast_nt_filtered.m9 table stored under the contig_quality/ prefix
# for CMS001_009_Ra_S13 (tab-separated, first row is the header).
blast_results = pd.read_csv("s3://czbiohub-mosquito/contig_quality/CMS001_009_Ra_S13/blast_nt_filtered.m9", sep="\t", header=0)

In [None]:
# Show every row of blast_results for one specific query name.
blast_results[blast_results["query"]=="NODE_1_length_10423_cov_75.085830"]