In [85]:
import os
import boto3
import subprocess
import pandas as pd
import s3fs

In [22]:
# Name of the S3 bucket queried throughout this notebook, and the low-level
# boto3 S3 client used to list its contents.
s3_bucket = "czbiohub-mosquito"
s3 = boto3.client('s3')

In [19]:
# Collect the per-sample "directory" prefixes directly under contig_quality/.
# A single list call returns at most 1000 entries, so walk every result page;
# pages without a "CommonPrefixes" entry (nothing matched) contribute nothing
# instead of raising KeyError.
sample_dirs = [
    common_prefix["Prefix"]
    for page in s3.get_paginator("list_objects_v2").paginate(
        Bucket=s3_bucket, Prefix="contig_quality/", Delimiter="/"
    )
    for common_prefix in page.get("CommonPrefixes", [])
]



In [32]:
# Derive each sample's name from its prefix: take the directory part of the
# prefix and keep that directory's final path component.
sample_names = []
for sample_prefix in sample_dirs:
    prefix_without_tail = os.path.dirname(sample_prefix)
    sample_names.append(os.path.basename(prefix_without_tail))

In [26]:
# For every sample prefix, list all object keys stored beneath it
# (one inner list per entry of sample_dirs, in the same order).
# Listing is paginated because one call returns at most 1000 keys; a silently
# truncated listing would make files look absent in later membership tests.
# Pages without "Contents" (empty prefix) yield an empty list rather than KeyError.
_object_pages = s3.get_paginator("list_objects_v2")
all_sample_files = [
    [
        s3_object["Key"]
        for page in _object_pages.paginate(Bucket=s3_bucket, Prefix=sample_dir)
        for s3_object in page.get("Contents", [])
    ]
    for sample_dir in sample_dirs
]


In [83]:
# Per-sample summary files (paths relative to each sample prefix) that get
# read and concatenated across samples.
summary_files = [
    # tab-separated .m9 tables
    "blast_lca_nr_filtered.m9",
    "blast_lca_nt_filtered.m9",
    "blast_nr_filtered.m9",
    "blast_nt_filtered.m9",
    "lca_nr.m9",
    "lca_nt.m9",
    # files under the Mos/ sub-folder
    "Mos/CoverageSummaryStats.csv",
    "Mos/CoverageSummaryStatsFinal.csv",
    "Mos/bowtie_all_counts_1000.txt",
    "Mos/bowtie_cs_counts_1000.txt",
    "Mos/bowtie_csp_counts_1000.txt",
    "Mos/contigList.txt",
    "Mos/my.genome",
    # currently excluded: "Mos/perBaseCov.csv"
]

In [45]:
# Build one single-entry {sample name: s3 URL} dict for every sample whose
# key listing contains the first summary file (summary_files[0]).
blast_lca_nr_filtered_files = []
for position, sample_prefix in enumerate(sample_dirs):
    object_key = sample_prefix + summary_files[0]
    if object_key not in all_sample_files[position]:
        continue
    blast_lca_nr_filtered_files.append(
        {sample_names[position]: "s3://" + s3_bucket + "/" + object_key}
    )



In [67]:
# Read each sample's tab-separated table into its own DataFrame.
# Every entry is a dict; the URL is the value stored under its first key.
blast_lca_nr_files = []
for url_by_sample in blast_lca_nr_filtered_files:
    first_key = list(url_by_sample.keys())[0]
    blast_lca_nr_files.append(pd.read_csv(url_by_sample[first_key], sep="\t"))

In [69]:
# Combine the per-sample tables into a single DataFrame, blast_lca_nr.
# If the first table already has a 'sample' column the tables are concatenated
# as-is; otherwise each table is tagged with its sample name first.
if ('sample' in blast_lca_nr_files[0].columns):
    blast_lca_nr = pd.concat(blast_lca_nr_files)
else:
    # DataFrame.assign returns a new frame rather than modifying in place, so
    # the tagged copies must be collected and concatenated; previously they
    # were discarded and blast_lca_nr was never defined on this branch.
    blast_lca_nr = pd.concat([
        x.assign(sample=list(blast_lca_nr_filtered_files[i].keys())[0])
        for i, x in enumerate(blast_lca_nr_files)
    ])

Unnamed: 0,query,blast_type,sample,identity,align_length,mismatches,gaps,qstart,qend,sstart,send,bitscore,taxid
0,NODE_10014_length_237_cov_30.950000,nr,CMS001_001_Ra_S1,97.4,77,2,0,232,2,14,90,70.5,864142
1,NODE_10026_length_237_cov_1.825000,nr,CMS001_001_Ra_S1,53.19,47,22,0,96,236,22,68,55.5,838
2,NODE_10027_length_237_cov_1.825000,nr,CMS001_001_Ra_S1,43.59,78,44,0,4,237,121,198,75.1,2
3,NODE_10045_length_237_cov_1.181250,nr,CMS001_001_Ra_S1,57.41,54,15,2,90,227,5,58,49.3,5204
4,NODE_10048_length_237_cov_1.100000,nr,CMS001_001_Ra_S1,91.14,79,7,0,1,237,359,437,147.1,2


In [79]:
# Maps each summary file name to its concatenated, all-sample DataFrame.
all_dfs = dict()

In [80]:
# For every summary file: find the samples that have it, read each copy from
# S3, tag rows with the sample name when the table has no 'sample' column,
# and store the concatenated result in all_dfs[filename].
for filename in summary_files:
    print ("starting to process "+filename+" files.")
    # One single-entry {sample name: s3 URL} dict per sample that has this file.
    summary_files_names = [{sample_names[i]:"s3://" + s3_bucket + "/" + x + filename} for i, x in enumerate(sample_dirs) if (x + filename) in all_sample_files[i]]
    if not summary_files_names:
        # Nothing to read: indexing summary_files_dfs[0] below would raise
        # IndexError, so skip this file and leave it out of all_dfs.
        print ("no "+filename+" files found; skipping.")
        continue
    # Pick the read_csv options matching the file's format.
    if ("CoverageSummaryStats" in filename):
        read_options = {}                              # comma-separated, with header row
    elif (".csv" in filename):
        read_options = {"header": None}                # comma-separated, no header row
    elif (".m9" in filename):
        read_options = {"sep": "\t"}                   # tab-separated, with header row
    else:
        read_options = {"sep": "\t", "header": None}   # tab-separated, no header row
    summary_files_dfs = [pd.read_csv(x[list(x.keys())[0]], **read_options) for x in summary_files_names]
    print ("read in "+filename+" files.")
    if ('sample' in summary_files_dfs[0].columns):
        summary_files_df_all = pd.concat(summary_files_dfs)
    else:
        # assign returns new frames; concatenate the tagged copies.
        summary_files_df_all = pd.concat([x.assign(sample=list(summary_files_names[i].keys())[0]) for i, x in enumerate(summary_files_dfs)])
    print ("concatenated "+filename+" files.")
    all_dfs[filename] = summary_files_df_all
        
        

starting to process blast_lca_nr_filtered.m9 files.
read in blast_lca_nr_filtered.m9 files.
concatenated blast_lca_nr_filtered.m9 files.
starting to process blast_lca_nt_filtered.m9 files.
read in blast_lca_nt_filtered.m9 files.
concatenated blast_lca_nt_filtered.m9 files.
starting to process blast_nr_filtered.m9 files.
read in blast_nr_filtered.m9 files.
concatenated blast_nr_filtered.m9 files.
starting to process blast_nt_filtered.m9 files.
read in blast_nt_filtered.m9 files.
concatenated blast_nt_filtered.m9 files.
starting to process lca_nr.m9 files.
read in lca_nr.m9 files.
concatenated lca_nr.m9 files.
starting to process lca_nt.m9 files.
read in lca_nt.m9 files.
concatenated lca_nt.m9 files.
starting to process Mos/CoverageSummaryStats.csv files.
read in Mos/CoverageSummaryStats.csv files.
concatenated Mos/CoverageSummaryStats.csv files.
starting to process Mos/CoverageSummaryStatsFinal.csv files.
read in Mos/CoverageSummaryStatsFinal.csv files.
concatenated Mos/CoverageSummaryS

KeyboardInterrupt: 

In [86]:
# File-system style handle on S3, created with anon=False (i.e. not an
# anonymous connection); used below to open objects for writing.
s3_fs = s3fs.S3FileSystem(anon=False)

In [89]:
# Write every concatenated table back to S3 under contig_quality_concat/,
# using the same separator/header convention the files were read with.
for filename in summary_files:
    if filename not in all_dfs:
        # The concatenation loop may not have produced every table (e.g. it was
        # interrupted). Check before opening the destination so a missing table
        # neither raises KeyError nor leaves an empty object behind.
        print ("skipping "+filename+": no concatenated table available.")
        continue
    upload_fn = s3_bucket+'/contig_quality_concat/'+filename
    print ("starting to upload "+filename+" file to s3://"+upload_fn)
    with s3_fs.open(upload_fn,'w') as f:
        if ("CoverageSummaryStats" in filename):
            all_dfs[filename].to_csv(f, index=False)
        elif (".csv" in filename):
            all_dfs[filename].to_csv(f, header=False, index=False)
        elif (".m9" in filename):
            all_dfs[filename].to_csv(f, sep="\t", index=False)
        else:
            all_dfs[filename].to_csv(f, sep="\t", header=False, index=False)
    print ("finished uploading "+filename+" files.")
        

starting to upload blast_lca_nr_filtered.m9 file to s3://czbiohub-mosquito/contig_quality_concat/blast_lca_nr_filtered.m9
finished uploading blast_lca_nr_filtered.m9 files.
starting to upload blast_lca_nt_filtered.m9 file to s3://czbiohub-mosquito/contig_quality_concat/blast_lca_nt_filtered.m9
finished uploading blast_lca_nt_filtered.m9 files.
starting to upload blast_nr_filtered.m9 file to s3://czbiohub-mosquito/contig_quality_concat/blast_nr_filtered.m9
finished uploading blast_nr_filtered.m9 files.
starting to upload blast_nt_filtered.m9 file to s3://czbiohub-mosquito/contig_quality_concat/blast_nt_filtered.m9
finished uploading blast_nt_filtered.m9 files.
starting to upload lca_nr.m9 file to s3://czbiohub-mosquito/contig_quality_concat/lca_nr.m9
finished uploading lca_nr.m9 files.
starting to upload lca_nt.m9 file to s3://czbiohub-mosquito/contig_quality_concat/lca_nt.m9
finished uploading lca_nt.m9 files.
starting to upload Mos/CoverageSummaryStats.csv file to s3://czbiohub-mosqui