In [1]:
import os
import boto3
import subprocess
import pandas as pd
import s3fs

In [2]:
# Bucket holding the per-sample contig-quality outputs, and the boto3 client
# used below to list its keys.
s3_bucket = "czbiohub-mosquito"
s3 = boto3.client("s3")

In [3]:
# One "directory" prefix per sample directly under contig_quality/.
# A single list call returns at most one page of results, so walk every page
# with a paginator; pages without a "CommonPrefixes" entry contribute nothing
# instead of raising KeyError.
sample_dirs = [
    common_prefix["Prefix"]
    for page in s3.get_paginator("list_objects_v2").paginate(
        Bucket=s3_bucket, Prefix="contig_quality/", Delimiter="/"
    )
    for common_prefix in page.get("CommonPrefixes", [])
]



In [4]:
# Sample name = last path component of each prefix once the trailing "/" part
# has been split off (e.g. "contig_quality/<sample>/" -> "<sample>").
sample_names = [
    os.path.split(os.path.split(sample_prefix)[0])[1]
    for sample_prefix in sample_dirs
]

In [5]:
# Every object key under each sample prefix, as one inner list per entry of
# sample_dirs (same order, so index i lines up with sample_dirs[i]).
# Listing is paginated so prefixes holding more than one page of keys are not
# silently truncated, and a prefix with no objects yields [] rather than a
# KeyError on "Contents".
all_sample_files = [
    [
        s3_object["Key"]
        for page in s3.get_paginator("list_objects_v2").paginate(
            Bucket=s3_bucket, Prefix=sample_dir
        )
        for s3_object in page.get("Contents", [])
    ]
    for sample_dir in sample_dirs
]


In [6]:
# Inspect the key listing gathered above: one inner list of S3 keys per entry
# of sample_dirs.
all_sample_files

[['contig_quality/CMS001_001_Ra_S1/Mos/CoverageSummaryStats.csv',
  'contig_quality/CMS001_001_Ra_S1/Mos/CoverageSummaryStatsFinal.csv',
  'contig_quality/CMS001_001_Ra_S1/Mos/bowtie_all_counts_1000.txt',
  'contig_quality/CMS001_001_Ra_S1/Mos/bowtie_cs_counts_1000.txt',
  'contig_quality/CMS001_001_Ra_S1/Mos/bowtie_csp_counts_1000.txt',
  'contig_quality/CMS001_001_Ra_S1/Mos/contigList.txt',
  'contig_quality/CMS001_001_Ra_S1/Mos/my.genome',
  'contig_quality/CMS001_001_Ra_S1/Mos/perBaseCov.csv',
  'contig_quality/CMS001_001_Ra_S1/Mos/sorted.bed',
  'contig_quality/CMS001_001_Ra_S1/MosRefOnly/CMS001_001_Ra_S1MosRefOnly.contigList.txt',
  'contig_quality/CMS001_001_Ra_S1/MosRefOnly/CMS001_001_Ra_S1MosRefOnly.my.genome',
  'contig_quality/CMS001_001_Ra_S1/MosRefOnly/CMS001_001_Ra_S1MosRefOnly.perBaseCov.csv',
  'contig_quality/CMS001_001_Ra_S1/MosRefOnly/CMS001_001_Ra_S1MosRefOnly.sorted.bed',
  'contig_quality/CMS001_001_Ra_S1/MosRefOnly/CMS001_001_Ra_S1MosRefOnly_CoverageSummaryStats.

In [7]:
# Per-sample output files to concatenate, as paths relative to each sample
# prefix. Later cells slice this list by position (summary_files[0],
# summary_files[13:]), so the order of the entries matters.
summary_files = [
    # BLAST / LCA tables (indices 0-5)
    "blast_lca_nr_filtered.m9",
    "blast_lca_nt_filtered.m9",
    "blast_nr_filtered.m9",
    "blast_nt_filtered.m9",
    "lca_nr.m9",
    "lca_nt.m9",
    # Mos outputs (indices 6-12)
    "Mos/CoverageSummaryStats.csv",
    "Mos/CoverageSummaryStatsFinal.csv",
    "Mos/bowtie_all_counts_1000.txt",
    "Mos/bowtie_cs_counts_1000.txt",
    "Mos/bowtie_csp_counts_1000.txt",
    "Mos/contigList.txt",
    "Mos/my.genome",
    # MosRefOnly outputs (indices 13-19)
    "MosRefOnly/CoverageSummaryStats.csv",
    "MosRefOnly/CoverageSummaryStatsFinal.csv",
    "MosRefOnly/bowtie_all_counts_1000.txt",
    "MosRefOnly/bowtie_cs_counts_1000.txt",
    "MosRefOnly/bowtie_csp_counts_1000.txt",
    "MosRefOnly/contigList.txt",
    "MosRefOnly/my.genome",
    # "Mos/perBaseCov.csv" is currently disabled (kept out of the list).
]

In [8]:
# One {sample_name: s3_url} mapping for every sample whose key listing
# contains the first summary file (summary_files[0]).
blast_lca_nr_filtered_files = [
    {sample_names[idx]: "s3://" + s3_bucket + "/" + sample_dir + summary_files[0]}
    for idx, sample_dir in enumerate(sample_dirs)
    if (sample_dir + summary_files[0]) in all_sample_files[idx]
]



In [9]:
# Read each sample's tab-separated table; every mapping holds a single
# sample -> URL pair, so its first value is the URL to load.
blast_lca_nr_files = [
    pd.read_csv(list(sample_to_url.values())[0], sep="\t")
    for sample_to_url in blast_lca_nr_filtered_files
]

In [10]:
# Combine the per-sample tables into one frame.
# If the tables already carry a 'sample' column (checked on the first table
# only) they are concatenated as-is; otherwise each table is tagged with its
# sample name first. DataFrame.assign returns a new frame, so the tagged
# frames have to be collected and concatenated -- evaluating them without
# binding the result would leave blast_lca_nr undefined on this branch.
if 'sample' in blast_lca_nr_files[0].columns:
    blast_lca_nr = pd.concat(blast_lca_nr_files)
else:
    blast_lca_nr = pd.concat([
        x.assign(sample=list(blast_lca_nr_filtered_files[i].keys())[0])
        for i, x in enumerate(blast_lca_nr_files)
    ])

In [11]:
# Accumulator filled by the concatenation loop: summary file name -> combined frame.
all_dfs = dict()

In [None]:
# For each summary file type (the MosRefOnly entries, summary_files[13:]):
#   1. find the matching S3 key for every sample,
#   2. read each match with the parser options appropriate to the file type,
#   3. tag rows with the sample name (unless a 'sample' column already exists),
#   4. store the concatenated frame in all_dfs[filename].
for filename in summary_files[13:]:
    print("starting to process "+filename+" files.")
    target_basename = os.path.basename(filename)

    # Step 1: build one {sample_name: s3_url} mapping per sample that has the file.
    summary_files_names = []
    for i, x in enumerate(sample_dirs):
        if "MosRefOnly" in filename:
            # MosRefOnly keys carry a per-sample prefix in their file name, so
            # they are matched by substring. Both conditions must hold on the
            # SAME key: testing them against the whole listing lets them match
            # on different keys, after which indexing the first match fails.
            matching_keys = [s for s in all_sample_files[i]
                             if target_basename in s and "MosRefOnly" in s]
        else:
            matching_keys = [s for s in all_sample_files[i] if s == x + filename]
        if matching_keys:
            summary_files_names.append(
                {sample_names[i]: "s3://" + s3_bucket + "/" + matching_keys[0]})

    if not summary_files_names:
        # Nothing to read; indexing summary_files_dfs[0] below would fail.
        print("no "+filename+" files found; skipping.")
        continue

    # Step 2: parser options per file type (mirrors the upload cell's writers).
    if "CoverageSummaryStats" in filename:
        read_kwargs = {}
    elif ".csv" in filename:
        read_kwargs = {"header": None}
    elif ".m9" in filename:
        read_kwargs = {"sep": "\t"}
    else:
        read_kwargs = {"sep": "\t", "header": None}
    summary_files_dfs = [pd.read_csv(x[list(x.keys())[0]], **read_kwargs)
                         for x in summary_files_names]
    print("read in "+filename+" files.")

    # Steps 3-4: only the first frame is inspected for an existing 'sample' column.
    if 'sample' in summary_files_dfs[0].columns:
        summary_files_df_all = pd.concat(summary_files_dfs)
    else:
        summary_files_df_all = pd.concat([
            x.assign(sample=list(summary_files_names[i].keys())[0])
            for i, x in enumerate(summary_files_dfs)
        ])
    print("concatenated "+filename+" files.")
    all_dfs[filename] = summary_files_df_all
        
        

starting to process MosRefOnly/CoverageSummaryStats.csv files.
read in MosRefOnly/CoverageSummaryStats.csv files.
concatenated MosRefOnly/CoverageSummaryStats.csv files.
starting to process MosRefOnly/CoverageSummaryStatsFinal.csv files.
read in MosRefOnly/CoverageSummaryStatsFinal.csv files.
concatenated MosRefOnly/CoverageSummaryStatsFinal.csv files.
starting to process MosRefOnly/bowtie_all_counts_1000.txt files.


In [None]:
# File-system style handle on S3, used by the upload cell to open objects for
# writing. anon=False asks for a non-anonymous connection
# (presumably using the environment's AWS credentials -- confirm).
s3_fs = s3fs.S3FileSystem(anon=False)

In [None]:
# Write each concatenated table back to S3 under contig_quality_concat/,
# using writer options that mirror how the per-sample files were read
# (header / separator per file type).
for filename in summary_files[13:]:
    upload_fn = s3_bucket+'/contig_quality_concat/'+filename

    # Check for the table BEFORE opening the remote object: opening first and
    # then failing on the lookup would raise with the S3 object already opened
    # for writing, which may leave an empty object behind.
    if filename not in all_dfs:
        print("skipping "+filename+": no concatenated table in all_dfs.")
        continue

    if "CoverageSummaryStats" in filename:
        to_csv_kwargs = {"index": False}
    elif ".csv" in filename:
        to_csv_kwargs = {"header": False, "index": False}
    elif ".m9" in filename:
        to_csv_kwargs = {"sep": "\t", "index": False}
    else:
        to_csv_kwargs = {"sep": "\t", "header": False, "index": False}

    print("starting to upload "+filename+" file to s3://"+upload_fn)
    with s3_fs.open(upload_fn,'w') as f:
        all_dfs[filename].to_csv(f, **to_csv_kwargs)
    print("finished uploading "+filename+" files.")
        

In [None]:
# Reload the concatenated MosRefOnly csp-count table from S3
# (tab-separated, no header row).
csp_counts = pd.read_csv(
    "s3://czbiohub-mosquito/contig_quality_concat/MosRefOnly/bowtie_csp_counts_1000.txt",
    header=None,
    sep="\t",
)
    

In [None]:
# Rows whose first column contains the substring "_1_".
csp_counts.loc[csp_counts[0].str.contains("_1_")]

In [None]:
# Summary file examined by the diagnostic cells that follow.
filename = "MosRefOnly/bowtie_csp_counts_1000.txt"

In [None]:
# Rebuild the {sample_name: s3_url} mappings and per-sample frames for
# `filename` (a MosRefOnly file).
# Both conditions -- basename present and "MosRefOnly" present -- are tested on
# the SAME key. Testing them against the whole listing lets a sample pass when
# they match on different keys, and taking the first match then raises
# IndexError. Samples without a matching key are simply left out.
mosrefonly_matches = [
    [s for s in sample_files
     if os.path.basename(filename) in s and "MosRefOnly" in s]
    for sample_files in all_sample_files
]
summary_files_names = [
    {sample_names[i]: "s3://" + s3_bucket + "/" + matches[0]}
    for i, matches in enumerate(mosrefonly_matches)
    if matches
]
# Tab-separated, no header row.
summary_files_dfs = [pd.read_csv(x[list(x.keys())[0]], sep="\t", header=None) for x in summary_files_names]





In [None]:
# Same "_1_" filter as above, applied to a single per-sample frame.
i = 0
summary_files_dfs[i].loc[summary_files_dfs[i][0].str.contains("_1_")]

In [None]:
# Diagnostic: for every sample that has any MosRefOnly key, show which key
# would be picked for `filename`, or report that none matches.
# A bare dict expression inside a loop is evaluated and thrown away, so the
# result is printed explicitly; samples without a matching key are reported
# instead of aborting the loop with IndexError.
for i, x in enumerate(sample_dirs):
    if "MosRefOnly" in '\t'.join(all_sample_files[i]):
        matching_keys = [s for s in all_sample_files[i]
                         if os.path.basename(filename) in s and "MosRefOnly" in s]
        if matching_keys:
            print(i, {sample_names[i]: "s3://" + s3_bucket + "/" + matching_keys[0]})
        else:
            print(i, sample_names[i], "has no MosRefOnly key matching", os.path.basename(filename))




In [55]:
# Look at the full key listing of one sample (index 61).
i = 61
all_sample_files[i]
# Follow-up check, currently disabled -- keys of this sample matching `filename`:
# [s for s in all_sample_files[i] if os.path.basename(filename) in s and "MosRefOnly" in s]





['contig_quality/CMS001_water1_S11/Mos/CMS001_water1_S11Mos_bowtie_all_1000_counts.txt',
 'contig_quality/CMS001_water1_S11/Mos/CoverageSummaryStats.csv',
 'contig_quality/CMS001_water1_S11/Mos/CoverageSummaryStatsFinal.csv',
 'contig_quality/CMS001_water1_S11/Mos/contigList.txt',
 'contig_quality/CMS001_water1_S11/Mos/my.genome',
 'contig_quality/CMS001_water1_S11/Mos/perBaseCov.csv',
 'contig_quality/CMS001_water1_S11/Mos/sorted.bed',
 'contig_quality/CMS001_water1_S11/MosRefOnly/CMS001_water1_S11MosRefOnly.contigList.txt',
 'contig_quality/CMS001_water1_S11/MosRefOnly/CMS001_water1_S11MosRefOnly.my.genome',
 'contig_quality/CMS001_water1_S11/MosRefOnly/CMS001_water1_S11MosRefOnly.perBaseCov.csv',
 'contig_quality/CMS001_water1_S11/MosRefOnly/CMS001_water1_S11MosRefOnly.sorted.bed',
 'contig_quality/CMS001_water1_S11/MosRefOnly/CMS001_water1_S11MosRefOnly_CoverageSummaryStats.csv',
 'contig_quality/CMS001_water1_S11/MosRefOnly/CMS001_water1_S11MosRefOnly_CoverageSummaryStatsFinal.csv

In [None]:
# Tag every per-sample frame with its sample name (the single key of the
# matching mapping in summary_files_names) and stack them into one frame.
summary_files_df_all = pd.concat(
    [
        sample_df.assign(sample=list(summary_files_names[idx])[0])
        for idx, sample_df in enumerate(summary_files_dfs)
    ]
)






