In [1]:
import os
import boto3
import subprocess
import pandas as pd
import s3fs

In [2]:
# Low-level S3 client used for the object listings below.
s3 = boto3.client('s3')
# Bucket that holds the per-sample contig_quality/ outputs read by this notebook.
s3_bucket = "czbiohub-mosquito"

In [3]:
# Sub-prefixes directly under contig_quality/ (one per sample, judging by the
# listings shown further down). A single list call returns at most 1000
# entries, so walk every page to avoid silently dropping prefixes; pages
# without sub-prefixes omit "CommonPrefixes", hence the .get(..., []).
sample_dirs = [
    common_prefix["Prefix"]
    for page in s3.get_paginator("list_objects_v2").paginate(
        Bucket=s3_bucket, Prefix="contig_quality/", Delimiter="/"
    )
    for common_prefix in page.get("CommonPrefixes", [])
]



In [4]:
# Sample name = last directory component of each listed prefix
# (dirname drops the trailing "/", basename keeps the final component).
sample_names = []
for prefix in sample_dirs:
    parent_dir = os.path.dirname(prefix)
    sample_names.append(os.path.basename(parent_dir))

In [5]:
# Every object key under each sample prefix, as one list per sample (same
# order as sample_dirs). Paginate so samples with more than 1000 objects are
# listed completely, and tolerate prefixes with no objects ("Contents" is
# absent from such pages).
s3_paginator = s3.get_paginator("list_objects_v2")
all_sample_files = [
    [
        obj["Key"]
        for page in s3_paginator.paginate(Bucket=s3_bucket, Prefix=sample_dir)
        for obj in page.get("Contents", [])
    ]
    for sample_dir in sample_dirs
]


In [6]:
# Display the full nested listing (one inner list of object keys per sample).
all_sample_files

[['contig_quality/CMS001_001_Ra_S1/Mos/CoverageSummaryStats.csv',
  'contig_quality/CMS001_001_Ra_S1/Mos/CoverageSummaryStatsFinal.csv',
  'contig_quality/CMS001_001_Ra_S1/Mos/bowtie_all_counts_1000.txt',
  'contig_quality/CMS001_001_Ra_S1/Mos/bowtie_cs_counts_1000.txt',
  'contig_quality/CMS001_001_Ra_S1/Mos/bowtie_csp_counts_1000.txt',
  'contig_quality/CMS001_001_Ra_S1/Mos/contigList.txt',
  'contig_quality/CMS001_001_Ra_S1/Mos/my.genome',
  'contig_quality/CMS001_001_Ra_S1/Mos/perBaseCov.csv',
  'contig_quality/CMS001_001_Ra_S1/Mos/sorted.bed',
  'contig_quality/CMS001_001_Ra_S1/MosRefOnly/CMS001_001_Ra_S1MosRefOnly.contigList.txt',
  'contig_quality/CMS001_001_Ra_S1/MosRefOnly/CMS001_001_Ra_S1MosRefOnly.my.genome',
  'contig_quality/CMS001_001_Ra_S1/MosRefOnly/CMS001_001_Ra_S1MosRefOnly.perBaseCov.csv',
  'contig_quality/CMS001_001_Ra_S1/MosRefOnly/CMS001_001_Ra_S1MosRefOnly.sorted.bed',
  'contig_quality/CMS001_001_Ra_S1/MosRefOnly/CMS001_001_Ra_S1MosRefOnly_CoverageSummaryStats.

In [7]:
# Per-sample files to concatenate, given relative to each sample's prefix.
# The order is significant: later cells select entries by position
# (summary_files[0], summary_files[13:]).
summary_files = [
    # BLAST / LCA tables
    "blast_lca_nr_filtered.m9",
    "blast_lca_nt_filtered.m9",
    "blast_nr_filtered.m9",
    "blast_nt_filtered.m9",
    "lca_nr.m9",
    "lca_nt.m9",
    # Mos/ outputs
    "Mos/CoverageSummaryStats.csv",
    "Mos/CoverageSummaryStatsFinal.csv",
    "Mos/bowtie_all_counts_1000.txt",
    "Mos/bowtie_cs_counts_1000.txt",
    "Mos/bowtie_csp_counts_1000.txt",
    "Mos/contigList.txt",
    "Mos/my.genome",
    # MosRefOnly/ outputs
    "MosRefOnly/CoverageSummaryStats.csv",
    "MosRefOnly/CoverageSummaryStatsFinal.csv",
    "MosRefOnly/bowtie_all_counts_1000.txt",
    "MosRefOnly/bowtie_cs_counts_1000.txt",
    "MosRefOnly/bowtie_csp_counts_1000.txt",
    "MosRefOnly/contigList.txt",
    "MosRefOnly/my.genome",
]
# Deliberately left out of the list: "Mos/perBaseCov.csv"

In [8]:
# One single-entry dict {sample name: s3:// URL} for every sample whose
# listing contains the first summary file (summary_files[0]).
# sample_names, sample_dirs and all_sample_files are parallel lists.
blast_lca_nr_filtered_files = [
    {name: "s3://" + s3_bucket + "/" + prefix + summary_files[0]}
    for name, prefix, keys in zip(sample_names, sample_dirs, all_sample_files)
    if (prefix + summary_files[0]) in keys
]



In [9]:
# Read each sample's tab-separated table from S3. Every dict holds exactly
# one {sample name: URL} entry, so its first value is the URL to read.
blast_lca_nr_files = [
    pd.read_csv(next(iter(name_to_url.values())), sep="\t")
    for name_to_url in blast_lca_nr_filtered_files
]

In [10]:
# Combine the per-sample tables into a single frame. When the files do not
# already carry a 'sample' column, tag each table with its sample name first.
if ('sample' in blast_lca_nr_files[0].columns):
    blast_lca_nr = pd.concat(blast_lca_nr_files)
else:
    # DataFrame.assign returns a new frame rather than modifying in place, so
    # the tagged frames have to be collected and concatenated; otherwise
    # blast_lca_nr would never be defined on this path.
    blast_lca_nr = pd.concat([
        x.assign(sample=list(blast_lca_nr_filtered_files[i].keys())[0])
        for i, x in enumerate(blast_lca_nr_files)
    ])

In [11]:
# Concatenated tables, keyed by the summary file's relative path.
all_dfs = dict()

In [12]:
# Build one concatenated table per summary file type and store it in all_dfs,
# keyed by the file's path relative to the sample prefix.
# NOTE(review): the [13:] slice limits this run to the "MosRefOnly/..." entries
# of summary_files - presumably the earlier entries were handled separately; confirm.
for filename in summary_files[13:]:
    print ("starting to process "+filename+" files.")
    base_name = os.path.basename(filename)

    # Collect one {sample name: s3:// URL} dict per sample that has this file.
    summary_files_names = []
    for i, sample_dir in enumerate(sample_dirs):
        if ("MosRefOnly" in filename):
            # MosRefOnly objects are matched by substring because their names
            # can carry a sample prefix (e.g. "<sample>MosRefOnly.contigList.txt").
            matches = [s for s in all_sample_files[i] if base_name in s and "MosRefOnly" in s]
        else:
            exact_key = sample_dir + filename
            matches = [exact_key] if exact_key in all_sample_files[i] else []
        # Only samples with an actual match are kept; indexing [0] on an empty
        # match list (basename present only outside MosRefOnly/) would raise IndexError.
        if matches:
            summary_files_names.append({sample_names[i]: "s3://" + s3_bucket + "/" + matches[0]})

    if not summary_files_names:
        # Nothing to read or concatenate for this file type.
        print ("no "+filename+" files found; skipping.")
        continue

    # Parser options by file type: CoverageSummaryStats CSVs have a header row,
    # other .csv files do not, .m9 files are tab-separated with a header, and
    # everything else is tab-separated without one.
    if ("CoverageSummaryStats" in filename):
        read_kwargs = {}
    elif (".csv" in filename):
        read_kwargs = {"header": None}
    elif (".m9" in filename):
        read_kwargs = {"sep": "\t"}
    else:
        read_kwargs = {"sep": "\t", "header": None}
    summary_files_dfs = [pd.read_csv(x[list(x.keys())[0]], **read_kwargs) for x in summary_files_names]
    print ("read in "+filename+" files.")

    # Tag each table with its sample name unless the file already has a 'sample' column.
    if ('sample' in summary_files_dfs[0].columns):
        summary_files_df_all = pd.concat(summary_files_dfs)
    else:
        summary_files_df_all = pd.concat([x.assign(sample=list(summary_files_names[i].keys())[0]) for i, x in enumerate(summary_files_dfs)])
    print ("concatenated "+filename+" files.")
    all_dfs[filename] = summary_files_df_all
        
        

starting to process MosRefOnly/CoverageSummaryStats.csv files.
read in MosRefOnly/CoverageSummaryStats.csv files.
concatenated MosRefOnly/CoverageSummaryStats.csv files.
starting to process MosRefOnly/CoverageSummaryStatsFinal.csv files.
read in MosRefOnly/CoverageSummaryStatsFinal.csv files.
concatenated MosRefOnly/CoverageSummaryStatsFinal.csv files.
starting to process MosRefOnly/bowtie_all_counts_1000.txt files.
read in MosRefOnly/bowtie_all_counts_1000.txt files.
concatenated MosRefOnly/bowtie_all_counts_1000.txt files.
starting to process MosRefOnly/bowtie_cs_counts_1000.txt files.
read in MosRefOnly/bowtie_cs_counts_1000.txt files.
concatenated MosRefOnly/bowtie_cs_counts_1000.txt files.
starting to process MosRefOnly/bowtie_csp_counts_1000.txt files.
read in MosRefOnly/bowtie_csp_counts_1000.txt files.
concatenated MosRefOnly/bowtie_csp_counts_1000.txt files.
starting to process MosRefOnly/contigList.txt files.
read in MosRefOnly/contigList.txt files.
concatenated MosRefOnly/co

ReadTimeoutError: Read timeout on endpoint URL: "None"

In [13]:
# File-system style S3 handle used for the uploads below; anon=False requests
# authenticated (non-anonymous) access.
s3_fs = s3fs.S3FileSystem(anon=False)

In [14]:
for filename in summary_files[13:]:
    upload_fn = s3_bucket+'/contig_quality_concat/'+filename
    print ("starting to upload "+filename+" file to s3://"+upload_fn)
    with s3_fs.open(upload_fn,'w') as f:
        if ("CoverageSummaryStats" in filename):
            all_dfs[filename].to_csv(f, index=False)
        elif (".csv" in filename):
            all_dfs[filename].to_csv(f, header=False, index=False)
        elif (".m9" in filename):
            all_dfs[filename].to_csv(f, sep="\t", index=False)
        else:
            all_dfs[filename].to_csv(f, sep="\t", header=False, index=False)
    print ("finished uploading "+filename+" files.")  
        

starting to upload MosRefOnly/CoverageSummaryStats.csv file to s3://czbiohub-mosquito/contig_quality_concat/MosRefOnly/CoverageSummaryStats.csv
finished uploading MosRefOnly/CoverageSummaryStats.csv files.
starting to upload MosRefOnly/CoverageSummaryStatsFinal.csv file to s3://czbiohub-mosquito/contig_quality_concat/MosRefOnly/CoverageSummaryStatsFinal.csv
finished uploading MosRefOnly/CoverageSummaryStatsFinal.csv files.
starting to upload MosRefOnly/bowtie_all_counts_1000.txt file to s3://czbiohub-mosquito/contig_quality_concat/MosRefOnly/bowtie_all_counts_1000.txt
finished uploading MosRefOnly/bowtie_all_counts_1000.txt files.
starting to upload MosRefOnly/bowtie_cs_counts_1000.txt file to s3://czbiohub-mosquito/contig_quality_concat/MosRefOnly/bowtie_cs_counts_1000.txt
finished uploading MosRefOnly/bowtie_cs_counts_1000.txt files.
starting to upload MosRefOnly/bowtie_csp_counts_1000.txt file to s3://czbiohub-mosquito/contig_quality_concat/MosRefOnly/bowtie_csp_counts_1000.txt
fini

KeyError: 'MosRefOnly/my.genome'

In [15]:
# Load the concatenated MosRefOnly bowtie_csp counts back from S3
# (tab-separated, no header row).
csp_counts = pd.read_csv(
    "s3://czbiohub-mosquito/contig_quality_concat/MosRefOnly/bowtie_csp_counts_1000.txt",
    header=None,
    sep="\t",
)
    

In [16]:
# Rows whose first column contains the substring "_1_".
first_col_has_1 = csp_counts[0].str.contains("_1_")
csp_counts[first_col_has_1]

Unnamed: 0,0,1,2
2584,NODE_1_length_10868_cov_37.868316,7036,CMS001_001_Ra_S1
11761,NODE_1_length_17565_cov_7.309984,1940,CMS001_002_Ra_S1
22615,NODE_1_length_12379_cov_17.273451,3688,CMS001_003_Ra_S2
31312,NODE_1_length_3148_cov_123.031911,5820,CMS001_004_Ra_S2
42301,NODE_1_length_5955_cov_3.041170,300,CMS001_005_Ra_S3
55072,NODE_1_length_10873_cov_36.361245,5432,CMS001_006_Ra_S5
61781,NODE_1_length_3832_cov_5.467377,256,CMS001_007_Ra_S12
68415,NODE_1_length_12382_cov_11.752133,2240,CMS001_008_Ra_S3
87394,NODE_1_length_10423_cov_75.085830,32872,CMS001_009_Ra_S13
97763,NODE_1_length_10647_cov_8.761022,1596,CMS001_010_Ra_S1


In [None]:
# Fix a single file type so the cells below can re-run the matching and
# reading steps for it in isolation.
filename="MosRefOnly/bowtie_csp_counts_1000.txt"

In [None]:
# Rebuild the {sample name: s3:// URL} list for `filename`, MosRefOnly objects only.
# Each sample's keys are filtered once and the sample is kept only when at
# least one key matches, so indexing [0] can never hit an empty list (which
# happens when the basename exists only outside the MosRefOnly directory).
base_name = os.path.basename(filename)
summary_files_names = [
    {sample_names[i]: "s3://"+s3_bucket+"/"+matches[0]}
    for i, matches in enumerate(
        [s for s in sample_files if base_name in s and "MosRefOnly" in s]
        for sample_files in all_sample_files
    )
    if matches
]
# Tab-separated, no header row; each dict holds exactly one URL.
summary_files_dfs = [pd.read_csv(x[list(x.keys())[0]], sep="\t", header=None) for x in summary_files_names]





In [None]:
# Look at one per-sample table: rows whose first column contains "_1_".
i = 0
sample_df = summary_files_dfs[i]
sample_df[sample_df[0].str.contains("_1_")]

In [None]:
# Debugging aid: walk the samples in order and, for each one whose listing
# mentions "MosRefOnly", print its index and then evaluate the same
# name -> URL expression used when building summary_files_names.
for i, x in enumerate(sample_dirs):
    if ("MosRefOnly" in '\t'.join(all_sample_files[i])):
       # The index is printed before the lookup, so the last number shown
       # identifies the sample being processed if the lookup below fails.
       print (i)
       # The resulting dict is discarded; the [0] raises IndexError when no
       # MosRefOnly key contains the basename of `filename`.
       {sample_names[i]:"s3://"+s3_bucket+"/"+[s for s in all_sample_files[i] if os.path.basename(filename) in s and "MosRefOnly" in s][0]}




In [55]:
# Show the full object listing for the sample at index 61.
i=61
all_sample_files[i]
# Alternative view, left disabled: only the MosRefOnly keys matching `filename`.
#[s for s in all_sample_files[i] if os.path.basename(filename) in s and "MosRefOnly" in s]





['contig_quality/CMS001_water1_S11/Mos/CMS001_water1_S11Mos_bowtie_all_1000_counts.txt',
 'contig_quality/CMS001_water1_S11/Mos/CoverageSummaryStats.csv',
 'contig_quality/CMS001_water1_S11/Mos/CoverageSummaryStatsFinal.csv',
 'contig_quality/CMS001_water1_S11/Mos/contigList.txt',
 'contig_quality/CMS001_water1_S11/Mos/my.genome',
 'contig_quality/CMS001_water1_S11/Mos/perBaseCov.csv',
 'contig_quality/CMS001_water1_S11/Mos/sorted.bed',
 'contig_quality/CMS001_water1_S11/MosRefOnly/CMS001_water1_S11MosRefOnly.contigList.txt',
 'contig_quality/CMS001_water1_S11/MosRefOnly/CMS001_water1_S11MosRefOnly.my.genome',
 'contig_quality/CMS001_water1_S11/MosRefOnly/CMS001_water1_S11MosRefOnly.perBaseCov.csv',
 'contig_quality/CMS001_water1_S11/MosRefOnly/CMS001_water1_S11MosRefOnly.sorted.bed',
 'contig_quality/CMS001_water1_S11/MosRefOnly/CMS001_water1_S11MosRefOnly_CoverageSummaryStats.csv',
 'contig_quality/CMS001_water1_S11/MosRefOnly/CMS001_water1_S11MosRefOnly_CoverageSummaryStatsFinal.csv

In [None]:
# Tag every per-sample table with its sample name (the single key of the
# matching dict in summary_files_names), then stack them into one frame.
tagged_dfs = [
    sample_df.assign(sample=list(summary_files_names[idx].keys())[0])
    for idx, sample_df in enumerate(summary_files_dfs)
]
summary_files_df_all = pd.concat(tagged_dfs)








In [18]:
# Load the concatenated blast_lca_nt_filtered table from S3
# (tab-separated, first row is the header).
blast_lca = pd.read_csv(
    "s3://czbiohub-mosquito/contig_quality_concat/blast_lca_nt_filtered.m9",
    header=0,
    sep="\t",
)
    

In [30]:
# Rows with taxid == 1 whose query name has an integer greater than 1000 in
# its fourth "_"-separated field (the value after "length" in names such as
# NODE_159_length_1882_cov_37.586150).
taxid_is_one = blast_lca["taxid"] == 1
fourth_field = blast_lca["query"].str.split("_").apply(lambda parts: int(parts[3]))
blast_lca[taxid_is_one & (fourth_field > 1000)]


Unnamed: 0,query,blast_type,sample,identity,align_length,mismatches,gaps,qstart,qend,sstart,send,bitscore,taxid
1557,NODE_1010_length_1048_cov_2.217302,nt,CMS001_009_Ra_S13,100.0,28,0,0,120,147,21196330,21196303,52.8,1
1773,NODE_159_length_1882_cov_37.586150,nt,CMS001_009_Ra_S13,93.986,1480,87,2,1,1479,3674326,3672848,2239.0,1
3140,NODE_242_length_1219_cov_1.785464,nt,CMS001_016_Ra_S6,91.667,48,4,0,712,759,3105,3058,67.6,1
3908,NODE_152_length_1364_cov_80.483294,nt,CMS001_025_Ra_S7,96.774,31,1,0,195,225,677,647,52.8,1
4305,NODE_276_length_1387_cov_1.358779,nt,CMS001_026_Ra_S18,94.444,36,2,0,1324,1359,23704069,23704034,56.5,1
4374,NODE_483_length_1137_cov_1.196226,nt,CMS001_026_Ra_S18,97.143,35,1,0,426,460,1407,1373,60.2,1
5536,NODE_68_length_1454_cov_3.039942,nt,CMS001_035_Ra_S20,91.03,981,81,6,459,1436,978,2,1317.0,1
5860,NODE_31_length_1319_cov_0.771337,nt,CMS001_038_Ra_S22,100.0,28,0,0,1229,1256,31928110,31928083,52.8,1
7635,NODE_140_length_2038_cov_4.057114,nt,CMS001_060_Ra_S12,97.619,42,1,0,1990,2031,633,592,73.1,1
7976,NODE_681_length_1231_cov_2.415945,nt,CMS001_060_Ra_S12,90.244,41,4,0,574,614,276521,276561,54.7,1


In [29]:
# Boolean mask: True for rows whose taxid equals 1.
taxid_equals_one = blast_lca["taxid"] == 1
taxid_equals_one

0        False
1        False
2        False
3        False
4        False
5        False
6        False
7        False
8        False
9        False
10        True
11       False
12       False
13       False
14       False
15       False
16       False
17       False
18       False
19       False
20       False
21       False
22       False
23        True
24        True
25       False
26       False
27       False
28       False
29       False
         ...  
16557    False
16558    False
16559    False
16560    False
16561    False
16562    False
16563    False
16564    False
16565    False
16566     True
16567    False
16568    False
16569    False
16570    False
16571    False
16572    False
16573     True
16574    False
16575     True
16576    False
16577    False
16578    False
16579    False
16580    False
16581    False
16582    False
16583    False
16584    False
16585     True
16586    False
Name: taxid, Length: 16587, dtype: bool

In [32]:
# Load one sample's blast_nt_filtered table directly from its
# contig_quality/ prefix (tab-separated, first row is the header).
blast_results = pd.read_csv(
    "s3://czbiohub-mosquito/contig_quality/CMS001_009_Ra_S13/blast_nt_filtered.m9",
    header=0,
    sep="\t",
)

In [39]:
# All rows for a single query name.
query_matches = blast_results["query"] == "NODE_1_length_10423_cov_75.085830"
blast_results[query_matches]

Unnamed: 0,query,subject,identity,align_length,mismatches,gaps,qstart,qend,sstart,send,evalue,bitscore,taxid,sci_name,common_name,subject_title,blast_type
