In [8]:
import pandas as pd

In [9]:
!mkdir tmp

mkdir: tmp: File exists


In [10]:
# S3 prefix that holds the raw sequencing runs; each entry of `folders` is
# appended to it before the listing is requested.
sequences_prefix = "s3://czbiohub-mosquito/sequences/"
folders = [run + "_fastq.gz/" for run in ("CMS001", "CMS002")]

# S3 prefixes used to build the `output` / `signatures` columns of the
# sample sheets generated further down.
signature_prefix = "s3://czbiohub-mosquito/references/sourmash/signatures/"
comparison_prefix = "s3://czbiohub-mosquito/references/sourmash/comparison/"

In [11]:
def get_paired_reads(folder):
    tmpfile = 'tmp/samples.txt'

    ! mkdir -p tmp
    ! aws s3 ls {folder} > {tmpfile}
    
    fastqs = []
    for line in open(tmpfile):
        if 'fastq' in line:
            fastqs.append(line.split()[-1])
    read_ones = sorted([f for f in fastqs if 'R1' in f])
    read_twos = sorted([f for f in fastqs if 'R2' in f])
    for r1, r2 in zip(read_ones, read_twos):
        assert r1.split('R1')[0] == r2.split('R2')[0]
        
    return read_ones, read_twos

In [12]:
# Build the sourmash "compute" sample sheet: one row per R1/R2 pair found in
# each sequencing folder.
#
# Rows are collected in a list and the frame is created once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# appending row by row copies the whole frame on every iteration.
sample_records = []

for folder in folders:
    read_ones, read_twos = get_paired_reads(sequences_prefix + folder)

    for r1, r2 in zip(read_ones, read_twos):
        # Sample name: the part of the R1 file name before the 'R1' marker,
        # with leading/trailing underscores removed.
        name = r1.split('R1')[0].strip('_')
        sample_records.append({"id": name,
                               "read1": sequences_prefix + folder + r1,
                               "read2": sequences_prefix + folder + r2,
                               "name": name,
                               "output": signature_prefix + name + '.sig',
                               "trim_low_abundance_kmers": True,
                               "dna": True,
                               "protein": False,
                               "ksizes": "21,31,51",
                               "scaled": 500})

# Passing `columns` fixes the column order and still yields a correctly
# shaped (empty) frame when no read pairs were found.
df = pd.DataFrame(sample_records,
                  columns=["id", "read1", "read2", "name", "output",
                           "trim_low_abundance_kmers", "dna", "protein",
                           "ksizes", "scaled"])

In [13]:
# Preview the first rows of the sample sheet built in the previous cell.
df.head()

Unnamed: 0,id,read1,read2,name,output,trim_low_abundance_kmers,dna,protein,ksizes,scaled
0,CMS_0015_RNA_A_S13,s3://czbiohub-mosquito/sequences/CMS001_fastq....,s3://czbiohub-mosquito/sequences/CMS001_fastq....,CMS_0015_RNA_A_S13,s3://czbiohub-mosquito/references/sourmash/sig...,True,True,False,213151,500
1,CMS_001_RNA_A_S1,s3://czbiohub-mosquito/sequences/CMS001_fastq....,s3://czbiohub-mosquito/sequences/CMS001_fastq....,CMS_001_RNA_A_S1,s3://czbiohub-mosquito/references/sourmash/sig...,True,True,False,213151,500
2,CMS_002_RNA_A_S1,s3://czbiohub-mosquito/sequences/CMS001_fastq....,s3://czbiohub-mosquito/sequences/CMS001_fastq....,CMS_002_RNA_A_S1,s3://czbiohub-mosquito/references/sourmash/sig...,True,True,False,213151,500
3,CMS_003_RNA_A_S2,s3://czbiohub-mosquito/sequences/CMS001_fastq....,s3://czbiohub-mosquito/sequences/CMS001_fastq....,CMS_003_RNA_A_S2,s3://czbiohub-mosquito/references/sourmash/sig...,True,True,False,213151,500
4,CMS_004_RNA_A_S2,s3://czbiohub-mosquito/sequences/CMS001_fastq....,s3://czbiohub-mosquito/sequences/CMS001_fastq....,CMS_004_RNA_A_S2,s3://czbiohub-mosquito/references/sourmash/sig...,True,True,False,213151,500


In [14]:
# Write the complete sample sheet, without the index column.
samples_csv = '../sourmash/skeeters/compute/samples.csv'
df.to_csv(samples_csv, index=False)

## Split into groups of 10

Because `runbatch` seems to choke on large numbers of samples, we split the sample sheet into chunks of 10 rows and write each chunk to its own `compute_<i>/samples.csv`.

In [33]:
for i in range(len(df)//10 + 1):
    dir_name = '../sourmash/skeeters/compute_' + str(i)
    !mkdir {dir_name}
    df_slice = df.iloc[10*i:10*i + 10]
    df_slice.to_csv(dir_name + '/samples.csv', index = False)

mkdir: ../sourmash/skeeters/compute_0: File exists


In [32]:
# Spot check: count the lines of one chunk's sample sheet. The count
# includes the CSV header row (to_csv writes one by default).
! cat ../sourmash/skeeters/compute_12/samples.csv | wc -l

      10


# Compare

In [16]:
# Build the sourmash "compare" sample sheet: one row per k-mer size.
#
# NOTE: this rebinds `df`, replacing the per-sample sheet built earlier, so
# any cell that needs the per-sample sheet has to run before this one.
#
# Rows are collected first and the frame is created once; DataFrame.append
# was deprecated in pandas 1.4 and removed in 2.0.
compare_records = []

for k in ["21", "31", "51"]:
    name = "k" + k  # k is already a string, no str() needed
    compare_records.append({"id": name,
                            "ksize": k,
                            "signatures": signature_prefix,
                            "output": comparison_prefix + name + "abun.csv"})

df = pd.DataFrame(compare_records,
                  columns=["id", "ksize", "signatures", "output"])

In [17]:
# Preview the comparison sheet built in the previous cell.
df.head()

Unnamed: 0,id,ksize,signatures,output
0,k21,21,s3://czbiohub-mosquito/references/sourmash/sig...,s3://czbiohub-mosquito/references/sourmash/com...
1,k31,31,s3://czbiohub-mosquito/references/sourmash/sig...,s3://czbiohub-mosquito/references/sourmash/com...
2,k51,51,s3://czbiohub-mosquito/references/sourmash/sig...,s3://czbiohub-mosquito/references/sourmash/com...


In [18]:
# Write the comparison sheet, without the index column.
compare_csv = '../sourmash/skeeters/compare/samples.csv'
df.to_csv(compare_csv, index=False)