## Create segment files

In [52]:
import pandas as pd
import sys
import os
from Bio import SeqIO

#------------ INPUT ------------
# Reference FASTA and the tab-separated segment table that describes how the
# reference is split into bins.
ref_file = "../100kb/ref_rep0.fasta"
seg_file = "../100kb/split/ref_seg_rep0.txt"
rep = 0
bins = 4

#------------ OUTPUT ------------
# One FASTA file is written per segment: <outfile_prefix><bin_id>.fasta
outfile_prefix = f"../100kb/split/rep{rep}_seg"

In [53]:
# Segment table: header-less, tab-separated; the columns are named here.
segments = pd.read_csv(seg_file, sep='\t', header=None)
segments.columns = ["bin_id", "ref_id", "start", "length"]

# Materialise every record of the reference FASTA so it can be indexed.
reference_segments = [record for record in SeqIO.parse(ref_file, "fasta")]

In [54]:
# Write one FASTA file per row of the segment table.
# assuming a single reference sequence: every segment is cut from the first
# record of the reference file, whatever its ref_id says.
for index, row in segments.iterrows():
    bin_id, ref_id, start, length = (
        row[key] for key in ("bin_id", "ref_id", "start", "length")
    )
    header = f">{bin_id}\tref={ref_id},start={start},length={length}\n"
    sequence = str(reference_segments[0].seq[start:start+length])
    # the with-statement closes the file; no explicit close() is needed
    with open(f"{outfile_prefix}{bin_id}.fasta", 'w') as segment_file:
        segment_file.write(header)
        segment_file.write(sequence)

## Distribute queries

In [77]:
import numpy as np
import pandas as pd
from Bio import SeqIO

#------------ INPUT ------------
bins = 64
rep = 0
er = 0.05

# Bin ids as integers and as zero-padded strings; the padding width is the
# number of digits of `bins` itself.
bin_int_list = list(range(bins))
bin_str_list = [f"{b:0{len(str(bins))}d}" for b in bin_int_list]

query_file = f"../10Mb/rep{rep}/queries/e{er}.fastq"
search_file = f"../10Mb/rep{rep}/search/e{er}.out"

#------------ OUTPUT ------------
# Directory prefix under which the per-bin query files are written.
output_prefix = f"../10Mb/rep{rep}/queries/"

In [78]:
# Search output: header-less, tab-separated, one row per read with a
# comma-separated list of matching bins in the second column.
matches = pd.read_csv(search_file, sep='\t', header=None)
matches.columns = ["read_id", "matches"]

# Turn the comma-separated list into one (read_id, bin_id) row per match.
matches["bin_id"] = matches["matches"].str.split(',')
matches = matches[["read_id", "bin_id"]].explode("bin_id")

# drop empty rows (blank entries and missing values)
matches = matches.replace(r'^\s*$', np.nan, regex=True).dropna()
matches = matches.reset_index(drop=True)
matches["bin_id"] = pd.to_numeric(matches["bin_id"])
matches.head()

Unnamed: 0,read_id,bin_id
0,0,0
1,0,3
2,0,12
3,1,0
4,2,0


In [80]:
# Distribute each query to the FASTA file of every bin it matched.
from contextlib import ExitStack

# Look-up table read_id -> bin ids (in table order), built once so the whole
# match table is not re-filtered for every query.
bins_by_read = {}
for read_id, bin_id in zip(matches["read_id"], matches["bin_id"]):
    bins_by_read.setdefault(read_id, []).append(bin_id)

queries = list(SeqIO.parse(query_file, "fastq"))

# ExitStack closes every bin file that was opened, also when opening a later
# file or writing a record raises.
with ExitStack() as stack:
    # open all output bin query files
    bin_query_files = [
        stack.enter_context(open(f"{output_prefix}bin_{b}_e{er}.fasta", 'w'))
        for b in bin_str_list
    ]

    # the aim is to only scan over query file once
    # NOTE(review): only the first five queries are distributed and every
    # match is printed -- this looks like a debugging leftover; confirm
    # before relying on the per-bin files being complete.
    for query in queries[0:5]:
        for bin_id in bins_by_read.get(int(query.name), ()):
            print(bin_id)
            SeqIO.write(query, bin_query_files[bin_id], "fasta")

0
3
12
0
0
0
2
0


## Analyse run-time table

In [178]:
#--------- INPUT ---------
# number of repetitions to average over (rep0 .. rep<n-1>)
n = 1
# error rates to tabulate; each one selects benchmark files ending in
# "e<rate>.txt"
error_rates = [0, 0.025, 0.05]

#--------- OUTPUT ---------
# path of the tab-separated run-time table that is written at the end
outfile = "../10Mb/table1.tsv"

import os
import pandas as pd

# gather benchmarks for steps that are shared between all error rates
def gather_benchmarks(tool, rep, prefixes):
    """Return the run-time of a benchmark that is shared by all error rates.

    Scans ``../10Mb/benchmarks/rep<rep>/<tool>`` for regular files whose name
    ends with one of ``prefixes`` and reads the first value of column ``s``
    from the tab-separated benchmark file.

    Parameters
    ----------
    tool : str
        Name of the benchmark sub-directory.
    rep : int
        Repetition number, used to build the directory path.
    prefixes : iterable of str
        File-name endings to accept (matched with ``str.endswith`` despite
        the parameter name).

    Returns
    -------
    The first value of column ``s`` of the matching file. If several files
    match, the value of the last one returned by the directory scan is used.

    Raises
    ------
    FileNotFoundError
        If the directory does not exist or contains no matching file.
    """
    dir_path = "../10Mb/benchmarks/rep" + str(rep) + "/" + tool
    suffixes = tuple(prefixes)
    rep_runtime = None
    # the with-statement closes the directory iterator on exit
    with os.scandir(dir_path) as it:
        for entry in it:
            if entry.name.endswith(suffixes) and entry.is_file():
                benchmark = pd.read_csv(entry.path, sep="\t")
                rep_runtime = benchmark["s"].iloc[0]
    if rep_runtime is None:
        # fail with a clear message instead of an UnboundLocalError
        raise FileNotFoundError(
            "no benchmark file ending in " + repr(suffixes) + " found in " + dir_path
        )
    return rep_runtime

# steps that are repeated for each error rate
def gather_er_benchmarks(tool, rep, prefixes, rates=None):
    """Return the summed benchmark run-time per error rate for one repetition.

    For every error rate, the first value of column ``s`` is read from each
    regular file in ``../10Mb/benchmarks/rep<rep>/<tool>`` whose name ends
    with one of ``prefixes`` or with ``e<rate>.txt``, and the values are
    summed.

    Parameters
    ----------
    tool : str
        Name of the benchmark sub-directory.
    rep : int
        Repetition number, used to build the directory path.
    prefixes : iterable of str
        Additional file-name endings (matched with ``str.endswith``) that are
        counted for every error rate.
    rates : iterable, optional
        Error rates to evaluate. Defaults to the module-level ``error_rates``.

    Returns
    -------
    list
        One sum per error rate, in the order of ``rates``; 0 for a rate
        without any matching file.
    """
    if rates is None:
        rates = error_rates
    dir_path = "../10Mb/benchmarks/rep" + str(rep) + "/" + tool
    shared_suffixes = tuple(prefixes)

    # List the directory once; the with-statement closes the iterator.
    with os.scandir(dir_path) as it:
        benchmark_files = [(entry.name, entry.path) for entry in it if entry.is_file()]

    total_runtimes = []
    # summation of run-times from a single repetition
    for er in rates:
        suffixes = shared_suffixes + ("e" + str(er) + ".txt",)
        rep_er_runtimes = [
            pd.read_csv(path, sep="\t")["s"].iloc[0]
            for name, path in benchmark_files
            if name.endswith(suffixes)
        ]
        total_runtimes.append(sum(rep_er_runtimes))
    return total_runtimes

In [179]:
# Each method is described by a tuple of
# (benchmark sub-directory, extra file-name endings, column label).

# steps whose run-time is shared between all error rates
valik_build_tuple = ("valik", ["build.txt"], "Valik-build")
shared_tuple_list = [valik_build_tuple]

# steps that are benchmarked once per error rate
valik_search_tuple = ("valik", [], "Valik-search")
distribution_tuple = ("distribute", [], "Distributing-search")
distributed_stellar_tuple = ("dream_stellar", [], "Distributed-Stellar")
stellar_tuple = ("stellar", [], "Stellar")
with_errors_tuple_list = [
    valik_search_tuple,
    distribution_tuple,
    distributed_stellar_tuple,
    stellar_tuple,
]

# result table: starts with one row per error rate
data = {'Error-rate': error_rates}
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,Error-rate
0,0.0
1,0.025
2,0.05


In [180]:
# Add one column per shared step: the mean run-time over all repetitions,
# rounded to two decimals.
for tool, suffixes, label in shared_tuple_list:
    # Keep track of the temporary per-repetition columns explicitly, so that
    # only these are averaged and removed (a regex on 'rep' would also drop
    # any other column whose label happens to contain "rep").
    rep_cols = []
    for rep in range(n):
        rep_col = "rep" + str(rep)
        # scalar run-time, broadcast to every error-rate row
        df[rep_col] = gather_benchmarks(tool, rep, suffixes)
        rep_cols.append(rep_col)

    df[label] = df[rep_cols].mean(axis=1).round(2)
    df = df.drop(columns=rep_cols)

df

Unnamed: 0,Error-rate,Valik-build
0,0.0,8.46
1,0.025,8.46
2,0.05,8.46


In [181]:
# Add one column per error-rate dependent step: the mean run-time over all
# repetitions, rounded to two decimals.
for tool, suffixes, label in with_errors_tuple_list:
    # Keep track of the temporary per-repetition columns explicitly, so that
    # only these are averaged and removed (a regex on 'rep' would also drop
    # any other column whose label happens to contain "rep").
    rep_cols = []
    for rep in range(n):
        rep_col = "rep" + str(rep)
        # one summed run-time per error rate, in row order
        df[rep_col] = gather_er_benchmarks(tool, rep, suffixes)
        rep_cols.append(rep_col)

    df[label] = df[rep_cols].mean(axis=1).round(2)
    df = df.drop(columns=rep_cols)

# NOTE(review): both Stellar columns are divided by 8 -- presumably the
# number of threads/parallel jobs used for these runs; confirm.
df[stellar_tuple[2]] = round(df[stellar_tuple[2]] / 8, 2)
df[distributed_stellar_tuple[2]] = round(df[distributed_stellar_tuple[2]] / 8, 2)

In [186]:
# Total DREAM-Stellar run-time: distribution + build + search + distributed
# Stellar, rounded to two decimals.
df["DREAM-Stellar"] = (
    df["Distributing-search"]
    .add(df["Valik-build"])
    .add(df["Valik-search"])
    .add(df["Distributed-Stellar"])
    .round(2)
)

# reorder columns: swap the last two
cols = df.columns.tolist()
cols[-2], cols[-1] = cols[-1], cols[-2]
df = df[cols]

# write the table tab-separated, then display it
df.to_csv(outfile, sep='\t')
df

Unnamed: 0,Error-rate,Valik-build,Valik-search,Distributing-search,Distributed-Stellar,DREAM-Stellar,Stellar
0,0.0,8.46,2.83,12.57,13.33,37.19,27.8
1,0.025,8.46,0.99,2.77,10.94,23.16,13.02
2,0.05,8.46,1.6,6.1,21.19,37.35,19.72


In [None]:
# TODO: make a table of time output
# TODO: why is the 100Mb not working on the cluster?
# TODO: use the new Stellar version for DREAM-Stellar and conda version for Stellar