## Create segment files

In [52]:
import pandas as pd
import sys
import os
from Bio import SeqIO

# ------------ INPUT ------------
# Replicate number and bin count for this run.
rep = 0
bins = 4
# Reference FASTA and the tab-separated segment table derived from it.
ref_file = "../100kb/ref_rep0.fasta"
seg_file = "../100kb/split/ref_seg_rep0.txt"

# ------------ OUTPUT ------------
# Each segment file is named <outfile_prefix><bin_id>.fasta
outfile_prefix = f"../100kb/split/rep{rep}_seg"

In [53]:
# Segment table: tab-separated, no header row, so the column names are
# assigned explicitly after loading.
segment_columns = ["bin_id", "ref_id", "start", "length"]
segments = pd.read_csv(seg_file, sep='\t', header=None)
segments.columns = segment_columns

# Load every FASTA record of the reference into memory.
reference_segments = [record for record in SeqIO.parse(ref_file, "fasta")]

In [54]:
# Write one FASTA file per row of the segment table.
#
# Assumes a single reference sequence: every segment is sliced from the first
# record of the reference file; ref_id is only copied into the header line.
reference_seq = reference_segments[0].seq

# itertuples keeps each column's own dtype, so start/length remain integers
# that can be used directly as slice bounds.
for segment in segments.itertuples(index=False):
    bin_id = segment.bin_id
    ref_id = segment.ref_id
    start = segment.start
    length = segment.length

    header = '>' + str(bin_id) + '\t' + "ref=" + str(ref_id) + ",start=" + str(start) + ",length=" + str(length)

    # The with-statement closes the file, so no explicit close() is needed.
    with open(outfile_prefix + str(bin_id) + ".fasta", 'w') as f:
        f.write(header + '\n')
        # Terminate the sequence line so that concatenated segment files
        # remain valid FASTA.
        f.write(str(reference_seq[start:start + length]) + '\n')

## Analyse run-time

In [237]:
# --------- INPUT ---------
# Error rates and IBF bin counts covered by the benchmark.
error_rates = [0.05]
ibf_bins = [64, 1024, 8192, 16384]
# Total reference length and number of queries, used to derive per-bin sizes.
ref_len = 8 * 500000000
nr_queries = 1703936

# --------- OUTPUT ---------
workdir = "4Gb"
outfile = f"../{workdir}/table1.tsv"

import os
import pandas as pd

def read_time_output(file):
    """Load a tab-separated timing log and separate measurements from the command.

    The log is read without a header row and its four columns are labelled
    "Time", "Memory", "Exitcode" and "Command".

    Parameters
    ----------
    file : path or file-like object accepted by ``pandas.read_csv``.

    Returns
    -------
    (time_df, command_df)
        ``time_df`` holds the "Time", "Memory" and "Exitcode" columns;
        ``command_df`` holds the command line split on single spaces, one
        token per integer-labelled column.
    """
    raw = pd.read_csv(file, sep="\t", header=None)
    raw.columns = ["Time", "Memory", "Exitcode", "Command"]

    # One column per space-separated token of the recorded command line.
    tokens = raw["Command"].str.split(' ', expand=True)
    measurements = raw.drop(columns="Command")

    return measurements, tokens

### Process Valik build time

In [238]:
# Timing log of the Valik index builds.
valik_build_file = f"../{workdir}/valik_build.time"
time_df, command_df = read_time_output(valik_build_file)

# Command token 2 is split on '/'; its first piece is kept as the bin count.
bin_nr_df = command_df[2].str.split('/', expand=True).drop(columns=1)
bin_nr_df.columns = ["IBF bins"]

# Drop the tokens that are not reported and label the remaining ones.
command_df = command_df.drop(columns=[0, 2, 3, 5, 7, 9, 11])
command_df.columns = ["process", "threads", "w", "k", "output", "IBF size"]

build_result = pd.concat([time_df, bin_nr_df, command_df], axis=1)

# Reference length divided by the bin count, rounded to whole bases.
bin_size = [round(ref_len / int(bin_nr)) for bin_nr in build_result["IBF bins"]]
build_result["Bin size"] = bin_size

build_result

Unnamed: 0,Time,Memory,Exitcode,IBF bins,process,threads,w,k,output,IBF size,Bin size
0,42.09,653712,0,64,build,16,17,15,/dev/shm/64/valik.index,500M,62500000
1,38.02,548804,0,1024,build,16,17,15,/dev/shm/1024/valik.index,500M,3906250
2,56.54,537048,0,8192,build,16,17,15,/dev/shm/8192/valik.index,500M,488281
3,70.91,537912,0,16384,build,16,17,15,/dev/shm/16384/valik.index,500M,244141


### Process Valik search time

In [239]:
# Total search time
valik_search_file = f"../{workdir}/valik_search.time"
time_df, command_df = read_time_output(valik_search_file)

# Command token 6 is split on '/'; its first piece is kept as the bin count.
bin_nr_df = command_df[6].str.split('/', expand=True).drop(columns=1)
bin_nr_df.columns = ["IBF bins"]

# Drop the tokens that are not reported and label the remaining ones.
command_df = command_df.drop(columns=[0, 2, 3, 5, 6, 7, 8, 9, 11, 13, 15, 17])
command_df.columns = ["process", "input", "errors", "pattern", "overlap", "threads", "output"]

search_result = pd.concat([time_df, bin_nr_df, command_df], axis=1)
# The bin counts come from string splitting, so convert them to numbers.
search_result["IBF bins"] = pd.to_numeric(search_result["IBF bins"])

# Number of queries divided by the bin count, rounded to a whole number.
bin_queries = [round(nr_queries / bin_nr) for bin_nr in search_result["IBF bins"]]
search_result["Bin queries"] = bin_queries

search_result

Unnamed: 0,Time,Memory,Exitcode,IBF bins,process,input,errors,pattern,overlap,threads,output,Bin queries
0,19.22,9591384,0,64,search,/dev/shm/64/valik.index,2,100,98,16,64/search/e0.05.out,26624
1,49.08,9708964,0,1024,search,/dev/shm/1024/valik.index,2,100,98,16,1024/search/e0.05.out,1664
2,434.27,9869520,0,8192,search,/dev/shm/8192/valik.index,2,100,98,16,8192/search/e0.05.out,208
3,800.73,10010300,0,16384,search,/dev/shm/16384/valik.index,2,100,98,16,16384/search/e0.05.out,104


In [240]:
# Time for individual steps
# One per-step timing file is read for each IBF bin count and tagged with
# that bin count so the rows can be told apart after concatenation.
time_df_list = []
for b in ibf_bins:
    search_time_file = "../" + workdir + "/" + str(b) + "/search/e0.05.out.time"
    time_df = pd.read_csv(search_time_file, sep="\t")
    time_df["IBF bins"] = b
    time_df_list.append(time_df)

# Concatenate once after the loop: the frames are not re-copied on every
# iteration, and the result is always assigned by this cell.
individual_search_result = pd.concat(time_df_list)

individual_search_result

Unnamed: 0,IBF I/O,Reads I/O,Bin-queries I/O,Compute,IBF bins
0,0.2,4.07,0.94,14.77,64
0,0.21,10.36,0.82,38.22,1024
0,0.25,12.57,0.97,420.34,8192
0,0.21,3.99,0.94,793.45,16384


In [241]:
# Attach the per-step timings to the total search timings, matching rows on
# the bin count. NOTE: this overwrites search_result, so the cell that builds
# search_result must be re-run before running this one again.
search_result = pd.merge(search_result, individual_search_result, on="IBF bins")
search_result

Unnamed: 0,Time,Memory,Exitcode,IBF bins,process,input,errors,pattern,overlap,threads,output,Bin queries,IBF I/O,Reads I/O,Bin-queries I/O,Compute
0,19.22,9591384,0,64,search,/dev/shm/64/valik.index,2,100,98,16,64/search/e0.05.out,26624,0.2,4.07,0.94,14.77
1,49.08,9708964,0,1024,search,/dev/shm/1024/valik.index,2,100,98,16,1024/search/e0.05.out,1664,0.21,10.36,0.82,38.22
2,434.27,9869520,0,8192,search,/dev/shm/8192/valik.index,2,100,98,16,8192/search/e0.05.out,208,0.25,12.57,0.97,420.34
3,800.73,10010300,0,16384,search,/dev/shm/16384/valik.index,2,100,98,16,16384/search/e0.05.out,104,0.21,3.99,0.94,793.45


### Process original Stellar time

In [257]:
# Timing log of the original Stellar runs.
stellar_file = "../" + workdir + "/stellar.time"
time_df, command_df = read_time_output(stellar_file)

# Keep only the time and memory measurements. The previous bare
# `stellar_result.mean(axis = 0)` statement was removed: its value was neither
# displayed nor stored, so it had no effect.
stellar_result = time_df.drop(labels = ["Exitcode"], axis = 1)
stellar_result

Unnamed: 0,Time,Memory
0,3581.64,64377624
1,3601.03,64197628
2,3373.75,65086564
3,3520.75,65913976


### Process DREAM-Stellar alignment time

In [243]:
# Time of the parallel alignment step
# One timing file (dream_<bins>_parallel.time) is read for each IBF bin count
# and tagged with that bin count.
time_df_list = []
for b in ibf_bins:
    dream_time_file = "../" + workdir + "/dream_" + str(b) + "_parallel.time"
    time_df = pd.read_csv(dream_time_file, sep="\t", header = None)
    time_df.columns = ["Time", "Memory", "Exitcode", "Command"]
    time_df["IBF bins"] = b
    time_df_list.append(time_df)

# Concatenate once after the loop: the frames are not re-copied on every
# iteration, and the result is always assigned by this cell.
alignment_result = pd.concat(time_df_list)

alignment_result

Unnamed: 0,Time,Memory,Exitcode,Command,IBF bins
0,1.71,102600,0,parallel --jobs 16,64
0,4.69,17728,0,parallel --jobs 16,1024
0,441.35,18092,0,parallel --jobs 16,8192
0,59.8,17632,0,parallel --jobs 16,16384


### Process total DREAM-Stellar time

In [250]:
# Timing log of the complete DREAM-Stellar runs.
dream_file = f"../{workdir}/dream.time"
time_df, command_df = read_time_output(dream_file)

# Only time and memory are reported; the exit code is dropped.
dream_result = time_df.drop(columns=["Exitcode"])
dream_result

Unnamed: 0,Time,Memory
0,66.4,9591384
1,114.62,9708964
2,1270.96,9869520
3,1792.97,10010300


### Create tables

#### DREAM-Stellar runtime

In [245]:
# (table column, source frame, source column) for every column of the table.
# Values are combined by position, so all source frames must list the bin
# counts in the same row order.
column_sources = [
    ('IBF bins', build_result, "IBF bins"),
    ('Total DREAM-Stellar time', dream_result, "Time"),
    ('IBF build', build_result, "Time"),
    ('IBF I/O', search_result, "IBF I/O"),
    ('Query I/O', search_result, "Reads I/O"),
    ('IBF search', search_result, "Compute"),
    ('Query distribution', search_result, "Bin-queries I/O"),
    ('DREAM alignment', alignment_result, "Time"),
]
data = {label: list(frame[column]) for label, frame, column in column_sources}

dream_detailed_time_table = pd.DataFrame(data)
dream_detailed_time_table

Unnamed: 0,IBF bins,Total DREAM-Stellar time,IBF build,IBF I/O,Query I/O,IBF search,Query distribution,DREAM alignment
0,64,66.4,42.09,0.2,4.07,14.77,0.94,1.71
1,1024,114.62,38.02,0.21,10.36,38.22,0.82,4.69
2,8192,1270.96,56.54,0.25,12.57,420.34,0.97,441.35
3,16384,1792.97,70.91,0.21,3.99,793.45,0.94,59.8


In [246]:
# Values are combined by position, so all source frames must list the bin
# counts in the same row order.
data = {'IBF bins': list(build_result["IBF bins"]),
        'Total time': list(dream_result["Time"]),
        'Valik construct': list(build_result["Time"]),
        'Valik search': list(search_result["Time"]),
        'DREAM alignment': list(alignment_result["Time"])}

dream_time_table = pd.DataFrame(data)
# convert to mm:ss
# Minutes are computed arithmetically rather than with strftime("%M:%S"),
# which wraps around at one hour (3601 s would be shown as "00:01").
# Fractional seconds are truncated, matching the datetime-based formatting.
for column in dream_time_table.columns.drop("IBF bins"):
    whole_seconds = dream_time_table[column].astype('float64').astype('int64')
    minutes = (whole_seconds // 60).map("{:02d}".format)
    seconds = (whole_seconds % 60).map("{:02d}".format)
    dream_time_table[column] = minutes + ":" + seconds

dream_time_table

Unnamed: 0,IBF bins,Total time,Valik construct,Valik search,DREAM alignment
0,64,01:06,00:42,00:19,00:01
1,1024,01:54,00:38,00:49,00:04
2,8192,21:10,00:56,07:14,07:21
3,16384,29:52,01:10,13:20,00:59


#### DREAM-Stellar vs Stellar runtime and RAM

In [251]:
# NOTE: this cell rewrites dream_result in place (Time becomes a string,
# Memory is rescaled), so the cell that builds dream_result must be re-run
# before running this one again.
dream_result["IBF bins"] = ibf_bins

# Format the time as mm:ss. Minutes are computed arithmetically rather than
# with strftime("%M:%S"), which wraps around at one hour (3601 s would be
# shown as "00:01"). Fractional seconds are truncated.
whole_seconds = dream_result["Time"].astype('float64').astype('int64')
dream_result["Time"] = (whole_seconds // 60).map("{:02d}".format) + ":" + (whole_seconds % 60).map("{:02d}".format)

# Scale the memory measurement down by 10**6 and keep two decimals.
dream_result["Memory"] = round(dream_result["Memory"]/10**6, 2)
dream_result

Unnamed: 0,Time,Memory,IBF bins
0,01:06,9.59,64
1,01:54,9.71,1024
2,21:10,9.87,8192
3,29:52,10.01,16384


In [273]:
# Show the Memory column of dream_result as a plain Python list.
list(dream_result["Memory"])

[9.59, 9.71, 9.87, 10.01]

In [274]:
# Show the Time column of dream_result as a plain Python list.
list(dream_result["Time"])

['01:06', '01:54', '21:10', '29:52']

In [284]:
def performance_improvement(stellar, dream):
    """Return the change of ``dream`` relative to ``stellar``, in percent.

    The result is negative when ``dream`` is smaller than ``stellar`` and
    positive when it is larger.
    """
    relative_change = (dream - stellar) / stellar
    return relative_change * 100

In [286]:
# Mean Stellar run time over all recorded runs. Series.mean() is used instead
# of np.mean because no cell shown in this notebook imports numpy.
stellar_time = stellar_result["Time"].mean()
# Hard-coded total DREAM-Stellar time in seconds; 66.40 is the value recorded
# in the first row (64 bins) of the total DREAM-Stellar time output.
dream_time = 66.40

# DREAM was x % faster than Stellar
performance_improvement(stellar_time, dream_time)

-98.11325713904144

In [288]:
# Mean Stellar memory over all recorded runs, scaled down by 10**6.
# Series.mean() is used instead of np.mean because no cell shown in this
# notebook imports numpy.
stellar_ram = (stellar_result["Memory"] / 10**6).mean()
# Hard-coded DREAM-Stellar memory; 9.59 is the first value of the rescaled
# Memory column shown in the dream_result output.
dream_ram = 9.59

# DREAM used x % less RAM than Stellar
performance_improvement(stellar_ram, dream_ram)

-85.2220425855428