## Create segment files

In [52]:
import pandas as pd
import sys
import os
from Bio import SeqIO

#------------ INPUT ------------
# Replicate number and bin count for this run.
rep = 0
bins = 4
# Reference FASTA and the tab-separated segment table read below.
ref_file = "../100kb/ref_rep0.fasta"
seg_file = "../100kb/split/ref_seg_rep0.txt"

#------------ OUTPUT ------------
# Common prefix of the output files; the bin id and ".fasta" are appended later.
outfile_prefix = "../100kb/split/rep{}_seg".format(rep)

In [53]:
# Read the header-less, tab-separated segment table and name its columns.
segments = pd.read_csv(seg_file, sep='\t', header=None)
segments.columns = ["bin_id", "ref_id", "start", "length"]

# Materialise every record of the reference FASTA into a list.
reference_segments = [record for record in SeqIO.parse(ref_file, "fasta")]

In [54]:
# Write one FASTA file per row of the segment table.
# assuming a single reference sequence: every segment is cut from the first
# record of the reference file, regardless of the row's ref_id.
reference_seq = reference_segments[0].seq
for index, row in segments.iterrows():
    bin_id = row["bin_id"]
    ref_id = row["ref_id"]
    start = row["start"]
    length = row["length"]
    # The `with` block closes the file on exit, so no explicit close() is needed.
    with open(outfile_prefix + str(bin_id) + ".fasta", 'w') as f:
        # Header line: bin id, then the segment's origin as key=value pairs.
        f.write('>' + str(bin_id) + '\t' + "ref=" + str(ref_id) + ",start=" + str(start) + ",length=" + str(length) + '\n')
        f.write(str(reference_seq[start:start+length]))



## Analyse run-time

In [78]:
# Pick one
workdir = "4Gb"
#workdir = "16GB"

# Per-dataset settings. The branches are mutually exclusive and an unknown
# workdir fails loudly, so later cells never run with undefined or stale values.
if workdir == "4Gb":
    #--------- INPUT ---------
    error_rates = [0.05]
    ibf_bins = [8, 64, 1024, 8192, 16384]
    ref_len = 500000000 * 8
    nr_queries = 1703936
elif workdir == "16GB":
    #--------- INPUT ---------
    error_rates = [0.05]
    ibf_bins = [1024]
    ref_len = 2000000000 * 8
    nr_queries = 8192000
else:
    raise ValueError("Unknown workdir '" + str(workdir) + "'; expected '4Gb' or '16GB'")

#--------- OUTPUT ---------
# Same path pattern for every dataset, so it is built once after the branch.
outfile = "../" + workdir + "/table1.tsv"


import os
import pandas as pd

def read_time_output(file):
    """Load a tab-separated, header-less timing log and split off the command.

    The file must have exactly four columns, which are named
    "Time", "Memory", "Exitcode" and "Command".

    Parameters
    ----------
    file : path or file-like object accepted by ``pandas.read_csv``.

    Returns
    -------
    (time_df, command_df) : tuple of DataFrames
        ``time_df`` holds the "Time", "Memory" and "Exitcode" columns.
        ``command_df`` holds the "Command" string split on single spaces,
        one token per integer-labelled column.
    """
    raw = pd.read_csv(file, sep="\t", header=None)
    raw.columns = ["Time", "Memory", "Exitcode", "Command"]

    # One column per space-separated token of the command line.
    tokens = raw["Command"].str.split(' ', expand=True)
    measurements = raw.drop(columns="Command")

    return measurements, tokens

### Process Valik build time

In [79]:
# Time/memory records of the Valik build runs for the selected workdir.
valik_build_file = "../" + workdir + "/valik_build.time"
time_df, command_df = read_time_output(valik_build_file)

# Command token 2 is split on '/'; the first piece becomes the bin count
# and the second piece is discarded.
bin_nr_df = command_df[2].str.split('/', expand=True).drop(columns = 1)
bin_nr_df.columns = ["IBF bins"]

# Drop the tokens that are not needed and name the six that remain.
command_df = command_df.drop(columns = [0, 2, 3, 5, 7, 9, 11])
command_df.columns = ["process", "threads", "w", "k", "output", "IBF size"]

build_result = pd.concat([time_df, bin_nr_df, command_df], axis=1)

# Reference length divided by the number of bins, rounded to a whole number.
bin_size = [round(ref_len / int(bin_nr)) for bin_nr in build_result["IBF bins"]]
build_result["Bin size"] = bin_size

build_result

Unnamed: 0,Time,Memory,Exitcode,IBF bins,process,threads,w,k,output,IBF size,Bin size
0,248.38,616672,0,8,build,16,17,15,/dev/shm/8/valik.index,500M,500000000
1,42.09,653712,0,64,build,16,17,15,/dev/shm/64/valik.index,500M,62500000
2,25.07,549104,0,1024,build,16,17,15,/dev/shm/1024/valik.index,500M,3906250
3,56.54,537048,0,8192,build,16,17,15,/dev/shm/8192/valik.index,500M,488281
4,70.91,537912,0,16384,build,16,17,15,/dev/shm/16384/valik.index,500M,244141


### Process Valik search time

In [80]:
# Total search time
valik_search_file = "../" + workdir + "/valik_search.time"
time_df, command_df = read_time_output(valik_search_file)

# Command token 6 is split on '/'; the first piece becomes the bin count
# and the second piece is discarded.
bin_nr_df = command_df[6].str.split('/', expand=True).drop(columns = 1)
bin_nr_df.columns = ["IBF bins"]

# Drop the tokens that are not needed and name the seven that remain.
command_df = command_df.drop(columns = [0, 2, 3, 5, 6, 7, 8, 9, 11, 13, 15, 17])
command_df.columns = ["process", "input", "errors", "pattern", "overlap", "threads", "output"]

search_result = pd.concat([time_df, bin_nr_df, command_df], axis = 1)
# The bin count was parsed from text; make it numeric for the arithmetic below.
search_result["IBF bins"] = pd.to_numeric(search_result["IBF bins"])

# Number of queries divided by the number of bins, rounded to a whole number.
bin_queries = [round(nr_queries / bin_nr) for bin_nr in search_result["IBF bins"]]
search_result["Bin queries"] = bin_queries

search_result

Unnamed: 0,Time,Memory,Exitcode,IBF bins,process,input,errors,pattern,overlap,threads,output,Bin queries
0,29.51,9823320,0,8,search,/dev/shm/8/valik.index,2,100,98,16,8/search/e0.05.out,212992
1,19.22,9591384,0,64,search,/dev/shm/64/valik.index,2,100,98,16,64/search/e0.05.out,26624
2,47.93,9791396,0,1024,search,/dev/shm/1024/valik.index,2,100,98,16,1024/search/e0.05.out,1664
3,434.27,9869520,0,8192,search,/dev/shm/8192/valik.index,2,100,98,16,8192/search/e0.05.out,208
4,800.73,10010300,0,16384,search,/dev/shm/16384/valik.index,2,100,98,16,16384/search/e0.05.out,104


In [81]:
# Time for individual steps
# Read one per-step timing file per bin count and tag each frame with its bin count.
time_df_list = []
for b in ibf_bins:
    search_time_file = "../" + workdir + "/" + str(b) + "/search/e0.05.out.time"
    time_df = pd.read_csv(search_time_file, sep="\t")
    time_df["IBF bins"] = b
    time_df_list.append(time_df)

# Concatenate once, after the loop: doing it inside the loop re-copies every
# frame collected so far on each iteration and yields the same final result.
individual_search_result = pd.concat(time_df_list)

individual_search_result

Unnamed: 0,IBF I/O,Reads I/O,Bin-queries I/O,Compute,IBF bins
0,0.29,12.43,1.62,16.36,8
0,0.2,4.07,0.94,14.77,64
0,0.27,12.52,0.81,34.73,1024
0,0.25,12.57,0.97,420.34,8192
0,0.21,3.99,0.94,793.45,16384


In [82]:
# Attach the per-step timings to the total search timings, matched on bin count.
# Any step columns left over from an earlier run of this cell are dropped first,
# so re-running it does not create suffixed duplicates such as "Compute_x".
step_columns = [c for c in individual_search_result.columns if c != "IBF bins"]
search_result = (
    search_result
    .drop(columns = step_columns, errors = "ignore")
    .merge(individual_search_result, on = "IBF bins")
)
search_result

Unnamed: 0,Time,Memory,Exitcode,IBF bins,process,input,errors,pattern,overlap,threads,output,Bin queries,IBF I/O,Reads I/O,Bin-queries I/O,Compute
0,29.51,9823320,0,8,search,/dev/shm/8/valik.index,2,100,98,16,8/search/e0.05.out,212992,0.29,12.43,1.62,16.36
1,19.22,9591384,0,64,search,/dev/shm/64/valik.index,2,100,98,16,64/search/e0.05.out,26624,0.2,4.07,0.94,14.77
2,47.93,9791396,0,1024,search,/dev/shm/1024/valik.index,2,100,98,16,1024/search/e0.05.out,1664,0.27,12.52,0.81,34.73
3,434.27,9869520,0,8192,search,/dev/shm/8192/valik.index,2,100,98,16,8192/search/e0.05.out,208,0.25,12.57,0.97,420.34
4,800.73,10010300,0,16384,search,/dev/shm/16384/valik.index,2,100,98,16,16384/search/e0.05.out,104,0.21,3.99,0.94,793.45


### Process DREAM-Stellar alignment time

In [83]:
# Time for individual steps
# Read one header-less timing file per bin count and tag each frame with its bin count.

time_df_list = []
for b in ibf_bins:
    dream_time_file = "../" + workdir + "/dream_" + str(b) + "_parallel.time"
    time_df = pd.read_csv(dream_time_file, sep="\t", header = None)
    time_df.columns = ["Time", "Memory", "Exitcode", "Command"]
    time_df["IBF bins"] = b
    time_df_list.append(time_df)

# Concatenate once, after the loop: doing it inside the loop re-copies every
# frame collected so far on each iteration and yields the same final result.
alignment_result = pd.concat(time_df_list)

alignment_result

Unnamed: 0,Time,Memory,Exitcode,Command,IBF bins
0,3.68,820788,0,parallel --jobs 16,8
0,1.71,102600,0,parallel --jobs 16,64
0,4.13,17664,0,parallel --jobs 16,1024
0,441.35,18092,0,parallel --jobs 16,8192
0,59.8,17632,0,parallel --jobs 16,16384


### Process total DREAM-Stellar time

In [84]:
# Total DREAM-Stellar time/memory records; the parsed command tokens are not used here.
dream_file = "../{}/dream.time".format(workdir)
time_df, command_df = read_time_output(dream_file)

# Keep only the Time and Memory columns.
dream_result = time_df.drop(columns = ["Exitcode"])
dream_result

Unnamed: 0,Time,Memory
0,284.07,9823320
1,66.4,9591384
2,110.13,9791396
3,1270.96,9869520
4,1792.97,10010300


In [85]:
# Human-readable copy of the total DREAM-Stellar measurements.
dream_pretty = dream_result.copy()
# Rows are labelled positionally: row i is given the i-th entry of ibf_bins.
dream_pretty["IBF bins"] = ibf_bins
# Cast to float first and feed that result into the conversion. Previously the
# cast was stored and then overwritten by a conversion of the uncast column.
dream_seconds = dream_result["Time"].astype('float64')
dream_pretty["Time"] = pd.to_datetime(dream_seconds, unit='s').dt.strftime("%H:%M:%S")
dream_pretty["Memory (GB)"] = round(dream_result["Memory"]/10**6, 2) # to GB; time output is in KB
dream_pretty

Unnamed: 0,Time,Memory,IBF bins,Memory (GB)
0,00:04:44,9823320,8,9.82
1,00:01:06,9591384,64,9.59
2,00:01:50,9791396,1024,9.79
3,00:21:10,9869520,8192,9.87
4,00:29:52,10010300,16384,10.01


### Process original Stellar time

In [86]:
# Time/memory records of the original Stellar runs; the parsed command tokens are not used here.
stellar_file = "../" + workdir + "/stellar.time"
time_df, command_df = read_time_output(stellar_file)

# Keep only the Time and Memory columns. The former bare
# `stellar_result.mean(axis = 0)` statement was removed: its value was
# neither stored nor displayed.
stellar_result = time_df.drop(labels = ["Exitcode"], axis = 1)
stellar_result

Unnamed: 0,Time,Memory
0,3581.64,64377624
1,3601.03,64197628
2,3373.75,65086564
3,3520.75,65913976


In [87]:
# Human-readable copy of the Stellar measurements.
stellar_pretty = stellar_result.copy()
# Cast to float first and feed that result into the conversion. Previously the
# cast was stored and then overwritten by a conversion of the uncast column.
stellar_seconds = stellar_result["Time"].astype('float64')
stellar_pretty["Time"] = pd.to_datetime(stellar_seconds, unit='s').dt.strftime("%H:%M:%S")
# Memory divided by 10**6 and rounded to two decimals.
stellar_pretty["Memory (GB)"] = round(stellar_result["Memory"]/10**6, 2)
stellar_pretty

Unnamed: 0,Time,Memory,Memory (GB)
0,00:59:41,64377624,64.38
1,01:00:01,64197628,64.2
2,00:56:13,65086564,65.09
3,00:58:40,65913976,65.91


### Create tables

#### DREAM-Stellar runtime

In [90]:
# (table column, source frame, source column), in display order.
# Columns are combined positionally, so all source frames must list the
# bin counts in the same row order.
column_sources = [
    ('IBF bins', build_result, "IBF bins"),
    ('Total DREAM-Stellar time', dream_result, "Time"),
    ('IBF build', build_result, "Time"),
    ('IBF I/O', search_result, "IBF I/O"),
    ('Query I/O', search_result, "Reads I/O"),
    ('IBF search', search_result, "Compute"),
    ('Query distribution', search_result, "Bin-queries I/O"),
    ('DREAM alignment', alignment_result, "Time"),
]
data = {label: list(frame[source]) for label, frame, source in column_sources}

dream_detailed_time_table = pd.DataFrame(data)
dream_detailed_time_table

Unnamed: 0,IBF bins,Total DREAM-Stellar time,IBF build,IBF I/O,Query I/O,IBF search,Query distribution,DREAM alignment
0,8,284.07,248.38,0.29,12.43,16.36,1.62,3.68
1,64,66.4,42.09,0.2,4.07,14.77,0.94,1.71
2,1024,110.13,25.07,0.27,12.52,34.73,0.81,4.13
3,8192,1270.96,56.54,0.25,12.57,420.34,0.97,441.35
4,16384,1792.97,70.91,0.21,3.99,793.45,0.94,59.8


In [91]:
# convert to mm:ss
# Minutes are the total whole minutes and fractional seconds are truncated.
# The previous strftime("%M:%S") dropped the hour field, so any duration of
# an hour or more wrapped around (e.g. 3700 s was shown as "01:40").
# Missing values are passed through unchanged.
for column in dream_detailed_time_table.drop(labels=["IBF bins"], axis = 1):
    seconds = dream_detailed_time_table[column].astype('float64')
    dream_detailed_time_table[column] = seconds.map(
        lambda s: "{:02d}:{:02d}".format(*divmod(int(s), 60)),
        na_action = "ignore",
    )

dream_detailed_time_table

Unnamed: 0,IBF bins,Total DREAM-Stellar time,IBF build,IBF I/O,Query I/O,IBF search,Query distribution,DREAM alignment
0,8,04:44,04:08,00:00,00:12,00:16,00:01,00:03
1,64,01:06,00:42,00:00,00:04,00:14,00:00,00:01
2,1024,01:50,00:25,00:00,00:12,00:34,00:00,00:04
3,8192,21:10,00:56,00:00,00:12,07:00,00:00,07:21
4,16384,29:52,01:10,00:00,00:03,13:13,00:00,00:59


In [89]:
# Summary table: one row per bin count. Columns are combined positionally,
# so all source frames must list the bin counts in the same row order.
data = {'IBF bins': list(build_result["IBF bins"]),
        'Total time': list(dream_result["Time"]),
        'Valik construct': list(build_result["Time"]),
        'Valik search': list(search_result["Time"]),
        'DREAM alignment': list(alignment_result["Time"])}

dream_time_table = pd.DataFrame(data)
# convert to mm:ss
# Minutes are the total whole minutes and fractional seconds are truncated.
# The previous strftime("%M:%S") dropped the hour field, so any duration of
# an hour or more wrapped around (e.g. 3700 s was shown as "01:40").
# Missing values are passed through unchanged.
for column in dream_time_table.drop(labels=["IBF bins"], axis = 1):
    seconds = dream_time_table[column].astype('float64')
    dream_time_table[column] = seconds.map(
        lambda s: "{:02d}:{:02d}".format(*divmod(int(s), 60)),
        na_action = "ignore",
    )

dream_time_table

Unnamed: 0,IBF bins,Total time,Valik construct,Valik search,DREAM alignment
0,8,04:44,04:08,00:29,00:03
1,64,01:06,00:42,00:19,00:01
2,1024,01:50,00:25,00:47,00:04
3,8192,21:10,00:56,07:14,07:21
4,16384,29:52,01:10,13:20,00:59


#### DREAM-Stellar vs Stellar runtime and RAM

In [66]:
def performance_improvement(stellar, dream):
    """Return the change from ``stellar`` to ``dream`` as a percentage of ``stellar``.

    The result is negative when ``dream`` is smaller than ``stellar`` and
    positive when it is larger.
    """
    relative_change = (dream - stellar) / stellar
    return relative_change * 100

In [67]:
import numpy as np
# Mean of the "Time" column over all recorded rows of each tool.
stellar_time = stellar_result["Time"].mean()
dream_time = dream_result["Time"].mean()

# Percentage change from Stellar to DREAM; a negative value x means
# DREAM was |x| % faster than Stellar
performance_improvement(stellar_time, dream_time)

-98.54875552708468

In [69]:
# Mean of the "Memory" column, scaled by 10**6, over all recorded rows of each tool.
stellar_ram = (stellar_result["Memory"] / 10**6).mean()
dream_ram = (dream_result["Memory"] / 10**6).mean()

# Percentage change from Stellar to DREAM; a negative value x means
# DREAM used |x| % less RAM than Stellar
performance_improvement(stellar_ram, dream_ram)

-86.03299637673767