# Gather run-time and memory usage

In [3]:
# Name of the benchmark directory; it also prefixes every output table.
workdir = "4Gb"

#--------- INPUT ---------
er = 0.05
bins = [8, 64, 1024, 8192, 16384]

#--------- OUTPUT ---------
# All result tables are written next to the benchmark directory.
bin_nr_out = "../{}_metagenome_bin_nr.tsv".format(workdir)
dream_bin_nr_out = "../{}_dream_bin_nr.tsv".format(workdir)
detailed_search_out = "../{}_detailed_dream.tsv".format(workdir)
detailed_memory_out = "../{}_memory_usage.tsv".format(workdir)


import os
import pandas as pd
import numpy as np

### Valik search app internal run-time results 

In [4]:
# Read the search timing report of every bin count and stack them into one table.
dfs = []
for b in bins:
    detailed_search_file = "../{}/{}/search/e{}.out.time".format(workdir, b, er)

    df = pd.read_csv(detailed_search_file, sep="\t")
    df["#bins"] = b
    # Rotate the column order by one so the last column becomes the first.
    cols = list(df.columns)
    cols.insert(0, cols.pop())
    df = df[cols]
    dfs.append(df)

detailed_search = pd.concat(dfs)
detailed_search

Unnamed: 0,#bins,IBF I/O,Reads I/O,Bin-queries I/O,Compute
0,8,0.29,12.43,1.62,16.36
0,64,0.2,4.07,0.94,14.77
0,1024,0.21,10.36,0.82,38.22
0,8192,0.25,12.57,0.97,420.34
0,16384,0.21,3.99,0.94,793.45


In [5]:
# Append the unit to every column name except "#bins", then store the table.
detailed_search.columns = [
    name if name == "#bins" else name + " (sec)"
    for name in detailed_search.columns
]

detailed_search.to_csv(detailed_search_out, sep="\t")
detailed_search

Unnamed: 0,#bins,IBF I/O (sec),Reads I/O (sec),Bin-queries I/O (sec),Compute (sec)
0,8,0.29,12.43,1.62,16.36
0,64,0.2,4.07,0.94,14.77
0,1024,0.21,10.36,0.82,38.22
0,8192,0.25,12.57,0.97,420.34
0,16384,0.21,3.99,0.94,793.45


### /usr/bin/time run-time results for DREAM-Stellar alignment (GNU parallel)

In [6]:
# Base names of the timing logs; get_filepath() appends the ".time" suffix.
build_name, search_name = "valik_build", "valik_search"
dream_total, stellar_total = "dream", "stellar"

def get_filepath(name):
    """Return the path of the ``<name>.time`` log inside the work directory."""
    return "".join(("../", workdir, "/", name, ".time"))

In [7]:
def get_time_df(name):
    """Load the timing log ``<name>.time`` as a data frame.

    The log is read as a header-less, tab-separated table whose four columns
    are labelled time, memory, exit code and command, in that order.

    Parameters
    ----------
    name : str
        Base name of the log; ``get_filepath`` turns it into a path.

    Returns
    -------
    pandas.DataFrame
        One row per logged run with the columns "Time (sec)" and
        "Memory (KB)".

    Raises
    ------
    ValueError
        If any logged run has a non-zero exit code.
    """
    path = get_filepath(name)
    df = pd.read_csv(path, sep="\t", header=None)
    df.columns = ["Time (sec)", "Memory (KB)", "Exitcode", "Command"]

    # Check row-wise. Comparing `np.unique(...)` to 0 yields an array, whose
    # truth value is ambiguous as soon as the log mixes several exit codes,
    # and an `assert` would be stripped entirely under `python -O`.
    failed = df[df["Exitcode"] != 0]
    if not failed.empty:
        raise ValueError(
            "{}: {} run(s) finished with a non-zero exit code: {}".format(
                path, len(failed), failed["Exitcode"].tolist()
            )
        )

    # NOTE: the command column could be inspected here to see whether
    # input/output was kept in memory etc.; it is currently discarded.
    return df.drop(columns=["Exitcode", "Command"])

def _seconds_to_mmss(seconds):
    """Format a duration in seconds as a zero-padded "mm:ss" string."""
    if pd.isna(seconds):
        return np.nan
    whole = int(abs(seconds))  # fractions of a second are truncated
    sign = "-" if seconds < 0 and whole else ""
    minutes, secs = divmod(whole, 60)
    return "{}{:02d}:{:02d}".format(sign, minutes, secs)


def make_pretty_units(df_orig):
    """Return a copy of a timing table converted to display units.

    Parameters
    ----------
    df_orig : pandas.DataFrame
        Table with the columns "Time (sec)" and "Memory (KB)". It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_orig`` in which "Time (sec)" is replaced by
        "Time (mm:ss)" strings and "Memory (KB)" by "Memory (MB)", i.e. the
        KB value divided by 1000 and rounded to a whole number.
    """
    df = df_orig.copy()

    # The minutes are computed arithmetically instead of via strftime("%M:%S"):
    # "%M" is the minute of the hour, so runs of one hour or longer would wrap
    # around (3700 s would be shown as "01:40" instead of "61:40").
    df["Time (sec)"] = df["Time (sec)"].astype("float64").map(_seconds_to_mmss)

    df["Memory (KB)"] = np.round(df["Memory (KB)"] / 10**3)

    # Rename by label so the units stay attached to the right column
    # regardless of the column order of the input.
    return df.rename(
        columns={"Time (sec)": "Time (mm:ss)", "Memory (KB)": "Memory (MB)"}
    )

In [8]:
# Load the "dream" timing log and label each row with its bin count.
dream_df = get_time_df(name=dream_total)
dream_pretty = make_pretty_units(df_orig=dream_df)
dream_pretty.index = ["DREAM ({} bins)".format(b) for b in bins]
dream_pretty

Unnamed: 0,Time (mm:ss),Memory (MB)
DREAM (8 bins),04:44,9823.0
DREAM (64 bins),01:06,9591.0
DREAM (1024 bins),01:50,9791.0
DREAM (8192 bins),21:10,9870.0
DREAM (16384 bins),29:52,10010.0


### Stellar runs

The individual Stellar runs do not differ significantly, so their average is used.

In [9]:
# Collapse all Stellar runs into a single row holding their mean time and memory.
stellar_runs = get_time_df(stellar_total)
s_avg_t = round(np.mean(stellar_runs["Time (sec)"]), 2)
s_avg_m = int(round(np.mean(stellar_runs["Memory (KB)"])))

# Build the one-row table directly from the averages. Overwriting row 0 and
# dropping the labels [1, 2, 3] only worked for exactly four logged runs.
stellar_df = pd.DataFrame({"Time (sec)": [s_avg_t], "Memory (KB)": [s_avg_m]})

stellar_df

Unnamed: 0,Time (sec),Memory (KB)
0,3519.29,64893948


In [10]:
# Convert the averaged Stellar row to display units and label it.
stellar_pretty = make_pretty_units(df_orig=stellar_df)
stellar_pretty.index = pd.Index(["Stellar"])
stellar_pretty

Unnamed: 0,Time (mm:ss),Memory (MB)
Stellar,58:39,64894.0


### Table with Stellar and DREAM-Stellar total run-time and peak memory

In [11]:
# Stack the Stellar row on top of the DREAM rows and store the table.
summary_parts = (stellar_pretty, dream_pretty)
bin_nr_table = pd.concat(summary_parts)
bin_nr_table.to_csv(bin_nr_out, sep="\t")

### Gather /usr/bin/time run-time results for DREAM-Stellar alignment (GNU parallel)

In [12]:
# Read the `dream_<bins>_parallel.time` log of every bin count.
parallel_files = [
    "../{}/dream_{}_parallel.time".format(workdir, b) for b in bins
]
dfs = [pd.read_csv(path, sep="\t", header=None) for path in parallel_files]

parallel_search = pd.concat(dfs)
parallel_search.columns = ["Time (sec)", "Memory (KB)", "Exitcode", "Command"]
# Only time and memory are kept; the exit code is not checked in this cell.
parallel_search = parallel_search.drop(columns=["Exitcode", "Command"])
parallel_pretty = make_pretty_units(df_orig=parallel_search)
parallel_pretty

Unnamed: 0,Time (mm:ss),Memory (MB)
0,00:03,821.0
0,00:01,103.0
0,00:04,18.0
0,07:21,18.0
0,00:59,18.0


### Table with detailed DREAM-Stellar steps

In [13]:
# Load the "valik_build" timing log, raw and in display units.
build_df = get_time_df(name=build_name)
build_pretty = make_pretty_units(df_orig=build_df)
build_pretty

Unnamed: 0,Time (mm:ss),Memory (MB)
0,04:08,617.0
1,00:42,654.0
2,00:25,549.0
3,00:56,537.0
4,01:10,538.0


In [14]:
# Load the "valik_search" timing log, raw and in display units.
search_df = get_time_df(name=search_name)
search_pretty = make_pretty_units(df_orig=search_df)
search_pretty

Unnamed: 0,Time (mm:ss),Memory (MB)
0,00:29,9823.0
1,00:19,9591.0
2,00:47,9791.0
3,07:14,9870.0
4,13:20,10010.0


In [17]:
# Run-time in seconds of the whole DREAM run and of its measured steps,
# one row per bin count.
data = {
    "#bins": bins,
    "Total DREAM": dream_df["Time (sec)"].tolist(),
    "IBF construct": build_df["Time (sec)"].tolist(),
    "IBF search": search_df["Time (sec)"].tolist(),
    "Distributed Stellar": parallel_search["Time (sec)"].tolist(),
}

dream_table = pd.DataFrame(data)

# Time not accounted for by the three measured steps is reported as
# Snakemake overhead; the steps are subtracted one after another.
overhead = dream_table["Total DREAM"]
for step in ("IBF construct", "IBF search", "Distributed Stellar"):
    overhead = overhead - dream_table[step]
dream_table["Snakemake overhead"] = overhead
dream_table

Unnamed: 0,#bins,Total DREAM,IBF construct,IBF search,Distributed Stellar,Snakemake overhead
0,8,284.07,248.38,29.51,3.68,2.5
1,64,66.4,42.09,19.22,1.71,3.38
2,1024,110.13,25.07,47.93,4.69,32.44
3,8192,1270.96,56.54,434.27,441.35,338.8
4,16384,1792.97,70.91,800.73,59.8,861.53


In [18]:
def _as_mmss(seconds):
    """Format a duration in seconds as a zero-padded "mm:ss" string."""
    if pd.isna(seconds):
        return float("nan")
    whole = int(abs(seconds))  # fractions of a second are truncated
    # The overhead column is a difference and may come out negative.
    sign = "-" if seconds < 0 and whole else ""
    minutes, secs = divmod(whole, 60)
    return "{}{:02d}:{:02d}".format(sign, minutes, secs)


# Convert every timing column (all but "#bins") from seconds to mm:ss.
# The minutes are computed arithmetically instead of via strftime("%M:%S"):
# "%M" is the minute of the hour, so durations of one hour or longer would
# wrap around, and negative durations would be shown as pre-epoch clock times.
for column in dream_table.drop(labels="#bins", axis=1).columns:
    dream_table[column] = dream_table[column].astype("float64").map(_as_mmss)

cols = dream_table.columns
dream_table.columns = [col + " (mm:ss)" for col in cols]
dream_table = dream_table.rename(columns={'#bins (mm:ss)': '#bins'})
dream_table.to_csv(dream_bin_nr_out, sep="\t")
dream_table

Unnamed: 0,#bins,Total DREAM (mm:ss),IBF construct (mm:ss),IBF search (mm:ss),Distributed Stellar (mm:ss),Snakemake overhead (mm:ss)
0,8,04:44,04:08,00:29,00:03,00:02
1,64,01:06,00:42,00:19,00:01,00:03
2,1024,01:50,00:25,00:47,00:04,00:32
3,8192,21:10,00:56,07:14,07:21,05:38
4,16384,29:52,01:10,13:20,00:59,14:21


### Gather detailed memory usage report

In [21]:
# Memory in MB of the whole DREAM run and of its measured steps,
# one row per bin count.
data = {
    "#bins": bins,
    "Peak DREAM": dream_pretty["Memory (MB)"].tolist(),
    "IBF construct": build_pretty["Memory (MB)"].tolist(),
    "IBF search": search_pretty["Memory (MB)"].tolist(),
    "Distributed Stellar": parallel_pretty["Memory (MB)"].tolist(),
}

memory_table = pd.DataFrame(data)
# Append the unit to every column name except "#bins".
memory_table.columns = [
    name if name == "#bins" else name + " (MB)"
    for name in memory_table.columns
]
memory_table.to_csv(detailed_memory_out, sep="\t")
memory_table

Unnamed: 0,#bins,Peak DREAM (MB),IBF construct (MB),IBF search (MB),Distributed Stellar (MB)
0,8,9823.0,617.0,9823.0,821.0
1,64,9591.0,654.0,9591.0,103.0
2,1024,9791.0,549.0,9791.0,18.0
3,8192,9870.0,537.0,9870.0,18.0
4,16384,10010.0,538.0,10010.0,18.0


## Gather pmax benchmarks

In [10]:
import pandas as pd
# Load the tab-separated p_max benchmark table.
pmax_benchmark_path = "../4Gb_pmax_benchmark.tsv"
table = pd.read_csv(pmax_benchmark_path, sep="\t")
table

Unnamed: 0,p_max,Total (sec),Alignment (sec),Precision (Valik),Recall
0,0.15,429.23,362.46,0.8,0.87
1,0.25,469.57,401.2,0.77,0.91
2,0.5,504.6,436.14,0.71,0.95
3,0.75,501.56,433.98,0.71,0.95
4,1.0,519.98,451.03,0.71,0.95


In [11]:
def _pmax_mmss(seconds):
    """Format a duration in seconds as a zero-padded "mm:ss" string."""
    if pd.isna(seconds):
        return float("nan")
    whole = int(abs(seconds))  # fractions of a second are truncated
    sign = "-" if seconds < 0 and whole else ""
    minutes, secs = divmod(whole, 60)
    return "{}{:02d}:{:02d}".format(sign, minutes, secs)


# Convert the two run-time columns from seconds to mm:ss.
# The minutes are computed arithmetically instead of via strftime("%M:%S"):
# "%M" is the minute of the hour, so durations of one hour or longer would
# wrap around.
# NOTE(review): the column names keep their "(sec)" suffix although the values
# are mm:ss strings afterwards; rename them if no consumer relies on the header.
for column in ["Total (sec)", "Alignment (sec)"]:
    table[column] = table[column].astype("float64").map(_pmax_mmss)

table

table.to_csv("../4Gb_pmax_benchmark_pretty.tsv", sep="\t")