# Gather run-time and memory

In [1]:
# Name of the benchmark directory; it is part of every input and output path.
workdir = "4Gb"

#--------- INPUT ---------
er = 0.05  # presumably the error rate of the benchmark - TODO confirm
bins = [64]
modes = [
    "DREAM (64 bins storage)",
    "DREAM (64 bins memory)",
]

#--------- OUTPUT ---------
# Both result tables are written next to the benchmark directory.
totals_out = "../{}_split_total.tsv".format(workdir)
storage_vs_memory = "../{}_storage_vs_memory_io.tsv".format(workdir)

import os
import pandas as pd
import numpy as np

### /usr/bin/time run-time results (Snakemake)

In [2]:
# Base names of the "<name>.time" files inside the benchmark directory.
# Whole workflow:
dream_total = "dream"
# Individual steps:
segment_name = "create_seg_files"
build_name = "valik_build"
search_name = "valik_search"
parallel_name = "dream_parallel"

def get_filepath(name):
    """Return the relative path "../<workdir>/<name>.time".

    `workdir` is taken from the notebook-level configuration; `name` is the
    base name of the timing file without its extension.
    """
    parts = ["../", workdir, "/", name, ".time"]
    return "".join(parts)

def get_time_df(path):
    """Load a tab-separated timing file and keep its time and memory columns.

    The file is read without a header row; its four columns are named
    "Time (sec)", "Memory (KB)", "Exitcode" and "Command".

    Parameters
    ----------
    path : str
        Location of the timing file.

    Returns
    -------
    pandas.DataFrame
        One row per recorded run with the columns "Time (sec)" and
        "Memory (KB)".

    Raises
    ------
    ValueError
        If any recorded run finished with a non-zero exit code.
    """
    df = pd.read_csv(path, sep="\t", header=None)
    df.columns = ["Time (sec)", "Memory (KB)", "Exitcode", "Command"]

    # Explicit check instead of `assert`: it is not stripped under `python -O`
    # and gives a readable error even when several distinct exit codes occur.
    failed = df["Exitcode"] != 0
    if failed.any():
        raise ValueError(
            "{} contains {} run(s) with a non-zero exit code".format(path, int(failed.sum()))
        )

    # The command column tells whether input/output was kept in memory etc.;
    # it is not needed for the tables, so it is dropped with the exit code.
    return df.drop(labels=["Exitcode", "Command"], axis=1)

def _format_mm_ss(seconds):
    """Format a duration in seconds as "mm:ss", dropping fractions of a second."""
    if pd.isna(seconds):
        # Missing measurements stay missing instead of raising.
        return seconds
    minutes, secs = divmod(int(seconds), 60)
    return "{:02d}:{:02d}".format(minutes, secs)


def make_pretty_units(df_orig):
    """Return a copy of a timing table with human-readable units.

    Parameters
    ----------
    df_orig : pandas.DataFrame
        Table with exactly the columns "Time (sec)" and "Memory (KB)", in
        that order. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Table with the columns "Time (mm:ss)" (strings) and "Memory (MB)"
        (KB divided by 1000, rounded to whole numbers).
    """
    df = df_orig.copy()
    # Minutes are computed by integer division rather than "%M:%S" date
    # formatting, which wraps around at 60 minutes (3700 s would read "01:40").
    df["Time (sec)"] = df["Time (sec)"].astype('float64').map(_format_mm_ss)

    df["Memory (KB)"] = np.round(df["Memory (KB)"] / 10**3)
    # Only the labels change here; the values were already converted above.
    df.columns = ["Time (mm:ss)", "Memory (MB)"]
    return df

In [3]:
# Run time and peak memory of the complete DREAM workflow, one row per mode.
dream_time_file = get_filepath(dream_total)
dream_df = get_time_df(dream_time_file)
dream_pretty = make_pretty_units(dream_df)
dream_pretty.index = pd.Index(modes)
dream_pretty

Unnamed: 0,Time (mm:ss),Memory (MB)
DREAM (64 bins storage),07:44,19921.0
DREAM (64 bins memory),07:14,19926.0


### Stellar runs

The repeated Stellar runs show no significant difference, so their average is used.

In [4]:
stellar_total = "../../metagenome/" + workdir + "/stellar.time"
stellar_runs = get_time_df(stellar_total)

# Average over all recorded Stellar runs.
s_avg_t = round(np.mean(stellar_runs["Time (sec)"]), 2)
s_avg_m = round(np.mean(stellar_runs["Memory (KB)"]))

# Build the one-row table directly from the averages instead of overwriting
# row 0 and dropping rows 1-3, which only works for exactly four runs.
stellar_df = pd.DataFrame({"Time (sec)": [s_avg_t], "Memory (KB)": [s_avg_m]})

stellar_df

Unnamed: 0,Time (sec),Memory (KB)
0,3519.29,64893948


In [5]:
# Averaged Stellar run in human-readable units, labelled for the summary table.
stellar_label = "Stellar"
stellar_pretty = make_pretty_units(stellar_df)
stellar_pretty.index = pd.Index([stellar_label])
stellar_pretty

Unnamed: 0,Time (mm:ss),Memory (MB)
Stellar,58:39,64894.0


### Table with Stellar and DREAM-Stellar total run-time and peak memory

In [6]:
# Stack the Stellar row on top of the DREAM rows and write the table as TSV.
summary_parts = (stellar_pretty, dream_pretty)
bin_nr_table = pd.concat(summary_parts, axis=0)
bin_nr_table.to_csv(totals_out, sep="\t")

### Gather /usr/bin/time run-time results for DREAM-Stellar alignment (GNU parallel)

In [7]:
# Run time and peak memory of the distributed Stellar step, one row per mode.
parallel_time_file = get_filepath(parallel_name)
parallel_search = get_time_df(parallel_time_file)
parallel_pretty = make_pretty_units(parallel_search)
parallel_pretty.index = pd.Index(modes)
parallel_pretty

Unnamed: 0,Time (mm:ss),Memory (MB)
DREAM (64 bins storage),03:45,682.0
DREAM (64 bins memory),03:42,681.0


### Gather segment creation from Snakemake benchmark

In [8]:
segment_df = pd.read_csv(get_filepath(segment_name), sep='\t')

# Drop the benchmark columns that are not reported before averaging, so no
# work is spent on values that are discarded anyway.
unused_columns = ["h:m:s", "max_vms", "max_uss", "max_pss", "io_in", "io_out", "mean_load", "cpu_time"]
segment_df = segment_df.drop(labels=unused_columns, axis=1)

# The benchmark was repeated (3 times in the original setup). Averaging over
# all rows gives a single row with index 0 for any number of repeats, instead
# of overwriting row 0 and dropping rows 1 and 2 by hand.
segment_df = segment_df.mean().round(2).to_frame().T

# Snakemake benchmark output in MB
# /usr/bin/time output in KB
segment_df.columns = ["Time (sec)", "Memory (MB)"]

segment_df

Unnamed: 0,Time (sec),Memory (MB)
0,31.28,7168.54


In [9]:
segment_pretty = segment_df.copy()
# Minutes are computed by integer division rather than "%M:%S" date formatting,
# which wraps around at 60 minutes. Missing values are passed through as-is.
segment_pretty["Time (sec)"] = segment_pretty["Time (sec)"].astype('float64').map(
    lambda sec: sec if pd.isna(sec) else "{:02d}:{:02d}".format(*divmod(int(sec), 60))
)

# Memory is already in MB here, so only the time column is relabelled.
segment_pretty.columns = ["Time (mm:ss)", "Memory (MB)"]
segment_pretty

Unnamed: 0,Time (mm:ss),Memory (MB)
0,00:31,7168.54


### Table with detailed DREAM-Stellar steps

This table shows the speed-up gained by reading intermediate files from memory instead of storage.

In [10]:
# Run time and peak memory of the build step, one row per mode.
build_time_file = get_filepath(build_name)
build_df = get_time_df(build_time_file)
build_pretty = make_pretty_units(build_df)
build_pretty.index = pd.Index(modes)
build_pretty

Unnamed: 0,Time (mm:ss),Memory (MB)
DREAM (64 bins storage),01:05,2868.0
DREAM (64 bins memory),00:50,2868.0


In [11]:
# Run time and peak memory of the search step, one row per mode.
search_time_file = get_filepath(search_name)
search_df = get_time_df(search_time_file)
search_pretty = make_pretty_units(search_df)
search_pretty.index = pd.Index(modes)
search_pretty

Unnamed: 0,Time (mm:ss),Memory (MB)
DREAM (64 bins storage),01:00,19921.0
DREAM (64 bins memory),00:48,19926.0


In [12]:
# Per-step run times in seconds, one row per mode.
data = {"Mode": modes,
        "Total DREAM": list(dream_df["Time (sec)"]),
        # segment_df holds a single row, so its value is repeated once per
        # mode (previously hard-coded as `* 2`).
        "Segment creation": list(segment_df["Time (sec)"]) * len(modes),
        "IBF construct": list(build_df["Time (sec)"]),
        "IBF search": list(search_df["Time (sec)"]),
        "Distributed Stellar": list(parallel_search["Time (sec)"])}

dream_table = pd.DataFrame(data)

# Whatever part of the total is not covered by a measured step is reported as
# Snakemake overhead. Steps are subtracted one after another, in column order.
step_columns = ["Segment creation", "IBF construct", "IBF search", "Distributed Stellar"]
overhead = dream_table["Total DREAM"]
for step in step_columns:
    overhead = overhead - dream_table[step]
dream_table["Snakemake overhead"] = overhead
dream_table

Unnamed: 0,Mode,Total DREAM,Segment creation,IBF construct,IBF search,Distributed Stellar,Snakemake overhead
0,DREAM (64 bins storage),464.23,31.28,65.46,60.3,225.41,81.78
1,DREAM (64 bins memory),434.6,31.28,50.23,48.45,222.95,81.69


In [13]:
# Every column except "Mode" holds a duration in seconds.
time_columns = [col for col in dream_table.columns if col != "Mode"]
for column in time_columns:
    # Minutes are computed by integer division rather than "%M:%S" date
    # formatting, which wraps around at 60 minutes. Missing values are
    # passed through as-is.
    dream_table[column] = dream_table[column].astype('float64').map(
        lambda sec: sec if pd.isna(sec) else "{:02d}:{:02d}".format(*divmod(int(sec), 60))
    )

# Only the duration columns get the unit suffix; "Mode" keeps its name.
dream_table = dream_table.rename(columns={col: col + " (mm:ss)" for col in time_columns})
dream_table.to_csv(storage_vs_memory, sep="\t")
dream_table

Unnamed: 0,Mode,Total DREAM (mm:ss),Segment creation (mm:ss),IBF construct (mm:ss),IBF search (mm:ss),Distributed Stellar (mm:ss),Snakemake overhead (mm:ss)
0,DREAM (64 bins storage),07:44,00:31,01:05,01:00,03:45,01:21
1,DREAM (64 bins memory),07:14,00:31,00:50,00:48,03:42,01:21
