# signac-benchmarks

This notebook aggregates data from benchmark runs, which are currently implemented for signac and datreant.core.

The reporting tool uses a signac Collection or a database as input.

In [None]:
%load_ext autoreload
%autoreload 1
import sys
from pprint import pprint
from math import log, sqrt

from signac import Collection, get_database
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

%aimport complexity

## Fetch the data

Specify the data source and selection criteria.

 * `INPUT_SRC`: Where to fetch data from (either `db` or `file`).
 * `FN_COLLECTION`: Name of the local collection file.
 * `DB`: The name of the database that contains benchmark results.
 * `COLLECTION`: The name of the collection within the database that contains the benchmark results.
 * `QUERY`: A selection query for what data to include in the benchmark; should be considered across all measurements.

In [None]:
# Where to fetch the benchmark results from: a database ('db') or a local
# collection file ('file').
INPUT_SRC = 'db'  # (db|file)
FN_COLLECTION = 'benchmark.txt'
DB = 'testing'
COLLECTION = 'signac_benchmarks'

# Selection of the benchmark documents that enter the report.
QUERY = {
    'meta.N': {'$lte': 10000},   # increase or remove to include larger N
    # The dot is escaped so that only a literal "3.6" matches; an unescaped
    # dot would also accept strings such as "306".
    'meta.versions.python': {'$regex': r'3\.6'},
    # Pin the version of each tool: a document passes when it either belongs
    # to the other tool or carries exactly the requested version.
    '$and': [
        {'$or': [{'meta.tool': {'$ne': 'signac'}}, {'meta.versions.signac': '0.9.1'}]},
        {'$or': [{'meta.tool': {'$ne': 'datreant'}}, {'meta.versions.datreant': '0.7.1'}]},
    ],
    'meta.cached': False,   # Specify, whether to include results with pre-caching.
    'meta.profile': None,   # Make sure to exclude profiled runs.
    'meta.fstype': 'nfs',   # nfs tmpfs hfs
}

Then, load the data either from a database or a local collection file.

In [None]:
# Resolve the configured source into a list of documents matching QUERY.
# Any source other than 'db' or 'file' is rejected.
if INPUT_SRC == 'file':
    with Collection.open(FN_COLLECTION) as c:
        docs = list(c.find(QUERY))
elif INPUT_SRC == 'db':
    db = get_database(DB)
    docs = list(db[COLLECTION].find(QUERY))
else:
    raise ValueError(INPUT_SRC)

print("# of docs: ", len(docs))

The `complexity.COMPLEXITY` constant contains hard-coded (expected) complexities for the individual measured operations.

In [None]:
# Show the complexity table that is used below to label and normalize the measurements.
pprint(complexity.COMPLEXITY)

Specification of a few utility functions.

In [None]:
# Display labels for operation names, the legend title and a few metadata
# values. Built once instead of on every call to ``tr``.
_TRANSLATIONS = {
    'select_by_id': "Select by ID",
    'determine_len': "Determine N",
    'iterate': "Iterate",
    'iterate_single_pass': "Iterate (single pass)",
    'search_lean_filter': "Search w/ lean filter",
    'search_rich_filter': "Search w/ rich filter",
    # Raw string: "\l" is not a valid escape sequence in a regular literal.
    'tool,N': r"Tool, $\log_{10}(N)$",
    '3.6.3.final.0': "3.6",
    'nfs': "NFS",
}


def tr(s):
    """Translate an internal identifier into a human-readable label.

    Identifiers without a known translation are returned unchanged. When
    ``complexity.COMPLEXITY`` holds an entry for ``s``, it is appended to the
    label in the form `` — O(<complexity>)``.

    :param s: The identifier to translate.
    :returns: The display label for ``s``.
    """
    label = _TRANSLATIONS.get(s, s)
    cplx = complexity.COMPLEXITY.get(s)
    if cplx is not None:
        label += ' \u2014 O({})'.format(cplx)
    return label


def tr_legend(legend):
    """Relabel a legend in place with human-readable text.

    The title is passed through ``tr``. Every entry is stripped of its first
    and last character and split at the comma into a tool name and ``N``; it
    is then rewritten as "<translated tool>, <rounded log10 of N>".

    :param legend: The legend object whose title and entries are rewritten.
    """
    heading = legend.get_title()
    heading.set_text(tr(heading.get_text()))
    for entry in legend.get_texts():
        raw = entry.get_text()
        tool, N = raw[1:-1].split(',')
        tool_label = tr(tool)
        exponent = round(log(int(N), 10))
        entry.set_text("{}, {}".format(tool_label, exponent))

        
def get_versions(df, tool):
    """Collect the distinct versions of ``tool`` found in ``df``.

    Only rows whose 'tool' column equals ``tool`` are considered; for each of
    them the entry stored under ``tool`` in the 'versions' mapping is taken.

    :param df: Data frame with a 'tool' and a 'versions' column.
    :param tool: Name of the tool to look up.
    :returns: A set of the versions found (empty if there are no such rows).
    """
    rows_for_tool = df[df['tool'] == tool]
    return {versions[tool] for versions in rows_for_tool['versions']}


def ensure_unique(iterable):
    """Return the single distinct value contained in ``iterable``.

    :param iterable: Any iterable of hashable values.
    :returns: The one distinct value, or None if ``iterable`` is empty.
    :raises ValueError: If more than one distinct value is present.
    """
    distinct = set(iterable)
    if not distinct:
        return None
    if len(distinct) > 1:
        raise ValueError("The following set is not unique: '{}'.".format(distinct))
    (value,) = distinct
    return value


def get_meta(df):
    """Summarize the metadata shared by all benchmark results in ``df``.

    Every reported value must be unique across the relevant rows; otherwise
    ``ensure_unique`` raises a ValueError. The signac version gets a 'C'
    appended when the signac rows are flagged as cached.

    :param df: Data frame with 'tool', 'versions', 'cached' and 'fstype' columns.
    :returns: A dict with the keys 'python', 'signac', 'datreant' and 'fstype'.
    """
    signac_version = ensure_unique(get_versions(df, 'signac'))
    signac_rows = df[df['tool'] == 'signac']
    if ensure_unique(signac_rows['cached']):
        signac_version += 'C'

    meta = {}
    meta['python'] = ensure_unique(versions['python'] for versions in df['versions'])
    meta['signac'] = signac_version
    meta['datreant'] = ensure_unique(get_versions(df, 'datreant'))
    meta['fstype'] = ensure_unique(df['fstype'])
    return meta

def fmt_meta(df):
    """Format the Python version and file system type of ``df`` as a short label.

    :param df: Data frame accepted by ``get_meta``.
    :returns: A string of the form "Python <version> <fstype>", with both
        values passed through ``tr``.
    """
    # The metadata has to be derived from the argument; there is no
    # module-level ``meta`` to fall back on.
    meta = get_meta(df)
    return "Python {} {}".format(tr(meta['python']), tr(meta['fstype']))

def fn_meta(df):
    """Build an identifier for the data set in ``df`` from its metadata.

    The identifier lists each metadata key followed by its value, all joined
    by underscores, in the order python, fstype, signac, datreant.

    :param df: Data frame accepted by ``get_meta``.
    :returns: The identifier string.
    """
    meta = get_meta(df)
    parts = []
    for key in ('python', 'fstype', 'signac', 'datreant'):
        parts.append('{}_{}'.format(key, meta[key]))
    return '_'.join(parts)

## Normalization

We average all runs within one session, select the shortest run, divide by the complexity order and finally convert the measured values to microseconds.

In [None]:
def normalize(data, N):
    """Yield ``(category, value)`` pairs of complexity-normalized measurements.

    For every category in ``data``, the smallest ``y / n`` ratio among its
    ``(n, y)`` entries is taken. If ``complexity.COMPLEXITY`` holds an
    expression for the category, the ratio is divided by the evaluated
    expression. The result is multiplied by 1e6.

    :param data: Mapping of category name to an iterable of ``(n, y)`` pairs.
        Presumably ``n`` is a repetition count and ``y`` a total time in
        seconds -- TODO confirm against the benchmark script.
    :param N: Value visible to the complexity expression under the name ``N``
        (the caller passes ``doc['meta']['N']``).
    """
    for cat, x in data.items():
        # Despite the name, this is the minimum of the per-entry ratios.
        x_mean = min([(y/n) for n, y in x])
        cplx = complexity.COMPLEXITY.get(cat)
        if cplx is not None:
            # NOTE(review): eval() executes the expression string with this
            # function's local names (such as N) and the notebook globals in
            # scope. Keep the local names stable and make sure the expressions
            # only ever come from a trusted source.
            x_mean /= eval(cplx)
        yield cat, x_mean * 1e6  # microseconds

        
# Key the metadata, total size and normalized measurements of every document
# by the document id.
meta_by_id = {doc['_id']: doc['meta'] for doc in docs}
size_by_id = {doc['_id']: {'size': doc['size']['total']} for doc in docs}
data_by_id = {doc['_id']: dict(normalize(doc['data'], doc['meta']['N'])) for doc in docs}

# Transpose so that each document becomes one row.
df_meta = pd.DataFrame(meta_by_id).T
df_size = pd.DataFrame(size_by_id).T
df_data = pd.DataFrame(data_by_id).T

# Join the three frames side by side on the shared document id index.
df = pd.concat([df_meta, df_data, df_size], axis=1)

# Printing the identifier doubles as a consistency check: building it raises
# if the selected documents disagree on any of the reported metadata values.
print(fn_meta(df))

These are the benchmark values in microseconds, normalized by complexity:

In [None]:
# Mean per (tool, N) group, rounded to two decimals and transposed so that
# every group becomes a column.
df.rename(columns=tr).groupby(['tool', 'N']).mean().round(2).T

Next, we prepare for plotting by defining color schemes based on the tool and the data space size ($N$).
We use blue for signac and green for datreant, roughly corresponding to the color scheme on the respective websites.

In [None]:
# One sequential colormap per tool.
CMAP_TOOLS = dict(
    signac=plt.get_cmap('Blues'),
    datreant=plt.get_cmap('Greens'),
)

# Largest data space size in the selection; the color intensity is scaled
# relative to it.
N_max = df['N'].max()

def color(x, a_min=0.1, a_max=1.0):
    """Return the color for one ``(tool, N)`` group.

    The tool selects the colormap from ``CMAP_TOOLS``; the position within
    the colormap grows with ``log(N) / log(N_max)``, mapped linearly onto the
    interval ``[a_min, a_max]``.

    :param x: Group data frame with 'tool' and 'N' columns; the first row
        determines the group's tool and N.
    :param a_min: Colormap position used for the smallest ratio (0).
    :param a_max: Colormap position used for the largest ratio (1).
    :returns: The value returned by the tool's colormap for that position.
    """
    # Positional access: the frame is indexed by document id, so a plain
    # ``[0]`` would be a label lookup (or a deprecated positional fallback).
    tool = x['tool'].iloc[0]
    N = x['N'].iloc[0]
    # log(N_max) is zero when the largest data space has size 1; use the top
    # of the range in that case instead of dividing by zero.
    span = log(N_max)
    y = log(N) / span if span else 1.0
    y = a_min + y * (a_max - a_min)
    return CMAP_TOOLS[tool](y)

# One color per (tool, N) group, in the order in which groupby yields the groups.
colors = df.groupby(['tool', 'N']).apply(color).tolist()

The following section plots the output data. The interactive offset values help to generate a "publication-ready" figure with a slightly customized (hard-coded) legend.
The hard-coded legend only works for a specific subset.

In [None]:
import matplotlib.patches as patches
from ipywidgets import interact

# (min, max, step) range shared by all interactive offset sliders.
offset = (0, 1.0, 0.01)

# The only data layout the hard-coded legend is laid out for.
_HCL_TOOLS = ['datreant', 'signac']
_HCL_NS = [100, 1000, 10000, 100, 1000, 10000]
_HCL_LOG_N = [2, 3, 4, 2, 3, 4]


def _draw_hardcoded_legend(fig, meta, x_offset, y_offset, x_offset_i, y_offset_i):
    """Draw the hand-laid-out legend into its own inset axes of ``fig``."""
    def x(x_):
        return x_ + x_offset_i

    def y(y_):
        return y_ + y_offset_i

    a = fig.add_axes([.6 + x_offset, .2 + y_offset, .25, .6], facecolor='w')
    a.set_xticks([])
    a.set_yticks([])
    for spine in a.spines.values():
        spine.set_color('gray')

    # Column headers and the rotated per-tool labels.
    a.text(s='Tool', x=x(0.25), y=y(0.85), rotation=0, horizontalalignment='center')
    a.text(s='log(N)', x=x(0.67), y=y(0.85), rotation=0, horizontalalignment='center')
    a.text(s='datreant\n{}'.format(meta['datreant']), x=x(0.26), y=y(0.26), rotation=90,
           verticalalignment='center', horizontalalignment='center')
    a.text(s='signac\n{}'.format(meta['signac']), x=x(0.26), y=y(0.65), rotation=90,
           verticalalignment='center', horizontalalignment='center')

    # One colored swatch per (tool, N) group. The last three swatches are
    # shifted up by one row, leaving a gap between the two tools.
    for i, exponent in enumerate(_HCL_LOG_N):
        row = i + 2 if i >= 3 else i + 1
        a.text(s='{}'.format(exponent), x=x(0.68), y=y(0.1*row+0.025), horizontalalignment='center', color='w')
        a.add_patch(patches.Rectangle((x(0.48), y(0.1*row)), 0.4, 0.1, color=colors[i]))


@interact(x_offset=offset, y_offset=offset, x_offset_i=offset, y_offset_i=offset, y_lim_e=(1, 10, 1))
def plot(x_offset=0.02, y_offset=0, x_offset_i=0, y_offset_i=0, y_lim_e=9, hardcoded_legend=False, save=False):
    """Plot the normalized benchmark results as a horizontal, logarithmic bar chart.

    :param x_offset: Horizontal shift of the hard-coded legend box.
    :param y_offset: Vertical shift of the hard-coded legend box.
    :param x_offset_i: Horizontal shift of the content inside the legend box.
    :param y_offset_i: Vertical shift of the content inside the legend box.
    :param y_lim_e: Exponent of the upper limit of the time axis (10**y_lim_e).
    :param hardcoded_legend: Draw the hand-laid-out legend instead of the
        automatic one. Falls back to the automatic legend, with a message on
        stderr, when the data set does not have the layout it was made for.
    :param save: Write the figure to a PDF file named after ``fn_meta(df)``;
        otherwise only report the file name that would be used.
    """
    fig, ax = plt.subplots(figsize=(4, 3), dpi=150)

    meta = get_meta(df)
    tools = df.sort_values('tool').groupby('tool').head(1)['tool'].tolist()
    Ns = df.sort_values(['tool', 'N']).groupby(['tool', 'N']).head(1)['N'].tolist()

    if hardcoded_legend and not (tools == _HCL_TOOLS and Ns == _HCL_NS):
        print("Can't use the hard-coded legend for this data set!", file=sys.stderr)
        hardcoded_legend = False

    group_means = df.rename(columns=tr).drop(columns=['size']).groupby(by=['tool', 'N']).mean()
    group_means.T.plot(kind='barh', ax=ax, color=colors, log=True, legend=not hardcoded_legend, fontsize=8)

    ax.set_xlim(1, 10**y_lim_e)
    ax.set_xlabel('Time / Complexity [\u00B5s]')

    if hardcoded_legend:
        _draw_hardcoded_legend(fig, meta, x_offset, y_offset, x_offset_i, y_offset_i)
    else:
        tr_legend(ax.get_legend())

    fn = fn_meta(df) + '.pdf'
    if save:
        plt.savefig(fn, transparent=True, bbox_inches='tight')
        print("Saved to '{}'.".format(fn))
    else:
        print("Would save to '{}'.".format(fn))
    plt.show()