# Benchmarks
Speed and memory comparisons.

In [None]:
import conf  # setup local path
import resource
import time
from concurrent import futures
from functools import partial
import numpy as np
from spector import indices, matrix, vector

def memory(unit=1e6):
    """Return this process's peak resident set size, in megabytes by default.

    ``ru_maxrss`` is reported in kilobytes on Linux and the BSDs but in bytes
    on macOS, so the raw figure is normalised to bytes before it is scaled.

    :param unit: number of bytes per reported unit; the default ``1e6``
        yields megabytes.
    :return: peak resident set size divided by ``unit``.
    """
    import sys  # local import: only needed for the platform check below

    peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    if sys.platform != "darwin":
        peak *= 1024  # kilobytes -> bytes
    return peak / unit

def diff(metric, func, *args):
    """Report how much ``metric()`` changes across one call of ``func(*args)``."""
    before = metric()
    # Hold on to the result so it is still alive when the metric is re-read.
    kept = func(*args)  # noqa
    after = metric()
    return after - before

# Kept only so that any other code referring to `executor` keeps working;
# `sized` no longer submits to this shared pool.
executor = futures.ProcessPoolExecutor()
def sized(func, *args):
    """Measure the memory growth of ``func(*args)`` in a separate process.

    The measurement is ``diff(memory, func, *args)``, and ``memory`` reads
    ``ru_maxrss``, which is a per-process high-water mark.  A worker that has
    already run a larger allocation would therefore report little or no growth
    for a later call, so every measurement gets a fresh single-worker pool
    that is shut down as soon as the result is back.
    """
    with futures.ProcessPoolExecutor(max_workers=1) as pool:
        return pool.submit(diff, memory, func, *args).result()

# Elapsed time of a single call.  `perf_counter` is the monotonic,
# highest-resolution clock intended for benchmarking; `time.time` is a wall
# clock that can be adjusted while a measurement is running.
timed = partial(diff, time.perf_counter)

def dok(size):
    """Build a dict keyed by every ``(i, j)`` pair in ``size x size``, all set to 1.0."""
    table = {}
    for row in range(size):
        for col in range(size):
            table[row, col] = 1.0
    return table

def vecs(size):
    """Build a ``matrix`` with ``size`` rows, each a ``vector`` over keys ``0..size-1``.

    Every row gets its own ``vector`` built from the same key array; the
    values are whatever ``vector`` defaults to when none are given.
    """
    column_keys = np.array(range(size))
    rows = ((row, vector(column_keys)) for row in range(size))
    return matrix(rows)

# Shared benchmark inputs: 2 ** 18 consecutive integer keys and a matching
# array of ones to use as values.
keys = np.array(range(1 << 18))
values = np.ones(keys.shape)

## `indices` vs. `set`

In [None]:
# Memory growth when building from `keys`, expressed as indices / set.
indices_growth = sized(indices, keys)
set_growth = sized(set, keys)
'memory', indices_growth / set_growth

In [None]:
# Construction time from a numpy array, expressed as indices / set.
indices_build = timed(indices, keys)
set_build = timed(set, keys)
'from array', indices_build / set_build

In [None]:
# Time to export back to a numpy array, expressed as indices / set.
# The baseline converts an actual `set` (not the source array), matching the
# comparison this section is about; both containers are built before the
# clock starts because arguments are evaluated ahead of the timed call.
'to array', timed(np.array, indices(keys)) / timed(np.fromiter, set(keys), keys.dtype, len(keys))

In [None]:
# Set difference on freshly built operands, expressed as indices / set.
# Only the `__sub__` call is timed; the operands are constructed beforehand.
indices_sub = timed(indices(keys).__sub__, indices(keys))
set_sub = timed(set(keys).__sub__, set(keys))
'set op', indices_sub / set_sub

## `vector` vs. `dict`

In [None]:
# Memory growth when building from `keys` and `values`, expressed as vector / dict.
vector_growth = sized(vector, keys, values)
dict_growth = sized(dict, zip(keys, values))
'memory', vector_growth / dict_growth

In [None]:
# Construction time from the two arrays, expressed as vector / dict.
vector_build = timed(vector, keys, values)
dict_build = timed(dict, zip(keys, values))
'from arrays', vector_build / dict_build

In [None]:
# `vec` and `d` hold the same key/value pairs and are reused by the cells below.
vec = vector(keys, values)
d = dict(zip(keys, values))
# Time to extract the keys as an array, expressed as vector / dict.
vec_keys_time = timed(vec.keys)
dict_keys_time = timed(np.fromiter, d.keys(), keys.dtype, len(d))
'keys', vec_keys_time / dict_keys_time

In [None]:
# Time to extract the values as an array, expressed as vector / dict.
vec_values_time = timed(vec.values)
dict_values_time = timed(np.fromiter, d.values(), values.dtype, len(d))
'values', vec_values_time / dict_values_time

In [None]:
# Time to sum all values, expressed as vector / dict.
vec_sum_time = timed(np.sum, vec)
dict_sum_time = timed(sum, d.values())
'sum', vec_sum_time / dict_sum_time

In [None]:
# Time for a self dot product, expressed as vector / dict.
vec_dot_time = timed(vec.dot, vec)
# The generator is lazy, so the multiplications run inside the timed `sum`.
squares = (d[k] * d[k] for k in d)
dict_dot_time = timed(sum, squares)
'dot', vec_dot_time / dict_dot_time

## `matrix` vs. `dict`

In [None]:
# Side length of a square layout holding about len(keys) entries in total.
size = int(len(keys) ** 0.5)
# Memory growth for a size x size structure, expressed as matrix / dict-of-keys.
matrix_growth = sized(vecs, size)
dok_growth = sized(dok, size)
'memory', matrix_growth / dok_growth

## groupby
Matrices rely on an optimized `groupby` implementation, which is much faster than [pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html).

In [None]:
import math
import random
import pandas as pd
from spector.vector import arggroupby
from spector.matrix import groupby as gb

def measure(funcs, size, base=10):
    """Time each grouping function over random keys at several group counts.

    ``size`` random integers in ``[0, size]`` are drawn once.  For every power
    of ``base`` up to roughly ``size`` the data is reduced modulo that power,
    and each function in ``funcs`` is timed on the reduced keys.

    :param funcs: callables taking the key array; columns are named after
        their ``__name__``.
    :param size: number of random samples (and upper bound of their values).
    :param base: base of the powers used as moduli.
    :return: a ``DataFrame`` indexed by modulus with one column per function,
        holding the value returned by ``timed``.
    """
    top_exponent = round(math.log(size, base))
    moduli = [base ** exponent for exponent in range(top_exponent + 1)]
    samples = np.array([random.randint(0, size) for _ in range(size)])
    records = []
    for modulus in moduli:
        reduced = samples % modulus
        timings = {}
        for func in funcs:
            # `func(reduced)` is evaluated first; only consuming its result
            # with `list` happens inside the timed call.
            timings[func.__name__] = timed(list, func(reduced))
        records.append(timings)
    return pd.DataFrame(records, index=moduli)

def series(data):
    """Group the positions ``0..len(data)-1`` by ``data`` using a pandas Series.

    The positions become the Series values and ``data`` its index; grouping is
    on that index with sorting disabled.
    """
    positions = np.arange(len(data))
    indexed = pd.Series(positions, index=data)
    return indexed.groupby(level=0, sort=False)

def groupby(keys):
    """Group the positions ``0..len(keys)-1`` by ``keys`` via ``spector.matrix.groupby``."""
    positions = np.arange(len(keys))
    return gb(keys, positions)

# Time the three grouping implementations on 10 ** 5 random keys,
# with group moduli at each power of ten.
df = measure(funcs=[groupby, arggroupby, series], size=10 ** 5, base=10)
df

In [None]:
# Express every timing relative to the fastest function in its row, so the
# best entry of each row becomes 1.0.  `div(..., axis=0)` aligns the per-row
# minimum on the index, replacing the row-by-row `.loc` loop.
df = df.div(df.min(axis=1), axis=0)
df