# Benchmark

```ipython
%load_ext watermark
```

In [1]:
import pandas as pd
from IPython.display import HTML, display

from bumbag import core

```ipython
%watermark -v -m --iversions
```
```text
Python implementation: CPython
Python version       : 3.8.13
IPython version      : 8.4.0

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.18.5-100.fc35.x86_64
Machine     : x86_64
Processor   : x86_64
CPU cores   : 3
Architecture: 64bit

pandas    : 1.4.2
matplotlib: 3.5.2
numpy     : 1.22.4
```

## `core` module

### `freq` versus `value_counts`

In [2]:
from benchmark_freq_versus_value_counts import freq_pd, enhanced_value_counts

In [3]:
print(core.get_source_code(freq_pd))

def freq_pd(values):
    """Run ``core.freq`` on ``values`` and hand the result back as a DataFrame."""
    frequencies = core.freq(values)
    return pd.DataFrame(frequencies)



In [4]:
print(core.get_source_code(enhanced_value_counts))

def enhanced_value_counts(values):
    """Equivalent Pandas implementation of freq.

    Parameters
    ----------
    values : iterable
        Observations to tabulate; passed straight to ``pd.Series``.

    Returns
    -------
    pd.DataFrame
        One row per distinct value (missing values included, since
        ``dropna=False``), ordered by descending count, with columns:

        - ``n``: absolute count
        - ``N``: cumulative count
        - ``r``: relative frequency (``n`` divided by the total count)
        - ``R``: cumulative relative frequency
    """
    counts = pd.Series(values).value_counts(
        sort=True,
        ascending=False,
        bins=None,
        dropna=False,
    )

    # Name the column via to_frame instead of pd.DataFrame(s, columns=["n"]):
    # the constructor selects columns by the Series' name, and pandas >= 2.0
    # names the value_counts result "count", which would leave "n" all-NaN.
    df = counts.to_frame(name="n")
    df["N"] = df["n"].cumsum()
    df["r"] = df["n"] / df["n"].sum()
    df["R"] = df["r"].cumsum()

    return df



In [5]:
# Small sample with repeated labels ("a" x3, "g" x2) so the counts differ.
values = ["a", "c", "b", "g", "h", "a", "g", "a"]

In [6]:
# Build the table with both implementations from the same input and check
# that the resulting frames are equal before comparing their timings.
bb_output = freq_pd(values)
pd_output = enhanced_value_counts(values)
bb_output.equals(pd_output)

True

In [7]:
# Render the bumbag result as an HTML table, passing col_space="30px" to to_html.
display(HTML(bb_output.to_html(col_space="30px")))

Unnamed: 0,n,N,r,R
a,3,3,0.375,0.375
g,2,5,0.25,0.625
c,1,6,0.125,0.75
b,1,7,0.125,0.875
h,1,8,0.125,1.0


```ipython
%%timeit -n 100 -r 10
freq_pd(values)
```
```text
1.66 ms ± 507 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)
```

```ipython
%%timeit -n 100 -r 10
enhanced_value_counts(values)
```
```text
3.7 ms ± 448 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)
```

In [8]:
# Ratio of the mean timings reported above: enhanced_value_counts (3.7 ms)
# over freq_pd (1.66 ms). The literals are copied by hand from the %%timeit
# output and must be updated whenever the timings are re-run.
3.7 / 1.66

2.2289156626506026

```ipython
%run -i "benchmark_freq_versus_value_counts.py"
```

![Image](./benchmark_freq_versus_value_counts.png)