In [1]:
import pandas as pd
import time
import random
from typing import Callable

from forms.executor.dfexecutor.lookup.lookupfuncexecutor import lookup_binary_search, lookup_binary_search_np, lookup_sort_merge
from forms.executor.dfexecutor.lookup.vlookupfuncexecutor import vlookup_exact_hash, vlookup_exact_loops

In [2]:
def create_df(size=(1000, 10), df_type="constant", start_val: float = 0, first_col_idx=True, seed=2):
    """Build a synthetic DataFrame used as benchmark input.

    Args:
        size: ``(rows, cols)`` of the frame; both must be at least 1.
        df_type: How the generated cells are filled:
            ``"constant"`` - every generated cell is ``start_val``;
            ``"range"``    - every generated cell in row ``i`` is ``start_val + i``;
            ``"random"``   - every generated cell is ``random() * rows``,
            i.e. a float in ``[0, rows)``.
        start_val: Base value for the ``"constant"`` and ``"range"`` fills.
        first_col_idx: When True, column 0 holds the row number
            (``0 .. rows - 1``) and generated values start at column 1.
        seed: Seed for the random fill; the same seed reproduces the same frame.

    Returns:
        A ``pd.DataFrame`` with ``rows`` rows and ``cols`` columns.

    Raises:
        AssertionError: If ``rows`` or ``cols`` is smaller than 1.
        IOError: If ``df_type`` is not one of the supported fills.
    """
    rows, cols = size
    assert rows >= 1 and cols >= 1

    # Validate once, up front, so an unsupported type is rejected even when
    # there is no generated column to fill (cols == 1 with first_col_idx=True).
    if df_type not in ("constant", "range", "random"):
        raise IOError(f"Df type {df_type} is not supported!")

    # A private generator produces the same sequence as random.seed(seed)
    # followed by random.random(), without reseeding the process-wide RNG.
    rng = random.Random(seed)

    col_start = 1 if first_col_idx else 0
    data = []
    for i in range(rows):
        row = [i] if first_col_idx else []
        # Row-major fill order keeps the random draws in a fixed, reproducible order.
        for _ in range(col_start, cols):
            if df_type == "constant":
                row.append(start_val)
            elif df_type == "range":
                row.append(start_val + i)
            else:  # "random"
                row.append(rng.random() * rows)
        data.append(row)
    return pd.DataFrame(data)

In [3]:
def run_lookup_trials(df: pd.DataFrame, lookup_func: Callable, iterations: int = 5):
    """Return the mean wall-clock seconds of one ``lookup_func`` call on ``df``.

    Column 1 of ``df`` is passed as the values to look up, column 0 as the
    search range and column 2 as the result range, so ``df`` needs at least
    three columns.

    Args:
        df: Benchmark input frame.
        lookup_func: Callable invoked as
            ``lookup_func(values, search_range, result_range)``.
        iterations: Number of timed calls to average over (default 5).

    Returns:
        Average duration in seconds of the timed calls, as a float.

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    values, search_range, result_range = df.iloc[:, 1], df.iloc[:, 0], df.iloc[:, 2]

    # Warm up cache: one untimed call so the timed calls are not skewed by
    # first-call overhead.
    lookup_func(values, search_range, result_range)

    # Run trials. perf_counter is monotonic and high resolution, unlike
    # time.time(), which can jump when the system clock is adjusted.
    total_time = 0.0
    for _ in range(iterations):
        start_time = time.perf_counter()
        lookup_func(values, search_range, result_range)
        total_time += time.perf_counter() - start_time

    return total_time / iterations

In [4]:
def run_vlookup_exact_trials(df: pd.DataFrame, lookup_func: Callable, iterations: int = 5):
    """Return the mean wall-clock seconds of one ``lookup_func`` call on ``df``.

    Column 1 of ``df`` is passed as the values to look up, the whole frame as
    the table to search, and a Series holding ``df.shape[1]`` once per row as
    the requested column indexes.

    Args:
        df: Benchmark input frame; needs at least two columns.
        lookup_func: Callable invoked as ``lookup_func(values, df, col_idxes)``.
        iterations: Number of timed calls to average over (default 5).

    Returns:
        Average duration in seconds of the timed calls, as a float.

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    values = df.iloc[:, 1]
    row_count, col_count = df.shape
    # Every row requests column number `col_count`.
    # NOTE(review): presumably a 1-based index selecting the last column - confirm
    # against the vlookup executors.
    col_idxes = pd.Series([col_count] * row_count)

    # Warm up cache: one untimed call so the timed calls are not skewed by
    # first-call overhead.
    lookup_func(values, df, col_idxes)

    # Run trials. perf_counter is monotonic and high resolution, unlike
    # time.time(), which can jump when the system clock is adjusted.
    total_time = 0.0
    for _ in range(iterations):
        start_time = time.perf_counter()
        lookup_func(values, df, col_idxes)
        total_time += time.perf_counter() - start_time

    return total_time / iterations

In [5]:
# Banner separating the LOOKUP results from the rest of the notebook output.
print("\nBENCHMARKING LOOKUP", "-" * 20, "\n")

# Display label for each LOOKUP implementation timed in this cell.
subtitle1 = "Binary search"
subtitle2 = "Sort merge"
subtitle3 = "Binary search NumPy"

def run_lookup_benchmark(df, title):
    """Time every LOOKUP implementation on ``df`` and print a small report.

    All implementations are timed first; the report (title, one line per
    implementation, blank line) is printed afterwards.

    Args:
        df: Benchmark input frame, forwarded to ``run_lookup_trials``.
        title: Heading printed above the timings.
    """
    # Labels are paired with their functions here rather than read from the
    # module-level subtitle variables: those names are rebound by the VLOOKUP
    # cell, which would mislabel the rows if this function ran afterwards.
    implementations = (
        ("Binary search", lookup_binary_search),
        ("Sort merge", lookup_sort_merge),
        ("Binary search NumPy", lookup_binary_search_np),
    )
    timings = [(label, run_lookup_trials(df, func)) for label, func in implementations]

    print(title)
    for label, elapsed in timings:
        print(f"{label} time: {elapsed}")
    print()

# Each scenario is (report title, keyword arguments for create_df).
# With the default create_df settings column 0 holds the keys 0..999, and
# run_lookup_trials looks up the column-1 values among those keys.
lookup_scenarios = [
    # Every looked-up value is 50, which is one of the keys.
    ("Constant DataFrame Exact", {"df_type": "constant", "start_val": 50}),
    # Row i looks up the value i, which equals its own key.
    ("Range DataFrame Exact", {"df_type": "range", "start_val": 0}),
    # Row i looks up i + 0.5, which falls between two keys.
    ("Range DataFrame Approximate", {"df_type": "range", "start_val": 0.5}),
    # Row i looks up i - 2.5, so the first rows fall below the smallest key.
    # NOTE(review): per the title those rows presumably resolve to NaN - confirm.
    ("Range DataFrame Approximate NaN", {"df_type": "range", "start_val": -2.5}),
    # Looked-up values are seeded random floats in [0, 1000).
    ("Random DataFrame Approximate", {"df_type": "random"}),
]

for scenario_title, scenario_kwargs in lookup_scenarios:
    df = create_df(**scenario_kwargs)
    run_lookup_benchmark(df, scenario_title)


BENCHMARKING LOOKUP -------------------- 

Constant DataFrame Exact
Binary search time: 0.0827484130859375
Sort merge time: 0.0052035808563232425
Binary search NumPy time: 0.009124374389648438

Range DataFrame Exact
Binary search time: 0.08574237823486328
Sort merge time: 0.015183353424072265
Binary search NumPy time: 0.009176206588745118

Range DataFrame Approximate
Binary search time: 0.07981395721435547
Sort merge time: 0.020752763748168944
Binary search NumPy time: 0.008109760284423829

Range DataFrame Approximate NaN
Binary search time: 0.07844061851501465
Sort merge time: 0.020850515365600585
Binary search NumPy time: 0.007762813568115234

Random DataFrame Approximate
Binary search time: 0.07918896675109863
Sort merge time: 0.018822383880615235
Binary search NumPy time: 0.008295965194702149



In [6]:
# Banner separating the exact-match VLOOKUP results from the rest of the output.
print("\nBENCHMARKING VLOOKUP EXACT", "-" * 13, "\n")

# Display labels for this cell. Only the first two are printed by
# run_vlookup_benchmark; subtitle3 is assigned but not used by it.
subtitle1 = "Nested Loops"
subtitle2 = "Hash and Probe"
subtitle3 = "Binary search NumPy"

def run_vlookup_benchmark(df, title):
    """Time every exact-match VLOOKUP implementation on ``df`` and print a report.

    All implementations are timed first; the report (title, one line per
    implementation, blank line) is printed afterwards.

    Args:
        df: Benchmark input frame, forwarded to ``run_vlookup_exact_trials``.
        title: Heading printed above the timings.
    """
    # Labels are paired with their functions here rather than read from the
    # module-level subtitle variables: those names are shared with the LOOKUP
    # cell, so re-running that cell would mislabel the rows printed here.
    implementations = (
        ("Nested Loops", vlookup_exact_loops),
        ("Hash and Probe", vlookup_exact_hash),
    )
    timings = [(label, run_vlookup_exact_trials(df, func)) for label, func in implementations]

    print(title)
    for label, elapsed in timings:
        print(f"{label} time: {elapsed}")
    print()

# Each scenario is (report title, keyword arguments for create_df).
# With the default create_df settings column 0 holds the row numbers 0..999,
# and run_vlookup_exact_trials looks up the column-1 values in the whole frame.
vlookup_scenarios = [
    # Every looked-up value is 50.
    ("Constant DataFrame Exact", {"df_type": "constant", "start_val": 50}),
    # Row i looks up the value i, which equals its own column-0 entry.
    ("Range DataFrame Exact", {"df_type": "range", "start_val": 0}),
    # Row i looks up i + 0.5, which never equals an integer column-0 entry.
    # NOTE(review): per the title every lookup presumably resolves to NaN - confirm.
    ("Range DataFrame All NaN", {"df_type": "range", "start_val": 0.5}),
]

for scenario_title, scenario_kwargs in vlookup_scenarios:
    df = create_df(**scenario_kwargs)
    run_vlookup_benchmark(df, scenario_title)


BENCHMARKING VLOOKUP EXACT ------------- 

Constant DataFrame Exact
Nested Loops time: 0.14830985069274902
Hash and Probe time: 0.0447239875793457

Range DataFrame Exact
Nested Loops time: 0.1719046115875244
Hash and Probe time: 0.04344358444213867

Range DataFrame All NaN
Nested Loops time: 0.18710665702819823
Hash and Probe time: 0.022646999359130858

