In [18]:
import pandas as pd
import time
import random
from typing import Callable

from forms.executor.dfexecutor.lookup.lookupfuncexecutor import lookup_binary_search, lookup_binary_search_np, lookup_sort_merge
from forms.executor.dfexecutor.lookup.vlookupfuncexecutor import vlookup_exact_hash, vlookup_exact_loops

In [23]:
def create_df(size=(1000, 10), df_type="constant", start_val: float = 0, first_col_idx=True, seed=2):
    """Build a synthetic benchmark DataFrame of shape ``size``.

    Args:
        size: ``(rows, cols)`` of the frame; both must be at least 1.
        df_type: How generated cells are filled:
            ``"constant"`` - every generated cell is ``start_val``;
            ``"range"``    - every generated cell in row ``i`` is ``start_val + i``;
            ``"random"``   - every generated cell is ``random.random() * rows``.
        start_val: Base value for the ``"constant"`` and ``"range"`` fills.
        first_col_idx: When true, column 0 holds the row number and only the
            remaining columns are generated.
        seed: Seed passed to ``random.seed``; note this reseeds the
            module-level ``random`` generator on every call.

    Returns:
        A ``pd.DataFrame`` with default integer row and column labels.

    Raises:
        AssertionError: If ``rows`` or ``cols`` is smaller than 1.
        IOError: If ``df_type`` is not one of the supported kinds. The check
            happens when the first generated cell is computed, so a frame
            with no generated cells never raises it.
    """
    rows, cols = size
    assert rows >= 1 and cols >= 1
    random.seed(seed)

    def cell_value(row_idx):
        # Produce the value for one generated cell of the given row.
        if df_type == "constant":
            return start_val
        if df_type == "range":
            return start_val + row_idx
        if df_type == "random":
            return random.random() * rows
        raise IOError(f"Df type {df_type} is not supported!")

    # Column 0 is reserved for the row number when first_col_idx is set.
    n_generated = cols - 1 if first_col_idx else cols

    table = []
    for row_idx in range(rows):
        # Cells are produced row by row, left to right, so the sequence of
        # random draws is tied to this traversal order.
        generated = [cell_value(row_idx) for _ in range(n_generated)]
        table.append([row_idx, *generated] if first_col_idx else generated)
    return pd.DataFrame(table)

In [26]:
def run_lookup_trials(df: pd.DataFrame, lookup_func: Callable, iterations: int = 5) -> float:
    """Return the mean time, in seconds, of one ``lookup_func`` call on ``df``.

    The function is called as ``lookup_func(values, search_range, result_range)``
    where ``values`` is column 1 of ``df``, ``search_range`` is column 0 and
    ``result_range`` is column 2, so ``df`` needs at least three columns.

    Args:
        df: Benchmark data.
        lookup_func: Callable taking ``(values, search_range, result_range)``.
        iterations: Number of timed calls to average over (default 5).

    Returns:
        Average elapsed seconds per timed call.

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError(f"iterations must be at least 1, got {iterations}")

    values, search_range, result_range = df.iloc[:, 1], df.iloc[:, 0], df.iloc[:, 2]

    # Warm up cache: one untimed call so first-call overhead is not measured.
    lookup_func(values, search_range, result_range)

    # Run trials. perf_counter is monotonic and uses the highest-resolution
    # clock available, unlike time.time(), which can jump and may be coarse.
    total_time = 0.0
    for _ in range(iterations):
        start_time = time.perf_counter()
        lookup_func(values, search_range, result_range)
        total_time += time.perf_counter() - start_time

    return total_time / iterations

In [35]:
def run_vlookup_exact_trials(df: pd.DataFrame, lookup_func: Callable, iterations: int = 5) -> float:
    """Return the mean time, in seconds, of one ``lookup_func`` call on ``df``.

    The function is called as ``lookup_func(values, df, col_idxes)`` where
    ``values`` is column 1 of ``df``, the whole ``df`` is the table being
    searched, and ``col_idxes`` is a Series with one entry per row, each equal
    to the number of columns in ``df``.

    Args:
        df: Benchmark data; also used as the lookup table. Needs at least two
            columns.
        lookup_func: Callable taking ``(values, df, col_idxes)``.
        iterations: Number of timed calls to average over (default 5).

    Returns:
        Average elapsed seconds per timed call.

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError(f"iterations must be at least 1, got {iterations}")

    n_rows, n_cols = df.shape
    values = df.iloc[:, 1]
    # NOTE(review): presumably a 1-based column index selecting the last
    # column of the table - confirm against the vlookup executors.
    col_idxes = pd.Series([n_cols] * n_rows)

    # Warm up cache: one untimed call so first-call overhead is not measured.
    lookup_func(values, df, col_idxes)

    # Run trials. perf_counter is monotonic and uses the highest-resolution
    # clock available, unlike time.time(), which can jump and may be coarse.
    total_time = 0.0
    for _ in range(iterations):
        start_time = time.perf_counter()
        lookup_func(values, df, col_idxes)
        total_time += time.perf_counter() - start_time

    return total_time / iterations

In [27]:
print("\nBENCHMARKING LOOKUP", "-" * 20, "\n")

# Display labels for the three LOOKUP implementations, in the order they are timed.
subtitle1, subtitle2, subtitle3 = "Binary search", "Sort merge", "Binary search NumPy"

def run_lookup_benchmark(df, title):
    """Time every LOOKUP implementation on ``df`` and print the averages.

    Args:
        df: Benchmark frame passed to ``run_lookup_trials``.
        title: Heading printed above the three timing lines.
    """
    time1 = run_lookup_trials(df, lookup_binary_search)
    time2 = run_lookup_trials(df, lookup_sort_merge)
    time3 = run_lookup_trials(df, lookup_binary_search_np)
    print(title)
    print(f"{subtitle1} time: {time1}")
    print(f"{subtitle2} time: {time2}")
    print(f"{subtitle3} time: {time3}")
    print()

# The calls below previously used `run_and_print`, a name that is not defined
# anywhere in this notebook; they now call the function defined in this cell
# so the cell runs on a fresh kernel.

# Every lookup value is 50 and the search column is 0..rows-1, so each value
# is present in the search range.
df = create_df(df_type="constant", start_val=50)
run_lookup_benchmark(df, "Constant DataFrame Exact")

# Lookup values equal the row number, i.e. they mirror the search column.
df = create_df(df_type="range", start_val=0)
run_lookup_benchmark(df, "Range DataFrame Exact")

# Lookup values are row number + 0.5, so none of them equals a search key.
df = create_df(df_type="range", start_val=0.5)
run_lookup_benchmark(df, "Range DataFrame Approximate")

# Lookup values start at -2.5, so the first three are below the smallest
# search key (0). NOTE(review): the title suggests these produce NaN - confirm
# against the lookup executors.
df = create_df(df_type="range", start_val=-2.5)
run_lookup_benchmark(df, "Range DataFrame Approximate NaN")

# Lookup values are seeded pseudo-random floats in [0, rows).
df = create_df(df_type="random")
run_lookup_benchmark(df, "Random DataFrame Approximate")


BENCHMARKING LOOKUP -------------------- 

Constant DataFrame Exact
Binary search time: 0.08154296875
Sort merge time: 0.00537405014038086
Binary search NumPy time: 0.009706258773803711

Range DataFrame Exact
Binary search time: 0.08289117813110351
Sort merge time: 0.016659021377563477
Binary search NumPy time: 0.009268760681152344

Range DataFrame Approximate
Binary search time: 0.07866334915161133
Sort merge time: 0.02214956283569336
Binary search NumPy time: 0.007753229141235352

Range DataFrame Approximate NaN
Binary search time: 0.07774567604064941
Sort merge time: 0.020625972747802736
Binary search NumPy time: 0.007583475112915039

Random DataFrame Approximate
Binary search time: 0.0788238525390625
Sort merge time: 0.01687335968017578
Binary search NumPy time: 0.008039569854736328



In [36]:
print("\nBENCHMARKING VLOOKUP EXACT", "-" * 13, "\n")

# Display labels for the two exact-match VLOOKUP implementations, in the order
# they are timed. They use their own names so the labels read by the LOOKUP
# benchmark function (subtitle1..3) are not overwritten by this cell.
vlookup_subtitle1, vlookup_subtitle2 = "Nested Loops", "Hash and Probe"

def run_vlookup_benchmark(df, title):
    """Time every exact-match VLOOKUP implementation on ``df`` and print the averages.

    Named distinctly from ``run_lookup_benchmark`` so that running this cell
    does not replace the LOOKUP benchmark function defined earlier.

    Args:
        df: Benchmark frame passed to ``run_vlookup_exact_trials``.
        title: Heading printed above the two timing lines.
    """
    time1 = run_vlookup_exact_trials(df, vlookup_exact_loops)
    time2 = run_vlookup_exact_trials(df, vlookup_exact_hash)
    print(title)
    print(f"{vlookup_subtitle1} time: {time1}")
    print(f"{vlookup_subtitle2} time: {time2}")
    print()

# Every lookup value is 50, which also occurs in the row-number column 0.
df = create_df(df_type="constant", start_val=50)
run_vlookup_benchmark(df, "Constant DataFrame Exact")

# Lookup values equal the row number stored in column 0.
df = create_df(df_type="range", start_val=0)
run_vlookup_benchmark(df, "Range DataFrame Exact")

# Lookup values are row number + 0.5, so none of them equals a value in
# column 0. NOTE(review): the title suggests every result is NaN, which
# presumes matching is done against the table's first column - confirm.
df = create_df(df_type="range", start_val=0.5)
run_vlookup_benchmark(df, "Range DataFrame All NaN")


BENCHMARKING VLOOKUP EXACT ------------- 

Constant DataFrame Exact
Nested Loops time: 0.14468202590942383
Hash and Probe time: 0.04580655097961426

Range DataFrame Exact
Nested Loops time: 0.18172135353088378
Hash and Probe time: 0.04652438163757324

Range DataFrame All NaN
Nested Loops time: 0.18553013801574708
Hash and Probe time: 0.02304677963256836

