In [1]:
import pandas as pd
import time
import random

from forms.executor.dfexecutor.lookup.lookupfuncexecutor import lookup_binary_search, lookup_sort_merge
from forms.executor.dfexecutor.lookup.vlookupfuncexecutor import vlookup_exact_hash, vlookup_exact_loops

In [2]:
def create_df(size=(1000, 10), df_type="constant", start_val: float = 0, first_col_idx=True, seed=2):
    rows, cols = size
    assert rows >= 1 and cols >= 1
    lst = [([0.0] * cols) for _ in range(rows)]
    random.seed(seed)
    for i in range(rows):
        col_start = 0
        if first_col_idx:
            col_start = 1
            lst[i][0] = i
        for j in range(col_start, cols):
            if df_type == "constant":
                val = start_val
            elif df_type == "range":
                val = start_val + i
            elif df_type == "random":
                val = random.random() * rows
            else:
                raise IOError(f"Df type {df_type} is not supported!")
            lst[i][j] = val
    return pd.DataFrame(lst)


def print_results(title: str, subtitle1: str, subtitle2: str, time1: float, time2: float):
    print(title)
    print(f"{subtitle1} time: {time1}")
    print(f"{subtitle2} time: {time2}")
    print()

In [3]:
def run_lookup_trials(df: pd.DataFrame):
    values, search_range, result_range = df.iloc[:, 1], df.iloc[:, 0], df.iloc[:, 2]

    iterations: int = 3

    # Warm up cache
    lookup_binary_search(values, search_range, result_range)
    lookup_sort_merge(values, search_range, result_range)

    # Run trials
    binary_search_time, sort_merge_time = 0, 0
    for i in range(iterations):
        start_time = time.time()
        result1 = lookup_binary_search(values, search_range, result_range)
        binary_search_time += time.time() - start_time
        start_time = time.time()
        result2 = lookup_sort_merge(values, search_range, result_range)
        sort_merge_time += time.time() - start_time
        assert result1.equals(result2)
    for i in range(iterations):
        start_time = time.time()
        result1 = lookup_sort_merge(values, search_range, result_range)
        sort_merge_time += time.time() - start_time
        start_time = time.time()
        result2 = lookup_binary_search(values, search_range, result_range)
        binary_search_time += time.time() - start_time
        assert result1.equals(result2)

    return binary_search_time / (3 * iterations), sort_merge_time / (3 * iterations)

In [4]:
def run_vlookup_exact_trials(df: pd.DataFrame):
    values = df.iloc[:, 1]
    col_idxes = pd.Series([df.shape[1]] * df.shape[0])

    iterations: int = 3

    # Warm up cache
    vlookup_exact_loops(values, df, col_idxes)
    vlookup_exact_hash(values, df, col_idxes)

    # Run trials
    loops_time, hash_time = 0, 0
    for i in range(iterations):
        start_time = time.time()
        result1 = vlookup_exact_loops(values, df, col_idxes)
        loops_time += time.time() - start_time
        start_time = time.time()
        result2 = vlookup_exact_hash(values, df, col_idxes)
        hash_time += time.time() - start_time
        assert result1.equals(result2)
    for i in range(iterations):
        start_time = time.time()
        result1 = vlookup_exact_hash(values, df, col_idxes)
        hash_time += time.time() - start_time
        start_time = time.time()
        result2 = vlookup_exact_loops(values, df, col_idxes)
        loops_time += time.time() - start_time
        assert result1.equals(result2)

    return loops_time / (3 * iterations), hash_time / (3 * iterations)

In [5]:
print("\nBENCHMARKING LOOKUP", "-" * 20, "\n")

df = create_df(df_type="constant", start_val=50)
binary_search_time, sort_merge_time = run_lookup_trials(df)
print_results(
    "Constant DataFrame Exact", "Binary search", "Sort merge", binary_search_time, sort_merge_time
)

df = create_df(df_type="range", start_val=0)
binary_search_time, sort_merge_time = run_lookup_trials(df)
print_results(
    "Range DataFrame Exact", "Binary search", "Sort merge", binary_search_time, sort_merge_time
)

df = create_df(df_type="range", start_val=0.5)
binary_search_time, sort_merge_time = run_lookup_trials(df)
print_results(
    "Range DataFrame Approximate", "Binary search", "Sort merge", binary_search_time, sort_merge_time
)

df = create_df(df_type="range", start_val=-2.5)
binary_search_time, sort_merge_time = run_lookup_trials(df)
print_results(
    "Range DataFrame Approximate NaN",
    "Binary search",
    "Sort merge",
    binary_search_time,
    sort_merge_time,
)

df = create_df(df_type="random")
binary_search_time, sort_merge_time = run_lookup_trials(df)
print_results(
    "Random DataFrame Approximate",
    "Binary search",
    "Sort merge",
    binary_search_time,
    sort_merge_time,
)


BENCHMARKING LOOKUP -------------------- 

Constant DataFrame Exact
Binary search time: 0.051648987664116755
Sort merge time: 0.003308958477444119

Range DataFrame Exact
Binary search time: 0.051115459865993924
Sort merge time: 0.010482655631171333

Range DataFrame Approximate
Binary search time: 0.04970757166544596
Sort merge time: 0.013570547103881836

Range DataFrame Approximate NaN
Binary search time: 0.05060754881964789
Sort merge time: 0.014084895451863607

Random DataFrame Approximate
Binary search time: 0.05478745036655002
Sort merge time: 0.012599335776435005



In [6]:
print("\nBENCHMARKING VLOOKUP EXACT", "-" * 13, "\n")

df = create_df(df_type="constant", start_val=50)
loops_time, hash_time = run_vlookup_exact_trials(df)
print_results("Constant DataFrame Exact", "Nested Loops", "Hash and Probe", loops_time, hash_time)

df = create_df(df_type="range", start_val=0)
loops_time, hash_time = run_vlookup_exact_trials(df)
print_results("Range DataFrame Exact", "Nested Loops", "Hash and Probe", loops_time, hash_time)

df = create_df(df_type="range", start_val=0.5)
loops_time, hash_time = run_vlookup_exact_trials(df)
print_results("Range DataFrame All NaN", "Nested Loops", "Hash and Probe", loops_time, hash_time)


BENCHMARKING VLOOKUP EXACT ------------- 

Constant DataFrame Exact
Nested Loops time: 0.09641014205084907
Hash and Probe time: 0.028596825069851346

Range DataFrame Exact
Nested Loops time: 0.11388158798217773
Hash and Probe time: 0.0296222103966607

Range DataFrame All NaN
Nested Loops time: 0.13159735997517905
Hash and Probe time: 0.016110738118489582

