In [1]:
%load_ext autoreload
%autoreload 2

In [6]:
import numpy as np
import pandas as pd
import tqdm
import typing

import theoretical_models

## Constructing (standardised) data frames

Each task's experimenter will have saved their data in their own way. The `DataFrame` class below is a base class that each task inherits from in order to standardise how that data is stored. It also matches up the experimenter's recall investigations with the speed measurement results, which are computed separately.

In [3]:
class DataFrame:
    """Factory for standardised, preprocessed pandas data frames.

    Calling this class (or a subclass) does not create an instance of it:
    ``__new__`` returns a ``pd.DataFrame``. Each output column ``<feature>``
    is produced by applying a row mapper named ``<prefix>_<feature>``
    (``recall_*`` or ``speed_*``) to every row of the loaded file.

    The ``speed_*`` mappers are defined here. No ``recall_*`` mappers are
    defined here, so they have to be provided by a subclass.
    """

    @staticmethod
    def _load_dataframe(file: str) -> pd.DataFrame:
        """Load ``file`` into a data frame, picking the reader by extension.

        Args:
            file: Path ending in ``.csv``, ``.json`` or ``.jsonl``
                (case-insensitive).

        Returns:
            The parsed contents of ``file``.

        Raises:
            ValueError: If the extension is none of the supported ones.
        """
        extension = str(file).rsplit(".", 1)[-1].lower()
        if extension == "csv":
            return pd.read_csv(file)
        if extension == "json":
            return pd.read_json(file)
        if extension == "jsonl":
            return pd.read_json(file, lines=True)
        # Fail loudly instead of returning None and crashing later in `.apply`.
        raise ValueError(
            f"Unsupported file extension {extension!r} for {file!r}; "
            "expected one of: csv, json, jsonl"
        )

    @staticmethod
    def _build_dataframe(
        cls: "type[DataFrame]",
        file: str,
        feature_prefix: str,
        features: list[str],
    ) -> pd.DataFrame:
        """Build a preprocessed data frame with one column per feature.

        Args:
            cls: Class on which the ``<feature_prefix>_<feature>`` row mappers
                are looked up (passed explicitly; this is a static method).
            file: Path of the raw data file, see ``_load_dataframe``.
            feature_prefix: Prefix of the mapper names, e.g. ``"recall"``.
            features: Names of the output columns, in output order.

        Returns:
            A frame whose column ``feature`` holds the result of applying
            ``cls.<feature_prefix>_<feature>`` to each row of the raw data.

        Raises:
            AttributeError: If ``cls`` has no mapper for one of ``features``.
        """
        df, data = pd.DataFrame(), cls._load_dataframe(file)
        for feature in tqdm.tqdm(features, desc=f"{feature_prefix} features"):
            function = getattr(cls, f"{feature_prefix}_{feature}")
            # Plain column assignment: creates the column (and, for the first
            # one, the index) directly from the mapped series.
            df[feature] = data.apply(function, axis=1)
        return df

    def __new__(
        cls: "type[DataFrame]",
        recall_file: str,
        speed_file: typing.Optional[str] = None,
        save_file: typing.Optional[str] = None,
    ) -> pd.DataFrame:
        """Build the standardised frame for one task.

        Args:
            recall_file: Path of the raw recall data.
            speed_file: Optional path of the raw speed data. When given, the
                recall and speed frames are merged.
            save_file: Optional output path. Only used when ``speed_file`` is
                given; the merged frame is written as JSON Lines (one record
                per line), whatever the extension of ``save_file`` is.

        Returns:
            The recall frame, or the merged recall/speed frame when
            ``speed_file`` is given.
        """
        recall_features = ["k", "b", "k_mult", "k_b", "interleaved", "metric"]
        recall_df = cls._build_dataframe(cls, recall_file, "recall", recall_features)
        if not speed_file:
            return recall_df

        speed_features = [
            "n", "batch_size", "k", "b", "k_mult", "k_b", "interleaved",
            "method", "duration_mean", "duration_stdv",
            "cost_basic", "cost_serial", "cost_parallel",
        ]
        speed_df = cls._build_dataframe(cls, speed_file, "speed", speed_features)

        # Join on the columns both frames share; spelled out so the join keys
        # are visible here rather than inferred by pandas.
        shared = [name for name in recall_features if name in speed_features]
        merged_df = pd.merge(recall_df, speed_df, on=shared)
        if save_file:
            with open(save_file, "w") as file:
                file.write(merged_df.to_json(lines=True, orient="records"))
        return merged_df

    @staticmethod
    def speed_n(row):
        """Return the ``topk_size`` field of ``row``."""
        return row["topk_size"]

    @staticmethod
    def speed_batch_size(row):
        """Return the ``batch_size`` field of ``row``."""
        return row["batch_size"]

    @staticmethod
    def speed_k(row):
        """Return the ``k`` field of ``row``."""
        return row["k"]

    @staticmethod
    def speed_b(row):
        """Return ``(k // j) * k_mult`` computed from the fields of ``row``."""
        return (row["k"] // row["j"]) * row["k_mult"]

    @staticmethod
    def speed_k_mult(row):
        """Return the ``k_mult`` field of ``row``."""
        return row["k_mult"]

    @staticmethod
    def speed_k_b(row):
        """Return the ``j`` field of ``row``."""
        return row["j"]

    @staticmethod
    def speed_interleaved(row):
        """Return ``row["args"]["interleaved"]``, or ``True`` if it is absent."""
        return row["args"].get("interleaved", True)

    @staticmethod
    def speed_method(row):
        """Return the ``method`` field of ``row``."""
        return row["method"]

    @staticmethod
    def speed_duration_mean(row):
        """Return the mean of the ``duration`` field of ``row``."""
        return np.mean(row["duration"])

    @staticmethod
    def speed_duration_stdv(row):
        """Return the standard deviation (``np.std``) of ``row["duration"]``."""
        return np.std(row["duration"])

    @staticmethod
    def _speed_cost_inputs(row) -> dict:
        """Return the keyword arguments shared by the three cost models."""
        return {
            "k": row["k"],
            "n": row["topk_size"],
            "m": row["batch_size"],
            "b": DataFrame.speed_b(row),
            "k_b": row["j"],
        }

    @staticmethod
    def speed_cost_basic(row):
        """Return ``theoretical_models.cost_basic.approx_topk`` for ``row``."""
        return theoretical_models.cost_basic.approx_topk(
            **DataFrame._speed_cost_inputs(row)
        )

    @staticmethod
    def speed_cost_serial(row):
        """Return ``theoretical_models.cost_serial.approx_topk`` for ``row``."""
        return theoretical_models.cost_serial.approx_topk(
            **DataFrame._speed_cost_inputs(row)
        )

    @staticmethod
    def speed_cost_parallel(row):
        """Return ``theoretical_models.cost_parallel.approx_topk`` for ``row``."""
        return theoretical_models.cost_parallel.approx_topk(
            **DataFrame._speed_cost_inputs(row)
        )

The task-specific data frame classes below describe the mapping from each experimenter's saved data to the standard format.

_Note: as all benchmarking has been run by Alberto, the speed data has the same format across all tasks, and its mapping is therefore included in the parent `DataFrame` class._

In [4]:
class VocabDataFrame(DataFrame):
    """Recall mappers for the vocab recall file.

    Each ``recall_<feature>`` function takes one row of the raw recall data
    and returns the value of the standardised ``<feature>`` column.
    """

    @staticmethod
    def recall_k(row):
        """Return the ``k`` field of ``row``."""
        return row["k"]

    @staticmethod
    def recall_b(row):
        """Return ``num_buckets * k_mult`` computed from the fields of ``row``."""
        return row["num_buckets"] * row["k_mult"]

    @staticmethod
    def recall_k_mult(row):
        """Return the ``k_mult`` field of ``row``."""
        return row["k_mult"]

    @staticmethod
    def recall_k_b(row):
        """Return the ``k_per_bucket`` field of ``row``."""
        return row["k_per_bucket"]

    @staticmethod
    def recall_interleaved(row):
        """Return the ``interleaved`` field of ``row``."""
        return row["interleaved"]

    @staticmethod
    def recall_metric(row):
        """Return the ``recall_k<k>`` field of ``row`` for the row's own ``k``."""
        k = row["k"]
        # The column name embeds k exactly as it is formatted by the f-string.
        return row[f"recall_k{k}"]



class GraphDataFrame(DataFrame):
    """Recall mappers for the graph recall file.

    Each ``recall_<feature>`` function takes one row of the raw recall data
    and returns the value of the standardised ``<feature>`` column.
    """

    @staticmethod
    def recall_k(row):
        """Return the ``K`` field of ``row``."""
        return row["K"]

    @staticmethod
    def recall_b(row):
        """Return the ``n_buckets`` field of ``row`` unchanged."""
        return row["n_buckets"]

    @staticmethod
    def recall_k_mult(row):
        """Return the ``k_mult`` field of ``row``."""
        return row["k_mult"]

    @staticmethod
    def recall_k_b(row):
        """Return the ``J`` field of ``row``."""
        return row["J"]

    @staticmethod
    def recall_interleaved(_):
        """Return ``True`` for every row; the row itself is ignored."""
        return True

    @staticmethod
    def recall_metric(row):
        """Return the ``recall_interleaved`` field of ``row``."""
        return row["recall_interleaved"]


class SparQDataFrame(DataFrame):
    """Adds nothing to ``DataFrame``; no ``recall_*`` mappers are defined here."""


class SynthDataFrame(DataFrame):
    """Adds nothing to ``DataFrame``; no ``recall_*`` mappers are defined here."""

In [8]:
# Build the merged recall + speed frame from the graph data files.
# NOTE: `save_file` is written as JSON Lines (one record per line), so the
# ".csv" suffix below does not describe the actual format of that file.
graph_df = GraphDataFrame(
    recall_file="../data/graph-recall-data.csv",
    speed_file="../data/graph-speed-data.jsonl",
    save_file="../data/graph-data-merged.csv",
)
print(graph_df)

recall features: 100%|██████████| 6/6 [00:00<00:00, 1167.90it/s]
speed features: 100%|██████████| 13/13 [00:00<00:00, 198.12it/s]

         k         b   k_mult  k_b  interleaved    metric        n  \
0     10.0      10.0      1.0  1.0         True  0.653346  2653751   
1     10.0      10.0      1.0  1.0         True  0.653346  2653751   
2     10.0      10.0      1.0  1.0         True  0.653346  2653751   
3     10.0      10.0      1.0  1.0         True  0.653346  2653751   
4     10.0      10.0      1.0  1.0         True  0.653346  2653751   
..     ...       ...      ...  ...          ...       ...      ...   
475  100.0  409600.0  16384.0  4.0         True  1.000000  2653751   
476  100.0  409600.0  16384.0  4.0         True  1.000000  2653751   
477  100.0  409600.0  16384.0  4.0         True  1.000000  2653751   
478  100.0  409600.0  16384.0  4.0         True  1.000000  2653751   
479  100.0  409600.0  16384.0  4.0         True  1.000000  2653751   

     batch_size                                 method  duration_mean  \
0             1         approx_topk.torch_default.topk       0.001936   
1            


