In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import polars as pl
from polars import DataFrame
import numpy as np
from time import perf_counter

In [49]:
# Seed NumPy's global random state so later np.random draws are repeatable.
seed = 100
np.random.seed(seed)

In [74]:
def gen_rows(count: int, width: int = 3) -> list[np.ndarray]:
    """Generate random columns for a synthetic name/indices table.

    Draws from NumPy's global random state, so call ``np.random.seed`` first
    when reproducible output is needed.

    Args:
        count: Number of rows to generate.
        width: Number of integer indices per row. Defaults to 3.

    Returns:
        A two-element list: a 1-D array of ``count`` names drawn from
        ``'a'`` through ``'g'``, and a ``(count, width)`` int32 array with
        values in ``[0, 33)``.
    """
    # Names are drawn before indices; keep this order so a given seed yields
    # the same data as before.
    names = np.random.choice(['a', 'b', 'c', 'd', 'e', 'f', 'g'], count)
    indices = np.random.randint(0, 33, (count, width), dtype=np.int32)
    return [names, indices]


def load_large_df(data) -> pl.DataFrame:
    """Build the ``name``/``indices`` frame from raw column data.

    ``data`` is handed straight to the ``DataFrame`` constructor with the
    column names ``name`` and ``indices``; the ``indices`` column is then cast
    to a list of 32-bit integers.
    """
    indices_as_list = pl.col("indices").cast(pl.List(pl.Int32))
    frame = DataFrame(data, schema=["name", "indices"])
    return frame.with_columns([indices_as_list])

def queries(db: DataFrame, queries: list[str]) -> set[int]:
    """Accumulate node indices by chaining name lookups through ``db``.

    The first query seeds the node set with the ``indices`` of the first row
    whose ``name`` equals it. Each later query takes the first row with that
    name whose ``indices`` share exactly one element with the nodes gathered
    so far, and merges that row's indices into the set.

    Args:
        db: Frame with a ``name`` column and a list-typed ``indices`` column.
        queries: Names to look up, in order. (Inside the body this parameter
            shadows the function's own name; it is kept for caller
            compatibility.)

    Returns:
        The set of all node indices collected; empty when ``queries`` is empty.

    Raises:
        IndexError: If a query has no matching row.
    """
    if not queries:
        return set()

    def first_indices(predicate: pl.Expr, query: str) -> list[int]:
        # Only the first match is used, so slice to one row before converting
        # to Python objects instead of materialising every matching row.
        first = db.filter(predicate)["indices"].head(1).to_list()
        if not first:
            raise IndexError(f"no row matches query {query!r}")
        return first[0]

    nodes_so_far = set(first_indices(pl.col("name") == queries[0], queries[0]))
    for q in queries[1:]:
        # Keep rows sharing exactly one index with the nodes gathered so far.
        shares_one_node = (
            pl.col("indices").list.set_intersection(list(nodes_so_far)).list.len() == 1
        )
        nodes_so_far.update(first_indices((pl.col("name") == q) & shares_one_node, q))
    return nodes_so_far



In [81]:
# Generate the raw columns before starting the timer, so the reported time
# covers only load_large_df.
data = gen_rows(1_000_000)
start = perf_counter()
df = load_large_df(data)
print(f"Time that it took: {perf_counter() - start}")
print(df)

Time that it took: 0.07667216699337587
shape: (1_000_000, 2)
┌──────┬──────────────┐
│ name ┆ indices      │
│ ---  ┆ ---          │
│ str  ┆ list[i32]    │
╞══════╪══════════════╡
│ e    ┆ [27, 30, 31] │
│ d    ┆ [15, 9, 5]   │
│ a    ┆ [31, 11, 22] │
│ e    ┆ [19, 4, 8]   │
│ e    ┆ [20, 23, 6]  │
│ …    ┆ …            │
│ e    ┆ [15, 1, 22]  │
│ a    ┆ [5, 14, 22]  │
│ c    ┆ [27, 21, 9]  │
│ d    ┆ [21, 11, 1]  │
│ c    ┆ [5, 26, 24]  │
└──────┴──────────────┘


In [92]:
# Time a chain of 25 name lookups against df.
start = perf_counter()

# The return value is discarded; this cell only reports elapsed time.
queries(df, ["a", "b", "c", "a", "e", "f", "g", "g", "f","c","b","a","d","f","e","d","c","b","a","f","e","d","c","b","a"])
print(f"Time that it took: {perf_counter() - start}")


Time that it took: 0.9263365830411203
