In [2]:
import pandas as pd
import numpy as np
from scipy import spatial
import polars as pl
import random
import os
from sklearn.neighbors import KDTree

In [4]:
# Synthetic benchmark data: 10M random 2-D points (the search set) and 1000 query
# points, both uniform on [-100, 100), plus a random weapon label per search-set point.
SEED = 42  # fixed so the dataset is identical after Restart & Run All
weapons = ["bomb", "rifle", "arrow"]

# One seeded generator for every draw in this cell.
rng = np.random.default_rng(SEED)
arr_random_input = rng.uniform(low=-100, high=100, size=[10_000_000, 2])
arr_random_query = rng.uniform(low=-100, high=100, size=[1000, 2])
df = pd.DataFrame(arr_random_input, columns=["x", "y"])
query_df = pd.DataFrame(arr_random_query, columns=["x", "y"])

# Draw all labels in one vectorized call instead of 10M Python-level random.choice calls.
df["weapon"] = rng.choice(weapons, size=len(df))

# Make sure the output directory exists before writing; safe to re-run.
os.makedirs("notebook_results", exist_ok=True)
df.to_parquet("notebook_results/input.parquet")
query_df.to_parquet("notebook_results/query.parquet")

### Original kdtree with polars (our current best) 
Takes around 5.2 s (@korvó) and around 11 s on a laptop for 10M points × 1000 queries.

In [5]:
# Reload the benchmark data as polars frames: first the search set, then the query points.
df_p = pl.read_parquet("notebook_results/input.parquet")
query_df_p = pl.read_parquet("notebook_results/query.parquet")

In [6]:
%%timeit
tree = spatial.KDTree(df_p[["x","y"]])
out = []
for row in query_df_p.iter_rows():
    out.append({"dist": tree.query(row)[0], 
                "weapon": df_p[int(tree.query(row)[1])]["weapon"].item()})

10.9 s ± 685 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


list comprehension is actually slower

In [10]:
%%timeit
tree = spatial.KDTree(df_p[["x","y"]])
out = [["dist","weapon"]]
out = [[dist,df_p[int(idx)]["weapon"]] for dist, idx in
        zip(tree.query(query_df_p)[0], tree.query(query_df_p)[1])]

In [41]:
# Untimed run of the batched lookup so that `tree` and `out` stay in the namespace.
tree = spatial.KDTree(df_p[["x","y"]])
weapon_col = df_p["weapon"]
# A single batched call returns parallel arrays of distances and row indices;
# calling tree.query twice (once per array) would run the whole search twice.
dists, idxs = tree.query(query_df_p)
# Each entry is [distance, weapon label], with the label as a scalar.
out = [[dist, weapon_col[int(idx)]] for dist, idx in zip(dists, idxs)]

# New tree (scikit-learn `KDTree`) with polars
Building the tree takes around 30 s, but maybe the built tree can be exported and reloaded?

In [44]:
# Build the scikit-learn KDTree over the (x, y) columns with leaf_size=10.
tree = KDTree(
    df_p[["x", "y"]],
    leaf_size=10,
)
# Empty result list for the query cells that follow.
out = list()


querying looks reasonably fast

In [33]:
# Nearest-neighbour lookup per query row with the scikit-learn tree.
out = []  # start clean so re-running this cell does not pile up duplicate results
weapon_col = df_p["weapon"]  # hoisted: avoids a one-row DataFrame per lookup
for row in query_df_p.iter_rows():
    # The tree takes a 2-D batch, hence [row]. One call returns (distances, indices),
    # each indexed [0][0] for the single nearest neighbour of the single query point.
    dist, idx = tree.query([row])
    out.append({"dist": dist[0][0], "weapon": weapon_col[int(idx[0][0])]})

trying to export tree

In [37]:
from joblib import dump
from joblib import load

In [45]:
# Persist the built tree to disk; the cell output is the list of files joblib wrote.
dump(tree, filename="notebook_results/sklearn_tree.joblib")

['notebook_results/sklearn_tree.joblib']

In [46]:
# Reload the persisted tree. joblib.load unpickles the file, so only use it on
# files produced by this notebook or another trusted source.
imported_tree = load(filename="notebook_results/sklearn_tree.joblib")

In [47]:
# Repeat the per-row lookup with the tree reloaded from disk.
# Reset the result list first; otherwise these rows would be appended to the
# results already collected from the in-memory tree.
out = []
weapon_col = df_p["weapon"]  # hoisted: avoids a one-row DataFrame per lookup
for row in query_df_p.iter_rows():
    # The tree takes a 2-D batch, hence [row]. One call returns (distances, indices),
    # each indexed [0][0] for the single nearest neighbour of the single query point.
    dist, idx = imported_tree.query([row])
    out.append({"dist": dist[0][0], "weapon": weapon_col[int(idx[0][0])]})