In [51]:
import pandas as pd
import numpy as np
from scipy import spatial
import polars as pl
import random
import os
from sklearn.neighbors import KDTree
import pickle
from joblib import dump
from joblib import load
import csv

In [54]:
weapons = ["bomb", "rifle", "arrow"]
arr_random_input = np.random.default_rng().uniform(low=-100,high=100,size=[10_000_000,2])
arr_random_query = np.random.default_rng().uniform(low=-100,high=100,size=[1000,2])
df = pd.DataFrame(arr_random_input, columns=["x","y"])
query_df = pd.DataFrame(arr_random_query, columns=["x","y"])
df["weapon"] = "a"
df["weapon"]=df.weapon.apply(lambda x:random.choice(weapons))
df.to_parquet("input.parquet")
query_df.to_parquet("query.parquet")
query_df.to_csv("query.csv",index=False)

### Original kdtree with polars (our current best) 
does around 5.2sec (@korvó) and around 11 sec on laptp for 10Mx1000

In [5]:
query_df_p = pl.read_parquet("query.parquet")
df_p = pl.read_parquet("input.parquet")

In [6]:
%%timeit
tree = spatial.KDTree(df_p[["x","y"]])
out = []
for row in query_df_p.iter_rows():
    out.append({"dist": tree.query(row)[0], 
                "weapon": df_p[int(tree.query(row)[1])]["weapon"].item()})

10.9 s ± 685 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


list comprehension is actually slower

In [10]:
%%timeit
tree = spatial.KDTree(df_p[["x","y"]])
out = [["dist","weapon"]]
out = [[dist,df_p[int(idx)]["weapon"]] for dist, idx in
        zip(tree.query(query_df_p)[0], tree.query(query_df_p)[1])]

In [41]:
tree = spatial.KDTree(df_p[["x","y"]])
out = [["dist","weapon"]]
out = [[dist,df_p[int(idx)]["weapon"]] for dist, idx in
        zip(tree.query(query_df_p)[0], tree.query(query_df_p)[1])]

# New tree with polars
tree building is around 30 sec but maybe can be exported?

In [44]:
tree = KDTree(df_p[["x","y"]],leaf_size=10)
out = []


querying looks reasonably fast

In [33]:
for row in query_df_p.iter_rows():
    out.append({"dist": tree.query([row])[0][0][0], 
                "weapon": df_p[int(tree.query([row])[1][0][0])]["weapon"].item()})

trying to export tree

In [37]:
from joblib import dump
from joblib import load

In [45]:
dump(tree,"notebook_results/sklearn_tree.joblib")

['notebook_results/sklearn_tree.joblib']

In [46]:
imported_tree = load("notebook_results/sklearn_tree.joblib")

In [47]:
for row in query_df_p.iter_rows():
    out.append({"dist": imported_tree.query([row])[0][0][0], 
                "weapon": df_p[int(imported_tree.query([row])[1][0][0])]["weapon"].item()})

# Tree export-import checking

## Export-import speedtest

In [21]:
query_df_p = pl.read_parquet("query.parquet")
df_p = pl.read_parquet("input.parquet")

In [22]:
tree = spatial.KDTree(df_p[["x","y"]])

In [23]:
with open("tree.pkl","wb") as file:
    pickle.dump(tree, file)


In [24]:
%%timeit
with open("tree.pkl","rb") as file:
    tree = pickle.load(file)


187 ms ± 14.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
dump(tree,"tree.joblib")

['tree.joblib']

In [25]:
%%timeit
tree = load("tree.joblib")

199 ms ± 8.27 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [36]:
tree = spatial.KDTree(df_p[["x","y"]])
with open("tree_scikit.pkl","wb") as file:
    pickle.dump(tree, file)

In [37]:
%%timeit
with open("tree_scikit.pkl","rb") as file:
    tree = pickle.load(file)


202 ms ± 4.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


pickle seems to be faster and scipy.spatial is faster

## Query speedtest

In [38]:
tree = spatial.KDTree(df_p[["x","y"]])

In [39]:
%%timeit
tree.query(query_df_p)

1.09 ms ± 17.2 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [40]:
tree = KDTree(df_p[["x","y"]],leaf_size=10)

In [41]:
%%timeit
tree.query(query_df_p)

1.92 ms ± 16.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


query-ing is faster for spatial tree --> should use spatial

## Blocked lookup

### Baseline

In [57]:
df_p = pl.read_parquet("input.parquet")
tree = spatial.KDTree(df_p[["x","y"]])
with open("tree.pkl","wb") as file:
    pickle.dump(tree, file)



In [60]:
df_p = pl.read_parquet("input.parquet")
query_df_p = pl.read_csv("query.csv")
with open("tree.pkl","rb") as file:
    tree = pickle.load(file)
dist, ind = tree.query(query_df_p)    
pl.DataFrame({"dist":dist,
            "weapon":df_p[ind]["weapon"].to_list()}).write_csv("ad.csv")


In [62]:
%%timeit
df_p = pl.read_parquet("input.parquet")
query_df_p = pl.read_csv("query.csv")
with open("tree.pkl","rb") as file:
    tree = pickle.load(file)
dist, ind = tree.query(query_df_p)
pl.DataFrame({"dist":dist,
            "weapon":df_p[ind]["weapon"].to_list()}).write_csv("ad.csv")

336 ms ± 22.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


with polars + pickle tree load + single querrying we get 330 ms for 10Mx1000 (at datalab)

### Numpy lookup

### Try blocked search

In [100]:
input_df = pl.read_parquet("input.parquet")
weapons = input_df["weapon"].to_list()

In [101]:
weapons

['rifle',
 'bomb',
 'rifle',
 'arrow',
 'rifle',
 'bomb',
 'arrow',
 'bomb',
 'arrow',
 'bomb',
 'arrow',
 'rifle',
 'arrow',
 'bomb',
 'arrow',
 'arrow',
 'bomb',
 'rifle',
 'bomb',
 'bomb',
 'bomb',
 'bomb',
 'rifle',
 'arrow',
 'rifle',
 'rifle',
 'arrow',
 'bomb',
 'arrow',
 'bomb',
 'rifle',
 'rifle',
 'rifle',
 'arrow',
 'rifle',
 'arrow',
 'bomb',
 'bomb',
 'arrow',
 'bomb',
 'rifle',
 'arrow',
 'rifle',
 'rifle',
 'arrow',
 'rifle',
 'rifle',
 'rifle',
 'bomb',
 'bomb',
 'rifle',
 'arrow',
 'arrow',
 'rifle',
 'rifle',
 'arrow',
 'bomb',
 'arrow',
 'arrow',
 'rifle',
 'arrow',
 'arrow',
 'rifle',
 'arrow',
 'arrow',
 'rifle',
 'bomb',
 'bomb',
 'arrow',
 'rifle',
 'rifle',
 'bomb',
 'bomb',
 'arrow',
 'bomb',
 'arrow',
 'rifle',
 'arrow',
 'bomb',
 'arrow',
 'bomb',
 'rifle',
 'arrow',
 'rifle',
 'arrow',
 'bomb',
 'bomb',
 'bomb',
 'rifle',
 'arrow',
 'arrow',
 'rifle',
 'rifle',
 'arrow',
 'rifle',
 'arrow',
 'arrow',
 'bomb',
 'bomb',
 'rifle',
 'arrow',
 'arrow',
 'bomb',
 

In [109]:
weapon_dict = {}
start_idx = 0
current_weapon = input_df["weapon"][0]

In [112]:
weapon_dict[0] = "adas"
weapon_dict[4] = "adfae"

In [113]:
weapon_dict

{0: 'adas', 4: 'adfae'}

In [103]:
current_weapon

'rifle'

In [104]:
for i, weapon in enumerate(df['weapon']):
    # If the weapon changes or it's the last row
    if weapon != current_weapon or i == len(df) - 1:
        end_idx = i - 1 if weapon != current_weapon else i
        weapon_dict[start_idx] = current_weapon
        weapon_dict[end_idx] = weapon
        start_idx = i
        current_weapon = weapon

In [105]:
len(weapon_dict)

8891405

In [106]:
weapon_dict

{0: 'bomb',
 1: 'rifle',
 2: 'arrow',
 3: 'rifle',
 4: 'bomb',
 5: 'arrow',
 6: 'bomb',
 7: 'arrow',
 8: 'bomb',
 9: 'arrow',
 10: 'rifle',
 11: 'arrow',
 12: 'bomb',
 13: 'arrow',
 14: 'arrow',
 15: 'bomb',
 16: 'rifle',
 17: 'bomb',
 18: 'bomb',
 21: 'rifle',
 22: 'arrow',
 23: 'rifle',
 24: 'rifle',
 25: 'arrow',
 26: 'bomb',
 27: 'arrow',
 28: 'bomb',
 29: 'rifle',
 30: 'rifle',
 32: 'arrow',
 33: 'rifle',
 34: 'arrow',
 35: 'bomb',
 36: 'bomb',
 37: 'arrow',
 38: 'bomb',
 39: 'rifle',
 40: 'arrow',
 41: 'rifle',
 42: 'rifle',
 43: 'arrow',
 44: 'rifle',
 45: 'rifle',
 47: 'bomb',
 48: 'bomb',
 49: 'rifle',
 50: 'arrow',
 51: 'arrow',
 52: 'rifle',
 53: 'rifle',
 54: 'arrow',
 55: 'bomb',
 56: 'arrow',
 57: 'arrow',
 58: 'rifle',
 59: 'arrow',
 60: 'arrow',
 61: 'rifle',
 62: 'arrow',
 63: 'arrow',
 64: 'rifle',
 65: 'bomb',
 66: 'bomb',
 67: 'arrow',
 68: 'rifle',
 69: 'rifle',
 70: 'bomb',
 71: 'bomb',
 72: 'arrow',
 73: 'bomb',
 74: 'arrow',
 75: 'rifle',
 76: 'arrow',
 77: 'bom

In [72]:
weapons

weapon
str
"""rifle"""
"""bomb"""
"""rifle"""
"""arrow"""
"""rifle"""
…
"""rifle"""
"""bomb"""
"""arrow"""
"""rifle"""
