-
Notifications
You must be signed in to change notification settings - Fork 144
Performance issues with group by #1058
Copy link
Copy link
Open
Description
@mhanberg alerted us to some unexpectedly slow performance in Discord. I'm making this as a placeholder for myself so I don't forget about the investigation.
Benchmarking
Elixir code & results
defmodule MotchBenchmarkTest do
use ExUnit.Case, async: true
use ExUnitProperties
require Explorer.DataFrame, as: DataFrame
setup do
  # Build a 750k-row synthetic frame to reproduce the reported slowdown.
  # Each *_id column is uniform over a different cardinality so group_by
  # keys have realistic, varied fan-out.
  n_rows = 750_000
  results = ["overbaked", "underbaked", "justright"]

  data_frame =
    DataFrame.new(
      alice_id: Enum.map(1..n_rows, fn _ -> :rand.uniform(5) end),
      bob_id: Enum.map(1..n_rows, fn _ -> :rand.uniform(10) end),
      carol_id: Enum.map(1..n_rows, fn _ -> :rand.uniform(15) end),
      dave_id: Enum.map(1..n_rows, fn _ -> :rand.uniform(20) end),
      foo_id: Enum.map(1..n_rows, fn _ -> :rand.uniform(25) end),
      ingredients: Enum.map(1..n_rows, fn _ -> :rand.uniform(30) end),
      sim_id: Enum.map(1..n_rows, fn _ -> :rand.uniform(35) end),
      # Enum.random/1 picks one element directly; the original
      # flat_map + Enum.take_random(results, 1) allocated a throwaway
      # single-element list per row for the same result.
      result: Enum.map(1..n_rows, fn _ -> Enum.random(results) end)
    )

  # Persist so the same data can be loaded from Python for comparison.
  DataFrame.to_parquet(data_frame, "motch.parquet")
  %{data_frame: data_frame}
end
test "motch", %{data_frame: data_frame} do
  # Prefixed with `_` because the Benchee run below is commented out, so
  # these results are otherwise unused and the compiler warns about them.
  # (Drop the underscores again if the benchmark is re-enabled.)
  _cookies = do_cookies(data_frame)
  _rest = do_rest(data_frame)
  # Benchee.run(
  #   %{
  #     "1_cookies" => fn ->
  #       do_cookies(data_frame)
  #     end,
  #     "2_rest" => fn ->
  #       do_rest(data_frame)
  #     end,
  #     "3_join" => fn ->
  #       DataFrame.join(rest, cookies, on: [:sim_id, :foo_id], how: :left)
  #     end
  #   },
  #   time: 10,
  #   memory_time: 2
  # )
  # |> IO.inspect()
end
# For each participant column, counts value occurrences per sim_id, then
# stacks the four frequency frames and re-aggregates so rows sharing a
# (sim_id, foo_id) pair are summed into a single `cookies` total.
def do_cookies(data_frame) do
  participant_cols = [:alice_id, :bob_id, :carol_id, :dave_id]

  participant_cols
  |> Enum.map(fn col ->
    data_frame
    |> DataFrame.group_by(["sim_id"])
    |> DataFrame.frequencies([to_string(col)])
    |> DataFrame.rename([{col, "foo_id"}])
  end)
  |> DataFrame.concat_rows()
  |> DataFrame.group_by(["sim_id", "foo_id"])
  |> DataFrame.summarise(cookies: sum(counts))
end
# Aggregates several outcome tallies per (sim_id, foo_id) group in one
# summarise pass, then derives a combined score column. This is the query
# the issue reports as dramatically slower than the Polars equivalent.
def do_rest(data_frame) do
  # `^outcomes` pins this module-local list into the Explorer query DSL
  # below (inside `summarise`, bare names refer to columns).
  outcomes = ["overbaked", "underbaked"]
  data_frame
  |> DataFrame.group_by([:sim_id, :foo_id])
  |> DataFrame.summarise(
    cakes:
      sum(
        # 😧
        # NOTE(review): deliberately truncated example — only one of the
        # intended clauses is shown. Presumably Explorer's query DSL maps
        # an exhausted cond to null for non-matching rows (a plain Elixir
        # cond would raise CondClauseError) — TODO confirm against the
        # Explorer.Query docs.
        cond do
          result == "overbaked" or ingredients >= 2 ->
            0
          # like 5 more of these
        end
      ),
    # 1/0 indicator columns summed per group = conditional row counts.
    biscuits: sum(if result in ^outcomes, do: 1, else: 0),
    pies: sum(if result == "overbaked", do: 1, else: 0)
  )
  |> DataFrame.mutate(hot_treats: biscuits * 4 + pies * 3)
end
end

Name                 ips        average  deviation         median         99th %
3_join 224.09 4.46 ms ±6.97% 4.44 ms 5.39 ms
2_rest 1.22 818.17 ms ±1.26% 811.82 ms 838.34 ms
1_cookies 1.13 882.87 ms ±7.32% 863.68 ms 991.60 ms
Comparison:
3_join 224.09
2_rest 1.22 - 183.35x slower +813.71 ms
1_cookies 1.13 - 197.85x slower +878.41 ms
Memory usage statistics:
Name Memory usage
3_join 8.64 KB
2_rest 31.05 KB - 3.59x memory usage +22.41 KB
1_cookies 83.63 KB - 9.68x memory usage +74.98 KB
Python code & results (only for a subset of the full Elixir example)
import polars as pl
df = pl.read_parquet("/Users/billy/projects/elixir-explorer/explorer/motch.parquet")
# Polars port of the Elixir `do_rest/1` aggregation above, run against the
# same parquet data to compare query performance.
def do_rest():
    return (
        df.group_by([pl.col("sim_id"), pl.col("foo_id")])
        .agg(
            # NOTE(review): no .otherwise() on `cakes`, so rows matching no
            # branch presumably become null before the sum — and this uses
            # `> 2` where the Elixir version uses `>= 2`; confirm which
            # threshold was intended.
            cakes=pl.when(pl.Expr.or_((pl.col("result") == "overbaked"), pl.col("ingredients") > 2)).then(0).sum(),
            # 1/0 indicators summed per group = conditional row counts,
            # mirroring the Elixir `biscuits`/`pies` aggregations.
            biscuits=pl.when(pl.col("result").is_in(["overbaked", "underbaked"])).then(1).otherwise(0).sum(),
            pies=pl.when(pl.col("result") == "overbaked").then(1).otherwise(0).sum(),
        )
    )
# `timeit` is used below but was never imported in the pasted snippet.
import timeit

# Average wall-clock seconds per call across 100 runs.
timeit.timeit("do_rest()", globals=locals(), number=100) / 100
#=> 0.03980570465093478

Summary
- Elixir: 800ms
- Python: 40ms
Not sure what's up yet. I'm guessing the query plans will reveal some bad defaults.
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels