Skip to content

Commit

Permalink
Add filter and filter_with to Series (#728)
Browse files Browse the repository at this point in the history
* filter_with

* filter (macro)

* switch to underscore syntax

* no need for curly braces

Co-authored-by: José Valim <jose.valim@gmail.com>

* you know what? no braces at all

* reference mask/2

---------

Co-authored-by: José Valim <jose.valim@gmail.com>
  • Loading branch information
billylanchantin and josevalim committed Nov 7, 2023
1 parent 03a2160 commit f0d981d
Show file tree
Hide file tree
Showing 2 changed files with 162 additions and 0 deletions.
114 changes: 114 additions & 0 deletions lib/explorer/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -1405,6 +1405,120 @@ defmodule Explorer.Series do
@spec at_every(series :: Series.t(), every_n :: integer()) :: Series.t()
def at_every(series, every_n) do
  # Forwards the call to `apply_series` as the `:at_every` operation,
  # passing `every_n` as its only extra argument.
  apply_series(series, :at_every, [every_n])
end

@doc """
Keeps only the values of a series that satisfy an `Explorer.Query`.

The query is compiled and runs efficiently against the series.
It must evaluate to a boolean expression or to a list of boolean
expressions. When a list is given, its entries are joined as `and`
expressions.

> #### Notice {: .notice}
>
> This is a macro. You must `require Explorer.Series` before using it.

Besides element-wise series operations, window functions and
aggregations may also be used inside comparisons.

See `filter_with/2` for a callback version of this function without
`Explorer.Query`.
See `mask/2` if you want to filter values based on another series.

## Syntax

> #### Notice {: .notice}
>
> This macro uses the special `_` syntax.

DataFrames have named columns, so their queries use column names as variables:

    iex> require Explorer.DataFrame
    iex> df = Explorer.DataFrame.new(col_name: [1, 2, 3])
    iex> Explorer.DataFrame.filter(df, col_name > 2)
    #Explorer.DataFrame<
      Polars[1 x 1]
      col_name integer [3]
    >

Series have no named columns.
(A series constitutes a single column, so no name is required.)
This means their queries can't use column names as variables.
Instead, series queries use the special `_` variable like so:

    iex> s = Explorer.Series.from_list([1, 2, 3])
    iex> Explorer.Series.filter(s, _ > 2)
    #Explorer.Series<
      Polars[1]
      integer [3]
    >

## Examples

    iex> s = Explorer.Series.from_list(["a", "b", "c"])
    iex> Explorer.Series.filter(s, _ == "b")
    #Explorer.Series<
      Polars[1]
      string ["b"]
    >

    iex> s = Explorer.Series.from_list([1, 2, 3])
    iex> Explorer.Series.filter(s, remainder(_, 2) == 1)
    #Explorer.Series<
      Polars[2]
      integer [1, 3]
    >

Returning a non-boolean expression errors:

    iex> s = Explorer.Series.from_list([1, 2, 3])
    iex> Explorer.Series.filter(s, cumulative_max(_))
    ** (ArgumentError) expecting the function to return a boolean LazySeries, but instead it returned a LazySeries of type :integer

Which can be addressed by converting it to boolean:

    iex> s = Explorer.Series.from_list([1, 2, 3])
    iex> Explorer.Series.filter(s, cumulative_max(_) == 1)
    #Explorer.Series<
      Polars[1]
      integer [1]
    >

"""
@doc type: :element_wise
defmacro filter(series, query) do
  quote do
    require Explorer.Query

    # The series is placed in a one-column dataframe whose column is named
    # `_`, so the `_` variable in the query refers to that column. After
    # filtering, the same column is pulled back out as a series.
    [_: unquote(series)]
    |> Explorer.DataFrame.new()
    |> Explorer.DataFrame.filter_with(Explorer.Query.query(unquote(query)))
    |> Explorer.DataFrame.pull(:_)
  end
end

@doc """
Filters a series using a callback function.

The callback takes the series as its only argument and, per the spec,
is expected to return a lazy series; the values it selects are kept.

See `mask/2` if you want to filter values based on another series.

## Examples

    iex> series = Explorer.Series.from_list([1, 2, 3])
    iex> is_odd = fn s -> s |> Explorer.Series.remainder(2) |> Explorer.Series.equal(1) end
    iex> Explorer.Series.filter_with(series, is_odd)
    #Explorer.Series<
      Polars[2]
      integer [1, 3]
    >

"""
@doc type: :element_wise
@spec filter_with(
        series :: Series.t(),
        fun :: (Series.t() -> Series.lazy_t())
      ) :: Series.t()
def filter_with(%Series{} = series, fun) when is_function(fun, 1) do
  # The dataframe-level callback receives the whole frame; hand the user
  # callback only the `:series` column it was written against.
  on_column = fn frame -> fun.(frame[:series]) end

  [series: series]
  |> Explorer.DataFrame.new()
  |> Explorer.DataFrame.filter_with(on_column)
  |> Explorer.DataFrame.pull(:series)
end

@doc """
Filters a series with a mask.
Expand Down
48 changes: 48 additions & 0 deletions test/explorer/series_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -2727,6 +2727,54 @@ defmodule Explorer.SeriesTest do
end
end

describe "filter/2" do
  # `Series.filter/2` is a macro, so every test requires the module first.

  test "basic example" do
    require Explorer.Series

    numbers = Series.from_list([1, 2, 3, 4])
    kept = Series.filter(numbers, _ > 2)

    expected = [3, 4]
    assert Series.to_list(kept) == expected
  end

  test "aggregation" do
    require Explorer.Series

    numbers = Series.from_list([1, 2, 3, 4])
    kept = Series.filter(numbers, _ == count(_))

    # Only the value equal to the number of elements survives.
    expected = [4]
    assert Series.to_list(kept) == expected
  end

  test "mismatched columns" do
    require Explorer.Series

    numbers = Series.from_list([1, 2, 3, 4])

    # Any variable other than `_` is looked up as a column and is not found.
    assert_raise ArgumentError,
                 "could not find column name \"n\". The available entries are: [\"_\"]",
                 fn -> Series.filter(numbers, n > 2) end
  end
end

describe "filter_with/2" do
  test "basic example" do
    numbers = Series.from_list([1, 2, 3, 4])
    greater_than_two = fn column -> Series.greater(column, 2) end

    kept = Series.filter_with(numbers, greater_than_two)

    assert Series.to_list(kept) == [3, 4]
  end

  test "raise an error if the function is not returning a lazy series" do
    numbers = Series.from_list([1, 2, 3, 4])

    # Kernel `>` yields a plain boolean rather than a lazy series.
    plain_comparison = fn column -> column > 2 end

    expected_message =
      "expecting the function to return a single or a list of boolean LazySeries, but instead it contains:\ntrue"

    assert_raise ArgumentError, expected_message, fn ->
      Series.filter_with(numbers, plain_comparison)
    end
  end
end

describe "sample/2" do
test "sample taking 10 elements" do
s = 1..100 |> Enum.to_list() |> Series.from_list()
Expand Down

0 comments on commit f0d981d

Please sign in to comment.