From 3f64a557324030882a5cae27ed86b4490d7f165c Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Wed, 17 Apr 2024 14:49:32 -0300 Subject: [PATCH 1/7] Normalise dtype for lazy series The idea is to simplify and validate the dtypes for lazy series. --- lib/explorer/backend/lazy_series.ex | 2 ++ lib/explorer/backend/series.ex | 2 ++ test/explorer/backend/lazy_series_test.exs | 10 +++++----- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/lib/explorer/backend/lazy_series.ex b/lib/explorer/backend/lazy_series.ex index 68e903a06..6557e8732 100644 --- a/lib/explorer/backend/lazy_series.ex +++ b/lib/explorer/backend/lazy_series.ex @@ -198,6 +198,8 @@ defmodule Explorer.Backend.LazySeries do @doc false def new(op, args, dtype, aggregation \\ false) do + dtype = Explorer.Shared.normalise_dtype!(dtype) + %__MODULE__{op: op, args: args, dtype: dtype, aggregation: aggregation} end diff --git a/lib/explorer/backend/series.ex b/lib/explorer/backend/series.ex index 7e18d2675..b43c4ea5f 100644 --- a/lib/explorer/backend/series.ex +++ b/lib/explorer/backend/series.ex @@ -324,6 +324,8 @@ defmodule Explorer.Backend.Series do Create a new `Series`. 
""" def new(data, dtype) do + dtype = Explorer.Shared.normalise_dtype!(dtype) + %Explorer.Series{data: data, dtype: dtype} end diff --git a/test/explorer/backend/lazy_series_test.exs b/test/explorer/backend/lazy_series_test.exs index 7b4bddec6..cd80ebe71 100644 --- a/test/explorer/backend/lazy_series_test.exs +++ b/test/explorer/backend/lazy_series_test.exs @@ -5,8 +5,8 @@ defmodule Explorer.Backend.LazySeriesTest do alias Explorer.Backend.LazySeries test "inspect/2 gives a basic hint of lazy series" do - data = LazySeries.new(:column, ["col_a"], :unknown) - opaque_series = Backend.Series.new(data, {:s, 64}) + data = LazySeries.new(:column, ["col_a"], :s64) + opaque_series = Backend.Series.new(data, :s64) assert inspect(opaque_series) == """ @@ -18,7 +18,7 @@ defmodule Explorer.Backend.LazySeriesTest do end test "inspect/2 with nested operations" do - col = LazySeries.new(:column, ["col_a"], :unknown) + col = LazySeries.new(:column, ["col_a"], :s64) equal = LazySeries.new(:equal, [col, 5], :boolean) series = Backend.Series.new(equal, :boolean) @@ -33,7 +33,7 @@ defmodule Explorer.Backend.LazySeriesTest do end test "inspect/2 with single-element series" do - col = LazySeries.new(:column, ["col_a"], :unknown) + col = LazySeries.new(:column, ["col_a"], :u32) equal = LazySeries.new(:equal, [col, Explorer.Series.from_list([5]).data], :boolean) series = Backend.Series.new(equal, :boolean) @@ -48,7 +48,7 @@ defmodule Explorer.Backend.LazySeriesTest do end test "inspect/2 with nested series" do - col = LazySeries.new(:column, ["col_a"], :unknown) + col = LazySeries.new(:column, ["col_a"], :u32) equal = LazySeries.new(:equal, [col, Explorer.Series.from_list([1, 2, 3]).data], :boolean) series = Backend.Series.new(equal, :boolean) From 24805ac00830dec04a36dc47bd2624f7cb747b47 Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Wed, 17 Apr 2024 18:14:57 -0300 Subject: [PATCH 2/7] Pass down the backend to the LazySeries The idea is to enable using it in case of any requirement. 
--- lib/explorer/backend/lazy_frame.ex | 16 ++++++++++++++-- lib/explorer/backend/lazy_series.ex | 21 ++++++++++++++++++--- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/lib/explorer/backend/lazy_frame.ex b/lib/explorer/backend/lazy_frame.ex index d59f0af02..087eef31a 100644 --- a/lib/explorer/backend/lazy_frame.ex +++ b/lib/explorer/backend/lazy_frame.ex @@ -10,9 +10,10 @@ defmodule Explorer.Backend.LazyFrame do alias Explorer.Backend alias Explorer.Backend.LazySeries - defstruct dtypes: %{}, names: [] + defstruct dtypes: %{}, names: [], original_data: nil @type t :: %__MODULE__{ + original_data: Backend.DataFrame.t(), dtypes: Backend.DataFrame.dtypes(), names: Backend.DataFrame.column_name() } @@ -21,7 +22,7 @@ defmodule Explorer.Backend.LazyFrame do @doc false def new(df) do Explorer.Backend.DataFrame.new( - %__MODULE__{names: df.names, dtypes: df.dtypes}, + %__MODULE__{names: df.names, dtypes: df.dtypes, original_data: df.data}, df.names, df.dtypes ) @@ -73,10 +74,21 @@ defmodule Explorer.Backend.LazyFrame do @impl true def pull(df, column) do dtype_for_column = df.dtypes[column] + series_backend = get_series_backend(df.data.original_data) + data = LazySeries.new(:column, [column], dtype_for_column) + data = %{data | backend: series_backend} + Backend.Series.new(data, dtype_for_column) end + defp get_series_backend(%module{}) do + module + |> Module.split() + |> List.replace_at(-1, "Series") + |> Module.concat() + end + funs = Backend.DataFrame.behaviour_info(:callbacks) -- (Backend.DataFrame.behaviour_info(:optional_callbacks) ++ diff --git a/lib/explorer/backend/lazy_series.ex b/lib/explorer/backend/lazy_series.ex index 6557e8732..26bb9f7b8 100644 --- a/lib/explorer/backend/lazy_series.ex +++ b/lib/explorer/backend/lazy_series.ex @@ -9,9 +9,15 @@ defmodule Explorer.Backend.LazySeries do @behaviour Explorer.Backend.Series - defstruct op: nil, args: [], dtype: nil, aggregation: false + defstruct op: nil, args: [], dtype: nil, aggregation: 
false, backend: nil - @type t :: %__MODULE__{op: atom(), args: list(), dtype: any(), aggregation: boolean()} + @type t :: %__MODULE__{ + op: atom(), + args: list(), + dtype: any(), + aggregation: boolean(), + backend: nil | module() + } @operations [ # Element-wise @@ -199,8 +205,17 @@ defmodule Explorer.Backend.LazySeries do @doc false def new(op, args, dtype, aggregation \\ false) do dtype = Explorer.Shared.normalise_dtype!(dtype) + backend = backend_from_args(args) - %__MODULE__{op: op, args: args, dtype: dtype, aggregation: aggregation} + %__MODULE__{op: op, args: args, dtype: dtype, backend: backend, aggregation: aggregation} + end + + defp backend_from_args(args) do + Enum.find(args, fn + %__MODULE__{} = arg -> arg.backend + %module{} -> module + _other -> nil + end) end @doc false From e3b8550cc2225482534ceced73c8a746dccb3a87 Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Wed, 17 Apr 2024 19:06:17 -0300 Subject: [PATCH 3/7] Use backend to get target dtype for lazy series --- lib/explorer/backend/lazy_series.ex | 17 +++++------ lib/explorer/backend/series.ex | 1 + lib/explorer/polars_backend/expression.ex | 2 +- lib/explorer/polars_backend/native.ex | 5 ++-- lib/explorer/polars_backend/series.ex | 12 ++++++-- native/explorer/src/expressions.rs | 10 +++++++ native/explorer/src/lib.rs | 6 ++-- native/explorer/src/series.rs | 17 +++++++++-- test/explorer/data_frame_test.exs | 36 +++++++++++++++++++---- 9 files changed, 82 insertions(+), 24 deletions(-) diff --git a/lib/explorer/backend/lazy_series.ex b/lib/explorer/backend/lazy_series.ex index 26bb9f7b8..cd8bb8638 100644 --- a/lib/explorer/backend/lazy_series.ex +++ b/lib/explorer/backend/lazy_series.ex @@ -1194,19 +1194,15 @@ defmodule Explorer.Backend.LazySeries do end @impl true - def re_named_captures(_series, _pattern) do - raise """ - #{unsupported(:re_named_captures, 2)} - - If you want to capture named groups from a column, you must do so outside of a query. 
- For example, instead of: + def re_named_captures(series, pattern) do + lazy_s = lazy_series!(series) + backend = lazy_s.backend - Explorer.DataFrame.mutate(df, new_column: re_named_captures(column, ~S/(a|b)/)) + target_dtype = backend.re_dtype(pattern) - You must write: + data = new(:re_named_captures, [lazy_s, pattern], target_dtype) - Explorer.DataFrame.put(df, :new_column, Explorer.Series.re_named_captures(column, ~S/(a|b)/)) - """ + Backend.Series.new(data, target_dtype) end @remaining_non_lazy_operations [ @@ -1218,6 +1214,7 @@ defmodule Explorer.Backend.LazySeries do cut: 5, qcut: 5, mask: 2, + re_dtype: 1, to_iovec: 1, to_list: 1 ] diff --git a/lib/explorer/backend/series.ex b/lib/explorer/backend/series.ex index b43c4ea5f..9394fbd59 100644 --- a/lib/explorer/backend/series.ex +++ b/lib/explorer/backend/series.ex @@ -298,6 +298,7 @@ defmodule Explorer.Backend.Series do @callback re_count_matches(s, String.t()) :: s @callback re_scan(s, String.t()) :: s @callback re_named_captures(s, String.t()) :: s + @callback re_dtype(String.t()) :: dtype() # Date / DateTime diff --git a/lib/explorer/polars_backend/expression.ex b/lib/explorer/polars_backend/expression.ex index 56cc70a39..8b43445d4 100644 --- a/lib/explorer/polars_backend/expression.ex +++ b/lib/explorer/polars_backend/expression.ex @@ -144,6 +144,7 @@ defmodule Explorer.PolarsBackend.Expression do count_matches: 2, re_count_matches: 2, re_scan: 2, + re_named_captures: 2, # Lists join: 2, @@ -171,7 +172,6 @@ defmodule Explorer.PolarsBackend.Expression do concat: 1, column: 1, correlation: 4, - re_named_captures: 2, covariance: 3 ] diff --git a/lib/explorer/polars_backend/native.ex b/lib/explorer/polars_backend/native.ex index 6742544d5..dd9c46ce4 100644 --- a/lib/explorer/polars_backend/native.ex +++ b/lib/explorer/polars_backend/native.ex @@ -286,8 +286,9 @@ defmodule Explorer.PolarsBackend.Native do def s_concat(_series_list), do: err() def s_contains(_s, _pattern, _is_literal), do: err() def 
s_count_matches(_s, _pattern, _is_literal), do: err() - def s_extract_all(_s, _pattern), do: err() - def s_extract_groups(_s, _pattern), do: err() + def s_re_scan(_s, _pattern), do: err() + def s_re_named_captures(_s, _pattern), do: err() + def s_re_dtype(_pattern), do: err() def s_cumulative_max(_s, _reverse), do: err() def s_cumulative_min(_s, _reverse), do: err() def s_cumulative_sum(_s, _reverse), do: err() diff --git a/lib/explorer/polars_backend/series.ex b/lib/explorer/polars_backend/series.ex index d080d192a..aa6d752bc 100644 --- a/lib/explorer/polars_backend/series.ex +++ b/lib/explorer/polars_backend/series.ex @@ -790,12 +790,20 @@ defmodule Explorer.PolarsBackend.Series do @impl true def re_scan(series, pattern) do - Shared.apply_series(series, :s_extract_all, [pattern]) + Shared.apply_series(series, :s_re_scan, [pattern]) end @impl true def re_named_captures(series, pattern) do - Shared.apply_series(series, :s_extract_groups, [pattern]) + Shared.apply_series(series, :s_re_named_captures, [pattern]) + end + + @impl true + def re_dtype(regex_as_string) when is_binary(regex_as_string) do + case Explorer.PolarsBackend.Native.s_re_dtype(regex_as_string) do + {:ok, dtype} -> dtype + {:error, error} -> raise error + end end # Polars specific functions diff --git a/native/explorer/src/expressions.rs b/native/explorer/src/expressions.rs index 93994ddf2..47d43b9bb 100644 --- a/native/explorer/src/expressions.rs +++ b/native/explorer/src/expressions.rs @@ -1172,3 +1172,13 @@ pub fn expr_re_scan(expr: ExExpr, pattern: &str) -> ExExpr { let expr = expr.clone_inner(); ExExpr::new(expr.str().extract_all(pattern.lit())) } + +#[rustler::nif] +pub fn expr_re_named_captures(expr: ExExpr, pattern: &str) -> ExExpr { + let expr = expr.clone_inner(); + ExExpr::new( + expr.str() + .extract_groups(pattern) + .expect("should extract groups"), + ) +} diff --git a/native/explorer/src/lib.rs b/native/explorer/src/lib.rs index 65613e2f4..338839455 100644 --- 
a/native/explorer/src/lib.rs +++ b/native/explorer/src/lib.rs @@ -267,6 +267,7 @@ rustler::init!( expr_count_matches, expr_re_count_matches, expr_re_scan, + expr_re_named_captures, // float round expressions expr_round, expr_floor, @@ -331,8 +332,9 @@ rustler::init!( s_concat, s_contains, s_count_matches, - s_extract_all, - s_extract_groups, + s_re_scan, + s_re_named_captures, + s_re_dtype, s_cos, s_upcase, s_day_of_week, diff --git a/native/explorer/src/series.rs b/native/explorer/src/series.rs index c38b1d5eb..5525951cc 100644 --- a/native/explorer/src/series.rs +++ b/native/explorer/src/series.rs @@ -1873,13 +1873,13 @@ pub fn s_count_matches( } #[rustler::nif(schedule = "DirtyCpu")] -pub fn s_extract_all(s1: ExSeries, pattern: &str) -> Result { +pub fn s_re_scan(s1: ExSeries, pattern: &str) -> Result { let chunked_array = s1.str()?.extract_all(pattern)?; Ok(ExSeries::new(chunked_array.into())) } #[rustler::nif(schedule = "DirtyCpu")] -pub fn s_extract_groups(s1: ExSeries, pattern: &str) -> Result { +pub fn s_re_named_captures(s1: ExSeries, pattern: &str) -> Result { let s2 = s1 .clone_inner() .into_frame() @@ -1895,3 +1895,16 @@ pub fn s_extract_groups(s1: ExSeries, pattern: &str) -> Result Result { + let s = Series::new("dummy", [""]) + .into_frame() + .lazy() + .with_column(col("dummy").str().extract_groups(pattern)?.alias("dummy")) + .collect()? + .column("dummy")? 
+ .clone(); + let ex_dtype = ExSeriesDtype::try_from(s.dtype())?; + Ok(ex_dtype) +} diff --git a/test/explorer/data_frame_test.exs b/test/explorer/data_frame_test.exs index 2a683d2ae..74be2b6b0 100644 --- a/test/explorer/data_frame_test.exs +++ b/test/explorer/data_frame_test.exs @@ -2056,12 +2056,38 @@ defmodule Explorer.DataFrameTest do } end - test "raise when try to extract groups" do - df = DF.new(a: ["2,000", "2,000,000", ",", nil]) + test "extract unnamed groups from regex in fields of a struct" do + df = DF.new(a: ["alice@example.com", "bob@example.com", nil]) - assert_raise RuntimeError, fn -> - DF.mutate(df, b: re_named_captures(a, ~S/(\d+)/)) - end + df1 = DF.mutate(df, b: re_named_captures(a, ~S/(.*[^@])@(.*)$/)) + + assert df1.dtypes["b"] == {:struct, [{"1", :string}, {"2", :string}]} + + assert DF.to_columns(df1, atom_keys: true) == %{ + a: ["alice@example.com", "bob@example.com", nil], + b: [ + %{"1" => "alice", "2" => "example.com"}, + %{"1" => "bob", "2" => "example.com"}, + %{"1" => nil, "2" => nil} + ] + } + end + + test "extract named groups from regex in fields of a struct" do + df = DF.new(a: ["alice@example.com", "bob@example.com", nil]) + + df1 = DF.mutate(df, b: re_named_captures(a, ~S/(?<account>.*[^@])@(?<host>.*)$/)) + + assert df1.dtypes["b"] == {:struct, [{"account", :string}, {"host", :string}]} + + assert DF.to_columns(df1, atom_keys: true) == %{ + a: ["alice@example.com", "bob@example.com", nil], + b: [ + %{"account" => "alice", "host" => "example.com"}, + %{"account" => "bob", "host" => "example.com"}, + %{"account" => nil, "host" => nil} + ] + } end end From 3a8d825716b5b0ee1646388d99ca0565939557e8 Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Thu, 18 Apr 2024 14:12:12 -0300 Subject: [PATCH 4/7] Move responsibility for getting dtype of a regex to df --- lib/explorer/backend/data_frame.ex | 2 ++ lib/explorer/backend/lazy_frame.ex | 19 ++++++------------- lib/explorer/backend/lazy_series.ex | 5 ++--- lib/explorer/backend/series.ex | 1 - 
lib/explorer/polars_backend/data_frame.ex | 8 ++++++++ lib/explorer/polars_backend/lazy_frame.ex | 5 +++++ lib/explorer/polars_backend/native.ex | 2 +- lib/explorer/polars_backend/series.ex | 8 -------- native/explorer/src/dataframe.rs | 13 +++++++++++++ native/explorer/src/lib.rs | 2 +- native/explorer/src/series.rs | 13 ------------- 11 files changed, 38 insertions(+), 40 deletions(-) diff --git a/lib/explorer/backend/data_frame.ex b/lib/explorer/backend/data_frame.ex index 2df85c510..c06e291e5 100644 --- a/lib/explorer/backend/data_frame.ex +++ b/lib/explorer/backend/data_frame.ex @@ -162,6 +162,8 @@ defmodule Explorer.Backend.DataFrame do @callback n_rows(df) :: integer() @callback inspect(df, opts :: Inspect.Opts.t()) :: Inspect.Algebra.t() + @callback re_dtype(String.t()) :: dtype() + # Single table verbs @callback head(df, rows :: integer()) :: df diff --git a/lib/explorer/backend/lazy_frame.ex b/lib/explorer/backend/lazy_frame.ex index 087eef31a..d940fea22 100644 --- a/lib/explorer/backend/lazy_frame.ex +++ b/lib/explorer/backend/lazy_frame.ex @@ -10,10 +10,10 @@ defmodule Explorer.Backend.LazyFrame do alias Explorer.Backend alias Explorer.Backend.LazySeries - defstruct dtypes: %{}, names: [], original_data: nil + defstruct dtypes: %{}, names: [], backend: nil @type t :: %__MODULE__{ - original_data: Backend.DataFrame.t(), + backend: module(), dtypes: Backend.DataFrame.dtypes(), names: Backend.DataFrame.column_name() } @@ -21,8 +21,10 @@ defmodule Explorer.Backend.LazyFrame do @doc false def new(df) do + %module{} = df.data + Explorer.Backend.DataFrame.new( - %__MODULE__{names: df.names, dtypes: df.dtypes, original_data: df.data}, + %__MODULE__{names: df.names, dtypes: df.dtypes, backend: module}, df.names, df.dtypes ) @@ -74,21 +76,12 @@ defmodule Explorer.Backend.LazyFrame do @impl true def pull(df, column) do dtype_for_column = df.dtypes[column] - series_backend = get_series_backend(df.data.original_data) - data = LazySeries.new(:column, [column], 
dtype_for_column) - data = %{data | backend: series_backend} + data = LazySeries.new(:column, [column], dtype_for_column, false, df.data.backend) Backend.Series.new(data, dtype_for_column) end - defp get_series_backend(%module{}) do - module - |> Module.split() - |> List.replace_at(-1, "Series") - |> Module.concat() - end - funs = Backend.DataFrame.behaviour_info(:callbacks) -- (Backend.DataFrame.behaviour_info(:optional_callbacks) ++ diff --git a/lib/explorer/backend/lazy_series.ex b/lib/explorer/backend/lazy_series.ex index cd8bb8638..b3c91d80a 100644 --- a/lib/explorer/backend/lazy_series.ex +++ b/lib/explorer/backend/lazy_series.ex @@ -203,9 +203,9 @@ defmodule Explorer.Backend.LazySeries do @float_predicates [:is_finite, :is_infinite, :is_nan] @doc false - def new(op, args, dtype, aggregation \\ false) do + def new(op, args, dtype, aggregation \\ false, backend \\ nil) do dtype = Explorer.Shared.normalise_dtype!(dtype) - backend = backend_from_args(args) + backend = backend || backend_from_args(args) %__MODULE__{op: op, args: args, dtype: dtype, backend: backend, aggregation: aggregation} end @@ -1214,7 +1214,6 @@ defmodule Explorer.Backend.LazySeries do cut: 5, qcut: 5, mask: 2, - re_dtype: 1, to_iovec: 1, to_list: 1 ] diff --git a/lib/explorer/backend/series.ex b/lib/explorer/backend/series.ex index 9394fbd59..b43c4ea5f 100644 --- a/lib/explorer/backend/series.ex +++ b/lib/explorer/backend/series.ex @@ -298,7 +298,6 @@ defmodule Explorer.Backend.Series do @callback re_count_matches(s, String.t()) :: s @callback re_scan(s, String.t()) :: s @callback re_named_captures(s, String.t()) :: s - @callback re_dtype(String.t()) :: dtype() # Date / DateTime diff --git a/lib/explorer/polars_backend/data_frame.ex b/lib/explorer/polars_backend/data_frame.ex index 7867f7ede..c9091dcf3 100644 --- a/lib/explorer/polars_backend/data_frame.ex +++ b/lib/explorer/polars_backend/data_frame.ex @@ -890,6 +890,14 @@ defmodule Explorer.PolarsBackend.DataFrame do 
Explorer.Backend.DataFrame.inspect(df, "Polars", n_rows(df), opts) end + @impl true + def re_dtype(regex_as_string) when is_binary(regex_as_string) do + case Explorer.PolarsBackend.Native.df_re_dtype(regex_as_string) do + {:ok, dtype} -> dtype + {:error, error} -> raise error + end + end + # helpers defp pairwised(df, out_df, operation) do diff --git a/lib/explorer/polars_backend/lazy_frame.ex b/lib/explorer/polars_backend/lazy_frame.ex index b0a3a92fd..a14d69391 100644 --- a/lib/explorer/polars_backend/lazy_frame.ex +++ b/lib/explorer/polars_backend/lazy_frame.ex @@ -575,6 +575,11 @@ defmodule Explorer.PolarsBackend.LazyFrame do Shared.apply_dataframe(head, out_df, :lf_concat_columns, [Enum.map(tail, & &1.data)]) end + @impl true + def re_dtype(regex_as_string) when is_binary(regex_as_string) do + Eager.re_dtype(regex_as_string) + end + not_available_funs = [ correlation: 4, covariance: 3, diff --git a/lib/explorer/polars_backend/native.ex b/lib/explorer/polars_backend/native.ex index dd9c46ce4..aa5462147 100644 --- a/lib/explorer/polars_backend/native.ex +++ b/lib/explorer/polars_backend/native.ex @@ -168,6 +168,7 @@ defmodule Explorer.PolarsBackend.Native do def df_to_parquet_cloud(_df, _ex_entry, _compression), do: err() def df_width(_df), do: err() def df_nil_count(_df), do: err() + def df_re_dtype(_pattern), do: err() # Expressions (for lazy queries) @multi_arity_expressions [slice: 2, slice: 3, log: 1, log: 2] @@ -288,7 +289,6 @@ defmodule Explorer.PolarsBackend.Native do def s_count_matches(_s, _pattern, _is_literal), do: err() def s_re_scan(_s, _pattern), do: err() def s_re_named_captures(_s, _pattern), do: err() - def s_re_dtype(_pattern), do: err() def s_cumulative_max(_s, _reverse), do: err() def s_cumulative_min(_s, _reverse), do: err() def s_cumulative_sum(_s, _reverse), do: err() diff --git a/lib/explorer/polars_backend/series.ex b/lib/explorer/polars_backend/series.ex index aa6d752bc..a6e96fe9e 100644 --- a/lib/explorer/polars_backend/series.ex +++ 
b/lib/explorer/polars_backend/series.ex @@ -798,14 +798,6 @@ defmodule Explorer.PolarsBackend.Series do Shared.apply_series(series, :s_re_named_captures, [pattern]) end - @impl true - def re_dtype(regex_as_string) when is_binary(regex_as_string) do - case Explorer.PolarsBackend.Native.s_re_dtype(regex_as_string) do - {:ok, dtype} -> dtype - {:error, error} -> raise error - end - end - # Polars specific functions def name(series), do: Shared.apply_series(series, :s_name) diff --git a/native/explorer/src/dataframe.rs b/native/explorer/src/dataframe.rs index dd264569c..4e258a290 100644 --- a/native/explorer/src/dataframe.rs +++ b/native/explorer/src/dataframe.rs @@ -474,3 +474,16 @@ pub fn df_lazy(df: ExDataFrame) -> Result { let new_lf = df.clone_inner().lazy(); Ok(ExLazyFrame::new(new_lf)) } + +#[rustler::nif(schedule = "DirtyCpu")] +pub fn df_re_dtype(pattern: &str) -> Result { + let s = Series::new("dummy", [""]) + .into_frame() + .lazy() + .with_column(col("dummy").str().extract_groups(pattern)?.alias("dummy")) + .collect()? + .column("dummy")? 
+ .clone(); + let ex_dtype = ExSeriesDtype::try_from(s.dtype())?; + Ok(ex_dtype) +} diff --git a/native/explorer/src/lib.rs b/native/explorer/src/lib.rs index 338839455..9fbb227b2 100644 --- a/native/explorer/src/lib.rs +++ b/native/explorer/src/lib.rs @@ -126,6 +126,7 @@ rustler::init!( df_to_parquet, df_to_parquet_cloud, df_width, + df_re_dtype, // expressions expr_nil, expr_atom, @@ -334,7 +335,6 @@ rustler::init!( s_count_matches, s_re_scan, s_re_named_captures, - s_re_dtype, s_cos, s_upcase, s_day_of_week, diff --git a/native/explorer/src/series.rs b/native/explorer/src/series.rs index 5525951cc..302d494bd 100644 --- a/native/explorer/src/series.rs +++ b/native/explorer/src/series.rs @@ -1895,16 +1895,3 @@ pub fn s_re_named_captures(s1: ExSeries, pattern: &str) -> Result Result { - let s = Series::new("dummy", [""]) - .into_frame() - .lazy() - .with_column(col("dummy").str().extract_groups(pattern)?.alias("dummy")) - .collect()? - .column("dummy")? - .clone(); - let ex_dtype = ExSeriesDtype::try_from(s.dtype())?; - Ok(ex_dtype) -} From c708f46312eaf0a6f931217d6e66f2d2dac92a8c Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Fri, 19 Apr 2024 17:32:12 -0300 Subject: [PATCH 5/7] Using default backend in case it's not available There is no way to test that scenario today. 
--- lib/explorer/backend/lazy_series.ex | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/lib/explorer/backend/lazy_series.ex b/lib/explorer/backend/lazy_series.ex index b3c91d80a..4d9598191 100644 --- a/lib/explorer/backend/lazy_series.ex +++ b/lib/explorer/backend/lazy_series.ex @@ -6,6 +6,7 @@ defmodule Explorer.Backend.LazySeries do """ alias Explorer.Series alias Explorer.Backend + require Logger @behaviour Explorer.Backend.Series @@ -213,7 +214,6 @@ defmodule Explorer.Backend.LazySeries do defp backend_from_args(args) do Enum.find(args, fn %__MODULE__{} = arg -> arg.backend - %module{} -> module _other -> nil end) end @@ -1196,15 +1196,28 @@ defmodule Explorer.Backend.LazySeries do @impl true def re_named_captures(series, pattern) do lazy_s = lazy_series!(series) - backend = lazy_s.backend - target_dtype = backend.re_dtype(pattern) + target_dtype = get_backend(lazy_s).re_dtype(pattern) data = new(:re_named_captures, [lazy_s, pattern], target_dtype) Backend.Series.new(data, target_dtype) end + defp get_backend(%__MODULE__{} = lazy_series) do + lazy_series.backend || warning_with_default_backend() + end + + defp warning_with_default_backend do + backend = Explorer.Backend.get() + + Logger.warning( + "cannot get backend from LazySeries. 
Using the default one: #{inspect(backend)}" + ) + + :"#{backend}.DataFrame" + end + @remaining_non_lazy_operations [ at: 2, at_every: 2, From 1ae1ddd5ed7fc876c3b42b9cc9925cf2837fd1f8 Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Sun, 21 Apr 2024 13:14:17 -0300 Subject: [PATCH 6/7] Raise instead of logging --- lib/explorer/backend/lazy_series.ex | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/lib/explorer/backend/lazy_series.ex b/lib/explorer/backend/lazy_series.ex index 4d9598191..9cd1c30d2 100644 --- a/lib/explorer/backend/lazy_series.ex +++ b/lib/explorer/backend/lazy_series.ex @@ -6,7 +6,6 @@ defmodule Explorer.Backend.LazySeries do """ alias Explorer.Series alias Explorer.Backend - require Logger @behaviour Explorer.Backend.Series @@ -1197,25 +1196,17 @@ defmodule Explorer.Backend.LazySeries do def re_named_captures(series, pattern) do lazy_s = lazy_series!(series) - target_dtype = get_backend(lazy_s).re_dtype(pattern) + backend = get_backend(lazy_s, "re_named_captures/2") + target_dtype = backend.re_dtype(pattern) data = new(:re_named_captures, [lazy_s, pattern], target_dtype) Backend.Series.new(data, target_dtype) end - defp get_backend(%__MODULE__{} = lazy_series) do - lazy_series.backend || warning_with_default_backend() - end - - defp warning_with_default_backend do - backend = Explorer.Backend.get() - - Logger.warning( - "cannot get backend from LazySeries. 
Using the default one: #{inspect(backend)}" - ) - - :"#{backend}.DataFrame" + defp get_backend(%__MODULE__{} = lazy_series, function) do + lazy_series.backend || + raise "cannot get backend from Explorer.Backend.LazySeries for `#{function}`" end @remaining_non_lazy_operations [ From e59d4343a6b64ad83590771106891dc58ef8f26e Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Sun, 21 Apr 2024 14:00:26 -0300 Subject: [PATCH 7/7] Update lib/explorer/backend/lazy_series.ex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: José Valim --- lib/explorer/backend/lazy_series.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/explorer/backend/lazy_series.ex b/lib/explorer/backend/lazy_series.ex index 9cd1c30d2..1d43beced 100644 --- a/lib/explorer/backend/lazy_series.ex +++ b/lib/explorer/backend/lazy_series.ex @@ -212,7 +212,7 @@ defmodule Explorer.Backend.LazySeries do defp backend_from_args(args) do Enum.find(args, fn - %__MODULE__{} = arg -> arg.backend + %__MODULE__{backend: backend} -> backend _other -> nil end) end