Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implements Series.split_into/3 #873

Merged
merged 8 commits into from
Mar 5, 2024
Merged
8 changes: 8 additions & 0 deletions lib/explorer/backend/lazy_series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ defmodule Explorer.Backend.LazySeries do
downcase: 1,
substring: 3,
split: 2,
split_into: 3,
json_decode: 2,
json_path_match: 2,
# Float round
Expand Down Expand Up @@ -1053,6 +1054,13 @@ defmodule Explorer.Backend.LazySeries do
Backend.Series.new(data, {:list, :string})
end

# Lazy implementation of `split_into/3`: records a `:split_into` operation
# whose arguments are the lazy form of `series`, the separator `by` and the
# list of field names `fields`, then wraps it in a backend series.
#
# NOTE(review): the lazy node is created with dtype `:string` and the wrapping
# series is given dtype `{:list, :struct}`, whereas the native `s_split_into`
# builds a struct series with one field per name. Confirm these dtypes are
# what downstream code expects.
@impl true
def split_into(series, by, fields) do
data = new(:split_into, [lazy_series!(series), by, fields], :string)

Backend.Series.new(data, {:list, :struct})
ryancurtin marked this conversation as resolved.
Show resolved Hide resolved
end

@impl true
def round(series, decimals) when is_integer(decimals) and decimals >= 0 do
data = new(:round, [lazy_series!(series), decimals], {:f, 64})
Expand Down
1 change: 1 addition & 0 deletions lib/explorer/backend/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,7 @@ defmodule Explorer.Backend.Series do
@callback rstrip(s, String.t() | nil) :: s
@callback substring(s, integer(), non_neg_integer() | nil) :: s
@callback split(s, String.t()) :: s
# Arguments: the series, the separator string, and the list of field names
# (strings or atoms) for the resulting struct.
@callback split_into(s, String.t(), list(String.t() | atom())) :: s
@callback json_decode(s, dtype()) :: s
@callback json_path_match(s, String.t()) :: s

Expand Down
1 change: 1 addition & 0 deletions lib/explorer/polars_backend/expression.ex
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ defmodule Explorer.PolarsBackend.Expression do
upcase: 1,
substring: 3,
split: 2,
split_into: 3,
json_decode: 2,
json_path_match: 2,

Expand Down
1 change: 1 addition & 0 deletions lib/explorer/polars_backend/native.ex
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,7 @@ defmodule Explorer.PolarsBackend.Native do
# Stub clauses for series functions; each one simply delegates to `err()`.
# NOTE(review): presumably these bodies are replaced by the native
# implementations once the NIF library is loaded — confirm against the
# module header.
def s_cut(_s, _bins, _labels, _break_point_label, _category_label) do
  err()
end

def s_substring(_s, _offset, _length) do
  err()
end

def s_split(_s, _by) do
  err()
end

def s_split_into(_s, _by, _num_fields) do
  err()
end

def s_qcut(_s, _quantiles, _labels, _break_point_label, _category_label) do
  err()
end
Expand Down
4 changes: 4 additions & 0 deletions lib/explorer/polars_backend/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -694,6 +694,10 @@ defmodule Explorer.PolarsBackend.Series do
# Splits the series by the separator `by` through the `:s_split` native call.
def split(series, by) do
  Shared.apply_series(series, :s_split, [by])
end

# Forwards to the `:s_split_into` native call with the separator `by` and the
# list of field names `fields`.
@impl true
def split_into(series, by, fields) do
  Shared.apply_series(series, :s_split_into, [by, fields])
end

# Float round
@impl true
def round(series, decimals),
Expand Down
25 changes: 25 additions & 0 deletions lib/explorer/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -5651,6 +5651,31 @@ defmodule Explorer.Series do
def split(%Series{dtype: dtype}, _by),
do: dtype_error("split/2", dtype, [:string])

@doc """
Splits each string of a string series into a struct.

Every string is split on the separator `by`. The resulting struct has one
field per entry in `fields`, named after that entry, so the length of
`fields` determines how many fields the struct has. If a string cannot be
split into that many parts, the remaining fields are filled with `nil`.

Field names may be given as strings or atoms; atoms are converted to
strings before being handed to the backend.

Series of any dtype other than `:string` are rejected through
`dtype_error/3`.

## Examples

    iex> s = Series.from_list(["Smith, John", "Jones, Jane"])
    iex> Series.split_into(s, ", ", ["Last Name", "First Name"])
    #Explorer.Series<
      Polars[2]
      struct[2] [%{"First Name" => "John", "Last Name" => "Smith"}, %{"First Name" => "Jane", "Last Name" => "Jones"}]
    >

"""
@doc type: :string_wise
@spec split_into(Series.t(), String.t(), list(String.t() | atom())) :: Series.t()
def split_into(%Series{dtype: :string} = series, by, fields)
    when is_binary(by) and is_list(fields) do
  # The spec allows atom field names, but the native function receives the
  # names as a list of strings, so normalise them before dispatching.
  field_names = Enum.map(fields, &to_string/1)

  apply_series(series, :split_into, [by, field_names])
end

def split_into(%Series{dtype: dtype}, _by, _fields),
  do: dtype_error("split_into/3", dtype, [:string])
ryancurtin marked this conversation as resolved.
Show resolved Hide resolved

# Float

@doc """
Expand Down
1 change: 1 addition & 0 deletions native/explorer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,7 @@ rustler::init!(
s_strip,
s_substring,
s_split,
s_split_into,
s_subtract,
s_sum,
s_tail,
Expand Down
20 changes: 20 additions & 0 deletions native/explorer/src/series.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1567,6 +1567,26 @@ pub fn s_split(s1: ExSeries, by: &str) -> Result<ExSeries, ExplorerError> {
Ok(ExSeries::new(s2))
}

/// Splits every string of `s1` on `by` and packs the pieces into a struct series.
///
/// `splitn` is asked for `names.len()` parts. Each resulting field is renamed
/// to the corresponding entry of `names`, and the struct series is given the
/// name of `s1`. Errors from `.str()`, `splitn` and `StructChunked::new` are
/// propagated to the caller via `?`.
#[rustler::nif(schedule = "DirtyCpu")]
ryancurtin marked this conversation as resolved.
Show resolved Hide resolved
pub fn s_split_into(s1: ExSeries, by: &str, names: Vec<String>) -> Result<ExSeries, ExplorerError> {
let fields = s1
.str()?
// The separator is passed as a one-element array; the name "a" is
// presumably a throwaway label (TODO confirm).
.splitn(&ChunkedArray::new("a", &[by]), names.len())?
.fields()
.iter()
// Pair each split field with the caller-supplied name, in order.
.zip(names.iter())
.map(|(s, name)| {
// Clone so the rename is applied to an owned copy of the field.
let mut s = s.clone();
s.rename(name);
s
})
.collect::<Vec<_>>();

// Rebuild a struct series from the renamed fields, keeping the input's name.
let result = StructChunked::new(s1.name(), &fields).map(|ca| ca.into_series())?;

Ok(ExSeries::new(result))
}

#[rustler::nif(schedule = "DirtyCpu")]
pub fn s_round(s: ExSeries, decimals: u32) -> Result<ExSeries, ExplorerError> {
Ok(ExSeries::new(s.round(decimals)?.into_series()))
Expand Down
22 changes: 22 additions & 0 deletions test/explorer/series_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -5320,6 +5320,28 @@ defmodule Explorer.SeriesTest do
end
end

describe "split_into" do
  # One field name per part: every field of the struct is populated.
  test "split_into/3 produces the correct number of fields in a struct" do
    names = Series.from_list(["Smith, John", "Jones, Jane"])

    result =
      names
      |> Series.split_into(", ", ["Last Name", "First Name"])
      |> Series.to_list()

    expected = [
      %{"First Name" => "John", "Last Name" => "Smith"},
      %{"First Name" => "Jane", "Last Name" => "Jones"}
    ]

    assert result == expected
  end

  # More field names than parts: the surplus field comes back as nil.
  test "split_into/3 produces a nil field when string cannot be split for every field" do
    names = Series.from_list(["Smith-John", "Jones-Jane"])

    result =
      names
      |> Series.split_into("-", ["Last Name", "First Name", "Middle Name"])
      |> Series.to_list()

    expected = [
      %{"First Name" => "John", "Last Name" => "Smith", "Middle Name" => nil},
      %{"First Name" => "Jane", "Last Name" => "Jones", "Middle Name" => nil}
    ]

    assert result == expected
  end
end

describe "strptime/2 and strftime/2" do
test "parse datetime from string" do
series = Series.from_list(["2023-01-05 12:34:56", "XYZ", nil])
Expand Down