Skip to content

Commit

Permalink
Add DataFrame.unnest/2 (#758)
Browse files Browse the repository at this point in the history
  • Loading branch information
costaraphael committed Dec 7, 2023
1 parent 132ed1f commit 48ca4e6
Show file tree
Hide file tree
Showing 10 changed files with 186 additions and 0 deletions.
1 change: 1 addition & 0 deletions lib/explorer/backend/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ defmodule Explorer.Backend.DataFrame do
@callback describe(df, percentiles :: option(list(float()))) :: df()
@callback nil_count(df) :: df()
@callback explode(df, out_df :: df(), columns :: [column_name()]) :: df()
@callback unnest(df, out_df :: df(), columns :: [column_name()]) :: df()

# Two or more table verbs

Expand Down
68 changes: 68 additions & 0 deletions lib/explorer/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -5456,6 +5456,74 @@ defmodule Explorer.DataFrame do
Shared.apply_impl(df, :explode, [out_df, columns])
end

@doc """
Unnests one or multiple struct columns into individual columns.
The field names in a unnested column must not exist in the dataframe
or in any other unnested columns.
## Examples
iex> df = Explorer.DataFrame.new(before: [1, 2], struct: [%{x: 1, y: 2}, %{x: 3, y: 4}], after: [3, 4])
iex> Explorer.DataFrame.unnest(df, :struct)
#Explorer.DataFrame<
Polars[2 x 4]
before integer [1, 2]
x integer [1, 3]
y integer [2, 4]
after integer [3, 4]
>
iex> df = Explorer.DataFrame.new(struct1: [%{x: 1, y: 2}, %{x: 3, y: 4}], struct2: [%{z: 5}, %{z: 6}])
iex> Explorer.DataFrame.unnest(df, [:struct1, :struct2])
#Explorer.DataFrame<
Polars[2 x 3]
x integer [1, 3]
y integer [2, 4]
z integer [5, 6]
>
"""
@doc type: :single
@spec unnest(df :: DataFrame.t(), column_or_columns :: column_name() | [column_name()]) ::
DataFrame.t()
def unnest(%DataFrame{} = df, column_or_columns) do
columns = to_existing_columns(df, List.wrap(column_or_columns))

dtypes = Enum.map(columns, &Map.fetch!(df.dtypes, &1))

unless Enum.all?(dtypes, &match?({:struct, _}, &1)) do
raise ArgumentError,
"unnest/2 expects struct columns, but the given columns have the types: #{inspect(dtypes)}"
end

{new_dtypes, new_names} =
columns
|> Enum.zip(dtypes)
|> Enum.reduce({%{}, %{}}, fn {column, {:struct, inner_dtypes}}, {new_dtypes, new_names} ->
new_dtypes = Map.merge(new_dtypes, inner_dtypes)
new_names = Map.put(new_names, column, Map.keys(inner_dtypes))

{new_dtypes, new_names}
end)

out_dtypes =
df.dtypes
|> Map.drop(columns)
|> Map.merge(new_dtypes)

out_names =
Enum.flat_map(df.names, fn name ->
case Map.fetch(new_names, name) do
{:ok, names} -> names
:error -> [name]
end
end)

out_df = %{df | dtypes: out_dtypes, names: out_names}

Shared.apply_impl(df, :unnest, [out_df, columns])
end

@doc """
Prints the DataFrame in a tabular fashion.
Expand Down
5 changes: 5 additions & 0 deletions lib/explorer/polars_backend/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -759,6 +759,11 @@ defmodule Explorer.PolarsBackend.DataFrame do
Shared.apply_dataframe(df, out_df, :df_explode, [columns])
end

@impl true
def unnest(df, out_df, columns) do
Shared.apply_dataframe(df, out_df, :df_unnest, [columns])
end

# Two or more table verbs

@impl true
Expand Down
4 changes: 4 additions & 0 deletions lib/explorer/polars_backend/lazy_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,10 @@ defmodule Explorer.PolarsBackend.LazyFrame do
def explode(%DF{} = df, %DF{} = out_df, columns),
do: Shared.apply_dataframe(df, out_df, :lf_explode, [columns])

@impl true
def unnest(%DF{} = df, %DF{} = out_df, columns),
do: Shared.apply_dataframe(df, out_df, :lf_unnest, [columns])

# Groups

@impl true
Expand Down
2 changes: 2 additions & 0 deletions lib/explorer/polars_backend/native.ex
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ defmodule Explorer.PolarsBackend.Native do
def df_describe(_df, _percentiles), do: err()
def df_nil_count(_df), do: err()
def df_explode(_df, _columns), do: err()
def df_unnest(_df, _columns), do: err()

# Expressions (for lazy queries)
@multi_arity_expressions [slice: 2, slice: 3, log: 1, log: 2]
Expand Down Expand Up @@ -212,6 +213,7 @@ defmodule Explorer.PolarsBackend.Native do
def lf_tail(_df, _n_rows), do: err()
def lf_slice(_df, _offset, _length), do: err()
def lf_explode(_df, _columns), do: err()
def lf_unnest(_df, _columns), do: err()
def lf_from_ipc(_filename), do: err()
def lf_from_ndjson(_filename, _infer_schema_length, _batch_size), do: err()
def lf_from_parquet(_filename, _stop_after_n_rows, _maybe_columns), do: err()
Expand Down
6 changes: 6 additions & 0 deletions native/explorer/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -669,6 +669,12 @@ pub fn df_explode(df: ExDataFrame, columns: Vec<&str>) -> Result<ExDataFrame, Ex
Ok(ExDataFrame::new(new_df))
}

#[rustler::nif(schedule = "DirtyCpu")]
pub fn df_unnest(df: ExDataFrame, columns: Vec<&str>) -> Result<ExDataFrame, ExplorerError> {
let new_df = df.clone_inner().unnest(columns)?;
Ok(ExDataFrame::new(new_df))
}

#[rustler::nif(schedule = "DirtyCpu")]
pub fn df_lazy(df: ExDataFrame) -> Result<ExLazyFrame, ExplorerError> {
let new_lf = df.clone_inner().lazy();
Expand Down
6 changes: 6 additions & 0 deletions native/explorer/src/lazyframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,12 @@ pub fn lf_explode(data: ExLazyFrame, columns: Vec<&str>) -> Result<ExLazyFrame,
Ok(ExLazyFrame::new(lf))
}

#[rustler::nif]
pub fn lf_unnest(data: ExLazyFrame, columns: Vec<&str>) -> Result<ExLazyFrame, ExplorerError> {
let lf = data.clone_inner().unnest(columns);
Ok(ExLazyFrame::new(lf))
}

#[rustler::nif]
pub fn lf_filter_with(data: ExLazyFrame, ex_expr: ExExpr) -> Result<ExLazyFrame, ExplorerError> {
let ldf = data.clone_inner();
Expand Down
2 changes: 2 additions & 0 deletions native/explorer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ rustler::init!(
df_to_ndjson_cloud,
df_to_parquet,
df_to_parquet_cloud,
df_unnest,
df_width,
// expressions
expr_atom,
Expand Down Expand Up @@ -286,6 +287,7 @@ rustler::init!(
lf_tail,
lf_slice,
lf_explode,
lf_unnest,
lf_from_csv,
lf_from_ipc,
lf_from_parquet,
Expand Down
14 changes: 14 additions & 0 deletions test/explorer/data_frame/lazy_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -1417,4 +1417,18 @@ defmodule Explorer.DataFrame.LazyTest do
}
end
end

describe "unnest/2" do
test "unnests a struct column" do
ldf = DF.new([data: [%{x: 1, y: 2}, %{x: 3, y: 4}]], lazy: true)

ldf1 = DF.unnest(ldf, :data)
df1 = DF.collect(ldf1)

assert DF.to_columns(df1, atom_keys: true) == %{
x: [1, 3],
y: [2, 4]
}
end
end
end
78 changes: 78 additions & 0 deletions test/explorer/data_frame_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -3777,4 +3777,82 @@ defmodule Explorer.DataFrameTest do
fn -> DF.explode(df, [:a, :b]) end
end
end

describe "unnest/2" do
test "unnests a struct column" do
df = DF.new(a: [%{x: 1, y: 2}, %{x: 3, y: 4}])

df1 = DF.unnest(df, :a)

assert DF.names(df1) == ["x", "y"]
assert DF.dtypes(df1) == %{"x" => :integer, "y" => :integer}

assert DF.to_columns(df1, atom_keys: true) == %{
x: [1, 3],
y: [2, 4]
}
end

test "unnests multiple struct columns at once" do
df = DF.new(a: [%{x: 1, y: 2}, %{x: 3, y: 4}], b: [%{z: 5}, %{z: 6}])

df1 = DF.unnest(df, [:a, :b])

assert DF.names(df1) == ["x", "y", "z"]
assert DF.dtypes(df1) == %{"x" => :integer, "y" => :integer, "z" => :integer}

assert DF.to_columns(df1, atom_keys: true) == %{
x: [1, 3],
y: [2, 4],
z: [5, 6]
}
end

test "unnests the columns in the right order" do
df =
DF.new(
before: [1, 2],
a: [%{x: 1}, %{x: 2}],
between: [3, 4],
x: [%{y: 1}, %{y: 2}],
after: [5, 6]
)

df1 = DF.unnest(df, [:a, :x])

assert DF.names(df1) == ["before", "x", "between", "y", "after"]

assert DF.dtypes(df1) == %{
"before" => :integer,
"x" => :integer,
"between" => :integer,
"y" => :integer,
"after" => :integer
}
end

test "errors if the column is not of the struct type" do
df = DF.new(a: [1, 2, 3])

assert_raise ArgumentError,
"unnest/2 expects struct columns, but the given columns have the types: [:integer]",
fn -> DF.unnest(df, :a) end
end

test "errors when unnesting columns with clashing names" do
df = DF.new(a: [%{x: 1}, %{x: 2}], x: [3, 4])

assert_raise RuntimeError,
~r/column with name 'x' has more than one occurrences/,
fn -> DF.unnest(df, :a) end
end

test "errors when unnesting multiple columns with clashing names" do
df = DF.new(a: [%{x: 1, y: 2}, %{x: 3, y: 4}], b: [%{x: 5}, %{x: 6}])

assert_raise RuntimeError,
~r/column with name 'x' has more than one occurrences/,
fn -> DF.unnest(df, [:a, :b]) end
end
end
end

0 comments on commit 48ca4e6

Please sign in to comment.