Add DataFrame.unnest/2 (#758)

elixir-explorer · Dec 7, 2023 · 48ca4e6 · 48ca4e6
1 parent 132ed1f
commit 48ca4e6
Show file tree

Hide file tree

Showing 10 changed files with 186 additions and 0 deletions.
diff --git a/lib/explorer/backend/data_frame.ex b/lib/explorer/backend/data_frame.ex
@@ -202,6 +202,7 @@ defmodule Explorer.Backend.DataFrame do
   @callback describe(df, percentiles :: option(list(float()))) :: df()
   @callback nil_count(df) :: df()
   @callback explode(df, out_df :: df(), columns :: [column_name()]) :: df()
+  @callback unnest(df, out_df :: df(), columns :: [column_name()]) :: df()
 
   # Two or more table verbs
 

diff --git a/lib/explorer/data_frame.ex b/lib/explorer/data_frame.ex
@@ -5456,6 +5456,74 @@ defmodule Explorer.DataFrame do
     Shared.apply_impl(df, :explode, [out_df, columns])
   end
 
+  @doc """
+  Unnests one or multiple struct columns into individual columns.
+
+  The field names in a unnested column must not exist in the dataframe
+  or in any other unnested columns.
+
+  ## Examples
+
+      iex> df = Explorer.DataFrame.new(before: [1, 2], struct: [%{x: 1, y: 2}, %{x: 3, y: 4}], after: [3, 4])
+      iex> Explorer.DataFrame.unnest(df, :struct)
+      #Explorer.DataFrame<
+        Polars[2 x 4]
+        before integer [1, 2]
+        x integer [1, 3]
+        y integer [2, 4]
+        after integer [3, 4]
+      >
+
+      iex> df = Explorer.DataFrame.new(struct1: [%{x: 1, y: 2}, %{x: 3, y: 4}], struct2: [%{z: 5}, %{z: 6}])
+      iex> Explorer.DataFrame.unnest(df, [:struct1, :struct2])
+      #Explorer.DataFrame<
+        Polars[2 x 3]
+        x integer [1, 3]
+        y integer [2, 4]
+        z integer [5, 6]
+      >
+  """
+  @doc type: :single
+  @spec unnest(df :: DataFrame.t(), column_or_columns :: column_name() | [column_name()]) ::
+          DataFrame.t()
+  def unnest(%DataFrame{} = df, column_or_columns) do
+    columns = to_existing_columns(df, List.wrap(column_or_columns))
+
+    dtypes = Enum.map(columns, &Map.fetch!(df.dtypes, &1))
+
+    unless Enum.all?(dtypes, &match?({:struct, _}, &1)) do
+      raise ArgumentError,
+            "unnest/2 expects struct columns, but the given columns have the types: #{inspect(dtypes)}"
+    end
+
+    {new_dtypes, new_names} =
+      columns
+      |> Enum.zip(dtypes)
+      |> Enum.reduce({%{}, %{}}, fn {column, {:struct, inner_dtypes}}, {new_dtypes, new_names} ->
+        new_dtypes = Map.merge(new_dtypes, inner_dtypes)
+        new_names = Map.put(new_names, column, Map.keys(inner_dtypes))
+
+        {new_dtypes, new_names}
+      end)
+
+    out_dtypes =
+      df.dtypes
+      |> Map.drop(columns)
+      |> Map.merge(new_dtypes)
+
+    out_names =
+      Enum.flat_map(df.names, fn name ->
+        case Map.fetch(new_names, name) do
+          {:ok, names} -> names
+          :error -> [name]
+        end
+      end)
+
+    out_df = %{df | dtypes: out_dtypes, names: out_names}
+
+    Shared.apply_impl(df, :unnest, [out_df, columns])
+  end
+
   @doc """
   Prints the DataFrame in a tabular fashion.
 

diff --git a/lib/explorer/polars_backend/data_frame.ex b/lib/explorer/polars_backend/data_frame.ex
@@ -759,6 +759,11 @@ defmodule Explorer.PolarsBackend.DataFrame do
     Shared.apply_dataframe(df, out_df, :df_explode, [columns])
   end
 
+  @impl true
+  def unnest(df, out_df, columns) do
+    Shared.apply_dataframe(df, out_df, :df_unnest, [columns])
+  end
+
   # Two or more table verbs
 
   @impl true

diff --git a/lib/explorer/polars_backend/lazy_frame.ex b/lib/explorer/polars_backend/lazy_frame.ex
@@ -422,6 +422,10 @@ defmodule Explorer.PolarsBackend.LazyFrame do
   def explode(%DF{} = df, %DF{} = out_df, columns),
     do: Shared.apply_dataframe(df, out_df, :lf_explode, [columns])
 
+  @impl true
+  def unnest(%DF{} = df, %DF{} = out_df, columns),
+    do: Shared.apply_dataframe(df, out_df, :lf_unnest, [columns])
+
   # Groups
 
   @impl true

diff --git a/lib/explorer/polars_backend/native.ex b/lib/explorer/polars_backend/native.ex
@@ -170,6 +170,7 @@ defmodule Explorer.PolarsBackend.Native do
   def df_describe(_df, _percentiles), do: err()
   def df_nil_count(_df), do: err()
   def df_explode(_df, _columns), do: err()
+  def df_unnest(_df, _columns), do: err()
 
   # Expressions (for lazy queries)
   @multi_arity_expressions [slice: 2, slice: 3, log: 1, log: 2]
@@ -212,6 +213,7 @@ defmodule Explorer.PolarsBackend.Native do
   def lf_tail(_df, _n_rows), do: err()
   def lf_slice(_df, _offset, _length), do: err()
   def lf_explode(_df, _columns), do: err()
+  def lf_unnest(_df, _columns), do: err()
   def lf_from_ipc(_filename), do: err()
   def lf_from_ndjson(_filename, _infer_schema_length, _batch_size), do: err()
   def lf_from_parquet(_filename, _stop_after_n_rows, _maybe_columns), do: err()

diff --git a/native/explorer/src/dataframe.rs b/native/explorer/src/dataframe.rs
@@ -669,6 +669,12 @@ pub fn df_explode(df: ExDataFrame, columns: Vec<&str>) -> Result<ExDataFrame, Ex
     Ok(ExDataFrame::new(new_df))
 }
 
+#[rustler::nif(schedule = "DirtyCpu")]
+pub fn df_unnest(df: ExDataFrame, columns: Vec<&str>) -> Result<ExDataFrame, ExplorerError> {
+    let new_df = df.clone_inner().unnest(columns)?;
+    Ok(ExDataFrame::new(new_df))
+}
+
 #[rustler::nif(schedule = "DirtyCpu")]
 pub fn df_lazy(df: ExDataFrame) -> Result<ExLazyFrame, ExplorerError> {
     let new_lf = df.clone_inner().lazy();

diff --git a/native/explorer/src/lazyframe.rs b/native/explorer/src/lazyframe.rs
@@ -90,6 +90,12 @@ pub fn lf_explode(data: ExLazyFrame, columns: Vec<&str>) -> Result<ExLazyFrame,
     Ok(ExLazyFrame::new(lf))
 }
 
+#[rustler::nif]
+pub fn lf_unnest(data: ExLazyFrame, columns: Vec<&str>) -> Result<ExLazyFrame, ExplorerError> {
+    let lf = data.clone_inner().unnest(columns);
+    Ok(ExLazyFrame::new(lf))
+}
+
 #[rustler::nif]
 pub fn lf_filter_with(data: ExLazyFrame, ex_expr: ExExpr) -> Result<ExLazyFrame, ExplorerError> {
     let ldf = data.clone_inner();

diff --git a/native/explorer/src/lib.rs b/native/explorer/src/lib.rs
@@ -138,6 +138,7 @@ rustler::init!(
         df_to_ndjson_cloud,
         df_to_parquet,
         df_to_parquet_cloud,
+        df_unnest,
         df_width,
         // expressions
         expr_atom,
@@ -286,6 +287,7 @@ rustler::init!(
         lf_tail,
         lf_slice,
         lf_explode,
+        lf_unnest,
         lf_from_csv,
         lf_from_ipc,
         lf_from_parquet,

diff --git a/test/explorer/data_frame/lazy_test.exs b/test/explorer/data_frame/lazy_test.exs
@@ -1417,4 +1417,18 @@ defmodule Explorer.DataFrame.LazyTest do
              }
     end
   end
+
+  describe "unnest/2" do
+    test "unnests a struct column" do
+      ldf = DF.new([data: [%{x: 1, y: 2}, %{x: 3, y: 4}]], lazy: true)
+
+      ldf1 = DF.unnest(ldf, :data)
+      df1 = DF.collect(ldf1)
+
+      assert DF.to_columns(df1, atom_keys: true) == %{
+               x: [1, 3],
+               y: [2, 4]
+             }
+    end
+  end
 end
diff --git a/test/explorer/data_frame_test.exs b/test/explorer/data_frame_test.exs
@@ -3777,4 +3777,82 @@ defmodule Explorer.DataFrameTest do
                    fn -> DF.explode(df, [:a, :b]) end
     end
   end
+
+  describe "unnest/2" do
+    test "unnests a struct column" do
+      df = DF.new(a: [%{x: 1, y: 2}, %{x: 3, y: 4}])
+
+      df1 = DF.unnest(df, :a)
+
+      assert DF.names(df1) == ["x", "y"]
+      assert DF.dtypes(df1) == %{"x" => :integer, "y" => :integer}
+
+      assert DF.to_columns(df1, atom_keys: true) == %{
+               x: [1, 3],
+               y: [2, 4]
+             }
+    end
+
+    test "unnests multiple struct columns at once" do
+      df = DF.new(a: [%{x: 1, y: 2}, %{x: 3, y: 4}], b: [%{z: 5}, %{z: 6}])
+
+      df1 = DF.unnest(df, [:a, :b])
+
+      assert DF.names(df1) == ["x", "y", "z"]
+      assert DF.dtypes(df1) == %{"x" => :integer, "y" => :integer, "z" => :integer}
+
+      assert DF.to_columns(df1, atom_keys: true) == %{
+               x: [1, 3],
+               y: [2, 4],
+               z: [5, 6]
+             }
+    end
+
+    test "unnests the columns in the right order" do
+      df =
+        DF.new(
+          before: [1, 2],
+          a: [%{x: 1}, %{x: 2}],
+          between: [3, 4],
+          x: [%{y: 1}, %{y: 2}],
+          after: [5, 6]
+        )
+
+      df1 = DF.unnest(df, [:a, :x])
+
+      assert DF.names(df1) == ["before", "x", "between", "y", "after"]
+
+      assert DF.dtypes(df1) == %{
+               "before" => :integer,
+               "x" => :integer,
+               "between" => :integer,
+               "y" => :integer,
+               "after" => :integer
+             }
+    end
+
+    test "errors if the column is not of the struct type" do
+      df = DF.new(a: [1, 2, 3])
+
+      assert_raise ArgumentError,
+                   "unnest/2 expects struct columns, but the given columns have the types: [:integer]",
+                   fn -> DF.unnest(df, :a) end
+    end
+
+    test "errors when unnesting columns with clashing names" do
+      df = DF.new(a: [%{x: 1}, %{x: 2}], x: [3, 4])
+
+      assert_raise RuntimeError,
+                   ~r/column with name 'x' has more than one occurrences/,
+                   fn -> DF.unnest(df, :a) end
+    end
+
+    test "errors when unnesting multiple columns with clashing names" do
+      df = DF.new(a: [%{x: 1, y: 2}, %{x: 3, y: 4}], b: [%{x: 5}, %{x: 6}])
+
+      assert_raise RuntimeError,
+                   ~r/column with name 'x' has more than one occurrences/,
+                   fn -> DF.unnest(df, [:a, :b]) end
+    end
+  end
 end