Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update Polars to v0.28 #570

Merged
merged 1 commit into from Apr 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
612 changes: 386 additions & 226 deletions native/explorer/Cargo.lock

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions native/explorer/Cargo.toml
Expand Up @@ -18,14 +18,15 @@ rand = { version = "0.8.5", features = ["alloc"] }
rand_pcg = "0.3.1"
rustler = "0.27.0"
thiserror = "1"
smartstring = "1"

# MiMalloc won't compile on Windows with the GCC compiler.
# On Linux with Musl it won't load correctly.
[target.'cfg(not(any(all(windows, target_env = "gnu"), all(target_os = "linux", target_env = "musl"))))'.dependencies]
mimalloc = { version = "*", default-features = false }

[dependencies.polars]
version = "0.27.2"
version = "0.28"
default-features = false
features = [
"checked_arithmetic",
Expand All @@ -38,7 +39,6 @@ features = [
"dtype-date",
"dtype-time",
"dtype-datetime",
"dtype-binary",
"dtype-categorical",
"ipc",
"ipc_streaming",
Expand Down Expand Up @@ -66,4 +66,4 @@ features = [
]

[dependencies.polars-ops]
version = "0.27.2"
version = "0.28"
34 changes: 23 additions & 11 deletions native/explorer/src/dataframe.rs
Expand Up @@ -6,6 +6,7 @@ use std::result::Result;

use crate::ex_expr_to_exprs;
use crate::{ExDataFrame, ExExpr, ExLazyFrame, ExSeries, ExplorerError};
use smartstring::alias::String as SmartString;

// Loads the IO functions for read/writing CSV, NDJSON, Parquet, etc.
pub mod io;
Expand All @@ -30,6 +31,14 @@ pub fn normalize_numeric_dtypes(df: &mut DataFrame) -> Result<DataFrame, crate::
Ok(df.clone())
}

/// Converts a vector of borrowed column-name slices into owned `String`s.
fn to_string_names(names: Vec<&str>) -> Vec<String> {
    // Preallocate: the output length is exactly the input length.
    let mut owned = Vec::with_capacity(names.len());
    for name in names {
        owned.push(String::from(name));
    }
    owned
}

pub fn to_smart_strings(slices: Vec<&str>) -> Vec<SmartString> {
slices.into_iter().map(|s| s.into()).collect()
}

#[rustler::nif(schedule = "DirtyCpu")]
pub fn df_join(
df: ExDataFrame,
Expand Down Expand Up @@ -57,7 +66,8 @@ pub fn df_join(

#[rustler::nif]
pub fn df_names(df: ExDataFrame) -> Result<Vec<String>, ExplorerError> {
    // Polars hands back `&str` column names; convert them to owned
    // `String`s before crossing the NIF boundary to Elixir.
    Ok(to_string_names(df.get_column_names()))
}

#[rustler::nif]
Expand Down Expand Up @@ -360,16 +370,17 @@ pub fn df_tail(
#[rustler::nif(schedule = "DirtyCpu")]
pub fn df_pivot_longer(
df: ExDataFrame,
id_vars: Vec<String>,
value_vars: Vec<String>,
id_vars: Vec<&str>,
value_vars: Vec<&str>,
names_to: String,
values_to: String,
) -> Result<ExDataFrame, ExplorerError> {
let melt_opts = MeltArgs {
id_vars,
value_vars,
variable_name: Some(names_to),
value_name: Some(values_to),
id_vars: to_smart_strings(id_vars),
value_vars: to_smart_strings(value_vars),
variable_name: Some(names_to.into()),
value_name: Some(values_to.into()),
streamable: true,
};
let new_df = df.melt2(melt_opts)?;
Ok(ExDataFrame::new(new_df))
Expand All @@ -381,7 +392,7 @@ pub fn df_distinct(
subset: Vec<String>,
columns_to_keep: Option<Vec<&str>>,
) -> Result<ExDataFrame, ExplorerError> {
let new_df = df.unique_stable(Some(&subset), UniqueKeepStrategy::First)?;
let new_df = df.unique_stable(Some(&subset), UniqueKeepStrategy::First, None)?;

match columns_to_keep {
Some(columns) => Ok(ExDataFrame::new(new_df.select(columns)?)),
Expand Down Expand Up @@ -513,11 +524,12 @@ pub fn df_pivot_wider(
values_column,
id_columns.clone(),
[pivot_column],
PivotAgg::First,
false,
Some(PivotAgg::First),
None,
)?;
let mut new_names = new_df.get_column_names_owned();

let mut new_names = to_string_names(new_df.get_column_names());

for name in new_names.iter_mut() {
let original_name = name.clone();
Expand Down Expand Up @@ -546,7 +558,7 @@ pub fn df_pivot_wider(
}
}

counter.insert(name.clone(), 1);
counter.insert(name.to_string(), 1);
}
}

Expand Down
19 changes: 10 additions & 9 deletions native/explorer/src/dataframe/io.rs
Expand Up @@ -15,6 +15,7 @@ use std::convert::TryFrom;
use std::fs::File;
use std::io::{BufReader, BufWriter, Cursor};
use std::result::Result;
use std::sync::Arc;

use crate::dataframe::normalize_numeric_dtypes;
use crate::datatypes::ExParquetCompression;
Expand Down Expand Up @@ -57,7 +58,7 @@ pub fn df_from_csv(
_ => CsvEncoding::Utf8,
};

let schema: Option<Schema> = match dtypes {
let schema = match dtypes {
Some(dtypes) => Some(schema_from_dtypes_pairs(dtypes)?),

None => None,
Expand All @@ -66,27 +67,27 @@ pub fn df_from_csv(
let reader = CsvReader::from_path(filename)?
.infer_schema(infer_schema_length)
.has_header(has_header)
.with_parse_dates(parse_dates)
.with_try_parse_dates(parse_dates)
.with_n_rows(stop_after_n_rows)
.with_delimiter(delimiter_as_byte)
.with_skip_rows(skip_rows)
.with_projection(projection)
.with_rechunk(do_rechunk)
.with_encoding(encoding)
.with_columns(column_names)
.with_dtypes(schema.as_ref())
.with_dtypes(schema)
.with_null_values(Some(NullValues::AllColumns(vec![null_char])));

finish_reader(reader)
}

pub fn schema_from_dtypes_pairs(dtypes: Vec<(&str, &str)>) -> Result<Schema, ExplorerError> {
pub fn schema_from_dtypes_pairs(dtypes: Vec<(&str, &str)>) -> Result<Arc<Schema>, ExplorerError> {
let mut schema = Schema::new();
for (name, dtype_str) in dtypes {
let dtype = dtype_from_str(dtype_str)?;
schema.with_column(name.to_string(), dtype);
schema.with_column(name.into(), dtype);
}
Ok(schema)
Ok(Arc::new(schema))
}

fn dtype_from_str(dtype: &str) -> Result<DataType, ExplorerError> {
Expand Down Expand Up @@ -163,7 +164,7 @@ pub fn df_load_csv(
_ => CsvEncoding::Utf8,
};

let schema: Option<Schema> = match dtypes {
let schema = match dtypes {
Some(dtypes) => Some(schema_from_dtypes_pairs(dtypes)?),

None => None,
Expand All @@ -174,15 +175,15 @@ pub fn df_load_csv(
let reader = CsvReader::new(cursor)
.infer_schema(infer_schema_length)
.has_header(has_header)
.with_parse_dates(parse_dates)
.with_try_parse_dates(parse_dates)
.with_n_rows(stop_after_n_rows)
.with_delimiter(delimiter_as_byte)
.with_skip_rows(skip_rows)
.with_projection(projection)
.with_rechunk(do_rechunk)
.with_encoding(encoding)
.with_columns(column_names)
.with_dtypes(schema.as_ref())
.with_dtypes(schema)
.with_null_values(Some(NullValues::AllColumns(vec![null_char])));

finish_reader(reader)
Expand Down
26 changes: 18 additions & 8 deletions native/explorer/src/lazyframe.rs
@@ -1,4 +1,7 @@
use crate::{expressions::ex_expr_to_exprs, ExDataFrame, ExExpr, ExLazyFrame, ExplorerError};
use crate::{
dataframe::to_smart_strings, expressions::ex_expr_to_exprs, ExDataFrame, ExExpr, ExLazyFrame,
ExplorerError,
};
use polars::prelude::*;
use std::result::Result;

Expand Down Expand Up @@ -42,7 +45,13 @@ pub fn lf_tail(data: ExLazyFrame, length: u32) -> Result<ExLazyFrame, ExplorerEr
#[rustler::nif]
pub fn lf_names(data: ExLazyFrame) -> Result<Vec<String>, ExplorerError> {
    // Returns the lazy frame's column names as owned `String`s.
    // Polars 0.28 stores schema names as `SmartString`, so each name is
    // converted explicitly.
    let lf = data.clone_inner();
    let names = lf
        .schema()?
        .iter_names()
        .map(|smart_string| smart_string.to_string())
        .collect();

    // Fixed: the pre-0.28 one-liner `Ok(lf.schema()?.iter_names().cloned()
    // .collect())` was left in above the new body (merged-diff residue),
    // producing a duplicate, type-incompatible return path.
    Ok(names)
}

#[rustler::nif]
Expand Down Expand Up @@ -158,17 +167,18 @@ pub fn lf_drop_nils(
#[rustler::nif]
pub fn lf_pivot_longer(
data: ExLazyFrame,
id_vars: Vec<String>,
value_vars: Vec<String>,
id_vars: Vec<&str>,
value_vars: Vec<&str>,
names_to: String,
values_to: String,
) -> Result<ExLazyFrame, ExplorerError> {
let ldf = data.clone_inner();
let melt_opts = MeltArgs {
id_vars,
value_vars,
variable_name: Some(names_to),
value_name: Some(values_to),
id_vars: to_smart_strings(id_vars),
value_vars: to_smart_strings(value_vars),
variable_name: Some(names_to.into()),
value_name: Some(values_to.into()),
streamable: true,
};
let new_df = ldf.melt(melt_opts);
Ok(ExLazyFrame::new(new_df))
Expand Down
6 changes: 3 additions & 3 deletions native/explorer/src/lazyframe/io.rs
Expand Up @@ -70,7 +70,7 @@ pub fn lf_from_csv(
_ => CsvEncoding::Utf8,
};

let schema: Option<Schema> = match dtypes {
let schema = match dtypes {
Some(dtypes) => Some(schema_from_dtypes_pairs(dtypes)?),

None => None,
Expand All @@ -79,13 +79,13 @@ pub fn lf_from_csv(
let df = LazyCsvReader::new(filename)
.with_infer_schema_length(infer_schema_length)
.has_header(has_header)
.with_parse_dates(parse_dates)
.with_try_parse_dates(parse_dates)
.with_n_rows(stop_after_n_rows)
.with_delimiter(delimiter_as_byte)
.with_skip_rows(skip_rows)
.with_rechunk(do_rechunk)
.with_encoding(encoding)
.with_dtype_overwrite(schema.as_ref())
.with_dtype_overwrite(schema.as_deref())
.with_null_values(Some(NullValues::AllColumns(vec![null_char])))
.finish()?;

Expand Down
5 changes: 3 additions & 2 deletions native/explorer/src/series.rs
Expand Up @@ -301,7 +301,7 @@ pub fn s_argsort(
nulls_last,
multithreaded: false,
};
let indices = series.argsort(opts).cast(&DataType::Int64)?;
let indices = series.arg_sort(opts).cast(&DataType::Int64)?;
Ok(ExSeries::new(indices))
}

Expand Down Expand Up @@ -1118,7 +1118,8 @@ pub fn s_not(s1: ExSeries) -> Result<ExSeries, ExplorerError> {

#[rustler::nif(schedule = "DirtyCpu")]
pub fn s_contains(s1: ExSeries, pattern: &str) -> Result<ExSeries, ExplorerError> {
Ok(ExSeries::new(s1.utf8()?.contains(pattern)?.into()))
// TODO: maybe have "strict" as an option.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What would the strict option do?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docs don't say much :/

But for what I could understand, it is related to how the Regex is interpreted: https://pola-rs.github.io/polars/src/polars_ops/chunked_array/strings/namespace.rs.html#136

If it's "strict", then the regex must be valid. In case it's not strict and an invalid regex is used, it is going to search for "None" - nils.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@philss for us contain is never a regex, so we need to make sure our pattern is not being interpreted as a regex.

Ok(ExSeries::new(s1.utf8()?.contains(pattern, true)?.into()))
}

#[rustler::nif(schedule = "DirtyCpu")]
Expand Down
16 changes: 8 additions & 8 deletions test/explorer/polars_backend/expression_test.exs
Expand Up @@ -16,10 +16,10 @@ defmodule Explorer.PolarsBackend.ExpressionTest do

assert %Expression{} = expr = Expression.to_expr(lazy)

assert Expression.describe_filter_plan(df, expr) == """
FILTER [(col("col_a")) == (5i64)] FROM
DF ["col_a", "col_b"]; PROJECT */2 COLUMNS; SELECTION: "None"
"""
assert Expression.describe_filter_plan(df, expr) ==
String.trim("""
FILTER [(col("col_a")) == (5)] FROMDF ["col_a", "col_b"]; PROJECT */2 COLUMNS; SELECTION: "None"
""")
end

test "with basic float value" do
Expand Down Expand Up @@ -81,10 +81,10 @@ defmodule Explorer.PolarsBackend.ExpressionTest do

assert %Expression{} = expr = Expression.to_expr(lazy)

assert Expression.describe_filter_plan(df, expr) == """
FILTER [(col("col_a")) == (col("col_b"))] FROM
DF ["col_a", "col_b"]; PROJECT */2 COLUMNS; SELECTION: "None"
"""
assert Expression.describe_filter_plan(df, expr) ==
String.trim("""
FILTER [(col("col_a")) == (col("col_b"))] FROMDF ["col_a", "col_b"]; PROJECT */2 COLUMNS; SELECTION: "None"
""")
end
end
end