Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update Polars to v0.28 #570

Merged
merged 1 commit into from Apr 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
612 changes: 386 additions & 226 deletions native/explorer/Cargo.lock

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions native/explorer/Cargo.toml
Expand Up @@ -18,14 +18,15 @@ rand = { version = "0.8.5", features = ["alloc"] }
rand_pcg = "0.3.1"
rustler = "0.27.0"
thiserror = "1"
smartstring = "1"

# MiMalloc won't compile on Windows with the GCC compiler.
# On Linux with Musl it won't load correctly.
[target.'cfg(not(any(all(windows, target_env = "gnu"), all(target_os = "linux", target_env = "musl"))))'.dependencies]
mimalloc = { version = "*", default-features = false }

[dependencies.polars]
version = "0.27.2"
version = "0.28"
default-features = false
features = [
"checked_arithmetic",
Expand All @@ -38,7 +39,6 @@ features = [
"dtype-date",
"dtype-time",
"dtype-datetime",
"dtype-binary",
"dtype-categorical",
"ipc",
"ipc_streaming",
Expand Down Expand Up @@ -66,4 +66,4 @@ features = [
]

[dependencies.polars-ops]
version = "0.27.2"
version = "0.28"
34 changes: 23 additions & 11 deletions native/explorer/src/dataframe.rs
Expand Up @@ -6,6 +6,7 @@ use std::result::Result;

use crate::ex_expr_to_exprs;
use crate::{ExDataFrame, ExExpr, ExLazyFrame, ExSeries, ExplorerError};
use smartstring::alias::String as SmartString;

// Loads the IO functions for read/writing CSV, NDJSON, Parquet, etc.
pub mod io;
Expand All @@ -30,6 +31,14 @@ pub fn normalize_numeric_dtypes(df: &mut DataFrame) -> Result<DataFrame, crate::
Ok(df.clone())
}

/// Converts a vector of borrowed column-name slices into owned `String`s.
fn to_string_names(names: Vec<&str>) -> Vec<String> {
    // Preallocate: the output length is exactly the input length.
    let mut owned = Vec::with_capacity(names.len());
    for name in names {
        owned.push(String::from(name));
    }
    owned
}

pub fn to_smart_strings(slices: Vec<&str>) -> Vec<SmartString> {
slices.into_iter().map(|s| s.into()).collect()
}

#[rustler::nif(schedule = "DirtyCpu")]
pub fn df_join(
df: ExDataFrame,
Expand Down Expand Up @@ -57,7 +66,8 @@ pub fn df_join(

#[rustler::nif]
pub fn df_names(df: ExDataFrame) -> Result<Vec<String>, ExplorerError> {
    // Polars hands back `&str` column names; convert them to owned
    // `String`s before crossing the NIF boundary to Elixir.
    Ok(to_string_names(df.get_column_names()))
}

#[rustler::nif]
Expand Down Expand Up @@ -360,16 +370,17 @@ pub fn df_tail(
#[rustler::nif(schedule = "DirtyCpu")]
pub fn df_pivot_longer(
df: ExDataFrame,
id_vars: Vec<String>,
value_vars: Vec<String>,
id_vars: Vec<&str>,
value_vars: Vec<&str>,
names_to: String,
values_to: String,
) -> Result<ExDataFrame, ExplorerError> {
let melt_opts = MeltArgs {
id_vars,
value_vars,
variable_name: Some(names_to),
value_name: Some(values_to),
id_vars: to_smart_strings(id_vars),
value_vars: to_smart_strings(value_vars),
variable_name: Some(names_to.into()),
value_name: Some(values_to.into()),
streamable: true,
};
let new_df = df.melt2(melt_opts)?;
Ok(ExDataFrame::new(new_df))
Expand All @@ -381,7 +392,7 @@ pub fn df_distinct(
subset: Vec<String>,
columns_to_keep: Option<Vec<&str>>,
) -> Result<ExDataFrame, ExplorerError> {
let new_df = df.unique_stable(Some(&subset), UniqueKeepStrategy::First)?;
let new_df = df.unique_stable(Some(&subset), UniqueKeepStrategy::First, None)?;

match columns_to_keep {
Some(columns) => Ok(ExDataFrame::new(new_df.select(columns)?)),
Expand Down Expand Up @@ -513,11 +524,12 @@ pub fn df_pivot_wider(
values_column,
id_columns.clone(),
[pivot_column],
PivotAgg::First,
false,
Some(PivotAgg::First),
None,
)?;
let mut new_names = new_df.get_column_names_owned();

let mut new_names = to_string_names(new_df.get_column_names());

for name in new_names.iter_mut() {
let original_name = name.clone();
Expand Down Expand Up @@ -546,7 +558,7 @@ pub fn df_pivot_wider(
}
}

counter.insert(name.clone(), 1);
counter.insert(name.to_string(), 1);
}
}

Expand Down
19 changes: 10 additions & 9 deletions native/explorer/src/dataframe/io.rs
Expand Up @@ -15,6 +15,7 @@ use std::convert::TryFrom;
use std::fs::File;
use std::io::{BufReader, BufWriter, Cursor};
use std::result::Result;
use std::sync::Arc;

use crate::dataframe::normalize_numeric_dtypes;
use crate::datatypes::ExParquetCompression;
Expand Down Expand Up @@ -57,7 +58,7 @@ pub fn df_from_csv(
_ => CsvEncoding::Utf8,
};

let schema: Option<Schema> = match dtypes {
let schema = match dtypes {
Some(dtypes) => Some(schema_from_dtypes_pairs(dtypes)?),

None => None,
Expand All @@ -66,27 +67,27 @@ pub fn df_from_csv(
let reader = CsvReader::from_path(filename)?
.infer_schema(infer_schema_length)
.has_header(has_header)
.with_parse_dates(parse_dates)
.with_try_parse_dates(parse_dates)
.with_n_rows(stop_after_n_rows)
.with_delimiter(delimiter_as_byte)
.with_skip_rows(skip_rows)
.with_projection(projection)
.with_rechunk(do_rechunk)
.with_encoding(encoding)
.with_columns(column_names)
.with_dtypes(schema.as_ref())
.with_dtypes(schema)
.with_null_values(Some(NullValues::AllColumns(vec![null_char])));

finish_reader(reader)
}

pub fn schema_from_dtypes_pairs(dtypes: Vec<(&str, &str)>) -> Result<Schema, ExplorerError> {
pub fn schema_from_dtypes_pairs(dtypes: Vec<(&str, &str)>) -> Result<Arc<Schema>, ExplorerError> {
let mut schema = Schema::new();
for (name, dtype_str) in dtypes {
let dtype = dtype_from_str(dtype_str)?;
schema.with_column(name.to_string(), dtype);
schema.with_column(name.into(), dtype);
}
Ok(schema)
Ok(Arc::new(schema))
}

fn dtype_from_str(dtype: &str) -> Result<DataType, ExplorerError> {
Expand Down Expand Up @@ -163,7 +164,7 @@ pub fn df_load_csv(
_ => CsvEncoding::Utf8,
};

let schema: Option<Schema> = match dtypes {
let schema = match dtypes {
Some(dtypes) => Some(schema_from_dtypes_pairs(dtypes)?),

None => None,
Expand All @@ -174,15 +175,15 @@ pub fn df_load_csv(
let reader = CsvReader::new(cursor)
.infer_schema(infer_schema_length)
.has_header(has_header)
.with_parse_dates(parse_dates)
.with_try_parse_dates(parse_dates)
.with_n_rows(stop_after_n_rows)
.with_delimiter(delimiter_as_byte)
.with_skip_rows(skip_rows)
.with_projection(projection)
.with_rechunk(do_rechunk)
.with_encoding(encoding)
.with_columns(column_names)
.with_dtypes(schema.as_ref())
.with_dtypes(schema)
.with_null_values(Some(NullValues::AllColumns(vec![null_char])));

finish_reader(reader)
Expand Down
26 changes: 18 additions & 8 deletions native/explorer/src/lazyframe.rs
@@ -1,4 +1,7 @@
use crate::{expressions::ex_expr_to_exprs, ExDataFrame, ExExpr, ExLazyFrame, ExplorerError};
use crate::{
dataframe::to_smart_strings, expressions::ex_expr_to_exprs, ExDataFrame, ExExpr, ExLazyFrame,
ExplorerError,
};
use polars::prelude::*;
use std::result::Result;

Expand Down Expand Up @@ -42,7 +45,13 @@ pub fn lf_tail(data: ExLazyFrame, length: u32) -> Result<ExLazyFrame, ExplorerEr
#[rustler::nif]
pub fn lf_names(data: ExLazyFrame) -> Result<Vec<String>, ExplorerError> {
    // Returns the lazy frame's column names as owned `String`s.
    // Polars 0.28 stores schema names as `SmartString`, so each name is
    // converted explicitly.
    let lf = data.clone_inner();
    let names = lf
        .schema()?
        .iter_names()
        .map(|smart_string| smart_string.to_string())
        .collect();

    // Fixed: the pre-0.28 one-liner `Ok(lf.schema()?.iter_names().cloned()
    // .collect())` was left in above the new body (merged-diff residue),
    // producing a duplicate, type-incompatible return path.
    Ok(names)
}

#[rustler::nif]
Expand Down Expand Up @@ -158,17 +167,18 @@ pub fn lf_drop_nils(
#[rustler::nif]
pub fn lf_pivot_longer(
data: ExLazyFrame,
id_vars: Vec<String>,
value_vars: Vec<String>,
id_vars: Vec<&str>,
value_vars: Vec<&str>,
names_to: String,
values_to: String,
) -> Result<ExLazyFrame, ExplorerError> {
let ldf = data.clone_inner();
let melt_opts = MeltArgs {
id_vars,
value_vars,
variable_name: Some(names_to),
value_name: Some(values_to),
id_vars: to_smart_strings(id_vars),
value_vars: to_smart_strings(value_vars),
variable_name: Some(names_to.into()),
value_name: Some(values_to.into()),
streamable: true,
};
let new_df = ldf.melt(melt_opts);
Ok(ExLazyFrame::new(new_df))
Expand Down
6 changes: 3 additions & 3 deletions native/explorer/src/lazyframe/io.rs
Expand Up @@ -70,7 +70,7 @@ pub fn lf_from_csv(
_ => CsvEncoding::Utf8,
};

let schema: Option<Schema> = match dtypes {
let schema = match dtypes {
Some(dtypes) => Some(schema_from_dtypes_pairs(dtypes)?),

None => None,
Expand All @@ -79,13 +79,13 @@ pub fn lf_from_csv(
let df = LazyCsvReader::new(filename)
.with_infer_schema_length(infer_schema_length)
.has_header(has_header)
.with_parse_dates(parse_dates)
.with_try_parse_dates(parse_dates)
.with_n_rows(stop_after_n_rows)
.with_delimiter(delimiter_as_byte)
.with_skip_rows(skip_rows)
.with_rechunk(do_rechunk)
.with_encoding(encoding)
.with_dtype_overwrite(schema.as_ref())
.with_dtype_overwrite(schema.as_deref())
.with_null_values(Some(NullValues::AllColumns(vec![null_char])))
.finish()?;

Expand Down
5 changes: 3 additions & 2 deletions native/explorer/src/series.rs
Expand Up @@ -301,7 +301,7 @@ pub fn s_argsort(
nulls_last,
multithreaded: false,
};
let indices = series.argsort(opts).cast(&DataType::Int64)?;
let indices = series.arg_sort(opts).cast(&DataType::Int64)?;
Ok(ExSeries::new(indices))
}

Expand Down Expand Up @@ -1118,7 +1118,8 @@ pub fn s_not(s1: ExSeries) -> Result<ExSeries, ExplorerError> {

#[rustler::nif(schedule = "DirtyCpu")]
pub fn s_contains(s1: ExSeries, pattern: &str) -> Result<ExSeries, ExplorerError> {
Ok(ExSeries::new(s1.utf8()?.contains(pattern)?.into()))
// TODO: maybe have "strict" as an option.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What would the strict option do?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docs don't say much :/

But for what I could understand, it is related to how the Regex is interpreted: https://pola-rs.github.io/polars/src/polars_ops/chunked_array/strings/namespace.rs.html#136

If it's "strict", then the regex must be valid. In case it's not strict and an invalid regex is used, it is going to search for "None" - nils.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@philss for us contain is never a regex, so we need to make sure our pattern is not being interpreted as a regex.

Ok(ExSeries::new(s1.utf8()?.contains(pattern, true)?.into()))
}

#[rustler::nif(schedule = "DirtyCpu")]
Expand Down
16 changes: 8 additions & 8 deletions test/explorer/polars_backend/expression_test.exs
Expand Up @@ -16,10 +16,10 @@ defmodule Explorer.PolarsBackend.ExpressionTest do

assert %Expression{} = expr = Expression.to_expr(lazy)

assert Expression.describe_filter_plan(df, expr) == """
FILTER [(col("col_a")) == (5i64)] FROM
DF ["col_a", "col_b"]; PROJECT */2 COLUMNS; SELECTION: "None"
"""
assert Expression.describe_filter_plan(df, expr) ==
String.trim("""
FILTER [(col("col_a")) == (5)] FROMDF ["col_a", "col_b"]; PROJECT */2 COLUMNS; SELECTION: "None"
""")
end

test "with basic float value" do
Expand Down Expand Up @@ -81,10 +81,10 @@ defmodule Explorer.PolarsBackend.ExpressionTest do

assert %Expression{} = expr = Expression.to_expr(lazy)

assert Expression.describe_filter_plan(df, expr) == """
FILTER [(col("col_a")) == (col("col_b"))] FROM
DF ["col_a", "col_b"]; PROJECT */2 COLUMNS; SELECTION: "None"
"""
assert Expression.describe_filter_plan(df, expr) ==
String.trim("""
FILTER [(col("col_a")) == (col("col_b"))] FROMDF ["col_a", "col_b"]; PROJECT */2 COLUMNS; SELECTION: "None"
""")
end
end
end