diff --git a/CHANGELOG.md b/CHANGELOG.md
index 25638fecdbf2..a41ded36f959 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -207,6 +207,8 @@
 - [Added `Date_Period.Week` to `start_of` and `end_of` methods.][3733]
 - [Replaced `Table.where` with a new API relying on `Table.filter`.][3750]
 - [Added `Filter_Condition` to `Vector`, `Range` and `List`.][3770]
+- [Extended `Filter_Condition` with `Is_Empty`, `Not_Empty`, `Like` and
+  `Not_Like`.][3775]
 
 [debug-shortcuts]:
   https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@@ -332,6 +334,7 @@
 [3749]: https://github.com/enso-org/enso/pull/3749
 [3750]: https://github.com/enso-org/enso/pull/3750
 [3770]: https://github.com/enso-org/enso/pull/3770
+[3775]: https://github.com/enso-org/enso/pull/3775
 
 #### Enso Compiler
 
diff --git a/build.sbt b/build.sbt
index 742d952233b0..3f6f55386f41 100644
--- a/build.sbt
+++ b/build.sbt
@@ -2039,7 +2039,7 @@ buildEngineDistribution := {
   log.info(s"Engine package created at $root")
 }
 
-val stdBitsProjects = List("Base", "Database", "Google_Api", "Image", "Table")
+val stdBitsProjects = List("Base", "Database", "Google_Api", "Image", "Table", "All")
 val allStdBits: Parser[String] =
   stdBitsProjects.map(v => v: Parser[String]).reduce(_ | _)
 
@@ -2057,7 +2057,7 @@ buildStdLib := Def.inputTaskDyn {
 }.evaluated
 
 lazy val pkgStdLibInternal = inputKey[Unit]("Use `buildStdLib`")
-pkgStdLibInternal := Def.inputTaskDyn {
+pkgStdLibInternal := Def.inputTask {
   val cmd = allStdBits.parsed
   val root = engineDistributionRoot.value
   val log: sbt.Logger = streams.value.log
@@ -2073,15 +2073,27 @@ pkgStdLibInternal := Def.inputTaskDyn {
       (`std-image` / Compile / packageBin).value
     case "Table" =>
       (`std-table` / Compile / packageBin).value
+    case "All" =>
+      (`std-base` / Compile / packageBin).value
+      (`std-table` / Compile / packageBin).value
+      (`std-database` / Compile / packageBin).value
+      (`std-image` / Compile / packageBin).value
+      (`std-google-api` / Compile / packageBin).value
     case _ =>
   }
-  StdBits.buildStdLibPackage(
-    cmd,
-    root,
-    cacheFactory,
-    log,
-    defaultDevEnsoVersion
-  )
+  val libs = if (cmd != "All") Seq(cmd) else {
+    val prefix = "Standard."
+    Editions.standardLibraries.filter(_.startsWith(prefix)).map(_.stripPrefix(prefix))
+  }
+  libs.foreach { lib =>
+    StdBits.buildStdLibPackage(
+      lib,
+      root,
+      cacheFactory,
+      log,
+      defaultDevEnsoVersion
+    )
+  }
 }.evaluated
 
 lazy val buildLauncherDistribution =
diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Filter_Condition.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Filter_Condition.enso
index 2254cc1d10ce..f22c29621dfc 100644
--- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Filter_Condition.enso
+++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Filter_Condition.enso
@@ -2,6 +2,8 @@
 from Standard.Base import all
 from Standard.Base.Data.Filter_Condition.Filter_Condition import all
 
+polyglot java import org.enso.base.Regex_Utils
+
 type Filter_Condition
     ## Is less than a value (or another column, in case of Table operations)?
     Less than:Any
@@ -57,6 +59,52 @@ type Filter_Condition
     ## Is the value equal to False (Boolean only)?
     Is_False
 
+    ## Is equal to "" or Nothing (Text only)?
+    Is_Empty
+
+    ## Is not equal to "" and is not Nothing (Text only)?
+    Not_Empty
+
+    ## Does the value match the SQL pattern (Text only)?
+
+       It accepts a Text value representing the matching pattern. In case of
+       Table operations, it can accept another column - then the corresponding
+       values from the source column and the provided column are checked.
+
+       The pattern is interpreted according to the standard SQL convention:
+       - the `%` character matches any sequence of characters,
+       - the `_` character matches any single character,
+       - any other character is matched literally.
+
+       ! Known Bugs
+         There is a known bug in Java Regex where escape characters are not
+         handled properly in Unicode-normalized matching mode. Due to this
+         limitation, Unicode normalization has been disabled for this function,
+         so beware that some equivalent graphemes like 'ś' and 's\u0301' will
+         not be matched.
+         See https://bugs.java.com/bugdatabase/view_bug.do?bug_id=8032926
+    Like pattern:Text
+
+    ## Does the value not match the SQL pattern (Text only)?
+
+       It accepts a Text value representing the matching pattern. In case of
+       Table operations, it can accept another column - then the corresponding
+       values from the source column and the provided column are checked.
+
+       The pattern is interpreted according to the standard SQL convention:
+       - the `%` character matches any sequence of characters,
+       - the `_` character matches any single character,
+       - any other character is matched literally.
+
+       ! Known Bugs
+         There is a known bug in Java Regex where escape characters are not
+         handled properly in Unicode-normalized matching mode. Due to this
+         limitation, Unicode normalization has been disabled for this function,
+         so beware that some equivalent graphemes like 'ś' and 's\u0301' will
+         not be matched.
+         See https://bugs.java.com/bugdatabase/view_bug.do?bug_id=8032926
+    Not_Like pattern:Text
+
     ## Converts a `Filter_Condition` condition into a predicate taking an
        element and returning a value indicating whether the element should be
        accepted by the filter.
@@ -80,3 +128,25 @@ type Filter_Condition
             _ -> True
         Is_True -> ==True
         Is_False -> ==False
+        Is_Empty -> elem -> case elem of
+            Nothing -> True
+            "" -> True
+            _ -> False
+        Not_Empty -> elem -> case elem of
+            Nothing -> False
+            "" -> False
+            _ -> True
+        Like sql_pattern ->
+            regex = sql_like_to_regex sql_pattern
+            regex.matches
+        Not_Like sql_pattern ->
+            regex = sql_like_to_regex sql_pattern
+            elem -> regex.matches elem . not
+
+## PRIVATE
+sql_like_to_regex sql_pattern =
+    regex_pattern = Regex_Utils.sql_like_pattern_to_regex sql_pattern
+    ## There is a bug with Java Regex in Unicode normalized mode (CANON_EQ) with quoting.
+       https://bugs.java.com/bugdatabase/view_bug.do?bug_id=8032926
+       Once that bug is fixed, `match_ascii` may be set back to `False`.
+    Regex.compile regex_pattern dot_matches_newline=True match_ascii=True
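Note: the `Is_Empty` / `Not_Empty` predicates above treat `Nothing` and the empty string `""` as empty, and everything else (including whitespace-only text) as non-empty. A minimal sketch of those semantics, written in Java purely for illustration (the class and variable names are invented for this example; this is not the Enso API):

```java
import java.util.Arrays;
import java.util.List;
import java.util.function.Predicate;

class IsEmptySemanticsSketch {
  public static void main(String[] args) {
    // Mirrors Filter_Condition.Is_Empty: null (Nothing) and "" count as empty.
    Predicate<String> isEmpty = s -> s == null || s.isEmpty();
    Predicate<String> notEmpty = isEmpty.negate();

    List<String> values = Arrays.asList("abc", "", null, " ");
    values.forEach(v -> System.out.println(v + " -> " + isEmpty.test(v)));
    // "abc" and " " are reported as non-empty; "" and null are reported as empty.
    System.out.println("non-empty count: " + values.stream().filter(notEmpty).count()); // 2
  }
}
```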
diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso
index 35fd7200406f..6d7a467a3084 100644
--- a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso
+++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso
@@ -113,6 +113,34 @@ type Column
     to_sql : SQL_Statement
     to_sql self = self.to_table.to_sql
 
+    ## PRIVATE
+       Sets up an operation of arbitrary arity.
+
+       Arguments:
+       - op_kind: The kind of the operation.
+       - operands: A vector of additional operation arguments (the column
+         itself is always passed as the first argument).
+       - new_type: The type of the SQL column that results from applying the
+         operator. If not specified, the type of this column is used.
+       - operand_types: The SQL types of the additional arguments. They are
+         used if additional arguments are constants (and if not provided, the
+         type of this column is used). If the other argument is a column, its
+         type is used.
+    make_op self op_kind operands new_type=Nothing operand_types=Nothing =
+        prepare_operand operand operand_type = case operand of
+            other_column : Column ->
+                if Helpers.check_integrity self other_column then other_column.expression else
+                    Error.throw <| Unsupported_Database_Operation_Error "Cannot use columns coming from different contexts in one expression without a join."
+            constant ->
+                actual_operand_type = operand_type.if_nothing self.sql_type
+                Expression.Constant actual_operand_type constant
+        actual_operand_types = operand_types.if_nothing (Vector.fill operands.length Nothing)
+        expressions = operands.zip actual_operand_types prepare_operand
+
+        actual_new_type = new_type.if_nothing self.sql_type
+        new_expr = Expression.Operation op_kind ([self.expression] + expressions)
+        Column.Value self.name self.connection actual_new_type new_expr self.context
+
     ## PRIVATE
 
        Creates a binary operation with given kind and operand.
@@ -129,20 +157,7 @@ type Column
          defaults to the current type if not provided.
     make_binary_op : Text -> Text -> (Column | Any) -> (SQL_Type | Nothing) -> (SQL_Type | Nothing) -> Column
     make_binary_op self op_kind operand new_type=Nothing operand_type=Nothing =
-        actual_new_type = new_type.if_nothing self.sql_type
-        case operand of
-            Column.Value _ _ _ other_expr _ ->
-                case Helpers.check_integrity self operand of
-                    False ->
-                        Error.throw <| Unsupported_Database_Operation_Error "Cannot compare columns coming from different contexts. Only columns of a single table can be compared."
-                    True ->
-                        new_expr = Expression.Operation op_kind [self.expression, other_expr]
-                        Column.Value self.name self.connection actual_new_type new_expr self.context
-            _ ->
-                actual_operand_type = operand_type.if_nothing self.sql_type
-                other = Expression.Constant actual_operand_type operand
-                new_expr = Expression.Operation op_kind [self.expression, other]
-                Column.Value self.name self.connection actual_new_type new_expr self.context
+        self.make_op op_kind [operand] new_type [operand_type]
 
     ## PRIVATE
 
@@ -153,10 +168,7 @@ type Column
        - new_type: The type of the SQL column that results from applying the
         operator.
     make_unary_op : Text -> Text -> (SQL_Type | Nothing) -> Column
-    make_unary_op self op_kind new_type=Nothing =
-        actual_new_type = new_type.if_nothing self.sql_type
-        new_expr = Expression.Operation op_kind [self.expression]
-        Column.Value self.name self.connection actual_new_type new_expr self.context
+    make_unary_op self op_kind new_type=Nothing = self.make_op op_kind [] new_type
 
     ## UNSTABLE
 
@@ -314,6 +326,22 @@ type Column
     < : Column | Any -> Column
     < self other = self.make_binary_op "<" other new_type=SQL_Type.boolean
 
+    ## Element-wise inclusive bounds check.
+
+       Arguments:
+       - lower: The lower bound to compare elements of `self` against. If
+         `lower` is a column, the comparison is performed pairwise between
+         corresponding elements of `self` and `lower`.
+       - upper: The upper bound to compare elements of `self` against. If
+         `upper` is a column, the comparison is performed pairwise between
+         corresponding elements of `self` and `upper`.
+
+       Returns a column with boolean values indicating whether values of this
+       column fit between the lower and upper bounds (both ends inclusive).
+    between : (Column | Any) -> (Column | Any) -> Column
+    between self lower upper =
+        self.make_op "BETWEEN" [lower, upper] new_type=SQL_Type.boolean
+
     ## UNSTABLE
 
        Element-wise addition.
@@ -407,6 +435,12 @@ type Column
     is_missing : Column
     is_missing self = self.make_unary_op "ISNULL" new_type=SQL_Type.boolean
 
+    ## PRIVATE
+       Returns a column of booleans, with `True` items at the positions where
+       this column contains an empty string or `Nothing`.
+    is_empty : Column
+    is_empty self = self.make_unary_op "ISEMPTY" new_type=SQL_Type.boolean
+
     ## UNSTABLE
 
        Returns a new column where missing values have been replaced with the
@@ -517,6 +551,11 @@ type Column
     contains : Column | Text -> Column
     contains self other = self.make_binary_op "contains" other new_type=SQL_Type.boolean
 
+    ## PRIVATE
+       Checks for each element of the column if it matches an SQL-like pattern.
+    like : Column | Text -> Column
+    like self other = self.make_binary_op "LIKE" other new_type=SQL_Type.boolean
+
     ## PRIVATE
     as_internal : Internal_Column
     as_internal self = Internal_Column.Value self.name self.sql_type self.expression
diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Base_Generator.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Base_Generator.enso
index 2dd95b3ac5ce..8405041367ef 100644
--- a/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Base_Generator.enso
+++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Base_Generator.enso
@@ -168,15 +168,39 @@ base_dialect =
     bin = name -> [name, make_binary_op name]
     unary = name -> [name, make_unary_op name]
    fun = name -> [name, make_function name]
+
     arith = [bin "+", bin "-", bin "*", bin "/"]
     logic = [bin "AND", bin "OR", unary "NOT"]
-    compare = [bin "=", bin "!=", bin "<", bin ">", bin "<=", bin ">="]
+    compare = [bin "=", bin "!=", bin "<", bin ">", bin "<=", bin ">=", ["BETWEEN", make_between]]
     agg = [fun "MAX", fun "MIN", fun "AVG", fun "SUM"]
     counts = [fun "COUNT", ["COUNT_ROWS", make_constant "COUNT(*)"]]
+    text = [["ISEMPTY", make_is_empty], bin "LIKE"]
     nulls = [["ISNULL", make_right_unary_op "IS NULL"], ["FILLNULL", make_function "COALESCE"]]
-    base_map = Map.from_vector (arith + logic + compare + agg + nulls + counts)
+    base_map = Map.from_vector (arith + logic + compare + agg + counts + text + nulls)
     Internal_Dialect.Value base_map wrap_in_quotes
 
+## PRIVATE
+make_is_empty : Vector Builder -> Builder
+make_is_empty arguments = case arguments.length of
+    1 ->
+        arg = arguments.at 0
+        is_null = (arg ++ " IS NULL").paren
+        is_empty = (arg ++ " = ''").paren
+        (is_null ++ " OR " ++ is_empty).paren
+    _ ->
+        Error.throw <| Illegal_State_Error_Data ("Invalid amount of arguments for operation ISEMPTY")
+
+## PRIVATE
+make_between : Vector Builder -> Builder
+make_between arguments = case arguments.length of
+    3 ->
+        expr = arguments.at 0
+        lower = arguments.at 1
+        upper = arguments.at 2
+        (expr ++ " BETWEEN " ++ lower ++ " AND " ++ upper).paren
+    _ ->
+        Error.throw <| Illegal_State_Error_Data ("Invalid amount of arguments for operation BETWEEN")
+
 ## PRIVATE
 
    Builds code for an expression.
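Note: `make_is_empty` and `make_between` only assemble SQL text around already-generated operand fragments. As a rough illustration of the fragments they are expected to produce (a plain-Java sketch with invented helper names, not Enso's `Builder` API):

```java
class SqlFragmentSketch {
  // Mirrors make_is_empty: a NULL check OR-ed with an empty-string check, parenthesised.
  static String isEmpty(String column) {
    return "((" + column + " IS NULL) OR (" + column + " = ''))";
  }

  // Mirrors make_between: a single inclusive range predicate.
  static String between(String expr, String lower, String upper) {
    return "(" + expr + " BETWEEN " + lower + " AND " + upper + ")";
  }

  public static void main(String[] args) {
    System.out.println(isEmpty("\"name\""));               // (("name" IS NULL) OR ("name" = ''))
    System.out.println(between("\"price\"", "10", "20"));  // ("price" BETWEEN 10 AND 20)
  }
}
```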
diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso
index 12aadbdfc376..22a30d1a359d 100644
--- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso
+++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso
@@ -247,6 +247,21 @@ type Column
     < : Column | Any -> Column
     < self other = run_vectorized_binary_op self "<" (<) other
 
+    ## Element-wise inclusive bounds check.
+
+       Arguments:
+       - lower: The lower bound to compare elements of `self` against. If
+         `lower` is a column, the comparison is performed pairwise between
+         corresponding elements of `self` and `lower`.
+       - upper: The upper bound to compare elements of `self` against. If
+         `upper` is a column, the comparison is performed pairwise between
+         corresponding elements of `self` and `upper`.
+
+       Returns a column with boolean values indicating whether values of this
+       column fit between the lower and upper bounds (both ends inclusive).
+    between : (Column | Any) -> (Column | Any) -> Column
+    between self lower upper = (self >= lower) && (self <= upper)
+
     ## ALIAS Add Columns
 
        Element-wise addition.
@@ -444,6 +459,12 @@ type Column
     is_missing : Column
     is_missing self = run_vectorized_unary_op self "is_missing" (== Nothing)
 
+    ## PRIVATE
+       Returns a column of booleans, with `True` items at the positions where
+       this column contains an empty string or `Nothing`.
+    is_empty : Column
+    is_empty self = run_vectorized_unary_op self "is_empty" Filter_Condition.Is_Empty.to_predicate
+
     ## Returns a column of booleans, with `True` items at the positions where
        this column does not contain a `Nothing`.
 
@@ -564,6 +585,12 @@ type Column
     contains self other =
         run_vectorized_binary_op self "contains" (a -> b -> a.contains b) other
 
+    ## PRIVATE
+       Checks for each element of the column if it matches an SQL-like pattern.
+    like : Column | Text -> Column
+    like self other =
+        run_vectorized_binary_op self "like" (_ -> _ -> Error.throw (Illegal_State_Error "The `Like` operation should only be used on Text columns.")) other
+
     ## ALIAS Transform Column
 
        Applies `function` to each item in this column and returns the column
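Note: the in-memory `between` above is defined directly as `(self >= lower) && (self <= upper)`, i.e. inclusive on both ends and applied element-wise (pairwise when the bounds are themselves columns). A hedged sketch of that element-wise semantics over plain arrays (illustrative Java only, not the Table API; `null` merely stands in for `Nothing` here):

```java
import java.util.Arrays;

class BetweenSemanticsSketch {
  // Inclusive bounds check on both ends, applied element-wise; nulls are simply propagated.
  static Boolean[] between(Integer[] values, int lower, int upper) {
    return Arrays.stream(values)
        .map(v -> v == null ? null : (v >= lower && v <= upper))
        .toArray(Boolean[]::new);
  }

  public static void main(String[] args) {
    Integer[] prices = {5, 10, 15, null, 25};
    // Prints [false, true, true, null, false]: 10 and 15 fall inside [10, 20].
    System.out.println(Arrays.toString(between(prices, 10, 20)));
  }
}
```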
diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Filter_Condition_Helpers.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Filter_Condition_Helpers.enso
index d28302c770a6..24deeea8c79f 100644
--- a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Filter_Condition_Helpers.enso
+++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Filter_Condition_Helpers.enso
@@ -12,13 +12,24 @@ from Standard.Base.Data.Filter_Condition.Filter_Condition import all
    It also performs validation and will throw errors if unexpected column
    types are encountered.
 make_filter_column source_column filter_condition = case filter_condition of
+    # Equality
+    Equal value -> (source_column == value)
+    Not_Equal value -> (source_column != value)
+    # Nothing
+    Is_Nothing -> source_column.is_missing
+    Not_Nothing -> source_column.is_missing.not
+    # Boolean
+    Is_True ->
+        Value_Type.expect_boolean source_column.value_type <| source_column
+    Is_False ->
+        Value_Type.expect_boolean source_column.value_type <| source_column.not
+    # Comparisons
     Less value -> (source_column < value)
     Equal_Or_Less value -> (source_column <= value)
-    Equal value -> (source_column == value)
     Equal_Or_Greater value -> (source_column >= value)
     Greater value -> (source_column > value)
-    Not_Equal value -> (source_column != value)
-    Between lower upper -> ((source_column >= lower) && (source_column <= upper))
+    Between lower upper -> source_column.between lower upper
+    # Text
     Starts_With prefix ->
         Value_Type.expect_text source_column.value_type <|
             expect_column_or_value_as_text "prefix" prefix <|
@@ -31,12 +42,20 @@ make_filter_column source_column filter_condition = case filter_condition of
         Value_Type.expect_text source_column.value_type <|
             expect_column_or_value_as_text "substring" substring <|
                 source_column.contains substring
-    Is_Nothing -> source_column.is_missing
-    Not_Nothing -> source_column.is_missing.not
-    Is_True ->
-        Value_Type.expect_boolean source_column.value_type <| source_column
-    Is_False ->
-        Value_Type.expect_boolean source_column.value_type <| source_column.not
+    Is_Empty ->
+        Value_Type.expect_text source_column.value_type <|
+            source_column.is_empty
+    Not_Empty ->
+        Value_Type.expect_text source_column.value_type <|
+            source_column.is_empty.not
+    Like pattern ->
+        Value_Type.expect_text source_column.value_type <|
+            expect_column_or_value_as_text "pattern" pattern <|
+                source_column.like pattern
+    Not_Like pattern ->
+        Value_Type.expect_text source_column.value_type <|
+            expect_column_or_value_as_text "pattern" pattern <|
+                source_column.like pattern . not
 
 ## PRIVATE
 expect_column_or_value_as_text field_name column_or_value ~action = case column_or_value of
diff --git a/project/StdBits.scala b/project/StdBits.scala
index 98bd16714225..292a94c9fc6b 100644
--- a/project/StdBits.scala
+++ b/project/StdBits.scala
@@ -132,7 +132,7 @@ object StdBits {
       cacheFactory: sbt.util.CacheStoreFactory,
       log: sbt.Logger,
       defaultDevEnsoVersion: String
-  ) = Def.task {
+  ) = {
     log.info(s"Building standard library package for '$name'")
     val prefix = "Standard"
     val targetPkgRoot = root / "lib" / prefix / name / defaultDevEnsoVersion
diff --git a/std-bits/base/src/main/java/org/enso/base/Regex_Utils.java b/std-bits/base/src/main/java/org/enso/base/Regex_Utils.java
index ae24a5252640..8edde276714c 100644
--- a/std-bits/base/src/main/java/org/enso/base/Regex_Utils.java
+++ b/std-bits/base/src/main/java/org/enso/base/Regex_Utils.java
@@ -70,4 +70,42 @@ public static String[] find_all_matches(String regex, String text) {
     }
     return allMatches.toArray(new String[0]);
   }
+
+  /**
+   * Converts a SQL-like pattern into a Regex with the same semantics.
+   *
+   * Special regex characters present in the input pattern are quoted to match them literally
+   * according to the SQL-like format.
+   */
+  public static String sql_like_pattern_to_regex(String sql_pattern) {
+    StringBuilder result = new StringBuilder();
+    // Accumulates the literal characters between wildcards. These will be quoted in bulk.
+    StringBuilder acc = new StringBuilder();
+    for (int i = 0; i < sql_pattern.length(); ++i) {
+      char c = sql_pattern.charAt(i);
+      if (c == '%' || c == '_') {
+        // Before inserting the converted wildcard, we append the accumulated characters, quoting
+        // them first.
+        if (acc.length() > 0) {
+          result.append(Pattern.quote(acc.toString()));
+          acc.setLength(0);
+        }
+
+        if (c == '%') {
+          result.append(".*");
+        } else {
+          result.append(".");
+        }
+      } else {
+        acc.append(c);
+      }
+    }
+
+    // If any trailing characters were left, we append them too.
+    if (acc.length() > 0) {
+      result.append(Pattern.quote(acc.toString()));
+    }
+
+    return result.toString();
+  }
 }
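Note: a quick usage sketch of the helper above, showing how the documented `%` / `_` convention surfaces in the generated regex (illustrative only; the demo class name is invented, and the printed regex follows from `Pattern.quote` wrapping literals in `\Q...\E`):

```java
import java.util.regex.Pattern;

import org.enso.base.Regex_Utils;

class SqlLikeToRegexDemo {
  public static void main(String[] args) {
    // '%' becomes ".*", '_' becomes ".", everything else is quoted literally.
    String regex = Regex_Utils.sql_like_pattern_to_regex("100% real_");
    System.out.println(regex); // \Q100\E.*\Q real\E.

    Pattern p = Pattern.compile(regex, Pattern.DOTALL);
    System.out.println(p.matcher("100%% really real!").matches()); // true
    System.out.println(p.matcher("99% real!").matches());          // false
  }
}
```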
diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/LikeOp.java b/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/LikeOp.java
new file mode 100644
index 000000000000..0963d6ab9997
--- /dev/null
+++ b/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/LikeOp.java
@@ -0,0 +1,60 @@
+package org.enso.table.data.column.operation.map.text;
+
+import java.util.BitSet;
+import java.util.regex.Pattern;
+
+import com.ibm.icu.impl.UnicodeRegex;
+import org.enso.base.Regex_Utils;
+import org.enso.table.data.column.storage.BoolStorage;
+import org.enso.table.data.column.storage.SpecializedStorage;
+import org.enso.table.data.column.storage.Storage;
+import org.enso.table.error.UnexpectedTypeException;
+
+public class LikeOp extends StringBooleanOp {
+  public LikeOp() {
+    super(Storage.Maps.LIKE);
+  }
+
+
+  /**
+   * There is a bug with Java Regex in Unicode normalized mode (CANON_EQ) with quoting.
+   * Once that bug is fixed, we should add all relevant Unicode flags here too,
+   * consistently with the Default Enso regex engine.
+   */
+  private static final int REGEX_FLAGS = Pattern.DOTALL;
+
+  private Pattern createRegexPatternFromSql(String sqlPattern) {
+    String regex = Regex_Utils.sql_like_pattern_to_regex(sqlPattern);
+    String unicodeTransformed = UnicodeRegex.fix(regex);
+    return Pattern.compile(unicodeTransformed, REGEX_FLAGS);
+  }
+
+  @Override
+  protected boolean doString(String a, String b) {
+    return createRegexPatternFromSql(b).matcher(a).matches();
+  }
+
+  @Override
+  public Storage runMap(SpecializedStorage