From 9906bbf8ac645913c8ca5b896dcbf1807e93b9ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Thu, 6 Oct 2022 12:49:24 +0200 Subject: [PATCH 01/20] Preliminary implementation for Base types --- .../0.0.0-dev/src/Data/Filter_Condition.enso | 50 +++++++++++++++++++ test/Tests/src/Data/List_Spec.enso | 7 +++ test/Tests/src/Data/Range_Spec.enso | 4 ++ test/Tests/src/Data/Vector_Spec.enso | 11 ++++ 4 files changed, 72 insertions(+) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Filter_Condition.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Filter_Condition.enso index 2254cc1d10ce..89cc920782f3 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Filter_Condition.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Filter_Condition.enso @@ -57,6 +57,36 @@ type Filter_Condition ## Is the value equal to False (Boolean only)? Is_False + ## Is equal to "" or Nothing (Text only)? + Is_Empty + + ## Is not equal to "" and Nothing (Text only)? + Not_Empty + + ## Does the value match the SQL pattern (Text only)? + + It accepts a Text value representing the matching pattern. In case of + Table operations, it can accept another column - then the corresponding + values from the source column and the provided column are checked. + + The pattern is interpreted according to the standard SQL convention: + - the `%` character matches any sequence of characters, + - the `_` character matches any single character, + - any other character is matched literally. + Like pattern:Text + + ## Does the value not match the SQL pattern (Text only)? + + It accepts a Text value representing the matching pattern. In case of + Table operations, it can accept another column - then the corresponding + values from the source column and the provided column are checked. + + The pattern is interpreted according to the standard SQL convention: + - the `%` character matches any sequence of characters, + - the `_` character matches any single character, + - any other character is matched literally. + Not_Like pattern:Text + ## Converts a `Filter_Condition` condition into a predicate taking an element and returning a value indicating whether the element should be accepted by the filter. @@ -80,3 +110,23 @@ type Filter_Condition _ -> True Is_True -> ==True Is_False -> ==False + Is_Empty -> elem -> case elem of + Nothing -> True + "" -> True + _ -> False + Not_Empty -> elem -> case elem of + Nothing -> False + "" -> False + _ -> True + Like sql_pattern -> + regex = sql_like_to_regex sql_pattern + regex.matches + Not_Like sql_pattern -> + regex = sql_like_to_regex sql_pattern + elem -> regex.matches elem . not + +## PRIVATE +sql_like_to_regex sql_pattern = + ## FIXME: IMO this will not work for `[ab]%` which will match `aaaa` and `bbbb` but it should not - it should however match `[ab]aaa`. `[]` are not special in SQL + regex_pattern = "^" + (sql_pattern.replace "\" "\\" . replace "." "\" . replace "_" "." . replace "%" ".*?") + "$" + Regex.compile regex_pattern diff --git a/test/Tests/src/Data/List_Spec.enso b/test/Tests/src/Data/List_Spec.enso index f8c0f34d636b..b756cb15d24e 100644 --- a/test/Tests/src/Data/List_Spec.enso +++ b/test/Tests/src/Data/List_Spec.enso @@ -69,6 +69,13 @@ spec = Test.group "List" <| txt.filter (Filter_Condition.Between "b" "c") . should_equal ["bbb", "baaa"].to_list Test.expect_panic_with (txt.filter (Filter_Condition.Starts_With 42)) Unsupported_Argument_Types_Data + ["", Nothing, " ", "a"].to_list.filter (Filter_Condition.Is_Empty) . should_equal ["", Nothing].to_list + ["", Nothing, " ", "a"].to_list.filter (Filter_Condition.Not_Empty) . should_equal [" ", "a"].to_list + ["abab", "aaabaaaa", "ba"].to_list.filter (Filter_Condition.Like "ba") . should_equal (Cons "ba" Nil) + ["abab", "aaabaaaa"].to_list.filter (Filter_Condition.Like "_ba_") . should_equal ["abab"].to_list + ["abab", "aaabaaaa"].to_list.filter (Filter_Condition.Like "%ba__%") . should_equal ["aaabaaaa"].to_list + ["abab", "aaabaaaa"].to_list.filter (Filter_Condition.Not_Like "%ba%") . should_equal Nil + mixed = [1, Nothing, "b"].to_list mixed.filter Filter_Condition.Is_Nothing . should_equal (Cons Nothing Nil) mixed.filter Filter_Condition.Not_Nothing . should_equal (Cons 1 (Cons "b" Nil)) diff --git a/test/Tests/src/Data/Range_Spec.enso b/test/Tests/src/Data/Range_Spec.enso index 9fdcf90b7598..a8c4cff7070f 100644 --- a/test/Tests/src/Data/Range_Spec.enso +++ b/test/Tests/src/Data/Range_Spec.enso @@ -78,10 +78,14 @@ spec = Test.group "Range" <| range.filter (Filter_Condition.Between 2.1 4.5) . should_equal [3, 4] Test.expect_panic_with (range.filter (Filter_Condition.Starts_With "a")) No_Such_Method_Error_Data + Test.expect_panic_with (range.filter (Filter_Condition.Like "a%")) Unsupported_Argument_Types_Data + Test.expect_panic_with (range.filter (Filter_Condition.Not_Like "a_")) Unsupported_Argument_Types_Data range.filter Filter_Condition.Is_True . should_equal [] range.filter Filter_Condition.Is_False . should_equal [] range.filter Filter_Condition.Is_Nothing . should_equal [] range.filter Filter_Condition.Not_Nothing . should_equal [1, 2, 3, 4, 5] + range.filter Filter_Condition.Is_Empty . should_equal [] + range.filter Filter_Condition.Not_Empty . should_equal [1, 2, 3, 4, 5] Test.specify "should allow iteration" <| vec_mut = Vector.new_builder diff --git a/test/Tests/src/Data/Vector_Spec.enso b/test/Tests/src/Data/Vector_Spec.enso index 82d898d3ee5a..8fefa0eaef09 100644 --- a/test/Tests/src/Data/Vector_Spec.enso +++ b/test/Tests/src/Data/Vector_Spec.enso @@ -165,9 +165,20 @@ spec = Test.group "Vectors" <| txtvec.filter (Filter_Condition.Between "b" "c") . should_equal ["bbb", "baaa"] Test.expect_panic_with (txtvec.filter (Filter_Condition.Starts_With 42)) Unsupported_Argument_Types_Data + ["", Nothing, " ", "a"].filter (Filter_Condition.Is_Empty) . should_equal ["", Nothing] + ["", Nothing, " ", "a"].filter (Filter_Condition.Not_Empty) . should_equal [" ", "a"] + ["abab", "aaabaaaa", "ba"].filter (Filter_Condition.Like "ba") . should_equal ["ba"] + ["abab", "aaabaaaa"].filter (Filter_Condition.Like "_ba_") . should_equal ["abab"] + ["abab", "aaabaaaa"].filter (Filter_Condition.Like "%ba__%") . should_equal ["aaabaaaa"] + ["aaaa", "bbbbb", "[ab]aaaa"].filter (Filter_Condition.Like "[ab]%") . should_equal ["[ab]aaaa"] + ["f.txt", "abc.*"].filter (Filter_Condition.Like "%.*") . should_equal ["abc.*"] + ["f.txt", "abc.*"].filter (Filter_Condition.Not_Like "%.*") . should_equal ["f.txt"] + mixed = [1, Nothing, "b"] mixed.filter Filter_Condition.Is_Nothing . should_equal [Nothing] mixed.filter Filter_Condition.Not_Nothing . should_equal [1, "b"] + mixed.filter Filter_Condition.Is_Empty . should_equal [Nothing] + mixed.filter Filter_Condition.Not_Empty . should_equal [1, "b"] boolvec = [True, False, Nothing, True] boolvec.filter Filter_Condition.Is_True . should_equal [True, True] From b082949f17503d76b1597640747fcd3f3ffb6d5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Thu, 6 Oct 2022 13:14:08 +0200 Subject: [PATCH 02/20] Fix like, move to Java in anticipation of vectorized ops --- .../0.0.0-dev/src/Data/Filter_Condition.enso | 5 +-- .../main/java/org/enso/base/Regex_Utils.java | 35 +++++++++++++++++++ test/Tests/src/Data/Vector_Spec.enso | 1 + 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Filter_Condition.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Filter_Condition.enso index 89cc920782f3..326c0a8feb05 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Filter_Condition.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Filter_Condition.enso @@ -2,6 +2,8 @@ from Standard.Base import all from Standard.Base.Data.Filter_Condition.Filter_Condition import all +polyglot java import org.enso.base.Regex_Utils + type Filter_Condition ## Is less than a value (or another column, in case of Table operations)? Less than:Any @@ -127,6 +129,5 @@ type Filter_Condition ## PRIVATE sql_like_to_regex sql_pattern = - ## FIXME: IMO this will not work for `[ab]%` which will match `aaaa` and `bbbb` but it should not - it should however match `[ab]aaa`. `[]` are not special in SQL - regex_pattern = "^" + (sql_pattern.replace "\" "\\" . replace "." "\" . replace "_" "." . replace "%" ".*?") + "$" + regex_pattern = Regex_Utils.sql_like_pattern_to_regex sql_pattern Regex.compile regex_pattern diff --git a/std-bits/base/src/main/java/org/enso/base/Regex_Utils.java b/std-bits/base/src/main/java/org/enso/base/Regex_Utils.java index ae24a5252640..619c42057bd7 100644 --- a/std-bits/base/src/main/java/org/enso/base/Regex_Utils.java +++ b/std-bits/base/src/main/java/org/enso/base/Regex_Utils.java @@ -70,4 +70,39 @@ public static String[] find_all_matches(String regex, String text) { } return allMatches.toArray(new String[0]); } + + /** Converts a SQL-like pattern into a Regex with the same semantics. + * + * Special regex characters present in the input pattern are quoted to match them literally according to the SQL-like format. + */ + public static String sql_like_pattern_to_regex(String sql_pattern) { + StringBuilder result = new StringBuilder(); + // Accumulates the intermittent characters between wildcards. These will be quoted in bulk. + StringBuilder acc = new StringBuilder(); + for (int i = 0; i < sql_pattern.length(); ++i) { + char c = sql_pattern.charAt(i); + if (c == '%' || c == '_') { + // Before inserting the converted wildcard, we append the accumulated characters, quoting them first. + if (acc.length() > 0) { + result.append(Pattern.quote(acc.toString())); + acc.setLength(0); + } + + if (c == '%') { + result.append(".*"); + } else { + result.append("."); + } + } else { + acc.append(c); + } + } + + // If any trailing characters were left, we append them too. + if (acc.length() > 0) { + result.append(Pattern.quote(acc.toString())); + } + + return result.toString(); + } } diff --git a/test/Tests/src/Data/Vector_Spec.enso b/test/Tests/src/Data/Vector_Spec.enso index 8fefa0eaef09..3093b241499d 100644 --- a/test/Tests/src/Data/Vector_Spec.enso +++ b/test/Tests/src/Data/Vector_Spec.enso @@ -171,6 +171,7 @@ spec = Test.group "Vectors" <| ["abab", "aaabaaaa"].filter (Filter_Condition.Like "_ba_") . should_equal ["abab"] ["abab", "aaabaaaa"].filter (Filter_Condition.Like "%ba__%") . should_equal ["aaabaaaa"] ["aaaa", "bbbbb", "[ab]aaaa"].filter (Filter_Condition.Like "[ab]%") . should_equal ["[ab]aaaa"] + ["a\Qa\Eabb", "aaabb"].filter (Filter_Condition.Like "_\Qa\Ea%") . should_equal ["a\Qa\Eabb"] ["f.txt", "abc.*"].filter (Filter_Condition.Like "%.*") . should_equal ["abc.*"] ["f.txt", "abc.*"].filter (Filter_Condition.Not_Like "%.*") . should_equal ["f.txt"] From 6ea50616f83516725c050bb410c35cc7f0da1599 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Thu, 6 Oct 2022 15:41:19 +0200 Subject: [PATCH 03/20] tests, initial impl of like, empty, update between --- .../Database/0.0.0-dev/src/Data/Column.enso | 17 +++++++++ .../Table/0.0.0-dev/src/Data/Column.enso | 28 ++++++++++++++ .../Internal/Filter_Condition_Helpers.enso | 37 ++++++++++++++----- .../table/data/column/storage/Storage.java | 2 + .../data/column/storage/StringStorage.java | 25 +++++++++++++ test/Tests/src/Data/List_Spec.enso | 1 + test/Tests/src/Data/Range_Spec.enso | 1 + test/Tests/src/Data/Vector_Spec.enso | 3 ++ 8 files changed, 105 insertions(+), 9 deletions(-) diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso index 35fd7200406f..27a259530595 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso @@ -314,6 +314,23 @@ type Column < : Column | Any -> Column < self other = self.make_binary_op "<" other new_type=SQL_Type.boolean + ## Element-wise inclusive bounds check. + + Arguments: + - lower: The lower bound to compare elements of `self` against. If + `lower` is a column, the comparison is performed pairwise between + corresponding elements of `self` and `lower`. + - upper: The upper bound to compare elements of `self` against. If + `upper` is a column, the comparison is performed pairwise between + corresponding elements of `self` and `upper`. + + Returns a column with boolean values indicating whether values of this + column fit between the lower and upper bounds (both ends inclusive). + between : (Column | Any) -> (Column | Any) -> Column + between self lower upper = + # TODO change this to use SQL BETWEEN! + (self >= lower) && (self <= upper) + ## UNSTABLE Element-wise addition. diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso index 12aadbdfc376..407a99190cd2 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso @@ -1,4 +1,5 @@ from Standard.Base import all +import Standard.Base.Data.Filter_Condition as Filter_Condition_Module import Standard.Base.Data.Ordering.Comparator import Standard.Base.Data.Index_Sub_Range @@ -247,6 +248,21 @@ type Column < : Column | Any -> Column < self other = run_vectorized_binary_op self "<" (<) other + ## Element-wise inclusive bounds check. + + Arguments: + - lower: The lower bound to compare elements of `self` against. If + `lower` is a column, the comparison is performed pairwise between + corresponding elements of `self` and `lower`. + - upper: The upper bound to compare elements of `self` against. If + `upper` is a column, the comparison is performed pairwise between + corresponding elements of `self` and `upper`. + + Returns a column with boolean values indicating whether values of this + column fit between the lower and upper bounds (both ends inclusive). + between : (Column | Any) -> (Column | Any) -> Column + between self lower upper = (self >= lower) && (self <= upper) + ## ALIAS Add Columns Element-wise addition. @@ -444,6 +460,12 @@ type Column is_missing : Column is_missing self = run_vectorized_unary_op self "is_missing" (== Nothing) + ## PRIVATE + Returns a column of booleans, with `True` items at the positions where + this column contains an empty string or `Nothing`. + is_empty : Column + is_empty self = run_vectorized_unary_op self "is_empty" Filter_Condition_Module.is_empty + ## Returns a column of booleans, with `True` items at the positions where this column does not contain a `Nothing`. @@ -564,6 +586,12 @@ type Column contains self other = run_vectorized_binary_op self "contains" (a -> b -> a.contains b) other + ## PRIVATE + Checks for each element of the column if it matches an SQL-like pattern. + like : Column | Text -> Column + like self other = + run_vectorized_binary_op self "like" (_ -> _ -> Error.throw (Illegal_State_Error "The `Like` operation should only be used on Text columns.")) other + ## ALIAS Transform Column Applies `function` to each item in this column and returns the column diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Filter_Condition_Helpers.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Filter_Condition_Helpers.enso index d28302c770a6..24deeea8c79f 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Filter_Condition_Helpers.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Filter_Condition_Helpers.enso @@ -12,13 +12,24 @@ from Standard.Base.Data.Filter_Condition.Filter_Condition import all It also performs validation and will throw errors if unexpected column types are encountered. make_filter_column source_column filter_condition = case filter_condition of + # Equality + Equal value -> (source_column == value) + Not_Equal value -> (source_column != value) + # Nothing + Is_Nothing -> source_column.is_missing + Not_Nothing -> source_column.is_missing.not + # Boolean + Is_True -> + Value_Type.expect_boolean source_column.value_type <| source_column + Is_False -> + Value_Type.expect_boolean source_column.value_type <| source_column.not + # Comparisons Less value -> (source_column < value) Equal_Or_Less value -> (source_column <= value) - Equal value -> (source_column == value) Equal_Or_Greater value -> (source_column >= value) Greater value -> (source_column > value) - Not_Equal value -> (source_column != value) - Between lower upper -> ((source_column >= lower) && (source_column <= upper)) + Between lower upper -> source_column.between lower upper + # Text Starts_With prefix -> Value_Type.expect_text source_column.value_type <| expect_column_or_value_as_text "prefix" prefix <| @@ -31,12 +42,20 @@ make_filter_column source_column filter_condition = case filter_condition of Value_Type.expect_text source_column.value_type <| expect_column_or_value_as_text "substring" substring <| source_column.contains substring - Is_Nothing -> source_column.is_missing - Not_Nothing -> source_column.is_missing.not - Is_True -> - Value_Type.expect_boolean source_column.value_type <| source_column - Is_False -> - Value_Type.expect_boolean source_column.value_type <| source_column.not + Is_Empty -> + Value_Type.expect_text source_column.value_type <| + source_column.is_empty + Not_Empty -> + Value_Type.expect_text source_column.value_type <| + source_column.is_empty.not + Like pattern -> + Value_Type.expect_text source_column.value_type <| + expect_column_or_value_as_text "pattern" pattern <| + source_column.like pattern + Not_Like pattern -> + Value_Type.expect_text source_column.value_type <| + expect_column_or_value_as_text "pattern" pattern <| + source_column.like pattern . not ## PRIVATE expect_column_or_value_as_text field_name column_or_value ~action = case column_or_value of diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/Storage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/Storage.java index 871c7bfd41fd..b1560d3d740e 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/Storage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/Storage.java @@ -78,9 +78,11 @@ public static final class Maps { public static final String AND = "&&"; public static final String OR = "||"; public static final String IS_MISSING = "is_missing"; + public static final String IS_EMPTY = "is_empty"; public static final String STARTS_WITH = "starts_with"; public static final String ENDS_WITH = "ends_with"; public static final String CONTAINS = "contains"; + public static final String LIKE = "like"; } public static final class Aggregators { diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java index 807789382da9..e0846b7c69e4 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java @@ -1,10 +1,14 @@ package org.enso.table.data.column.storage; import java.util.BitSet; +import java.util.regex.Pattern; + +import org.enso.base.Regex_Utils; import org.enso.base.Text_Utils; import org.enso.table.data.column.builder.object.StringBuilder; import org.enso.table.data.column.operation.map.MapOpStorage; import org.enso.table.data.column.operation.map.MapOperation; +import org.enso.table.data.column.operation.map.UnaryMapOperation; import org.enso.table.data.column.operation.map.text.StringBooleanOp; import org.graalvm.polyglot.Value; @@ -93,6 +97,20 @@ public Storage runZip(SpecializedStorage storage, Storage arg) { return new BoolStorage(r, missing, storage.size(), false); } }); + t.add( + new UnaryMapOperation<>(Maps.IS_EMPTY) { + @Override + protected Storage run(SpecializedStorage storage) { + BitSet r = new BitSet(); + for (int i = 0; i < storage.size; i++) { + String s = storage.data[i]; + if (s == null || s.isEmpty()) { + r.set(i); + } + } + return new BoolStorage(r, new BitSet(), storage.size, false); + } + }); t.add( new StringBooleanOp(Maps.STARTS_WITH) { @Override @@ -114,6 +132,13 @@ protected boolean doString(String a, String b) { return Text_Utils.contains(a, b); } }); + t.add( + new StringBooleanOp(Maps.LIKE) { + @Override + protected boolean doString(String a, String b) { + return Pattern.compile(Regex_Utils.sql_like_pattern_to_regex(b)).matcher(a).matches(); + } + }); return t; } } diff --git a/test/Tests/src/Data/List_Spec.enso b/test/Tests/src/Data/List_Spec.enso index b756cb15d24e..987b117ebfee 100644 --- a/test/Tests/src/Data/List_Spec.enso +++ b/test/Tests/src/Data/List_Spec.enso @@ -49,6 +49,7 @@ spec = Test.group "List" <| list.filter (Filter_Condition.Greater than=3) . should_equal [4, 5].to_list list.filter (Filter_Condition.Less than=3.5) . should_equal [1, 2, 3].to_list list.filter (Filter_Condition.Equal to=3) . should_equal (Cons 3 Nil) + list.filter (Filter_Condition.Not_Equal to=3) . should_equal [1, 2, 4, 5].to_list list.filter (Filter_Condition.Equal_Or_Greater than=3) . should_equal [3, 4, 5].to_list list.filter (Filter_Condition.Equal_Or_Less than=(-1)) . should_equal Nil list.filter (Filter_Condition.Between 2 4) . should_equal [2, 3, 4].to_list diff --git a/test/Tests/src/Data/Range_Spec.enso b/test/Tests/src/Data/Range_Spec.enso index a8c4cff7070f..df48b7b9500b 100644 --- a/test/Tests/src/Data/Range_Spec.enso +++ b/test/Tests/src/Data/Range_Spec.enso @@ -72,6 +72,7 @@ spec = Test.group "Range" <| range.filter (Filter_Condition.Greater than=3) . should_equal [4, 5] range.filter (Filter_Condition.Less than=3.5) . should_equal [1, 2, 3] range.filter (Filter_Condition.Equal to=3) . should_equal [3] + range.filter (Filter_Condition.Not_Equal to=3) . should_equal [1, 2, 4, 5] range.filter (Filter_Condition.Equal_Or_Greater than=3) . should_equal [3, 4, 5] range.filter (Filter_Condition.Equal_Or_Less than=(-1)) . should_equal [] range.filter (Filter_Condition.Between 2 4) . should_equal [2, 3, 4] diff --git a/test/Tests/src/Data/Vector_Spec.enso b/test/Tests/src/Data/Vector_Spec.enso index 3093b241499d..bb4ba95854ed 100644 --- a/test/Tests/src/Data/Vector_Spec.enso +++ b/test/Tests/src/Data/Vector_Spec.enso @@ -144,6 +144,9 @@ spec = Test.group "Vectors" <| vec.filter (Filter_Condition.Greater than=3) . should_equal [4, 5] vec.filter (Filter_Condition.Less than=3.5) . should_equal [1, 2, 3] vec.filter (Filter_Condition.Equal to=3) . should_equal [3] + vec.filter (Filter_Condition.Equal to=3.0) . should_equal [3] + vec.filter (Filter_Condition.Equal to=3.1) . should_equal [] + vec.filter (Filter_Condition.Not_Equal to=3) . should_equal [1, 2, 4, 5] vec.filter (Filter_Condition.Equal_Or_Greater than=3) . should_equal [3, 4, 5] vec.filter (Filter_Condition.Equal_Or_Less than=(-1)) . should_equal [] vec.filter (Filter_Condition.Between 2 4) . should_equal [2, 3, 4] From 6ab3f089e9620fafc0a695cf7a506494103268f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Thu, 6 Oct 2022 16:30:49 +0200 Subject: [PATCH 04/20] Add empty, like tests for Table --- test/Table_Tests/src/Common_Table_Spec.enso | 31 +++++++++++++++++++-- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/test/Table_Tests/src/Common_Table_Spec.enso b/test/Table_Tests/src/Common_Table_Spec.enso index 2523abe6cacd..c3b748282131 100644 --- a/test/Table_Tests/src/Common_Table_Spec.enso +++ b/test/Table_Tests/src/Common_Table_Spec.enso @@ -1118,7 +1118,7 @@ spec prefix table_builder test_selection pending=Nothing = t.filter "X" (Filter_Condition.Equal to=(t.at "Y")) . at "X" . to_vector . should_equal ["b", "c"] t.filter "X" (Filter_Condition.Between (t.at "Y") "bzzzz") . at "X" . to_vector . should_equal ["abb", "baca", "b"] - Test.specify "by text search (contains, starts_with, ends_with)" <| + Test.specify "by text search (contains, starts_with, ends_with, like)" <| t = table_builder [["ix", [1, 2, 3, 4, 5]], ["X", ["abb", "baca", "banana", Nothing, "nana"]], ["Y", ["a", "b", "b", "c", "a"]]] t.filter "X" (Filter_Condition.Starts_With "ba") on_problems=Report_Error . at "X" . to_vector . should_equal ["baca", "banana"] @@ -1129,8 +1129,21 @@ spec prefix table_builder test_selection pending=Nothing = t.filter "X" (Filter_Condition.Ends_With (t.at "Y")) on_problems=Report_Error . at "X" . to_vector . should_equal ["nana"] t.filter "X" (Filter_Condition.Contains (t.at "Y")) on_problems=Report_Error . at "X" . to_vector . should_equal ["abb", "baca", "banana", "nana"] + t.filter "X" (Filter_Condition.Like "%an%") on_problems=Report_Error . at "X" . to_vector . should_equal ["banana", "nana"] + t.filter "X" (Filter_Condition.Like "_a%") on_problems=Report_Error . at "X" . to_vector . should_equal ["baca", "banana", "nana"] + t.filter "X" (Filter_Condition.Like "%b") on_problems=Report_Error . at "X" . to_vector . should_equal ["abb"] + t.filter "X" (Filter_Condition.Like "nana") on_problems=Report_Error . at "X" . to_vector . should_equal ["nana"] + t.filter "X" (Filter_Condition.Not_Like "%b") on_problems=Report_Error . at "X" . to_vector . should_equal ["baca", "banana", "nana"] + + Test.specify "by empty text search" <| + t = table_builder [["ix", [1, 2, 3, 4, 5]], ["X", ["abb", "", " ", Nothing, "nana"]]] + t.filter "X" Filter_Condition.Is_Empty on_problems=Report_Error . at "X" . to_vector . should_equal ["", Nothing] + t.filter "X" Filter_Condition.Not_Empty on_problems=Report_Error . at "X" . to_vector . should_equal ["abb", " ", "nana"] + + Test.specify "should check types for text operations" <| + t = table_builder [["ix", [1, 2, 3, 4]], ["X", [Nothing, "A", "", " "]]] check_column_type_error_handling action = - tester = check_empty ["ix", "X", "Y"] + tester = check_empty ["ix", "X"] check_problem problem = problem.should_be_a Invalid_Value_Type.Invalid_Value_Type_Data problem.expected . should_equal Value_Type.Char @@ -1142,17 +1155,27 @@ spec prefix table_builder test_selection pending=Nothing = check_column_type_error_handling (t.filter "X" (Filter_Condition.Starts_With (t.at "ix")) on_problems=_) check_column_type_error_handling (t.filter "X" (Filter_Condition.Ends_With (t.at "ix")) on_problems=_) check_column_type_error_handling (t.filter "X" (Filter_Condition.Contains (t.at "ix")) on_problems=_) + check_column_type_error_handling (t.filter "X" (Filter_Condition.Like (t.at "ix")) on_problems=_) + check_column_type_error_handling (t.filter "X" (Filter_Condition.Not_Like (t.at "ix")) on_problems=_) + check_column_type_error_handling (t.filter "ix" (Filter_Condition.Starts_With "A") on_problems=_) check_column_type_error_handling (t.filter "ix" (Filter_Condition.Ends_With "A") on_problems=_) check_column_type_error_handling (t.filter "ix" (Filter_Condition.Contains "A") on_problems=_) + check_column_type_error_handling (t.filter "ix" (Filter_Condition.Like "A") on_problems=_) + check_column_type_error_handling (t.filter "ix" (Filter_Condition.Not_Like "A") on_problems=_) + + check_column_type_error_handling (t.filter "ix" Filter_Condition.Is_Empty on_problems=_) + check_column_type_error_handling (t.filter "ix" Filter_Condition.Not_Empty on_problems=_) check_scalar_type_error_handling name action = - tester = check_empty ["ix", "X", "Y"] + tester = check_empty ["ix", "X"] problems = [Type_Error_Data Text Integer name] Problems.test_problem_handling action problems tester check_scalar_type_error_handling "prefix" (t.filter "X" (Filter_Condition.Starts_With 42) on_problems=_) check_scalar_type_error_handling "suffix" (t.filter "X" (Filter_Condition.Ends_With 42) on_problems=_) check_scalar_type_error_handling "substring" (t.filter "X" (Filter_Condition.Contains 42) on_problems=_) + check_scalar_type_error_handling "pattern" (t.filter "X" (Filter_Condition.Like 42) on_problems=_) + check_scalar_type_error_handling "pattern" (t.filter "X" (Filter_Condition.Not_Like 42) on_problems=_) Test.specify "by nulls" <| t = table_builder [["ix", [1, 2, 3, 4]], ["X", [Nothing, 1, Nothing, 4]]] @@ -1169,6 +1192,8 @@ spec prefix table_builder test_selection pending=Nothing = t.filter "b" on_problems=Report_Error . at "ix" . to_vector . should_equal [1, 4, 5] t.filter "b" Filter_Condition.Is_False on_problems=Report_Error . at "ix" . to_vector . should_equal [2] + Test.specify "should check types of boolean operations" <| + t = table_builder [["ix", [1, 2, 3, 4, 5]], ["b", [True, False, Nothing, True, True]]] tester = check_empty ["ix", "b"] check_problem problem = problem.should_be_a Invalid_Value_Type.Invalid_Value_Type_Data From 9d5e2714a2242c0bcbe4be4a8f3ffeb13f5f0cf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Thu, 6 Oct 2022 18:18:55 +0200 Subject: [PATCH 05/20] shortcut to rebuild all stdlib quickly --- build.sbt | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/build.sbt b/build.sbt index 742d952233b0..720b3f91195b 100644 --- a/build.sbt +++ b/build.sbt @@ -2039,7 +2039,7 @@ buildEngineDistribution := { log.info(s"Engine package created at $root") } -val stdBitsProjects = List("Base", "Database", "Google_Api", "Image", "Table") +val stdBitsProjects = List("Base", "Database", "Google_Api", "Image", "Table", "All") val allStdBits: Parser[String] = stdBitsProjects.map(v => v: Parser[String]).reduce(_ | _) @@ -2057,7 +2057,7 @@ buildStdLib := Def.inputTaskDyn { }.evaluated lazy val pkgStdLibInternal = inputKey[Unit]("Use `buildStdLib`") -pkgStdLibInternal := Def.inputTaskDyn { +pkgStdLibInternal := Def.inputTask { val cmd = allStdBits.parsed val root = engineDistributionRoot.value val log: sbt.Logger = streams.value.log @@ -2073,15 +2073,24 @@ pkgStdLibInternal := Def.inputTaskDyn { (`std-image` / Compile / packageBin).value case "Table" => (`std-table` / Compile / packageBin).value + case "All" => + (`std-base` / Compile / packageBin).value + (`std-table` / Compile / packageBin).value + (`std-database` / Compile / packageBin).value + (`std-image` / Compile / packageBin).value + (`std-google-api` / Compile / packageBin).value case _ => } - StdBits.buildStdLibPackage( - cmd, - root, - cacheFactory, - log, - defaultDevEnsoVersion - ) + val libs = if (cmd == "All") Seq("Base", "Table", "Database", "Image", "Google_Api") else Seq(cmd) + libs.foreach { lib => + StdBits.buildStdLibPackage( + lib, + root, + cacheFactory, + log, + defaultDevEnsoVersion + ) + } }.evaluated lazy val buildLauncherDistribution = From eeb02d392cf8cecf759cbb6cb18ce252f6a59f00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Thu, 6 Oct 2022 18:33:33 +0200 Subject: [PATCH 06/20] update previously skipped packages too --- build.sbt | 5 ++++- project/StdBits.scala | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/build.sbt b/build.sbt index 720b3f91195b..3f6f55386f41 100644 --- a/build.sbt +++ b/build.sbt @@ -2081,7 +2081,10 @@ pkgStdLibInternal := Def.inputTask { (`std-google-api` / Compile / packageBin).value case _ => } - val libs = if (cmd == "All") Seq("Base", "Table", "Database", "Image", "Google_Api") else Seq(cmd) + val libs = if (cmd != "All") Seq(cmd) else { + val prefix = "Standard." + Editions.standardLibraries.filter(_.startsWith(prefix)).map(_.stripPrefix(prefix)) + } libs.foreach { lib => StdBits.buildStdLibPackage( lib, diff --git a/project/StdBits.scala b/project/StdBits.scala index 98bd16714225..292a94c9fc6b 100644 --- a/project/StdBits.scala +++ b/project/StdBits.scala @@ -132,7 +132,7 @@ object StdBits { cacheFactory: sbt.util.CacheStoreFactory, log: sbt.Logger, defaultDevEnsoVersion: String - ) = Def.task { + ) = { log.info(s"Building standard library package for '$name'") val prefix = "Standard" val targetPkgRoot = root / "lib" / prefix / name / defaultDevEnsoVersion From 28f05423f048083a46669b9284fbb53930960cf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Thu, 6 Oct 2022 18:37:11 +0200 Subject: [PATCH 07/20] Add is_empty and like to Database --- .../Database/0.0.0-dev/src/Data/Column.enso | 11 +++++++++++ .../0.0.0-dev/src/Internal/Base_Generator.enso | 16 +++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso index 27a259530595..1683bc5659c8 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso @@ -424,6 +424,12 @@ type Column is_missing : Column is_missing self = self.make_unary_op "ISNULL" new_type=SQL_Type.boolean + ## PRIVATE + Returns a column of booleans, with `True` items at the positions where + this column contains an empty string or `Nothing`. + is_empty : Column + is_empty self = self.make_unary_op "ISEMPTY" new_type=SQL_Type.boolean + ## UNSTABLE Returns a new column where missing values have been replaced with the @@ -534,6 +540,11 @@ type Column contains : Column | Text -> Column contains self other = self.make_binary_op "contains" other new_type=SQL_Type.boolean + ## PRIVATE + Checks for each element of the column if it matches an SQL-like pattern. + like : Column | Text -> Column + like self other = self.make_binary_op "LIKE" other new_type=SQL_Type.boolean + ## PRIVATE as_internal : Internal_Column as_internal self = Internal_Column.Value self.name self.sql_type self.expression diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Base_Generator.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Base_Generator.enso index 2dd95b3ac5ce..3347f5dd2c3d 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Base_Generator.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Base_Generator.enso @@ -168,15 +168,29 @@ base_dialect = bin = name -> [name, make_binary_op name] unary = name -> [name, make_unary_op name] fun = name -> [name, make_function name] + arith = [bin "+", bin "-", bin "*", bin "/"] logic = [bin "AND", bin "OR", unary "NOT"] compare = [bin "=", bin "!=", bin "<", bin ">", bin "<=", bin ">="] agg = [fun "MAX", fun "MIN", fun "AVG", fun "SUM"] counts = [fun "COUNT", ["COUNT_ROWS", make_constant "COUNT(*)"]] + text = [["ISEMPTY", is_empty], bin "LIKE"] nulls = [["ISNULL", make_right_unary_op "IS NULL"], ["FILLNULL", make_function "COALESCE"]] - base_map = Map.from_vector (arith + logic + compare + agg + nulls + counts) + base_map = Map.from_vector (arith + logic + compare + agg + counts + text + nulls) Internal_Dialect.Value base_map wrap_in_quotes +## PRIVATE +is_empty : Vector Builder -> Builder +is_empty = arguments -> + case arguments.length == 1 of + True -> + arg = arguments.at 0 + is_null = (arg ++ " IS NULL").paren + is_empty = (arg ++ " == ''").paren + (is_null ++ " OR " ++ is_empty).paren + False -> + Error.throw <| Illegal_State_Error_Data ("Invalid amount of arguments for operation ISEMPTY") + ## PRIVATE Builds code for an expression. From d3ec5145d4b089384631b99798f648b5b2e0a3b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Thu, 6 Oct 2022 18:48:58 +0200 Subject: [PATCH 08/20] fix for postgres --- .../Database/0.0.0-dev/src/Internal/Base_Generator.enso | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Base_Generator.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Base_Generator.enso index 3347f5dd2c3d..a4040dd1acbc 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Base_Generator.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Base_Generator.enso @@ -186,7 +186,7 @@ is_empty = arguments -> True -> arg = arguments.at 0 is_null = (arg ++ " IS NULL").paren - is_empty = (arg ++ " == ''").paren + is_empty = (arg ++ " = ''").paren (is_null ++ " OR " ++ is_empty).paren False -> Error.throw <| Illegal_State_Error_Data ("Invalid amount of arguments for operation ISEMPTY") From a28b63e48251a3bcf464dfb0720db7f6ac16c947 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Thu, 6 Oct 2022 19:24:35 +0200 Subject: [PATCH 09/20] Ensure SQL BETWEEN is used for the Between filter in Database --- .../Database/0.0.0-dev/src/Data/Column.enso | 51 +++++++++++-------- .../src/Internal/Base_Generator.enso | 34 ++++++++----- .../src/Database/Codegen_Spec.enso | 10 ++++ 3 files changed, 63 insertions(+), 32 deletions(-) diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso index 1683bc5659c8..90d61191550d 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso @@ -113,6 +113,34 @@ type Column to_sql : SQL_Statement to_sql self = self.to_table.to_sql + ## PRIVATE + Sets up an operation of arbitrary arity. + + Arguments: + - op_kind: The kind of the operation + - operands: A vector of additional operation arguments (the column itself + is always passed as the first argument). + - new_type: The type of the SQL column that results from applying the + operator. If not specified, the type of this column is used. + - operand_types: The SQL types of the additional arguments. They are used + if additional arguments are constants (and if not provided, the type of + this column is used). If the other argument is a column, its type is + used. + make_op self op_kind operands new_type=Nothing operand_types=Nothing = + prepare_operand operand operand_type = case operand of + other_column : Column -> + if Helpers.check_integrity self other_column then other_column.expression else + Error.throw <| Unsupported_Database_Operation_Error "Cannot use columns coming from different contexts in one expression without a join." + constant -> + actual_operand_type = operand_type.if_nothing self.sql_type + IR.make_constant actual_operand_type constant + actual_operand_types = operand_types.if_nothing (Vector.fill operands.length Nothing) + expressions = operands.zip actual_operand_types prepare_operand + + actual_new_type = new_type.if_nothing self.sql_type + new_expr = IR.Operation op_kind ([self.expression] + expressions) + Column.Value self.name self.connection actual_new_type new_expr self.context + ## PRIVATE Creates a binary operation with given kind and operand. @@ -129,20 +157,7 @@ type Column defaults to the current type if not provided. make_binary_op : Text -> Text -> (Column | Any) -> (SQL_Type | Nothing) -> (SQL_Type | Nothing) -> Column make_binary_op self op_kind operand new_type=Nothing operand_type=Nothing = - actual_new_type = new_type.if_nothing self.sql_type - case operand of - Column.Value _ _ _ other_expr _ -> - case Helpers.check_integrity self operand of - False -> - Error.throw <| Unsupported_Database_Operation_Error "Cannot compare columns coming from different contexts. Only columns of a single table can be compared." - True -> - new_expr = Expression.Operation op_kind [self.expression, other_expr] - Column.Value self.name self.connection actual_new_type new_expr self.context - _ -> - actual_operand_type = operand_type.if_nothing self.sql_type - other = Expression.Constant actual_operand_type operand - new_expr = Expression.Operation op_kind [self.expression, other] - Column.Value self.name self.connection actual_new_type new_expr self.context + self.make_op op_kind [operand] new_type [operand_type] ## PRIVATE @@ -153,10 +168,7 @@ type Column - new_type: The type of the SQL column that results from applying the operator. make_unary_op : Text -> Text -> (SQL_Type | Nothing) -> Column - make_unary_op self op_kind new_type=Nothing = - actual_new_type = new_type.if_nothing self.sql_type - new_expr = Expression.Operation op_kind [self.expression] - Column.Value self.name self.connection actual_new_type new_expr self.context + make_unary_op self op_kind new_type=Nothing = self.make_op op_kind [] new_type ## UNSTABLE @@ -328,8 +340,7 @@ type Column column fit between the lower and upper bounds (both ends inclusive). between : (Column | Any) -> (Column | Any) -> Column between self lower upper = - # TODO change this to use SQL BETWEEN! - (self >= lower) && (self <= upper) + self.make_op "BETWEEN" [lower, upper] new_type=SQL_Type.boolean ## UNSTABLE diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Base_Generator.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Base_Generator.enso index a4040dd1acbc..8405041367ef 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Base_Generator.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Base_Generator.enso @@ -171,25 +171,35 @@ base_dialect = arith = [bin "+", bin "-", bin "*", bin "/"] logic = [bin "AND", bin "OR", unary "NOT"] - compare = [bin "=", bin "!=", bin "<", bin ">", bin "<=", bin ">="] + compare = [bin "=", bin "!=", bin "<", bin ">", bin "<=", bin ">=", ["BETWEEN", make_between]] agg = [fun "MAX", fun "MIN", fun "AVG", fun "SUM"] counts = [fun "COUNT", ["COUNT_ROWS", make_constant "COUNT(*)"]] - text = [["ISEMPTY", is_empty], bin "LIKE"] + text = [["ISEMPTY", make_is_empty], bin "LIKE"] nulls = [["ISNULL", make_right_unary_op "IS NULL"], ["FILLNULL", make_function "COALESCE"]] base_map = Map.from_vector (arith + logic + compare + agg + counts + text + nulls) Internal_Dialect.Value base_map wrap_in_quotes ## PRIVATE -is_empty : Vector Builder -> Builder -is_empty = arguments -> - case arguments.length == 1 of - True -> - arg = arguments.at 0 - is_null = (arg ++ " IS NULL").paren - is_empty = (arg ++ " = ''").paren - (is_null ++ " OR " ++ is_empty).paren - False -> - Error.throw <| Illegal_State_Error_Data ("Invalid amount of arguments for operation ISEMPTY") +make_is_empty : Vector Builder -> Builder +make_is_empty arguments = case arguments.length of + 1 -> + arg = arguments.at 0 + is_null = (arg ++ " IS NULL").paren + is_empty = (arg ++ " = ''").paren + (is_null ++ " OR " ++ is_empty).paren + _ -> + Error.throw <| Illegal_State_Error_Data ("Invalid amount of arguments for operation ISEMPTY") + +## PRIVATE +make_between : Vector Builder -> Builder +make_between arguments = case arguments.length of + 3 -> + expr = arguments.at 0 + lower = arguments.at 1 + upper = arguments.at 2 + (expr ++ " BETWEEN " ++ lower ++ " AND " ++ upper).paren + _ -> + Error.throw <| Illegal_State_Error_Data ("Invalid amount of arguments for operation BETWEEN") ## PRIVATE diff --git a/test/Table_Tests/src/Database/Codegen_Spec.enso b/test/Table_Tests/src/Database/Codegen_Spec.enso index 9d2bda76e2ad..f80810de9f4b 100644 --- a/test/Table_Tests/src/Database/Codegen_Spec.enso +++ b/test/Table_Tests/src/Database/Codegen_Spec.enso @@ -98,6 +98,16 @@ spec = c2 = t1.filter (t1.at "A" == t1.at "C") . at "B" c2.to_sql.prepare . should_equal ['SELECT "T1"."B" AS "B" FROM "T1" AS "T1" WHERE ("T1"."A" = "T1"."C")', []] + Test.specify "should generate a single BETWEEN expression" <| + t2 = t1.filter "A" (Filter_Condition.Between 10 20) + t2.to_sql.prepare . should_equal ['SELECT "T1"."A" AS "A", "T1"."B" AS "B", "T1"."C" AS "C" FROM "T1" AS "T1" WHERE ("T1"."A" BETWEEN ? AND ?)', [[10, int], [20, int]]] + + t3 = t1.filter "A" (Filter_Condition.Between (t1.at "B") (t1.at "C")) + t3.to_sql.prepare . should_equal ['SELECT "T1"."A" AS "A", "T1"."B" AS "B", "T1"."C" AS "C" FROM "T1" AS "T1" WHERE ("T1"."A" BETWEEN "T1"."B" AND "T1"."C")', []] + + t4 = t1.filter "A" (Filter_Condition.Between (t1.at "B") 33) + t4.to_sql.prepare . should_equal ['SELECT "T1"."A" AS "A", "T1"."B" AS "B", "T1"."C" AS "C" FROM "T1" AS "T1" WHERE ("T1"."A" BETWEEN "T1"."B" AND ?)', [[33, int]]] + Test.group "[Codegen] Joining Tables" <| t2 = test_connection.query (SQL_Query.Table_Name "T2") t3 = test_connection.query (SQL_Query.Table_Name "T3") From 72458e0bf122fd7490e15cd234822419ba36c527 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Thu, 6 Oct 2022 20:17:11 +0200 Subject: [PATCH 10/20] fix after rebase --- distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso index 407a99190cd2..22a30d1a359d 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso @@ -1,5 +1,4 @@ from Standard.Base import all -import Standard.Base.Data.Filter_Condition as Filter_Condition_Module import Standard.Base.Data.Ordering.Comparator import Standard.Base.Data.Index_Sub_Range @@ -464,7 +463,7 @@ type Column Returns a column of booleans, with `True` items at the positions where this column contains an empty string or `Nothing`. is_empty : Column - is_empty self = run_vectorized_unary_op self "is_empty" Filter_Condition_Module.is_empty + is_empty self = run_vectorized_unary_op self "is_empty" Filter_Condition.Is_Empty.to_predicate ## Returns a column of booleans, with `True` items at the positions where this column does not contain a `Nothing`. From 777fa28da4a56371dce0edb911f63f98001bddb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Fri, 7 Oct 2022 11:42:45 +0200 Subject: [PATCH 11/20] Precompile pattern only once --- .../column/operation/map/text/LikeOp.java | 51 +++++++++++++++++++ .../data/column/storage/StringStorage.java | 9 +--- 2 files changed, 53 insertions(+), 7 deletions(-) create mode 100644 std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/LikeOp.java diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/LikeOp.java b/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/LikeOp.java new file mode 100644 index 000000000000..dfc8f13b9af7 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/LikeOp.java @@ -0,0 +1,51 @@ +package org.enso.table.data.column.operation.map.text; + +import org.enso.base.Regex_Utils; +import org.enso.table.data.column.operation.map.MapOperation; +import org.enso.table.data.column.storage.BoolStorage; +import org.enso.table.data.column.storage.SpecializedStorage; +import org.enso.table.data.column.storage.Storage; +import org.enso.table.data.column.storage.StringStorage; +import org.enso.table.error.UnexpectedTypeException; + +import java.util.BitSet; +import java.util.regex.Pattern; + +public class LikeOp extends StringBooleanOp { + public LikeOp() { + super(Storage.Maps.LIKE); + } + + private Pattern createRegexPatternFromSql(String sqlPattern) { + return Pattern.compile(Regex_Utils.sql_like_pattern_to_regex(sqlPattern)); + } + + @Override + protected boolean doString(String a, String b) { + return createRegexPatternFromSql(b).matcher(a).matches(); + } + + @Override + public Storage runMap(SpecializedStorage storage, Object arg) { + if (arg == null) { + BitSet newVals = new BitSet(); + BitSet newMissing = new BitSet(); + newMissing.set(0, storage.size()); + return new BoolStorage(newVals, newMissing, storage.size(), false); + } else if (arg instanceof String argString) { + Pattern pattern = createRegexPatternFromSql(argString); + BitSet newVals = new BitSet(); + BitSet newMissing = new BitSet(); + for (int i = 0; i < storage.size(); i++) { + if (storage.isNa(i)) { + newMissing.set(i); + } else if (pattern.matcher(storage.getItem(i)).matches()) { + newVals.set(i); + } + } + return new BoolStorage(newVals, newMissing, storage.size(), false); + } else { + throw new UnexpectedTypeException("a Text"); + } + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java index e0846b7c69e4..fc1422b6b5c2 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java @@ -9,6 +9,7 @@ import org.enso.table.data.column.operation.map.MapOpStorage; import org.enso.table.data.column.operation.map.MapOperation; import org.enso.table.data.column.operation.map.UnaryMapOperation; +import org.enso.table.data.column.operation.map.text.LikeOp; import org.enso.table.data.column.operation.map.text.StringBooleanOp; import org.graalvm.polyglot.Value; @@ -132,13 +133,7 @@ protected boolean doString(String a, String b) { return Text_Utils.contains(a, b); } }); - t.add( - new StringBooleanOp(Maps.LIKE) { - @Override - protected boolean doString(String a, String b) { - return Pattern.compile(Regex_Utils.sql_like_pattern_to_regex(b)).matcher(a).matches(); - } - }); + t.add(new LikeOp()); return t; } } From 50b782831e2e7597f4ff2e76ba6e5342f6eae035 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Fri, 7 Oct 2022 11:47:09 +0200 Subject: [PATCH 12/20] changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1372ecb5c67e..2330bccdedd9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -207,6 +207,8 @@ - [Added `Date_Period.Week` to `start_of` and `end_of` methods.][3733] - [Replaced `Table.where` with a new API relying on `Table.filter`.][3750] - [Added `Filter_Condition` to `Vector`, `Range` and `List`.][3770] +- [Extended `Filter_Condition` with `Is_Empty`, `Not_Empty`, `Like` and + `Not_Like`.][3775] [debug-shortcuts]: https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug @@ -332,6 +334,7 @@ [3749]: https://github.com/enso-org/enso/pull/3749 [3750]: https://github.com/enso-org/enso/pull/3750 [3770]: https://github.com/enso-org/enso/pull/3770 +[3775]: https://github.com/enso-org/enso/pull/3775 #### Enso Compiler From 1bbece0ba70d34ae055d06db64cd1d6b9eec3bc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Fri, 7 Oct 2022 11:50:28 +0200 Subject: [PATCH 13/20] add a test --- test/Table_Tests/src/Common_Table_Spec.enso | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/Table_Tests/src/Common_Table_Spec.enso b/test/Table_Tests/src/Common_Table_Spec.enso index c3b748282131..00bb2f1e4d75 100644 --- a/test/Table_Tests/src/Common_Table_Spec.enso +++ b/test/Table_Tests/src/Common_Table_Spec.enso @@ -1119,7 +1119,7 @@ spec prefix table_builder test_selection pending=Nothing = t.filter "X" (Filter_Condition.Between (t.at "Y") "bzzzz") . at "X" . to_vector . should_equal ["abb", "baca", "b"] Test.specify "by text search (contains, starts_with, ends_with, like)" <| - t = table_builder [["ix", [1, 2, 3, 4, 5]], ["X", ["abb", "baca", "banana", Nothing, "nana"]], ["Y", ["a", "b", "b", "c", "a"]]] + t = table_builder [["ix", [1, 2, 3, 4, 5]], ["X", ["abb", "baca", "banana", Nothing, "nana"]], ["Y", ["a", "b", "b", "c", "a"]], ["Z", ["aaaaa", "bbbbb", "[ab]", "[ab]aaaa", "[ab]ccc"]]] t.filter "X" (Filter_Condition.Starts_With "ba") on_problems=Report_Error . at "X" . to_vector . should_equal ["baca", "banana"] t.filter "X" (Filter_Condition.Ends_With "na") on_problems=Report_Error . at "X" . to_vector . should_equal ["banana", "nana"] @@ -1133,9 +1133,12 @@ spec prefix table_builder test_selection pending=Nothing = t.filter "X" (Filter_Condition.Like "_a%") on_problems=Report_Error . at "X" . to_vector . should_equal ["baca", "banana", "nana"] t.filter "X" (Filter_Condition.Like "%b") on_problems=Report_Error . at "X" . to_vector . should_equal ["abb"] t.filter "X" (Filter_Condition.Like "nana") on_problems=Report_Error . at "X" . to_vector . should_equal ["nana"] + t.filter "Z" (Filter_Condition.Like "[ab]_%") on_problems=Report_Error . at "Z" . to_vector . should_equal ["[ab]aaaa", "[ab]ccc"] + t.filter "X" (Filter_Condition.Not_Like "%b") on_problems=Report_Error . at "X" . to_vector . should_equal ["baca", "banana", "nana"] + t.filter "Z" (Filter_Condition.Not_Like "[ab]%") on_problems=Report_Error . at "Z" . to_vector . should_equal ["aaaaa", "bbbbb"] - Test.specify "by empty text search" <| + Test.specify "by empty text" <| t = table_builder [["ix", [1, 2, 3, 4, 5]], ["X", ["abb", "", " ", Nothing, "nana"]]] t.filter "X" Filter_Condition.Is_Empty on_problems=Report_Error . at "X" . to_vector . should_equal ["", Nothing] t.filter "X" Filter_Condition.Not_Empty on_problems=Report_Error . at "X" . to_vector . should_equal ["abb", " ", "nana"] From 4a184a5ec22995c5b3cd8fb479e8f0887560ea25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Fri, 7 Oct 2022 12:12:38 +0200 Subject: [PATCH 14/20] formatting --- .../base/src/main/java/org/enso/base/Regex_Utils.java | 9 ++++++--- .../table/data/column/operation/map/text/LikeOp.java | 7 ++----- .../enso/table/data/column/storage/StringStorage.java | 3 --- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/std-bits/base/src/main/java/org/enso/base/Regex_Utils.java b/std-bits/base/src/main/java/org/enso/base/Regex_Utils.java index 619c42057bd7..8edde276714c 100644 --- a/std-bits/base/src/main/java/org/enso/base/Regex_Utils.java +++ b/std-bits/base/src/main/java/org/enso/base/Regex_Utils.java @@ -71,9 +71,11 @@ public static String[] find_all_matches(String regex, String text) { return allMatches.toArray(new String[0]); } - /** Converts a SQL-like pattern into a Regex with the same semantics. + /** + * Converts a SQL-like pattern into a Regex with the same semantics. * - * Special regex characters present in the input pattern are quoted to match them literally according to the SQL-like format. + *

Special regex characters present in the input pattern are quoted to match them literally + * according to the SQL-like format. */ public static String sql_like_pattern_to_regex(String sql_pattern) { StringBuilder result = new StringBuilder(); @@ -82,7 +84,8 @@ public static String sql_like_pattern_to_regex(String sql_pattern) { for (int i = 0; i < sql_pattern.length(); ++i) { char c = sql_pattern.charAt(i); if (c == '%' || c == '_') { - // Before inserting the converted wildcard, we append the accumulated characters, quoting them first. + // Before inserting the converted wildcard, we append the accumulated characters, quoting + // them first. if (acc.length() > 0) { result.append(Pattern.quote(acc.toString())); acc.setLength(0); diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/LikeOp.java b/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/LikeOp.java index dfc8f13b9af7..15761bef4f5c 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/LikeOp.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/LikeOp.java @@ -1,16 +1,13 @@ package org.enso.table.data.column.operation.map.text; +import java.util.BitSet; +import java.util.regex.Pattern; import org.enso.base.Regex_Utils; -import org.enso.table.data.column.operation.map.MapOperation; import org.enso.table.data.column.storage.BoolStorage; import org.enso.table.data.column.storage.SpecializedStorage; import org.enso.table.data.column.storage.Storage; -import org.enso.table.data.column.storage.StringStorage; import org.enso.table.error.UnexpectedTypeException; -import java.util.BitSet; -import java.util.regex.Pattern; - public class LikeOp extends StringBooleanOp { public LikeOp() { super(Storage.Maps.LIKE); diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java index fc1422b6b5c2..f77b54b0bbc7 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java @@ -1,9 +1,6 @@ package org.enso.table.data.column.storage; import java.util.BitSet; -import java.util.regex.Pattern; - -import org.enso.base.Regex_Utils; import org.enso.base.Text_Utils; import org.enso.table.data.column.builder.object.StringBuilder; import org.enso.table.data.column.operation.map.MapOpStorage; From 593ef02be2b9d4d869c4e3d6e71b2a328f37fa52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Fri, 7 Oct 2022 13:48:22 +0200 Subject: [PATCH 15/20] Fix after rebase --- .../lib/Standard/Database/0.0.0-dev/src/Data/Column.enso | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso index 90d61191550d..6d7a467a3084 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso @@ -133,12 +133,12 @@ type Column Error.throw <| Unsupported_Database_Operation_Error "Cannot use columns coming from different contexts in one expression without a join." constant -> actual_operand_type = operand_type.if_nothing self.sql_type - IR.make_constant actual_operand_type constant + Expression.Constant actual_operand_type constant actual_operand_types = operand_types.if_nothing (Vector.fill operands.length Nothing) expressions = operands.zip actual_operand_types prepare_operand actual_new_type = new_type.if_nothing self.sql_type - new_expr = IR.Operation op_kind ([self.expression] + expressions) + new_expr = Expression.Operation op_kind ([self.expression] + expressions) Column.Value self.name self.connection actual_new_type new_expr self.context ## PRIVATE From 6711066441029282db79895922db8e6b1a13ae77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Mon, 10 Oct 2022 12:41:48 +0200 Subject: [PATCH 16/20] Add test for newlines in wildcards --- test/Table_Tests/src/Common_Table_Spec.enso | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/Table_Tests/src/Common_Table_Spec.enso b/test/Table_Tests/src/Common_Table_Spec.enso index 00bb2f1e4d75..dd03ad2f0f74 100644 --- a/test/Table_Tests/src/Common_Table_Spec.enso +++ b/test/Table_Tests/src/Common_Table_Spec.enso @@ -1138,6 +1138,13 @@ spec prefix table_builder test_selection pending=Nothing = t.filter "X" (Filter_Condition.Not_Like "%b") on_problems=Report_Error . at "X" . to_vector . should_equal ["baca", "banana", "nana"] t.filter "Z" (Filter_Condition.Not_Like "[ab]%") on_problems=Report_Error . at "Z" . to_vector . should_equal ["aaaaa", "bbbbb"] + Test.specify "like wildcards should also match newlines" <| + t = table_builder [["X", ['a\n\n\n', 'a\n', 'a\n\n\nb', 'a\nb', 'caa\nbb']]] + t.filter "X" (Filter_Condition.Like 'a_') on_problems=Report_Error . at "X" . to_vector . should_equal ['a\n'] + t.filter "X" (Filter_Condition.Like 'a%') on_problems=Report_Error . at "X" . to_vector . should_equal ['a\n\n\n', 'a\n', 'a\n\n\nb', 'a\nb'] + t.filter "X" (Filter_Condition.Like 'a_b') on_problems=Report_Error . at "X" . to_vector . should_equal ['a\nb'] + t.filter "X" (Filter_Condition.Like '%\nb') on_problems=Report_Error . at "X" . to_vector . should_equal ['a\n\n\nb', 'a\nb'] + Test.specify "by empty text" <| t = table_builder [["ix", [1, 2, 3, 4, 5]], ["X", ["abb", "", " ", Nothing, "nana"]]] t.filter "X" Filter_Condition.Is_Empty on_problems=Report_Error . at "X" . to_vector . should_equal ["", Nothing] From 78906f052403fbbcbd44be43cbcd9405b30c1d7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Mon, 10 Oct 2022 13:17:47 +0200 Subject: [PATCH 17/20] More tests, fix regex in Table --- .../enso/table/data/column/operation/map/text/LikeOp.java | 6 +++++- test/Table_Tests/src/Common_Table_Spec.enso | 7 ++++++- test/Tests/src/Data/Vector_Spec.enso | 6 ++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/LikeOp.java b/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/LikeOp.java index 15761bef4f5c..63bf38c8b6d4 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/LikeOp.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/LikeOp.java @@ -13,8 +13,12 @@ public LikeOp() { super(Storage.Maps.LIKE); } + /** These flags should be consistent with default Enso regex settings. */ + private final static int UNICODE_REGEX = Pattern.CANON_EQ | Pattern.UNICODE_CHARACTER_CLASS | Pattern.UNICODE_CASE; + private final static int REGEX_FLAGS = UNICODE_REGEX | Pattern.DOTALL; + private Pattern createRegexPatternFromSql(String sqlPattern) { - return Pattern.compile(Regex_Utils.sql_like_pattern_to_regex(sqlPattern)); + return Pattern.compile(Regex_Utils.sql_like_pattern_to_regex(sqlPattern), REGEX_FLAGS); } @Override diff --git a/test/Table_Tests/src/Common_Table_Spec.enso b/test/Table_Tests/src/Common_Table_Spec.enso index dd03ad2f0f74..2138329a4c21 100644 --- a/test/Table_Tests/src/Common_Table_Spec.enso +++ b/test/Table_Tests/src/Common_Table_Spec.enso @@ -1138,13 +1138,18 @@ spec prefix table_builder test_selection pending=Nothing = t.filter "X" (Filter_Condition.Not_Like "%b") on_problems=Report_Error . at "X" . to_vector . should_equal ["baca", "banana", "nana"] t.filter "Z" (Filter_Condition.Not_Like "[ab]%") on_problems=Report_Error . at "Z" . to_vector . should_equal ["aaaaa", "bbbbb"] - Test.specify "like wildcards should also match newlines" <| + Test.specify "text operations should also match newlines" <| t = table_builder [["X", ['a\n\n\n', 'a\n', 'a\n\n\nb', 'a\nb', 'caa\nbb']]] t.filter "X" (Filter_Condition.Like 'a_') on_problems=Report_Error . at "X" . to_vector . should_equal ['a\n'] t.filter "X" (Filter_Condition.Like 'a%') on_problems=Report_Error . at "X" . to_vector . should_equal ['a\n\n\n', 'a\n', 'a\n\n\nb', 'a\nb'] t.filter "X" (Filter_Condition.Like 'a_b') on_problems=Report_Error . at "X" . to_vector . should_equal ['a\nb'] t.filter "X" (Filter_Condition.Like '%\nb') on_problems=Report_Error . at "X" . to_vector . should_equal ['a\n\n\nb', 'a\nb'] + t.filter "X" (Filter_Condition.Contains '\nb') on_problems=Report_Error . at "X" . to_vector . should_equal ['a\n\n\nb', 'a\nb', 'caa\nbb'] + t.filter "X" (Filter_Condition.Ends_With '\nb') on_problems=Report_Error . at "X" . to_vector . should_equal ['a\n\n\nb', 'a\nb'] + t.filter "X" (Filter_Condition.Ends_With '\n') on_problems=Report_Error . at "X" . to_vector . should_equal ['a\n\n\n', 'a\n'] + t.filter "X" (Filter_Condition.Starts_With 'c') on_problems=Report_Error . at "X" . to_vector . should_equal ['caa\nbb'] + Test.specify "by empty text" <| t = table_builder [["ix", [1, 2, 3, 4, 5]], ["X", ["abb", "", " ", Nothing, "nana"]]] t.filter "X" Filter_Condition.Is_Empty on_problems=Report_Error . at "X" . to_vector . should_equal ["", Nothing] diff --git a/test/Tests/src/Data/Vector_Spec.enso b/test/Tests/src/Data/Vector_Spec.enso index bb4ba95854ed..0ed5141a6224 100644 --- a/test/Tests/src/Data/Vector_Spec.enso +++ b/test/Tests/src/Data/Vector_Spec.enso @@ -168,6 +168,12 @@ spec = Test.group "Vectors" <| txtvec.filter (Filter_Condition.Between "b" "c") . should_equal ["bbb", "baaa"] Test.expect_panic_with (txtvec.filter (Filter_Condition.Starts_With 42)) Unsupported_Argument_Types_Data + txt2 = ['a\n\n\n', 'a\n', 'a\n\n\nb', 'a\nb', 'caa\nbb'] + txt2.filter "X" (Filter_Condition.Like 'a_') . should_equal ['a\n'] + txt2.filter "X" (Filter_Condition.Like 'a%') . should_equal ['a\n\n\n', 'a\n', 'a\n\n\nb', 'a\nb'] + txt2.filter "X" (Filter_Condition.Like 'a_b') . should_equal ['a\nb'] + txt2.filter "X" (Filter_Condition.Like '%\nb') . should_equal ['a\n\n\nb', 'a\nb'] + ["", Nothing, " ", "a"].filter (Filter_Condition.Is_Empty) . should_equal ["", Nothing] ["", Nothing, " ", "a"].filter (Filter_Condition.Not_Empty) . should_equal [" ", "a"] ["abab", "aaabaaaa", "ba"].filter (Filter_Condition.Like "ba") . should_equal ["ba"] From 07be26a0566462ccf7639e677138b091066fef99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Mon, 10 Oct 2022 13:23:28 +0200 Subject: [PATCH 18/20] Fix Like in Base types --- .../Base/0.0.0-dev/src/Data/Filter_Condition.enso | 2 +- test/Tests/src/Data/Vector_Spec.enso | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Filter_Condition.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Filter_Condition.enso index 326c0a8feb05..78dbfcb34025 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Filter_Condition.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Filter_Condition.enso @@ -130,4 +130,4 @@ type Filter_Condition ## PRIVATE sql_like_to_regex sql_pattern = regex_pattern = Regex_Utils.sql_like_pattern_to_regex sql_pattern - Regex.compile regex_pattern + Regex.compile regex_pattern dot_matches_newline=True diff --git a/test/Tests/src/Data/Vector_Spec.enso b/test/Tests/src/Data/Vector_Spec.enso index 0ed5141a6224..e6782d421c13 100644 --- a/test/Tests/src/Data/Vector_Spec.enso +++ b/test/Tests/src/Data/Vector_Spec.enso @@ -169,10 +169,10 @@ spec = Test.group "Vectors" <| Test.expect_panic_with (txtvec.filter (Filter_Condition.Starts_With 42)) Unsupported_Argument_Types_Data txt2 = ['a\n\n\n', 'a\n', 'a\n\n\nb', 'a\nb', 'caa\nbb'] - txt2.filter "X" (Filter_Condition.Like 'a_') . should_equal ['a\n'] - txt2.filter "X" (Filter_Condition.Like 'a%') . should_equal ['a\n\n\n', 'a\n', 'a\n\n\nb', 'a\nb'] - txt2.filter "X" (Filter_Condition.Like 'a_b') . should_equal ['a\nb'] - txt2.filter "X" (Filter_Condition.Like '%\nb') . should_equal ['a\n\n\nb', 'a\nb'] + txt2.filter (Filter_Condition.Like 'a_') . should_equal ['a\n'] + txt2.filter (Filter_Condition.Like 'a%') . should_equal ['a\n\n\n', 'a\n', 'a\n\n\nb', 'a\nb'] + txt2.filter (Filter_Condition.Like 'a_b') . should_equal ['a\nb'] + txt2.filter (Filter_Condition.Like '%\nb') . should_equal ['a\n\n\nb', 'a\nb'] ["", Nothing, " ", "a"].filter (Filter_Condition.Is_Empty) . should_equal ["", Nothing] ["", Nothing, " ", "a"].filter (Filter_Condition.Not_Empty) . should_equal [" ", "a"] From 0cbf8d103ce1511fbe39297ea6ecc93264cf196e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Mon, 10 Oct 2022 16:37:29 +0200 Subject: [PATCH 19/20] Add some Unicode tests, some of them pending due to a bug --- .../column/operation/map/text/LikeOp.java | 6 +++++- test/Table_Tests/src/Common_Table_Spec.enso | 15 ++++++++++++- test/Table_Tests/src/Table_Spec.enso | 2 +- test/Tests/src/Data/Vector_Spec.enso | 21 +++++++++++++------ 4 files changed, 35 insertions(+), 9 deletions(-) diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/LikeOp.java b/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/LikeOp.java index 63bf38c8b6d4..251cc3aa8d00 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/LikeOp.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/LikeOp.java @@ -2,6 +2,8 @@ import java.util.BitSet; import java.util.regex.Pattern; + +import com.ibm.icu.impl.UnicodeRegex; import org.enso.base.Regex_Utils; import org.enso.table.data.column.storage.BoolStorage; import org.enso.table.data.column.storage.SpecializedStorage; @@ -18,7 +20,9 @@ public LikeOp() { private final static int REGEX_FLAGS = UNICODE_REGEX | Pattern.DOTALL; private Pattern createRegexPatternFromSql(String sqlPattern) { - return Pattern.compile(Regex_Utils.sql_like_pattern_to_regex(sqlPattern), REGEX_FLAGS); + String regex = Regex_Utils.sql_like_pattern_to_regex(sqlPattern); + String unicodeTransformed = UnicodeRegex.fix(regex); + return Pattern.compile(unicodeTransformed, REGEX_FLAGS); } @Override diff --git a/test/Table_Tests/src/Common_Table_Spec.enso b/test/Table_Tests/src/Common_Table_Spec.enso index 2138329a4c21..f501d6ff6c75 100644 --- a/test/Table_Tests/src/Common_Table_Spec.enso +++ b/test/Table_Tests/src/Common_Table_Spec.enso @@ -16,7 +16,7 @@ import Standard.Test.Problems from project.Util import all type Test_Selection - Config supports_case_sensitive_columns=True order_by=True natural_ordering=False case_insensitive_ordering=True order_by_unicode_normalization_by_default=False case_insensitive_ascii_only=False take_drop=True allows_mixed_type_comparisons=True + Config supports_case_sensitive_columns=True order_by=True natural_ordering=False case_insensitive_ordering=True order_by_unicode_normalization_by_default=False case_insensitive_ascii_only=False take_drop=True allows_mixed_type_comparisons=True supports_unicode_normalization=False ## A common test suite for shared operations on the Table API. @@ -1150,6 +1150,19 @@ spec prefix table_builder test_selection pending=Nothing = t.filter "X" (Filter_Condition.Ends_With '\n') on_problems=Report_Error . at "X" . to_vector . should_equal ['a\n\n\n', 'a\n'] t.filter "X" (Filter_Condition.Starts_With 'c') on_problems=Report_Error . at "X" . to_vector . should_equal ['caa\nbb'] + if test_selection.supports_unicode_normalization then + Test.specify "text operations should support Unicode normalization" <| + t = table_builder [["X", ['śnieg', 's\u0301nieg', 'X', Nothing, 'połać', 'połac\u0301']]] + t.filter "X" (Filter_Condition.Starts_With 'ś') on_problems=Report_Error . at "X" . to_vector . should_equal ['śnieg', 's\u0301nieg'] + t.filter "X" (Filter_Condition.Contains 'ś') on_problems=Report_Error . at "X" . to_vector . should_equal ['śnieg', 's\u0301nieg'] + t.filter "X" (Filter_Condition.Ends_With 'ś') on_problems=Report_Error . at "X" . to_vector . should_equal [] + t.filter "X" (Filter_Condition.Ends_With 'ć') on_problems=Report_Error . at "X" . to_vector . should_equal ['połać', 'połac\u0301'] + + # This test is split off just to mark is as pending, once resolved it can be merged with the one above. + Test.specify "text operations should support Unicode normalization (like)" pending='There is a bug with Java Regex in Unicode normalized mode (CANON_EQ) with quoting.\nhttps://bugs.java.com/bugdatabase/view_bug.do?bug_id=8032926' <| + t = table_builder [["X", ['śnieg', 's\u0301nieg', 'X', Nothing, 'połać', 'połac\u0301']]] + t.filter "X" (Filter_Condition.Like 'ś%') on_problems=Report_Error . at "X" . to_vector . should_equal ['śnieg', 's\u0301nieg'] + Test.specify "by empty text" <| t = table_builder [["ix", [1, 2, 3, 4, 5]], ["X", ["abb", "", " ", Nothing, "nana"]]] t.filter "X" Filter_Condition.Is_Empty on_problems=Report_Error . at "X" . to_vector . should_equal ["", Nothing] diff --git a/test/Table_Tests/src/Table_Spec.enso b/test/Table_Tests/src/Table_Spec.enso index 69d833c7ee0f..597ad17b3290 100644 --- a/test/Table_Tests/src/Table_Spec.enso +++ b/test/Table_Tests/src/Table_Spec.enso @@ -670,7 +670,7 @@ spec = t_3 = Table.new [c_3_1, c_3_2, c_3_3] t_3.default_visualization.should_equal Visualization.Id.table - selection = Common_Table_Spec.Test_Selection.Config supports_case_sensitive_columns=True order_by=True natural_ordering=True case_insensitive_ordering=True order_by_unicode_normalization_by_default=True + selection = Common_Table_Spec.Test_Selection.Config supports_case_sensitive_columns=True order_by=True natural_ordering=True case_insensitive_ordering=True order_by_unicode_normalization_by_default=True supports_unicode_normalization=True Common_Table_Spec.spec "[In-Memory] " table_builder=Table.new test_selection=selection Test.group "Use First Row As Names" <| diff --git a/test/Tests/src/Data/Vector_Spec.enso b/test/Tests/src/Data/Vector_Spec.enso index e6782d421c13..a64963e654fd 100644 --- a/test/Tests/src/Data/Vector_Spec.enso +++ b/test/Tests/src/Data/Vector_Spec.enso @@ -168,12 +168,6 @@ spec = Test.group "Vectors" <| txtvec.filter (Filter_Condition.Between "b" "c") . should_equal ["bbb", "baaa"] Test.expect_panic_with (txtvec.filter (Filter_Condition.Starts_With 42)) Unsupported_Argument_Types_Data - txt2 = ['a\n\n\n', 'a\n', 'a\n\n\nb', 'a\nb', 'caa\nbb'] - txt2.filter (Filter_Condition.Like 'a_') . should_equal ['a\n'] - txt2.filter (Filter_Condition.Like 'a%') . should_equal ['a\n\n\n', 'a\n', 'a\n\n\nb', 'a\nb'] - txt2.filter (Filter_Condition.Like 'a_b') . should_equal ['a\nb'] - txt2.filter (Filter_Condition.Like '%\nb') . should_equal ['a\n\n\nb', 'a\nb'] - ["", Nothing, " ", "a"].filter (Filter_Condition.Is_Empty) . should_equal ["", Nothing] ["", Nothing, " ", "a"].filter (Filter_Condition.Not_Empty) . should_equal [" ", "a"] ["abab", "aaabaaaa", "ba"].filter (Filter_Condition.Like "ba") . should_equal ["ba"] @@ -184,6 +178,21 @@ spec = Test.group "Vectors" <| ["f.txt", "abc.*"].filter (Filter_Condition.Like "%.*") . should_equal ["abc.*"] ["f.txt", "abc.*"].filter (Filter_Condition.Not_Like "%.*") . should_equal ["f.txt"] + txt2 = ['a\n\n\n', 'a\n', 'a\n\n\nb', 'a\nb', 'caa\nbb'] + txt2.filter (Filter_Condition.Like 'a_') . should_equal ['a\n'] + txt2.filter (Filter_Condition.Like 'a%') . should_equal ['a\n\n\n', 'a\n', 'a\n\n\nb', 'a\nb'] + txt2.filter (Filter_Condition.Like 'a_b') . should_equal ['a\nb'] + txt2.filter (Filter_Condition.Like '%\nb') . should_equal ['a\n\n\nb', 'a\nb'] + + txt3 = ['śnieg', 's\u0301nieg', 'X', 'połać', 'połac\u0301'] + txt3.filter (Filter_Condition.Starts_With 'ś') . should_equal ['śnieg', 's\u0301nieg'] + txt3.filter (Filter_Condition.Contains 'ś') . should_equal ['śnieg', 's\u0301nieg'] + txt3.filter (Filter_Condition.Ends_With 'ś') . should_equal [] + txt3.filter (Filter_Condition.Ends_With 'ć') . should_equal ['połać', 'połac\u0301'] + ## There is a bug with Java Regex in Unicode normalized mode (CANON_EQ) with quoting. + https://bugs.java.com/bugdatabase/view_bug.do?bug_id=8032926 + # txt3.filter (Filter_Condition.Like 'ś%') . should_equal ['śnieg', 's\u0301nieg'] + mixed = [1, Nothing, "b"] mixed.filter Filter_Condition.Is_Nothing . should_equal [Nothing] mixed.filter Filter_Condition.Not_Nothing . should_equal [1, "b"] From f0a82e0eadd3f78910f1fefc6f0c3fda0a9a8af2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Mon, 10 Oct 2022 20:01:08 +0200 Subject: [PATCH 20/20] Add comments on the known bug, make the fix better --- .../0.0.0-dev/src/Data/Filter_Condition.enso | 21 ++++++++++++++++++- .../column/operation/map/text/LikeOp.java | 10 ++++++--- test/Table_Tests/src/Common_Table_Spec.enso | 6 ++++-- test/Tests/src/Data/Vector_Spec.enso | 2 ++ 4 files changed, 33 insertions(+), 6 deletions(-) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Filter_Condition.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Filter_Condition.enso index 78dbfcb34025..f22c29621dfc 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Filter_Condition.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Filter_Condition.enso @@ -75,6 +75,14 @@ type Filter_Condition - the `%` character matches any sequence of characters, - the `_` character matches any single character, - any other character is matched literally. + + ! Known Bugs + There is a known bug in Java Regex where escape characters are not + handled properly in Unicode-normalized matching mode. Due to this + limitation, Unicode normalization has been disabled for this function, + so beware that some equivalent graphemes like 'ś' and 's\u0301' will + not be matched. + See https://bugs.java.com/bugdatabase/view_bug.do?bug_id=8032926 Like pattern:Text ## Does the value not match the SQL pattern (Text only)? @@ -87,6 +95,14 @@ type Filter_Condition - the `%` character matches any sequence of characters, - the `_` character matches any single character, - any other character is matched literally. + + ! Known Bugs + There is a known bug in Java Regex where escape characters are not + handled properly in Unicode-normalized matching mode. Due to this + limitation, Unicode normalization has been disabled for this function, + so beware that some equivalent graphemes like 'ś' and 's\u0301' will + not be matched. + See https://bugs.java.com/bugdatabase/view_bug.do?bug_id=8032926 Not_Like pattern:Text ## Converts a `Filter_Condition` condition into a predicate taking an @@ -130,4 +146,7 @@ type Filter_Condition ## PRIVATE sql_like_to_regex sql_pattern = regex_pattern = Regex_Utils.sql_like_pattern_to_regex sql_pattern - Regex.compile regex_pattern dot_matches_newline=True + ## There is a bug with Java Regex in Unicode normalized mode (CANON_EQ) with quoting. + https://bugs.java.com/bugdatabase/view_bug.do?bug_id=8032926 + Once that bug is fixed, `match_ascii` may be set back to `False`. + Regex.compile regex_pattern dot_matches_newline=True match_ascii=True diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/LikeOp.java b/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/LikeOp.java index 251cc3aa8d00..0963d6ab9997 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/LikeOp.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/LikeOp.java @@ -15,9 +15,13 @@ public LikeOp() { super(Storage.Maps.LIKE); } - /** These flags should be consistent with default Enso regex settings. */ - private final static int UNICODE_REGEX = Pattern.CANON_EQ | Pattern.UNICODE_CHARACTER_CLASS | Pattern.UNICODE_CASE; - private final static int REGEX_FLAGS = UNICODE_REGEX | Pattern.DOTALL; + + /** + * There is a bug with Java Regex in Unicode normalized mode (CANON_EQ) with quoting. + * Once that bug is fixed, we should add all relevant Unicode flags here too, + * consistently with the Default Enso regex engine. + */ + private final static int REGEX_FLAGS = Pattern.DOTALL; private Pattern createRegexPatternFromSql(String sqlPattern) { String regex = Regex_Utils.sql_like_pattern_to_regex(sqlPattern); diff --git a/test/Table_Tests/src/Common_Table_Spec.enso b/test/Table_Tests/src/Common_Table_Spec.enso index f501d6ff6c75..1603e5cabc80 100644 --- a/test/Table_Tests/src/Common_Table_Spec.enso +++ b/test/Table_Tests/src/Common_Table_Spec.enso @@ -1151,16 +1151,18 @@ spec prefix table_builder test_selection pending=Nothing = t.filter "X" (Filter_Condition.Starts_With 'c') on_problems=Report_Error . at "X" . to_vector . should_equal ['caa\nbb'] if test_selection.supports_unicode_normalization then + t = table_builder [["X", ['śnieg', 's\u0301nieg', 'X', Nothing, 'połać', 'połac\u0301']]] Test.specify "text operations should support Unicode normalization" <| - t = table_builder [["X", ['śnieg', 's\u0301nieg', 'X', Nothing, 'połać', 'połac\u0301']]] t.filter "X" (Filter_Condition.Starts_With 'ś') on_problems=Report_Error . at "X" . to_vector . should_equal ['śnieg', 's\u0301nieg'] t.filter "X" (Filter_Condition.Contains 'ś') on_problems=Report_Error . at "X" . to_vector . should_equal ['śnieg', 's\u0301nieg'] t.filter "X" (Filter_Condition.Ends_With 'ś') on_problems=Report_Error . at "X" . to_vector . should_equal [] t.filter "X" (Filter_Condition.Ends_With 'ć') on_problems=Report_Error . at "X" . to_vector . should_equal ['połać', 'połac\u0301'] + # This should be replaced with the disabled test below, once the related bug is fixed. + t.filter "X" (Filter_Condition.Like 'ś%') on_problems=Report_Error . at "X" . to_vector . should_equal ['śnieg'] + # This test is split off just to mark is as pending, once resolved it can be merged with the one above. Test.specify "text operations should support Unicode normalization (like)" pending='There is a bug with Java Regex in Unicode normalized mode (CANON_EQ) with quoting.\nhttps://bugs.java.com/bugdatabase/view_bug.do?bug_id=8032926' <| - t = table_builder [["X", ['śnieg', 's\u0301nieg', 'X', Nothing, 'połać', 'połac\u0301']]] t.filter "X" (Filter_Condition.Like 'ś%') on_problems=Report_Error . at "X" . to_vector . should_equal ['śnieg', 's\u0301nieg'] Test.specify "by empty text" <| diff --git a/test/Tests/src/Data/Vector_Spec.enso b/test/Tests/src/Data/Vector_Spec.enso index a64963e654fd..8c770d72c499 100644 --- a/test/Tests/src/Data/Vector_Spec.enso +++ b/test/Tests/src/Data/Vector_Spec.enso @@ -192,6 +192,8 @@ spec = Test.group "Vectors" <| ## There is a bug with Java Regex in Unicode normalized mode (CANON_EQ) with quoting. https://bugs.java.com/bugdatabase/view_bug.do?bug_id=8032926 # txt3.filter (Filter_Condition.Like 'ś%') . should_equal ['śnieg', 's\u0301nieg'] + # This should be replaced with the disabled test above, once the related bug is fixed. + txt3.filter (Filter_Condition.Like 'ś%') . should_equal ['śnieg'] mixed = [1, Nothing, "b"] mixed.filter Filter_Condition.Is_Nothing . should_equal [Nothing]