enso-org · mergify · Jan 15, 2024 · Jan 15, 2024 · Jan 15, 2024 · Jan 15, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -601,6 +601,7 @@
   and `Is_Finite`.][8539]
 - [Added text_length to Column][8606]
 - [Added none delimiter option for Data.Read][8627]
+- [Added text_left and text_right to Column][8691]
 
 [debug-shortcuts]:
   https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@@ -862,6 +863,7 @@
 [8564]: https://github.com/enso-org/enso/pull/8564
 [8606]: https://github.com/enso-org/enso/pull/8606
 [8627]: https://github.com/enso-org/enso/pull/8627
+[8691]: https://github.com/enso-org/enso/pull/8691
 
 #### Enso Compiler
 

@@ -1195,6 +1195,48 @@ type Column
             new_name = self.naming_helper.function_name "text_length" [self]
             self.make_unary_op "LENGTH" new_name
 
+    ## GROUP Standard.Base.Text
+       ICON preparation
+       Gets the left `n` characters of each element of the column.
+
+       In the Database backends, the default text left method of the
+       particular database is used. A negative `n` is treated as 0.
+
+       In the in-memory backend, this will give you the left n graphemes of the string.
+
+       > Example
+             import Standard.Examples
+
+             example_text_left =
+                Examples.text_column_1.text_left 5
+    text_left : Column|Integer -> Column
+    text_left self n =
+        Value_Type.expect_text self <| Value_Type.expect_integer n <|
+            n2 = n.max 0  # negative lengths are clamped to 0 before reaching the database
+            new_name = self.naming_helper.function_name "text_left" [self, n]
+            self.make_binary_op "LEFT" n2 new_name
+
+    ## GROUP Standard.Base.Text
+       ICON preparation
+       Gets the right `n` characters of each element of the column.
+
+       In the Database backends, the default text right method of the
+       particular database is used. A negative `n` is treated as 0.
+
+       In the in-memory backend, this will give you the right n graphemes of the string.
+
+       > Example
+             import Standard.Examples
+
+             example_text_right =
+                Examples.text_column_1.text_right 5
+    text_right : Column|Integer -> Column
+    text_right self n =
+        Value_Type.expect_text self <| Value_Type.expect_integer n <|
+            n2 = n.max 0  # negative lengths are clamped to 0 before reaching the database
+            new_name = self.naming_helper.function_name "text_right" [self, n]
+            self.make_binary_op "RIGHT" n2 new_name
+
     ## GROUP Standard.Base.Logical
        Checks for each element of the column if it contains `other`.
 

@@ -287,7 +287,7 @@ type Postgres_Dialect
 ## PRIVATE
 make_internal_generator_dialect =
     cases = [["LOWER", Base_Generator.make_function "LOWER"], ["UPPER", Base_Generator.make_function "UPPER"]]
-    text = [starts_with, contains, ends_with, agg_shortest, agg_longest, make_case_sensitive, ["REPLACE", replace]]+concat_ops+cases+trim_ops
+    text = [starts_with, contains, ends_with, agg_shortest, agg_longest, make_case_sensitive, ["REPLACE", replace], left, right]+concat_ops+cases+trim_ops
     counts = [agg_count_is_null, agg_count_empty, agg_count_not_empty, ["COUNT_DISTINCT", agg_count_distinct], ["COUNT_DISTINCT_INCLUDE_NULL", agg_count_distinct_include_null]]
     arith_extensions = [is_nan, is_inf, floating_point_div, mod_op, decimal_div, decimal_mod, ["ROW_MIN", Base_Generator.make_function "LEAST"], ["ROW_MAX", Base_Generator.make_function "GREATEST"]]
     bool = [bool_or]
@@ -486,6 +486,14 @@ make_contains_expr expr substring =
 ## PRIVATE
 contains = Base_Generator.lift_binary_op "contains" make_contains_expr
 
+## PRIVATE
+left = Base_Generator.lift_binary_op "LEFT" str_expr-> len_expr->
+    Builder.code "left(" ++ str_expr ++ ", CAST(" ++ len_expr ++ " AS INT))"  # the length is passed as an INT
+
+## PRIVATE
+right = Base_Generator.lift_binary_op "RIGHT" str_expr-> len_expr->
+    Builder.code "right(" ++ str_expr ++ ", CAST(" ++ len_expr ++ " AS INT))"  # the length is passed as an INT
+
 ## PRIVATE
 make_order_descriptor internal_column sort_direction text_ordering =
     nulls = case sort_direction of

@@ -282,7 +282,7 @@ type SQLite_Dialect
 
 ## PRIVATE
 make_internal_generator_dialect =
-    text = [starts_with, contains, ends_with, make_case_sensitive, ["REPLACE", replace]]+concat_ops+trim_ops
+    text = [starts_with, contains, ends_with, make_case_sensitive, ["REPLACE", replace], left, right]+concat_ops+trim_ops
     counts = [agg_count_is_null, agg_count_empty, agg_count_not_empty, ["COUNT_DISTINCT", agg_count_distinct], ["COUNT_DISTINCT_INCLUDE_NULL", agg_count_distinct_include_null]]
     stats = [agg_stddev_pop, agg_stddev_samp]
     arith_extensions = [is_inf, floating_point_div, mod_op]
@@ -409,6 +409,14 @@ make_contains_expr expr substring =
 ## PRIVATE
 contains = Base_Generator.lift_binary_op "contains" make_contains_expr
 
+## PRIVATE
+left = Base_Generator.lift_binary_op "LEFT" str-> n->
+    Builder.code "substr(" ++ str ++ ", 1, " ++ n ++ ")"  # SQLite's `substr` positions start at 1
+
+## PRIVATE
+right = Base_Generator.lift_binary_op "RIGHT" str-> n->
+    Builder.code "substr(" ++ str ++ ", -(" ++ n ++ "), " ++ n ++ ")"  # a negative start counts from the end
+
 ## PRIVATE
 bool_or = Base_Generator.lift_unary_op "BOOL_OR" arg->
     Builder.code "max(" ++ arg ++ ")"

@@ -184,7 +184,7 @@ operations_map =
 
     always_boolean_ops = ["==", "!=", "equals_ignore_case", ">=", "<=", "<", ">", "BETWEEN", "AND", "OR", "NOT", "IS_NULL", "IS_EMPTY", "LIKE", "IS_IN", "IS_IN_COLUMN", "starts_with", "ends_with", "contains", "BOOL_OR", "IS_INF"]
     always_floating_ops = ["/", "mod", "AVG", "STDDEV_POP", "STDDEV_SAMP", "ROUND"]
-    always_text_ops = ["ADD_TEXT", "CONCAT", "CONCAT_QUOTE_IF_NEEDED", "MAKE_CASE_SENSITIVE", "FOLD_CASE", "TRIM", "LTRIM", "RTRIM", "REPLACE"]
+    always_text_ops = ["ADD_TEXT", "CONCAT", "CONCAT_QUOTE_IF_NEEDED", "MAKE_CASE_SENSITIVE", "FOLD_CASE", "TRIM", "LTRIM", "RTRIM", "REPLACE", "LEFT", "RIGHT"]
     always_integer_ops = ["COUNT", "COUNT_IS_NULL", "COUNT_DISTINCT", "COUNT_DISTINCT_INCLUDE_NULL", "COUNT_EMPTY", "COUNT_NOT_EMPTY", "COUNT_ROWS", "ROW_NUMBER", "ROW_NUMBER_IN_GROUP", "LENGTH"]
     same_as_first = ["TRUNCATE", "CEIL", "FLOOR"]
     arithmetic_ops = ["ADD_NUMBER", "-", "*", "^", "%", "SUM"]

@@ -1256,6 +1256,48 @@ type Column
         Value_Type.expect_text self <|
             simple_unary_op self Java_Storage.Maps.TEXT_LENGTH
 
+    ## GROUP Standard.Base.Text
+       ICON preparation
+       Gets the left `n` characters of each element of the column.
+
+       In the Database backends, the database's default text left method is used.
+
+       In the in-memory backend, this will give you the left n graphemes of the
+       string; if n is zero or negative, the result is an empty text.
+
+       > Example
+             import Standard.Examples
+
+             example_text_left =
+                Examples.text_column_1.text_left 5
+    text_left : Column|Integer -> Column
+    text_left self n =
+        Value_Type.expect_text self <|
+            Value_Type.expect_integer n <|
+                new_name = naming_helper.function_name "text_left" [self, n]
+                run_vectorized_binary_op self Java_Storage.Maps.TEXT_LEFT n new_name
+
+    ## GROUP Standard.Base.Text
+       ICON preparation
+       Gets the right `n` characters of each element of the column.
+
+       In the Database backends, the database's default text right method is used.
+
+       In the in-memory backend, this will give you the right n graphemes of the
+       string; if n is zero or negative, the result is an empty text.
+
+       > Example
+             import Standard.Examples
+
+             example_text_right =
+                Examples.text_column_1.text_right 5
+    text_right : Column|Integer -> Column
+    text_right self n =
+        Value_Type.expect_text self <|
+            Value_Type.expect_integer n <|
+                new_name = naming_helper.function_name "text_right" [self, n]
+                run_vectorized_binary_op self Java_Storage.Maps.TEXT_RIGHT n new_name
+
     ## GROUP Standard.Base.Logical
        Checks for each element of the column if it contains `other`.
 

@@ -52,7 +52,9 @@ public static int compare_normalized(String a, String b) {
   public static String take_prefix(String str, long grapheme_length) {
     BreakIterator iter = BreakIterator.getCharacterInstance();
     iter.setText(str);
-    if (iter.next(Math.toIntExact(grapheme_length)) == BreakIterator.DONE) {
+    if (grapheme_length <= 0) {
+      return "";
+    } else if (iter.next(Math.toIntExact(grapheme_length)) == BreakIterator.DONE) {
       return str;
     } else {
       return str.substring(0, iter.current());

@@ -294,7 +294,9 @@ public static String take_suffix(String str, long grapheme_length) {
     BreakIterator iter = BreakIterator.getCharacterInstance();
     iter.setText(str);
     iter.last();
-    if (iter.next(Math.toIntExact(-grapheme_length)) == BreakIterator.DONE) {
+    if (grapheme_length <= 0) {
+      return "";
+    } else if (iter.next(Math.toIntExact(-grapheme_length)) == BreakIterator.DONE) {
       return str;
     } else {
       return str.substring(iter.current());

@@ -0,0 +1,75 @@
+package org.enso.table.data.column.operation.map.text;
+
+import org.enso.table.data.column.builder.StringBuilder;
+import org.enso.table.data.column.operation.map.BinaryMapOperation;
+import org.enso.table.data.column.operation.map.MapOperationProblemAggregator;
+import org.enso.table.data.column.storage.SpecializedStorage;
+import org.enso.table.data.column.storage.Storage;
+import org.enso.table.data.column.storage.StringStorage;
+import org.enso.table.data.column.storage.numeric.LongStorage;
+import org.enso.table.data.column.storage.type.TextType;
+import org.enso.table.error.UnexpectedTypeException;
+import org.graalvm.polyglot.Context;
+
+/** Base for text-column operations that combine each text with an integer and return text. */
+public abstract class StringLongToStringOp
+    extends BinaryMapOperation<String, SpecializedStorage<String>> {
+  public StringLongToStringOp(String name) {
+    super(name);
+  }
+
+  /** Computes the result for one non-missing text {@code a} and integer {@code b}. */
+  protected abstract String doOperation(String a, long b);
+
+  /** Maps every row with one scalar integer; null gives all nulls, other types are rejected. */
+  @Override
+  public Storage<?> runBinaryMap(
+      SpecializedStorage<String> storage,
+      Object arg,
+      MapOperationProblemAggregator problemAggregator) {
+    int size = storage.size();
+    if (arg == null) {
+      StringBuilder builder = new StringBuilder(size, TextType.VARIABLE_LENGTH);
+      builder.appendNulls(size);
+      return builder.seal();
+    } else if (arg instanceof Long argLong) {
+      String[] newVals = new String[size];
+      Context context = Context.getCurrent();
+      for (int i = 0; i < size; i++) {
+        // Missing rows keep the array's default value, null.
+        if (!storage.isNa(i)) {
+          newVals[i] = doOperation(storage.getItem(i), argLong);
+        }
+        context.safepoint();
+      }
+
+      return new StringStorage(newVals, size, (TextType) storage.getType());
+    } else {
+      throw new UnexpectedTypeException("an Integer");
+    }
+  }
+
+  /** Maps row by row against an integer column; a missing value on either side gives null. */
+  @Override
+  public Storage<?> runZip(
+      SpecializedStorage<String> storage,
+      Storage<?> arg,
+      MapOperationProblemAggregator problemAggregator) {
+    if (arg instanceof LongStorage v) {
+      int size = storage.size();
+      String[] newVals = new String[size];
+      Context context = Context.getCurrent();
+      for (int i = 0; i < size; i++) {
+        // A missing value on either side keeps the array's default value, null.
+        if (!storage.isNa(i) && !v.isNa(i)) {
+          newVals[i] = doOperation(storage.getItem(i), v.getItem(i));
+        }
+        context.safepoint();
+      }
+
+      return new StringStorage(newVals, size, (TextType) storage.getType());
+    } else {
+      throw new UnexpectedTypeException("an Integer column");
+    }
+  }
+}
@@ -110,6 +110,8 @@ public static final class Maps {
     public static final String STARTS_WITH = "starts_with";
     public static final String ENDS_WITH = "ends_with";
     public static final String TEXT_LENGTH = "text_length";
+    public static final String TEXT_LEFT = "text_left";
+    public static final String TEXT_RIGHT = "text_right";
     public static final String CONTAINS = "contains";
     public static final String LIKE = "like";
     public static final String IS_IN = "is_in";

@@ -10,6 +10,7 @@
 import org.enso.table.data.column.operation.map.text.LikeOp;
 import org.enso.table.data.column.operation.map.text.StringBooleanOp;
 import org.enso.table.data.column.operation.map.text.StringIsInOp;
+import org.enso.table.data.column.operation.map.text.StringLongToStringOp;
 import org.enso.table.data.column.operation.map.text.StringStringOp;
 import org.enso.table.data.column.storage.type.StorageType;
 import org.enso.table.data.column.storage.type.TextType;
@@ -129,6 +130,20 @@ protected long doOperation(String a) {
             return Text_Utils.grapheme_length(a);
           }
         });
+    t.add(
+        new StringLongToStringOp(Maps.TEXT_LEFT) {
+          @Override
+          protected String doOperation(String text, long count) {
+            return Text_Utils.take_prefix(text, count); // leading part of the text
+          }
+        });
+    t.add(
+        new StringLongToStringOp(Maps.TEXT_RIGHT) {
+          @Override
+          protected String doOperation(String text, long count) {
+            return Text_Utils.take_suffix(text, count); // trailing part of the text
+          }
+        });
     t.add(
         new StringBooleanOp(Maps.CONTAINS) {
           @Override

@@ -925,6 +925,71 @@ spec setup =
             t = table_builder [["numbers", [1, 2, 3]]]
             col = t.at "numbers"
             col.text_length . should_fail_with Invalid_Value_Type
+
+        Test.specify "should handle operation text_left and text_right with length 1" <|
+            with_mixed_columns_if_supported [["strings", ["a", "foobar", "", Nothing, "café", "It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of light, it was the season of darkness, it was the spring of hope, it was the winter of despair."]]] table->
+                strings = table.at "strings" . cast (Value_Type.Char size=286 variable_length=True)
+                # The expected result type differs between the in-memory and Database backends.
+                expected_type = case setup.is_database of
+                    False -> Value_Type.Char size=286 variable_length=True
+                    True -> Value_Type.Char variable_length=True
+                left_result = strings.text_left 1
+                left_result.name . should_equal "text_left([strings], 1)"
+                left_result . to_vector . should_equal ["a", "f", "", Nothing, "c", "I"]
+                left_result . value_type . should_equal expected_type
+                right_result = strings.text_right 1
+                right_result.name . should_equal "text_right([strings], 1)"
+                right_result . to_vector . should_equal ["a", "r", "", Nothing, "é", "."]
+                right_result . value_type . should_equal expected_type
+
+        Test.specify "should handle operation text_left and text_right of grapheme and non-grapheme" <|
+            with_mixed_columns_if_supported [["strings", ["a", "foobar", "", Nothing, "👩‍🔬👩‍🔬V👩‍🔬👩‍🔬", "café", "It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of light, it was the season of darkness, it was the spring of hope, it was the winter of despair."]]] t->
+                col = t.at "strings"
+                resLeft = col.text_left 3
+                resLeft.name . should_equal "text_left([strings], 3)"
+                resRight = col.text_right 3
+                resRight.name . should_equal "text_right([strings], 3)"
+                # Only the emoji entry is expected to differ between the backends.
+                case setup.is_database of
+                    False -> resLeft . to_vector . should_equal ["a", "foo", "", Nothing, "👩‍🔬👩‍🔬V", "caf", "It "] # In-memory: n counts graphemes
+                    True -> resLeft . to_vector . should_equal ["a", "foo", "", Nothing, "👩‍🔬", "caf", "It "]  # Database: n counts the database's storage units, not graphemes
+                case setup.is_database of
+                    False -> resRight . to_vector . should_equal ["a", "bar", "", Nothing, "V👩‍🔬👩‍🔬", "afé", "ir."] # In-memory: n counts graphemes
+                    True -> resRight . to_vector . should_equal ["a", "bar", "", Nothing, "👩‍🔬", "afé", "ir."]  # Database: n counts the database's storage units, not graphemes
+
+        Test.specify "text_left and text_right should error on non-string columns" <|
+            # Both operations are expected to reject a column that does not hold text.
+            numbers = table_builder [["numbers", [1, 2, 3]]] . at "numbers"
+            numbers.text_left 6 . should_fail_with Invalid_Value_Type
+            numbers.text_right 6 . should_fail_with Invalid_Value_Type
+
+        Test.specify "text_left and text_right should error on non integer parameters" <|
+            # A text column is used so that the failure can only come from the bad argument.
+            col = table_builder [["strings", ["a", "b", "c"]]] . at "strings"
+            col.text_left 3.14 . should_fail_with Invalid_Value_Type
+            col.text_right 3.14 . should_fail_with Invalid_Value_Type
+            col.text_left "7" . should_fail_with Invalid_Value_Type
+            col.text_right "7" . should_fail_with Invalid_Value_Type
+
+        Test.specify "text_left and text_right should return empty on zero argument" <|
+            with_mixed_columns_if_supported [["strings", ["a", "foobar", "", Nothing, "👩‍🔬👩‍🔬V👩‍🔬👩‍🔬", "café", "It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of light, it was the season of darkness, it was the spring of hope, it was the winter of despair."]]] t->
+                col = t.at "strings"
+                resLeft = col.text_left 0  # a length of 0 should keep no characters
+                resLeft.name . should_equal "text_left([strings], 0)"
+                resRight = col.text_right 0  # a length of 0 should keep no characters
+                resRight.name . should_equal "text_right([strings], 0)"
+                resLeft . to_vector . should_equal ["", "", "", Nothing, "", "", ""]  # Nothing stays Nothing
+                resRight . to_vector . should_equal ["", "", "", Nothing, "", "", ""]  # Nothing stays Nothing
+
+        Test.specify "text_left and text_right should return empty on negative arguments" <|
+            with_mixed_columns_if_supported [["strings", ["a", "foobar", "", Nothing, "👩‍🔬👩‍🔬V👩‍🔬👩‍🔬", "café", "It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of light, it was the season of darkness, it was the spring of hope, it was the winter of despair."]]] t->
+                col = t.at "strings"
+                resLeft = col.text_left -3  # a negative length is expected to behave like 0
+                resLeft.name . should_equal "text_left([strings], -3)"  # the name keeps the original argument
+                resRight = col.text_right -3  # a negative length is expected to behave like 0
+                resRight.name . should_equal "text_right([strings], -3)"  # the name keeps the original argument
+                resLeft . to_vector . should_equal ["", "", "", Nothing, "", "", ""]  # Nothing stays Nothing
+                resRight . to_vector . should_equal ["", "", "", Nothing, "", "", ""]  # Nothing stays Nothing
 
         Test.specify "should handle operations like is_empty, is_blank, fill_empty" <|
             with_mixed_columns_if_supported [["s", ["", " ", "  ", Nothing, "foo"]], ["letters", ["a", "b", "c", "d", "e"]]] t->

@@ -70,6 +70,8 @@ spec =
             Text_Utils.take_prefix txt 5 . should_equal txt
             Text_Utils.take_prefix txt 400 . should_equal txt
             Text_Utils.take_prefix txt 0 . should_equal ''
+            Text_Utils.take_prefix txt -1 . should_equal ''
+            Text_Utils.take_prefix txt -42 . should_equal ''
 
             Text_Utils.take_suffix txt 1 . should_equal 'c\u0301'
             Text_Utils.take_suffix txt 2 . should_equal 'śc\u0301'
@@ -78,6 +80,8 @@ spec =
             Text_Utils.take_suffix txt 5 . should_equal txt
             Text_Utils.take_suffix txt 400 . should_equal txt
             Text_Utils.take_suffix txt 0 . should_equal ''
+            Text_Utils.take_suffix txt -1 . should_equal ''
+            Text_Utils.take_suffix txt -42 . should_equal ''
 
             Text_Utils.take_prefix '🚀🚧' 1 . should_equal '🚀'
             Text_Utils.take_prefix '🚀🚧' 2 . should_equal '🚀🚧'