diff --git a/CHANGELOG.md b/CHANGELOG.md index 93e2ef0fc4c3..6daea61e8313 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,7 @@ - [Fixed `Vector.sort` to handle tail-recursive comparators][3256] - [Implemented `Range.find`, `Table.rename_columns` and `Table.use_first_row_as_names` operations][3249] +- [Implemented `Text.at` and `Text.is_digit` methods][3269] [3153]: https://github.com/enso-org/enso/pull/3153 [3166]: https://github.com/enso-org/enso/pull/3166 @@ -50,6 +51,7 @@ [3250]: https://github.com/enso-org/enso/pull/3250 [3256]: https://github.com/enso-org/enso/pull/3256 [3249]: https://github.com/enso-org/enso/pull/3249 +[3269]: https://github.com/enso-org/enso/pull/3269 #### Enso Compiler diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso index 8469c6fb181d..92f527664648 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso @@ -19,6 +19,15 @@ polyglot java import com.ibm.icu.lang.UCharacter polyglot java import com.ibm.icu.text.BreakIterator polyglot java import org.enso.base.Text_Utils +## UNSTABLE + + An error raised when a requested index is out of bounds in a text. + + Arguments: + - index: The requested index in the text. + - length: The length of the text. +type Index_Out_Of_Bounds_Error index length + ## ALIAS Length Computes the number of characters in the text. @@ -73,6 +82,48 @@ Text.each function = iterate fst nxt Nothing +## ALIAS Get Character + + Returns a character from the text at the specified index (0-based). + + Arguments: + - index: The location in the text to get the character from. The + index is also allowed to be negative, then the characters are + counted from the end of the text, i.e. -1 will correspond to the + last character. + + ! What is a Character? 
+ A character is defined as an Extended Grapheme Cluster, see Unicode + Standard Annex 29. This is the smallest unit that still has semantic + meaning in most text-processing applications. + + > Example + Get the character at index 1 of the text "건반(Korean)". + + "건반(Korean)".at 1 == "반" +Text.at : Integer -> Text ! Index_Out_Of_Bounds_Error +Text.at index = + case index < 0 of + True -> + length = this.length + new_index = index + length + if new_index < 0 then Error.throw (Index_Out_Of_Bounds_Error index length) else + this.at new_index + False -> + iterator = BreakIterator.getCharacterInstance + iterator.setText this + + loop prev next count = if count == index then (Text_Utils.substring this prev next) else + next_next = iterator.next + if next_next == -1 then (count + 1) else + @Tail_Call loop next next_next (count + 1) + + first = iterator.next + result = if (first == -1) then 0 else (loop 0 first 0) + case result of + Integer -> Error.throw (Index_Out_Of_Bounds_Error index result) + _ -> result + ## ALIAS Get Characters Returns a vector containing all characters in the given text. @@ -547,6 +598,34 @@ Text.is_empty = this == "" Text.not_empty : Boolean Text.not_empty = this.is_empty.not +## Returns whether the character of the text at the specified index (0-based) is a + digit (0-9). + + Arguments: + - index: The location in the text to get the character from. The + index is also allowed to be negative, then the characters are + counted from the end of the text, i.e. -1 will correspond to the + last character. + + ! What is a Character? + A character is defined as an Extended Grapheme Cluster, see Unicode + Standard Annex 29. This is the smallest unit that still has semantic + meaning in most text-processing applications. + + > Example + Check if an individual character is a digit: + + "0".is_digit == True + "A0".is_digit == False + "A0".is_digit 1 == True + "건반(Korean)".is_digit 1 == False +Text.is_digit : Integer -> Boolean ! 
Index_Out_Of_Bounds_Error +Text.is_digit (index=0) = + cluster = this.at index + if cluster.is_error then cluster else + code_unit = (Text_Utils.get_chars cluster).at 0 + (code_unit >= 48) && (code_unit <= 57) + ## Returns a vector containing bytes representing the UTF-8 encoding of the input text. diff --git a/test/Tests/src/Data/Text_Spec.enso b/test/Tests/src/Data/Text_Spec.enso index 4cfd612ba7f9..c63df02a9e03 100644 --- a/test/Tests/src/Data/Text_Spec.enso +++ b/test/Tests/src/Data/Text_Spec.enso @@ -1,5 +1,6 @@ from Standard.Base import all +from Standard.Base.Data.Text.Extensions import Index_Out_Of_Bounds_Error import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine import Standard.Base.Data.Locale import Standard.Base.Data.Text.Split_Kind @@ -41,6 +42,25 @@ spec = str = kshi + facepalm + accent_1 + accent_2 str.characters . should_equal [kshi, facepalm, accent_1, accent_2] + Test.specify "should allow access by index to a grapheme cluster" <| + str = kshi + facepalm + accent_1 + accent_2 + str.at 0 . should_equal kshi + str.at 1 . should_equal facepalm + str.at 2 . should_equal accent_1 + str.at 3 . should_equal accent_2 + + Test.specify "should allow access by negative index to a grapheme cluster" <| + str = kshi + facepalm + accent_1 + accent_2 + str.at -4 . should_equal kshi + str.at -3 . should_equal facepalm + str.at -2 . should_equal accent_1 + str.at -1 . should_equal accent_2 + + Test.specify "should return a dataflow error when accessing characters out of bounds" <| + str = kshi + facepalm + accent_1 + accent_2 + str.at -5 . should_fail_with Index_Out_Of_Bounds_Error + str.at 4 . should_fail_with Index_Out_Of_Bounds_Error + Test.specify "should be able to split the text into words" <| sentences.words . should_equal sentence_words @@ -100,6 +120,27 @@ spec = kshi_chars = [2325, 2381, 2359, 2367] Text.from_utf_16 kshi_chars . 
should_equal kshi + Test.specify "should be able to check by index if is a digit" <| + str = kshi + "A12" + accent_2 + str.is_digit . should_be_false + str.is_digit 1 . should_be_false + str.is_digit 2 . should_be_true + str.is_digit 3 . should_be_true + str.is_digit 4 . should_be_false + + Test.specify "should be able to check by negative index if is a digit" <| + str = kshi + "A12" + accent_2 + str.is_digit -1 . should_be_false + str.is_digit -2 . should_be_true + str.is_digit -3 . should_be_true + str.is_digit -4 . should_be_false + str.is_digit -5 . should_be_false + + Test.specify "should return a dataflow error when checking is digit for out of bounds" <| + str = kshi + "A12" + accent_2 + str.is_digit -6 . should_fail_with Index_Out_Of_Bounds_Error + str.is_digit 5 . should_fail_with Index_Out_Of_Bounds_Error + Test.group "Regex matching" <| Test.specify "should be possible on text" <| match = "My Text: Goes Here".match "^My Text: (.+)$" mode=Regex_Mode.First @@ -288,3 +329,4 @@ spec = result = "ababd".replace "b\w # Replacing a `b` followed by any word character" "a" comments=True result . should_equal "aaa" +main = Test.Suite.run_main here.spec