From ec7b82987a56e306fb694e65937d668c104afda5 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Fri, 11 Feb 2022 14:18:14 +0000 Subject: [PATCH 1/5] Add Text.at function --- .../0.0.0-dev/src/Data/Text/Extensions.enso | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso index 8469c6fb181d..734667241ffc 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso @@ -12,6 +12,8 @@ import Standard.Base.Meta from Standard.Builtins export Text +from Standard.Base.Data.Vector import Index_Out_Of_Bounds_Error + export Standard.Base.Data.Text.Split_Kind export Standard.Base.Data.Text.Line_Ending_Style @@ -73,6 +75,46 @@ Text.each function = iterate fst nxt Nothing +## ALIAS Get Character + + Returns a character from the text at the specified index (0-based). + + Arguments: + - index: The location in the text to get the character from. The + index is also allowed to be negative, then the characters are + counted from the end of the text, i.e. -1 will correspond to the + last character. + + ! What is a Character? + A character is defined as an Extended Grapheme Cluster, see Unicode + Standard Annex 29. This is the smallest unit that still has semantic + meaning in most text-processing applications. + + > Example + Get the character at index 1 of the text "건반(Korean)". + + "건반(Korean)".at 1 == "반" +Text.at : Integer -> Text ! 
Index_Out_Of_Bounds_Error +Text.at index = + case index < 0 of + True -> + # Will realise the entire collection of graphemes + this.characters.at index + False -> + iterator = BreakIterator.getCharacterInstance + iterator.setText this + + loop prev next count = if count == index then (Text_Utils.substring this prev next) else + next_next = iterator.next + if next_next == -1 then (count + 1) else + @Tail_Call loop next next_next (count + 1) + + first = iterator.next + result = if (first == -1) then 0 else (loop 0 first 0) + case result of + Integer -> Error.throw (Index_Out_Of_Bounds_Error index result) + _ -> result + ## ALIAS Get Characters Returns a vector containing all characters in the given text. From 2678498417dbdc5b739b972f40d66f7cd09decff Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Fri, 11 Feb 2022 15:05:39 +0000 Subject: [PATCH 2/5] Add tests for Text.at --- .../0.0.0-dev/src/Data/Text/Extensions.enso | 16 +++++++++++--- test/Tests/src/Data/Text_Spec.enso | 21 +++++++++++++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso index 734667241ffc..55d43ec89870 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso @@ -12,8 +12,6 @@ import Standard.Base.Meta from Standard.Builtins export Text -from Standard.Base.Data.Vector import Index_Out_Of_Bounds_Error - export Standard.Base.Data.Text.Split_Kind export Standard.Base.Data.Text.Line_Ending_Style @@ -21,6 +19,15 @@ polyglot java import com.ibm.icu.lang.UCharacter polyglot java import com.ibm.icu.text.BreakIterator polyglot java import org.enso.base.Text_Utils +## UNSTABLE + + An error for when an index is out of bounds in a text. + + Arguments: + - index: The requested index in the text. + - length: The length of the text. 
+type Index_Out_Of_Bounds_Error index length + ## ALIAS Length Computes the number of characters in the text. @@ -99,7 +106,10 @@ Text.at index = case index < 0 of True -> # Will realise the entire collection of graphemes - this.characters.at index + graphemes = this.characters + new_index = index + graphemes.length + if new_index < 0 then Error.throw (Index_Out_Of_Bounds_Error index graphemes.length) else + graphemes.at new_index False -> iterator = BreakIterator.getCharacterInstance iterator.setText this diff --git a/test/Tests/src/Data/Text_Spec.enso b/test/Tests/src/Data/Text_Spec.enso index 4cfd612ba7f9..8a4221f63bda 100644 --- a/test/Tests/src/Data/Text_Spec.enso +++ b/test/Tests/src/Data/Text_Spec.enso @@ -1,5 +1,6 @@ from Standard.Base import all +from Standard.Base.Data.Text.Extensions import Index_Out_Of_Bounds_Error import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine import Standard.Base.Data.Locale import Standard.Base.Data.Text.Split_Kind @@ -41,6 +42,25 @@ spec = str = kshi + facepalm + accent_1 + accent_2 str.characters . should_equal [kshi, facepalm, accent_1, accent_2] + Test.specify "should allow access by index to a grapheme cluster" <| + str = kshi + facepalm + accent_1 + accent_2 + str.at 0 . should_equal kshi + str.at 1 . should_equal facepalm + str.at 2 . should_equal accent_1 + str.at 3 . should_equal accent_2 + + Test.specify "should allow access by negative index to a grapheme cluster" <| + str = kshi + facepalm + accent_1 + accent_2 + str.at -4 . should_equal kshi + str.at -3 . should_equal facepalm + str.at -2 . should_equal accent_1 + str.at -1 . should_equal accent_2 + + Test.specify "should return a dataflow error when accessing characters out of bounds" <| + str = kshi + facepalm + accent_1 + accent_2 + str.at -5 . should_fail_with Index_Out_Of_Bounds_Error + str.at 4 . should_fail_with Index_Out_Of_Bounds_Error + Test.specify "should be able to split the text into words" <| sentences.words . 
should_equal sentence_words @@ -288,3 +308,4 @@ spec = result = "ababd".replace "b\w # Replacing a `b` followed by any word character" "a" comments=True result . should_equal "aaa" +main = Test.Suite.run_main here.spec From 15dce46e2125c961a30e13178a20d666262df439 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Fri, 11 Feb 2022 15:28:35 +0000 Subject: [PATCH 3/5] Add tests for Text.is_digit --- .../0.0.0-dev/src/Data/Text/Extensions.enso | 28 +++++++++++++++++++ test/Tests/src/Data/Text_Spec.enso | 21 ++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso index 55d43ec89870..09d0d06f7006 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso @@ -599,6 +599,34 @@ Text.is_empty = this == "" Text.not_empty : Boolean Text.not_empty = this.is_empty.not +## Returns if a character from the text at the specified index (0-based) is a + digit (0-9). + + Arguments: + - index: The location in the text to get the character from. The + index is also allowed to be negative, then the characters are + counted from the end of the text, i.e. -1 will correspond to the + last character. + + ! What is a Character? + A character is defined as an Extended Grapheme Cluster, see Unicode + Standard Annex 29. This is the smallest unit that still has semantic + meaning in most text-processing applications. + + > Example + Check if an individual character is a digit: + + "0".is_digit == True + "A0".is_digit == False + "A0".is_digit 1 == True + "건반(Korean)".is_digit 1 == False +Text.is_digit : Integer -> Boolean ! 
Index_Out_Of_Bounds_Error +Text.is_digit (index=0) = + grapheme = this.at index + if grapheme.is_error then grapheme else + char = (Text_Utils.get_chars grapheme).at 0 + char>=48 && char<=57 + ## Returns a vector containing bytes representing the UTF-8 encoding of the input text. diff --git a/test/Tests/src/Data/Text_Spec.enso b/test/Tests/src/Data/Text_Spec.enso index 8a4221f63bda..c63df02a9e03 100644 --- a/test/Tests/src/Data/Text_Spec.enso +++ b/test/Tests/src/Data/Text_Spec.enso @@ -120,6 +120,27 @@ spec = kshi_chars = [2325, 2381, 2359, 2367] Text.from_utf_16 kshi_chars . should_equal kshi + Test.specify "should be able to check by index if is a digit" <| + str = kshi + "A12" + accent_2 + str.is_digit . should_be_false + str.is_digit 1 . should_be_false + str.is_digit 2 . should_be_true + str.is_digit 3 . should_be_true + str.is_digit 4 . should_be_false + + Test.specify "should be able to check by negative index if is a digit" <| + str = kshi + "A12" + accent_2 + str.is_digit -1 . should_be_false + str.is_digit -2 . should_be_true + str.is_digit -3 . should_be_true + str.is_digit -4 . should_be_false + str.is_digit -5 . should_be_false + + Test.specify "should return a dataflow error when checking is digit for out of bounds" <| + str = kshi + "A12" + accent_2 + str.is_digit -6 . should_fail_with Index_Out_Of_Bounds_Error + str.is_digit 5 . 
should_fail_with Index_Out_Of_Bounds_Error + Test.group "Regex matching" <| Test.specify "should be possible on text" <| match = "My Text: Goes Here".match "^My Text: (.+)$" mode=Regex_Mode.First From d39eecf10856bad96784e07ccfcfee34a62f2699 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Fri, 11 Feb 2022 15:40:42 +0000 Subject: [PATCH 4/5] Change log --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9b7e86f94cf2..1a1da492134f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,7 @@ - [Fixed `Vector.sort` to handle tail-recursive comparators][3256] - [Implemented `Range.find`, `Table.rename_columns` and `Table.use_first_row_as_names` operations][3249] +- [Implemented `Text.at` and `Text.is_digit` methods][3269] [3153]: https://github.com/enso-org/enso/pull/3153 [3166]: https://github.com/enso-org/enso/pull/3166 @@ -50,6 +51,7 @@ [3250]: https://github.com/enso-org/enso/pull/3250 [3256]: https://github.com/enso-org/enso/pull/3256 [3249]: https://github.com/enso-org/enso/pull/3249 +[3269]: https://github.com/enso-org/enso/pull/3269 #### Enso Compiler From b47e9fbb5716345923bf064b6f04ba4fb924ef74 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Fri, 11 Feb 2022 16:04:03 +0000 Subject: [PATCH 5/5] Avoid memory allocation --- .../Base/0.0.0-dev/src/Data/Text/Extensions.enso | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso index 09d0d06f7006..92f527664648 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso @@ -105,11 +105,10 @@ Text.at : Integer -> Text ! 
Index_Out_Of_Bounds_Error Text.at index = case index < 0 of True -> - # Will realise the entire collection of graphemes - graphemes = this.characters - new_index = index + graphemes.length - if new_index < 0 then Error.throw (Index_Out_Of_Bounds_Error index graphemes.length) else - graphemes.at new_index + length = this.length + new_index = index + length + if new_index < 0 then Error.throw (Index_Out_Of_Bounds_Error index length) else + this.at new_index False -> iterator = BreakIterator.getCharacterInstance iterator.setText this