enso-org · mergify · Apr 19, 2022 · Apr 8, 2022 · Apr 8, 2022 · Apr 11, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -109,6 +109,8 @@
 - [Implemented support for most Table aggregations in the Database
   backend.][3383]
 - [Update `Text.replace` to new API.][3393]
+- [Add encoding support to `Text.bytes` and `Text.from_bytes`. Renamed and added
+  encoding to `File.read_text`. New `File.read` API.][3390]
 
 [debug-shortcuts]:
   https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@@ -167,6 +169,7 @@
 [3385]: https://github.com/enso-org/enso/pull/3385
 [3392]: https://github.com/enso-org/enso/pull/3392
 [3393]: https://github.com/enso-org/enso/pull/3393
+[3390]: https://github.com/enso-org/enso/pull/3390
 
 #### Enso Compiler
 

@@ -13,7 +13,7 @@ component-groups:
     - Input:
         exports:
           - Standard.Base.System.File.new
-          - Standard.Base.System.File.read
+          - Standard.Base.System.File.read_text
     - Web:
         exports:
           - Standard.Base.Network.Http.new
@@ -53,4 +53,4 @@ component-groups:
           - Standard.Base.Data.Vector.Vector.distinct
     - Output:
         exports:
-          - Standard.Base.System.File.File.write
+          - Standard.Base.System.File.File.write_text
@@ -0,0 +1,102 @@
+from Standard.Base import all
+
+polyglot java import java.nio.charset.Charset
+polyglot java import java.nio.charset.IllegalCharsetNameException
+polyglot java import java.nio.charset.UnsupportedCharsetException
+polyglot java import org.enso.base.Text_Utils
+
+## Get the names of all character sets that Java reports as available.
+   Used to provide auto completion in the UI.
+all_character_sets : Vector.Vector Text
+all_character_sets =
+    charset_names = Charset.availableCharsets.keySet
+    Vector.Vector charset_names.toArray
+
+## Get all available Encodings, one for each available character set name.
+all_encodings : Vector.Vector Encoding
+all_encodings =
+    here.all_character_sets . map Encoding
+
+## Represents a character encoding.
+type Encoding
+    ## Create a new Encoding object.
+
+       Arguments:
+       - character_set: java.nio.charset name.
+    type Encoding (character_set:Text)
+
+    ## PRIVATE
+       Convert an Encoding to its corresponding Java Charset.
+
+       Returns a dataflow `Illegal_Argument_Error` if the character set name
+       is either not supported by the JVM or not a legal charset name.
+    to_java_charset : Charset
+    to_java_charset =
+        # `Charset.forName` reports an unsupported name and a malformed name
+        # through two unrelated exception classes, so both are caught here.
+        unknown = _-> Error.throw (Illegal_Argument_Error ("Unknown Character Set: " + this.character_set))
+        Panic.catch IllegalCharsetNameException (Panic.catch UnsupportedCharsetException (Charset.forName this.character_set) unknown) unknown
+
+    ## Encoding for ASCII.
+    ascii : Encoding
+    ascii = Encoding "US-ASCII"
+
+    ## Encoding for Unicode UTF-8.
+    utf_8 : Encoding
+    utf_8 = Encoding "UTF-8"
+
+    ## Encoding for Unicode UTF-16 Little Endian.
+    utf_16_le : Encoding
+    utf_16_le = Encoding "UTF-16LE"
+
+    ## Encoding for Unicode UTF-16 Big Endian.
+    utf_16_be : Encoding
+    utf_16_be = Encoding "UTF-16BE"
+
+    ## Encoding for Unicode UTF-32 Little Endian.
+    utf_32_le : Encoding
+    utf_32_le = Encoding "UTF-32LE"
+
+    ## Encoding for Unicode UTF-32 Big Endian.
+    utf_32_be : Encoding
+    utf_32_be = Encoding "UTF-32BE"
+
+    ## Encoding for Central European (Windows).
+    windows_1250 : Encoding
+    windows_1250 = Encoding "windows-1250"
+
+    ## Encoding for Cyrillic (Windows).
+    windows_1251 : Encoding
+    windows_1251 = Encoding "windows-1251"
+
+    ## ALIAS ISO-8859-1
+
+       Encoding for Western European (Windows).
+    windows_1252 : Encoding
+    windows_1252 = Encoding "windows-1252"
+
+    ## Encoding for Greek (Windows).
+    windows_1253 : Encoding
+    windows_1253 = Encoding "windows-1253"
+
+    ## ALIAS ISO-8859-9
+
+       Encoding for Turkish (Windows).
+    windows_1254 : Encoding
+    windows_1254 = Encoding "windows-1254"
+
+    ## Encoding for Hebrew (Windows).
+    windows_1255 : Encoding
+    windows_1255 = Encoding "windows-1255"
+
+    ## Encoding for Arabic (Windows).
+    windows_1256 : Encoding
+    windows_1256 = Encoding "windows-1256"
+
+    ## Encoding for Baltic (Windows).
+    windows_1257 : Encoding
+    windows_1257 = Encoding "windows-1257"
+
+    ## Encoding for Vietnamese (Windows).
+    windows_1258 : Encoding
+    windows_1258 = Encoding "windows-1258"
+
+## A problem was encountered while converting between text and bytes using an
+   Encoding. The `message` field carries the details of the problem.
+type Encoding_Error (message:Text)
+
+## Renders the error as the text `Encoding_Error: ` followed by its message.
+Encoding_Error.to_display_text : Text
+Encoding_Error.to_display_text =
+    prefix = "Encoding_Error: "
+    prefix + this.message
@@ -12,6 +12,8 @@ import Standard.Base.Data.Text.Line_Ending_Style
 from Standard.Base.Data.Text.Span as Span_Module import Span
 import Standard.Base.Data.Text.Split_Kind
 import Standard.Base.Data.Text.Text_Sub_Range
+from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encoding_Error
+from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior, Report_Warning
 import Standard.Base.Data.Locale
 import Standard.Base.Meta
 
@@ -187,10 +189,10 @@ Text.characters =
    - comments: Enables or disables the comments mode for the regular expression.
      In comments mode, the following changes apply:
      - Whitespace within the pattern is ignored, except when within a
-       character class or when preceeded by an unescaped backslash, or within
+       character class or when preceded by an unescaped backslash, or within
        grouping constructs (e.g. `(?...)`).
      - When a line contains a `#`, that is not in a character class and is not
-       preceeded by an unescaped backslash, all characters from the leftmost
+       preceded by an unescaped backslash, all characters from the leftmost
        such `#` to the end of the line are ignored. That is to say, they act
        as _comments_ in the regex.
    - extra_opts: Specifies additional options in a vector. This allows options
@@ -738,23 +740,82 @@ Text.is_whitespace : Boolean
 Text.is_whitespace =
     Text_Utils.is_all_whitespace this
 
+## Returns a vector containing bytes representing the specified encoding of the
+   input text.
+
+   This is useful for low-level operations, such as binary data encoding.
+
+   Arguments:
+   - encoding: The text encoding to encode this with. Defaults to UTF-8.
+   - on_problems: Specifies the behavior when a problem occurs during the
+     function.
+     By default, a warning is issued, but the operation proceeds.
+     If set to `Report_Error`, the operation fails with a dataflow error.
+     If set to `Ignore`, the operation proceeds without errors or warnings.
+
+   > Example
+     Get the ASCII bytes of the text "Hello".
+
+         "Hello".bytes (Encoding.ascii)
+Text.bytes : Encoding -> Problem_Behavior -> Vector.Vector Byte
+Text.bytes encoding=Encoding.utf_8 on_problems=Report_Warning =
+    result = Text_Utils.get_bytes this (encoding . to_java_charset)
+    vector = Vector.Vector result.result
+    if result.warnings.is_nothing then vector else
+        on_problems.attach_problems_after vector [Encoding_Error result.warnings]
+
+## Takes a vector of bytes and returns Text resulting from decoding it using the
+   specified encoding.
+
+   Arguments:
+   - bytes: The vector of bytes to decode, expressed in the given encoding.
+   - encoding: The text encoding to decode the bytes with. Defaults to UTF-8.
+   - on_problems: Specifies the behavior when a problem occurs during the
+     function.
+     By default, a warning is issued, but the operation proceeds.
+     If set to `Report_Error`, the operation fails with a dataflow error.
+     If set to `Ignore`, the operation proceeds without errors or warnings.
+
+   > Example
+     Decode the ASCII bytes of the text "Hello" back into text.
+
+         Text.from_bytes [72, 101, 108, 108, 111] (Encoding.ascii)
+Text.from_bytes : Vector.Vector Byte -> Encoding -> Problem_Behavior -> Text
+Text.from_bytes bytes encoding=Encoding.utf_8 on_problems=Report_Warning =
+    result = Text_Utils.from_bytes bytes.to_array (encoding . to_java_charset)
+    if result.warnings.is_nothing then result.result else
+        on_problems.attach_problems_after result.result [Encoding_Error result.warnings]
+
 ## Returns a vector containing bytes representing the UTF-8 encoding of the
    input text.
 
    This is useful for low-level operations, such as binary data encoding and
    decoding.
 
+   Arguments:
+   - on_problems: Specifies the behavior when a problem occurs during the
+     function.
+     By default, a warning is issued, but the operation proceeds.
+     If set to `Report_Error`, the operation fails with a dataflow error.
+     If set to `Ignore`, the operation proceeds without errors or warnings.
+
    > Example
      Get the UTF-8 bytes of the text "Hello".
 
          "Hello".utf_8
-Text.utf_8 : Vector.Vector Byte
-Text.utf_8 = Vector.Vector (Text_Utils.get_bytes this)
+Text.utf_8 : Problem_Behavior -> Vector.Vector Byte
+Text.utf_8 on_problems=Report_Warning =
+    # Thin wrapper: encode through the general `bytes` method using UTF-8.
+    utf_8_encoding = Encoding.utf_8
+    this.bytes utf_8_encoding on_problems
 
 ## Takes a vector of bytes and returns Text resulting from decoding it as UTF-8.
 
    Arguments:
    - bytes: The vector of UTF-8 bytes.
+   - on_problems: Specifies the behavior when a problem occurs during the
+     function.
+     By default, a warning is issued, but the operation proceeds.
+     If set to `Report_Error`, the operation fails with a dataflow error.
+     If set to `Ignore`, the operation proceeds without errors or warnings.
 
    This is useful for low-level operations, such as binary data encoding and
    decoding.
@@ -763,35 +824,31 @@ Text.utf_8 = Vector.Vector (Text_Utils.get_bytes this)
      Decoding the bytes to get a text.
 
          Text.from_utf_8 [-32, -92, -107, -32, -91, -115, -32, -92, -73, -32, -92, -65]
-Text.from_utf_8 : Vector.Vector Byte -> Text
-Text.from_utf_8 bytes = Text_Utils.from_utf_8 bytes.to_array
-
-## ADVANCED
+Text.from_utf_8 : Vector.Vector Byte -> Problem_Behavior -> Text
+Text.from_utf_8 bytes on_problems=Report_Warning =
+    # Thin wrapper: decode through the general `from_bytes` method using UTF-8.
+    utf_8_encoding = Encoding.utf_8
+    Text.from_bytes bytes utf_8_encoding on_problems
 
-   Returns a vector containing the UTF-16 characters that encode the input text.
+## Returns a vector containing the UTF-16 characters that encode the input text.
 
    This is useful for low-level operations, such as binary data encoding and
    decoding.
 
    > Example
-     Get the UTF-16 bytes of the text "Hello".
+     Get the character vector of the text "Hello".
 
-         "Hello".utf_16
-Text.utf_16 : Vector.Vector Integer
-Text.utf_16 = Vector.Vector (Text_Utils.get_chars this)
-
-## ADVANCED
+         "Hello".char_vector
+Text.char_vector : Vector.Vector Integer
+Text.char_vector =
+    # Wrap the array returned by the Java helper in an Enso vector.
+    java_chars = Text_Utils.get_chars this
+    Vector.Vector java_chars
 
-   Takes a vector of UTF-16 characters and returns the text that results from
-   decoding it as UTF-16.
+## Takes a vector of characters and returns the text that results from it.
 
    Arguments:
    - chars: The vector of UTF-16 characters.
 
    This is useful for low-level operations, such as binary data encoding and
    decoding.
-Text.from_utf_16 : Vector.Vector Integer -> Text
-Text.from_utf_16 chars = Text_Utils.from_chars chars.to_array
+Text.from_char_vector : Vector.Vector Integer -> Text
+Text.from_char_vector chars =
+    # The Java helper is handed the vector's contents as an array.
+    char_array = chars.to_array
+    Text_Utils.from_chars char_array
 
 ## Returns a vector containing integers representing the Unicode codepoints of
    the input text.

@@ -708,8 +708,8 @@ type Match
        ! What is a Character?
          This regular expression engine defines a "character" to mean a UTF-16
          character. This means that these indices should only be used with the
-         result of calling `.utf_16` on the text. Using them with `.characters`
-         or `.codepoints` will produce incorrect results.
+         result of calling `.char_vector` on the text. Using them with
+         `.characters` or `.codepoints` will produce incorrect results.
 
        > Example
          Get the start index in the input where the full pattern matched for
@@ -736,8 +736,8 @@ type Match
        ! What is a Character?
          This regular expression engine defines a "character" to mean a UTF-16
          character. This means that these indices should only be used with the
-         result of calling `.utf_16` on the text. Using them with `.characters`
-         or `.codepoints` will produce incorrect results.
+         result of calling `.char_vector` on the text. Using them with
+         `.characters` or `.codepoints` will produce incorrect results.
 
        > Example
          Get the end index in the input where the full pattern matched for this
@@ -763,8 +763,8 @@ type Match
        ! What is a Character?
          This regular expression engine defines a "character" to mean a UTF-16
          character. This means that these indices should only be used with the
-         result of calling `.utf_16` on the text. Using them with `.characters`
-         or `.codepoints` will produce incorrect results.
+         result of calling `.char_vector` on the text. Using them with
+         `.characters` or `.codepoints` will produce incorrect results.
 
        > Example
          Get the span over the input that was matched by the full match.
@@ -784,8 +784,8 @@ type Match
        ! What is a Character?
          This regular expression engine defines a "character" to mean a UTF-16
          character. This means that these indices should only be used with the
-         result of calling `.utf_16` on the text. Using them with `.characters`
-         or `.codepoints` will produce incorrect results.
+         result of calling `.char_vector` on the text. Using them with
+         `.characters` or `.codepoints` will produce incorrect results.
 
        > Example
          Get the start position in the input to which this match was limited.
@@ -803,8 +803,8 @@ type Match
        ! What is a Character?
          This regular expression engine defines a "character" to mean a UTF-16
          character. This means that these indices should only be used with the
-         result of calling `.utf_16` on the text. Using them with `.characters`
-         or `.codepoints` will produce incorrect results.
+         result of calling `.char_vector` on the text. Using them with
+         `.characters` or `.codepoints` will produce incorrect results.
 
        > Example
          Get the end position in the input to which this match was limited.