src/string/grapheme/grapheme.cr

require "./properties"

class String
  # Iterates the grapheme boundaries in this string and yields for each grapheme
  # cluster the byte index of the first character and the byte index of the last
  # byte of the last character in the cluster. That is the byte span of the
  # respective grapheme cluster in the entire string.
  # The third input value is the last character itself, which can be used
  # to avoid multiple decoding in single-char graphemes.
  private def each_grapheme_boundary(& : Range(Int32, Int32), Char -> Nil) : Nil
    state = Grapheme::Property::Start

    reader = Char::Reader.new(self)
    last_char = reader.current_char
    # cache last_property to avoid re-calculation on the following iteration
    last_property = Grapheme::Property.from(last_char)
    last_boundary = 0

    while reader.has_next?
      char = reader.next_char
      property = Grapheme::Property.from(char)
      boundary, state = Grapheme.break?(last_property, property, state)

      if boundary
        index = reader.pos
        yield last_boundary...index, last_char

        last_boundary = index
      end

      last_char = char
      last_property = property
    end
  end

  # :nodoc:
  class GraphemeIterator
    include Iterator(Grapheme)

    @last_char : Char
    @last_property : Grapheme::Property

    def initialize(str : String)
      @reader = Char::Reader.new(str)
      @state = Grapheme::Property::Start
      @last_char = @reader.current_char
      # cache last_property to avoid re-calculation on the following iteration
      @last_property = Grapheme::Property.from(@last_char)
      @last_boundary = 0
    end

    def next
      return stop unless @reader.has_next?

      while char = @reader.next_char
        property = Grapheme::Property.from(char)
        boundary, @state = Grapheme.break?(@last_property, property, @state)

        last_char = @last_char
        @last_char = char
        @last_property = property

        if boundary
          index = @reader.pos
          grapheme = Grapheme.new(@reader.string, @last_boundary...index, last_char)
          @last_boundary = index

          return grapheme
        end
      end

      Grapheme.new(@reader.string, @last_boundary..@reader.string.bytesize, @last_char)
    end
  end

  # `Grapheme` represents a Unicode grapheme cluster, which describes the smallest
  # functional unit of a writing system. This is also called a *user-perceived character*.
  #
  # In the latin alphabet, most graphemes consist of a single Unicode codepoint
  # (equivalent to `Char`). But a grapheme can also consist of a sequence of codepoints,
  # which combine into a single unit.
  #
  # For example, the string `"e\u0301"` consists of two characters, the latin small letter `e`
  # and the combining acute accent `´`. Together, they form a single grapheme: `é`.
  # That same grapheme could alternatively be described in a single codepoint, `\u00E9` (latin small letter e with acute).
  # But the combinatory possibilities are far bigger than the amount of directly
  # available codepoints.
  #
  # ```
  # "e\u0301".size # => 2
  # "é".size       # => 1
  #
  # "e\u0301".grapheme_size # => 1
  # "é".grapheme_size       # => 1
  # ```
  #
  # This combination of codepoints is common in some non-latin scripts. It's also
  # often used with emojis to create customized combination. For example, the
  # thumbs up sign `👍` (`U+1F44D`) combined with an emoji modifier such as
  # `U+1F3FC` assign a colour to the emoji.
  #
  # Instances of this type can be acquired via `String#each_grapheme` or `String#graphemes`.
  #
  # The algorithm to determine boundaries between grapheme clusters is specified
  # in the [Unicode Standard Annex #29](https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
  @[Experimental("The grapheme API is still under development. Join the discussion at [#11610](https://github.com/crystal-lang/crystal/issues/11610).")]
  struct Grapheme
    # For efficiency reasons we avoid allocating a string for graphemes consisting
    # of only a single character.
    # As a trade-off, this leads to multi dispatch on this ivar. But that's
    # acceptable compared to the allocation overhead.
    #
    # Graphemes consisting of a single character are always represented as
    # `Char`, never as `String` to simplify comparability. This invariant is
    # protected by the constructor.
    @cluster : Char | String

    # :nodoc:
    def self.new(string : String, range : Range(Int32, Int32), char : Char) : self
      if char.bytesize == range.size
        new(char)
      else
        new(string.byte_slice(range.begin, range.end - range.begin))
      end
    end

    # :nodoc:
    def initialize(@cluster)
    end

    # Appends the characters in this grapheme cluster to *io*.
    def to_s(io : IO) : Nil
      io << @cluster
    end

    # Returns the characters in this grapheme cluster.
    def to_s : String
      case cluster = @cluster
      in Char
        cluster.to_s
      in String
        cluster
      end
    end

    # Appends a representation of this grapheme cluster to *io*.
    def inspect(io : IO) : Nil
      io << "String::Grapheme("
      @cluster.inspect(io)
      io << ")"
    end

    # Returns the number of characters in this grapheme cluster.
    def size : Int32
      case cluster = @cluster
      in Char
        1
      in String
        cluster.size
      end
    end

    # Returns the number of bytes in the UTF-8 representation of this grapheme cluster.
    def bytesize : Int32
      @cluster.bytesize
    end

    # Returns `true` if *other* is equivalent to `self`.
    #
    # Two graphemes are considered equivalent if they contain the same sequence
    # of codepoints.
    def ==(other : self) : Bool
      @cluster == other.@cluster
    end

    # :nodoc:
    def self.break?(c1 : Char, c2 : Char) : Bool
      break?(Property.from(c1), Property.from(c2))
    end

    # :nodoc:
    #
    # Returns whether there is a grapheme break between boundclasses lbc and tbc.
    #
    # Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
    # and GB 12/13 (regional indicator code points) require knowledge of previous characters
    # which is not handled by this overload. This may result in an incorrect break before
    # an E_Modifier class codepoint and an incorrectly missing break between two
    # REGIONAL_INDICATOR class code points if such support does not exist in the caller.
    #
    # The rules are graphically displayed in a table on https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.html
    #
    # The implementation is inspired by https://github.com/JuliaStrings/utf8proc/blob/462093b3924c7491defc67fda4bc7a27baf9b088/utf8proc.c#L261
    def self.break?(lbc : Property, tbc : Property) : Bool
      return true if lbc.start?                                                   # GB1
      return false if lbc.cr? && tbc.lf?                                          # GB3
      return true if lbc.cr? || lbc.lf? || lbc.control?                           # GB4
      return true if tbc.cr? || tbc.lf? || tbc.control?                           # GB5
      return false if lbc.l? && (tbc.l? || tbc.v? || tbc.lv? || tbc.lvt?)         # GB6
      return false if (lbc.lv? || lbc.v?) && (tbc.v? || tbc.t?)                   # GB7
      return false if (lbc.lvt? || lbc.t?) && tbc.t?                              # GB8
      return false if tbc.extend? || tbc.zwj?                                     # GB9
      return false if tbc.spacing_mark?                                           # GB9a
      return false if lbc.prepend?                                                # GB9b
      return false if lbc.extended_plus_zero_width? && tbc.extended_pictographic? # GB11 (requires additional handling)
      return false if lbc.regional_indicator? && tbc.regional_indicator?          # GB12/13 (requires additional handling)
      true                                                                        # GB999
    end

    # :nodoc:
    def self.break?(c1 : Char, c2 : Char, state : Property) : {Bool, Property}
      break?(Property.from(c1), Property.from(c2), state)
    end

    # :nodoc:
    #
    # Returns whether there is a grapheme break between boundclasses lbc and tbc.
    #
    # Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
    # and GB 12/13 (regional indicator code points) require knowledge of previous characters
    # which is accounted for in the state argument. The caller is expected to
    # store the returned state and pass it when calling this method the next time.
    # The initial value is `Property::Start`.
    #
    # The implementation is inspired by https://github.com/JuliaStrings/utf8proc/blob/462093b3924c7491defc67fda4bc7a27baf9b088/utf8proc.c#L291
    def self.break?(lbc : Property, tbc : Property, state : Property) : {Bool, Property}
      if state.start?
        state = lbc_override = lbc
      else
        lbc_override = state
      end

      break_permitted = break?(lbc_override, tbc)

      # Special support for GB 12/13 made possible by GB999. After two RI
      # class codepoints we want to force a break. Do this by resetting the
      # second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
      # after that character according to GB999 (unless of course such a break is
      # forbidden by a different rule such as GB9).
      if state == tbc && tbc.regional_indicator?
        state = Property::Any
        # Special support for GB11 (emoji extend* zwj / emoji)
      elsif state.extended_pictographic?
        if tbc.extend? # fold EXTEND codepoints into emoji
          state = Property::ExtendedPictographic
        elsif tbc.zwj?
          state = Property::ExtendedPlusZeroWidth # state to record emoji+zwg combo
        else
          state = tbc
        end
      else
        state = tbc
      end

      {break_permitted, state}
    end
  end
end