src/uri/encoding.cr

class URI
  # Encodes *string* so it can be safely placed as a potentially multi-segmented
  # URI path, replacing special characters with URI escape sequences as needed.
  #
  # Unreserved characters such as ASCII letters, digits, and the characters
  # `_.-~` are not encoded, as well as the character `/` which represent a
  # segment separator in hierarchical paths ([RFC 3986 §3.3](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)).
  #
  # ```
  # require "uri"
  #
  # URI.encode_path("foo/bar/baz")  # => "foo/bar/baz"
  # URI.encode_path("hello world!") # => "hello%20world%21"
  # URI.encode_path("put: it+й")    # => "put%3A%20it%2B%D0%B9"
  # ```
  #
  # * `.decode` is the reverse operation.
  # * `.encode_path_segment` encodes a single path segment, escaping `/`.
  def self.encode_path(string : String) : String
    String.build { |io| encode_path(io, string) }
  end

  # :ditto:
  def self.encode_path(io : IO, string : String) : Nil
    self.encode(string, io) { |byte| URI.unreserved?(byte) || '/' === byte }
  end

  # Encodes *string* so it can be safely placed inside a URI path segment,
  # replacing special characters (including `/`) with URI escape sequences as needed.
  #
  # Unreserved characters such as ASCII letters, digits, and the characters
  # `_.-~` are not encoded (see `.unreserved?`).
  #
  # ```
  # require "uri"
  #
  # URI.encode_path_segment("foo;bar;baz")  # => "foo%3Bbar%3Bbaz"
  # URI.encode_path_segment("foo/bar/baz")  # => "foo%2Fbar%2Fbaz"
  # URI.encode_path_segment("foo,bar,baz")  # => "foo%2Cbar%2Cbaz"
  # URI.encode_path_segment("hello world!") # => "hello%20world%21"
  # URI.encode_path_segment("put: it+й")    # => "put%3A%20it%2B%D0%B9"
  # ```
  #
  # * `.decode` is the reverse operation.
  # * `.encode_path` encodes a path consisting of multiple segments, not escaping `/`.
  # * `.encode_www_form` escapes space character as `+`.
  def self.encode_path_segment(string : String) : String
    String.build { |io| encode_path_segment(io, string) }
  end

  # :ditto:
  def self.encode_path_segment(io : IO, string : String) : Nil
    self.encode(string, io) { |byte| URI.unreserved?(byte) }
  end

  # URL-decodes *string*.
  #
  # ```
  # require "uri"
  #
  # URI.decode("hello%20world!")                                 # => "hello world!"
  # URI.decode("put:%20it+%D0%B9")                               # => "put: it+й"
  # URI.decode("http://example.com/Crystal%20is%20awesome%20=)") # => "http://example.com/Crystal is awesome =)"
  # ```
  #
  # By default, `+` is decoded literally. If *plus_to_space* is `true`, `+` is
  # decoded as space character (`0x20`). Percent-encoded values such as `%20`
  # and `%2B` are always decoded as characters with the respective codepoint.
  #
  # ```
  # require "uri"
  #
  # URI.decode("peter+%2B+paul")                      # => "peter+++paul"
  # URI.decode("peter+%2B+paul", plus_to_space: true) # => "peter + paul"
  # ```
  #
  # * `.encode` is the reverse operation.
  # * `.decode_www_form` encodes plus to space by default.
  def self.decode(string : String, *, plus_to_space : Bool = false) : String
    if needs_decode?(string, plus_to_space: plus_to_space)
      String.build { |io| decode!(string, io, plus_to_space: plus_to_space) { false } }
    else
      string
    end
  end

  # URL-decodes a string and writes the result to *io*.
  #
  # See `.decode(string : String, *, plus_to_space : Bool = false) : String` for details.
  def self.decode(string : String, io : IO, *, plus_to_space : Bool = false) : Nil
    self.decode(string, io, plus_to_space: plus_to_space) { false }
  end

  # URL-encodes *string*.
  #
  # Reserved and unreserved characters are not escaped, so this only modifies some
  # special characters as well as non-ASCII characters. `.reserved?` and `.unreserved?`
  # provide more details on these character classes.
  #
  # ```
  # require "uri"
  #
  # URI.encode("hello world!")                             # => "hello%20world!"
  # URI.encode("put: it+й")                                # => "put:%20it+%D0%B9"
  # URI.encode("http://example.com/Crystal is awesome =)") # => "http://example.com/Crystal%20is%20awesome%20=)"
  # ```
  #
  # By default, the space character (`0x20`) is encoded as `%20` and `+` is
  # encoded literally. If *space_to_plus* is `true`, space character is encoded
  # as `+` and `+` is encoded as `%2B`:
  #
  # ```
  # require "uri"
  #
  # URI.encode("peter + paul")                      # => "peter%20+%20paul"
  # URI.encode("peter + paul", space_to_plus: true) # => "peter+%2B+paul"
  # ```
  #
  # * `.decode` is the reverse operation.
  # * `.encode_www_form` also escapes reserved characters.
  @[Deprecated("Use `.encode_path` instead.")]
  def self.encode(string : String, *, space_to_plus : Bool = false) : String
    String.build { |io| encode(string, io, space_to_plus: space_to_plus) }
  end

  # URL-encodes *string* and writes the result to *io*.
  #
  # See `.encode(string : String, *, space_to_plus : Bool = false) : String` for details.
  @[Deprecated("Use `.encode_path` instead.")]
  def self.encode(string : String, io : IO, *, space_to_plus : Bool = false) : Nil
    self.encode(string, io, space_to_plus: space_to_plus) { |byte| URI.reserved?(byte) || URI.unreserved?(byte) }
  end

  # URL-decodes *string* as [`x-www-form-urlencoded`](https://url.spec.whatwg.org/#urlencoded-serializing).
  #
  # ```
  # require "uri"
  #
  # URI.decode_www_form("hello%20world!")                           # => "hello world!"
  # URI.decode_www_form("put:%20it+%D0%B9")                         # => "put: it й"
  # URI.decode_www_form("http://example.com/Crystal+is+awesome+=)") # => "http://example.com/Crystal is awesome =)"
  # ```
  #
  # By default, `+` is decoded as space character (`0x20`). If *plus_to_space*
  # is `false`, `+` is decoded literally as `+`. Percent-encoded values such as
  # `%20` and `%2B` are always decoded as characters with the respective codepoint.
  #
  # ```
  # require "uri"
  #
  # URI.decode_www_form("peter+%2B+paul")                       # => "peter + paul"
  # URI.decode_www_form("peter+%2B+paul", plus_to_space: false) # => "peter+++paul"
  # ```
  #
  # * `.encode_www_form` is the reverse operation.
  # * `.decode` encodes plus literally by default.
  def self.decode_www_form(string : String, *, plus_to_space : Bool = true) : String
    decode(string, plus_to_space: plus_to_space)
  end

  # URL-decodes *string* as [`x-www-form-urlencoded`](https://url.spec.whatwg.org/#urlencoded-serializing)
  # and writes the result to *io*.
  #
  # See `self.decode_www_form(string : String, *, plus_to_space : Bool = true) : String`
  # for details.
  def self.decode_www_form(string : String, io : IO, *, plus_to_space : Bool = true) : Nil
    decode(string, io, plus_to_space: plus_to_space)
  end

  # URL-encodes *string* as [`x-www-form-urlencoded`](https://url.spec.whatwg.org/#urlencoded-serializing).
  #
  # Reserved characters are escaped, unreserved characters are not.
  # `.reserved?` and `.unreserved?` provide more details on these character
  # classes.
  #
  # ```
  # require "uri"
  #
  # URI.encode_www_form("hello world!")                             # => "hello+world%21"
  # URI.encode_www_form("put: it+й")                                # => "put%3A+it%2B%D0%B9"
  # URI.encode_www_form("http://example.com/Crystal is awesome =)") # => "http%3A%2F%2Fexample.com%2FCrystal+is+awesome+%3D%29"
  # ```
  #
  # The encoded string returned from this method can be used as name or value
  # components for a `application/x-www-form-urlencoded` format serialization.
  # `URI::Params` provides a higher-level API for this use case.
  #
  # By default, the space character (`0x20`) is encoded as `+` and `+` is encoded
  # as `%2B`. If *space_to_plus* is `false`, space character is encoded as `%20`
  # and `'+'` is encoded literally.
  #
  # ```
  # require "uri"
  #
  # URI.encode_www_form("peter + paul")                       # => "peter+%2B+paul"
  # URI.encode_www_form("peter + paul", space_to_plus: false) # => "peter%20%2B%20paul"
  # ```
  #
  # * `.decode_www_form` is the reverse operation.
  # * `.encode_path` escapes space character as `%20`.
  def self.encode_www_form(string : String, *, space_to_plus : Bool = true) : String
    String.build do |io|
      encode_www_form(string, io, space_to_plus: space_to_plus)
    end
  end

  # URL-encodes *string* as [`x-www-form-urlencoded`](https://url.spec.whatwg.org/#urlencoded-serializing)
  # and writes the result to *io*.
  #
  # See `.encode_www_form(string : String, *, space_to_plus : Bool = true)` for
  # details.
  def self.encode_www_form(string : String, io : IO, *, space_to_plus : Bool = true) : Nil
    encode(string, io, space_to_plus: space_to_plus) do |byte|
      URI.unreserved?(byte)
    end
  end

  # Returns whether given byte is reserved character defined in
  # [RFC 3986 §2.2](https://datatracker.ietf.org/doc/html/rfc3986#section-2.2).
  #
  # Reserved characters are ':', '/', '?', '#', '[', ']', '@', '!',
  # '$', '&', "'", '(', ')', '*', '+', ',', ';' and '='.
  def self.reserved?(byte) : Bool
    sub_delim?(byte) || gen_delim?(byte)
  end

  # :nodoc:
  # Returns `true` if the byte is URI gen-delims (https://datatracker.ietf.org/doc/html/rfc3986#section-2.2).
  def self.gen_delim?(byte)
    byte.unsafe_chr.in?('#', '/', ':', '?', '@', '[', ']')
  end

  # :nodoc:
  # Returns `true` if the byte is URI sub-delims (https://datatracker.ietf.org/doc/html/rfc3986#section-2.2).
  def self.sub_delim?(byte) : Bool
    char = byte.unsafe_chr
    '&' <= char <= ',' ||
      char.in?('!', '$', ';', '=')
  end

  # Returns whether given byte is unreserved character defined in
  # [RFC 3986 §2.3](https://datatracker.ietf.org/doc/html/rfc3986#section-2.3).
  #
  # Unreserved characters are ASCII letters, ASCII digits, `_`, `.`, `-` and `~`.
  def self.unreserved?(byte) : Bool
    char = byte.unsafe_chr
    char.ascii_alphanumeric? ||
      char.in?('_', '.', '-', '~')
  end

  # URL-decodes *string* and writes the result to *io*.
  #
  # The block is called for each percent-encoded ASCII character and determines
  # whether the value is to be decoded. When the return value is falsey,
  # the character is decoded. Non-ASCII characters are always decoded.
  #
  # By default, `+` is decoded literally. If *plus_to_space* is `true`, `+` is
  # decoded as space character (`0x20`).
  #
  # This method enables some customization, but typical use cases can be implemented
  # by either `.decode(string : String, *, plus_to_space : Bool = false) : String` or
  # `.decode_www_form(string : String, *, plus_to_space : Bool = true) : String`.
  def self.decode(string : String, io : IO, *, plus_to_space : Bool = false, &block) : Nil
    unless needs_decode?(string, plus_to_space: plus_to_space)
      io.write_string(string.to_slice)
      return
    end

    decode!(string, io, plus_to_space: plus_to_space) do |byte|
      yield byte
    end
  end

  # Similar to the above, but it assumes there will be chars to decode (% or +).
  private def self.decode!(string : String, io : IO, *, plus_to_space : Bool, &block) : Nil
    i = 0
    bytesize = string.bytesize

    if io.encoding == "UTF-8"
      target_io = io
    else
      io_memory = IO::Memory.new(bytesize)
      target_io = io_memory
    end

    while i < bytesize
      byte = string.to_unsafe[i]
      char = byte.unsafe_chr
      i = decode_one(string, bytesize, i, byte, char, target_io, plus_to_space) { |byte| yield byte }
    end

    if io_memory
      io.write_string(io_memory.to_slice)
    end
  end

  # URL-encodes *string* and writes the result to an `IO`.
  #
  # The block is called for each ascii character (codepoint less than `0x80`) and
  # determines whether the value is to be encoded. When the return value is falsey,
  # the character is encoded. Non-ASCII characters are always encoded.
  #
  # By default, the space character (`0x20`) is encoded as `%20` and `+` is
  # encoded literally. If *space_to_plus* is `true`, space character is encoded
  # as `+` and `+` is encoded as `%2B`.
  #
  # This method enables some customization, but typical use cases can be implemented
  # by either `.encode_path(string : String) : String`, `.encode_path_segment(string : String) : String` or
  # `.encode_www_form(string : String, *, space_to_plus : Bool = true) : String`.
  def self.encode(string : String, io : IO, space_to_plus : Bool = false, &block) : Nil
    string.each_byte do |byte|
      char = byte.unsafe_chr
      if char == ' ' && space_to_plus
        io << '+'
      elsif char.ascii? && yield(byte) && (!space_to_plus || char != '+')
        io << char
      else
        io << '%'
        io << '0' if byte < 16
        byte.to_s(io, 16, upcase: true)
      end
    end
    io
  end

  # :nodoc:
  def self.decode_one(string, bytesize, i, byte, char, io, plus_to_space = false)
    self.decode_one(string, bytesize, i, byte, char, io, plus_to_space) { false }
  end

  # :nodoc:
  # Unencodes one character. Private API
  def self.decode_one(string, bytesize, i, byte, char, io, plus_to_space = false, &)
    if plus_to_space && char == '+'
      io.write_byte ' '.ord.to_u8
      i += 1
      return i
    end

    if char == '%' && i < bytesize - 2
      i += 1
      first = string.to_unsafe[i]
      first_num = first.unsafe_chr.to_i? 16
      unless first_num
        io.write_byte byte
        return i
      end

      i += 1
      second = string.to_unsafe[i]
      second_num = second.unsafe_chr.to_i? 16
      unless second_num
        io.write_byte byte
        io.write_byte first
        return i
      end

      encoded = (first_num * 16 + second_num).to_u8
      i += 1
      if encoded < 0x80 && yield encoded
        io.write_byte byte
        io.write_byte first
        io.write_byte second
        return i
      end
      io.write_byte encoded
      return i
    end

    io.write_byte byte
    i += 1
    i
  end

  # Checks if there's any chars that actually needs decoding.
  # If not, decoding a string can be done much faster, because
  # it's just the same string.
  private def self.needs_decode?(string : String, *, plus_to_space : Bool) : Bool
    string.includes?('%') || (plus_to_space && string.includes?('+'))
  end
end