Skip to content

Commit

Permalink
Merge pull request elixir-lang#566 from yrashk/unicode-sequences
Browse files Browse the repository at this point in the history
Add support for named sequences and make String module aware of it so it...
  • Loading branch information
José Valim committed Oct 24, 2012
2 parents dc169dc + 4717c9a commit 31e1965
Show file tree
Hide file tree
Showing 4 changed files with 541 additions and 42 deletions.
76 changes: 36 additions & 40 deletions lib/elixir/lib/string.ex
Expand Up @@ -2,7 +2,11 @@ defmodule String do
@moduledoc """
A string in Elixir is a utf-8 binary. This module
contains functions to work with utf-8 data and its
codepoints.
codepoints and graphemes.
The difference between codepoints and graphemes is that a codepoint
represents an individual character, whereas a grapheme may contain multiple
characters that are "perceived as a single character" by readers.
For working with raw binaries, use Erlang's :binary
module.
Expand Down Expand Up @@ -295,8 +299,11 @@ defmodule String do

defp do_codepoints(:no_codepoint), do: []

@doc false
defdelegate codepoint(string), to: String.Unicode

@doc """
Returns the first codepoint from an utf8 string.
Returns the first grapheme from a UTF-8 string.
## Examples
Expand All @@ -305,14 +312,14 @@ defmodule String do
"""
def first(string) do
case codepoint(string) do
case grapheme(string) do
{ char, _ } -> char
:no_codepoint -> ""
:no_sequence -> ""
end
end

@doc """
Returns the last codepoint from an utf8 string.
Returns the last grapheme from a UTF-8 string.
## Examples
Expand All @@ -321,17 +328,17 @@ defmodule String do
"""
def last(string) do
do_last(codepoint(string), "")
do_last(grapheme(string), "")
end

defp do_last({char, rest}, _) do
do_last(codepoint(rest), char)
do_last(grapheme(rest), char)
end

defp do_last(:no_codepoint, last_char), do: last_char
defp do_last(:no_sequence, last_char), do: last_char

@doc """
Returns the number of codepoint in an utf8 string.
Returns the number of Unicode graphemes in a UTF-8 string.
## Examples
Expand All @@ -340,17 +347,28 @@ defmodule String do
"""
def length(string) do
do_length(codepoint(string))
do_length(grapheme(string))
end

defp do_length({_, rest}) do
1 + do_length(codepoint(rest))
1 + do_length(grapheme(rest))
end
defp do_length(:no_sequence), do: 0

@doc """
Returns the Unicode graphemes in the string.
## Examples
String.graphemes("Ā̀stute") # => ["Ā̀","s","t","u","t","e"]
"""
defdelegate graphemes(string), to: String.Unicode, as: :sequences

defp do_length(:no_codepoint), do: 0
@doc false
defdelegate grapheme(string), to: String.Unicode, as: :sequence

@doc """
Returns the codepoint in the `position` of the given utf8 `string`.
Returns the grapheme at the given `position` of the given UTF-8 `string`.
If `position` is greater than or equal to the `string` length, then it returns an empty string (`""`).
## Examples
Expand All @@ -363,47 +381,25 @@ defmodule String do
"""
def at(string, position) when position >= 0 do
do_at(codepoint(string), position, 0)
do_at(grapheme(string), position, 0)
end

def at(string, position) when position < 0 do
real_pos = do_length(codepoint(string)) - abs(position)
real_pos = do_length(grapheme(string)) - abs(position)
case real_pos >= 0 do
true -> do_at(codepoint(string), real_pos, 0)
true -> do_at(grapheme(string), real_pos, 0)
false -> ""
end
end

defp do_at({_ , rest}, desired_pos, current_pos) when desired_pos > current_pos do
do_at(codepoint(rest), desired_pos, current_pos + 1)
do_at(grapheme(rest), desired_pos, current_pos + 1)
end

defp do_at({char, _}, desired_pos, current_pos) when desired_pos == current_pos do
char
end

defp do_at(:no_codepoint, _, _), do: ""

# Private implementation which returns the first codepoint
# of any given utf8 string and the rest of it
# If an empty string is given, :no_codepoint is returned.
defp codepoint(<<194, char, rest :: binary>>)
when char in 161..191,
do: { <<194, char>>, rest }

defp codepoint(<<first, char, rest :: binary>>)
when first in 195..223 and char in 128..191,
do: { <<first, char>>, rest }

defp codepoint(<<first, second, char, rest :: binary>>)
when first == 224 and second in 160..191 and char in 128..191,
do: { <<first, second, char>>, rest }

defp codepoint(<<first, second, char, rest :: binary>>)
when first in 225..239 and second in 128..191 and char in 128..191,
do: { <<first, second, char>>, rest }

defp codepoint(<<other, rest :: binary>>), do: { <<other>>, rest }
defp do_at(:no_sequence, _, _), do: ""

defp codepoint(<<>>), do: :no_codepoint
end

0 comments on commit 31e1965

Please sign in to comment.