Skip to content

Commit

Permalink
Add Char#titlecase for correct mixed-case transformations (#13539)
Browse files Browse the repository at this point in the history
  • Loading branch information
HertzDevil committed Jun 24, 2023
1 parent eec0594 commit 5b8cee0
Show file tree
Hide file tree
Showing 9 changed files with 340 additions and 109 deletions.
28 changes: 23 additions & 5 deletions scripts/generate_unicode_data.cr
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ end

entries = [] of Entry
special_cases_downcase = [] of SpecialCase
special_cases_titlecase = [] of SpecialCase
special_cases_upcase = [] of SpecialCase
special_cases_casefold = [] of SpecialCase
casefold_mapping = Hash(Int32, Int32).new
Expand Down Expand Up @@ -200,6 +201,7 @@ body.each_line do |line|
end
upcase = pieces[12].to_i?(16)
downcase = pieces[13].to_i?(16)
titlecase = pieces[14].to_i?(16)
casefold = casefold_mapping[codepoint]?
entries << Entry.new(
codepoint: codepoint,
Expand All @@ -211,6 +213,9 @@ body.each_line do |line|
downcase: downcase,
casefold: casefold,
)
if titlecase && titlecase != upcase
special_cases_titlecase << SpecialCase.new(codepoint, [titlecase, 0, 0])
end
end

url = "#{UCD_ROOT}SpecialCasing.txt"
Expand All @@ -223,22 +228,30 @@ body.each_line do |line|

pieces = line.split(';')
codepoint = pieces[0].to_i(16)

downcase = pieces[1].split.map(&.to_i(16))
upcase = pieces[3].split.map(&.to_i(16))
downcase = nil if downcase.size == 1
upcase = nil if upcase.size == 1
if downcase
if downcase.size > 1
while downcase.size < 3
downcase << 0
end
special_cases_downcase << SpecialCase.new(codepoint, downcase)
end
if upcase

upcase = pieces[3].split.map(&.to_i(16))
if upcase.size > 1
while upcase.size < 3
upcase << 0
end
special_cases_upcase << SpecialCase.new(codepoint, upcase)
end

titlecase = pieces[2].split.map(&.to_i(16))
if titlecase.size > 1
while titlecase.size < 3
titlecase << 0
end
special_cases_titlecase << SpecialCase.new(codepoint, titlecase)
end
end

url = "#{UCD_ROOT}extracted/DerivedCombiningClass.txt"
Expand Down Expand Up @@ -282,6 +295,11 @@ upcase_ranges.select! { |r| r.delta != -1 }

alternate_ranges = alternate_ranges(downcase_one_ranges)

# Order every special-case list by codepoint before it is emitted.
special_cases_downcase.sort_by! &.codepoint
special_cases_upcase.sort_by! &.codepoint
# Drop titlecase entries that also appear in `special_cases_upcase`, so the
# titlecase list keeps only what is not already covered by the upcase list.
special_cases_titlecase.reject! &.in?(special_cases_upcase)
special_cases_titlecase.sort_by! &.codepoint

casefold_ranges = case_ranges entries, &.casefold

all_strides = {} of String => Array(Stride)
Expand Down
11 changes: 11 additions & 0 deletions scripts/unicode_data.ecr
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,17 @@ module Unicode
data
end

# Titlecase transformations that differ from the uppercase transformation.
# A transformation is at most 3 codepoints long, so every entry is stored as
# exactly 3 codepoints, with 0 marking the end of shorter ones.
private class_getter special_cases_titlecase : Hash(Int32, {Int32, Int32, Int32}) do
data = Hash(Int32, {Int32, Int32, Int32}).new(initial_capacity: <%= special_cases_titlecase.size %>)
<%- special_cases_titlecase.each do |a_case| -%>
put(data, <%= a_case.codepoint %>, <%= a_case.value.join(", ") %>)
<%- end %>
data
end

# Fold case transformation that involve mapping a codepoint
# to multiple codepoints. The maximum transformation is always 3
# codepoints, so we store them all as 3 codepoints and 0 means end.
Expand Down
26 changes: 19 additions & 7 deletions spec/std/char_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,29 @@ require "spec/helpers/iterate"
require "../support/string"

describe "Char" do
describe "upcase" do
describe "#upcase" do
it { 'a'.upcase.should eq('A') }
it { '1'.upcase.should eq('1') }
it { assert_iterates_yielding ['F', 'F', 'L'], 'ffl'.upcase }
end

describe "downcase" do
describe "#downcase" do
it { 'A'.downcase.should eq('a') }
it { '1'.downcase.should eq('1') }
it do
actual = [] of Char
'ß'.downcase(Unicode::CaseOptions::Fold) { |c| actual << c }
actual.should eq(['s', 's'])
end
it { assert_iterates_yielding ['i', '\u{0307}'], 'İ'.downcase }
it { assert_iterates_yielding ['s', 's'], 'ß'.downcase(Unicode::CaseOptions::Fold) }
it { 'Ń'.downcase(Unicode::CaseOptions::Fold).should eq('ń') }
it { 'ꭰ'.downcase(Unicode::CaseOptions::Fold).should eq('Ꭰ') }
it { 'Ꭰ'.downcase(Unicode::CaseOptions::Fold).should eq('Ꭰ') }
end

# Specs for `Char#titlecase`: single-codepoint results are compared directly,
# multi-codepoint results are checked through the block-yielding overload.
describe "#titlecase" do
  it { 'a'.titlecase.should eq('A') }
  it { '1'.titlecase.should eq('1') }
  it { '\u{10D0}'.titlecase.should eq('\u{10D0}') } # GEORGIAN LETTER AN
  # U+FB04 LATIN SMALL LIGATURE FFL is one char whose titlecase form is three
  # chars; it is written as an escape so the literal stays a single codepoint.
  it { assert_iterates_yielding ['F', 'f', 'l'], '\u{FB04}'.titlecase }
end

it "#succ" do
'a'.succ.should eq('b')
'あ'.succ.should eq('ぃ')
Expand Down Expand Up @@ -89,6 +94,13 @@ describe "Char" do
it { ' '.lowercase?.should be_false }
end

# Specs for `Char#titlecase?`: only titlecase letters answer `true`; plain
# uppercase and lowercase letters do not.
describe "#titlecase?" do
  # U+01F2 LATIN CAPITAL LETTER D WITH SMALL LETTER Z, written as an escape so
  # the literal stays a single codepoint.
  it { '\u{01F2}'.titlecase?.should be_true }
  it { 'ᾈ'.titlecase?.should be_true }
  it { 'A'.titlecase?.should be_false }
  it { 'a'.titlecase?.should be_false }
end

describe "ascii_letter?" do
it { 'a'.ascii_letter?.should be_true }
it { 'A'.ascii_letter?.should be_true }
Expand Down
81 changes: 32 additions & 49 deletions spec/std/string_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -682,47 +682,36 @@ describe "String" do
end

describe "#capitalize" do
it { "HELLO!".capitalize.should eq("Hello!") }
it { "HELLO MAN!".capitalize.should eq("Hello man!") }
it { "".capitalize.should eq("") }
it { "fflİ".capitalize.should eq("FFLi̇") }
it { "iO".capitalize(Unicode::CaseOptions::Turkic).should eq("İo") }
it { assert_prints "HELLO!".capitalize, "Hello!" }
it { assert_prints "HELLO MAN!".capitalize, "Hello man!" }
it { assert_prints "".capitalize, "" }
it { assert_prints "iO".capitalize(Unicode::CaseOptions::Turkic), "İo" }

it "does not touch invalid code units in an otherwise ascii string" do
"\xB5!\xE0\xC1\xB5?".capitalize.should eq("\xB5!\xE0\xC1\xB5?")
# The input starts with U+FB04 (LATIN SMALL LIGATURE FFL), a single char whose
# titlecase form is the three chars "Ffl"; escapes keep the literal unambiguous.
it "handles multi-character mappings correctly (#13533)" do
  assert_prints "\u{FB04}İ".capitalize, "Ffli\u{0307}"
end

describe "with IO" do
it { String.build { |io| "HELLO!".capitalize io }.should eq "Hello!" }
it { String.build { |io| "HELLO MAN!".capitalize io }.should eq "Hello man!" }
it { String.build { |io| "".capitalize io }.should be_empty }
it { String.build { |io| "fflİ".capitalize io }.should eq "FFLi̇" }
it { String.build { |io| "iO".capitalize io, Unicode::CaseOptions::Turkic }.should eq "İo" }
it "does not touch invalid code units in an otherwise ascii string" do
assert_prints "\xB5!\xE0\xC1\xB5?".capitalize, "\xB5!\xE0\xC1\xB5?"
end
end

describe "#titleize" do
it { "hEllO tAb\tworld".titleize.should eq("Hello Tab\tWorld") }
it { " spaces before".titleize.should eq(" Spaces Before") }
it { "testa-se muito".titleize.should eq("Testa-se Muito") }
it { "hÉllÕ tAb\tworld".titleize.should eq("Héllõ Tab\tWorld") }
it { " spáçes before".titleize.should eq(" Spáçes Before") }
it { "testá-se múitô".titleize.should eq("Testá-se Múitô") }
it { "iO iO".titleize(Unicode::CaseOptions::Turkic).should eq("İo İo") }
it { assert_prints "hEllO tAb\tworld".titleize, "Hello Tab\tWorld" }
it { assert_prints " spaces before".titleize, " Spaces Before" }
it { assert_prints "testa-se muito".titleize, "Testa-se Muito" }
it { assert_prints "hÉllÕ tAb\tworld".titleize, "Héllõ Tab\tWorld" }
it { assert_prints " spáçes before".titleize, " Spáçes Before" }
it { assert_prints "testá-se múitô".titleize, "Testá-se Múitô" }
it { assert_prints "iO iO".titleize(Unicode::CaseOptions::Turkic), "İo İo" }

it "does not touch invalid code units in an otherwise ascii string" do
"\xB5!\xE0\xC1\xB5?".titleize.should eq("\xB5!\xE0\xC1\xB5?")
"a\xA0b".titleize.should eq("A\xA0b")
# Words, in order:
# * U+FB04 (ffl ligature) + İ: the ligature titlecases to "Ffl", İ downcases
#   to 'i' followed by U+0307.
# * İ + U+FB04: the ligature is already lowercase and stays as it is.
# * U+01F3 (small dz digraph) and U+01F1 (capital DZ digraph): both titlecase
#   to U+01F2.
it "handles multi-character mappings correctly (#13533)" do
  assert_prints "\u{FB04}İ İ\u{FB04} \u{01F3} \u{01F1}".titleize, "Ffli\u{0307} İ\u{FB04} \u{01F2} \u{01F2}"
end

describe "with IO" do
it { String.build { |io| "hEllO tAb\tworld".titleize io }.should eq "Hello Tab\tWorld" }
it { String.build { |io| " spaces before".titleize io }.should eq " Spaces Before" }
it { String.build { |io| "testa-se muito".titleize io }.should eq "Testa-se Muito" }
it { String.build { |io| "hÉllÕ tAb\tworld".titleize io }.should eq "Héllõ Tab\tWorld" }
it { String.build { |io| " spáçes before".titleize io }.should eq " Spáçes Before" }
it { String.build { |io| "testá-se múitô".titleize io }.should eq "Testá-se Múitô" }
it { String.build { |io| "iO iO".titleize io, Unicode::CaseOptions::Turkic }.should eq "İo İo" }
it "does not touch invalid code units in an otherwise ascii string" do
assert_prints "\xB5!\xE0\xC1\xB5?".titleize, "\xB5!\xE0\xC1\xB5?"
assert_prints "a\xA0b".titleize, "A\xA0b"
end
end

Expand Down Expand Up @@ -2194,24 +2183,18 @@ describe "String" do
end

describe "#camelcase" do
it { "foo".camelcase.should eq "Foo" }
it { "foo_bar".camelcase.should eq "FooBar" }
it { "foo".camelcase(lower: true).should eq "foo" }
it { "foo_bar".camelcase(lower: true).should eq "fooBar" }
it { "Foo".camelcase.should eq "Foo" }
it { "Foo_bar".camelcase.should eq "FooBar" }
it { "Foo".camelcase(lower: true).should eq "foo" }
it { "Foo_bar".camelcase(lower: true).should eq "fooBar" }

describe "with IO" do
it { String.build { |io| "foo".camelcase io }.should eq "Foo" }
it { String.build { |io| "foo_bar".camelcase io }.should eq "FooBar" }
it { String.build { |io| "foo".camelcase io, lower: true }.should eq "foo" }
it { String.build { |io| "foo_bar".camelcase io, lower: true }.should eq "fooBar" }
it { String.build { |io| "Foo".camelcase io }.should eq "Foo" }
it { String.build { |io| "Foo_bar".camelcase io }.should eq "FooBar" }
it { String.build { |io| "Foo".camelcase io, lower: true }.should eq "foo" }
it { String.build { |io| "Foo_bar".camelcase io, lower: true }.should eq "fooBar" }
it { assert_prints "foo".camelcase, "Foo" }
it { assert_prints "foo_bar".camelcase, "FooBar" }
it { assert_prints "foo".camelcase(lower: true), "foo" }
it { assert_prints "foo_bar".camelcase(lower: true), "fooBar" }
it { assert_prints "Foo".camelcase, "Foo" }
it { assert_prints "Foo_bar".camelcase, "FooBar" }
it { assert_prints "Foo".camelcase(lower: true), "foo" }
it { assert_prints "Foo_bar".camelcase(lower: true), "fooBar" }

# U+FB04 (LATIN SMALL LIGATURE FFL) expands to "Ffl" when it starts the
# result, and is left untouched when it is not in a capitalized position.
it "handles multi-character mappings correctly (#13533)" do
  assert_prints "\u{FB04}_x\u{FB04}".camelcase, "FflX\u{FB04}"
  assert_prints "İ_x\u{FB04}".camelcase(lower: true), "i\u{0307}X\u{FB04}"
end
end

Expand Down
16 changes: 9 additions & 7 deletions spec/support/string.cr
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,13 @@ end
# Given a call of the form `foo.bar(*args, **opts)`, tests the following cases:
#
# * This call itself should return a `String` equal to *str*.
# * `String.build { |io| foo.bar(io, *args, **opts) }` should be equal to *str*.
# * `string_build_via_utf16 { |io| foo.bar(io, *args, **opts) }` should be equal
# to *str*; that is, the `IO` overload should not fail when the `IO` argument
# uses a non-default encoding. This case is skipped if the `without_iconv`
# flag is set.
# * `String.build { |io| foo.bar(io, *args, **opts) }` should be equal to
# `str.scrub`; writing to a `String::Builder` must not produce any invalid
# UTF-8 byte sequences.
# * `string_build_via_utf16 { |io| foo.bar(io, *args, **opts) }` should also be
# equal to `str.scrub`; that is, the `IO` overload should not fail when the
# `IO` argument uses a non-default encoding. This case is skipped if the
# `without_iconv` flag is set.
macro assert_prints(call, str, *, file = __FILE__, line = __LINE__)
%str = ({{ str }}).as(String)
%file = {{ file }}
Expand All @@ -45,7 +47,7 @@ macro assert_prints(call, str, *, file = __FILE__, line = __LINE__)
{% for arg in call.args %} {{ arg }}, {% end %}
{% if call.named_args %} {% for narg in call.named_args %} {{ narg.name }}: {{ narg.value }}, {% end %} {% end %}
) {{ call.block }}
end.should eq(%str), file: %file, line: %line
end.should eq(%str.scrub), file: %file, line: %line

{% unless flag?(:without_iconv) %}
string_build_via_utf16 do |io|
Expand All @@ -54,6 +56,6 @@ macro assert_prints(call, str, *, file = __FILE__, line = __LINE__)
{% for arg in call.args %} {{ arg }}, {% end %}
{% if call.named_args %} {% for narg in call.named_args %} {{ narg.name }}: {{ narg.value }}, {% end %} {% end %}
) {{ call.block }}
end.should eq(%str), file: %file, line: %line
end.should eq(%str.scrub), file: %file, line: %line
{% end %}
end
61 changes: 59 additions & 2 deletions src/char.cr
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ struct Char
# 'ç'.lowercase? # => true
# 'G'.lowercase? # => false
# '.'.lowercase? # => false
# 'ǲ'.lowercase? # => false
# ```
def lowercase? : Bool
ascii? ? ascii_lowercase? : Unicode.lowercase?(self)
Expand All @@ -221,11 +222,24 @@ struct Char
# 'Á'.uppercase? # => true
# 'c'.uppercase? # => false
# '.'.uppercase? # => false
# 'ǲ'.uppercase? # => false
# ```
def uppercase? : Bool
ascii? ? ascii_uppercase? : Unicode.uppercase?(self)
end

# Returns `true` if this char is a titlecase character, i.e. a ligature
# consisting of an uppercase letter followed by lowercase characters.
#
# Always returns `false` for ASCII chars; only non-ASCII chars are checked
# through `Unicode.titlecase?`.
#
# ```
# 'ǲ'.titlecase? # => true
# 'H'.titlecase? # => false
# 'c'.titlecase? # => false
# ```
def titlecase? : Bool
!ascii? && Unicode.titlecase?(self)
end

# Returns `true` if this char is an ASCII letter ('a' to 'z', 'A' to 'Z').
#
# ```
Expand Down Expand Up @@ -393,7 +407,7 @@ struct Char
# characters, like 'İ', that when downcased result in multiple
# characters (in this case: 'i' and the combining dot above).
#
# For a more correct method see the method that receives a block.
# For more correct behavior see the overload that receives a block.
#
# ```
# 'Z'.downcase # => 'z'
Expand Down Expand Up @@ -449,7 +463,7 @@ struct Char
# characters, like 'ﬄ', that when upcased result in multiple
# characters (in this case: 'F', 'F', 'L').
#
# For a more correct method see the method that receives a block.
# For more correct behavior see the overload that receives a block.
#
# ```
# 'z'.upcase # => 'Z'
Expand All @@ -474,6 +488,49 @@ struct Char
Unicode.upcase(self, options) { |char| yield char }
end

# Returns the titlecase equivalent of this char.
#
# Usually this is equivalent to `#upcase`, but a few precomposed characters
# consisting of multiple letters may return a different character where only
# the first letter is uppercase and the rest lowercase.
#
# Note that this only works for characters whose titlecase
# equivalent yields a single codepoint. There are a few
# characters, like 'ﬄ', that when titlecased result in multiple
# characters (in this case: 'F', 'f', 'l').
#
# For more correct behavior see the overload that receives a block.
#
# *options* is passed through unchanged to `Unicode.titlecase`.
#
# ```
# 'z'.titlecase # => 'Z'
# 'X'.titlecase # => 'X'
# '.'.titlecase # => '.'
# 'Ǳ'.titlecase # => 'ǲ'
# 'ǳ'.titlecase # => 'ǲ'
# ```
def titlecase(options : Unicode::CaseOptions = :none) : Char
Unicode.titlecase(self, options)
end

# Yields each char for the titlecase equivalent of this char.
#
# Usually this is equivalent to `#upcase`, but a few precomposed characters
# consisting of multiple letters may yield a different character sequence
# where only the first letter is uppercase and the rest lowercase.
#
# This method takes into account the possibility that a titlecase
# version of a char might result in multiple chars, like for
# 'ﬄ', which results in 'F', 'f' and 'l'.
#
# *options* is passed through unchanged to `Unicode.titlecase`.
#
# ```
# 'z'.titlecase { |v| puts v } # prints 'Z'
# 'Ǳ'.titlecase { |v| puts v } # prints 'ǲ'
# 'ﬄ'.titlecase { |v| puts v } # prints 'F', 'f', 'l'
# ```
def titlecase(options : Unicode::CaseOptions = :none, &)
Unicode.titlecase(self, options) { |char| yield char }
end

# See `Object#hash(hasher)`
def hash(hasher)
hasher.char(self)
Expand Down
Loading

0 comments on commit 5b8cee0

Please sign in to comment.