Add Char#titlecase for correct mixed-case transformations (#13539)

crystal-lang · Jun 24, 2023 · 5b8cee0 · 5b8cee0
1 parent eec0594
commit 5b8cee0
Show file tree

Hide file tree

Showing 9 changed files with 340 additions and 109 deletions.
diff --git a/scripts/generate_unicode_data.cr b/scripts/generate_unicode_data.cr
@@ -147,6 +147,7 @@ end
 
 entries = [] of Entry
 special_cases_downcase = [] of SpecialCase
+special_cases_titlecase = [] of SpecialCase
 special_cases_upcase = [] of SpecialCase
 special_cases_casefold = [] of SpecialCase
 casefold_mapping = Hash(Int32, Int32).new
@@ -200,6 +201,7 @@ body.each_line do |line|
   end
   upcase = pieces[12].to_i?(16)
   downcase = pieces[13].to_i?(16)
+  titlecase = pieces[14].to_i?(16)
   casefold = casefold_mapping[codepoint]?
   entries << Entry.new(
     codepoint: codepoint,
@@ -211,6 +213,9 @@ body.each_line do |line|
     downcase: downcase,
     casefold: casefold,
   )
+  if titlecase && titlecase != upcase
+    special_cases_titlecase << SpecialCase.new(codepoint, [titlecase, 0, 0])
+  end
 end
 
 url = "#{UCD_ROOT}SpecialCasing.txt"
@@ -223,22 +228,30 @@ body.each_line do |line|
 
   pieces = line.split(';')
   codepoint = pieces[0].to_i(16)
+
   downcase = pieces[1].split.map(&.to_i(16))
-  upcase = pieces[3].split.map(&.to_i(16))
-  downcase = nil if downcase.size == 1
-  upcase = nil if upcase.size == 1
-  if downcase
+  if downcase.size > 1
     while downcase.size < 3
       downcase << 0
     end
     special_cases_downcase << SpecialCase.new(codepoint, downcase)
   end
-  if upcase
+
+  upcase = pieces[3].split.map(&.to_i(16))
+  if upcase.size > 1
     while upcase.size < 3
       upcase << 0
     end
     special_cases_upcase << SpecialCase.new(codepoint, upcase)
   end
+
+  titlecase = pieces[2].split.map(&.to_i(16))
+  if titlecase.size > 1
+    while titlecase.size < 3
+      titlecase << 0
+    end
+    special_cases_titlecase << SpecialCase.new(codepoint, titlecase)
+  end
 end
 
 url = "#{UCD_ROOT}extracted/DerivedCombiningClass.txt"
@@ -282,6 +295,11 @@ upcase_ranges.select! { |r| r.delta != -1 }
 
 alternate_ranges = alternate_ranges(downcase_one_ranges)
 
+special_cases_downcase.sort_by! &.codepoint
+special_cases_upcase.sort_by! &.codepoint
+special_cases_titlecase.reject! &.in?(special_cases_upcase)
+special_cases_titlecase.sort_by! &.codepoint
+
 casefold_ranges = case_ranges entries, &.casefold
 
 all_strides = {} of String => Array(Stride)

diff --git a/scripts/unicode_data.ecr b/scripts/unicode_data.ecr
@@ -87,6 +87,17 @@ module Unicode
     data
   end
 
+  # Titlecase transformation that differs from the uppercase transformation.
+  # The maximum transformation is always 3 codepoints, so we store them all as 3
+  # codepoints and 0 means end.
+  private class_getter special_cases_titlecase : Hash(Int32, {Int32, Int32, Int32}) do
+    data = Hash(Int32, {Int32, Int32, Int32}).new(initial_capacity: <%= special_cases_titlecase.size %>)
+    <%- special_cases_titlecase.each do |a_case| -%>
+      put(data, <%= a_case.codepoint %>, <%= a_case.value.join(", ") %>)
+    <%- end %>
+    data
+  end
+
   # Fold case transformations that involve mapping a codepoint
   # to multiple codepoints. The maximum transformation is always 3
   # codepoints, so we store them all as 3 codepoints and 0 means end.

diff --git a/spec/std/char_spec.cr b/spec/std/char_spec.cr
@@ -4,24 +4,29 @@ require "spec/helpers/iterate"
 require "../support/string"
 
 describe "Char" do
-  describe "upcase" do
+  describe "#upcase" do
     it { 'a'.upcase.should eq('A') }
     it { '1'.upcase.should eq('1') }
+    it { assert_iterates_yielding ['F', 'F', 'L'], 'ﬄ'.upcase }
   end
 
-  describe "downcase" do
+  describe "#downcase" do
     it { 'A'.downcase.should eq('a') }
     it { '1'.downcase.should eq('1') }
-    it do
-      actual = [] of Char
-      'ß'.downcase(Unicode::CaseOptions::Fold) { |c| actual << c }
-      actual.should eq(['s', 's'])
-    end
+    it { assert_iterates_yielding ['i', '\u{0307}'], 'İ'.downcase }
+    it { assert_iterates_yielding ['s', 's'], 'ß'.downcase(Unicode::CaseOptions::Fold) }
     it { 'Ń'.downcase(Unicode::CaseOptions::Fold).should eq('ń') }
     it { 'ꭰ'.downcase(Unicode::CaseOptions::Fold).should eq('Ꭰ') }
     it { 'Ꭰ'.downcase(Unicode::CaseOptions::Fold).should eq('Ꭰ') }
   end
 
+  describe "#titlecase" do
+    it { 'a'.titlecase.should eq('A') }
+    it { '1'.titlecase.should eq('1') }
+    it { '\u{10D0}'.titlecase.should eq('\u{10D0}') } # GEORGIAN LETTER AN
+    it { assert_iterates_yielding ['F', 'f', 'l'], 'ﬄ'.titlecase }
+  end
+
   it "#succ" do
     'a'.succ.should eq('b')
     'あ'.succ.should eq('ぃ')
@@ -89,6 +94,13 @@ describe "Char" do
     it { ' '.lowercase?.should be_false }
   end
 
+  describe "#titlecase?" do
+    it { 'ǲ'.titlecase?.should be_true }
+    it { 'ᾈ'.titlecase?.should be_true }
+    it { 'A'.titlecase?.should be_false }
+    it { 'a'.titlecase?.should be_false }
+  end
+
   describe "ascii_letter?" do
     it { 'a'.ascii_letter?.should be_true }
     it { 'A'.ascii_letter?.should be_true }

diff --git a/spec/std/string_spec.cr b/spec/std/string_spec.cr
@@ -682,47 +682,36 @@ describe "String" do
   end
 
   describe "#capitalize" do
-    it { "HELLO!".capitalize.should eq("Hello!") }
-    it { "HELLO MAN!".capitalize.should eq("Hello man!") }
-    it { "".capitalize.should eq("") }
-    it { "ﬄİ".capitalize.should eq("FFLi̇") }
-    it { "iO".capitalize(Unicode::CaseOptions::Turkic).should eq("İo") }
+    it { assert_prints "HELLO!".capitalize, "Hello!" }
+    it { assert_prints "HELLO MAN!".capitalize, "Hello man!" }
+    it { assert_prints "".capitalize, "" }
+    it { assert_prints "iO".capitalize(Unicode::CaseOptions::Turkic), "İo" }
 
-    it "does not touch invalid code units in an otherwise ascii string" do
-      "\xB5!\xE0\xC1\xB5?".capitalize.should eq("\xB5!\xE0\xC1\xB5?")
+    it "handles multi-character mappings correctly (#13533)" do
+      assert_prints "ﬄİ".capitalize, "Ffli̇"
     end
 
-    describe "with IO" do
-      it { String.build { |io| "HELLO!".capitalize io }.should eq "Hello!" }
-      it { String.build { |io| "HELLO MAN!".capitalize io }.should eq "Hello man!" }
-      it { String.build { |io| "".capitalize io }.should be_empty }
-      it { String.build { |io| "ﬄİ".capitalize io }.should eq "FFLi̇" }
-      it { String.build { |io| "iO".capitalize io, Unicode::CaseOptions::Turkic }.should eq "İo" }
+    it "does not touch invalid code units in an otherwise ascii string" do
+      assert_prints "\xB5!\xE0\xC1\xB5?".capitalize, "\xB5!\xE0\xC1\xB5?"
     end
   end
 
   describe "#titleize" do
-    it { "hEllO tAb\tworld".titleize.should eq("Hello Tab\tWorld") }
-    it { "  spaces before".titleize.should eq("  Spaces Before") }
-    it { "testa-se muito".titleize.should eq("Testa-se Muito") }
-    it { "hÉllÕ tAb\tworld".titleize.should eq("Héllõ Tab\tWorld") }
-    it { "  spáçes before".titleize.should eq("  Spáçes Before") }
-    it { "testá-se múitô".titleize.should eq("Testá-se Múitô") }
-    it { "iO iO".titleize(Unicode::CaseOptions::Turkic).should eq("İo İo") }
+    it { assert_prints "hEllO tAb\tworld".titleize, "Hello Tab\tWorld" }
+    it { assert_prints "  spaces before".titleize, "  Spaces Before" }
+    it { assert_prints "testa-se muito".titleize, "Testa-se Muito" }
+    it { assert_prints "hÉllÕ tAb\tworld".titleize, "Héllõ Tab\tWorld" }
+    it { assert_prints "  spáçes before".titleize, "  Spáçes Before" }
+    it { assert_prints "testá-se múitô".titleize, "Testá-se Múitô" }
+    it { assert_prints "iO iO".titleize(Unicode::CaseOptions::Turkic), "İo İo" }
 
-    it "does not touch invalid code units in an otherwise ascii string" do
-      "\xB5!\xE0\xC1\xB5?".titleize.should eq("\xB5!\xE0\xC1\xB5?")
-      "a\xA0b".titleize.should eq("A\xA0b")
+    it "handles multi-character mappings correctly (#13533)" do
+      assert_prints "ﬄİ İﬄ ǳ Ǳ".titleize, "Ffli̇ İﬄ ǲ ǲ"
     end
 
-    describe "with IO" do
-      it { String.build { |io| "hEllO tAb\tworld".titleize io }.should eq "Hello Tab\tWorld" }
-      it { String.build { |io| "  spaces before".titleize io }.should eq "  Spaces Before" }
-      it { String.build { |io| "testa-se muito".titleize io }.should eq "Testa-se Muito" }
-      it { String.build { |io| "hÉllÕ tAb\tworld".titleize io }.should eq "Héllõ Tab\tWorld" }
-      it { String.build { |io| "  spáçes before".titleize io }.should eq "  Spáçes Before" }
-      it { String.build { |io| "testá-se múitô".titleize io }.should eq "Testá-se Múitô" }
-      it { String.build { |io| "iO iO".titleize io, Unicode::CaseOptions::Turkic }.should eq "İo İo" }
+    it "does not touch invalid code units in an otherwise ascii string" do
+      assert_prints "\xB5!\xE0\xC1\xB5?".titleize, "\xB5!\xE0\xC1\xB5?"
+      assert_prints "a\xA0b".titleize, "A\xA0b"
     end
   end
 
@@ -2194,24 +2183,18 @@ describe "String" do
   end
 
   describe "#camelcase" do
-    it { "foo".camelcase.should eq "Foo" }
-    it { "foo_bar".camelcase.should eq "FooBar" }
-    it { "foo".camelcase(lower: true).should eq "foo" }
-    it { "foo_bar".camelcase(lower: true).should eq "fooBar" }
-    it { "Foo".camelcase.should eq "Foo" }
-    it { "Foo_bar".camelcase.should eq "FooBar" }
-    it { "Foo".camelcase(lower: true).should eq "foo" }
-    it { "Foo_bar".camelcase(lower: true).should eq "fooBar" }
-
-    describe "with IO" do
-      it { String.build { |io| "foo".camelcase io }.should eq "Foo" }
-      it { String.build { |io| "foo_bar".camelcase io }.should eq "FooBar" }
-      it { String.build { |io| "foo".camelcase io, lower: true }.should eq "foo" }
-      it { String.build { |io| "foo_bar".camelcase io, lower: true }.should eq "fooBar" }
-      it { String.build { |io| "Foo".camelcase io }.should eq "Foo" }
-      it { String.build { |io| "Foo_bar".camelcase io }.should eq "FooBar" }
-      it { String.build { |io| "Foo".camelcase io, lower: true }.should eq "foo" }
-      it { String.build { |io| "Foo_bar".camelcase io, lower: true }.should eq "fooBar" }
+    it { assert_prints "foo".camelcase, "Foo" }
+    it { assert_prints "foo_bar".camelcase, "FooBar" }
+    it { assert_prints "foo".camelcase(lower: true), "foo" }
+    it { assert_prints "foo_bar".camelcase(lower: true), "fooBar" }
+    it { assert_prints "Foo".camelcase, "Foo" }
+    it { assert_prints "Foo_bar".camelcase, "FooBar" }
+    it { assert_prints "Foo".camelcase(lower: true), "foo" }
+    it { assert_prints "Foo_bar".camelcase(lower: true), "fooBar" }
+
+    it "handles multi-character mappings correctly (#13533)" do
+      assert_prints "ﬄ_xﬄ".camelcase, "FflXﬄ"
+      assert_prints "İ_xﬄ".camelcase(lower: true), "i̇Xﬄ"
     end
   end
 

diff --git a/spec/support/string.cr b/spec/support/string.cr
@@ -25,11 +25,13 @@ end
 # Given a call of the form `foo.bar(*args, **opts)`, tests the following cases:
 #
 # * This call itself should return a `String` equal to *str*.
-# * `String.build { |io| foo.bar(io, *args, **opts) }` should be equal to *str*.
-# * `string_build_via_utf16 { |io| foo.bar(io, *args, **opts) }` should be equal
-#   to *str*; that is, the `IO` overload should not fail when the `IO` argument
-#   uses a non-default encoding. This case is skipped if the `without_iconv`
-#   flag is set.
+# * `String.build { |io| foo.bar(io, *args, **opts) }` should be equal to
+#   `str.scrub`; writing to a `String::Builder` must not produce any invalid
+#   UTF-8 byte sequences.
+# * `string_build_via_utf16 { |io| foo.bar(io, *args, **opts) }` should also be
+#   equal to `str.scrub`; that is, the `IO` overload should not fail when the
+#   `IO` argument uses a non-default encoding. This case is skipped if the
+#   `without_iconv` flag is set.
 macro assert_prints(call, str, *, file = __FILE__, line = __LINE__)
   %str = ({{ str }}).as(String)
   %file = {{ file }}
@@ -45,7 +47,7 @@ macro assert_prints(call, str, *, file = __FILE__, line = __LINE__)
       {% for arg in call.args %} {{ arg }}, {% end %}
       {% if call.named_args %} {% for narg in call.named_args %} {{ narg.name }}: {{ narg.value }}, {% end %} {% end %}
     ) {{ call.block }}
-  end.should eq(%str), file: %file, line: %line
+  end.should eq(%str.scrub), file: %file, line: %line
 
   {% unless flag?(:without_iconv) %}
     string_build_via_utf16 do |io|
@@ -54,6 +56,6 @@ macro assert_prints(call, str, *, file = __FILE__, line = __LINE__)
         {% for arg in call.args %} {{ arg }}, {% end %}
         {% if call.named_args %} {% for narg in call.named_args %} {{ narg.name }}: {{ narg.value }}, {% end %} {% end %}
       ) {{ call.block }}
-    end.should eq(%str), file: %file, line: %line
+    end.should eq(%str.scrub), file: %file, line: %line
   {% end %}
 end
diff --git a/src/char.cr b/src/char.cr
@@ -197,6 +197,7 @@ struct Char
   # 'ç'.lowercase? # => true
   # 'G'.lowercase? # => false
   # '.'.lowercase? # => false
+  # 'ǲ'.lowercase? # => false
   # ```
   def lowercase? : Bool
     ascii? ? ascii_lowercase? : Unicode.lowercase?(self)
@@ -221,11 +222,24 @@ struct Char
   # 'Á'.uppercase? # => true
   # 'c'.uppercase? # => false
   # '.'.uppercase? # => false
+  # 'ǲ'.uppercase? # => false
   # ```
   def uppercase? : Bool
     ascii? ? ascii_uppercase? : Unicode.uppercase?(self)
   end
 
+  # Returns `true` if this char is a titlecase character, i.e. a ligature
+  # consisting of an uppercase letter followed by lowercase characters.
+  #
+  # ```
+  # 'ǲ'.titlecase? # => true
+  # 'H'.titlecase? # => false
+  # 'c'.titlecase? # => false
+  # ```
+  def titlecase? : Bool
+    !ascii? && Unicode.titlecase?(self)
+  end
+
   # Returns `true` if this char is an ASCII letter ('a' to 'z', 'A' to 'Z').
   #
   # ```
@@ -393,7 +407,7 @@ struct Char
   # characters, like 'İ', that when downcased result in multiple
   # characters (in this case: 'i' and the dot mark).
   #
-  # For a more correct method see the method that receives a block.
+  # For more correct behavior see the overload that receives a block.
   #
   # ```
   # 'Z'.downcase # => 'z'
@@ -449,7 +463,7 @@ struct Char
   # characters, like 'ﬄ', that when upcased result in multiple
   # characters (in this case: 'F', 'F', 'L').
   #
-  # For a more correct method see the method that receives a block.
+  # For more correct behavior see the overload that receives a block.
   #
   # ```
   # 'z'.upcase # => 'Z'
@@ -474,6 +488,49 @@ struct Char
     Unicode.upcase(self, options) { |char| yield char }
   end
 
+  # Returns the titlecase equivalent of this char.
+  #
+  # Usually this is equivalent to `#upcase`, but a few precomposed characters
+  # consisting of multiple letters may return a different character where only
+  # the first letter is uppercase and the rest lowercase.
+  #
+  # Note that this only works for characters whose titlecase
+  # equivalent yields a single codepoint. There are a few
+  # characters, like 'ﬄ', that when titlecased result in multiple
+  # characters (in this case: 'F', 'f', 'l').
+  #
+  # For more correct behavior see the overload that receives a block.
+  #
+  # ```
+  # 'z'.titlecase # => 'Z'
+  # 'X'.titlecase # => 'X'
+  # '.'.titlecase # => '.'
+  # 'Ǳ'.titlecase # => 'ǲ'
+  # 'ǳ'.titlecase # => 'ǲ'
+  # ```
+  def titlecase(options : Unicode::CaseOptions = :none) : Char
+    Unicode.titlecase(self, options)
+  end
+
+  # Yields each char for the titlecase equivalent of this char.
+  #
+  # Usually this is equivalent to `#upcase`, but a few precomposed characters
+  # consisting of multiple letters may yield a different character sequence
+  # where only the first letter is uppercase and the rest lowercase.
+  #
+  # This method takes into account the possibility that a titlecase
+  # version of a char might result in multiple chars, like for
+  # 'ﬄ', which results in 'F', 'f' and 'l'.
+  #
+  # ```
+  # 'z'.titlecase { |v| puts v } # prints 'Z'
+  # 'Ǳ'.titlecase { |v| puts v } # prints 'ǲ'
+  # 'ﬄ'.titlecase { |v| puts v } # prints 'F', 'f', 'l'
+  # ```
+  def titlecase(options : Unicode::CaseOptions = :none, &)
+    Unicode.titlecase(self, options) { |char| yield char }
+  end
+
   # See `Object#hash(hasher)`
   def hash(hasher)
     hasher.char(self)