diff --git a/scripts/generate_unicode_data.cr b/scripts/generate_unicode_data.cr index f360f27b76e6..610eae8cc2dd 100644 --- a/scripts/generate_unicode_data.cr +++ b/scripts/generate_unicode_data.cr @@ -147,6 +147,7 @@ end entries = [] of Entry special_cases_downcase = [] of SpecialCase +special_cases_titlecase = [] of SpecialCase special_cases_upcase = [] of SpecialCase special_cases_casefold = [] of SpecialCase casefold_mapping = Hash(Int32, Int32).new @@ -200,6 +201,7 @@ body.each_line do |line| end upcase = pieces[12].to_i?(16) downcase = pieces[13].to_i?(16) + titlecase = pieces[14].to_i?(16) casefold = casefold_mapping[codepoint]? entries << Entry.new( codepoint: codepoint, @@ -211,6 +213,9 @@ body.each_line do |line| downcase: downcase, casefold: casefold, ) + if titlecase && titlecase != upcase + special_cases_titlecase << SpecialCase.new(codepoint, [titlecase, 0, 0]) + end end url = "#{UCD_ROOT}SpecialCasing.txt" @@ -223,22 +228,30 @@ body.each_line do |line| pieces = line.split(';') codepoint = pieces[0].to_i(16) + downcase = pieces[1].split.map(&.to_i(16)) - upcase = pieces[3].split.map(&.to_i(16)) - downcase = nil if downcase.size == 1 - upcase = nil if upcase.size == 1 - if downcase + if downcase.size > 1 while downcase.size < 3 downcase << 0 end special_cases_downcase << SpecialCase.new(codepoint, downcase) end - if upcase + + upcase = pieces[3].split.map(&.to_i(16)) + if upcase.size > 1 while upcase.size < 3 upcase << 0 end special_cases_upcase << SpecialCase.new(codepoint, upcase) end + + titlecase = pieces[2].split.map(&.to_i(16)) + if titlecase.size > 1 + while titlecase.size < 3 + titlecase << 0 + end + special_cases_titlecase << SpecialCase.new(codepoint, titlecase) + end end url = "#{UCD_ROOT}extracted/DerivedCombiningClass.txt" @@ -282,6 +295,11 @@ upcase_ranges.select! { |r| r.delta != -1 } alternate_ranges = alternate_ranges(downcase_one_ranges) +special_cases_downcase.sort_by! &.codepoint +special_cases_upcase.sort_by! 
&.codepoint +special_cases_titlecase.reject! &.in?(special_cases_upcase) +special_cases_titlecase.sort_by! &.codepoint + casefold_ranges = case_ranges entries, &.casefold all_strides = {} of String => Array(Stride) diff --git a/scripts/unicode_data.ecr b/scripts/unicode_data.ecr index 499a2fb42836..c3144c8281c8 100644 --- a/scripts/unicode_data.ecr +++ b/scripts/unicode_data.ecr @@ -87,6 +87,17 @@ module Unicode data end + # Titlecase transformation that differs from the uppercase transformation. + # The maximum transformation is always 3 codepoints, so we store them all as 3 + # codepoints and 0 means end. + private class_getter special_cases_titlecase : Hash(Int32, {Int32, Int32, Int32}) do + data = Hash(Int32, {Int32, Int32, Int32}).new(initial_capacity: <%= special_cases_titlecase.size %>) + <%- special_cases_titlecase.each do |a_case| -%> + put(data, <%= a_case.codepoint %>, <%= a_case.value.join(", ") %>) + <%- end %> + data + end + # Fold case transformation that involve mapping a codepoint # to multiple codepoints. The maximum transformation is always 3 # codepoints, so we store them all as 3 codepoints and 0 means end. 
diff --git a/spec/std/char_spec.cr b/spec/std/char_spec.cr index d9e9ef8b4671..2c5ea0345252 100644 --- a/spec/std/char_spec.cr +++ b/spec/std/char_spec.cr @@ -4,24 +4,29 @@ require "spec/helpers/iterate" require "../support/string" describe "Char" do - describe "upcase" do + describe "#upcase" do it { 'a'.upcase.should eq('A') } it { '1'.upcase.should eq('1') } + it { assert_iterates_yielding ['F', 'F', 'L'], 'ffl'.upcase } end - describe "downcase" do + describe "#downcase" do it { 'A'.downcase.should eq('a') } it { '1'.downcase.should eq('1') } - it do - actual = [] of Char - 'ß'.downcase(Unicode::CaseOptions::Fold) { |c| actual << c } - actual.should eq(['s', 's']) - end + it { assert_iterates_yielding ['i', '\u{0307}'], 'İ'.downcase } + it { assert_iterates_yielding ['s', 's'], 'ß'.downcase(Unicode::CaseOptions::Fold) } it { 'Ń'.downcase(Unicode::CaseOptions::Fold).should eq('ń') } it { 'ꭰ'.downcase(Unicode::CaseOptions::Fold).should eq('Ꭰ') } it { 'Ꭰ'.downcase(Unicode::CaseOptions::Fold).should eq('Ꭰ') } end + describe "#titlecase" do + it { 'a'.titlecase.should eq('A') } + it { '1'.titlecase.should eq('1') } + it { '\u{10D0}'.titlecase.should eq('\u{10D0}') } # GEORGIAN LETTER AN + it { assert_iterates_yielding ['F', 'f', 'l'], 'ffl'.titlecase } + end + it "#succ" do 'a'.succ.should eq('b') 'あ'.succ.should eq('ぃ') @@ -89,6 +94,13 @@ describe "Char" do it { ' '.lowercase?.should be_false } end + describe "#titlecase?" do + it { 'Dz'.titlecase?.should be_true } + it { 'ᾈ'.titlecase?.should be_true } + it { 'A'.titlecase?.should be_false } + it { 'a'.titlecase?.should be_false } + end + describe "ascii_letter?" 
do it { 'a'.ascii_letter?.should be_true } it { 'A'.ascii_letter?.should be_true } diff --git a/spec/std/string_spec.cr b/spec/std/string_spec.cr index f5ac06188dc6..7a9f587126f3 100644 --- a/spec/std/string_spec.cr +++ b/spec/std/string_spec.cr @@ -682,47 +682,36 @@ describe "String" do end describe "#capitalize" do - it { "HELLO!".capitalize.should eq("Hello!") } - it { "HELLO MAN!".capitalize.should eq("Hello man!") } - it { "".capitalize.should eq("") } - it { "fflİ".capitalize.should eq("FFLi̇") } - it { "iO".capitalize(Unicode::CaseOptions::Turkic).should eq("İo") } + it { assert_prints "HELLO!".capitalize, "Hello!" } + it { assert_prints "HELLO MAN!".capitalize, "Hello man!" } + it { assert_prints "".capitalize, "" } + it { assert_prints "iO".capitalize(Unicode::CaseOptions::Turkic), "İo" } - it "does not touch invalid code units in an otherwise ascii string" do - "\xB5!\xE0\xC1\xB5?".capitalize.should eq("\xB5!\xE0\xC1\xB5?") + it "handles multi-character mappings correctly (#13533)" do + assert_prints "fflİ".capitalize, "Ffli̇" end - describe "with IO" do - it { String.build { |io| "HELLO!".capitalize io }.should eq "Hello!" } - it { String.build { |io| "HELLO MAN!".capitalize io }.should eq "Hello man!" } - it { String.build { |io| "".capitalize io }.should be_empty } - it { String.build { |io| "fflİ".capitalize io }.should eq "FFLi̇" } - it { String.build { |io| "iO".capitalize io, Unicode::CaseOptions::Turkic }.should eq "İo" } + it "does not touch invalid code units in an otherwise ascii string" do + assert_prints "\xB5!\xE0\xC1\xB5?".capitalize, "\xB5!\xE0\xC1\xB5?" 
end end describe "#titleize" do - it { "hEllO tAb\tworld".titleize.should eq("Hello Tab\tWorld") } - it { " spaces before".titleize.should eq(" Spaces Before") } - it { "testa-se muito".titleize.should eq("Testa-se Muito") } - it { "hÉllÕ tAb\tworld".titleize.should eq("Héllõ Tab\tWorld") } - it { " spáçes before".titleize.should eq(" Spáçes Before") } - it { "testá-se múitô".titleize.should eq("Testá-se Múitô") } - it { "iO iO".titleize(Unicode::CaseOptions::Turkic).should eq("İo İo") } + it { assert_prints "hEllO tAb\tworld".titleize, "Hello Tab\tWorld" } + it { assert_prints " spaces before".titleize, " Spaces Before" } + it { assert_prints "testa-se muito".titleize, "Testa-se Muito" } + it { assert_prints "hÉllÕ tAb\tworld".titleize, "Héllõ Tab\tWorld" } + it { assert_prints " spáçes before".titleize, " Spáçes Before" } + it { assert_prints "testá-se múitô".titleize, "Testá-se Múitô" } + it { assert_prints "iO iO".titleize(Unicode::CaseOptions::Turkic), "İo İo" } - it "does not touch invalid code units in an otherwise ascii string" do - "\xB5!\xE0\xC1\xB5?".titleize.should eq("\xB5!\xE0\xC1\xB5?") - "a\xA0b".titleize.should eq("A\xA0b") + it "handles multi-character mappings correctly (#13533)" do + assert_prints "fflİ İffl dz DZ".titleize, "Ffli̇ İffl Dz Dz" end - describe "with IO" do - it { String.build { |io| "hEllO tAb\tworld".titleize io }.should eq "Hello Tab\tWorld" } - it { String.build { |io| " spaces before".titleize io }.should eq " Spaces Before" } - it { String.build { |io| "testa-se muito".titleize io }.should eq "Testa-se Muito" } - it { String.build { |io| "hÉllÕ tAb\tworld".titleize io }.should eq "Héllõ Tab\tWorld" } - it { String.build { |io| " spáçes before".titleize io }.should eq " Spáçes Before" } - it { String.build { |io| "testá-se múitô".titleize io }.should eq "Testá-se Múitô" } - it { String.build { |io| "iO iO".titleize io, Unicode::CaseOptions::Turkic }.should eq "İo İo" } + it "does not touch invalid code units in an otherwise 
ascii string" do + assert_prints "\xB5!\xE0\xC1\xB5?".titleize, "\xB5!\xE0\xC1\xB5?" + assert_prints "a\xA0b".titleize, "A\xA0b" end end @@ -2194,24 +2183,18 @@ describe "String" do end describe "#camelcase" do - it { "foo".camelcase.should eq "Foo" } - it { "foo_bar".camelcase.should eq "FooBar" } - it { "foo".camelcase(lower: true).should eq "foo" } - it { "foo_bar".camelcase(lower: true).should eq "fooBar" } - it { "Foo".camelcase.should eq "Foo" } - it { "Foo_bar".camelcase.should eq "FooBar" } - it { "Foo".camelcase(lower: true).should eq "foo" } - it { "Foo_bar".camelcase(lower: true).should eq "fooBar" } - - describe "with IO" do - it { String.build { |io| "foo".camelcase io }.should eq "Foo" } - it { String.build { |io| "foo_bar".camelcase io }.should eq "FooBar" } - it { String.build { |io| "foo".camelcase io, lower: true }.should eq "foo" } - it { String.build { |io| "foo_bar".camelcase io, lower: true }.should eq "fooBar" } - it { String.build { |io| "Foo".camelcase io }.should eq "Foo" } - it { String.build { |io| "Foo_bar".camelcase io }.should eq "FooBar" } - it { String.build { |io| "Foo".camelcase io, lower: true }.should eq "foo" } - it { String.build { |io| "Foo_bar".camelcase io, lower: true }.should eq "fooBar" } + it { assert_prints "foo".camelcase, "Foo" } + it { assert_prints "foo_bar".camelcase, "FooBar" } + it { assert_prints "foo".camelcase(lower: true), "foo" } + it { assert_prints "foo_bar".camelcase(lower: true), "fooBar" } + it { assert_prints "Foo".camelcase, "Foo" } + it { assert_prints "Foo_bar".camelcase, "FooBar" } + it { assert_prints "Foo".camelcase(lower: true), "foo" } + it { assert_prints "Foo_bar".camelcase(lower: true), "fooBar" } + + it "handles multi-character mappings correctly (#13533)" do + assert_prints "ffl_xffl".camelcase, "FflXffl" + assert_prints "İ_xffl".camelcase(lower: true), "i̇Xffl" end end diff --git a/spec/support/string.cr b/spec/support/string.cr index 45a8dc476e9c..be2306bdd509 100644 --- 
a/spec/support/string.cr +++ b/spec/support/string.cr @@ -25,11 +25,13 @@ end # Given a call of the form `foo.bar(*args, **opts)`, tests the following cases: # # * This call itself should return a `String` equal to *str*. -# * `String.build { |io| foo.bar(io, *args, **opts) }` should be equal to *str*. -# * `string_build_via_utf16 { |io| foo.bar(io, *args, **opts) }` should be equal -# to *str*; that is, the `IO` overload should not fail when the `IO` argument -# uses a non-default encoding. This case is skipped if the `without_iconv` -# flag is set. +# * `String.build { |io| foo.bar(io, *args, **opts) }` should be equal to +# `str.scrub`; writing to a `String::Builder` must not produce any invalid +# UTF-8 byte sequences. +# * `string_build_via_utf16 { |io| foo.bar(io, *args, **opts) }` should also be +# equal to `str.scrub`; that is, the `IO` overload should not fail when the +# `IO` argument uses a non-default encoding. This case is skipped if the +# `without_iconv` flag is set. macro assert_prints(call, str, *, file = __FILE__, line = __LINE__) %str = ({{ str }}).as(String) %file = {{ file }} @@ -45,7 +47,7 @@ macro assert_prints(call, str, *, file = __FILE__, line = __LINE__) {% for arg in call.args %} {{ arg }}, {% end %} {% if call.named_args %} {% for narg in call.named_args %} {{ narg.name }}: {{ narg.value }}, {% end %} {% end %} ) {{ call.block }} - end.should eq(%str), file: %file, line: %line + end.should eq(%str.scrub), file: %file, line: %line {% unless flag?(:without_iconv) %} string_build_via_utf16 do |io| @@ -54,6 +56,6 @@ macro assert_prints(call, str, *, file = __FILE__, line = __LINE__) {% for arg in call.args %} {{ arg }}, {% end %} {% if call.named_args %} {% for narg in call.named_args %} {{ narg.name }}: {{ narg.value }}, {% end %} {% end %} ) {{ call.block }} - end.should eq(%str), file: %file, line: %line + end.should eq(%str.scrub), file: %file, line: %line {% end %} end diff --git a/src/char.cr b/src/char.cr index 
2a6b912db118..761d4187c243 100644 --- a/src/char.cr +++ b/src/char.cr @@ -197,6 +197,7 @@ struct Char # 'ç'.lowercase? # => true # 'G'.lowercase? # => false # '.'.lowercase? # => false + # 'Dz'.lowercase? # => false # ``` def lowercase? : Bool ascii? ? ascii_lowercase? : Unicode.lowercase?(self) @@ -221,11 +222,24 @@ struct Char # 'Á'.uppercase? # => true # 'c'.uppercase? # => false # '.'.uppercase? # => false + # 'Dz'.uppercase? # => false # ``` def uppercase? : Bool ascii? ? ascii_uppercase? : Unicode.uppercase?(self) end + # Returns `true` if this char is a titlecase character, i.e. a ligature + # consisting of an uppercase letter followed by lowercase characters. + # + # ``` + # 'Dz'.titlecase? # => true + # 'H'.titlecase? # => false + # 'c'.titlecase? # => false + # ``` + def titlecase? : Bool + !ascii? && Unicode.titlecase?(self) + end + # Returns `true` if this char is an ASCII letter ('a' to 'z', 'A' to 'Z'). # # ``` @@ -393,7 +407,7 @@ struct Char # characters, like 'İ', than when downcased result in multiple # characters (in this case: 'I' and the dot mark). # - # For a more correct method see the method that receives a block. + # For more correct behavior see the overload that receives a block. # # ``` # 'Z'.downcase # => 'z' @@ -449,7 +463,7 @@ struct Char # characters, like 'ffl', than when upcased result in multiple # characters (in this case: 'F', 'F', 'L'). # - # For a more correct method see the method that receives a block. + # For more correct behavior see the overload that receives a block. # # ``` # 'z'.upcase # => 'Z' @@ -474,6 +488,49 @@ struct Char Unicode.upcase(self, options) { |char| yield char } end + # Returns the titlecase equivalent of this char. + # + # Usually this is equivalent to `#upcase`, but a few precomposed characters + # consisting of multiple letters may return a different character where only + # the first letter is uppercase and the rest lowercase. 
+ # + # Note that this only works for characters whose titlecase + # equivalent yields a single codepoint. There are a few + # characters, like 'ffl', that when titlecased result in multiple + # characters (in this case: 'F', 'f', 'l'). + # + # For more correct behavior see the overload that receives a block. + # + # ``` + # 'z'.titlecase # => 'Z' + # 'X'.titlecase # => 'X' + # '.'.titlecase # => '.' + # 'DZ'.titlecase # => 'Dz' + # 'dz'.titlecase # => 'Dz' + # ``` + def titlecase(options : Unicode::CaseOptions = :none) : Char + Unicode.titlecase(self, options) + end + + # Yields each char for the titlecase equivalent of this char. + # + # Usually this is equivalent to `#upcase`, but a few precomposed characters + # consisting of multiple letters may yield a different character sequence + # where only the first letter is uppercase and the rest lowercase. + # + # This method takes into account the possibility that a titlecase + # version of a char might result in multiple chars, like for + # 'ffl', which results in 'F', 'f' and 'l'. + # + # ``` + # 'z'.titlecase { |v| puts v } # prints 'Z' + # 'DZ'.titlecase { |v| puts v } # prints 'Dz' + # 'ffl'.titlecase { |v| puts v } # prints 'F', 'f', 'l' + # ``` + def titlecase(options : Unicode::CaseOptions = :none, &) + Unicode.titlecase(self, options) { |char| yield char } + end + # See `Object#hash(hasher)` def hash(hasher) hasher.char(self) diff --git a/src/string.cr b/src/string.cr index 9e67c9debb20..7c9eed3dd186 100644 --- a/src/string.cr +++ b/src/string.cr @@ -1499,7 +1499,7 @@ class String def capitalize(io : IO, options : Unicode::CaseOptions = :none) : Nil each_char_with_index do |char, i| if i.zero? - char.upcase(options) { |c| io << c } + char.titlecase(options) { |c| io << c } else char.downcase(options) { |c| io << c } end @@ -1551,8 +1551,11 @@ class String upcase_next = true each_char_with_index do |char, i| - replaced_char = upcase_next ? 
char.upcase(options) : char.downcase(options) - io << replaced_char + if upcase_next + char.titlecase(options) { |c| io << c } + else + char.downcase(options) { |c| io << c } + end upcase_next = char.whitespace? end end @@ -4356,11 +4359,15 @@ class String each_char do |char| if first - io << (lower ? char.downcase(options) : char.upcase(options)) + if lower + char.downcase(options) { |c| io << c } + else + char.titlecase(options) { |c| io << c } + end elsif char == '_' last_is_underscore = true elsif last_is_underscore - io << char.upcase(options) + char.titlecase(options) { |c| io << c } last_is_underscore = false else io << char diff --git a/src/unicode/data.cr b/src/unicode/data.cr index 5fe4b621c8a5..050c6c8836eb 100644 --- a/src/unicode/data.cr +++ b/src/unicode/data.cr @@ -2651,23 +2651,11 @@ module Unicode private class_getter special_cases_upcase : Hash(Int32, {Int32, Int32, Int32}) do data = Hash(Int32, {Int32, Int32, Int32}).new(initial_capacity: 102) put(data, 223, 83, 83, 0) - put(data, 64256, 70, 70, 0) - put(data, 64257, 70, 73, 0) - put(data, 64258, 70, 76, 0) - put(data, 64259, 70, 70, 73) - put(data, 64260, 70, 70, 76) - put(data, 64261, 83, 84, 0) - put(data, 64262, 83, 84, 0) - put(data, 1415, 1333, 1362, 0) - put(data, 64275, 1348, 1350, 0) - put(data, 64276, 1348, 1333, 0) - put(data, 64277, 1348, 1339, 0) - put(data, 64278, 1358, 1350, 0) - put(data, 64279, 1348, 1341, 0) put(data, 329, 700, 78, 0) + put(data, 496, 74, 780, 0) put(data, 912, 921, 776, 769) put(data, 944, 933, 776, 769) - put(data, 496, 74, 780, 0) + put(data, 1415, 1333, 1362, 0) put(data, 7830, 72, 817, 0) put(data, 7831, 84, 776, 0) put(data, 7832, 87, 778, 0) @@ -2677,18 +2665,6 @@ module Unicode put(data, 8018, 933, 787, 768) put(data, 8020, 933, 787, 769) put(data, 8022, 933, 787, 834) - put(data, 8118, 913, 834, 0) - put(data, 8134, 919, 834, 0) - put(data, 8146, 921, 776, 768) - put(data, 8147, 921, 776, 769) - put(data, 8150, 921, 834, 0) - put(data, 8151, 921, 776, 
834) - put(data, 8162, 933, 776, 768) - put(data, 8163, 933, 776, 769) - put(data, 8164, 929, 787, 0) - put(data, 8166, 933, 834, 0) - put(data, 8167, 933, 776, 834) - put(data, 8182, 937, 834, 0) put(data, 8064, 7944, 921, 0) put(data, 8065, 7945, 921, 0) put(data, 8066, 7946, 921, 0) @@ -2737,21 +2713,135 @@ module Unicode put(data, 8109, 8045, 921, 0) put(data, 8110, 8046, 921, 0) put(data, 8111, 8047, 921, 0) - put(data, 8115, 913, 921, 0) - put(data, 8124, 913, 921, 0) - put(data, 8131, 919, 921, 0) - put(data, 8140, 919, 921, 0) - put(data, 8179, 937, 921, 0) - put(data, 8188, 937, 921, 0) put(data, 8114, 8122, 921, 0) + put(data, 8115, 913, 921, 0) put(data, 8116, 902, 921, 0) + put(data, 8118, 913, 834, 0) + put(data, 8119, 913, 834, 921) + put(data, 8124, 913, 921, 0) put(data, 8130, 8138, 921, 0) + put(data, 8131, 919, 921, 0) put(data, 8132, 905, 921, 0) + put(data, 8134, 919, 834, 0) + put(data, 8135, 919, 834, 921) + put(data, 8140, 919, 921, 0) + put(data, 8146, 921, 776, 768) + put(data, 8147, 921, 776, 769) + put(data, 8150, 921, 834, 0) + put(data, 8151, 921, 776, 834) + put(data, 8162, 933, 776, 768) + put(data, 8163, 933, 776, 769) + put(data, 8164, 929, 787, 0) + put(data, 8166, 933, 834, 0) + put(data, 8167, 933, 776, 834) put(data, 8178, 8186, 921, 0) + put(data, 8179, 937, 921, 0) put(data, 8180, 911, 921, 0) - put(data, 8119, 913, 834, 921) - put(data, 8135, 919, 834, 921) + put(data, 8182, 937, 834, 0) put(data, 8183, 937, 834, 921) + put(data, 8188, 937, 921, 0) + put(data, 64256, 70, 70, 0) + put(data, 64257, 70, 73, 0) + put(data, 64258, 70, 76, 0) + put(data, 64259, 70, 70, 73) + put(data, 64260, 70, 70, 76) + put(data, 64261, 83, 84, 0) + put(data, 64262, 83, 84, 0) + put(data, 64275, 1348, 1350, 0) + put(data, 64276, 1348, 1333, 0) + put(data, 64277, 1348, 1339, 0) + put(data, 64278, 1358, 1350, 0) + put(data, 64279, 1348, 1341, 0) + + data + end + + # Titlecase transformation that differs from the uppercase transformation. 
+ # The maximum transformation is always 3 codepoints, so we store them all as 3 + # codepoints and 0 means end. + private class_getter special_cases_titlecase : Hash(Int32, {Int32, Int32, Int32}) do + data = Hash(Int32, {Int32, Int32, Int32}).new(initial_capacity: 81) + put(data, 223, 83, 115, 0) + put(data, 452, 453, 0, 0) + put(data, 453, 453, 0, 0) + put(data, 454, 453, 0, 0) + put(data, 455, 456, 0, 0) + put(data, 456, 456, 0, 0) + put(data, 457, 456, 0, 0) + put(data, 458, 459, 0, 0) + put(data, 459, 459, 0, 0) + put(data, 460, 459, 0, 0) + put(data, 497, 498, 0, 0) + put(data, 498, 498, 0, 0) + put(data, 499, 498, 0, 0) + put(data, 1415, 1333, 1410, 0) + put(data, 4304, 4304, 0, 0) + put(data, 4305, 4305, 0, 0) + put(data, 4306, 4306, 0, 0) + put(data, 4307, 4307, 0, 0) + put(data, 4308, 4308, 0, 0) + put(data, 4309, 4309, 0, 0) + put(data, 4310, 4310, 0, 0) + put(data, 4311, 4311, 0, 0) + put(data, 4312, 4312, 0, 0) + put(data, 4313, 4313, 0, 0) + put(data, 4314, 4314, 0, 0) + put(data, 4315, 4315, 0, 0) + put(data, 4316, 4316, 0, 0) + put(data, 4317, 4317, 0, 0) + put(data, 4318, 4318, 0, 0) + put(data, 4319, 4319, 0, 0) + put(data, 4320, 4320, 0, 0) + put(data, 4321, 4321, 0, 0) + put(data, 4322, 4322, 0, 0) + put(data, 4323, 4323, 0, 0) + put(data, 4324, 4324, 0, 0) + put(data, 4325, 4325, 0, 0) + put(data, 4326, 4326, 0, 0) + put(data, 4327, 4327, 0, 0) + put(data, 4328, 4328, 0, 0) + put(data, 4329, 4329, 0, 0) + put(data, 4330, 4330, 0, 0) + put(data, 4331, 4331, 0, 0) + put(data, 4332, 4332, 0, 0) + put(data, 4333, 4333, 0, 0) + put(data, 4334, 4334, 0, 0) + put(data, 4335, 4335, 0, 0) + put(data, 4336, 4336, 0, 0) + put(data, 4337, 4337, 0, 0) + put(data, 4338, 4338, 0, 0) + put(data, 4339, 4339, 0, 0) + put(data, 4340, 4340, 0, 0) + put(data, 4341, 4341, 0, 0) + put(data, 4342, 4342, 0, 0) + put(data, 4343, 4343, 0, 0) + put(data, 4344, 4344, 0, 0) + put(data, 4345, 4345, 0, 0) + put(data, 4346, 4346, 0, 0) + put(data, 4349, 4349, 0, 0) + put(data, 
4350, 4350, 0, 0) + put(data, 4351, 4351, 0, 0) + put(data, 8114, 8122, 837, 0) + put(data, 8116, 902, 837, 0) + put(data, 8119, 913, 834, 837) + put(data, 8130, 8138, 837, 0) + put(data, 8132, 905, 837, 0) + put(data, 8135, 919, 834, 837) + put(data, 8178, 8186, 837, 0) + put(data, 8180, 911, 837, 0) + put(data, 8183, 937, 834, 837) + put(data, 64256, 70, 102, 0) + put(data, 64257, 70, 105, 0) + put(data, 64258, 70, 108, 0) + put(data, 64259, 70, 102, 105) + put(data, 64260, 70, 102, 108) + put(data, 64261, 83, 116, 0) + put(data, 64262, 83, 116, 0) + put(data, 64275, 1348, 1398, 0) + put(data, 64276, 1348, 1381, 0) + put(data, 64277, 1348, 1387, 0) + put(data, 64278, 1358, 1398, 0) + put(data, 64279, 1348, 1389, 0) data end diff --git a/src/unicode/unicode.cr b/src/unicode/unicode.cr index 491a702c68ed..224d5c59a042 100644 --- a/src/unicode/unicode.cr +++ b/src/unicode/unicode.cr @@ -298,6 +298,52 @@ module Unicode end # :nodoc: + def self.titlecase(char : Char, options : CaseOptions) : Char + result = check_upcase_ascii(char, options) + return result if result + + result = check_upcase_turkic(char, options) + return result if result + + # there are no ASCII or Turkic special cases for titlecasing; this is the + # only part that differs from `.upcase` + result = special_cases_titlecase[char.ord]? + return result.first.unsafe_chr if result && result[1] == 0 && result[2] == 0 + + check_upcase_ranges(char) + end + + # :nodoc: + def self.titlecase(char : Char, options : CaseOptions, &) + result = check_upcase_ascii(char, options) + if result + yield result + return + end + + result = check_upcase_turkic(char, options) + if result + yield result + return + end + + # there are no ASCII or Turkic special cases for titlecasing; this is the + # only part that differs from `.upcase` + result = special_cases_titlecase[char.ord]? + if result + result.each { |c| yield c.unsafe_chr if c != 0 } + return + end + + result = special_cases_upcase[char.ord]? 
+ if result + result.each { |c| yield c.unsafe_chr if c != 0 } + return + end + + yield check_upcase_ranges(char) + end + def self.foldcase(char : Char, options : CaseOptions) : Char results = check_foldcase(char, options) return results[0].unsafe_chr if results && results.size == 1 @@ -336,6 +382,11 @@ module Unicode in_category?(char.ord, category_Lu) end + # :nodoc: + def self.titlecase?(char : Char) : Bool + in_category?(char.ord, category_Lt) + end + # :nodoc: def self.letter?(char : Char) : Bool in_any_category?(char.ord, category_Lu, category_Ll, category_Lt, category_Lm, category_Lo)