Skip to content

Commit ce1b6d6

Browse files
makenowjust authored and bcardiff committed
Fix String#scan behavior same as Ruby (#3877)
* Fix String#scan behavior same as Ruby

  For example, Ruby's String#scan is:

      "hello world".scan(/\w+|(?= )/) # => ["hello", "", "world"]

  But Crystal's is:

      "hello world".scan(/\w+|(?= )/).map &.[0] # => ["hello", ""]

  This commit fixes it by continuing to scan when the match is empty.

* Fix String#split behavior same as Ruby
1 parent ff314f9 commit ce1b6d6

File tree

2 files changed

+32
-37
lines changed

2 files changed

+32
-37
lines changed

spec/std/string_spec.cr

+3
Original file line numberDiff line numberDiff line change
@@ -810,7 +810,9 @@ describe "String" do
810810
assert { "foo,bar,baz,qux".split(/,/, 30).should eq(["foo", "bar", "baz", "qux"]) }
811811
assert { "a b c".split(Regex.new(" "), 2).should eq(["a", "b c"]) }
812812
assert { "日本ん語日本ん語".split(/ん/).should eq(["日本", "語日本", ""]) }
813+
assert { "九十九十九".split(/(?=十)/).should eq(["", "十九", "十九"]) }
813814
assert { "hello world".split(/\b/).should eq(["hello", " ", "world", ""]) }
815+
assert { "hello world".split(/\w+|(?= )/).should eq(["", " ", ""]) }
814816
assert { "abc".split(//).should eq(["a", "b", "c"]) }
815817
assert { "hello".split(/\w+/).should eq(["", ""]) }
816818
assert { "foo".split(/o/).should eq(["f", "", ""]) }
@@ -1674,6 +1676,7 @@ describe "String" do
16741676
it "works when match is empty" do
16751677
r = %r([\s,]*(~@|[\[\]{}()'`~^@]|"(?:\\.|[^\\"])*"|;.*|[^\s\[\]{}('"`,;)]*))
16761678
"hello".scan(r).map(&.[0]).should eq(["hello", ""])
1679+
"hello world".scan(/\w+|(?= )/).map(&.[0]).should eq(["hello", "", "world"])
16771680
end
16781681

16791682
it "works with strings with block" do

src/string.cr

+29-37
Original file line numberDiff line numberDiff line change
@@ -2852,41 +2852,32 @@ class String
28522852
end
28532853

28542854
count = 0
2855-
match_offset = 0
2856-
slice_offset = 0
2857-
last_slice_offset = 0
2855+
match_offset = slice_offset = 0
28582856

28592857
while match = separator.match_at_byte_index(self, match_offset)
28602858
index = match.byte_begin(0)
2861-
slice_size = index - slice_offset
28622859
match_bytesize = match[0].bytesize
2860+
next_offset = index + match_bytesize
28632861

2864-
if slice_offset == 0 && slice_size == 0 && match_bytesize == 0
2865-
# Skip
2866-
elsif slice_offset == bytesize && slice_size == 0
2867-
yield byte_slice(last_slice_offset)
2862+
if next_offset == slice_offset
2863+
match_offset = next_offset + char_bytesize_at(next_offset)
28682864
else
2865+
slice_size = index - slice_offset
2866+
28692867
yield byte_slice(slice_offset, slice_size)
2870-
end
2871-
count += 1
2868+
count += 1
28722869

2873-
1.upto(match.size) do |i|
2874-
if group = match[i]?
2875-
yield group
2870+
1.upto(match.size) do |i|
2871+
if group = match[i]?
2872+
yield group
2873+
end
28762874
end
2877-
end
2878-
2879-
last_slice_offset = slice_offset
28802875

2881-
if match_bytesize == 0
2882-
match_offset = index + 1
2883-
slice_offset = index
2884-
else
2885-
match_offset = index + match_bytesize
2886-
slice_offset = match_offset
2876+
slice_offset = match_offset = next_offset
28872877
end
2878+
28882879
break if limit && count + 1 == limit
2889-
break if slice_offset > bytesize
2880+
break if match_offset >= bytesize
28902881
end
28912882

28922883
yield byte_slice(slice_offset)
@@ -3208,7 +3199,7 @@ class String
32083199
$~ = match
32093200
yield match
32103201
match_bytesize = match[0].bytesize
3211-
break if match_bytesize == 0
3202+
match_bytesize += 1 if match_bytesize == 0
32123203
byte_offset = index + match_bytesize
32133204
end
32143205

@@ -3574,6 +3565,19 @@ class String
35743565
@bytesize == size
35753566
end
35763567

3568+
protected def char_bytesize_at(byte_index)
3569+
case unsafe_byte_at(byte_index)
3570+
when .< 0x80
3571+
1
3572+
when .< 0xe0
3573+
2
3574+
when .< 0xf0
3575+
3
3576+
else
3577+
4
3578+
end
3579+
end
3580+
35773581
protected def size_known?
35783582
@bytesize == 0 || @length > 0
35793583
end
@@ -3584,19 +3588,7 @@ class String
35843588

35853589
while byte_index < bytesize
35863590
yield byte_index, char_index
3587-
3588-
c = to_unsafe[byte_index]
3589-
3590-
if c < 0x80
3591-
byte_index += 1
3592-
elsif c < 0xe0
3593-
byte_index += 2
3594-
elsif c < 0xf0
3595-
byte_index += 3
3596-
else
3597-
byte_index += 4
3598-
end
3599-
3591+
byte_index += char_bytesize_at(byte_index)
36003592
char_index += 1
36013593
end
36023594

0 commit comments

Comments
 (0)