-
-
Notifications
You must be signed in to change notification settings - Fork 6.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
lib: better optimized casecompare() and ncasecompare() #16311
Conversation
Less 'jne` or `je` CPU instructions.
Compare Clang 19.1.0 with -O3 casecompare_old: casecompare_new:
movzx edx, byte ptr [rdi] movzx ecx, byte ptr [rdi]
test dl, dl test cl, cl
je .LBB1_1 je .LBB2_5
inc rdi inc rdi
lea rcx, [rip + touppermap] lea rax, [rip + touppermap]
.LBB1_3: .LBB2_2:
movzx r8d, byte ptr [rsi] movzx ecx, cl
test r8, r8 movzx ecx, byte ptr [rcx + rax]
setne al movzx edx, byte ptr [rsi]
je .LBB1_7 cmp cl, byte ptr [rdx + rax]
movzx edx, dl jne .LBB2_3
movzx edx, byte ptr [rdx + rcx] inc rsi
cmp dl, byte ptr [r8 + rcx] movzx ecx, byte ptr [rdi]
jne .LBB1_5 inc rdi
inc rsi test cl, cl
movzx edx, byte ptr [rdi] jne .LBB2_2
inc rdi .LBB2_5:
test dl, dl xor eax, eax
jne .LBB1_3 cmp byte ptr [rsi], 0
jmp .LBB1_7 sete al
.LBB1_1: ret
mov al, 1 .LBB2_3:
.LBB1_7: xor eax, eax
cmp byte ptr [rsi], 0 ret
setne cl
xor cl, al
movzx eax, cl
ret
.LBB1_5:
xor eax, eax
ret ncasecompare_old: ncasecompare_new:
movzx eax, byte ptr [rdi] movzx ecx, byte ptr [rdi]
test al, al test cl, cl
je .LBB3_1 sete al
inc rdi test rdx, rdx
lea rcx, [rip + touppermap] sete r8b
.LBB3_3: or r8b, al
movzx r8d, byte ptr [rsi] jne .LBB4_6
test r8, r8 lea r8, [rdx - 1]
je .LBB3_9 xor eax, eax
test rdx, rdx lea r9, [rip + touppermap]
je .LBB3_9 xor r10d, r10d
movzx eax, al .LBB4_2:
movzx eax, byte ptr [rax + rcx] movzx ecx, cl
cmp al, byte ptr [r8 + rcx] movzx ecx, byte ptr [rcx + r9]
jne .LBB3_6 movzx r11d, byte ptr [rsi + r10]
dec rdx cmp cl, byte ptr [r11 + r9]
inc rsi jne .LBB4_9
movzx eax, byte ptr [rdi] movzx ecx, byte ptr [rdi + r10 + 1]
inc rdi lea r11, [r10 + 1]
test al, al test cl, cl
jne .LBB3_3 je .LBB4_5
xor eax, eax cmp r8, r10
.LBB3_9: mov r10, r11
movzx eax, al jne .LBB4_2
test rdx, rdx .LBB4_5:
je .LBB3_11 sub rdx, r11
.LBB3_12: add rsi, r11
lea rcx, [rip + touppermap] .LBB4_6:
movzx edx, byte ptr [rax + rcx] test rdx, rdx
movzx esi, byte ptr [rsi] je .LBB4_7
xor eax, eax movzx eax, cl
cmp dl, byte ptr [rsi + rcx] lea rcx, [rip + touppermap]
sete al movzx edx, byte ptr [rax + rcx]
ret movzx esi, byte ptr [rsi]
.LBB3_1: xor eax, eax
xor eax, eax cmp dl, byte ptr [rsi + rcx]
test rdx, rdx sete al
jne .LBB3_12 .LBB4_9:
.LBB3_11: ret
mov eax, 1 .LBB4_7:
ret mov eax, 1
.LBB3_6: ret
xor eax, eax
ret |
My gut reaction was that removing the check of |
Analysis of PR #16311 at 4fe08e05: Test 2048 failed, which has NOT been flaky recently, so there could be a real issue in this PR. Test 363 failed, which has NOT been flaky recently, so there could be a real issue in this PR. Generated by Testclutch |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice!
lgtm.🎉 |
Thanks! |
Less
jne
orje
CPU instructions.Compare CGG 14.2 with -O3