Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

lib: better optimized casecompare() and ncasecompare() #16311

Closed
wants to merge 1 commit into from

Conversation

sergio-nsk
Copy link
Contributor

@sergio-nsk sergio-nsk commented Feb 13, 2025

Less jne or je CPU instructions.
Compare CGG 14.2 with -O3

casecompare_old:                                     casecompare_new:
        movzx   edx, BYTE PTR [rdi]                          jmp     .L22
        test    dl, dl                               .L16:
        jne     .L4                                          movzx   edx, BYTE PTR [rsi]
        jmp     .L11                                         movzx   eax, BYTE PTR touppermap[rax]
.L7:                                                         cmp     BYTE PTR touppermap[rdx], al
        movzx   ecx, BYTE PTR touppermap[rdx]                jne     .L17
        cmp     BYTE PTR touppermap[rax], cl                 add     rdi, 1
        jne     .L8                                          add     rsi, 1
        movzx   edx, BYTE PTR [rdi+1]                .L22:
        add     rdi, 1                                       movzx   eax, BYTE PTR [rdi]
        lea     rax, [rsi+1]                                 test    al, al
        test    dl, dl                                       jne     .L16
        je      .L12                                         xor     eax, eax
        mov     rsi, rax                                     cmp     BYTE PTR [rsi], 0
.L4:                                                         sete    al
        movzx   eax, BYTE PTR [rsi]                          ret
        test    al, al                               .L17:
        jne     .L7                                          xor     eax, eax
        mov     edx, 1                                       ret
.L5:
        test    al, al
        sete    al
        xor     eax, edx
        movzx   eax, al
        ret
.L8:
        xor     eax, eax
        ret
.L12:
        movzx   eax, BYTE PTR [rsi+1]
        test    al, al
        sete    al
        xor     eax, edx
        movzx   eax, al
        ret
.L11:
        movzx   eax, BYTE PTR [rsi]
        jmp     .L5
ncasecompare_old:                                     ncasecompare_new:
        jmp     .L37                                          jmp     .L50
.L38:                                                 .L51:
        movzx   ecx, BYTE PTR [rsi]                           test    rdx, rdx
        test    cl, cl                                        je      .L45
        je      .L25                                          movzx   ecx, BYTE PTR [rsi]
        test    rdx, rdx                                      movzx   eax, BYTE PTR touppermap[rax]
        je      .L29                                          cmp     BYTE PTR touppermap[rcx], al
        movzx   eax, BYTE PTR touppermap[rax]                 jne     .L44
        cmp     BYTE PTR touppermap[rcx], al                  sub     rdx, 1
        jne     .L28                                          add     rdi, 1
        sub     rdx, 1                                        add     rsi, 1
        add     rdi, 1                                .L50:
        add     rsi, 1                                        movzx   eax, BYTE PTR [rdi]
.L37:                                                         test    al, al
        movzx   eax, BYTE PTR [rdi]                           jne     .L51
        test    al, al                                        test    rdx, rdx
        jne     .L38                                          je      .L45
.L25:                                                         movzx   eax, BYTE PTR [rsi]
        test    rdx, rdx                                      cmp     BYTE PTR touppermap[rax], 0
        je      .L29                                          sete    al
        movzx   edx, BYTE PTR [rsi]                           movzx   eax, al
        movzx   eax, BYTE PTR touppermap[rax]                 ret
        cmp     BYTE PTR touppermap[rdx], al          .L45:
        sete    al                                            mov     eax, 1
        movzx   eax, al                                       ret
        ret                                           .L44:
.L29:                                                         xor     eax, eax
        mov     eax, 1                                        ret
        ret
.L28:
        xor     eax, eax
        ret

Less 'jne` or `je` CPU instructions.
@sergio-nsk
Copy link
Contributor Author

Compare Clang 19.1.0 with -O3

casecompare_old:                                   casecompare_new:
        movzx   edx, byte ptr [rdi]                        movzx   ecx, byte ptr [rdi]
        test    dl, dl                                     test    cl, cl
        je      .LBB1_1                                    je      .LBB2_5
        inc     rdi                                        inc     rdi
        lea     rcx, [rip + touppermap]                    lea     rax, [rip + touppermap]
.LBB1_3:                                           .LBB2_2:
        movzx   r8d, byte ptr [rsi]                        movzx   ecx, cl
        test    r8, r8                                     movzx   ecx, byte ptr [rcx + rax]
        setne   al                                         movzx   edx, byte ptr [rsi]
        je      .LBB1_7                                    cmp     cl, byte ptr [rdx + rax]
        movzx   edx, dl                                    jne     .LBB2_3
        movzx   edx, byte ptr [rdx + rcx]                  inc     rsi
        cmp     dl, byte ptr [r8 + rcx]                    movzx   ecx, byte ptr [rdi]
        jne     .LBB1_5                                    inc     rdi
        inc     rsi                                        test    cl, cl
        movzx   edx, byte ptr [rdi]                        jne     .LBB2_2
        inc     rdi                                .LBB2_5:
        test    dl, dl                                     xor     eax, eax
        jne     .LBB1_3                                    cmp     byte ptr [rsi], 0
        jmp     .LBB1_7                                    sete    al
.LBB1_1:                                                   ret
        mov     al, 1                              .LBB2_3:
.LBB1_7:                                                   xor     eax, eax
        cmp     byte ptr [rsi], 0                          ret
        setne   cl
        xor     cl, al
        movzx   eax, cl
        ret
.LBB1_5:
        xor     eax, eax
        ret
ncasecompare_old:                                  ncasecompare_new:
        movzx   eax, byte ptr [rdi]                        movzx   ecx, byte ptr [rdi]
        test    al, al                                     test    cl, cl
        je      .LBB3_1                                    sete    al
        inc     rdi                                        test    rdx, rdx
        lea     rcx, [rip + touppermap]                    sete    r8b
.LBB3_3:                                                   or      r8b, al
        movzx   r8d, byte ptr [rsi]                        jne     .LBB4_6
        test    r8, r8                                     lea     r8, [rdx - 1]
        je      .LBB3_9                                    xor     eax, eax
        test    rdx, rdx                                   lea     r9, [rip + touppermap]
        je      .LBB3_9                                    xor     r10d, r10d
        movzx   eax, al                            .LBB4_2:
        movzx   eax, byte ptr [rax + rcx]                  movzx   ecx, cl
        cmp     al, byte ptr [r8 + rcx]                    movzx   ecx, byte ptr [rcx + r9]
        jne     .LBB3_6                                    movzx   r11d, byte ptr [rsi + r10]
        dec     rdx                                        cmp     cl, byte ptr [r11 + r9]
        inc     rsi                                        jne     .LBB4_9
        movzx   eax, byte ptr [rdi]                        movzx   ecx, byte ptr [rdi + r10 + 1]
        inc     rdi                                        lea     r11, [r10 + 1]
        test    al, al                                     test    cl, cl
        jne     .LBB3_3                                    je      .LBB4_5
        xor     eax, eax                                   cmp     r8, r10
.LBB3_9:                                                   mov     r10, r11
        movzx   eax, al                                    jne     .LBB4_2
        test    rdx, rdx                           .LBB4_5:
        je      .LBB3_11                                   sub     rdx, r11
.LBB3_12:                                                  add     rsi, r11
        lea     rcx, [rip + touppermap]            .LBB4_6:
        movzx   edx, byte ptr [rax + rcx]                  test    rdx, rdx
        movzx   esi, byte ptr [rsi]                        je      .LBB4_7
        xor     eax, eax                                   movzx   eax, cl
        cmp     dl, byte ptr [rsi + rcx]                   lea     rcx, [rip + touppermap]
        sete    al                                         movzx   edx, byte ptr [rax + rcx]
        ret                                                movzx   esi, byte ptr [rsi]
.LBB3_1:                                                   xor     eax, eax
        xor     eax, eax                                   cmp     dl, byte ptr [rsi + rcx]
        test    rdx, rdx                                   sete    al
        jne     .LBB3_12                           .LBB4_9:
.LBB3_11:                                                  ret
        mov     eax, 1                             .LBB4_7:
        ret                                                mov     eax, 1
.LBB3_6:                                                   ret
        xor     eax, eax
        ret

@dfandrich
Copy link
Contributor

My gut reaction was that removing the check of *second could lead to a buffer read overflow, but the failed comparison between *first and *second will exit the loop before anything bad happens. I can't think of a case where this optimization would fail.

@testclutch
Copy link

Analysis of PR #16311 at 4fe08e05:

Test 2048 failed, which has NOT been flaky recently, so there could be a real issue in this PR.

Test 363 failed, which has NOT been flaky recently, so there could be a real issue in this PR.

Generated by Testclutch

Copy link
Member

@bagder bagder left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice!

@icing
Copy link
Contributor

icing commented Feb 13, 2025

lgtm.🎉

@bagder bagder closed this in c134181 Feb 13, 2025
@bagder
Copy link
Member

bagder commented Feb 13, 2025

Thanks!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Development

Successfully merging this pull request may close these issues.

5 participants