Skip to content

Commit

Permalink
Rename isIntelBridgeFamily to isIntelERMSGoodCPU.
Browse files Browse the repository at this point in the history
Make sure ERMS REP MOVSB is used only for aligned, non-overlapping copies.
Benchstat result:
---
goos: linux
goarch: amd64
pkg: runtime
cpu: Intel(R) Xeon(R) Gold 6348 CPU @ 2.60GHz
               │  ./old.txt  │              ./new.txt              │
               │   sec/op    │   sec/op     vs base                │
Memmove/2048-2   25.24n ± 0%   24.27n ± 0%   -3.84% (p=0.000 n=10)
Memmove/4096-2   44.87n ± 0%   33.16n ± 1%  -26.11% (p=0.000 n=10)
geomean          33.65n        28.37n       -15.71%

               │  ./old.txt   │               ./new.txt               │
               │     B/s      │      B/s       vs base                │
Memmove/2048-2   75.56Gi ± 0%    78.59Gi ± 0%   +4.02% (p=0.000 n=10)
Memmove/4096-2   85.01Gi ± 0%   115.05Gi ± 1%  +35.34% (p=0.000 n=10)
geomean          80.14Gi         95.09Gi       +18.65%

Signed-off-by: TangYang <yang.tang@intel.com>
  • Loading branch information
cocotyty committed May 13, 2024
1 parent f3f5b0f commit 7f7e861
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 24 deletions.
11 changes: 7 additions & 4 deletions src/runtime/cpuflags_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,23 @@ import (
"internal/cpu"
)

// NOTE(review): this span is a rendered diff without +/- markers. The
// single-line declaration below appears to be the pre-commit form and the
// parenthesized group its replacement — confirm against the commit.
var useAVXmemmove bool
// Flags assigned in init and tested by the memmove assembly.
var (
// useAVXmemmove is assigned in init; the memmove assembly tests it before
// branching to avxUnaligned.
useAVXmemmove bool
// useERMS is assigned in init; the memmove assembly tests it before
// branching to the erms (REP MOVSB) path.
useERMS bool
)

// init computes the package-level memmove selection flags from the CPU model
// (processorVersionInfo) and the feature bits in cpu.X86.
//
// NOTE(review): this span is a rendered diff without +/- markers. The
// isIntelBridgeFamily declaration and the assignment that references it appear
// to be the pre-commit lines; the isIntelERMSGoodCPU declaration, the useERMS
// assignment and the plain cpu.X86.HasAVX assignment appear to be their
// replacements — confirm against the commit.
func init() {
// Let's remove stepping and reserved fields
processor := processorVersionInfo & 0x0FFF3FF0

// NOTE(review): in Go, && binds tighter than ||, so isIntel guards only the
// first comparison (0x206A0); any later model comparison can make the whole
// expression true regardless of isIntel. Parenthesize the || chain if the
// vendor check is meant to cover every listed model.
isIntelBridgeFamily := isIntel &&
isIntelERMSGoodCPU := isIntel &&
processor == 0x206A0 || // Sandy Bridge (Client)
processor == 0x206D0 || // Sandy Bridge (Server)
processor == 0x306A0 || // Ivy Bridge (Client)
processor == 0x306E0 || // Ivy Bridge (Server)
processor == 0x606A0 || // Ice Lake (Server) SP
processor == 0x606C0 || // Ice Lake (Server) DE
processor == 0x806F0 // Sapphire Rapids

useAVXmemmove = cpu.X86.HasAVX && !isIntelBridgeFamily
// REP MOVSB is enabled only when the model check above passes and the CPU
// reports the ERMS feature bit.
useERMS = isIntelERMSGoodCPU && cpu.X86.HasERMS
useAVXmemmove = cpu.X86.HasAVX
}
41 changes: 21 additions & 20 deletions src/runtime/memmove_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -72,45 +72,43 @@ tail:
CMPQ BX, $256
JBE move_129through256

TESTB $1, runtime·useAVXmemmove(SB)
JNZ avxUnaligned

/*
* check and set for backwards
*/
CMPQ SI, DI
JLS back

/*
* forward copy loop
*/
* forward copy loop
*/
forward:
CMPQ BX, $2048
JLS move_256through2048
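// ERMS is slow if destination address is unaligned.
// NOTE(review): TESTQ sets ZF when DI is 16-byte aligned, so the JZ below
// sends aligned destinations to check_avx and only unaligned ones fall
// through to the ERMS path — confirm whether JNZ was intended.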
TESTQ $15, DI
JZ check_avx

TESTB $1, runtime·useERMS(SB)
JNZ erms

// If REP MOVSB isn't fast, don't use it
CMPB internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
JNE fwdBy8
check_avx:
TESTB $1, runtime·useAVXmemmove(SB)
JNZ avxUnaligned

// Check alignment
MOVL SI, AX
ORL DI, AX
TESTL $7, AX
JEQ fwdBy8

// Do 1 byte at a time
MOVQ BX, CX
REP; MOVSB
RET
CMPQ BX, $2048
JLS move_256through2048

fwdBy8:
// Do 8 bytes at a time
MOVQ BX, CX
SHRQ $3, CX
ANDQ $7, BX
REP; MOVSQ
JMP tail

erms:
MOVQ BX, CX
REP; MOVSB
RET

back:
/*
* check overlap
Expand All @@ -119,6 +117,9 @@ back:
ADDQ BX, CX
CMPQ CX, DI
JLS forward

TESTB $1, runtime·useAVXmemmove(SB)
JNZ avxUnaligned
/*
* whole thing backwards has
* adjusted addresses
Expand Down

0 comments on commit 7f7e861

Please sign in to comment.