The ARMv8.1 spec adds support for new atomic instructions that are faster and produce more fair results with locks than the current Go compiler implementation on ARM64. Go should make use of these instructions when they are available. Below is a test case in which many threads attempt to acquire the same lock many times. After execution is complete, the program calculates the number of times each thread acquired the lock and the standard deviation of those values. On x86_64 and ARMv8 systems this standard deviation is low, indicating a more-or-less fair implementation of the mutex. On ARMv8.1 systems (I tested this on the AWS m6g instance type), the standard deviation is much higher, indicating that some threads acquire the lock much more often, resulting in starvation of some of the threads.
I have already posted a change request which fixes this problem: https://go-review.googlesource.com/c/go/+/234217
What version of Go are you using (go version)?
This is the base version of my change—the commit just before the change that I made.
$ go version
go version devel +2b70ffe930 Fri May 15 16:15:25 2020 +0000 linux/arm64
Does this issue reproduce with the latest release?
Yes.
What operating system and processor architecture are you using (go env)?
arch: arm64
os: Ubuntu 20.04 and Amazon Linux 2
go env
$ go env
GO111MODULE=""
GOARCH="arm64"
GOBIN=""
GOCACHE="/home/ubuntu/.cache/go-build"
GOENV="/home/ubuntu/.config/go/env"
GOEXE=""
GOFLAGS=""
GOHOSTARCH="arm64"
GOHOSTOS="linux"
GOINSECURE=""
GOMODCACHE="/home/ubuntu/go/pkg/mod"
GONOPROXY=""
GONOSUMDB=""
GOOS="linux"
GOPATH="/home/ubuntu/go"
GOPRIVATE=""
GOPROXY="https://proxy.golang.org,direct"
GOROOT="/home/ubuntu/go-compiler-test/go"
GOSUMDB="sum.golang.org"
GOTMPDIR=""
GOTOOLDIR="/home/ubuntu/go-compiler-test/go/pkg/tool/linux_arm64"
GCCGO="gccgo"
AR="ar"
CC="gcc"
CXX="g++"
CGO_ENABLED="1"
GOMOD=""
CGO_CFLAGS="-g -O2"
CGO_CPPFLAGS=""
CGO_CXXFLAGS="-g -O2"
CGO_FFLAGS="-g -O2"
CGO_LDFLAGS="-g -O2"
PKG_CONFIG="pkg-config"
GOGCCFLAGS="-fPIC -pthread -fmessage-length=0 -fdebug-prefix-map=/tmp/go-build223010461=/tmp/go-build -gno-record-gcc-switches"
What did you do?
Below is a test case in which many threads attempt to acquire the same lock many times. After execution is complete, the program calculates the number of times each thread acquired the lock and the standard deviation of those values.
https://play.golang.org/p/-rXAb4EP2Fj (The code here is modified not to require command line arguments.)
$ cat mutex-starving.go
package main
import (
"fmt"
"sync"
"time"
"flag"
"math"
)
type globals_s struct {
lock sync.Mutex
running bool
status int
stats []Stats
threads_waiter sync.WaitGroup
}
var globals globals_s
type Stats struct {
min_lock_time int64
max_lock_time int64
total_time int64
n_locks int64
n_flips int64
}
func update_stats(stats *Stats, elapsed_time int64) {
stats.n_locks += 1
if (elapsed_time < stats.min_lock_time) {
stats.min_lock_time = elapsed_time
}
if (elapsed_time > stats.max_lock_time) {
stats.max_lock_time = elapsed_time
}
stats.total_time += elapsed_time
}
func fun(check int, set int, stats *Stats) {
loop := true
for loop {
start := time.Now()
globals.lock.Lock()
if (globals.status == check) {
loop = false
stats.n_flips += 1
globals.status = set
}
globals.lock.Unlock()
stop := time.Now()
update_stats(stats, stop.Sub(start).Nanoseconds())
}
}
func thread_entry_point(stats *Stats) {
// run until canceled
for globals.running {
fun(1, 0, stats)
}
}
func print_stats() {
n_threads := len(globals.stats)
for i := 0; i < n_threads; i++ {
var average float64
stats := globals.stats[i]
average = float64(stats.total_time) / float64(stats.n_locks)
if i == n_threads-1 {
fmt.Printf("server: ")
} else {
fmt.Printf("thread %d: ", i)
}
fmt.Printf("min=%d, max=%d, average=%f, mutexes_locked=%d, flips=%d\n",
stats.min_lock_time, stats.max_lock_time, average, stats.n_locks, stats.n_flips)
}
}
func avg(n []int64) float64 {
sum := float64(0)
for i := 0; i < len(n); i++ {
sum += float64(n[i])
}
return sum / float64(len(n))
}
func stdev(n []int64) float64 {
average := avg(n)
sd := float64(0)
for i := 0; i < len(n); i++ {
sd += math.Pow(float64(n[i]) - average, 2)
}
return math.Sqrt(sd/float64(len(n)))
}
func max(n []int64) int64 {
m := n[0]
for i := 1; i < len(n); i ++ {
if m < n[i] {
m = n[i]
}
}
return m
}
func min(n []int64) int64 {
m := n[0]
for i := 1; i < len(n); i ++ {
if m > n[i] {
m = n[i]
}
}
return m
}
func calculate_stats(n []int64) string {
max_val := max(n)
min_val := min(n)
range_val := max_val - min_val
stdev_val := stdev(n)
return fmt.Sprintf("range: %7d, stdev: %11.3f, max: %7d, min: %7d",
range_val, stdev_val, max_val, min_val)
}
func print_stats_summary() {
n_threads := len(globals.stats)
flips := make([]int64, 0, n_threads)
mutexes_locked := make([]int64, 0, n_threads)
for i := 0; i < n_threads; i++ {
stats := globals.stats[i]
flips = append(flips, stats.n_flips)
mutexes_locked = append(mutexes_locked, stats.n_locks)
}
fmt.Printf(" flips: %s\n", calculate_stats(flips))
fmt.Printf("mutexes_locked: %s\n", calculate_stats(mutexes_locked))
}
func main() {
n_threads_arg := flag.Int("threads", 16, "number of worker threads")
n_iterations_arg := flag.Int("iterations", 10000, "number of iterations of the server thread")
summary := flag.Bool("summary", false, "print a summary of stats instead of raw data")
all_stats := flag.Bool("all-stats", false, "print all raw stats")
flag.Parse()
// add one to n_threads because we always have one extra for the "server" thread
n_threads := *n_threads_arg + 1
n_iterations := *n_iterations_arg
// initialize the stats slice
globals.stats = make([]Stats, n_threads, n_threads)
for i := 0; i < n_threads; i++ {
globals.stats[i].min_lock_time = 1000000
}
globals.running = true
// start threads
for i := 0; i < n_threads-1; i++ {
go thread_entry_point(&globals.stats[i])
}
globals.threads_waiter.Add(0)
// start the main thread (but re-use the "main" thread)
for i := 0; i < n_iterations; i++ {
fun(0, 1, &globals.stats[n_threads-1])
}
// stop
globals.running = false
globals.threads_waiter.Wait()
if *summary {
print_stats_summary()
if *all_stats {
print_stats()
}
} else {
print_stats()
}
}
What did you expect to see?
I expected to see similar results to x86_64 counterparts. Here is an execution on a c5.24xlarge instance (96 vCPUs). Pay attention to the standard deviation value for mutexes_locked.
$ ./mutex-starving-2b70ffe930 -summary -iterations 100000 -threads 96
flips: range: 99190, stdev: 9996.221, max: 100000, min: 810
mutexes_locked: range: 22617, stdev: 3989.320, max: 157237, min: 134620
And here is an a1.4xlarge (16 vCPUs, Graviton 1/ARMv8).
$ ./mutex-starving-2b70ffe930 -summary -iterations 100000 -threads 16
flips: range: 94551, stdev: 22063.755, max: 100000, min: 5449
mutexes_locked: range: 20451, stdev: 4360.286, max: 121619, min: 101168
What did you see instead?
I saw a very high standard deviation in the number of times each thread acquired the lock on m6g (Graviton 2, with ARMv8.2 atomic instruction support). Here is an execution from a recent development build of the compiler (2b70ffe) on a m6g.16xlarge (64 vCPUs):
$ time ./mutex-starving-2b70ffe930 -summary -iterations 100000 -threads 64
flips: range: 99852, stdev: 12227.687, max: 100000, min: 148
mutexes_locked: range: 669341, stdev: 125266.892, max: 1270028, min: 600687
real 0m29.545s
user 6m27.095s
sys 0m0.253s
Finally, with the change request:
$ time ./mutex-starving-patched -summary -iterations 100000 -threads 64
flips: range: 98708, stdev: 12116.108, max: 100000, min: 1292
mutexes_locked: range: 17521, stdev: 3209.880, max: 102460, min: 84939
real 0m0.950s
user 0m6.678s
sys 0m0.989s
The ARMv8.1 spec adds support for new atomic instructions that are faster and produce more fair results with locks than the current Go compiler implementation on ARM64. Go should make use of these instructions when they are available. Below is a test case in which many threads attempt to acquire the same lock many times. After execution is complete, the program calculates the number of times each thread acquired the lock and the standard deviation of those values. On x86_64 and ARMv8 systems this standard deviation is low, indicating a more-or-less fair implementation of the mutex. On ARMv8.1 systems (I tested this on the AWS m6g instance type), the standard deviation is much higher, indicating that some threads acquire the lock much more often, resulting in starvation of some of the threads.
I have already posted a change request which fixes this problem: https://go-review.googlesource.com/c/go/+/234217
What version of Go are you using (
go version)?This is the base version of my change—the commit just before the change that I made.
Does this issue reproduce with the latest release?
Yes.
What operating system and processor architecture are you using (
go env)?arch: arm64
os: Ubuntu 20.04 and Amazon Linux 2
go envWhat did you do?
Below is a test case in which many threads attempt to acquire the same lock many times. After execution is complete, the program calculates the number of times each thread acquired the lock and the standard deviation of those values.
https://play.golang.org/p/-rXAb4EP2Fj (The code here is modified not to require command line arguments.)
$ cat mutex-starving.goWhat did you expect to see?
I expected to see similar results to x86_64 counterparts. Here is an execution on a
c5.24xlargeinstance (96 vCPUs). Pay attention to the standard deviation value formutexes_locked.And here is an
a1.4xlarge(16 vCPUs, Graviton 1/ARMv8).What did you see instead?
I saw a very high standard deviation in the number of times each thread acquired the lock on
m6g(Graviton 2, with ARMv8.2 atomic instruction support). Here is an execution from a recent development build of the compiler (2b70ffe) on am6g.16xlarge(64 vCPUs):Finally, with the change request: