Skip to content

Commit 997c00f

Browse files
dvyukov authored and rsc committed
runtime: replace Semacquire/Semrelease implementation
1. The implementation uses a distributed hash table of waitlists instead of a centralized one. It significantly improves scalability for uncontended semaphores.
2. The implementation provides a wait-free fast path for signalers.
3. The implementation uses fewer locks (1 lock/unlock instead of 5 for Semacquire).
4. The runtime·ready() call is moved out of the critical section.
5. Semacquire() does not call semwakeup().

Benchmark results on HP Z600 (2 x Xeon E5620, 8 HT cores, 2.40GHz) are as follows:

benchmark                                 old ns/op new ns/op     delta
runtime_test.BenchmarkSemaUncontended         58.20     36.30   -37.63%
runtime_test.BenchmarkSemaUncontended-2      199.00     18.30   -90.80%
runtime_test.BenchmarkSemaUncontended-4      327.00      9.20   -97.19%
runtime_test.BenchmarkSemaUncontended-8      491.00      5.32   -98.92%
runtime_test.BenchmarkSemaUncontended-16     946.00      4.18   -99.56%
runtime_test.BenchmarkSemaSyntNonblock        59.00     36.80   -37.63%
runtime_test.BenchmarkSemaSyntNonblock-2     167.00    138.00   -17.37%
runtime_test.BenchmarkSemaSyntNonblock-4     333.00    129.00   -61.26%
runtime_test.BenchmarkSemaSyntNonblock-8     464.00    130.00   -71.98%
runtime_test.BenchmarkSemaSyntNonblock-16   1015.00    136.00   -86.60%
runtime_test.BenchmarkSemaSyntBlock           58.80     36.70   -37.59%
runtime_test.BenchmarkSemaSyntBlock-2        294.00    149.00   -49.32%
runtime_test.BenchmarkSemaSyntBlock-4        333.00    177.00   -46.85%
runtime_test.BenchmarkSemaSyntBlock-8        471.00    221.00   -53.08%
runtime_test.BenchmarkSemaSyntBlock-16       990.00    227.00   -77.07%
runtime_test.BenchmarkSemaWorkNonblock       829.00    832.00    +0.36%
runtime_test.BenchmarkSemaWorkNonblock-2     425.00    419.00    -1.41%
runtime_test.BenchmarkSemaWorkNonblock-4     308.00    220.00   -28.57%
runtime_test.BenchmarkSemaWorkNonblock-8     394.00    147.00   -62.69%
runtime_test.BenchmarkSemaWorkNonblock-16   1510.00    149.00   -90.13%
runtime_test.BenchmarkSemaWorkBlock          828.00    813.00    -1.81%
runtime_test.BenchmarkSemaWorkBlock-2        428.00    436.00    +1.87%
runtime_test.BenchmarkSemaWorkBlock-4        232.00    219.00    -5.60%
runtime_test.BenchmarkSemaWorkBlock-8        392.00    251.00   -35.97%
runtime_test.BenchmarkSemaWorkBlock-16      1524.00    298.00   -80.45%
sync_test.BenchmarkMutexUncontended           24.10     24.00    -0.41%
sync_test.BenchmarkMutexUncontended-2         12.00     12.00    +0.00%
sync_test.BenchmarkMutexUncontended-4          6.25      6.17    -1.28%
sync_test.BenchmarkMutexUncontended-8          3.43      3.34    -2.62%
sync_test.BenchmarkMutexUncontended-16         2.34      2.32    -0.85%
sync_test.BenchmarkMutex                      24.70     24.70    +0.00%
sync_test.BenchmarkMutex-2                   208.00     99.50   -52.16%
sync_test.BenchmarkMutex-4                  2744.00    256.00   -90.67%
sync_test.BenchmarkMutex-8                  5137.00    556.00   -89.18%
sync_test.BenchmarkMutex-16                 5368.00   1284.00   -76.08%
sync_test.BenchmarkMutexSlack                 24.70     25.00    +1.21%
sync_test.BenchmarkMutexSlack-2             1094.00    186.00   -83.00%
sync_test.BenchmarkMutexSlack-4             3430.00    402.00   -88.28%
sync_test.BenchmarkMutexSlack-8             5051.00   1066.00   -78.90%
sync_test.BenchmarkMutexSlack-16            6806.00   1363.00   -79.97%
sync_test.BenchmarkMutexWork                 793.00    792.00    -0.13%
sync_test.BenchmarkMutexWork-2               398.00    398.00    +0.00%
sync_test.BenchmarkMutexWork-4              1441.00    308.00   -78.63%
sync_test.BenchmarkMutexWork-8              8532.00    847.00   -90.07%
sync_test.BenchmarkMutexWork-16             8225.00   2760.00   -66.44%
sync_test.BenchmarkMutexWorkSlack            793.00    793.00    +0.00%
sync_test.BenchmarkMutexWorkSlack-2          418.00    414.00    -0.96%
sync_test.BenchmarkMutexWorkSlack-4         4481.00    480.00   -89.29%
sync_test.BenchmarkMutexWorkSlack-8         6317.00   1598.00   -74.70%
sync_test.BenchmarkMutexWorkSlack-16        9111.00   3038.00   -66.66%

R=rsc
CC=golang-dev
https://golang.org/cl/4631059
1 parent 39acba5 commit 997c00f

File tree

6 files changed

+131
-96
lines changed

6 files changed

+131
-96
lines changed

src/pkg/runtime/386/atomic.c

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,12 @@
1+
// Copyright 2009 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
#include "runtime.h"
6+
7+
#pragma textflag 7
8+
uint32
9+
runtime·atomicload(uint32 volatile* addr)
10+
{
11+
return *addr;
12+
}

src/pkg/runtime/Makefile

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,7 @@ OFILES_arm=\
4747

4848
OFILES=\
4949
asm.$O\
50+
atomic.$O\
5051
cgocall.$O\
5152
chan.$O\
5253
closure.$O\

src/pkg/runtime/amd64/atomic.c

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,12 @@
1+
// Copyright 2009 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
#include "runtime.h"
6+
7+
#pragma textflag 7
8+
uint32
9+
runtime·atomicload(uint32 volatile* addr)
10+
{
11+
return *addr;
12+
}

src/pkg/runtime/arm/atomic.c

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,12 @@
1+
// Copyright 2009 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
#include "runtime.h"
6+
7+
#pragma textflag 7
8+
uint32
9+
runtime·atomicload(uint32 volatile* addr)
10+
{
11+
return runtime·xadd(addr, 0);
12+
}

src/pkg/runtime/runtime.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -416,7 +416,10 @@ int32 runtime·write(int32, void*, int32);
416416
int32 runtime·mincore(void*, uintptr, byte*);
417417
bool runtime·cas(uint32*, uint32, uint32);
418418
bool runtime·casp(void**, void*, void*);
419+
// Don't confuse with XADD x86 instruction,
420+
// this one is actually 'addx', that is, add-and-fetch.
419421
uint32 runtime·xadd(uint32 volatile*, int32);
422+
uint32 runtime·atomicload(uint32 volatile*);
420423
void runtime·jmpdefer(byte*, void*);
421424
void runtime·exit1(int32);
422425
void runtime·ready(G*);

src/pkg/runtime/sema.goc

Lines changed: 91 additions & 96 deletions
Original file line number | Diff line number | Diff line change
@@ -23,157 +23,152 @@ package runtime
2323
typedef struct Sema Sema;
2424
struct Sema
2525
{
26-
uint32 *addr;
26+
uint32 volatile *addr;
2727
G *g;
2828
Sema *prev;
2929
Sema *next;
3030
};
3131

32-
// TODO: For now, a linked list; maybe a hash table of linked lists later.
33-
static Sema *semfirst, *semlast;
34-
static Lock semlock;
32+
typedef struct SemaRoot SemaRoot;
33+
struct SemaRoot
34+
{
35+
Lock;
36+
Sema *head;
37+
Sema *tail;
38+
// Number of waiters. Read w/o the lock.
39+
uint32 volatile nwait;
40+
};
41+
42+
// Prime to not correlate with any user patterns.
43+
#define SEMTABLESZ 251
44+
45+
static union
46+
{
47+
SemaRoot;
48+
// Modern processors tend to have 64-byte cache lines,
49+
// potentially with 128-byte effective cache line size for reading.
50+
// While there are hypothetical architectures
51+
// with 16-4096 byte cache lines, 128 looks like a good compromise.
52+
uint8 pad[128];
53+
} semtable[SEMTABLESZ];
54+
55+
static SemaRoot*
56+
semroot(uint32 *addr)
57+
{
58+
return &semtable[((uintptr)addr >> 3) % SEMTABLESZ];
59+
}
3560

3661
static void
37-
semqueue(uint32 *addr, Sema *s)
62+
semqueue(SemaRoot *root, uint32 volatile *addr, Sema *s)
3863
{
64+
s->g = g;
3965
s->addr = addr;
40-
s->g = nil;
41-
42-
runtime·lock(&semlock);
43-
s->prev = semlast;
4466
s->next = nil;
45-
if(semlast)
46-
semlast->next = s;
67+
s->prev = root->tail;
68+
if(root->tail)
69+
root->tail->next = s;
4770
else
48-
semfirst = s;
49-
semlast = s;
50-
runtime·unlock(&semlock);
71+
root->head = s;
72+
root->tail = s;
5173
}
5274

5375
static void
54-
semdequeue(Sema *s)
76+
semdequeue(SemaRoot *root, Sema *s)
5577
{
56-
runtime·lock(&semlock);
5778
if(s->next)
5879
s->next->prev = s->prev;
5980
else
60-
semlast = s->prev;
81+
root->tail = s->prev;
6182
if(s->prev)
6283
s->prev->next = s->next;
6384
else
64-
semfirst = s->next;
85+
root->head = s->next;
6586
s->prev = nil;
6687
s->next = nil;
67-
runtime·unlock(&semlock);
68-
}
69-
70-
static void
71-
semwakeup(uint32 *addr)
72-
{
73-
Sema *s;
74-
75-
runtime·lock(&semlock);
76-
for(s=semfirst; s; s=s->next) {
77-
if(s->addr == addr && s->g) {
78-
runtime·ready(s->g);
79-
s->g = nil;
80-
break;
81-
}
82-
}
83-
runtime·unlock(&semlock);
84-
}
85-
86-
// Step 1 of sleep: make ourselves available for wakeup.
87-
// TODO(rsc): Maybe we can write a version without
88-
// locks by using cas on s->g. Maybe not: I need to
89-
// think more about whether it would be correct.
90-
static void
91-
semsleep1(Sema *s)
92-
{
93-
runtime·lock(&semlock);
94-
s->g = g;
95-
runtime·unlock(&semlock);
96-
}
97-
98-
// Decided not to go through with it: undo step 1.
99-
static void
100-
semsleepundo1(Sema *s)
101-
{
102-
runtime·lock(&semlock);
103-
if(s->g != nil) {
104-
s->g = nil; // back ourselves out
105-
} else {
106-
// If s->g == nil already, semwakeup
107-
// already readied us. Since we never stopped
108-
// running, readying us just set g->readyonstop.
109-
// Clear it.
110-
if(g->readyonstop == 0)
111-
*(int32*)0x555 = 555;
112-
g->readyonstop = 0;
113-
}
114-
runtime·unlock(&semlock);
115-
}
116-
117-
// Step 2: wait for the wakeup.
118-
static void
119-
semsleep2(Sema *s)
120-
{
121-
USED(s);
122-
g->status = Gwaiting;
123-
runtime·gosched();
12488
}
12589

12690
static int32
12791
cansemacquire(uint32 *addr)
12892
{
12993
uint32 v;
13094

131-
while((v = *addr) > 0)
95+
while((v = runtime·atomicload(addr)) > 0)
13296
if(runtime·cas(addr, v, v-1))
13397
return 1;
13498
return 0;
13599
}
136100

137-
// For now has no return value.
138-
// Might return an ok (not interrupted) bool in the future?
139101
void
140-
runtime·semacquire(uint32 *addr)
102+
runtime·semacquire(uint32 volatile *addr)
141103
{
142104
Sema s;
105+
SemaRoot *root;
143106

144107
// Easy case.
145108
if(cansemacquire(addr))
146109
return;
147110

148111
// Harder case:
149-
// queue
150-
// try semacquire one more time, sleep if failed
151-
// dequeue
152-
// wake up one more guy to avoid races (TODO(rsc): maybe unnecessary?)
153-
semqueue(addr, &s);
112+
// increment waiter count
113+
// try cansemacquire one more time, return if succeeded
114+
// enqueue itself as a waiter
115+
// sleep
116+
// (waiter descriptor is dequeued by signaler)
117+
root = semroot(addr);
154118
for(;;) {
155-
semsleep1(&s);
119+
runtime·lock(root);
120+
// Add ourselves to nwait to disable "easy case" in semrelease.
121+
runtime·xadd(&root->nwait, 1);
122+
// Check cansemacquire to avoid missed wakeup.
156123
if(cansemacquire(addr)) {
157-
semsleepundo1(&s);
158-
break;
124+
runtime·xadd(&root->nwait, -1);
125+
runtime·unlock(root);
126+
return;
159127
}
160-
semsleep2(&s);
128+
// Any semrelease after the cansemacquire knows we're waiting
129+
// (we set nwait above), so go to sleep.
130+
semqueue(root, addr, &s);
131+
g->status = Gwaiting;
132+
runtime·unlock(root);
133+
runtime·gosched();
134+
if(cansemacquire(addr))
135+
return;
161136
}
162-
semdequeue(&s);
163-
semwakeup(addr);
164137
}
165138

166139
void
167-
runtime·semrelease(uint32 *addr)
140+
runtime·semrelease(uint32 volatile *addr)
168141
{
169-
uint32 v;
142+
Sema *s;
143+
SemaRoot *root;
170144

171-
for(;;) {
172-
v = *addr;
173-
if(runtime·cas(addr, v, v+1))
145+
root = semroot(addr);
146+
runtime·xadd(addr, 1);
147+
148+
// Easy case: no waiters?
149+
// This check must happen after the xadd, to avoid a missed wakeup
150+
// (see loop in semacquire).
151+
if(runtime·atomicload(&root->nwait) == 0)
152+
return;
153+
154+
// Harder case: search for a waiter and wake it.
155+
runtime·lock(root);
156+
if(runtime·atomicload(&root->nwait) == 0) {
157+
// The count is already consumed by another goroutine,
158+
// so no need to wake up another goroutine.
159+
runtime·unlock(root);
160+
return;
161+
}
162+
for(s = root->head; s; s = s->next) {
163+
if(s->addr == addr) {
164+
runtime·xadd(&root->nwait, -1);
165+
semdequeue(root, s);
174166
break;
167+
}
175168
}
176-
semwakeup(addr);
169+
runtime·unlock(root);
170+
if(s)
171+
runtime·ready(s->g);
177172
}
178173

179174
func Semacquire(addr *uint32) {

0 commit comments

Comments
 (0)