Skip to content

Commit aba7f3c

Browse files
authored
Merge pull request #12183 from ronawho/fence-unordered-ops-task-end
Automatically fence unordered operations at task termination [reviewed by @gbtitus and @mppf] Unordered operations must be fenced before operations are guaranteed to be completed. Previously, the only way to do this was with explicit fences. This adds implicit/automatic fences to task termination to simplify common cases. For bale-histogram, previously we did something like: ```chpl forall r in rindex do A[r].addBuff(1); flushAtomicBuff(); ``` and now we can simply write: ```chpl forall r in rindex do A[r].addBuff(1); ``` To support that, this PR adds a new comm layer routine `chpl_comm_task_end`, which is a hook that's called on task completion. For ugni we use this to flush any operations that were buffered as part of an unordered op, and it's currently a no-op for other comm layers. A more elegant solution might have been to have some sort of callback interface where module code could register things to be done on task completion (similar to what we have in the runtime) but I wasn't quite sure how to do that cheaply and cleanly. Since we're now calling these fencing operations on every task join, we need them to be as fast as possible, especially in the no-op case. To support that, this adds very fast per-task fences. For tasking layers where tasks can't migrate between threads, this is a non-atomic conditional in the no-op case so there shouldn't be any noticeable overhead. If there are pending operations, we only have to lock the current thread so there shouldn't be any contention. Closes #12065 Closes #12066
2 parents 15ab995 + c4f26fc commit aba7f3c

File tree

18 files changed

+99
-80
lines changed

18 files changed

+99
-80
lines changed

modules/internal/ChapelBase.chpl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1007,6 +1007,8 @@ module ChapelBase {
10071007
pragma "down end count fn"
10081008
proc _downEndCount(e: _EndCount, err: unmanaged Error) {
10091009
chpl_save_task_error(e, err);
1010+
extern proc chpl_comm_task_end(): void;
1011+
chpl_comm_task_end();
10101012
// inform anybody waiting that we're done
10111013
e.i.sub(1, memory_order_release);
10121014
}

runtime/include/chpl-cache.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,9 @@ extern const int CHPL_CACHE_REMOTE;
3434
static inline
3535
int chpl_cache_enabled(void)
3636
{
37-
return CHPL_CACHE_REMOTE && chpl_task_supportsRemoteCache();
37+
// The remote cache uses thread local storage, so if tasks can migrate
38+
// between threads we lose out ability to correctly fence.
39+
return CHPL_CACHE_REMOTE && !chpl_task_canMigrateThreads();
3840
}
3941

4042

runtime/include/chpl-comm-native-atomics.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,5 +163,6 @@ DECL_CHPL_COMM_ATOMIC_BINARY(sub, real32)
163163
DECL_CHPL_COMM_ATOMIC_BINARY(sub, real64)
164164

165165
void chpl_comm_atomic_unordered_fence(void);
166+
void chpl_comm_atomic_unordered_task_fence(void);
166167

167168
#endif // _chpl_comm_native_atomics_h_

runtime/include/chpl-comm.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -492,6 +492,10 @@ int chpl_comm_numPollingTasks(void);
492492
// an atomic variable for other tasks to finish).
493493
void chpl_comm_make_progress(void);
494494

495+
// This is a hook that's called when a task is ending. It allows for things
496+
// like say flushing task private buffers.
497+
void chpl_comm_task_end(void);
498+
495499
//
496500
// Comm diagnostics stuff
497501
//

runtime/include/chpl-tasks.h

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -283,20 +283,6 @@ chpl_task_ChapelData_t* chpl_task_getChapelData(void)
283283
}
284284

285285

286-
//
287-
// Can this tasking layer support remote caching?
288-
//
289-
// (In practice this answers: "Are tasks bound to specific pthreads
290-
// or, if not, does the tasking layer make memory consistency calls
291-
// whenever it might move a task from one pthread to another?" Remote
292-
// caching uses pthread-specific data (TLS) extensively, so it turns
293-
// itself off when it's used with a tasking layer that can't support
294-
// that.)
295-
//
296-
#ifndef CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL
297-
int chpl_task_supportsRemoteCache(void);
298-
#endif
299-
300286
//
301287
// Returns the maximum width of parallelism the tasking layer expects
302288
// to be able to provide on the calling (sub)locale. With some
@@ -366,6 +352,22 @@ uint32_t chpl_task_isFixedThread(void) {
366352
return CHPL_TASK_IMPL_IS_FIXED_THREAD();
367353
}
368354

355+
//
356+
// If the tasking layer will always execute tasks on the same thread
357+
// they started on, this returns true. Otherwise, it returns false.
358+
// For example CHPL_TASKS=fifo a task is a thread, so tasks can't
359+
// migrate. For CHPL_TASKS=qthreads some schedulers support
360+
// work-stealing where tasks can be stolen and moved to a diferent
361+
// then they started on.
362+
//
363+
#ifndef CHPL_TASK_IMPL_CAN_MIGRATE_THREADS
364+
#define CHPL_TASK_IMPL_CAN_MIGRATE_THREADS() 1
365+
#endif
366+
static inline
367+
uint32_t chpl_task_canMigrateThreads(void) {
368+
return CHPL_TASK_IMPL_CAN_MIGRATE_THREADS();
369+
}
370+
369371
//
370372
// returns the total number of threads that currently exist, whether running,
371373
// blocked, or idle

runtime/include/comm/ugni/chpl-comm-impl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ void chpl_comm_get_unordered(void *addr, c_nodeid_t node, void* raddr,
6666
int ln, int32_t fn);
6767

6868
void chpl_comm_get_unordered_fence(void);
69+
void chpl_comm_get_unordered_task_fence(void);
6970

7071
//
7172
// Internal statistics gathering and reporting.

runtime/include/tasks/fifo/chpl-tasks-impl.h

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -126,16 +126,7 @@ c_sublocid_t chpl_task_getRequestedSubloc(void) {
126126
return c_sublocid_any;
127127
}
128128

129-
130-
#ifdef CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL
131-
#error "CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL is already defined!"
132-
#else
133-
#define CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL 1
134-
#endif
135-
static inline
136-
int chpl_task_supportsRemoteCache(void) {
137-
return 1;
138-
}
129+
#define CHPL_TASK_IMPL_CAN_MIGRATE_THREADS() 0
139130

140131
#ifdef __cplusplus
141132
} // end extern "C"

runtime/include/tasks/qthreads/chpl-tasks-impl.h

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -236,22 +236,9 @@ void chpl_task_setSubloc(c_sublocid_t full_subloc)
236236
chpl_task_impl_getFixedNumThreads()
237237
uint32_t chpl_task_impl_getFixedNumThreads(void);
238238

239-
240239
#define CHPL_TASK_IMPL_IS_FIXED_THREAD() (qthread_shep() != NO_SHEPHERD)
241240

242-
243-
//
244-
// Can we support remote caching?
245-
//
246-
#ifdef CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL
247-
#error "CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL is already defined!"
248-
#else
249-
#define CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL 1
250-
#endif
251-
static inline
252-
int chpl_task_supportsRemoteCache(void) {
253-
return CHPL_QTHREAD_SUPPORTS_REMOTE_CACHE;
254-
}
241+
#define CHPL_TASK_IMPL_CAN_MIGRATE_THREADS() CHPL_QTHREAD_TASKS_CAN_MIGRATE_THREADS
255242

256243
#ifdef __cplusplus
257244
} // end extern "C"

runtime/src/comm/gasnet/comm-gasnet.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1586,6 +1586,7 @@ void chpl_comm_make_progress(void)
15861586
gasnet_AMPoll();
15871587
}
15881588

1589+
void chpl_comm_task_end(void) { }
15891590

15901591
void chpl_startVerboseComm() {
15911592
chpl_verbose_comm = 1;

runtime/src/comm/none/comm-none.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -219,9 +219,9 @@ void chpl_comm_execute_on_fast(c_nodeid_t node, c_sublocid_t subloc,
219219

220220
int chpl_comm_numPollingTasks(void) { return 0; }
221221

222-
void chpl_comm_make_progress(void)
223-
{
224-
}
222+
void chpl_comm_make_progress(void) { }
223+
224+
void chpl_comm_task_end(void) { }
225225

226226
void chpl_startVerboseComm() { }
227227
void chpl_stopVerboseComm() { }

0 commit comments

Comments
 (0)