-
Notifications
You must be signed in to change notification settings - Fork 300
/
pair.cc
1245 lines (1078 loc) · 34.5 KB
/
pair.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/**
* Copyright (c) 2017-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#include "gloo/transport/tcp/pair.h"
#include <array>
#include <algorithm>
#include <sstream>
#include <errno.h>
#include <fcntl.h>
#include <netdb.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>
#include "gloo/common/error.h"
#include "gloo/common/logging.h"
#include "gloo/transport/tcp/buffer.h"
#include "gloo/transport/tcp/context.h"
#include "gloo/transport/tcp/unbound_buffer.h"
#define FD_INVALID (-1)
namespace gloo {
namespace transport {
namespace tcp {
namespace {
// Upper bounds (32 MiB each) for the socket buffer sizes used in this file.
// This reflects an approximation of /proc/sys/net/core/{r,w}mem_max.
// It is hard coded because making buffers larger than this would not
// have much impact. Also see socket(7).
// Pair::send clamps the SO_SNDBUF size it requests to kMaxSendBufferSize.
constexpr size_t kMaxSendBufferSize = 32 * 1024 * 1024;
constexpr size_t kMaxRecvBufferSize = 32 * 1024 * 1024;
} // namespace
// Constructs a pair and immediately starts listening for the peer.
//
// The context, device, rank and timeout arguments are stored in the
// corresponding members. The pair starts in the INITIALIZING state, in
// async mode (sync_ and busyPoll_ are false), without a socket
// (fd_ == FD_INVALID) and without a stored exception. The body then calls
// listen(), which creates the listening socket and reports failures
// through signalAndThrowException.
Pair::Pair(
Context* context,
Device* device,
int rank,
std::chrono::milliseconds timeout)
: context_(context),
device_(device),
rank_(rank),
state_(INITIALIZING),
sync_(false),
timeout_(timeout),
busyPoll_(false),
fd_(FD_INVALID),
sendBufferSize_(0),
is_client_(false),
ex_(nullptr) {
listen();
}
// The destructor does a "soft" close: it moves the pair to CLOSED without
// touching SO_LINGER (contrast with Pair::close).
Pair::~Pair() {
  // Hold the mutex so tearing down the file descriptor cannot race with
  // reads/writes happening on the device thread.
  std::lock_guard<std::mutex> guard(m_);
  if (state_ == CLOSED) {
    return;
  }
  Pair::changeState(CLOSED);
}
// "Hard" close of the pair.
// SO_LINGER is enabled with a zero linger time so that closing the socket
// resets the connection, which avoids sockets hanging around in TIME_WAIT.
// Does nothing if the pair is already CLOSED.
void Pair::close() {
  // Hold the mutex so this cannot race with reads/writes of the
  // underlying file descriptor on the device thread.
  std::lock_guard<std::mutex> guard(m_);
  if (state_ == CLOSED) {
    return;
  }

  if (fd_ != FD_INVALID) {
    struct linger resetOnClose;
    resetOnClose.l_onoff = 1;
    resetOnClose.l_linger = 0;
    // Best effort: the result of setsockopt is intentionally not checked.
    setsockopt(
        fd_, SOL_SOCKET, SO_LINGER, &resetOnClose, sizeof(resetOnClose));
  }

  changeState(CLOSED);
}
// Returns a reference to this pair's own address (self_).
// self_ is assigned in listen() and again in handleConnected().
// Note that this accessor does not take the pair's mutex.
const Address& Pair::address() const {
return self_;
}
void Pair::connect(const std::vector<char>& bytes) {
auto peer = Address(bytes);
connect(peer);
}
// Switches the file descriptor `fd` between blocking (enable == true) and
// non-blocking (enable == false) mode by clearing or setting O_NONBLOCK in
// its status flags. Both fcntl calls are enforced to succeed.
static void setSocketBlocking(int fd, bool enable) {
  // Fetch the current status flags.
  int rv = fcntl(fd, F_GETFL);
  GLOO_ENFORCE_NE(rv, -1);

  const int flags = enable ? (rv & ~O_NONBLOCK) : (rv | O_NONBLOCK);

  // Write the updated flags back.
  rv = fcntl(fd, F_SETFL, flags);
  GLOO_ENFORCE_NE(rv, -1);
}
// Switches the pair to synchronous mode; switching back to async mode is
// rejected. `busyPoll` is stored in busyPoll_ (read() uses it to decide
// whether to spin with MSG_DONTWAIT).
//
// Waits (without timeout) until the pair is connected. If the pair was in
// async mode, its descriptor is unregistered from the device, the socket is
// made blocking, and any operations still queued in tx_ are written out
// before the mode flag is flipped.
void Pair::setSync(bool sync, bool busyPoll) {
  std::unique_lock<std::mutex> lock(m_);

  if (!sync) {
    GLOO_THROW_INVALID_OPERATION_EXCEPTION("Can only switch to sync mode");
  }

  // No timeout needed here: if necessary, the connect path will time out
  // and signal this thread.
  waitUntilConnected(lock, false);
  if (state_ == CLOSED) {
    signalAndThrowException(
        GLOO_ERROR_MSG("Socket unexpectedly closed ", peer_.str()));
  }

  const bool wasAsync = !sync_;
  if (wasAsync) {
    // Leave the event loop and make the socket blocking.
    device_->unregisterDescriptor(fd_, this);
    setSocketBlocking(fd_, true);

    // Flush whatever the async path had not finished writing yet.
    for (auto it = tx_.begin(); it != tx_.end(); ++it) {
      if (write(*it)) {
        continue;
      }
      GLOO_ENFORCE(
          ex_ != nullptr,
          "write() returned false in sync mode; ex_ must be set");
      std::rethrow_exception(ex_);
    }
    tx_.clear();
  }

  sync_ = true;
  busyPoll_ = busyPoll;
}
void Pair::listen() {
std::lock_guard<std::mutex> lock(m_);
int rv;
const auto& attr = device_->attr_;
auto fd = socket(attr.ai_family, attr.ai_socktype, attr.ai_protocol);
if (fd == -1) {
signalAndThrowException(GLOO_ERROR_MSG("socket: ", strerror(errno)));
}
// Set SO_REUSEADDR to signal that reuse of the listening port is OK.
int on = 1;
rv = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
if (rv == -1) {
::close(fd);
signalAndThrowException(GLOO_ERROR_MSG("setsockopt: ", strerror(errno)));
}
rv = bind(fd, (const sockaddr*)&attr.ai_addr, attr.ai_addrlen);
if (rv == -1) {
::close(fd);
signalAndThrowException(GLOO_ERROR_MSG("bind: ", strerror(errno)));
}
// listen(2) on socket
fd_ = fd;
rv = ::listen(fd_, 1);
if (rv == -1) {
::close(fd_);
fd_ = FD_INVALID;
signalAndThrowException(GLOO_ERROR_MSG("listen: ", strerror(errno)));
}
// Keep copy of address
self_ = Address::fromSockName(fd);
// Register with device so we're called when peer connects
changeState(LISTENING);
device_->registerDescriptor(fd_, EPOLLIN, this);
return;
}
// Establishes the connection with the peer at address `peer`.
//
// Both sides compare their own address with the peer's address (raw address
// bytes first, then the port field). The side that compares greater becomes
// the client: it tears down its listening socket and actively connects.
// The other side keeps listening. Either way this call then waits, with the
// timeout enabled, until the pair is connected.
//
// Raises via GLOO_THROW_INVALID_OPERATION_EXCEPTION on an address family
// mismatch, an unsupported family, or when `peer` equals this pair's own
// address. Socket-level failures are reported through
// signalAndThrowException; errno is captured before ::close is called so the
// reported message reflects the failing call rather than the close.
void Pair::connect(const Address& peer) {
  std::unique_lock<std::mutex> lock(m_);
  int rv;
  socklen_t addrlen;

  throwIfException();

  peer_ = peer;

  const auto& selfAddr = self_.getSockaddr();
  const auto& peerAddr = peer_.getSockaddr();

  // Addresses have to have same family
  if (selfAddr.ss_family != peerAddr.ss_family) {
    GLOO_THROW_INVALID_OPERATION_EXCEPTION("address family mismatch");
  }

  if (selfAddr.ss_family == AF_INET) {
    const auto* sa = reinterpret_cast<const struct sockaddr_in*>(&selfAddr);
    const auto* sb = reinterpret_cast<const struct sockaddr_in*>(&peerAddr);
    addrlen = sizeof(struct sockaddr_in);
    rv = memcmp(&sa->sin_addr, &sb->sin_addr, sizeof(struct in_addr));
    if (rv == 0) {
      rv = sa->sin_port - sb->sin_port;
    }
  } else if (peerAddr.ss_family == AF_INET6) {
    const auto* sa = reinterpret_cast<const struct sockaddr_in6*>(&selfAddr);
    const auto* sb = reinterpret_cast<const struct sockaddr_in6*>(&peerAddr);
    addrlen = sizeof(struct sockaddr_in6);
    rv = memcmp(&sa->sin6_addr, &sb->sin6_addr, sizeof(struct in6_addr));
    if (rv == 0) {
      rv = sa->sin6_port - sb->sin6_port;
    }
  } else {
    GLOO_THROW_INVALID_OPERATION_EXCEPTION("unknown sa_family");
  }

  if (rv == 0) {
    GLOO_THROW_INVALID_OPERATION_EXCEPTION("cannot connect to self");
  }

  is_client_ = rv > 0;

  // self_ < peer_; we are listening side.
  if (!is_client_) {
    waitUntilConnected(lock, true);
    return;
  }

  // self_ > peer_; we are connecting side.
  // First destroy listening socket.
  device_->unregisterDescriptor(fd_, this);
  ::close(fd_);

  // Create new socket to connect to peer.
  fd_ = socket(peerAddr.ss_family, SOCK_STREAM | SOCK_NONBLOCK, 0);
  if (fd_ == -1) {
    signalAndThrowException(GLOO_ERROR_MSG("socket: ", strerror(errno)));
  }

  // Set SO_REUSEADDR to signal that reuse of the source port is OK.
  int on = 1;
  rv = setsockopt(fd_, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
  if (rv == -1) {
    const int err = errno;
    ::close(fd_);
    fd_ = FD_INVALID;
    signalAndThrowException(GLOO_ERROR_MSG("setsockopt: ", strerror(err)));
  }

  // Connect to peer. The socket is non-blocking, so EINPROGRESS is the
  // expected outcome and completion is observed by the event loop.
  rv = ::connect(
      fd_, reinterpret_cast<const struct sockaddr*>(&peerAddr), addrlen);
  if (rv == -1 && errno != EINPROGRESS) {
    const int err = errno;
    ::close(fd_);
    fd_ = FD_INVALID;
    signalAndThrowException(GLOO_ERROR_MSG("connect: ", strerror(err)));
  }

  // Register with device so we're called when connection completes.
  changeState(CONNECTING);
  device_->registerDescriptor(fd_, EPOLLIN | EPOLLOUT, this);

  // Wait for connection to complete
  waitUntilConnected(lock, true);
}
// Fills `iov` with the pieces of `op` that still have to be written and
// stores the number of populated entries in `ioc` (at most two: the
// remainder of the preamble and the payload).
//
// `buf` is only dereferenced for SEND_UNBOUND_BUFFER operations.
// Returns the total number of bytes described by the populated entries.
ssize_t Pair::prepareWrite(
    Op& op,
    const NonOwningPtr<UnboundBuffer>& buf,
    struct iovec* iov,
    int& ioc) {
  const size_t preambleSize = sizeof(op.preamble);
  ssize_t total = 0;
  ioc = 0;

  // Part of the preamble is still unsent: it goes first.
  if (op.nwritten < preambleSize) {
    struct iovec& head = iov[ioc++];
    head.iov_base = ((char*)&op.preamble) + op.nwritten;
    head.iov_len = preambleSize - op.nwritten;
    total += head.iov_len;
  }

  // Locate the payload, which depends on the kind of operation.
  char* base = nullptr;
  size_t offset = 0;
  size_t nbytes = 0;
  switch (op.getOpcode()) {
    case Op::SEND_BUFFER:
      // Send data to a remote buffer
      base = (char*)op.buf->ptr_;
      offset = op.preamble.offset;
      nbytes = op.preamble.length;
      break;
    case Op::SEND_UNBOUND_BUFFER:
      // Send data to a remote unbound buffer
      base = (char*)buf->ptr;
      offset = op.offset;
      nbytes = op.nbytes;
      break;
    default:
      // Operations without payload consist of the preamble only.
      return total;
  }

  // Skip the part of the payload that went out in an earlier call.
  if (op.nwritten > preambleSize) {
    const size_t alreadySent = op.nwritten - preambleSize;
    offset += alreadySent;
    nbytes -= alreadySent;
  }

  struct iovec& body = iov[ioc++];
  body.iov_base = base + offset;
  body.iov_len = nbytes;
  total += body.iov_len;
  return total;
}
// Attempts to write the remainder of `op` to the socket.
//
// write is called from:
// 1) the device thread (the handleEvents function, via handleReadWrite)
// 2) a user thread (the send function; setSync also calls it to flush tx_)
//
// In either case, the lock is held and the write function
// below inherits it.
//
// Returns true when the whole operation has been written and its completion
// handler has run (writeComplete). Returns false when:
// - the pair is CLOSED;
// - the unbound buffer of a SEND_UNBOUND_BUFFER op can't be acquired;
// - writev fails with EAGAIN (in sync mode an exception is signaled first);
// - writev fails with ECONNRESET or EPIPE in async mode (nothing signaled);
// - any other error except EINTR occurs (an exception is signaled first).
// Progress is tracked in op.nwritten, so a later call resumes where this
// one stopped.
bool Pair::write(Op& op) {
if (state_ == CLOSED) {
return false;
}
NonOwningPtr<UnboundBuffer> buf;
// At most two segments are written: the preamble and the payload.
std::array<struct iovec, 2> iov;
int ioc;
ssize_t rv;
const auto opcode = op.getOpcode();
// Acquire pointer to unbound buffer if applicable.
if (opcode == Op::SEND_UNBOUND_BUFFER) {
buf = NonOwningPtr<UnboundBuffer>(op.ubuf);
if (!buf) {
return false;
}
}
for (;;) {
// Recomputed on every iteration because op.nwritten advances.
const auto nbytes = prepareWrite(op, buf, iov.data(), ioc);
// Write
rv = writev(fd_, iov.data(), ioc);
if (rv == -1) {
if (errno == EAGAIN) {
if (sync_) {
// Sync mode: blocking call returning with EAGAIN indicates timeout.
signalException(GLOO_ERROR_MSG("Write timeout ", peer_.str()));
} else {
// Async mode: can't write more than this.
}
return false;
}
// In sync mode ECONNRESET and EPIPE fall through to the
// "unexpected error" handling below.
if (errno == ECONNRESET) {
if (!sync_) {
return false;
}
}
if (errno == EPIPE) {
if (!sync_) {
return false;
}
}
// Retry on EINTR
if (errno == EINTR) {
continue;
}
// Unexpected error
signalException(
GLOO_ERROR_MSG("writev ", peer_.str(), ": ", strerror(errno)));
return false;
}
// From write(2) man page (NOTES section):
//
// If a write() is interrupted by a signal handler before any
// bytes are written, then the call fails with the error EINTR;
// if it is interrupted after at least one byte has been written,
// the call succeeds, and returns the number of bytes written.
//
// If rv < nbytes we ALWAYS retry, regardless of sync/async mode,
// since an EINTR may or may not have happened. If this was not
// the case, and the kernel buffer is full, the next call to
// write(2) will return EAGAIN, which is handled appropriately.
op.nwritten += rv;
if (rv < nbytes) {
continue;
}
// Everything prepared for this iteration went out, so the op is complete.
GLOO_ENFORCE_EQ(rv, nbytes);
GLOO_ENFORCE_EQ(op.nwritten, op.preamble.nbytes);
break;
}
writeComplete(op, buf, opcode);
return true;
}
void Pair::writeComplete(const Op &op, NonOwningPtr<UnboundBuffer> &buf,
const Op::Opcode &opcode) const {
switch (opcode) {
case Op::SEND_BUFFER:
op.buf->handleSendCompletion();
break;
case Op::SEND_UNBOUND_BUFFER:
buf->handleSendCompletion(this->rank_);
break;
case Op::NOTIFY_SEND_READY:
break;
case Op::NOTIFY_RECV_READY:
break;
}
}
// Populates the iovec struct with the location and size of the next chunk
// to read for `op`. May populate the 'buf' or 'ubuf' member field
// in the op if the preamble indicates the operation is one of type SEND_BUFFER
// or SEND_UNBOUND_BUFFER. For unbound buffers, `buf` receives a short lived
// pointer to the target buffer.
//
// Returns the number of bytes the caller (the read function) should read
// next:
//   > 0  bytes remain (rest of the preamble, or rest of the payload);
//   == 0 nothing remains; the operation has been read completely;
//   < 0  the caller should stop trying to read from the socket for now,
//        because the buffer this message is intended for is not available
//        (a bound buffer that has not been registered yet, or an unbound
//        buffer whose pointer could not be acquired).
//
ssize_t Pair::prepareRead(
Op& op,
NonOwningPtr<UnboundBuffer>& buf,
struct iovec& iov) {
iov.iov_base = nullptr;
iov.iov_len = 0;
// Read preamble
if (op.nread < sizeof(op.preamble)) {
iov.iov_base = ((char*)&op.preamble) + op.nread;
iov.iov_len = sizeof(op.preamble) - op.nread;
return iov.iov_len;
}
auto opcode = op.getOpcode();
// Number of payload bytes received so far.
auto offset = op.nread - sizeof(op.preamble);
// Remote side is sending data to a buffer; read payload
if (opcode == Op::SEND_BUFFER) {
if (op.buf == nullptr) {
op.buf = getBuffer(op.preamble.slot);
// Buffer not (yet) registered, leave it for next loop iteration
if (op.buf == nullptr) {
return -1;
}
}
iov.iov_base = ((char*)op.buf->ptr_) + offset + op.preamble.roffset;
iov.iov_len = op.preamble.length - offset;
// Bytes read must be in bounds for target buffer
GLOO_ENFORCE_LE(op.preamble.roffset + op.preamble.length, op.buf->size_);
return iov.iov_len;
}
// Remote side is sending data to an unbound buffer; read payload
if (opcode == Op::SEND_UNBOUND_BUFFER) {
if (!op.ubuf) {
// First payload chunk for this op: take the oldest pending receive
// registered for this slot as the target.
auto it = localPendingRecv_.find(op.preamble.slot);
GLOO_ENFORCE(it != localPendingRecv_.end());
std::deque<UnboundBufferOp>& queue = it->second;
GLOO_ENFORCE(!queue.empty());
std::tie(op.ubuf, op.offset, op.nbytes) = queue.front();
queue.pop_front();
if (queue.empty()) {
localPendingRecv_.erase(it);
}
}
// Acquire short lived pointer to unbound buffer.
// This is a stack allocated variable in the read function
// which is destructed upon that function returning.
buf = NonOwningPtr<UnboundBuffer>(op.ubuf);
if (!buf) {
return -1;
}
iov.iov_base = ((char*)buf->ptr) + op.offset + offset;
iov.iov_len = op.preamble.length - offset;
// Bytes read must be in bounds for target buffer
GLOO_ENFORCE_LE(op.preamble.length, op.nbytes);
return iov.iov_len;
}
// Remaining opcodes carry no payload; the preamble is the whole message.
return 0;
}
// Reads from the socket until the current inbound operation (rx_) is
// complete, then dispatches it through readComplete.
//
// read is called from:
// 1) the device thread (the handleEvents function).
// 2) a user thread (the recv function) IFF the pair is in sync mode.
//
// In either case, the lock is held and the read function
// below inherits it.
//
// Returns true if a complete operation was read and handled, so the caller
// may try to read the next one. Returns false if the pair is CLOSED, the
// target buffer is not available yet (prepareRead returned a negative
// value), no more data can be read right now (EAGAIN), the peer closed the
// connection, or an error occurred. Timeouts, EOF and unexpected errors are
// reported through signalException before returning. Progress is kept in
// rx_.nread, so a later call resumes where this one stopped.
bool Pair::read() {
if (state_ == CLOSED) {
return false;
}
NonOwningPtr<UnboundBuffer> buf;
// Reference point for the busy-poll timeout check below.
auto start = std::chrono::steady_clock::now();
for (;;) {
struct iovec iov = {
.iov_base = nullptr,
.iov_len = 0,
};
const auto nbytes = prepareRead(rx_, buf, iov);
if (nbytes < 0) {
return false;
}
// Break from loop if the op is complete.
// Note that this means that the buffer pointer has been
// set, per the call to prepareRead.
if (nbytes == 0) {
break;
}
// If busy-poll has been requested AND sync mode has been enabled for pair
// we'll keep spinning calling recv() on socket by supplying MSG_DONTWAIT
// flag. This is more efficient in terms of latency than allowing the kernel
// to de-schedule this thread waiting for IO event to happen. The tradeoff
// is stealing the CPU core just for busy polling.
ssize_t rv = 0;
for (;;) {
// Alas, readv does not support flags, so we need to use recv
rv = ::recv(fd_, iov.iov_base, iov.iov_len, busyPoll_ ? MSG_DONTWAIT : 0);
if (rv == -1) {
// EAGAIN happens when (1) non-blocking and there are no more bytes left
// to read or (2) blocking and timeout occurs.
if (errno == EAGAIN) {
if (sync_) {
// Sync mode: EAGAIN indicates nothing to read right now.
auto hasTimedOut = [&] {
return (timeout_ != kNoTimeout) &&
((std::chrono::steady_clock::now() - start) >= timeout_);
};
if (busyPoll_ && !hasTimedOut()) {
// Keep looping on EAGAIN if busy-poll flag has been set and the
// timeout (if set) hasn't been reached
continue;
} else {
// Either timeout on poll or blocking call returning with EAGAIN
// indicates timeout
signalException(GLOO_ERROR_MSG("Read timeout ", peer_.str()));
}
} else {
// Async mode: can't read more than this.
}
return false;
}
// Retry on EINTR
if (errno == EINTR) {
continue;
}
// Unexpected error
signalException(
GLOO_ERROR_MSG("Read error ", peer_.str(), ": ", strerror(errno)));
return false;
}
// recv succeeded (rv >= 0); leave the retry loop.
break;
}
// Transition to CLOSED on EOF
if (rv == 0) {
signalException(
GLOO_ERROR_MSG("Connection closed by peer ", peer_.str()));
return false;
}
rx_.nread += rv;
}
readComplete(buf);
return true;
}
// Dispatches the fully received operation held in rx_ and then resets rx_
// so the next message starts from a clean slate.
// `buf` is only used for SEND_UNBOUND_BUFFER operations.
void Pair::readComplete(NonOwningPtr<UnboundBuffer>& buf) {
  const auto opcode = rx_.getOpcode();

  if (opcode == Op::SEND_BUFFER) {
    // Done sending data to pinned buffer; trigger completion.
    rx_.buf->handleRecvCompletion();
  } else if (opcode == Op::SEND_UNBOUND_BUFFER) {
    // Remote side is sending data to unbound buffer; trigger completion
    buf->handleRecvCompletion(rank_);
  } else if (opcode == Op::NOTIFY_SEND_READY) {
    // Remote side has pending send operation
    handleRemotePendingSend(rx_);
  } else if (opcode == Op::NOTIFY_RECV_READY) {
    // Remote side has pending recv operation
    handleRemotePendingRecv(rx_);
  }

  // Reset read operation state.
  rx_ = Op();
}
// Handles a notification from the peer that it has a send operation
// pending for the slot named in the preamble of `op`.
void Pair::handleRemotePendingSend(const Op& op) {
  const auto& slot = op.preamble.slot;

  // Acquire context lock through mutator.
  Context::Mutator mutator(*context_, slot, rank_);

  // A receive may already have been posted before this notification
  // arrived. In that case the notification was expected and there is
  // nothing more to do.
  const bool wasExpected = mutator.shiftExpectedSendNotification();
  if (wasExpected) {
    return;
  }

  {
    // Before adding to the context wide tally of pending operations, see
    // whether a recv-from-any operation can be fulfilled by this send.
    WeakNonOwningPtr<UnboundBuffer> buf;
    size_t offset;
    size_t nbytes;
    if (context_->findRecvFromAny(slot, rank_, &buf, &offset, &nbytes)) {
      auto& pendingForSlot = localPendingRecv_[slot];
      pendingForSlot.push_back(std::make_tuple(buf, offset, nbytes));
      sendNotifyRecvReady(slot, nbytes);
      return;
    }
  }

  // Increase balance of remote pending sends.
  mutator.pushRemotePendingSend();
}
// Handles a notification from the peer that it has a receive operation
// pending for the slot named in the preamble of `op`.
void Pair::handleRemotePendingRecv(const Op& op) {
  const auto& slot = op.preamble.slot;

  auto it = localPendingSend_.find(slot);
  if (it == localPendingSend_.end()) {
    // No local send is waiting for this slot.
    // Increase balance of remote pending recv.
    // Note that the current value CANNOT be negative, as sends
    // cannot execute until the remote side is ready to receive.
    Context::Mutator mutator(*context_, slot, rank_);
    mutator.pushRemotePendingRecv();
    return;
  }

  // A local send is pending for this slot: take the oldest one and run it.
  std::deque<UnboundBufferOp>& queue = it->second;
  GLOO_ENFORCE(!queue.empty());

  WeakNonOwningPtr<UnboundBuffer> buf;
  size_t offset;
  size_t nbytes;
  std::tie(buf, offset, nbytes) = queue.front();
  queue.pop_front();

  // Drop the map entry once its queue has drained.
  if (queue.empty()) {
    localPendingSend_.erase(it);
  }

  sendUnboundBuffer(std::move(buf), slot, offset, nbytes);
}
// Entry point for events reported for this pair's descriptor; dispatches
// on the current connection state.
void Pair::handleEvents(int events) {
  // The device thread (the thread that ends up calling handleEvents) needs
  // the pair's lock to mutate the tx and rx op fields. If the lock is taken,
  // some other thread is mutating this pair's state, which may in turn
  // require calling into (and locking) the underlying device, for example
  // when the pair transitions to the CLOSED state. To avoid deadlocks we
  // only *try* to lock and skip the events until the next tick on failure.
  std::unique_lock<std::mutex> lock(m_, std::try_to_lock);
  if (!lock.owns_lock()) {
    return;
  }

  // State must be <= CONNECTED.
  // If state is CLOSED; this function will NOT be called. Refer to
  // Pair::changeState and Device::unregisterDescriptor for more info.
  GLOO_ENFORCE_LE(state_, CONNECTED);

  // Exception must not be set.
  // If exception is set, state must advance to CLOSED state.
  GLOO_ENFORCE(ex_ == nullptr);

  switch (state_) {
    case CONNECTED:
      handleReadWrite(events);
      return;
    case LISTENING:
      handleListening();
      return;
    case CONNECTING:
      handleConnecting();
      return;
    default:
      break;
  }

  GLOO_ENFORCE(false, "Unexpected state: ", state_);
}
void Pair::handleReadWrite(int events) {
if (events & EPOLLOUT) {
GLOO_ENFORCE(
!tx_.empty(), "tx_ cannot be empty because EPOLLOUT happened");
while (!tx_.empty()) {
auto& op = tx_.front();
if (!write(op)) {
// Write did not complete; wait for epoll.
break;
}
// Write completed; remove from queue.
tx_.pop_front();
}
// If there is nothing to transmit; remove EPOLLOUT.
if (tx_.empty()) {
device_->registerDescriptor(fd_, EPOLLIN, this);
}
}
if (events & EPOLLIN) {
while (read()) {
// Keep going
}
}
}
// Called when the listening socket becomes readable: accepts the incoming
// connection and replaces the listening descriptor with the connected one.
//
// The listening descriptor is unregistered and closed regardless of whether
// accept(2) succeeded. On failure the error is reported via signalException.
// errno is captured right after accept(2), because the unregister and
// ::close calls that follow may overwrite it before the message is built.
void Pair::handleListening() {
  struct sockaddr_storage addr;
  socklen_t addrlen = sizeof(addr);

  const int rv = accept(fd_, (struct sockaddr*)&addr, &addrlen);
  // Only meaningful when rv == -1.
  const int acceptErrno = errno;

  // Close the listening file descriptor whether we've successfully connected
  // or run into an error and will throw an exception.
  device_->unregisterDescriptor(fd_, this);
  ::close(fd_);
  fd_ = FD_INVALID;

  if (rv == -1) {
    signalException(GLOO_ERROR_MSG("accept: ", strerror(acceptErrno)));
    return;
  }

  // Connected, replace file descriptor
  fd_ = rv;

  // Common connection-made code
  handleConnected();
}
// Called for events on a socket in the CONNECTING state. Reads SO_ERROR to
// find out whether the connect attempt succeeded; on success continues
// with the common connection-made code, otherwise signals an exception.
void Pair::handleConnecting() {
  int optval;
  socklen_t optlen = sizeof(optval);

  // Verify that connecting was successful
  const int rv = getsockopt(fd_, SOL_SOCKET, SO_ERROR, &optval, &optlen);
  GLOO_ENFORCE_NE(rv, -1);

  if (optval == 0) {
    // Common connection-made code
    handleConnected();
    return;
  }

  signalException(
      GLOO_ERROR_MSG("connect ", peer_.str(), ": ", strerror(optval)));
}
// Common setup once fd_ refers to an established connection (reached from
// both handleListening and handleConnecting): refreshes the addresses,
// configures the socket, registers it for reads and marks the pair
// CONNECTED.
void Pair::handleConnected() {
  // Reset addresses to what the connected socket reports.
  self_ = Address::fromSockName(fd_);
  peer_ = Address::fromPeerName(fd_);

  // Make sure socket is non-blocking
  setSocketBlocking(fd_, false);

  // Enable TCP_NODELAY.
  int flag = 1;
  int rv = setsockopt(fd_, IPPROTO_TCP, TCP_NODELAY, &flag, sizeof(flag));
  GLOO_ENFORCE_NE(rv, -1);

  // Apply the pair's timeout (milliseconds) to both receive and send.
  const auto millis = timeout_.count();
  struct timeval tv = {};
  tv.tv_sec = millis / 1000;
  tv.tv_usec = (millis % 1000) * 1000;

  rv = setsockopt(fd_, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
  GLOO_ENFORCE_NE(rv, -1);
  rv = setsockopt(fd_, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv));
  GLOO_ENFORCE_NE(rv, -1);

  device_->registerDescriptor(fd_, EPOLLIN, this);
  changeState(CONNECTED);
}
// Looks up the buffer registered for `slot`.
// getBuffer must only be called when holding lock.
//
// Returns nullptr when no buffer is registered for the slot. That happens
// when the remote peer already sent some bytes destined for the buffer at
// this slot, but this side of the pair hasn't registered it yet. The
// current strategy is to return and let the device loop repeatedly call us
// again until the buffer has been registered. This essentially means busy
// waiting while yielding to other pairs. This is not a problem as this only
// happens at initialization time.
Buffer* Pair::getBuffer(int slot) {
  const auto it = buffers_.find(slot);
  return it == buffers_.end() ? nullptr : it->second;
}
// Registers `buf` under its slot so inbound SEND_BUFFER messages can find
// it. A slot may hold at most one buffer; registering a second one is an
// enforced error. Waiters on cv_ are woken afterwards.
void Pair::registerBuffer(Buffer* buf) {
  std::lock_guard<std::mutex> guard(m_);

  GLOO_ENFORCE(
      buffers_.find(buf->slot_) == buffers_.end(),
      "duplicate buffer for slot ",
      buf->slot_);

  buffers_[buf->slot_] = buf;
  cv_.notify_all();
}
// Removes the registration for the slot of `buf`, under the pair's mutex.
// Erasing a slot that has no registered buffer is a no-op.
void Pair::unregisterBuffer(Buffer* buf) {
  std::lock_guard<std::mutex> guard(m_);
  buffers_.erase(buf->slot_);
}
// Moves the pair to `nextState` and wakes everybody waiting on cv_.
// changeState must only be called when holding lock.
//
// A transition to CLOSED also releases the socket. What has to be released
// depends on the state the pair is leaving.
void Pair::changeState(state nextState) noexcept {
  if (nextState == CLOSED) {
    switch (state_) {
      case INITIALIZING:
        // This state persists from construction up to the point where
        // Pair::listen sets fd_ and calls listen(2). If this fails,
        // it takes care of cleaning up the socket itself.
        // There is no additional cleanup needed here.
        break;

      case LISTENING:
      case CONNECTING:
        // The pair may be destructed while still listening or connecting.
        // The descriptor may already have been given up, so check first.
        if (fd_ != FD_INVALID) {
          device_->unregisterDescriptor(fd_, this);
          ::close(fd_);
          fd_ = FD_INVALID;
        }
        break;

      case CONNECTED:
        // The descriptor is only unregistered here in async mode
        // (setSync already unregisters it when switching to sync mode).
        if (!sync_) {
          device_->unregisterDescriptor(fd_, this);
        }
        ::close(fd_);
        fd_ = FD_INVALID;
        break;

      case CLOSED:
        // Nothing to release. The callers visible in this file (~Pair and
        // close) skip the call when the pair is already CLOSED; the case is
        // listed so that every enum value is handled.
        break;
    }
  }

  state_ = nextState;
  cv_.notify_all();
}
void Pair::waitUntilConnected(
std::unique_lock<std::mutex>& lock,
bool useTimeout) {
auto pred = [&] {
throwIfException();
return state_ >= CONNECTED;
};
waitUntil(pred, lock, useTimeout);
}
// Guard used before doing IO (send calls it): enforces that the pair has
// at least reached the CONNECTED state and throws if it has since closed.
void Pair::verifyConnected() {
  // This code path should only be called after reaching the connected state
  GLOO_ENFORCE_GE(
      state_,
      CONNECTED,
      "Pair is not connected (",
      self_.str(),
      " <--> ",
      peer_.str(),
      ")");

  if (state_ != CLOSED) {
    return;
  }

  // The socket has been closed. We were unable to tell if this was an error
  // or normal tear down, but now throw since we are trying to do IO.
  signalAndThrowException(GLOO_ERROR_MSG("Socket closed ", peer_.str()));
}
// Synchronous-mode send: writes `op` to the remote side of the pair.
// The pair's mutex is held when this function is called. May block.
// If the write does not complete, the exception stored in ex_ is rethrown.
void Pair::sendSyncMode(Op& op) {
  GLOO_ENFORCE(sync_);

  if (write(op)) {
    return;
  }

  // A failed write must have recorded the reason.
  GLOO_ENFORCE(ex_ != nullptr);
  std::rethrow_exception(ex_);
}
// Asynchronous-mode send: writes `op` to the remote side of the pair, or
// queues it for the event loop when it cannot be written right away.
// The pair's mutex is held when this function is called. Never blocks.
void Pair::sendAsyncMode(Op& op) {
  GLOO_ENFORCE(!sync_);

  const bool queueWasEmpty = tx_.empty();
  if (queueWasEmpty) {
    // Fast path: write in place without checking socket for writeability.
    if (write(op)) {
      return;
    }

    // Write may have resulted in an error.
    throwIfException();
  }

  // Either an earlier operation hasn't finished transmitting, or this write
  // didn't complete: queue the operation behind whatever is pending.
  tx_.push_back(std::move(op));

  // Only a queue that just became non-empty needs EPOLLOUT switched on.
  if (queueWasEmpty) {
    device_->registerDescriptor(fd_, EPOLLIN | EPOLLOUT, this);
  }
}
void Pair::send(Op& op) {
std::unique_lock<std::mutex> lock(m_);
throwIfException();
verifyConnected();
// Try to size the send buffer such that the write below completes
// synchronously and we don't need to finish the write later.
size_t size = std::min(op.preamble.nbytes, kMaxSendBufferSize);
if (sendBufferSize_ < size) {
int rv;
size_t optval = size;
socklen_t optlen = sizeof(optval);
rv = setsockopt(fd_, SOL_SOCKET, SO_SNDBUF, &optval, optlen);
GLOO_ENFORCE_NE(rv, -1);
rv = getsockopt(fd_, SOL_SOCKET, SO_SNDBUF, &optval, &optlen);
GLOO_ENFORCE_NE(rv, -1);
sendBufferSize_ = optval;
}