From 4c576a44013782f5743c65a63e91204ab843ca8d Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Wed, 6 Oct 2021 13:59:38 +0000 Subject: [PATCH 01/22] Skeleton for point to point messaging setup --- .../faabric/transport/MessageEndpointClient.h | 1 - .../faabric/transport/PointToPointClient.h | 21 +++++++ .../faabric/transport/PointToPointRegistry.h | 19 +++++++ .../faabric/transport/PointToPointServer.h | 31 ++++++++++ include/faabric/transport/common.h | 4 ++ src/proto/faabric.proto | 20 +++++++ src/transport/PointToPointClient.cpp | 43 ++++++++++++++ src/transport/PointToPointServer.cpp | 57 +++++++++++++++++++ 8 files changed, 195 insertions(+), 1 deletion(-) create mode 100644 include/faabric/transport/PointToPointClient.h create mode 100644 include/faabric/transport/PointToPointRegistry.h create mode 100644 include/faabric/transport/PointToPointServer.h create mode 100644 src/transport/PointToPointClient.cpp create mode 100644 src/transport/PointToPointServer.cpp diff --git a/include/faabric/transport/MessageEndpointClient.h b/include/faabric/transport/MessageEndpointClient.h index 02e945925..7875dc37e 100644 --- a/include/faabric/transport/MessageEndpointClient.h +++ b/include/faabric/transport/MessageEndpointClient.h @@ -30,7 +30,6 @@ class MessageEndpointClient protected: const std::string host; - private: const int asyncPort; const int syncPort; diff --git a/include/faabric/transport/PointToPointClient.h b/include/faabric/transport/PointToPointClient.h new file mode 100644 index 000000000..238a407a9 --- /dev/null +++ b/include/faabric/transport/PointToPointClient.h @@ -0,0 +1,21 @@ +#pragma once + +#include + +namespace faabric::transport { + +class PointToPointClient : public faabric::transport::MessageEndpointClient +{ + public: + PointToPointClient(const std::string& hostIn); + + void broadcastPointToPointMappings(int appId, + std::map idxToHosts); + + void send(int appId, + int sendIdx, + int recvIdx, + const uint8_t* data, + size_t dataSize); +}; +} diff --git a/include/faabric/transport/PointToPointRegistry.h b/include/faabric/transport/PointToPointRegistry.h new file mode 100644 index 000000000..602853047 --- /dev/null +++ b/include/faabric/transport/PointToPointRegistry.h @@ -0,0 +1,19 @@ +#pragma once + +#include +#include + +namespace faabric::transport { +class PointToPointRegistry +{ + public: + PointToPointRegistry(); + + std::string getHostForReceiver(int appId, int recvIdx); + + void setHostForReceiver(int appId, int recvIdx, const std::string& host); + + private: + std::mutex registryMutex; +}; +} diff --git a/include/faabric/transport/PointToPointServer.h b/include/faabric/transport/PointToPointServer.h new file mode 100644 index 000000000..4196ecd8a --- /dev/null +++ b/include/faabric/transport/PointToPointServer.h @@ -0,0 +1,31 @@ +#pragma once + +#include + +namespace faabric::transport { +class PointToPointServer final : public MessageEndpointServer +{ + public: + PointToPointServer(); + + std::vector recvMessage(int appId, int sendIdx, int recvIdx); + + private: + void doAsyncRecv(int header, + const uint8_t* buffer, + size_t bufferSize) override; + + std::unique_ptr + doSyncRecv(int header, const uint8_t* buffer, size_t bufferSize) override; + + std::string getInprocLabel(int appId, int sendIdx, int recvIdx); + + std::shared_ptr getSendEndpoint(int appId, + int sendIdx, + int recvIdx); + + std::shared_ptr getRecvEndpoint(int appId, + int sendIdx, + int recvIdx); +}; +} diff --git a/include/faabric/transport/common.h b/include/faabric/transport/common.h index fc6e48547..6722d3b28 100644 --- a/include/faabric/transport/common.h +++ b/include/faabric/transport/common.h @@ -10,3 +10,7 @@ #define FUNCTION_CALL_SYNC_PORT 8006 #define SNAPSHOT_ASYNC_PORT 8007 #define SNAPSHOT_SYNC_PORT 8008 + + +#define POINT_TO_POINT_ASYNC_PORT 8009 +#define POINT_TO_POINT_SYNC_PORT 8010 diff --git a/src/proto/faabric.proto b/src/proto/faabric.proto index 07c7ef476..8d1f650c6 100644 --- a/src/proto/faabric.proto +++ b/src/proto/faabric.proto @@ -200,3 +200,23 @@ message StateAppendedResponse { string key = 2; repeated AppendedValue values = 3; } + +// --------------------------------------------- +// POINT-TO-POINT +// --------------------------------------------- + +message PointToPointMessage { + int32 appId = 1; + int32 sendIdx = 2; + int32 recvIdx = 3; + + bytes data = 4; +} + +message PointToPointMappings { + message PointToPointMapping { + int32 appId = 1; + int32 recvIdx = 2; + } + repeated PointToPointMapping mappings = 1; +} diff --git a/src/transport/PointToPointClient.cpp b/src/transport/PointToPointClient.cpp new file mode 100644 index 000000000..b2089ee54 --- /dev/null +++ b/src/transport/PointToPointClient.cpp @@ -0,0 +1,43 @@ +#include +#include +#include +#include + +namespace faabric::transport { + +PointToPointClient::PointToPointClient(const std::string& hostIn) + : faabric::transport::MessageEndpointClient(hostIn, + POINT_TO_POINT_ASYNC_PORT, + POINT_TO_POINT_SYNC_PORT) +{} + +void PointToPointClient::broadcastPointToPointMappings( + int appId, + std::map idxToHosts) +{ + message::PointToPointMappings mappings; + for (auto m : idxToHosts) { + auto appendedValue = mappings->add_mappings(); + appendedValue->set_appid(m.first); + appendedValue->set_recvidx(m.second); + } + + // TODO get hosts + // TODO send mappings to all of them +} + +void PointToPointClient::send(int appId, + int sendIdx, + int recvIdx, + const uint8_t* data, + size_t dataSize) +{ + faabric::PointToPointMessage msg; + msg.set_appid(appId); + msg.set_sendidx(sendIdx); + msg.set_recvidx(recvIdx); + msg.set_data(data, dataSize); + + asyncSend(1, &msg); +} +} diff --git a/src/transport/PointToPointServer.cpp b/src/transport/PointToPointServer.cpp new file mode 100644 index 000000000..4057cfeb3 --- /dev/null +++ b/src/transport/PointToPointServer.cpp @@ -0,0 +1,57 @@ +#include +#include +#include +#include +#include +#include + +namespace faabric::transport { +PointToPointServer::PointToPointServer() + : faabric::transport::MessageEndpointServer(POINT_TO_POINT_ASYNC_PORT, + POINT_TO_POINT_SYNC_PORT) +{} + +void PointToPointServer::doAsyncRecv(int header, + const uint8_t* buffer, + size_t bufferSize) +{ + PARSE_MSG(faabric::PointToPointMessage, buffer, bufferSize) + + std::shared_ptr endpoint = + getSendEndpoint(msg.appid(), msg.sendidx(), msg.recvidx()); + + // TODO - is this copying the data? Would be nice to avoid if poss + endpoint->send(BYTES_CONST(msg.data().c_str()), msg.data().size()); +} + +std::unique_ptr PointToPointServer::doSyncRecv( + int header, + const uint8_t* buffer, + size_t bufferSize) +{ + std::string errMsg = "Point-to-point sync messaging not supported"; + SPDLOG_ERROR(errMsg); + throw std::runtime_error(errMsg); +} + +std::vector PointToPointServer::recvMessage(int appId, + int sendIdx, + int recvIdx) +{ + std::shared_ptr endpoint = + getRecvEndpoint(appId, sendIdx, recvIdx); + + std::optional messageDataMaybe = endpoint->recv().value(); + Message& messageData = messageDataMaybe.value(); + + // TODO - possible to avoid this copy too? + return messageData.dataCopy(); +} + +std::string PointToPointServer::getInprocLabel(int appId, + int sendIdx, + int recvIdx) +{ + return fmt::format("{}-{}-{}", appId, sendIdx, recvIdx); +} +} From ec1db9703456070665c98b9123a20db2721970e5 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Wed, 6 Oct 2021 15:28:22 +0000 Subject: [PATCH 02/22] More groundwork for point to point --- include/faabric/runner/FaabricMain.h | 4 ++ include/faabric/transport/MessageEndpoint.h | 22 +++++- include/faabric/transport/PointToPointCall.h | 10 +++ .../faabric/transport/PointToPointClient.h | 10 +-- .../faabric/transport/PointToPointRegistry.h | 14 +++- .../faabric/transport/PointToPointServer.h | 10 ++- include/faabric/util/config.h | 1 + src/proto/faabric.proto | 1 + src/runner/FaabricMain.cpp | 9 +++ src/transport/CMakeLists.txt | 6 ++ src/transport/MessageEndpoint.cpp | 38 +++++++++- src/transport/PointToPointClient.cpp | 31 ++------ src/transport/PointToPointRegistry.cpp | 70 +++++++++++++++++++ src/transport/PointToPointServer.cpp | 39 +++++++++-- src/util/config.cpp | 2 + tests/test/util/test_config.cpp | 4 ++ 16 files changed, 221 insertions(+), 50 deletions(-) create mode 100644 include/faabric/transport/PointToPointCall.h create mode 100644 src/transport/PointToPointRegistry.cpp diff --git a/include/faabric/runner/FaabricMain.h b/include/faabric/runner/FaabricMain.h index 7f49f7727..6a5fa6f72 100644 --- a/include/faabric/runner/FaabricMain.h +++ b/include/faabric/runner/FaabricMain.h @@ -5,6 +5,7 @@ #include #include #include +#include #include namespace faabric::runner { @@ -23,11 +24,14 @@ class FaabricMain void startSnapshotServer(); + void startPointToPointServer(); + void shutdown(); private: faabric::state::StateServer stateServer; faabric::scheduler::FunctionCallServer functionServer; faabric::snapshot::SnapshotServer snapshotServer; + faabric::transport::PointToPointServer pointToPointServer; }; } diff --git a/include/faabric/transport/MessageEndpoint.h b/include/faabric/transport/MessageEndpoint.h index 2e955df5a..b8ca3c212 100644 --- a/include/faabric/transport/MessageEndpoint.h +++ b/include/faabric/transport/MessageEndpoint.h @@ -80,7 +80,18 @@ class AsyncSendMessageEndpoint final : public MessageEndpoint void send(const uint8_t* data, size_t dataSize, bool more = false); - zmq::socket_t pushSocket; + zmq::socket_t socket; +}; + +class AsyncInternalSendMessageEndpoint final : public MessageEndpoint +{ + public: + AsyncInternalSendMessageEndpoint(const std::string& inProcLabel, + int timeoutMs = DEFAULT_RECV_TIMEOUT_MS); + + void send(const uint8_t* data, size_t dataSize, bool more = false); + + zmq::socket_t socket; }; class SyncSendMessageEndpoint final : public MessageEndpoint @@ -183,6 +194,15 @@ class AsyncRecvMessageEndpoint final : public RecvMessageEndpoint std::optional recv(int size = 0) override; }; +class AsyncInternalRecvMessageEndpoint final : public RecvMessageEndpoint +{ + public: + AsyncInternalRecvMessageEndpoint(const std::string& inprocLabel, + int timeoutMs = DEFAULT_RECV_TIMEOUT_MS); + + std::optional recv(int size = 0) override; +}; + class SyncRecvMessageEndpoint final : public RecvMessageEndpoint { public: diff --git a/include/faabric/transport/PointToPointCall.h b/include/faabric/transport/PointToPointCall.h new file mode 100644 index 000000000..a636b0b39 --- /dev/null +++ b/include/faabric/transport/PointToPointCall.h @@ -0,0 +1,10 @@ +#pragma once + +namespace faabric::transport { + +enum PointToPointCall +{ + MAPPING = 0, + MESSAGE = 1 +}; +} diff --git a/include/faabric/transport/PointToPointClient.h b/include/faabric/transport/PointToPointClient.h index 238a407a9..6c3b53046 100644 --- a/include/faabric/transport/PointToPointClient.h +++ b/include/faabric/transport/PointToPointClient.h @@ -1,5 +1,6 @@ #pragma once +#include "faabric/proto/faabric.pb.h" #include namespace faabric::transport { @@ -9,13 +10,8 @@ class PointToPointClient : public faabric::transport::MessageEndpointClient public: PointToPointClient(const std::string& hostIn); - void broadcastPointToPointMappings(int appId, - std::map idxToHosts); + void sendMappings(faabric::PointToPointMappings &mappings); - void send(int appId, - int sendIdx, - int recvIdx, - const uint8_t* data, - size_t dataSize); + void sendMessage(faabric::PointToPointMessage &msg); }; } diff --git a/include/faabric/transport/PointToPointRegistry.h b/include/faabric/transport/PointToPointRegistry.h index 602853047..c64ad0558 100644 --- a/include/faabric/transport/PointToPointRegistry.h +++ b/include/faabric/transport/PointToPointRegistry.h @@ -1,7 +1,9 @@ #pragma once -#include +#include #include +#include +#include namespace faabric::transport { class PointToPointRegistry @@ -13,7 +15,15 @@ class PointToPointRegistry void setHostForReceiver(int appId, int recvIdx, const std::string& host); + void broadcastMappings(int appId, std::vector indexes); + private: - std::mutex registryMutex; + std::shared_mutex registryMutex; + + std::unordered_map mappings; + + std::string getKey(int appId, int recvIdx); }; + +PointToPointRegistry& getPointToPointRegistry(); } diff --git a/include/faabric/transport/PointToPointServer.h b/include/faabric/transport/PointToPointServer.h index 4196ecd8a..376a7fe81 100644 --- a/include/faabric/transport/PointToPointServer.h +++ b/include/faabric/transport/PointToPointServer.h @@ -20,12 +20,10 @@ class PointToPointServer final : public MessageEndpointServer std::string getInprocLabel(int appId, int sendIdx, int recvIdx); - std::shared_ptr getSendEndpoint(int appId, - int sendIdx, - int recvIdx); + std::unique_ptr + getSendEndpoint(int appId, int sendIdx, int recvIdx); - std::shared_ptr getRecvEndpoint(int appId, - int sendIdx, - int recvIdx); + std::unique_ptr + getRecvEndpoint(int appId, int sendIdx, int recvIdx); }; } diff --git a/include/faabric/util/config.h b/include/faabric/util/config.h index f18bd3f1f..b424ebcf4 100644 --- a/include/faabric/util/config.h +++ b/include/faabric/util/config.h @@ -47,6 +47,7 @@ class SystemConfig int functionServerThreads; int stateServerThreads; int snapshotServerThreads; + int pointToPointServerThreads; SystemConfig(); diff --git a/src/proto/faabric.proto b/src/proto/faabric.proto index 8d1f650c6..999570e61 100644 --- a/src/proto/faabric.proto +++ b/src/proto/faabric.proto @@ -217,6 +217,7 @@ message PointToPointMappings { message PointToPointMapping { int32 appId = 1; int32 recvIdx = 2; + string host = 3; } repeated PointToPointMapping mappings = 1; } diff --git a/src/runner/FaabricMain.cpp b/src/runner/FaabricMain.cpp index 0b61ddf81..286f4c863 100644 --- a/src/runner/FaabricMain.cpp +++ b/src/runner/FaabricMain.cpp @@ -40,6 +40,9 @@ void FaabricMain::startBackground() // Snapshots startSnapshotServer(); + // Point-to-point messaging + startPointToPointServer(); + // Work sharing startFunctionCallServer(); } @@ -71,6 +74,12 @@ void FaabricMain::startSnapshotServer() snapshotServer.start(); } +void FaabricMain::startPointToPointServer() +{ + SPDLOG_INFO("Starting point-to-point server"); + pointToPointServer.start(); +} + void FaabricMain::startStateServer() { // Skip state server if not in inmemory mode diff --git a/src/transport/CMakeLists.txt b/src/transport/CMakeLists.txt index b49d705f9..5929b30d1 100644 --- a/src/transport/CMakeLists.txt +++ b/src/transport/CMakeLists.txt @@ -11,6 +11,9 @@ set(HEADERS "${FAABRIC_INCLUDE_DIR}/faabric/transport/MessageEndpointClient.h" "${FAABRIC_INCLUDE_DIR}/faabric/transport/MessageEndpointServer.h" "${FAABRIC_INCLUDE_DIR}/faabric/transport/MpiMessageEndpoint.h" + "${FAABRIC_INCLUDE_DIR}/faabric/transport/PointToPointClient.h" + "${FAABRIC_INCLUDE_DIR}/faabric/transport/PointToPointServer.h" + "${FAABRIC_INCLUDE_DIR}/faabric/transport/PointToPointRegistry.h" ) set(LIB_FILES @@ -20,6 +23,9 @@ set(LIB_FILES MessageEndpointClient.cpp MessageEndpointServer.cpp MpiMessageEndpoint.cpp + PointToPointClient.cpp + PointToPointServer.cpp + PointToPointRegistry.cpp ${HEADERS} ) diff --git a/src/transport/MessageEndpoint.cpp b/src/transport/MessageEndpoint.cpp index 8fb0b1c22..6eec1c457 100644 --- a/src/transport/MessageEndpoint.cpp +++ b/src/transport/MessageEndpoint.cpp @@ -313,14 +313,14 @@ AsyncSendMessageEndpoint::AsyncSendMessageEndpoint(const std::string& hostIn, int timeoutMs) : MessageEndpoint(hostIn, portIn, timeoutMs) { - pushSocket = + socket = setUpSocket(zmq::socket_type::push, MessageEndpointConnectType::CONNECT); } void AsyncSendMessageEndpoint::sendHeader(int header) { uint8_t headerBytes = static_cast(header); - doSend(pushSocket, &headerBytes, sizeof(headerBytes), true); + doSend(socket, &headerBytes, sizeof(headerBytes), true); } void AsyncSendMessageEndpoint::send(const uint8_t* data, @@ -328,7 +328,24 @@ void AsyncSendMessageEndpoint::send(const uint8_t* data, bool more) { SPDLOG_TRACE("PUSH {} ({} bytes, more {})", address, dataSize, more); - doSend(pushSocket, data, dataSize, more); + doSend(socket, data, dataSize, more); +} + +AsyncInternalSendMessageEndpoint::AsyncInternalSendMessageEndpoint( + const std::string& inprocLabel, + int timeoutMs) + : MessageEndpoint("inproc://" + inprocLabel, timeoutMs) +{ + socket = + setUpSocket(zmq::socket_type::push, MessageEndpointConnectType::CONNECT); +} + +void AsyncInternalSendMessageEndpoint::send(const uint8_t* data, + size_t dataSize, + bool more) +{ + SPDLOG_TRACE("PUSH {} ({} bytes, more {})", address, dataSize, more); + doSend(socket, data, dataSize, more); } // ---------------------------------------------- @@ -495,6 +512,21 @@ std::optional AsyncRecvMessageEndpoint::recv(int size) return RecvMessageEndpoint::recv(size); } +AsyncInternalRecvMessageEndpoint::AsyncInternalRecvMessageEndpoint( + const std::string& inprocLabel, + int timeoutMs) + : RecvMessageEndpoint(inprocLabel, + timeoutMs, + zmq::socket_type::pull, + MessageEndpointConnectType::BIND) +{} + +std::optional AsyncInternalRecvMessageEndpoint::recv(int size) +{ + SPDLOG_TRACE("PULL {} ({} bytes)", address, size); + return RecvMessageEndpoint::recv(size); +} + // ---------------------------------------------- // SYNC RECV ENDPOINT // ---------------------------------------------- diff --git a/src/transport/PointToPointClient.cpp b/src/transport/PointToPointClient.cpp index b2089ee54..fec354e51 100644 --- a/src/transport/PointToPointClient.cpp +++ b/src/transport/PointToPointClient.cpp @@ -1,4 +1,6 @@ #include +#include +#include #include #include #include @@ -11,33 +13,14 @@ PointToPointClient::PointToPointClient(const std::string& hostIn) POINT_TO_POINT_SYNC_PORT) {} -void PointToPointClient::broadcastPointToPointMappings( - int appId, - std::map idxToHosts) +void PointToPointClient::sendMappings(faabric::PointToPointMappings& mappings) { - message::PointToPointMappings mappings; - for (auto m : idxToHosts) { - auto appendedValue = mappings->add_mappings(); - appendedValue->set_appid(m.first); - appendedValue->set_recvidx(m.second); - } - - // TODO get hosts - // TODO send mappings to all of them + faabric::EmptyResponse resp; + syncSend(PointToPointCall::MAPPING, &mappings, &resp); } -void PointToPointClient::send(int appId, - int sendIdx, - int recvIdx, - const uint8_t* data, - size_t dataSize) +void PointToPointClient::sendMessage(faabric::PointToPointMessage& msg) { - faabric::PointToPointMessage msg; - msg.set_appid(appId); - msg.set_sendidx(sendIdx); - msg.set_recvidx(recvIdx); - msg.set_data(data, dataSize); - - asyncSend(1, &msg); + asyncSend(PointToPointCall::MESSAGE, &msg); } } diff --git a/src/transport/PointToPointRegistry.cpp b/src/transport/PointToPointRegistry.cpp new file mode 100644 index 000000000..4d744b733 --- /dev/null +++ b/src/transport/PointToPointRegistry.cpp @@ -0,0 +1,70 @@ +#include "faabric/transport/PointToPointClient.h" +#include +#include +#include +#include +#include + +namespace faabric::transport { + +PointToPointRegistry::PointToPointRegistry() {} + +std::string PointToPointRegistry::getHostForReceiver(int appId, int recvIdx) +{ + faabric::util::SharedLock lock(registryMutex); + + std::string key = getKey(appId, recvIdx); + + if (mappings.find(key) == mappings.end()) { + SPDLOG_ERROR("No point-to-point mapping for {}/{}", appId, recvIdx); + throw std::runtime_error("No point-to-point mapping found"); + } + + return key; +} + +void PointToPointRegistry::setHostForReceiver(int appId, + int recvIdx, + const std::string& host) +{ + faabric::util::FullLock lock(registryMutex); + + std::string key = getKey(appId, recvIdx); + + mappings[key] = host; +} + +void PointToPointRegistry::broadcastMappings(int appId, + std::vector indexes) +{ + faabric::util::SharedLock lock(registryMutex); + + faabric::PointToPointMappings msg; + + for (auto i : indexes) { + std::string key = getKey(appId, i); + std::string host = mappings[key]; + + auto* mapping = msg.add_mappings(); + mapping->set_appid(appId); + mapping->set_recvidx(i); + mapping->set_host(host); + } + + auto& sch = faabric::scheduler::getScheduler(); + + // TODO - can we just do the set of registered hosts here somehow? + std::set hosts = sch.getAvailableHosts(); + + for (const auto& host : hosts) { + PointToPointClient cli(host); + cli.sendMappings(msg); + } +} + +PointToPointRegistry& getPointToPointRegistry() +{ + static PointToPointRegistry reg; + return reg; +} +} diff --git a/src/transport/PointToPointServer.cpp b/src/transport/PointToPointServer.cpp index 4057cfeb3..3075f8e61 100644 --- a/src/transport/PointToPointServer.cpp +++ b/src/transport/PointToPointServer.cpp @@ -1,14 +1,19 @@ +#include "faabric/transport/PointToPointRegistry.h" #include #include #include #include +#include #include #include namespace faabric::transport { PointToPointServer::PointToPointServer() - : faabric::transport::MessageEndpointServer(POINT_TO_POINT_ASYNC_PORT, - POINT_TO_POINT_SYNC_PORT) + : faabric::transport::MessageEndpointServer( + POINT_TO_POINT_ASYNC_PORT, + POINT_TO_POINT_SYNC_PORT, + POINT_TO_POINT_INPROC_LABEL, + faabric::util::getSystemConfig().pointToPointServerThreads) {} void PointToPointServer::doAsyncRecv(int header, @@ -17,7 +22,7 @@ void PointToPointServer::doAsyncRecv(int header, { PARSE_MSG(faabric::PointToPointMessage, buffer, bufferSize) - std::shared_ptr endpoint = + std::unique_ptr endpoint = getSendEndpoint(msg.appid(), msg.sendidx(), msg.recvidx()); // TODO - is this copying the data? Would be nice to avoid if poss @@ -29,16 +34,22 @@ std::unique_ptr PointToPointServer::doSyncRecv( const uint8_t* buffer, size_t bufferSize) { - std::string errMsg = "Point-to-point sync messaging not supported"; - SPDLOG_ERROR(errMsg); - throw std::runtime_error(errMsg); + PARSE_MSG(faabric::PointToPointMappings, buffer, bufferSize) + + PointToPointRegistry& reg = getPointToPointRegistry(); + + for (const auto& m : msg.mappings()) { + reg.setHostForReceiver(m.appid(), m.recvidx(), m.host()); + } + + return std::make_unique(); } std::vector PointToPointServer::recvMessage(int appId, int sendIdx, int recvIdx) { - std::shared_ptr endpoint = + std::unique_ptr endpoint = getRecvEndpoint(appId, sendIdx, recvIdx); std::optional messageDataMaybe = endpoint->recv().value(); @@ -54,4 +65,18 @@ std::string PointToPointServer::getInprocLabel(int appId, { return fmt::format("{}-{}-{}", appId, sendIdx, recvIdx); } + +std::unique_ptr +PointToPointServer::getSendEndpoint(int appId, int sendIdx, int recvIdx) +{ + std::string label = getInprocLabel(appId, sendIdx, recvIdx); + return std::make_unique(label); +} + +std::unique_ptr +PointToPointServer::getRecvEndpoint(int appId, int sendIdx, int recvIdx) +{ + std::string label = getInprocLabel(appId, sendIdx, recvIdx); + return std::make_unique(label); +} } diff --git a/src/util/config.cpp b/src/util/config.cpp index 76a4fb273..633291de3 100644 --- a/src/util/config.cpp +++ b/src/util/config.cpp @@ -65,6 +65,8 @@ void SystemConfig::initialise() this->getSystemConfIntParam("STATE_SERVER_THREADS", "2"); snapshotServerThreads = this->getSystemConfIntParam("SNAPSHOT_SERVER_THREADS", "2"); + pointToPointServerThreads = + this->getSystemConfIntParam("POINT_TO_POINT_SERVER_THREADS", "2"); } int SystemConfig::getSystemConfIntParam(const char* name, diff --git a/tests/test/util/test_config.cpp b/tests/test/util/test_config.cpp index ff7e726db..383253057 100644 --- a/tests/test/util/test_config.cpp +++ b/tests/test/util/test_config.cpp @@ -51,6 +51,8 @@ TEST_CASE("Test overriding system config initialisation", "[util]") std::string functionThreads = setEnvVar("FUNCTION_SERVER_THREADS", "111"); std::string stateThreads = setEnvVar("STATE_SERVER_THREADS", "222"); std::string snapshotThreads = setEnvVar("SNAPSHOT_SERVER_THREADS", "333"); + std::string pointToPointThreads = + setEnvVar("POINT_TO_POINT_SERVER_THREADS", "444"); std::string mpiSize = setEnvVar("DEFAULT_MPI_WORLD_SIZE", "2468"); std::string mpiPort = setEnvVar("MPI_BASE_PORT", "9999"); @@ -75,6 +77,7 @@ TEST_CASE("Test overriding system config initialisation", "[util]") REQUIRE(conf.functionServerThreads == 111); REQUIRE(conf.stateServerThreads == 222); REQUIRE(conf.snapshotServerThreads == 333); + REQUIRE(conf.pointToPointServerThreads == 444); REQUIRE(conf.defaultMpiWorldSize == 2468); REQUIRE(conf.mpiBasePort == 9999); @@ -99,6 +102,7 @@ TEST_CASE("Test overriding system config initialisation", "[util]") setEnvVar("FUNCTION_SERVER_THREADS", functionThreads); setEnvVar("STATE_SERVER_THREADS", stateThreads); setEnvVar("SNAPSHOT_SERVER_THREADS", snapshotThreads); + setEnvVar("POINT_TO_POINT_SERVER_THREADS", pointToPointThreads); setEnvVar("DEFAULT_MPI_WORLD_SIZE", mpiSize); setEnvVar("MPI_BASE_PORT", mpiPort); From 901667aa3591f302cd8a94dea9512b699956e62d Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Wed, 6 Oct 2021 15:46:01 +0000 Subject: [PATCH 03/22] Tidy up --- include/faabric/transport/PointToPointClient.h | 2 +- src/transport/PointToPointServer.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/faabric/transport/PointToPointClient.h b/include/faabric/transport/PointToPointClient.h index 6c3b53046..4fb2c9d08 100644 --- a/include/faabric/transport/PointToPointClient.h +++ b/include/faabric/transport/PointToPointClient.h @@ -1,6 +1,6 @@ #pragma once -#include "faabric/proto/faabric.pb.h" +#include #include namespace faabric::transport { diff --git a/src/transport/PointToPointServer.cpp b/src/transport/PointToPointServer.cpp index 3075f8e61..344028bf8 100644 --- a/src/transport/PointToPointServer.cpp +++ b/src/transport/PointToPointServer.cpp @@ -1,4 +1,4 @@ -#include "faabric/transport/PointToPointRegistry.h" +#include #include #include #include From 7b61ec68dba17ccdabadf0087c3521624c0a305c Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Wed, 6 Oct 2021 15:46:18 +0000 Subject: [PATCH 04/22] formatting --- include/faabric/transport/PointToPointClient.h | 4 ++-- src/transport/PointToPointServer.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/faabric/transport/PointToPointClient.h b/include/faabric/transport/PointToPointClient.h index 4fb2c9d08..d1a312d3a 100644 --- a/include/faabric/transport/PointToPointClient.h +++ b/include/faabric/transport/PointToPointClient.h @@ -10,8 +10,8 @@ class PointToPointClient : public faabric::transport::MessageEndpointClient public: PointToPointClient(const std::string& hostIn); - void sendMappings(faabric::PointToPointMappings &mappings); + void sendMappings(faabric::PointToPointMappings& mappings); - void sendMessage(faabric::PointToPointMessage &msg); + void sendMessage(faabric::PointToPointMessage& msg); }; } diff --git a/src/transport/PointToPointServer.cpp b/src/transport/PointToPointServer.cpp index 344028bf8..309669480 100644 --- a/src/transport/PointToPointServer.cpp +++ b/src/transport/PointToPointServer.cpp @@ -1,5 +1,5 @@ -#include #include +#include #include #include #include From afbf58886eb4c1bb4458ec127fc32e9f4014ae29 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Thu, 7 Oct 2021 09:37:33 +0000 Subject: [PATCH 05/22] Test for registry --- .../faabric/transport/PointToPointRegistry.h | 7 +- src/transport/PointToPointRegistry.cpp | 35 +++++++-- .../test_point_to_point_registry.cpp | 76 +++++++++++++++++++ 3 files changed, 111 insertions(+), 7 deletions(-) create mode 100644 tests/test/transport/test_point_to_point_registry.cpp diff --git a/include/faabric/transport/PointToPointRegistry.h b/include/faabric/transport/PointToPointRegistry.h index c64ad0558..ac8c7c912 100644 --- a/include/faabric/transport/PointToPointRegistry.h +++ b/include/faabric/transport/PointToPointRegistry.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace faabric::transport { class PointToPointRegistry @@ -15,11 +16,15 @@ class PointToPointRegistry void setHostForReceiver(int appId, int recvIdx, const std::string& host); - void broadcastMappings(int appId, std::vector indexes); + void broadcastMappings(int appId); + std::set getIdxsRegisteredForApp(int appId); + + void clear(); private: std::shared_mutex registryMutex; + std::unordered_map> appIdxs; std::unordered_map mappings; std::string getKey(int appId, int recvIdx); diff --git a/src/transport/PointToPointRegistry.cpp b/src/transport/PointToPointRegistry.cpp index 4d744b733..c4ef62e0f 100644 --- a/src/transport/PointToPointRegistry.cpp +++ b/src/transport/PointToPointRegistry.cpp @@ -9,6 +9,11 @@ namespace faabric::transport { PointToPointRegistry::PointToPointRegistry() {} +std::string PointToPointRegistry::getKey(int appId, int recvIdx) +{ + return fmt::format("{}-{}", appId, recvIdx); +} + std::string PointToPointRegistry::getHostForReceiver(int appId, int recvIdx) { faabric::util::SharedLock lock(registryMutex); @@ -16,11 +21,11 @@ std::string PointToPointRegistry::getHostForReceiver(int appId, int recvIdx) std::string key = getKey(appId, recvIdx); if (mappings.find(key) == mappings.end()) { - SPDLOG_ERROR("No point-to-point mapping for {}/{}", appId, recvIdx); + SPDLOG_ERROR("No point-to-point mapping for app {} idx {}", appId, recvIdx); throw std::runtime_error("No point-to-point mapping found"); } - return key; + return mappings[key]; } void PointToPointRegistry::setHostForReceiver(int appId, @@ -29,18 +34,22 @@ void PointToPointRegistry::setHostForReceiver(int appId, { faabric::util::FullLock lock(registryMutex); - std::string key = getKey(appId, recvIdx); + // Record this index for this app + appIdxs[appId].insert(recvIdx); + // Add host mapping + std::string key = getKey(appId, recvIdx); mappings[key] = host; } -void PointToPointRegistry::broadcastMappings(int appId, - std::vector indexes) +void PointToPointRegistry::broadcastMappings(int appId) { faabric::util::SharedLock lock(registryMutex); faabric::PointToPointMappings msg; + std::set& indexes = appIdxs[appId]; + for (auto i : indexes) { std::string key = getKey(appId, i); std::string host = mappings[key]; @@ -53,7 +62,8 @@ void PointToPointRegistry::broadcastMappings(int appId, auto& sch = faabric::scheduler::getScheduler(); - // TODO - can we just do the set of registered hosts here somehow? + // TODO seems excessive to broadcast to all hosts, could we perhaps use the + // set of registered hosts? std::set hosts = sch.getAvailableHosts(); for (const auto& host : hosts) { @@ -62,6 +72,19 @@ void PointToPointRegistry::broadcastMappings(int appId, } } +std::set PointToPointRegistry::getIdxsRegisteredForApp(int appId) +{ + faabric::util::SharedLock lock(registryMutex); + return appIdxs[appId]; +} + +void PointToPointRegistry::clear() +{ + faabric::util::SharedLock lock(registryMutex); + + mappings.clear(); +} + PointToPointRegistry& getPointToPointRegistry() { static PointToPointRegistry reg; diff --git a/tests/test/transport/test_point_to_point_registry.cpp b/tests/test/transport/test_point_to_point_registry.cpp new file mode 100644 index 000000000..6cc670e9a --- /dev/null +++ b/tests/test/transport/test_point_to_point_registry.cpp @@ -0,0 +1,76 @@ +#include "faabric_utils.h" +#include + +#include + +#include + +using namespace faabric::transport; +using namespace faabric::util; + +namespace tests { + +class PointToPointFixture +{ + public: + PointToPointFixture() + : reg(faabric::transport::getPointToPointRegistry()) + { + reg.clear(); + } + + ~PointToPointFixture() { reg.clear(); } + + protected: + faabric::transport::PointToPointRegistry& reg; +}; + +TEST_CASE_METHOD(PointToPointFixture, + "Test set and get point-to-point hosts", + "[transport]") +{ + // Note - deliberately overlap app indexes to make sure app id counts + int appIdA = 123; + int appIdB = 345; + int idxA1 = 0; + int idxB1 = 2; + int idxA2 = 10; + int idxB2 = 10; + + std::string hostA = "host-a"; + std::string hostB = "host-b"; + std::string hostC = "host-c"; + + REQUIRE_THROWS(reg.getHostForReceiver(appIdA, idxA1)); + REQUIRE_THROWS(reg.getHostForReceiver(appIdA, idxA2)); + REQUIRE_THROWS(reg.getHostForReceiver(appIdB, idxB1)); + REQUIRE_THROWS(reg.getHostForReceiver(appIdB, idxB2)); + + reg.setHostForReceiver(appIdA, idxA1, hostA); + reg.setHostForReceiver(appIdB, idxB1, hostB); + + std::set expectedA = { idxA1 }; + std::set expectedB = { idxB1 }; + REQUIRE(reg.getIdxsRegisteredForApp(appIdA) == expectedA); + REQUIRE(reg.getIdxsRegisteredForApp(appIdB) == expectedB); + + REQUIRE(reg.getHostForReceiver(appIdA, idxA1) == hostA); + REQUIRE_THROWS(reg.getHostForReceiver(appIdA, idxA2)); + REQUIRE(reg.getHostForReceiver(appIdB, idxB1) == hostB); + REQUIRE_THROWS(reg.getHostForReceiver(appIdB, idxB2)); + + reg.setHostForReceiver(appIdA, idxA2, hostB); + reg.setHostForReceiver(appIdB, idxB2, hostC); + + expectedA = { idxA1, idxA2 }; + expectedB = { idxB1, idxB2 }; + + REQUIRE(reg.getIdxsRegisteredForApp(appIdA) == expectedA); + REQUIRE(reg.getIdxsRegisteredForApp(appIdB) == expectedB); + + REQUIRE(reg.getHostForReceiver(appIdA, idxA1) == hostA); + REQUIRE(reg.getHostForReceiver(appIdA, idxA2) == hostB); + REQUIRE(reg.getHostForReceiver(appIdB, idxB1) == hostB); + REQUIRE(reg.getHostForReceiver(appIdB, idxB2) == hostC); +} +} From 12df2737054bd41772d8afa1fbd2506f04b66328 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Thu, 7 Oct 2021 10:26:00 +0000 Subject: [PATCH 06/22] Mocked test for sending mappings --- .../faabric/transport/PointToPointClient.h | 8 +++ .../faabric/transport/PointToPointRegistry.h | 6 ++ .../faabric/transport/PointToPointServer.h | 1 + src/transport/PointToPointClient.cpp | 39 ++++++++++- src/transport/PointToPointRegistry.cpp | 35 ++++++---- src/transport/PointToPointServer.cpp | 6 ++ .../test_point_to_point_registry.cpp | 15 ----- .../transport/test_point_to_point_server.cpp | 66 +++++++++++++++++++ tests/utils/fixtures.h | 27 ++++++++ 9 files changed, 173 insertions(+), 30 deletions(-) create mode 100644 tests/test/transport/test_point_to_point_server.cpp diff --git a/include/faabric/transport/PointToPointClient.h b/include/faabric/transport/PointToPointClient.h index d1a312d3a..806592426 100644 --- a/include/faabric/transport/PointToPointClient.h +++ b/include/faabric/transport/PointToPointClient.h @@ -5,6 +5,14 @@ namespace faabric::transport { +std::vector> +getSentMappings(); + +std::vector> +getSentPointToPointMessages(); + +void clearSentMessages(); + class PointToPointClient : public faabric::transport::MessageEndpointClient { public: diff --git a/include/faabric/transport/PointToPointRegistry.h b/include/faabric/transport/PointToPointRegistry.h index ac8c7c912..15dc837e8 100644 --- a/include/faabric/transport/PointToPointRegistry.h +++ b/include/faabric/transport/PointToPointRegistry.h @@ -1,5 +1,7 @@ #pragma once +#include + #include #include #include @@ -18,6 +20,8 @@ class PointToPointRegistry void broadcastMappings(int appId); + void sendMappings(int appId, const std::string &host); + std::set getIdxsRegisteredForApp(int appId); void clear(); @@ -27,6 +31,8 @@ class PointToPointRegistry std::unordered_map> appIdxs; std::unordered_map mappings; + faabric::scheduler::Scheduler &sch; + std::string getKey(int appId, int recvIdx); }; diff --git a/include/faabric/transport/PointToPointServer.h b/include/faabric/transport/PointToPointServer.h index 376a7fe81..f41a18d6e 100644 --- a/include/faabric/transport/PointToPointServer.h +++ b/include/faabric/transport/PointToPointServer.h @@ -8,6 +8,7 @@ class PointToPointServer final : public MessageEndpointServer public: PointToPointServer(); + protected: std::vector recvMessage(int appId, int sendIdx, int recvIdx); private: diff --git a/src/transport/PointToPointClient.cpp b/src/transport/PointToPointClient.cpp index fec354e51..872b82d3d 100644 --- a/src/transport/PointToPointClient.cpp +++ b/src/transport/PointToPointClient.cpp @@ -4,9 +4,34 @@ #include #include #include +#include namespace faabric::transport { +static std::vector> + sentMappings; + +static std::vector> + sentMessages; + +std::vector> +getSentMappings() +{ + return sentMappings; +} + +std::vector> +getSentPointToPointMessages() +{ + return sentMessages; +} + +void clearSentMessages() +{ + sentMappings.clear(); + sentMessages.clear(); +} + PointToPointClient::PointToPointClient(const std::string& hostIn) : faabric::transport::MessageEndpointClient(hostIn, POINT_TO_POINT_ASYNC_PORT, @@ -15,12 +40,20 @@ PointToPointClient::PointToPointClient(const std::string& hostIn) void PointToPointClient::sendMappings(faabric::PointToPointMappings& mappings) { - faabric::EmptyResponse resp; - syncSend(PointToPointCall::MAPPING, &mappings, &resp); + if (faabric::util::isMockMode()) { + sentMappings.emplace_back(host, mappings); + } else { + faabric::EmptyResponse resp; + syncSend(PointToPointCall::MAPPING, &mappings, &resp); + } } void PointToPointClient::sendMessage(faabric::PointToPointMessage& msg) { - asyncSend(PointToPointCall::MESSAGE, &msg); + if (faabric::util::isMockMode()) { + sentMessages.emplace_back(host, msg); + } else { + asyncSend(PointToPointCall::MESSAGE, &msg); + } } } diff --git a/src/transport/PointToPointRegistry.cpp b/src/transport/PointToPointRegistry.cpp index c4ef62e0f..74dc916b4 100644 --- a/src/transport/PointToPointRegistry.cpp +++ b/src/transport/PointToPointRegistry.cpp @@ -7,7 +7,9 @@ namespace faabric::transport { -PointToPointRegistry::PointToPointRegistry() {} +PointToPointRegistry::PointToPointRegistry() + : sch(faabric::scheduler::getScheduler()) +{} std::string PointToPointRegistry::getKey(int appId, int recvIdx) { @@ -21,7 +23,8 @@ std::string PointToPointRegistry::getHostForReceiver(int appId, int recvIdx) std::string key = getKey(appId, recvIdx); if (mappings.find(key) == mappings.end()) { - SPDLOG_ERROR("No point-to-point mapping for app {} idx {}", appId, recvIdx); + SPDLOG_ERROR( + "No point-to-point mapping for app {} idx {}", appId, recvIdx); throw std::runtime_error("No point-to-point mapping found"); } @@ -34,6 +37,9 @@ void PointToPointRegistry::setHostForReceiver(int appId, { faabric::util::FullLock lock(registryMutex); + SPDLOG_TRACE( + "Setting point-to-point mapping {}:{} to {}", appId, recvIdx, host); + // Record this index for this app appIdxs[appId].insert(recvIdx); @@ -43,6 +49,19 @@ void PointToPointRegistry::setHostForReceiver(int appId, } void PointToPointRegistry::broadcastMappings(int appId) +{ + auto& sch = faabric::scheduler::getScheduler(); + + // TODO seems excessive to broadcast to all hosts, could we perhaps use the + // set of registered hosts? + std::set hosts = sch.getAvailableHosts(); + + for (const auto& host : hosts) { + sendMappings(appId, host); + } +} + +void PointToPointRegistry::sendMappings(int appId, const std::string& host) { faabric::util::SharedLock lock(registryMutex); @@ -60,16 +79,8 @@ void PointToPointRegistry::broadcastMappings(int appId) mapping->set_host(host); } - auto& sch = faabric::scheduler::getScheduler(); - - // TODO seems excessive to broadcast to all hosts, could we perhaps use the - // set of registered hosts? - std::set hosts = sch.getAvailableHosts(); - - for (const auto& host : hosts) { - PointToPointClient cli(host); - cli.sendMappings(msg); - } + PointToPointClient cli(host); + cli.sendMappings(msg); } std::set PointToPointRegistry::getIdxsRegisteredForApp(int appId) diff --git a/src/transport/PointToPointServer.cpp b/src/transport/PointToPointServer.cpp index 309669480..85f412b04 100644 --- a/src/transport/PointToPointServer.cpp +++ b/src/transport/PointToPointServer.cpp @@ -25,6 +25,12 @@ void PointToPointServer::doAsyncRecv(int header, std::unique_ptr endpoint = getSendEndpoint(msg.appid(), msg.sendidx(), msg.recvidx()); + SPDLOG_TRACE("Forwarding point-to-point message {}:{}:{} to {}", + msg.appid(), + msg.sendidx(), + msg.recvidx(), + endpoint->getAddress()); + // TODO - is this copying the data? Would be nice to avoid if poss endpoint->send(BYTES_CONST(msg.data().c_str()), msg.data().size()); } diff --git a/tests/test/transport/test_point_to_point_registry.cpp b/tests/test/transport/test_point_to_point_registry.cpp index 6cc670e9a..b1454374d 100644 --- a/tests/test/transport/test_point_to_point_registry.cpp +++ b/tests/test/transport/test_point_to_point_registry.cpp @@ -10,21 +10,6 @@ using namespace faabric::util; namespace tests { -class PointToPointFixture -{ - public: - PointToPointFixture() - : reg(faabric::transport::getPointToPointRegistry()) - { - reg.clear(); - } - - ~PointToPointFixture() { reg.clear(); } - - protected: - faabric::transport::PointToPointRegistry& reg; -}; - TEST_CASE_METHOD(PointToPointFixture, "Test set and get point-to-point hosts", "[transport]") diff --git a/tests/test/transport/test_point_to_point_server.cpp b/tests/test/transport/test_point_to_point_server.cpp new file mode 100644 index 000000000..5493c31a1 --- /dev/null +++ b/tests/test/transport/test_point_to_point_server.cpp @@ -0,0 +1,66 @@ +#include "faabric/util/testing.h" +#include + +#include + +#include +#include +#include +#include +#include + +using namespace faabric::transport; + +namespace tests { + +TEST_CASE_METHOD(PointToPointFixture, "Test sending mappings", "[transport]") +{ + faabric::util::setMockMode(true); + + int appIdA = 123; + int appIdB = 345; + + int idxA1 = 1; + int idxA2 = 2; + int idxB1 = 1; + + std::string hostA = "host-a"; + std::string hostB = "host-b"; + + reg.setHostForReceiver(appIdA, idxA1, hostA); + reg.setHostForReceiver(appIdA, idxA2, hostB); + reg.setHostForReceiver(appIdB, idxB1, hostB); + + reg.sendMappings(appIdA, "other-host"); + + auto actualSent = getSentMappings(); + REQUIRE(actualSent.size() == 1); + + faabric::PointToPointMappings actualMappings = actualSent.at(0).second; + REQUIRE(actualMappings.mappings().size() == 2); + + faabric::PointToPointMappings::PointToPointMapping mappingA = + actualMappings.mappings().at(0); + faabric::PointToPointMappings::PointToPointMapping mappingB = + actualMappings.mappings().at(1); + + REQUIRE(mappingA.appid() == appIdA); + REQUIRE(mappingB.appid() == appIdA); + + // Note - we don't know the order the mappings are sent in so we have to + // check both possibilities + if (mappingA.recvidx() == idxA1) { + REQUIRE(mappingA.host() == hostA); + + REQUIRE(mappingB.recvidx() == idxA2); + REQUIRE(mappingB.host() == hostB); + } else if (mappingA.recvidx() == idxA2) { + REQUIRE(mappingA.host() == hostB); + + REQUIRE(mappingB.recvidx() == idxA1); + REQUIRE(mappingB.host() == hostA); + } else { + FAIL(); + } +} +} diff --git a/tests/utils/fixtures.h b/tests/utils/fixtures.h index 2826eeb3d..6af939988 100644 --- a/tests/utils/fixtures.h +++ b/tests/utils/fixtures.h @@ -10,6 +10,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -265,4 +268,28 @@ class RemoteMpiTestFixture : public MpiBaseTestFixture faabric::scheduler::MpiWorld otherWorld; }; + +class PointToPointFixture +{ + public: + PointToPointFixture() + : reg(faabric::transport::getPointToPointRegistry()) + , cli(LOCALHOST) + { + reg.clear(); + server.start(); + } + + ~PointToPointFixture() + { + server.stop(); + reg.clear(); + faabric::util::setMockMode(false); + } + + protected: + faabric::transport::PointToPointRegistry& reg; + faabric::transport::PointToPointServer server; + faabric::transport::PointToPointClient cli; +}; } From 236e83c5fb02a908cf3a0f54203f7ffdc810d32c Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Thu, 7 Oct 2021 10:33:47 +0000 Subject: [PATCH 07/22] Mappings tests --- .../faabric/transport/PointToPointRegistry.h | 1 + src/transport/PointToPointRegistry.cpp | 1 + .../transport/test_point_to_point_server.cpp | 48 ++++++++++++++++++- 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/include/faabric/transport/PointToPointRegistry.h b/include/faabric/transport/PointToPointRegistry.h index 15dc837e8..0a06e5eeb 100644 --- a/include/faabric/transport/PointToPointRegistry.h +++ b/include/faabric/transport/PointToPointRegistry.h @@ -25,6 +25,7 @@ class PointToPointRegistry std::set getIdxsRegisteredForApp(int appId); void clear(); + private: std::shared_mutex registryMutex; diff --git a/src/transport/PointToPointRegistry.cpp b/src/transport/PointToPointRegistry.cpp index 74dc916b4..01413c446 100644 --- a/src/transport/PointToPointRegistry.cpp +++ b/src/transport/PointToPointRegistry.cpp @@ -93,6 +93,7 @@ void PointToPointRegistry::clear() { faabric::util::SharedLock lock(registryMutex); + appIdxs.clear(); mappings.clear(); } diff --git a/tests/test/transport/test_point_to_point_server.cpp b/tests/test/transport/test_point_to_point_server.cpp index 5493c31a1..fb8ed04fa 100644 --- a/tests/test/transport/test_point_to_point_server.cpp +++ b/tests/test/transport/test_point_to_point_server.cpp @@ -13,7 +13,9 @@ using namespace faabric::transport; namespace tests { -TEST_CASE_METHOD(PointToPointFixture, "Test sending mappings", "[transport]") +TEST_CASE_METHOD(PointToPointFixture, + "Test sending mappings via registry", + "[transport]") { faabric::util::setMockMode(true); @@ -63,4 +65,48 @@ TEST_CASE_METHOD(PointToPointFixture, "Test sending mappings", "[transport]") FAIL(); } } + +TEST_CASE_METHOD(PointToPointFixture, + "Test sending mappings from client", + "[transport]") +{ + int appIdA = 123; + int appIdB = 345; + + int idxA1 = 1; + int idxA2 = 2; + int idxB1 = 1; + + std::string hostA = "host-a"; + std::string hostB = "host-b"; + + REQUIRE(reg.getIdxsRegisteredForApp(appIdA).empty()); + REQUIRE(reg.getIdxsRegisteredForApp(appIdB).empty()); + + faabric::PointToPointMappings mappings; + + auto* mappingA1 = mappings.add_mappings(); + mappingA1->set_appid(appIdA); + mappingA1->set_recvidx(idxA1); + mappingA1->set_host(hostA); + + auto* mappingA2 = mappings.add_mappings(); + mappingA2->set_appid(appIdA); + mappingA2->set_recvidx(idxA2); + mappingA2->set_host(hostB); + + auto* mappingB1 = mappings.add_mappings(); + mappingB1->set_appid(appIdB); + mappingB1->set_recvidx(idxB1); + mappingB1->set_host(hostA); + + cli.sendMappings(mappings); + + REQUIRE(reg.getIdxsRegisteredForApp(appIdA).size() == 2); + REQUIRE(reg.getIdxsRegisteredForApp(appIdB).size() == 1); + + REQUIRE(reg.getHostForReceiver(appIdA, idxA1) == hostA); + REQUIRE(reg.getHostForReceiver(appIdA, idxA2) == hostB); + REQUIRE(reg.getHostForReceiver(appIdB, idxB1) == hostA); +} } From 61b063d918c10af71a4b2c25415271a957d9e8e0 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Thu, 7 Oct 2021 13:55:24 +0000 Subject: [PATCH 08/22] Fleshing out rest of implementation --- include/faabric/runner/FaabricMain.h | 6 +-- ...ntToPointServer.h => PointToPointBroker.h} | 12 ++--- .../faabric/transport/PointToPointRegistry.h | 14 ++++-- include/faabric/util/config.h | 2 +- src/runner/FaabricMain.cpp | 6 +-- src/transport/CMakeLists.txt | 4 +- ...PointServer.cpp => PointToPointBroker.cpp} | 47 +++++-------------- src/transport/PointToPointRegistry.cpp | 40 +++++++++++++++- src/util/config.cpp | 4 +- ... => test_point_to_point_server_broker.cpp} | 28 ++++++++++- tests/test/util/test_config.cpp | 6 +-- tests/utils/fixtures.h | 4 +- 12 files changed, 111 insertions(+), 62 deletions(-) rename include/faabric/transport/{PointToPointServer.h => PointToPointBroker.h} (68%) rename src/transport/{PointToPointServer.cpp => PointToPointBroker.cpp} (55%) rename tests/test/transport/{test_point_to_point_server.cpp => test_point_to_point_server_broker.cpp} (82%) diff --git a/include/faabric/runner/FaabricMain.h b/include/faabric/runner/FaabricMain.h index 6a5fa6f72..f2fb01bb0 100644 --- a/include/faabric/runner/FaabricMain.h +++ b/include/faabric/runner/FaabricMain.h @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include namespace faabric::runner { @@ -24,7 +24,7 @@ class FaabricMain void startSnapshotServer(); - void startPointToPointServer(); + void startPointToPointBroker(); void shutdown(); @@ -32,6 +32,6 @@ class FaabricMain faabric::state::StateServer stateServer; faabric::scheduler::FunctionCallServer functionServer; faabric::snapshot::SnapshotServer snapshotServer; - faabric::transport::PointToPointServer pointToPointServer; + faabric::transport::PointToPointBroker pointToPointBroker; }; } diff --git a/include/faabric/transport/PointToPointServer.h b/include/faabric/transport/PointToPointBroker.h similarity index 68% rename from include/faabric/transport/PointToPointServer.h rename to include/faabric/transport/PointToPointBroker.h index f41a18d6e..fdad5b8cf 100644 --- a/include/faabric/transport/PointToPointServer.h +++ b/include/faabric/transport/PointToPointBroker.h @@ -3,10 +3,13 @@ #include namespace faabric::transport { -class PointToPointServer final : public MessageEndpointServer + +std::string getPointToPointInprocLabel(int appId, int sendIdx, int recvIdx); + +class PointToPointBroker final : public MessageEndpointServer { public: - PointToPointServer(); + PointToPointBroker(); protected: std::vector recvMessage(int appId, int sendIdx, int recvIdx); @@ -19,12 +22,7 @@ class PointToPointServer final : public MessageEndpointServer std::unique_ptr doSyncRecv(int header, const uint8_t* buffer, size_t bufferSize) override; - std::string getInprocLabel(int appId, int sendIdx, int recvIdx); - std::unique_ptr getSendEndpoint(int appId, int sendIdx, int recvIdx); - - std::unique_ptr - getRecvEndpoint(int appId, int sendIdx, int recvIdx); }; } diff --git a/include/faabric/transport/PointToPointRegistry.h b/include/faabric/transport/PointToPointRegistry.h index 0a06e5eeb..26107b8e7 100644 --- a/include/faabric/transport/PointToPointRegistry.h +++ b/include/faabric/transport/PointToPointRegistry.h @@ -2,11 +2,11 @@ #include +#include #include #include #include #include -#include namespace faabric::transport { class PointToPointRegistry @@ -20,10 +20,18 @@ class PointToPointRegistry void broadcastMappings(int appId); - void sendMappings(int appId, const std::string &host); + void sendMappings(int appId, const std::string& host); std::set getIdxsRegisteredForApp(int appId); + void sendMessage(int appId, + int sendIdx, + int recvIdx, + const uint8_t* buffer, + size_t bufferSize); + + std::vector recvMessage(int appId, int sendIdx, int recvIdx); + void clear(); private: @@ -32,7 +40,7 @@ class PointToPointRegistry std::unordered_map> appIdxs; std::unordered_map mappings; - faabric::scheduler::Scheduler &sch; + faabric::scheduler::Scheduler& sch; std::string getKey(int appId, int recvIdx); }; diff --git a/include/faabric/util/config.h b/include/faabric/util/config.h index b424ebcf4..37ecc2699 100644 --- a/include/faabric/util/config.h +++ b/include/faabric/util/config.h @@ -47,7 +47,7 @@ class SystemConfig int functionServerThreads; int stateServerThreads; int snapshotServerThreads; - int pointToPointServerThreads; + int pointToPointBrokerThreads; SystemConfig(); diff --git a/src/runner/FaabricMain.cpp b/src/runner/FaabricMain.cpp index 286f4c863..1b7d08185 100644 --- a/src/runner/FaabricMain.cpp +++ b/src/runner/FaabricMain.cpp @@ -41,7 +41,7 @@ void FaabricMain::startBackground() startSnapshotServer(); // Point-to-point messaging - startPointToPointServer(); + startPointToPointBroker(); // Work sharing startFunctionCallServer(); @@ -74,10 +74,10 @@ void FaabricMain::startSnapshotServer() snapshotServer.start(); } -void FaabricMain::startPointToPointServer() +void FaabricMain::startPointToPointBroker() { SPDLOG_INFO("Starting point-to-point server"); - pointToPointServer.start(); + pointToPointBroker.start(); } void FaabricMain::startStateServer() diff --git a/src/transport/CMakeLists.txt b/src/transport/CMakeLists.txt index 5929b30d1..17d10b78f 100644 --- a/src/transport/CMakeLists.txt +++ b/src/transport/CMakeLists.txt @@ -12,7 +12,7 @@ set(HEADERS "${FAABRIC_INCLUDE_DIR}/faabric/transport/MessageEndpointServer.h" "${FAABRIC_INCLUDE_DIR}/faabric/transport/MpiMessageEndpoint.h" "${FAABRIC_INCLUDE_DIR}/faabric/transport/PointToPointClient.h" - "${FAABRIC_INCLUDE_DIR}/faabric/transport/PointToPointServer.h" + "${FAABRIC_INCLUDE_DIR}/faabric/transport/PointToPointBroker.h" "${FAABRIC_INCLUDE_DIR}/faabric/transport/PointToPointRegistry.h" ) @@ -24,7 +24,7 @@ set(LIB_FILES MessageEndpointServer.cpp MpiMessageEndpoint.cpp PointToPointClient.cpp - PointToPointServer.cpp + PointToPointBroker.cpp PointToPointRegistry.cpp ${HEADERS} ) diff --git a/src/transport/PointToPointServer.cpp b/src/transport/PointToPointBroker.cpp similarity index 55% rename from src/transport/PointToPointServer.cpp rename to src/transport/PointToPointBroker.cpp index 85f412b04..9001ada0a 100644 --- a/src/transport/PointToPointServer.cpp +++ b/src/transport/PointToPointBroker.cpp @@ -1,6 +1,6 @@ #include +#include #include -#include #include #include #include @@ -8,15 +8,21 @@ #include namespace faabric::transport { -PointToPointServer::PointToPointServer() + +std::string getPointToPointInprocLabel(int appId, int sendIdx, int recvIdx) +{ + return fmt::format("{}-{}-{}", appId, sendIdx, recvIdx); +} + +PointToPointBroker::PointToPointBroker() : faabric::transport::MessageEndpointServer( POINT_TO_POINT_ASYNC_PORT, POINT_TO_POINT_SYNC_PORT, POINT_TO_POINT_INPROC_LABEL, - faabric::util::getSystemConfig().pointToPointServerThreads) + faabric::util::getSystemConfig().pointToPointBrokerThreads) {} -void PointToPointServer::doAsyncRecv(int header, +void PointToPointBroker::doAsyncRecv(int header, const uint8_t* buffer, size_t bufferSize) { @@ -35,7 +41,7 @@ void PointToPointServer::doAsyncRecv(int header, endpoint->send(BYTES_CONST(msg.data().c_str()), msg.data().size()); } -std::unique_ptr PointToPointServer::doSyncRecv( +std::unique_ptr PointToPointBroker::doSyncRecv( int header, const uint8_t* buffer, size_t bufferSize) @@ -51,38 +57,11 @@ std::unique_ptr PointToPointServer::doSyncRecv( return std::make_unique(); } -std::vector PointToPointServer::recvMessage(int appId, - int sendIdx, - int recvIdx) -{ - std::unique_ptr endpoint = - getRecvEndpoint(appId, sendIdx, recvIdx); - - std::optional messageDataMaybe = endpoint->recv().value(); - Message& messageData = messageDataMaybe.value(); - - // TODO - possible to avoid this copy too? - return messageData.dataCopy(); -} - -std::string PointToPointServer::getInprocLabel(int appId, - int sendIdx, - int recvIdx) -{ - return fmt::format("{}-{}-{}", appId, sendIdx, recvIdx); -} - std::unique_ptr -PointToPointServer::getSendEndpoint(int appId, int sendIdx, int recvIdx) +PointToPointBroker::getSendEndpoint(int appId, int sendIdx, int recvIdx) { - std::string label = getInprocLabel(appId, sendIdx, recvIdx); + std::string label = getPointToPointInprocLabel(appId, sendIdx, recvIdx); return std::make_unique(label); } -std::unique_ptr -PointToPointServer::getRecvEndpoint(int appId, int sendIdx, int recvIdx) -{ - std::string label = getInprocLabel(appId, sendIdx, recvIdx); - return std::make_unique(label); -} } diff --git a/src/transport/PointToPointRegistry.cpp b/src/transport/PointToPointRegistry.cpp index 01413c446..d6f16a539 100644 --- a/src/transport/PointToPointRegistry.cpp +++ b/src/transport/PointToPointRegistry.cpp @@ -1,6 +1,8 @@ -#include "faabric/transport/PointToPointClient.h" +#include "faabric/util/config.h" #include #include +#include +#include #include #include #include @@ -89,6 +91,42 @@ std::set PointToPointRegistry::getIdxsRegisteredForApp(int appId) return appIdxs[appId]; } +void PointToPointRegistry::sendMessage(int appId, + int sendIdx, + int recvIdx, + const uint8_t* buffer, + size_t bufferSize) +{ + std::string host = getHostForReceiver(appId, recvIdx); + + // TODO - if this host, put directly onto queue + if (host == faabric::util::getSystemConfig().endpointHost) { + std::string label = getPointToPointInprocLabel(appId, sendIdx, recvIdx); + + // TODO - need to keep this endpoint in scope in the same thread until + // the message is consumed. + std::unique_ptr sendEndpoint = + std::make_unique(label); + } else { + // TODO - if not, create client and send to remote host + } +} + +std::vector PointToPointRegistry::recvMessage(int appId, + int sendIdx, + int recvIdx) +{ + std::string label = getPointToPointInprocLabel(appId, sendIdx, recvIdx); + std::unique_ptr endpoint = + std::make_unique(label); + + std::optional messageDataMaybe = endpoint->recv().value(); + Message messageData = messageDataMaybe.value(); + + // TODO - possible to avoid this copy? + return messageData.dataCopy(); +} + void PointToPointRegistry::clear() { faabric::util::SharedLock lock(registryMutex); diff --git a/src/util/config.cpp b/src/util/config.cpp index 633291de3..8b00037b3 100644 --- a/src/util/config.cpp +++ b/src/util/config.cpp @@ -65,8 +65,8 @@ void SystemConfig::initialise() this->getSystemConfIntParam("STATE_SERVER_THREADS", "2"); snapshotServerThreads = this->getSystemConfIntParam("SNAPSHOT_SERVER_THREADS", "2"); - pointToPointServerThreads = - this->getSystemConfIntParam("POINT_TO_POINT_SERVER_THREADS", "2"); + pointToPointBrokerThreads = + this->getSystemConfIntParam("POINT_TO_POINT_BROKER_THREADS", "2"); } int SystemConfig::getSystemConfIntParam(const char* name, diff --git a/tests/test/transport/test_point_to_point_server.cpp b/tests/test/transport/test_point_to_point_server_broker.cpp similarity index 82% rename from tests/test/transport/test_point_to_point_server.cpp rename to tests/test/transport/test_point_to_point_server_broker.cpp index fb8ed04fa..e3a24600b 100644 --- a/tests/test/transport/test_point_to_point_server.cpp +++ b/tests/test/transport/test_point_to_point_server_broker.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include using namespace faabric::transport; @@ -109,4 +109,30 @@ TEST_CASE_METHOD(PointToPointFixture, REQUIRE(reg.getHostForReceiver(appIdA, idxA2) == hostB); REQUIRE(reg.getHostForReceiver(appIdB, idxB1) == hostA); } + +TEST_CASE_METHOD(PointToPointFixture, + "Test sending point-to-point message" + "[transport]") +{ + int appId = 123; + int sendIdx = 5; + int recvIdx = 10; + + // Register the recv index on this host + reg.setHostForReceiver(appId, recvIdx, LOCALHOST); + + std::vector data = { 0, 1, 2, 3 }; + + faabric::PointToPointMessage msg; + msg.set_appid(appId); + msg.set_recvidx(recvIdx); + msg.set_sendidx(sendIdx); + + // Make sure we send the message before a receiver is available + cli.sendMessage(msg); + + std::thread t([] { + + }); +} } diff --git a/tests/test/util/test_config.cpp b/tests/test/util/test_config.cpp index 383253057..306581826 100644 --- a/tests/test/util/test_config.cpp +++ b/tests/test/util/test_config.cpp @@ -52,7 +52,7 @@ TEST_CASE("Test overriding system config initialisation", "[util]") std::string stateThreads = setEnvVar("STATE_SERVER_THREADS", "222"); std::string snapshotThreads = setEnvVar("SNAPSHOT_SERVER_THREADS", "333"); std::string pointToPointThreads = - setEnvVar("POINT_TO_POINT_SERVER_THREADS", "444"); + setEnvVar("POINT_TO_POINT_BROKER_THREADS", "444"); std::string mpiSize = setEnvVar("DEFAULT_MPI_WORLD_SIZE", "2468"); std::string mpiPort = setEnvVar("MPI_BASE_PORT", "9999"); @@ -77,7 +77,7 @@ TEST_CASE("Test overriding system config initialisation", "[util]") REQUIRE(conf.functionServerThreads == 111); REQUIRE(conf.stateServerThreads == 222); REQUIRE(conf.snapshotServerThreads == 333); - REQUIRE(conf.pointToPointServerThreads == 444); + REQUIRE(conf.pointToPointBrokerThreads == 444); REQUIRE(conf.defaultMpiWorldSize == 2468); REQUIRE(conf.mpiBasePort == 9999); @@ -102,7 +102,7 @@ TEST_CASE("Test overriding system config initialisation", "[util]") setEnvVar("FUNCTION_SERVER_THREADS", functionThreads); setEnvVar("STATE_SERVER_THREADS", stateThreads); setEnvVar("SNAPSHOT_SERVER_THREADS", snapshotThreads); - setEnvVar("POINT_TO_POINT_SERVER_THREADS", pointToPointThreads); + setEnvVar("POINT_TO_POINT_BROKER_THREADS", pointToPointThreads); setEnvVar("DEFAULT_MPI_WORLD_SIZE", mpiSize); setEnvVar("MPI_BASE_PORT", mpiPort); diff --git a/tests/utils/fixtures.h b/tests/utils/fixtures.h index 6af939988..8306417e8 100644 --- a/tests/utils/fixtures.h +++ b/tests/utils/fixtures.h @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include @@ -289,7 +289,7 @@ class PointToPointFixture protected: faabric::transport::PointToPointRegistry& reg; - faabric::transport::PointToPointServer server; + faabric::transport::PointToPointBroker server; faabric::transport::PointToPointClient cli; }; } From 61d1d11cc85a5032a3f43fcc3f259082dd2b5dce Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Thu, 7 Oct 2021 15:27:44 +0000 Subject: [PATCH 09/22] Test for point to point messaging --- .../faabric/transport/PointToPointBroker.h | 11 +-- src/transport/PointToPointBroker.cpp | 29 ++----- src/transport/PointToPointRegistry.cpp | 31 ++++++- ...ver_broker.cpp => test_point_to_point.cpp} | 87 +++++++++++++++---- .../test_point_to_point_registry.cpp | 61 ------------- 5 files changed, 109 insertions(+), 110 deletions(-) rename tests/test/transport/{test_point_to_point_server_broker.cpp => test_point_to_point.cpp} (56%) delete mode 100644 tests/test/transport/test_point_to_point_registry.cpp diff --git a/include/faabric/transport/PointToPointBroker.h b/include/faabric/transport/PointToPointBroker.h index fdad5b8cf..7fa5867b6 100644 --- a/include/faabric/transport/PointToPointBroker.h +++ b/include/faabric/transport/PointToPointBroker.h @@ -1,28 +1,23 @@ #pragma once #include +#include namespace faabric::transport { -std::string getPointToPointInprocLabel(int appId, int sendIdx, int recvIdx); - class PointToPointBroker final : public MessageEndpointServer { public: PointToPointBroker(); - protected: - std::vector recvMessage(int appId, int sendIdx, int recvIdx); - private: + PointToPointRegistry ® + void doAsyncRecv(int header, const uint8_t* buffer, size_t bufferSize) override; std::unique_ptr doSyncRecv(int header, const uint8_t* buffer, size_t bufferSize) override; - - std::unique_ptr - getSendEndpoint(int appId, int sendIdx, int recvIdx); }; } diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp index 9001ada0a..2667e6534 100644 --- a/src/transport/PointToPointBroker.cpp +++ b/src/transport/PointToPointBroker.cpp @@ -9,17 +9,13 @@ namespace faabric::transport { -std::string getPointToPointInprocLabel(int appId, int sendIdx, int recvIdx) -{ - return fmt::format("{}-{}-{}", appId, sendIdx, recvIdx); -} - PointToPointBroker::PointToPointBroker() : faabric::transport::MessageEndpointServer( POINT_TO_POINT_ASYNC_PORT, POINT_TO_POINT_SYNC_PORT, POINT_TO_POINT_INPROC_LABEL, faabric::util::getSystemConfig().pointToPointBrokerThreads) + , reg(getPointToPointRegistry()) {} void PointToPointBroker::doAsyncRecv(int header, @@ -28,17 +24,11 @@ void PointToPointBroker::doAsyncRecv(int header, { PARSE_MSG(faabric::PointToPointMessage, buffer, bufferSize) - std::unique_ptr endpoint = - getSendEndpoint(msg.appid(), msg.sendidx(), msg.recvidx()); - - SPDLOG_TRACE("Forwarding point-to-point message {}:{}:{} to {}", - msg.appid(), - msg.sendidx(), - msg.recvidx(), - endpoint->getAddress()); - - // TODO - is this copying the data? Would be nice to avoid if poss - endpoint->send(BYTES_CONST(msg.data().c_str()), msg.data().size()); + reg.sendMessage(msg.appid(), + msg.sendidx(), + msg.recvidx(), + BYTES_CONST(msg.data().c_str()), + msg.data().size()); } std::unique_ptr PointToPointBroker::doSyncRecv( @@ -57,11 +47,4 @@ std::unique_ptr PointToPointBroker::doSyncRecv( return std::make_unique(); } -std::unique_ptr -PointToPointBroker::getSendEndpoint(int appId, int sendIdx, int recvIdx) -{ - std::string label = getPointToPointInprocLabel(appId, sendIdx, recvIdx); - return std::make_unique(label); -} - } diff --git a/src/transport/PointToPointRegistry.cpp b/src/transport/PointToPointRegistry.cpp index d6f16a539..13afd0c5e 100644 --- a/src/transport/PointToPointRegistry.cpp +++ b/src/transport/PointToPointRegistry.cpp @@ -9,6 +9,11 @@ namespace faabric::transport { +std::string getPointToPointInprocLabel(int appId, int sendIdx, int recvIdx) +{ + return fmt::format("{}-{}-{}", appId, sendIdx, recvIdx); +} + PointToPointRegistry::PointToPointRegistry() : sch(faabric::scheduler::getScheduler()) {} @@ -99,16 +104,36 @@ void PointToPointRegistry::sendMessage(int appId, { std::string host = getHostForReceiver(appId, recvIdx); - // TODO - if this host, put directly onto queue if (host == faabric::util::getSystemConfig().endpointHost) { std::string label = getPointToPointInprocLabel(appId, sendIdx, recvIdx); // TODO - need to keep this endpoint in scope in the same thread until // the message is consumed. - std::unique_ptr sendEndpoint = + std::unique_ptr endpoint = std::make_unique(label); + SPDLOG_TRACE("Local point-to-point message {}:{}:{} to {}", + appId, + sendIdx, + recvIdx, + endpoint->getAddress()); + + endpoint->send(buffer, bufferSize); + } else { - // TODO - if not, create client and send to remote host + PointToPointClient cli(host); + faabric::PointToPointMessage msg; + msg.set_appid(appId); + msg.set_sendidx(sendIdx); + msg.set_recvidx(recvIdx); + msg.set_data(buffer, bufferSize); + + SPDLOG_TRACE("Remote point-to-point message {}:{}:{} to {}", + appId, + sendIdx, + recvIdx, + host); + + cli.sendMessage(msg); } } diff --git a/tests/test/transport/test_point_to_point_server_broker.cpp b/tests/test/transport/test_point_to_point.cpp similarity index 56% rename from tests/test/transport/test_point_to_point_server_broker.cpp rename to tests/test/transport/test_point_to_point.cpp index e3a24600b..e09fc1317 100644 --- a/tests/test/transport/test_point_to_point_server_broker.cpp +++ b/tests/test/transport/test_point_to_point.cpp @@ -1,18 +1,65 @@ -#include "faabric/util/testing.h" +#include "faabric/util/config.h" +#include "faabric_utils.h" #include -#include +#include -#include -#include -#include #include -#include using namespace faabric::transport; +using namespace faabric::util; namespace tests { +TEST_CASE_METHOD(PointToPointFixture, + "Test set and get point-to-point hosts", + "[transport]") +{ + // Note - deliberately overlap app indexes to make sure app id counts + int appIdA = 123; + int appIdB = 345; + int idxA1 = 0; + int idxB1 = 2; + int idxA2 = 10; + int idxB2 = 10; + + std::string hostA = "host-a"; + std::string hostB = "host-b"; + std::string hostC = "host-c"; + + REQUIRE_THROWS(reg.getHostForReceiver(appIdA, idxA1)); + REQUIRE_THROWS(reg.getHostForReceiver(appIdA, idxA2)); + REQUIRE_THROWS(reg.getHostForReceiver(appIdB, idxB1)); + REQUIRE_THROWS(reg.getHostForReceiver(appIdB, idxB2)); + + reg.setHostForReceiver(appIdA, idxA1, hostA); + reg.setHostForReceiver(appIdB, idxB1, hostB); + + std::set expectedA = { idxA1 }; + std::set expectedB = { idxB1 }; + REQUIRE(reg.getIdxsRegisteredForApp(appIdA) == expectedA); + REQUIRE(reg.getIdxsRegisteredForApp(appIdB) == expectedB); + + REQUIRE(reg.getHostForReceiver(appIdA, idxA1) == hostA); + REQUIRE_THROWS(reg.getHostForReceiver(appIdA, idxA2)); + REQUIRE(reg.getHostForReceiver(appIdB, idxB1) == hostB); + REQUIRE_THROWS(reg.getHostForReceiver(appIdB, idxB2)); + + reg.setHostForReceiver(appIdA, idxA2, hostB); + reg.setHostForReceiver(appIdB, idxB2, hostC); + + expectedA = { idxA1, idxA2 }; + expectedB = { idxB1, idxB2 }; + + REQUIRE(reg.getIdxsRegisteredForApp(appIdA) == expectedA); + REQUIRE(reg.getIdxsRegisteredForApp(appIdB) == expectedB); + + REQUIRE(reg.getHostForReceiver(appIdA, idxA1) == hostA); + REQUIRE(reg.getHostForReceiver(appIdA, idxA2) == hostB); + REQUIRE(reg.getHostForReceiver(appIdB, idxB1) == hostB); + REQUIRE(reg.getHostForReceiver(appIdB, idxB2) == hostC); +} + TEST_CASE_METHOD(PointToPointFixture, "Test sending mappings via registry", "[transport]") @@ -118,21 +165,31 @@ TEST_CASE_METHOD(PointToPointFixture, int sendIdx = 5; int recvIdx = 10; + // Ensure this host is set to localhost + faabric::util::SystemConfig& conf = faabric::util::getSystemConfig(); + conf.endpointHost = LOCALHOST; + // Register the recv index on this host reg.setHostForReceiver(appId, recvIdx, LOCALHOST); - std::vector data = { 0, 1, 2, 3 }; + std::vector sentData = { 0, 1, 2, 3 }; + std::vector receivedData; - faabric::PointToPointMessage msg; - msg.set_appid(appId); - msg.set_recvidx(recvIdx); - msg.set_sendidx(sendIdx); + // Make sure we send the message before a receiver is available to check + // async handling + reg.sendMessage(appId, sendIdx, recvIdx, sentData.data(), sentData.size()); - // Make sure we send the message before a receiver is available - cli.sendMessage(msg); + std::thread t([appId, sendIdx, recvIdx, &receivedData] { + PointToPointRegistry& reg = getPointToPointRegistry(); + receivedData = reg.recvMessage(appId, sendIdx, recvIdx); + }); - std::thread t([] { + if (t.joinable()) { + t.join(); + } - }); + REQUIRE(receivedData == sentData); + + conf.reset(); } } diff --git a/tests/test/transport/test_point_to_point_registry.cpp b/tests/test/transport/test_point_to_point_registry.cpp deleted file mode 100644 index b1454374d..000000000 --- a/tests/test/transport/test_point_to_point_registry.cpp +++ /dev/null @@ -1,61 +0,0 @@ -#include "faabric_utils.h" -#include - -#include - -#include - -using namespace faabric::transport; -using namespace faabric::util; - -namespace tests { - -TEST_CASE_METHOD(PointToPointFixture, - "Test set and get point-to-point hosts", - "[transport]") -{ - // Note - deliberately overlap app indexes to make sure app id counts - int appIdA = 123; - int appIdB = 345; - int idxA1 = 0; - int idxB1 = 2; - int idxA2 = 10; - int idxB2 = 10; - - std::string hostA = "host-a"; - std::string hostB = "host-b"; - std::string hostC = "host-c"; - - REQUIRE_THROWS(reg.getHostForReceiver(appIdA, idxA1)); - REQUIRE_THROWS(reg.getHostForReceiver(appIdA, idxA2)); - REQUIRE_THROWS(reg.getHostForReceiver(appIdB, idxB1)); - REQUIRE_THROWS(reg.getHostForReceiver(appIdB, idxB2)); - - reg.setHostForReceiver(appIdA, idxA1, hostA); - reg.setHostForReceiver(appIdB, idxB1, hostB); - - std::set expectedA = { idxA1 }; - std::set expectedB = { idxB1 }; - REQUIRE(reg.getIdxsRegisteredForApp(appIdA) == expectedA); - REQUIRE(reg.getIdxsRegisteredForApp(appIdB) == expectedB); - - REQUIRE(reg.getHostForReceiver(appIdA, idxA1) == hostA); - REQUIRE_THROWS(reg.getHostForReceiver(appIdA, idxA2)); - REQUIRE(reg.getHostForReceiver(appIdB, idxB1) == hostB); - REQUIRE_THROWS(reg.getHostForReceiver(appIdB, idxB2)); - - reg.setHostForReceiver(appIdA, idxA2, hostB); - reg.setHostForReceiver(appIdB, idxB2, hostC); - - expectedA = { idxA1, idxA2 }; - expectedB = { idxB1, idxB2 }; - - REQUIRE(reg.getIdxsRegisteredForApp(appIdA) == expectedA); - REQUIRE(reg.getIdxsRegisteredForApp(appIdB) == expectedB); - - REQUIRE(reg.getHostForReceiver(appIdA, idxA1) == hostA); - REQUIRE(reg.getHostForReceiver(appIdA, idxA2) == hostB); - REQUIRE(reg.getHostForReceiver(appIdB, idxB1) == hostB); - REQUIRE(reg.getHostForReceiver(appIdB, idxB2) == hostC); -} -} From d5e572c49814c00be138124d6f5e1b5ea357af04 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Thu, 7 Oct 2021 16:05:44 +0000 Subject: [PATCH 10/22] Rename and tidy --- include/faabric/runner/FaabricMain.h | 6 +- .../faabric/transport/PointToPointBroker.h | 46 ++++- .../faabric/transport/PointToPointRegistry.h | 49 ----- .../faabric/transport/PointToPointServer.h | 23 +++ src/runner/FaabricMain.cpp | 4 +- src/transport/CMakeLists.txt | 4 +- src/transport/PointToPointBroker.cpp | 176 +++++++++++++++--- src/transport/PointToPointRegistry.cpp | 168 ----------------- src/transport/PointToPointServer.cpp | 50 +++++ tests/test/transport/test_point_to_point.cpp | 85 ++++----- tests/utils/fixtures.h | 14 +- 11 files changed, 313 insertions(+), 312 deletions(-) delete mode 100644 include/faabric/transport/PointToPointRegistry.h create mode 100644 include/faabric/transport/PointToPointServer.h delete mode 100644 src/transport/PointToPointRegistry.cpp create mode 100644 src/transport/PointToPointServer.cpp diff --git a/include/faabric/runner/FaabricMain.h b/include/faabric/runner/FaabricMain.h index f2fb01bb0..c3ac23a84 100644 --- a/include/faabric/runner/FaabricMain.h +++ b/include/faabric/runner/FaabricMain.h @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include namespace faabric::runner { @@ -24,7 +24,7 @@ class FaabricMain void startSnapshotServer(); - void startPointToPointBroker(); + void startPointToPointServer(); void shutdown(); @@ -32,6 +32,6 @@ class FaabricMain faabric::state::StateServer stateServer; faabric::scheduler::FunctionCallServer functionServer; faabric::snapshot::SnapshotServer snapshotServer; - faabric::transport::PointToPointBroker pointToPointBroker; + faabric::transport::PointToPointServer pointToPointBroker; }; } diff --git a/include/faabric/transport/PointToPointBroker.h b/include/faabric/transport/PointToPointBroker.h index 7fa5867b6..8d02f0ff4 100644 --- a/include/faabric/transport/PointToPointBroker.h +++ b/include/faabric/transport/PointToPointBroker.h @@ -1,23 +1,49 @@ #pragma once -#include -#include +#include -namespace faabric::transport { +#include +#include +#include +#include +#include -class PointToPointBroker final : public MessageEndpointServer +namespace faabric::transport { +class PointToPointBroker { public: PointToPointBroker(); - private: - PointToPointRegistry ® + std::string getHostForReceiver(int appId, int recvIdx); + + void setHostForReceiver(int appId, int recvIdx, const std::string& host); + + void broadcastMappings(int appId); + + void sendMappings(int appId, const std::string& host); + + std::set getIdxsRegisteredForApp(int appId); - void doAsyncRecv(int header, + void sendMessage(int appId, + int sendIdx, + int recvIdx, const uint8_t* buffer, - size_t bufferSize) override; + size_t bufferSize); - std::unique_ptr - doSyncRecv(int header, const uint8_t* buffer, size_t bufferSize) override; + std::vector recvMessage(int appId, int sendIdx, int recvIdx); + + void clear(); + + private: + std::shared_mutex registryMutex; + + std::unordered_map> appIdxs; + std::unordered_map mappings; + + faabric::scheduler::Scheduler& sch; + + std::string getKey(int appId, int recvIdx); }; + +PointToPointBroker& getPointToPointBroker(); } diff --git a/include/faabric/transport/PointToPointRegistry.h b/include/faabric/transport/PointToPointRegistry.h deleted file mode 100644 index 26107b8e7..000000000 --- a/include/faabric/transport/PointToPointRegistry.h +++ /dev/null @@ -1,49 +0,0 @@ -#pragma once - -#include - -#include -#include -#include -#include -#include - -namespace faabric::transport { -class PointToPointRegistry -{ - public: - PointToPointRegistry(); - - std::string getHostForReceiver(int appId, int recvIdx); - - void setHostForReceiver(int appId, int recvIdx, const std::string& host); - - void broadcastMappings(int appId); - - void sendMappings(int appId, const std::string& host); - - std::set getIdxsRegisteredForApp(int appId); - - void sendMessage(int appId, - int sendIdx, - int recvIdx, - const uint8_t* buffer, - size_t bufferSize); - - std::vector recvMessage(int appId, int sendIdx, int recvIdx); - - void clear(); - - private: - std::shared_mutex registryMutex; - - std::unordered_map> appIdxs; - std::unordered_map mappings; - - faabric::scheduler::Scheduler& sch; - - std::string getKey(int appId, int recvIdx); -}; - -PointToPointRegistry& getPointToPointRegistry(); -} diff --git a/include/faabric/transport/PointToPointServer.h b/include/faabric/transport/PointToPointServer.h new file mode 100644 index 000000000..9024fa2d8 --- /dev/null +++ b/include/faabric/transport/PointToPointServer.h @@ -0,0 +1,23 @@ +#pragma once + +#include +#include + +namespace faabric::transport { + +class PointToPointServer final : public MessageEndpointServer +{ + public: + PointToPointServer(); + + private: + PointToPointBroker ® + + void doAsyncRecv(int header, + const uint8_t* buffer, + size_t bufferSize) override; + + std::unique_ptr + doSyncRecv(int header, const uint8_t* buffer, size_t bufferSize) override; +}; +} diff --git a/src/runner/FaabricMain.cpp b/src/runner/FaabricMain.cpp index 1b7d08185..68cc74408 100644 --- a/src/runner/FaabricMain.cpp +++ b/src/runner/FaabricMain.cpp @@ -41,7 +41,7 @@ void FaabricMain::startBackground() startSnapshotServer(); // Point-to-point messaging - startPointToPointBroker(); + startPointToPointServer(); // Work sharing startFunctionCallServer(); @@ -74,7 +74,7 @@ void FaabricMain::startSnapshotServer() snapshotServer.start(); } -void FaabricMain::startPointToPointBroker() +void FaabricMain::startPointToPointServer() { SPDLOG_INFO("Starting point-to-point server"); pointToPointBroker.start(); diff --git a/src/transport/CMakeLists.txt b/src/transport/CMakeLists.txt index 17d10b78f..b7d12de35 100644 --- a/src/transport/CMakeLists.txt +++ b/src/transport/CMakeLists.txt @@ -12,8 +12,8 @@ set(HEADERS "${FAABRIC_INCLUDE_DIR}/faabric/transport/MessageEndpointServer.h" "${FAABRIC_INCLUDE_DIR}/faabric/transport/MpiMessageEndpoint.h" "${FAABRIC_INCLUDE_DIR}/faabric/transport/PointToPointClient.h" + "${FAABRIC_INCLUDE_DIR}/faabric/transport/PointToPointServer.h" "${FAABRIC_INCLUDE_DIR}/faabric/transport/PointToPointBroker.h" - "${FAABRIC_INCLUDE_DIR}/faabric/transport/PointToPointRegistry.h" ) set(LIB_FILES @@ -24,8 +24,8 @@ set(LIB_FILES MessageEndpointServer.cpp MpiMessageEndpoint.cpp PointToPointClient.cpp + PointToPointServer.cpp PointToPointBroker.cpp - PointToPointRegistry.cpp ${HEADERS} ) diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp index 2667e6534..18596a8b0 100644 --- a/src/transport/PointToPointBroker.cpp +++ b/src/transport/PointToPointBroker.cpp @@ -1,50 +1,168 @@ +#include "faabric/util/config.h" #include +#include #include -#include -#include -#include -#include +#include +#include +#include #include -#include namespace faabric::transport { +std::string getPointToPointInprocLabel(int appId, int sendIdx, int recvIdx) +{ + return fmt::format("{}-{}-{}", appId, sendIdx, recvIdx); +} + PointToPointBroker::PointToPointBroker() - : faabric::transport::MessageEndpointServer( - POINT_TO_POINT_ASYNC_PORT, - POINT_TO_POINT_SYNC_PORT, - POINT_TO_POINT_INPROC_LABEL, - faabric::util::getSystemConfig().pointToPointBrokerThreads) - , reg(getPointToPointRegistry()) + : sch(faabric::scheduler::getScheduler()) {} -void PointToPointBroker::doAsyncRecv(int header, - const uint8_t* buffer, - size_t bufferSize) +std::string PointToPointBroker::getKey(int appId, int recvIdx) +{ + return fmt::format("{}-{}", appId, recvIdx); +} + +std::string PointToPointBroker::getHostForReceiver(int appId, int recvIdx) +{ + faabric::util::SharedLock lock(registryMutex); + + std::string key = getKey(appId, recvIdx); + + if (mappings.find(key) == mappings.end()) { + SPDLOG_ERROR( + "No point-to-point mapping for app {} idx {}", appId, recvIdx); + throw std::runtime_error("No point-to-point mapping found"); + } + + return mappings[key]; +} + +void PointToPointBroker::setHostForReceiver(int appId, + int recvIdx, + const std::string& host) +{ + faabric::util::FullLock lock(registryMutex); + + SPDLOG_TRACE( + "Setting point-to-point mapping {}:{} to {}", appId, recvIdx, host); + + // Record this index for this app + appIdxs[appId].insert(recvIdx); + + // Add host mapping + std::string key = getKey(appId, recvIdx); + mappings[key] = host; +} + +void PointToPointBroker::broadcastMappings(int appId) +{ + auto& sch = faabric::scheduler::getScheduler(); + + // TODO seems excessive to broadcast to all hosts, could we perhaps use the + // set of registered hosts? + std::set hosts = sch.getAvailableHosts(); + + for (const auto& host : hosts) { + sendMappings(appId, host); + } +} + +void PointToPointBroker::sendMappings(int appId, const std::string& host) { - PARSE_MSG(faabric::PointToPointMessage, buffer, bufferSize) + faabric::util::SharedLock lock(registryMutex); + + faabric::PointToPointMappings msg; + + std::set& indexes = appIdxs[appId]; + + for (auto i : indexes) { + std::string key = getKey(appId, i); + std::string host = mappings[key]; + + auto* mapping = msg.add_mappings(); + mapping->set_appid(appId); + mapping->set_recvidx(i); + mapping->set_host(host); + } + + PointToPointClient cli(host); + cli.sendMappings(msg); +} - reg.sendMessage(msg.appid(), - msg.sendidx(), - msg.recvidx(), - BYTES_CONST(msg.data().c_str()), - msg.data().size()); +std::set PointToPointBroker::getIdxsRegisteredForApp(int appId) +{ + faabric::util::SharedLock lock(registryMutex); + return appIdxs[appId]; } -std::unique_ptr PointToPointBroker::doSyncRecv( - int header, - const uint8_t* buffer, - size_t bufferSize) +void PointToPointBroker::sendMessage(int appId, + int sendIdx, + int recvIdx, + const uint8_t* buffer, + size_t bufferSize) { - PARSE_MSG(faabric::PointToPointMappings, buffer, bufferSize) + std::string host = getHostForReceiver(appId, recvIdx); + + if (host == faabric::util::getSystemConfig().endpointHost) { + std::string label = getPointToPointInprocLabel(appId, sendIdx, recvIdx); + + // TODO - need to keep this endpoint in scope in the same thread until + // the message is consumed. + std::unique_ptr endpoint = + std::make_unique(label); + SPDLOG_TRACE("Local point-to-point message {}:{}:{} to {}", + appId, + sendIdx, + recvIdx, + endpoint->getAddress()); - PointToPointRegistry& reg = getPointToPointRegistry(); + endpoint->send(buffer, bufferSize); - for (const auto& m : msg.mappings()) { - reg.setHostForReceiver(m.appid(), m.recvidx(), m.host()); + } else { + PointToPointClient cli(host); + faabric::PointToPointMessage msg; + msg.set_appid(appId); + msg.set_sendidx(sendIdx); + msg.set_recvidx(recvIdx); + msg.set_data(buffer, bufferSize); + + SPDLOG_TRACE("Remote point-to-point message {}:{}:{} to {}", + appId, + sendIdx, + recvIdx, + host); + + cli.sendMessage(msg); } +} + +std::vector PointToPointBroker::recvMessage(int appId, + int sendIdx, + int recvIdx) +{ + std::string label = getPointToPointInprocLabel(appId, sendIdx, recvIdx); + std::unique_ptr endpoint = + std::make_unique(label); + + std::optional messageDataMaybe = endpoint->recv().value(); + Message messageData = messageDataMaybe.value(); - return std::make_unique(); + // TODO - possible to avoid this copy? + return messageData.dataCopy(); } +void PointToPointBroker::clear() +{ + faabric::util::SharedLock lock(registryMutex); + + appIdxs.clear(); + mappings.clear(); +} + +PointToPointBroker& getPointToPointBroker() +{ + static PointToPointBroker reg; + return reg; +} } diff --git a/src/transport/PointToPointRegistry.cpp b/src/transport/PointToPointRegistry.cpp deleted file mode 100644 index 13afd0c5e..000000000 --- a/src/transport/PointToPointRegistry.cpp +++ /dev/null @@ -1,168 +0,0 @@ -#include "faabric/util/config.h" -#include -#include -#include -#include -#include -#include -#include - -namespace faabric::transport { - -std::string getPointToPointInprocLabel(int appId, int sendIdx, int recvIdx) -{ - return fmt::format("{}-{}-{}", appId, sendIdx, recvIdx); -} - -PointToPointRegistry::PointToPointRegistry() - : sch(faabric::scheduler::getScheduler()) -{} - -std::string PointToPointRegistry::getKey(int appId, int recvIdx) -{ - return fmt::format("{}-{}", appId, recvIdx); -} - -std::string PointToPointRegistry::getHostForReceiver(int appId, int recvIdx) -{ - faabric::util::SharedLock lock(registryMutex); - - std::string key = getKey(appId, recvIdx); - - if (mappings.find(key) == mappings.end()) { - SPDLOG_ERROR( - "No point-to-point mapping for app {} idx {}", appId, recvIdx); - throw std::runtime_error("No point-to-point mapping found"); - } - - return mappings[key]; -} - -void PointToPointRegistry::setHostForReceiver(int appId, - int recvIdx, - const std::string& host) -{ - faabric::util::FullLock lock(registryMutex); - - SPDLOG_TRACE( - "Setting point-to-point mapping {}:{} to {}", appId, recvIdx, host); - - // Record this index for this app - appIdxs[appId].insert(recvIdx); - - // Add host mapping - std::string key = getKey(appId, recvIdx); - mappings[key] = host; -} - -void PointToPointRegistry::broadcastMappings(int appId) -{ - auto& sch = faabric::scheduler::getScheduler(); - - // TODO seems excessive to broadcast to all hosts, could we perhaps use the - // set of registered hosts? - std::set hosts = sch.getAvailableHosts(); - - for (const auto& host : hosts) { - sendMappings(appId, host); - } -} - -void PointToPointRegistry::sendMappings(int appId, const std::string& host) -{ - faabric::util::SharedLock lock(registryMutex); - - faabric::PointToPointMappings msg; - - std::set& indexes = appIdxs[appId]; - - for (auto i : indexes) { - std::string key = getKey(appId, i); - std::string host = mappings[key]; - - auto* mapping = msg.add_mappings(); - mapping->set_appid(appId); - mapping->set_recvidx(i); - mapping->set_host(host); - } - - PointToPointClient cli(host); - cli.sendMappings(msg); -} - -std::set PointToPointRegistry::getIdxsRegisteredForApp(int appId) -{ - faabric::util::SharedLock lock(registryMutex); - return appIdxs[appId]; -} - -void PointToPointRegistry::sendMessage(int appId, - int sendIdx, - int recvIdx, - const uint8_t* buffer, - size_t bufferSize) -{ - std::string host = getHostForReceiver(appId, recvIdx); - - if (host == faabric::util::getSystemConfig().endpointHost) { - std::string label = getPointToPointInprocLabel(appId, sendIdx, recvIdx); - - // TODO - need to keep this endpoint in scope in the same thread until - // the message is consumed. - std::unique_ptr endpoint = - std::make_unique(label); - SPDLOG_TRACE("Local point-to-point message {}:{}:{} to {}", - appId, - sendIdx, - recvIdx, - endpoint->getAddress()); - - endpoint->send(buffer, bufferSize); - - } else { - PointToPointClient cli(host); - faabric::PointToPointMessage msg; - msg.set_appid(appId); - msg.set_sendidx(sendIdx); - msg.set_recvidx(recvIdx); - msg.set_data(buffer, bufferSize); - - SPDLOG_TRACE("Remote point-to-point message {}:{}:{} to {}", - appId, - sendIdx, - recvIdx, - host); - - cli.sendMessage(msg); - } -} - -std::vector PointToPointRegistry::recvMessage(int appId, - int sendIdx, - int recvIdx) -{ - std::string label = getPointToPointInprocLabel(appId, sendIdx, recvIdx); - std::unique_ptr endpoint = - std::make_unique(label); - - std::optional messageDataMaybe = endpoint->recv().value(); - Message messageData = messageDataMaybe.value(); - - // TODO - possible to avoid this copy? - return messageData.dataCopy(); -} - -void PointToPointRegistry::clear() -{ - faabric::util::SharedLock lock(registryMutex); - - appIdxs.clear(); - mappings.clear(); -} - -PointToPointRegistry& getPointToPointRegistry() -{ - static PointToPointRegistry reg; - return reg; -} -} diff --git a/src/transport/PointToPointServer.cpp b/src/transport/PointToPointServer.cpp new file mode 100644 index 000000000..f2e804995 --- /dev/null +++ b/src/transport/PointToPointServer.cpp @@ -0,0 +1,50 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faabric::transport { + +PointToPointServer::PointToPointServer() + : faabric::transport::MessageEndpointServer( + POINT_TO_POINT_ASYNC_PORT, + POINT_TO_POINT_SYNC_PORT, + POINT_TO_POINT_INPROC_LABEL, + faabric::util::getSystemConfig().pointToPointBrokerThreads) + , reg(getPointToPointBroker()) +{} + +void PointToPointServer::doAsyncRecv(int header, + const uint8_t* buffer, + size_t bufferSize) +{ + PARSE_MSG(faabric::PointToPointMessage, buffer, bufferSize) + + reg.sendMessage(msg.appid(), + msg.sendidx(), + msg.recvidx(), + BYTES_CONST(msg.data().c_str()), + msg.data().size()); +} + +std::unique_ptr PointToPointServer::doSyncRecv( + int header, + const uint8_t* buffer, + size_t bufferSize) +{ + PARSE_MSG(faabric::PointToPointMappings, buffer, bufferSize) + + PointToPointBroker& reg = getPointToPointBroker(); + + for (const auto& m : msg.mappings()) { + reg.setHostForReceiver(m.appid(), m.recvidx(), m.host()); + } + + return std::make_unique(); +} + +} diff --git a/tests/test/transport/test_point_to_point.cpp b/tests/test/transport/test_point_to_point.cpp index e09fc1317..a23099957 100644 --- a/tests/test/transport/test_point_to_point.cpp +++ b/tests/test/transport/test_point_to_point.cpp @@ -4,7 +4,7 @@ #include -#include +#include using namespace faabric::transport; using namespace faabric::util; @@ -13,7 +13,7 @@ namespace tests { TEST_CASE_METHOD(PointToPointFixture, "Test set and get point-to-point hosts", - "[transport]") + "[transport][ptp]") { // Note - deliberately overlap app indexes to make sure app id counts int appIdA = 123; @@ -27,42 +27,42 @@ TEST_CASE_METHOD(PointToPointFixture, std::string hostB = "host-b"; std::string hostC = "host-c"; - REQUIRE_THROWS(reg.getHostForReceiver(appIdA, idxA1)); - REQUIRE_THROWS(reg.getHostForReceiver(appIdA, idxA2)); - REQUIRE_THROWS(reg.getHostForReceiver(appIdB, idxB1)); - REQUIRE_THROWS(reg.getHostForReceiver(appIdB, idxB2)); + REQUIRE_THROWS(broker.getHostForReceiver(appIdA, idxA1)); + REQUIRE_THROWS(broker.getHostForReceiver(appIdA, idxA2)); + REQUIRE_THROWS(broker.getHostForReceiver(appIdB, idxB1)); + REQUIRE_THROWS(broker.getHostForReceiver(appIdB, idxB2)); - reg.setHostForReceiver(appIdA, idxA1, hostA); - reg.setHostForReceiver(appIdB, idxB1, hostB); + broker.setHostForReceiver(appIdA, idxA1, hostA); + broker.setHostForReceiver(appIdB, idxB1, hostB); std::set expectedA = { idxA1 }; std::set expectedB = { idxB1 }; - REQUIRE(reg.getIdxsRegisteredForApp(appIdA) == expectedA); - REQUIRE(reg.getIdxsRegisteredForApp(appIdB) == expectedB); + REQUIRE(broker.getIdxsRegisteredForApp(appIdA) == expectedA); + REQUIRE(broker.getIdxsRegisteredForApp(appIdB) == expectedB); - REQUIRE(reg.getHostForReceiver(appIdA, idxA1) == hostA); - REQUIRE_THROWS(reg.getHostForReceiver(appIdA, idxA2)); - REQUIRE(reg.getHostForReceiver(appIdB, idxB1) == hostB); - REQUIRE_THROWS(reg.getHostForReceiver(appIdB, idxB2)); + REQUIRE(broker.getHostForReceiver(appIdA, idxA1) == hostA); + REQUIRE_THROWS(broker.getHostForReceiver(appIdA, idxA2)); + REQUIRE(broker.getHostForReceiver(appIdB, idxB1) == hostB); + REQUIRE_THROWS(broker.getHostForReceiver(appIdB, idxB2)); - reg.setHostForReceiver(appIdA, idxA2, hostB); - reg.setHostForReceiver(appIdB, idxB2, hostC); + broker.setHostForReceiver(appIdA, idxA2, hostB); + broker.setHostForReceiver(appIdB, idxB2, hostC); expectedA = { idxA1, idxA2 }; expectedB = { idxB1, idxB2 }; - REQUIRE(reg.getIdxsRegisteredForApp(appIdA) == expectedA); - REQUIRE(reg.getIdxsRegisteredForApp(appIdB) == expectedB); + REQUIRE(broker.getIdxsRegisteredForApp(appIdA) == expectedA); + REQUIRE(broker.getIdxsRegisteredForApp(appIdB) == expectedB); - REQUIRE(reg.getHostForReceiver(appIdA, idxA1) == hostA); - REQUIRE(reg.getHostForReceiver(appIdA, idxA2) == hostB); - REQUIRE(reg.getHostForReceiver(appIdB, idxB1) == hostB); - REQUIRE(reg.getHostForReceiver(appIdB, idxB2) == hostC); + REQUIRE(broker.getHostForReceiver(appIdA, idxA1) == hostA); + REQUIRE(broker.getHostForReceiver(appIdA, idxA2) == hostB); + REQUIRE(broker.getHostForReceiver(appIdB, idxB1) == hostB); + REQUIRE(broker.getHostForReceiver(appIdB, idxB2) == hostC); } TEST_CASE_METHOD(PointToPointFixture, - "Test sending mappings via registry", - "[transport]") + "Test sending mappings via brokeristry", + "[transport][ptp]") { faabric::util::setMockMode(true); @@ -76,11 +76,11 @@ TEST_CASE_METHOD(PointToPointFixture, std::string hostA = "host-a"; std::string hostB = "host-b"; - reg.setHostForReceiver(appIdA, idxA1, hostA); - reg.setHostForReceiver(appIdA, idxA2, hostB); - reg.setHostForReceiver(appIdB, idxB1, hostB); + broker.setHostForReceiver(appIdA, idxA1, hostA); + broker.setHostForReceiver(appIdA, idxA2, hostB); + broker.setHostForReceiver(appIdB, idxB1, hostB); - reg.sendMappings(appIdA, "other-host"); + broker.sendMappings(appIdA, "other-host"); auto actualSent = getSentMappings(); REQUIRE(actualSent.size() == 1); @@ -115,7 +115,7 @@ TEST_CASE_METHOD(PointToPointFixture, TEST_CASE_METHOD(PointToPointFixture, "Test sending mappings from client", - "[transport]") + "[transport][ptp]") { int appIdA = 123; int appIdB = 345; @@ -127,8 +127,8 @@ TEST_CASE_METHOD(PointToPointFixture, std::string hostA = "host-a"; std::string hostB = "host-b"; - REQUIRE(reg.getIdxsRegisteredForApp(appIdA).empty()); - REQUIRE(reg.getIdxsRegisteredForApp(appIdB).empty()); + REQUIRE(broker.getIdxsRegisteredForApp(appIdA).empty()); + REQUIRE(broker.getIdxsRegisteredForApp(appIdB).empty()); faabric::PointToPointMappings mappings; @@ -149,17 +149,17 @@ TEST_CASE_METHOD(PointToPointFixture, cli.sendMappings(mappings); - REQUIRE(reg.getIdxsRegisteredForApp(appIdA).size() == 2); - REQUIRE(reg.getIdxsRegisteredForApp(appIdB).size() == 1); + REQUIRE(broker.getIdxsRegisteredForApp(appIdA).size() == 2); + REQUIRE(broker.getIdxsRegisteredForApp(appIdB).size() == 1); - REQUIRE(reg.getHostForReceiver(appIdA, idxA1) == hostA); - REQUIRE(reg.getHostForReceiver(appIdA, idxA2) == hostB); - REQUIRE(reg.getHostForReceiver(appIdB, idxB1) == hostA); + REQUIRE(broker.getHostForReceiver(appIdA, idxA1) == hostA); + REQUIRE(broker.getHostForReceiver(appIdA, idxA2) == hostB); + REQUIRE(broker.getHostForReceiver(appIdB, idxB1) == hostA); } TEST_CASE_METHOD(PointToPointFixture, "Test sending point-to-point message" - "[transport]") + "[transport][ptp]") { int appId = 123; int sendIdx = 5; @@ -169,19 +169,20 @@ TEST_CASE_METHOD(PointToPointFixture, faabric::util::SystemConfig& conf = faabric::util::getSystemConfig(); conf.endpointHost = LOCALHOST; - // Register the recv index on this host - reg.setHostForReceiver(appId, recvIdx, LOCALHOST); + // brokerister the recv index on this host + broker.setHostForReceiver(appId, recvIdx, LOCALHOST); std::vector sentData = { 0, 1, 2, 3 }; std::vector receivedData; // Make sure we send the message before a receiver is available to check // async handling - reg.sendMessage(appId, sendIdx, recvIdx, sentData.data(), sentData.size()); + broker.sendMessage( + appId, sendIdx, recvIdx, sentData.data(), sentData.size()); std::thread t([appId, sendIdx, recvIdx, &receivedData] { - PointToPointRegistry& reg = getPointToPointRegistry(); - receivedData = reg.recvMessage(appId, sendIdx, recvIdx); + PointToPointBroker& broker = getPointToPointBroker(); + receivedData = broker.recvMessage(appId, sendIdx, recvIdx); }); if (t.joinable()) { diff --git a/tests/utils/fixtures.h b/tests/utils/fixtures.h index 8306417e8..6f41b2abc 100644 --- a/tests/utils/fixtures.h +++ b/tests/utils/fixtures.h @@ -10,9 +10,9 @@ #include #include #include -#include -#include #include +#include +#include #include #include #include @@ -273,23 +273,23 @@ class PointToPointFixture { public: PointToPointFixture() - : reg(faabric::transport::getPointToPointRegistry()) + : broker(faabric::transport::getPointToPointBroker()) , cli(LOCALHOST) { - reg.clear(); + broker.clear(); server.start(); } ~PointToPointFixture() { server.stop(); - reg.clear(); + broker.clear(); faabric::util::setMockMode(false); } protected: - faabric::transport::PointToPointRegistry& reg; - faabric::transport::PointToPointBroker server; + faabric::transport::PointToPointBroker& broker; + faabric::transport::PointToPointServer server; faabric::transport::PointToPointClient cli; }; } From a2d8a4f904b37a26cb2ff7da7bdc579cbc03cdc1 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Thu, 7 Oct 2021 16:30:18 +0000 Subject: [PATCH 11/22] Sleep to enforce asyncness --- .../faabric/transport/PointToPointBroker.h | 2 - src/transport/PointToPointBroker.cpp | 51 +++++++++---------- tests/test/transport/test_point_to_point.cpp | 10 ++-- 3 files changed, 31 insertions(+), 32 deletions(-) diff --git a/include/faabric/transport/PointToPointBroker.h b/include/faabric/transport/PointToPointBroker.h index 8d02f0ff4..a74787a08 100644 --- a/include/faabric/transport/PointToPointBroker.h +++ b/include/faabric/transport/PointToPointBroker.h @@ -41,8 +41,6 @@ class PointToPointBroker std::unordered_map mappings; faabric::scheduler::Scheduler& sch; - - std::string getKey(int appId, int recvIdx); }; PointToPointBroker& getPointToPointBroker(); diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp index 18596a8b0..61797002f 100644 --- a/src/transport/PointToPointBroker.cpp +++ b/src/transport/PointToPointBroker.cpp @@ -3,31 +3,30 @@ #include #include #include -#include #include #include namespace faabric::transport { -std::string getPointToPointInprocLabel(int appId, int sendIdx, int recvIdx) +std::string getPointToPointKey(int appId, int sendIdx, int recvIdx) { return fmt::format("{}-{}-{}", appId, sendIdx, recvIdx); } -PointToPointBroker::PointToPointBroker() - : sch(faabric::scheduler::getScheduler()) -{} - -std::string PointToPointBroker::getKey(int appId, int recvIdx) +std::string getPointToPointKey(int appId, int recvIdx) { return fmt::format("{}-{}", appId, recvIdx); } +PointToPointBroker::PointToPointBroker() + : sch(faabric::scheduler::getScheduler()) +{} + std::string PointToPointBroker::getHostForReceiver(int appId, int recvIdx) { faabric::util::SharedLock lock(registryMutex); - std::string key = getKey(appId, recvIdx); + std::string key = getPointToPointKey(appId, recvIdx); if (mappings.find(key) == mappings.end()) { SPDLOG_ERROR( @@ -39,8 +38,8 @@ std::string PointToPointBroker::getHostForReceiver(int appId, int recvIdx) } void PointToPointBroker::setHostForReceiver(int appId, - int recvIdx, - const std::string& host) + int recvIdx, + const std::string& host) { faabric::util::FullLock lock(registryMutex); @@ -51,7 +50,7 @@ void PointToPointBroker::setHostForReceiver(int appId, appIdxs[appId].insert(recvIdx); // Add host mapping - std::string key = getKey(appId, recvIdx); + std::string key = getPointToPointKey(appId, recvIdx); mappings[key] = host; } @@ -77,7 +76,7 @@ void PointToPointBroker::sendMappings(int appId, const std::string& host) std::set& indexes = appIdxs[appId]; for (auto i : indexes) { - std::string key = getKey(appId, i); + std::string key = getPointToPointKey(appId, i); std::string host = mappings[key]; auto* mapping = msg.add_mappings(); @@ -97,27 +96,26 @@ std::set PointToPointBroker::getIdxsRegisteredForApp(int appId) } void PointToPointBroker::sendMessage(int appId, - int sendIdx, - int recvIdx, - const uint8_t* buffer, - size_t bufferSize) + int sendIdx, + int recvIdx, + const uint8_t* buffer, + size_t bufferSize) { std::string host = getHostForReceiver(appId, recvIdx); if (host == faabric::util::getSystemConfig().endpointHost) { - std::string label = getPointToPointInprocLabel(appId, sendIdx, recvIdx); + std::string label = getPointToPointKey(appId, sendIdx, recvIdx); // TODO - need to keep this endpoint in scope in the same thread until // the message is consumed. - std::unique_ptr endpoint = - std::make_unique(label); + AsyncInternalSendMessageEndpoint endpoint(label); SPDLOG_TRACE("Local point-to-point message {}:{}:{} to {}", appId, sendIdx, recvIdx, - endpoint->getAddress()); + endpoint.getAddress()); - endpoint->send(buffer, bufferSize); + endpoint.send(buffer, bufferSize); } else { PointToPointClient cli(host); @@ -138,14 +136,13 @@ void PointToPointBroker::sendMessage(int appId, } std::vector PointToPointBroker::recvMessage(int appId, - int sendIdx, - int recvIdx) + int sendIdx, + int recvIdx) { - std::string label = getPointToPointInprocLabel(appId, sendIdx, recvIdx); - std::unique_ptr endpoint = - std::make_unique(label); + std::string label = getPointToPointKey(appId, sendIdx, recvIdx); + AsyncInternalRecvMessageEndpoint endpoint(label); - std::optional messageDataMaybe = endpoint->recv().value(); + std::optional messageDataMaybe = endpoint.recv().value(); Message messageData = messageDataMaybe.value(); // TODO - possible to avoid this copy? diff --git a/tests/test/transport/test_point_to_point.cpp b/tests/test/transport/test_point_to_point.cpp index a23099957..950ad2f21 100644 --- a/tests/test/transport/test_point_to_point.cpp +++ b/tests/test/transport/test_point_to_point.cpp @@ -1,10 +1,12 @@ -#include "faabric/util/config.h" -#include "faabric_utils.h" #include +#include "faabric_utils.h" + #include #include +#include +#include using namespace faabric::transport; using namespace faabric::util; @@ -61,7 +63,7 @@ TEST_CASE_METHOD(PointToPointFixture, } TEST_CASE_METHOD(PointToPointFixture, - "Test sending mappings via brokeristry", + "Test sending mappings via broker", "[transport][ptp]") { faabric::util::setMockMode(true); @@ -180,6 +182,8 @@ TEST_CASE_METHOD(PointToPointFixture, broker.sendMessage( appId, sendIdx, recvIdx, sentData.data(), sentData.size()); + SLEEP_MS(1000); + std::thread t([appId, sendIdx, recvIdx, &receivedData] { PointToPointBroker& broker = getPointToPointBroker(); receivedData = broker.recvMessage(appId, sendIdx, recvIdx); From bed0dacbb2635db0e77f61c1d745f6bf6530cbf0 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Thu, 7 Oct 2021 17:31:04 +0000 Subject: [PATCH 12/22] Caching sockets --- .../faabric/transport/MessageEndpointServer.h | 2 + .../faabric/transport/PointToPointBroker.h | 2 + .../faabric/transport/PointToPointServer.h | 2 + src/transport/MessageEndpointServer.cpp | 7 +++ src/transport/PointToPointBroker.cpp | 53 ++++++++++++++++--- src/transport/PointToPointServer.cpp | 7 ++- tests/test/transport/test_point_to_point.cpp | 8 +-- tests/utils/fixtures.h | 4 ++ 8 files changed, 73 insertions(+), 12 deletions(-) diff --git a/include/faabric/transport/MessageEndpointServer.h b/include/faabric/transport/MessageEndpointServer.h index 7f9f7438a..950a21dca 100644 --- a/include/faabric/transport/MessageEndpointServer.h +++ b/include/faabric/transport/MessageEndpointServer.h @@ -56,6 +56,8 @@ class MessageEndpointServer virtual void stop(); + virtual void onThreadStop(); + void setRequestLatch(); void awaitRequestLatch(); diff --git a/include/faabric/transport/PointToPointBroker.h b/include/faabric/transport/PointToPointBroker.h index a74787a08..9e68b9439 100644 --- a/include/faabric/transport/PointToPointBroker.h +++ b/include/faabric/transport/PointToPointBroker.h @@ -34,6 +34,8 @@ class PointToPointBroker void clear(); + void resetThreadLocalCache(); + private: std::shared_mutex registryMutex; diff --git a/include/faabric/transport/PointToPointServer.h b/include/faabric/transport/PointToPointServer.h index 9024fa2d8..1f999f59c 100644 --- a/include/faabric/transport/PointToPointServer.h +++ b/include/faabric/transport/PointToPointServer.h @@ -19,5 +19,7 @@ class PointToPointServer final : public MessageEndpointServer std::unique_ptr doSyncRecv(int header, const uint8_t* buffer, size_t bufferSize) override; + + void onThreadStop() override; }; } diff --git a/src/transport/MessageEndpointServer.cpp b/src/transport/MessageEndpointServer.cpp index 3e340675f..a0796152e 100644 --- a/src/transport/MessageEndpointServer.cpp +++ b/src/transport/MessageEndpointServer.cpp @@ -157,6 +157,9 @@ void MessageEndpointServerHandler::start( } } + // Perform the tidy-up + server->onThreadStop(); + // Just before the thread dies, check if there's something // waiting on the shutdown latch if (server->shutdownLatch != nullptr) { @@ -286,6 +289,10 @@ void MessageEndpointServer::stop() syncHandler.join(); } +void MessageEndpointServer::onThreadStop() { + // Nothing to do by default +} + void MessageEndpointServer::setRequestLatch() { requestLatch = faabric::util::Latch::create(2); diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp index 61797002f..89050dd0a 100644 --- a/src/transport/PointToPointBroker.cpp +++ b/src/transport/PointToPointBroker.cpp @@ -1,13 +1,26 @@ -#include "faabric/util/config.h" #include #include #include #include +#include #include #include namespace faabric::transport { +// NOTE: 0MQ sockets must _never_ be shared between threads, and must be closed +// _before_ the context. As a result, storing them in TLS can cause problems +// unless we're very careful to close them manually. In this case it's worth it +// to cache the sockets as there may be high churn in this broker. See the +// tidy-up methods that reference these maps for more info. +thread_local std:: + unordered_map> + recvEndpoints; + +thread_local std:: + unordered_map> + sendEndpoints; + std::string getPointToPointKey(int appId, int sendIdx, int recvIdx) { return fmt::format("{}-{}-{}", appId, sendIdx, recvIdx); @@ -106,16 +119,22 @@ void PointToPointBroker::sendMessage(int appId, if (host == faabric::util::getSystemConfig().endpointHost) { std::string label = getPointToPointKey(appId, sendIdx, recvIdx); - // TODO - need to keep this endpoint in scope in the same thread until - // the message is consumed. - AsyncInternalSendMessageEndpoint endpoint(label); + // Note - this map is thread-local so no locking required + if (sendEndpoints.find(label) == sendEndpoints.end()) { + sendEndpoints[label] = + std::make_unique(label); + + SPDLOG_TRACE("Created new internal send endpoint {}", + sendEndpoints[label]->getAddress()); + } + SPDLOG_TRACE("Local point-to-point message {}:{}:{} to {}", appId, sendIdx, recvIdx, - endpoint.getAddress()); + sendEndpoints[label]->getAddress()); - endpoint.send(buffer, bufferSize); + sendEndpoints[label]->send(buffer, bufferSize); } else { PointToPointClient cli(host); @@ -140,9 +159,17 @@ std::vector PointToPointBroker::recvMessage(int appId, int recvIdx) { std::string label = getPointToPointKey(appId, sendIdx, recvIdx); - AsyncInternalRecvMessageEndpoint endpoint(label); - std::optional messageDataMaybe = endpoint.recv().value(); + // Note: this map is thread-local so no locking required + if (recvEndpoints.find(label) == recvEndpoints.end()) { + recvEndpoints[label] = + std::make_unique(label); + SPDLOG_TRACE("Created new internal recv endpoint {}", + recvEndpoints[label]->getAddress()); + } + + std::optional messageDataMaybe = + recvEndpoints[label]->recv().value(); Message messageData = messageDataMaybe.value(); // TODO - possible to avoid this copy? @@ -157,6 +184,16 @@ void PointToPointBroker::clear() mappings.clear(); } +void PointToPointBroker::resetThreadLocalCache() +{ + auto tid = (pid_t)syscall(SYS_gettid); + SPDLOG_TRACE("Resetting point-to-point thread-local cache for thread {}", + tid); + + sendEndpoints.clear(); + recvEndpoints.clear(); +} + PointToPointBroker& getPointToPointBroker() { static PointToPointBroker reg; diff --git a/src/transport/PointToPointServer.cpp b/src/transport/PointToPointServer.cpp index f2e804995..031023163 100644 --- a/src/transport/PointToPointServer.cpp +++ b/src/transport/PointToPointServer.cpp @@ -1,6 +1,6 @@ #include -#include #include +#include #include #include #include @@ -47,4 +47,9 @@ std::unique_ptr PointToPointServer::doSyncRecv( return std::make_unique(); } +void PointToPointServer::onThreadStop() { + // Clear any thread-local cached sockets + reg.resetThreadLocalCache(); +} + } diff --git a/tests/test/transport/test_point_to_point.cpp b/tests/test/transport/test_point_to_point.cpp index 950ad2f21..5c88f62df 100644 --- a/tests/test/transport/test_point_to_point.cpp +++ b/tests/test/transport/test_point_to_point.cpp @@ -63,7 +63,7 @@ TEST_CASE_METHOD(PointToPointFixture, } TEST_CASE_METHOD(PointToPointFixture, - "Test sending mappings via broker", + "Test sending point-to-point mappings via broker", "[transport][ptp]") { faabric::util::setMockMode(true); @@ -116,7 +116,7 @@ TEST_CASE_METHOD(PointToPointFixture, } TEST_CASE_METHOD(PointToPointFixture, - "Test sending mappings from client", + "Test sending point-to-point mappings from client", "[transport][ptp]") { int appIdA = 123; @@ -160,7 +160,7 @@ TEST_CASE_METHOD(PointToPointFixture, } TEST_CASE_METHOD(PointToPointFixture, - "Test sending point-to-point message" + "Test sending point-to-point message", "[transport][ptp]") { int appId = 123; @@ -187,6 +187,8 @@ TEST_CASE_METHOD(PointToPointFixture, std::thread t([appId, sendIdx, recvIdx, &receivedData] { PointToPointBroker& broker = getPointToPointBroker(); receivedData = broker.recvMessage(appId, sendIdx, recvIdx); + + broker.resetThreadLocalCache(); }); if (t.joinable()) { diff --git a/tests/utils/fixtures.h b/tests/utils/fixtures.h index 6f41b2abc..49bfd6015 100644 --- a/tests/utils/fixtures.h +++ b/tests/utils/fixtures.h @@ -282,6 +282,10 @@ class PointToPointFixture ~PointToPointFixture() { + // Note - here we reset the thread-local cache for the test thread. If + // other threads are used in the tests, they too must do this. + broker.resetThreadLocalCache(); + server.stop(); broker.clear(); faabric::util::setMockMode(false); From 8ec98c441845d7e6beaf81c2d5d280147e7cd536 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Thu, 7 Oct 2021 17:56:21 +0000 Subject: [PATCH 13/22] Naming and general tidy --- include/faabric/runner/FaabricMain.h | 2 +- .../faabric/transport/MessageEndpointServer.h | 2 +- .../faabric/transport/PointToPointBroker.h | 2 +- .../faabric/transport/PointToPointServer.h | 2 +- include/faabric/util/config.h | 2 +- src/runner/FaabricMain.cpp | 2 +- src/transport/MessageEndpointServer.cpp | 4 ++-- src/transport/PointToPointBroker.cpp | 21 ++++++++++--------- src/transport/PointToPointServer.cpp | 4 ++-- src/util/config.cpp | 4 ++-- tests/test/util/test_config.cpp | 6 +++--- 11 files changed, 26 insertions(+), 25 deletions(-) diff --git a/include/faabric/runner/FaabricMain.h b/include/faabric/runner/FaabricMain.h index c3ac23a84..6a5fa6f72 100644 --- a/include/faabric/runner/FaabricMain.h +++ b/include/faabric/runner/FaabricMain.h @@ -32,6 +32,6 @@ class FaabricMain faabric::state::StateServer stateServer; faabric::scheduler::FunctionCallServer functionServer; faabric::snapshot::SnapshotServer snapshotServer; - faabric::transport::PointToPointServer pointToPointBroker; + faabric::transport::PointToPointServer pointToPointServer; }; } diff --git a/include/faabric/transport/MessageEndpointServer.h b/include/faabric/transport/MessageEndpointServer.h index 950a21dca..c23414f51 100644 --- a/include/faabric/transport/MessageEndpointServer.h +++ b/include/faabric/transport/MessageEndpointServer.h @@ -56,7 +56,7 @@ class MessageEndpointServer virtual void stop(); - virtual void onThreadStop(); + virtual void onWorkerStop(); void setRequestLatch(); diff --git a/include/faabric/transport/PointToPointBroker.h b/include/faabric/transport/PointToPointBroker.h index 9e68b9439..164c094e3 100644 --- a/include/faabric/transport/PointToPointBroker.h +++ b/include/faabric/transport/PointToPointBroker.h @@ -37,7 +37,7 @@ class PointToPointBroker void resetThreadLocalCache(); private: - std::shared_mutex registryMutex; + std::shared_mutex brokerMutex; std::unordered_map> appIdxs; std::unordered_map mappings; diff --git a/include/faabric/transport/PointToPointServer.h b/include/faabric/transport/PointToPointServer.h index 1f999f59c..0ddac61a3 100644 --- a/include/faabric/transport/PointToPointServer.h +++ b/include/faabric/transport/PointToPointServer.h @@ -20,6 +20,6 @@ class PointToPointServer final : public MessageEndpointServer std::unique_ptr doSyncRecv(int header, const uint8_t* buffer, size_t bufferSize) override; - void onThreadStop() override; + void onWorkerStop() override; }; } diff --git a/include/faabric/util/config.h b/include/faabric/util/config.h index 37ecc2699..b424ebcf4 100644 --- a/include/faabric/util/config.h +++ b/include/faabric/util/config.h @@ -47,7 +47,7 @@ class SystemConfig int functionServerThreads; int stateServerThreads; int snapshotServerThreads; - int pointToPointBrokerThreads; + int pointToPointServerThreads; SystemConfig(); diff --git a/src/runner/FaabricMain.cpp b/src/runner/FaabricMain.cpp index 68cc74408..286f4c863 100644 --- a/src/runner/FaabricMain.cpp +++ b/src/runner/FaabricMain.cpp @@ -77,7 +77,7 @@ void FaabricMain::startSnapshotServer() void FaabricMain::startPointToPointServer() { SPDLOG_INFO("Starting point-to-point server"); - pointToPointBroker.start(); + pointToPointServer.start(); } void FaabricMain::startStateServer() diff --git a/src/transport/MessageEndpointServer.cpp b/src/transport/MessageEndpointServer.cpp index a0796152e..4c6330f23 100644 --- a/src/transport/MessageEndpointServer.cpp +++ b/src/transport/MessageEndpointServer.cpp @@ -158,7 +158,7 @@ void MessageEndpointServerHandler::start( } // Perform the tidy-up - server->onThreadStop(); + server->onWorkerStop(); // Just before the thread dies, check if there's something // waiting on the shutdown latch @@ -289,7 +289,7 @@ void MessageEndpointServer::stop() syncHandler.join(); } -void MessageEndpointServer::onThreadStop() { +void MessageEndpointServer::onWorkerStop() { // Nothing to do by default } diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp index 89050dd0a..8f80e16aa 100644 --- a/src/transport/PointToPointBroker.cpp +++ b/src/transport/PointToPointBroker.cpp @@ -8,11 +8,12 @@ namespace faabric::transport { -// NOTE: 0MQ sockets must _never_ be shared between threads, and must be closed -// _before_ the context. As a result, storing them in TLS can cause problems -// unless we're very careful to close them manually. In this case it's worth it -// to cache the sockets as there may be high churn in this broker. See the -// tidy-up methods that reference these maps for more info. +// NOTE: Keeping 0MQ socketsin TLS is usually a bad idea, as they _must_ be +// closed before the global context. However, in this case it's worth it +// to cache the sockets across messages, as otherwise we'd be creating and +// destroying a lot of them under high throughput. To ensure things are cleared +// up, see the thread-local tidy-up message on this class and its usage in the +// rest of the codebase. thread_local std:: unordered_map> recvEndpoints; @@ -37,7 +38,7 @@ PointToPointBroker::PointToPointBroker() std::string PointToPointBroker::getHostForReceiver(int appId, int recvIdx) { - faabric::util::SharedLock lock(registryMutex); + faabric::util::SharedLock lock(brokerMutex); std::string key = getPointToPointKey(appId, recvIdx); @@ -54,7 +55,7 @@ void PointToPointBroker::setHostForReceiver(int appId, int recvIdx, const std::string& host) { - faabric::util::FullLock lock(registryMutex); + faabric::util::FullLock lock(brokerMutex); SPDLOG_TRACE( "Setting point-to-point mapping {}:{} to {}", appId, recvIdx, host); @@ -82,7 +83,7 @@ void PointToPointBroker::broadcastMappings(int appId) void PointToPointBroker::sendMappings(int appId, const std::string& host) { - faabric::util::SharedLock lock(registryMutex); + faabric::util::SharedLock lock(brokerMutex); faabric::PointToPointMappings msg; @@ -104,7 +105,7 @@ void PointToPointBroker::sendMappings(int appId, const std::string& host) std::set PointToPointBroker::getIdxsRegisteredForApp(int appId) { - faabric::util::SharedLock lock(registryMutex); + faabric::util::SharedLock lock(brokerMutex); return appIdxs[appId]; } @@ -178,7 +179,7 @@ std::vector PointToPointBroker::recvMessage(int appId, void PointToPointBroker::clear() { - faabric::util::SharedLock lock(registryMutex); + faabric::util::SharedLock lock(brokerMutex); appIdxs.clear(); mappings.clear(); diff --git a/src/transport/PointToPointServer.cpp b/src/transport/PointToPointServer.cpp index 031023163..096adb757 100644 --- a/src/transport/PointToPointServer.cpp +++ b/src/transport/PointToPointServer.cpp @@ -14,7 +14,7 @@ PointToPointServer::PointToPointServer() POINT_TO_POINT_ASYNC_PORT, POINT_TO_POINT_SYNC_PORT, POINT_TO_POINT_INPROC_LABEL, - faabric::util::getSystemConfig().pointToPointBrokerThreads) + faabric::util::getSystemConfig().pointToPointServerThreads) , reg(getPointToPointBroker()) {} @@ -47,7 +47,7 @@ std::unique_ptr PointToPointServer::doSyncRecv( return std::make_unique(); } -void PointToPointServer::onThreadStop() { +void PointToPointServer::onWorkerStop() { // Clear any thread-local cached sockets reg.resetThreadLocalCache(); } diff --git a/src/util/config.cpp b/src/util/config.cpp index 8b00037b3..633291de3 100644 --- a/src/util/config.cpp +++ b/src/util/config.cpp @@ -65,8 +65,8 @@ void SystemConfig::initialise() this->getSystemConfIntParam("STATE_SERVER_THREADS", "2"); snapshotServerThreads = this->getSystemConfIntParam("SNAPSHOT_SERVER_THREADS", "2"); - pointToPointBrokerThreads = - this->getSystemConfIntParam("POINT_TO_POINT_BROKER_THREADS", "2"); + pointToPointServerThreads = + this->getSystemConfIntParam("POINT_TO_POINT_SERVER_THREADS", "2"); } int SystemConfig::getSystemConfIntParam(const char* name, diff --git a/tests/test/util/test_config.cpp b/tests/test/util/test_config.cpp index 306581826..383253057 100644 --- a/tests/test/util/test_config.cpp +++ b/tests/test/util/test_config.cpp @@ -52,7 +52,7 @@ TEST_CASE("Test overriding system config initialisation", "[util]") std::string stateThreads = setEnvVar("STATE_SERVER_THREADS", "222"); std::string snapshotThreads = setEnvVar("SNAPSHOT_SERVER_THREADS", "333"); std::string pointToPointThreads = - setEnvVar("POINT_TO_POINT_BROKER_THREADS", "444"); + setEnvVar("POINT_TO_POINT_SERVER_THREADS", "444"); std::string mpiSize = setEnvVar("DEFAULT_MPI_WORLD_SIZE", "2468"); std::string mpiPort = setEnvVar("MPI_BASE_PORT", "9999"); @@ -77,7 +77,7 @@ TEST_CASE("Test overriding system config initialisation", "[util]") REQUIRE(conf.functionServerThreads == 111); REQUIRE(conf.stateServerThreads == 222); REQUIRE(conf.snapshotServerThreads == 333); - REQUIRE(conf.pointToPointBrokerThreads == 444); + REQUIRE(conf.pointToPointServerThreads == 444); REQUIRE(conf.defaultMpiWorldSize == 2468); REQUIRE(conf.mpiBasePort == 9999); @@ -102,7 +102,7 @@ TEST_CASE("Test overriding system config initialisation", "[util]") setEnvVar("FUNCTION_SERVER_THREADS", functionThreads); setEnvVar("STATE_SERVER_THREADS", stateThreads); setEnvVar("SNAPSHOT_SERVER_THREADS", snapshotThreads); - setEnvVar("POINT_TO_POINT_BROKER_THREADS", pointToPointThreads); + setEnvVar("POINT_TO_POINT_SERVER_THREADS", pointToPointThreads); setEnvVar("DEFAULT_MPI_WORLD_SIZE", mpiSize); setEnvVar("MPI_BASE_PORT", mpiPort); From 564bfc6fe6cd10718a017b9869daa9b428a7ec0a Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Thu, 7 Oct 2021 17:56:44 +0000 Subject: [PATCH 14/22] Formatting --- include/faabric/transport/PointToPointServer.h | 2 +- src/transport/MessageEndpointServer.cpp | 3 ++- src/transport/PointToPointServer.cpp | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/faabric/transport/PointToPointServer.h b/include/faabric/transport/PointToPointServer.h index 0ddac61a3..68f38ea73 100644 --- a/include/faabric/transport/PointToPointServer.h +++ b/include/faabric/transport/PointToPointServer.h @@ -11,7 +11,7 @@ class PointToPointServer final : public MessageEndpointServer PointToPointServer(); private: - PointToPointBroker ® + PointToPointBroker& reg; void doAsyncRecv(int header, const uint8_t* buffer, diff --git a/src/transport/MessageEndpointServer.cpp b/src/transport/MessageEndpointServer.cpp index 4c6330f23..e340853a3 100644 --- a/src/transport/MessageEndpointServer.cpp +++ b/src/transport/MessageEndpointServer.cpp @@ -289,7 +289,8 @@ void MessageEndpointServer::stop() syncHandler.join(); } -void MessageEndpointServer::onWorkerStop() { +void MessageEndpointServer::onWorkerStop() +{ // Nothing to do by default } diff --git a/src/transport/PointToPointServer.cpp b/src/transport/PointToPointServer.cpp index 096adb757..07726e257 100644 --- a/src/transport/PointToPointServer.cpp +++ b/src/transport/PointToPointServer.cpp @@ -47,7 +47,8 @@ std::unique_ptr PointToPointServer::doSyncRecv( return std::make_unique(); } -void PointToPointServer::onWorkerStop() { +void PointToPointServer::onWorkerStop() +{ // Clear any thread-local cached sockets reg.resetThreadLocalCache(); } From 0f0ee6c6df3e8d403fd0cb5aae4b497bdbfba121 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Fri, 8 Oct 2021 10:49:33 +0000 Subject: [PATCH 15/22] Mock tests for broadcast mappings --- .../faabric/transport/PointToPointServer.h | 6 + include/faabric/transport/common.h | 2 +- src/transport/PointToPointBroker.cpp | 13 +- src/transport/PointToPointServer.cpp | 30 +++++ tests/test/transport/test_point_to_point.cpp | 112 +++++++++++++----- tests/utils/fixtures.h | 9 +- 6 files changed, 135 insertions(+), 37 deletions(-) diff --git a/include/faabric/transport/PointToPointServer.h b/include/faabric/transport/PointToPointServer.h index 68f38ea73..3a7d80df9 100644 --- a/include/faabric/transport/PointToPointServer.h +++ b/include/faabric/transport/PointToPointServer.h @@ -21,5 +21,11 @@ class PointToPointServer final : public MessageEndpointServer doSyncRecv(int header, const uint8_t* buffer, size_t bufferSize) override; void onWorkerStop() override; + + void doSendMessage(const uint8_t* buffer, size_t bufferSize); + + std::unique_ptr doRecvMappings( + const uint8_t* buffer, + size_t bufferSize); }; } diff --git a/include/faabric/transport/common.h b/include/faabric/transport/common.h index 1e5560eb1..5a330a831 100644 --- a/include/faabric/transport/common.h +++ b/include/faabric/transport/common.h @@ -18,4 +18,4 @@ #define POINT_TO_POINT_ASYNC_PORT 8009 #define POINT_TO_POINT_SYNC_PORT 8010 -#define POINT_TO_POINT_INPROC_LABEL "pointtopoint" +#define POINT_TO_POINT_INPROC_LABEL "ptp" diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp index 8f80e16aa..e0d8e02b3 100644 --- a/src/transport/PointToPointBroker.cpp +++ b/src/transport/PointToPointBroker.cpp @@ -8,7 +8,7 @@ namespace faabric::transport { -// NOTE: Keeping 0MQ socketsin TLS is usually a bad idea, as they _must_ be +// NOTE: Keeping 0MQ sockets in TLS is usually a bad idea, as they _must_ be // closed before the global context. However, in this case it's worth it // to cache the sockets across messages, as otherwise we'd be creating and // destroying a lot of them under high throughput. To ensure things are cleared @@ -76,7 +76,14 @@ void PointToPointBroker::broadcastMappings(int appId) // set of registered hosts? std::set hosts = sch.getAvailableHosts(); + faabric::util::SystemConfig& conf = faabric::util::getSystemConfig(); + for (const auto& host : hosts) { + // Skip this host + if (host == conf.endpointHost) { + continue; + } + sendMappings(appId, host); } } @@ -99,6 +106,10 @@ void PointToPointBroker::sendMappings(int appId, const std::string& host) mapping->set_host(host); } + SPDLOG_DEBUG("Sending {} point-to-point mappings for {} to {}", + indexes.size(), + appId, + host); PointToPointClient cli(host); cli.sendMappings(msg); } diff --git a/src/transport/PointToPointServer.cpp b/src/transport/PointToPointServer.cpp index 07726e257..790df0584 100644 --- a/src/transport/PointToPointServer.cpp +++ b/src/transport/PointToPointServer.cpp @@ -1,3 +1,4 @@ +#include "faabric/transport/PointToPointCall.h" #include #include #include @@ -21,6 +22,20 @@ PointToPointServer::PointToPointServer() void PointToPointServer::doAsyncRecv(int header, const uint8_t* buffer, size_t bufferSize) +{ + switch (header) { + case (faabric::transport::PointToPointCall::MESSAGE): { + doSendMessage(buffer, bufferSize); + break; + } + default: { + SPDLOG_ERROR("Invalid aync point-to-point header: {}", header); + throw std::runtime_error("Invalid async point-to-point message"); + } + } +} + +void PointToPointServer::doSendMessage(const uint8_t* buffer, size_t bufferSize) { PARSE_MSG(faabric::PointToPointMessage, buffer, bufferSize) @@ -35,6 +50,21 @@ std::unique_ptr PointToPointServer::doSyncRecv( int header, const uint8_t* buffer, size_t bufferSize) +{ + switch (header) { + case (faabric::transport::PointToPointCall::MAPPING): { + return doRecvMappings(buffer, bufferSize); + } + default: { + SPDLOG_ERROR("Invalid sync point-to-point header: {}", header); + throw std::runtime_error("Invalid sync point-to-point message"); + } + } +} + +std::unique_ptr PointToPointServer::doRecvMappings( + const uint8_t* buffer, + size_t bufferSize) { PARSE_MSG(faabric::PointToPointMappings, buffer, bufferSize) diff --git a/tests/test/transport/test_point_to_point.cpp b/tests/test/transport/test_point_to_point.cpp index 5c88f62df..7e57209e4 100644 --- a/tests/test/transport/test_point_to_point.cpp +++ b/tests/test/transport/test_point_to_point.cpp @@ -1,5 +1,7 @@ #include +#include "faabric/proto/faabric.pb.h" +#include "faabric/scheduler/Scheduler.h" #include "faabric_utils.h" #include @@ -13,7 +15,16 @@ using namespace faabric::util; namespace tests { -TEST_CASE_METHOD(PointToPointFixture, +class PointToPointSchedulerFixture + : public PointToPointTestFixture + , SchedulerTestFixture +{ + public: + PointToPointSchedulerFixture() {} + ~PointToPointSchedulerFixture() {} +}; + +TEST_CASE_METHOD(PointToPointSchedulerFixture, "Test set and get point-to-point hosts", "[transport][ptp]") { @@ -62,7 +73,7 @@ TEST_CASE_METHOD(PointToPointFixture, REQUIRE(broker.getHostForReceiver(appIdB, idxB2) == hostC); } -TEST_CASE_METHOD(PointToPointFixture, +TEST_CASE_METHOD(PointToPointSchedulerFixture, "Test sending point-to-point mappings via broker", "[transport][ptp]") { @@ -77,45 +88,82 @@ TEST_CASE_METHOD(PointToPointFixture, std::string hostA = "host-a"; std::string hostB = "host-b"; + std::string hostC = "host-c"; - broker.setHostForReceiver(appIdA, idxA1, hostA); - broker.setHostForReceiver(appIdA, idxA2, hostB); - broker.setHostForReceiver(appIdB, idxB1, hostB); - - broker.sendMappings(appIdA, "other-host"); + faabric::scheduler::Scheduler& sch = faabric::scheduler::getScheduler(); + sch.reset(); - auto actualSent = getSentMappings(); - REQUIRE(actualSent.size() == 1); + sch.addHostToGlobalSet(hostA); + sch.addHostToGlobalSet(hostB); + sch.addHostToGlobalSet(hostC); - faabric::PointToPointMappings actualMappings = actualSent.at(0).second; - REQUIRE(actualMappings.mappings().size() == 2); + // Includes this host + REQUIRE(sch.getAvailableHosts().size() == 4); - faabric::PointToPointMappings::PointToPointMapping mappingA = - actualMappings.mappings().at(0); - faabric::PointToPointMappings::PointToPointMapping mappingB = - actualMappings.mappings().at(1); + broker.setHostForReceiver(appIdA, idxA1, hostA); + broker.setHostForReceiver(appIdA, idxA2, hostB); + broker.setHostForReceiver(appIdB, idxB1, hostB); - REQUIRE(mappingA.appid() == appIdA); - REQUIRE(mappingB.appid() == appIdA); + std::vector expectedHosts; + SECTION("Send single host") + { + broker.sendMappings(appIdA, hostC); + expectedHosts = { hostC }; + } - // Note - we don't know the order the mappings are sent in so we have to - // check both possibilities - if (mappingA.recvidx() == idxA1) { - REQUIRE(mappingA.host() == hostA); + SECTION("Broadcast all hosts") + { + broker.broadcastMappings(appIdA); - REQUIRE(mappingB.recvidx() == idxA2); - REQUIRE(mappingB.host() == hostB); - } else if (mappingA.recvidx() == idxA2) { - REQUIRE(mappingA.host() == hostB); + // Don't expect to be broadcast to this host + expectedHosts = { hostA, hostB, hostC }; + } - REQUIRE(mappingB.recvidx() == idxA1); - REQUIRE(mappingB.host() == hostA); - } else { - FAIL(); + auto actualSent = getSentMappings(); + REQUIRE(actualSent.size() == expectedHosts.size()); + + // Sort the sent mappings based on host + std::sort(actualSent.begin(), + actualSent.end(), + [](const std::pair& a, + const std::pair& b) + -> bool { return a.first < b.first; }); + + // Check each of the sent mappings is as we would expect + for (int i = 0; i < expectedHosts.size(); i++) { + REQUIRE(actualSent.at(i).first == expectedHosts.at(i)); + + faabric::PointToPointMappings actualMappings = actualSent.at(i).second; + REQUIRE(actualMappings.mappings().size() == 2); + + faabric::PointToPointMappings::PointToPointMapping mappingA = + actualMappings.mappings().at(0); + faabric::PointToPointMappings::PointToPointMapping mappingB = + actualMappings.mappings().at(1); + + REQUIRE(mappingA.appid() == appIdA); + REQUIRE(mappingB.appid() == appIdA); + + // Note - we don't know the order of the mappings and can't easily sort + // the data in the protobuf object, so it's easiest just to check both + // possible orderings. + if (mappingA.recvidx() == idxA1) { + REQUIRE(mappingA.host() == hostA); + + REQUIRE(mappingB.recvidx() == idxA2); + REQUIRE(mappingB.host() == hostB); + } else if (mappingA.recvidx() == idxA2) { + REQUIRE(mappingA.host() == hostB); + + REQUIRE(mappingB.recvidx() == idxA1); + REQUIRE(mappingB.host() == hostA); + } else { + FAIL(); + } } } -TEST_CASE_METHOD(PointToPointFixture, +TEST_CASE_METHOD(PointToPointSchedulerFixture, "Test sending point-to-point mappings from client", "[transport][ptp]") { @@ -159,8 +207,8 @@ TEST_CASE_METHOD(PointToPointFixture, REQUIRE(broker.getHostForReceiver(appIdB, idxB1) == hostA); } -TEST_CASE_METHOD(PointToPointFixture, - "Test sending point-to-point message", +TEST_CASE_METHOD(PointToPointSchedulerFixture, + "Test send and receive point-to-point messages", "[transport][ptp]") { int appId = 123; diff --git a/tests/utils/fixtures.h b/tests/utils/fixtures.h index 49bfd6015..b748747dc 100644 --- a/tests/utils/fixtures.h +++ b/tests/utils/fixtures.h @@ -269,23 +269,26 @@ class RemoteMpiTestFixture : public MpiBaseTestFixture faabric::scheduler::MpiWorld otherWorld; }; -class PointToPointFixture +class PointToPointTestFixture { public: - PointToPointFixture() + PointToPointTestFixture() : broker(faabric::transport::getPointToPointBroker()) , cli(LOCALHOST) { + faabric::util::setMockMode(false); broker.clear(); server.start(); } - ~PointToPointFixture() + ~PointToPointTestFixture() { // Note - here we reset the thread-local cache for the test thread. If // other threads are used in the tests, they too must do this. broker.resetThreadLocalCache(); + faabric::transport::clearSentMessages(); + server.stop(); broker.clear(); faabric::util::setMockMode(false); From 01cbaedc34f4a8ec00d18ad7b7c8ca5110cebd41 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Fri, 8 Oct 2021 10:58:25 +0000 Subject: [PATCH 16/22] Test for send back and forth --- tests/test/transport/test_point_to_point.cpp | 34 +++++++++++++------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/tests/test/transport/test_point_to_point.cpp b/tests/test/transport/test_point_to_point.cpp index 7e57209e4..1f9417aff 100644 --- a/tests/test/transport/test_point_to_point.cpp +++ b/tests/test/transport/test_point_to_point.cpp @@ -212,38 +212,50 @@ TEST_CASE_METHOD(PointToPointSchedulerFixture, "[transport][ptp]") { int appId = 123; - int sendIdx = 5; - int recvIdx = 10; + int idxA = 5; + int idxB = 10; // Ensure this host is set to localhost faabric::util::SystemConfig& conf = faabric::util::getSystemConfig(); conf.endpointHost = LOCALHOST; - // brokerister the recv index on this host - broker.setHostForReceiver(appId, recvIdx, LOCALHOST); + // Register both indexes on this host + broker.setHostForReceiver(appId, idxA, LOCALHOST); + broker.setHostForReceiver(appId, idxB, LOCALHOST); - std::vector sentData = { 0, 1, 2, 3 }; - std::vector receivedData; + std::vector sentDataA = { 0, 1, 2, 3 }; + std::vector receivedDataA; + std::vector sentDataB = { 3, 4, 5 }; + std::vector receivedDataB; // Make sure we send the message before a receiver is available to check // async handling - broker.sendMessage( - appId, sendIdx, recvIdx, sentData.data(), sentData.size()); + broker.sendMessage(appId, idxA, idxB, sentDataA.data(), sentDataA.size()); SLEEP_MS(1000); - std::thread t([appId, sendIdx, recvIdx, &receivedData] { + std::thread t([appId, idxA, idxB, &receivedDataA, &sentDataB] { PointToPointBroker& broker = getPointToPointBroker(); - receivedData = broker.recvMessage(appId, sendIdx, recvIdx); + + // Receive the first message + receivedDataA = broker.recvMessage(appId, idxA, idxB); + + // Send a message back (note reversing the indexes) + broker.sendMessage( + appId, idxB, idxA, sentDataB.data(), sentDataB.size()); broker.resetThreadLocalCache(); }); + // Receive the message sent back + receivedDataB = broker.recvMessage(appId, idxB, idxA); + if (t.joinable()) { t.join(); } - REQUIRE(receivedData == sentData); + REQUIRE(receivedDataA == sentDataA); + REQUIRE(receivedDataB == sentDataB); conf.reset(); } From 6a68e2f703b8f48c5ac38e95b6b9df721712bf20 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Fri, 8 Oct 2021 11:58:36 +0000 Subject: [PATCH 17/22] Stop ptp server --- src/runner/FaabricMain.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/runner/FaabricMain.cpp b/src/runner/FaabricMain.cpp index 286f4c863..43956e916 100644 --- a/src/runner/FaabricMain.cpp +++ b/src/runner/FaabricMain.cpp @@ -108,6 +108,9 @@ void FaabricMain::shutdown() SPDLOG_INFO("Waiting for the snapshot server to finish"); snapshotServer.stop(); + SPDLOG_INFO("Waiting for the point-to-point server to finish"); + pointToPointServer.stop(); + auto& sch = faabric::scheduler::getScheduler(); sch.shutdown(); From a74d6f78e90f83f56c03ca2ccde5c6d82a9e1b7f Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Fri, 8 Oct 2021 13:13:06 +0000 Subject: [PATCH 18/22] Add failing dist test --- include/faabric/util/bytes.h | 2 + src/util/bytes.cpp | 18 +++++++ tests/dist/CMakeLists.txt | 1 + tests/dist/init.cpp | 1 + tests/dist/init.h | 1 + tests/dist/transport/functions.cpp | 77 ++++++++++++++++++++++++++++ tests/dist/transport/test_funcs.cpp | 78 +++++++++++++++++++++++++++++ tests/test/util/test_bytes.cpp | 21 ++++++++ 8 files changed, 199 insertions(+) create mode 100644 tests/dist/transport/functions.cpp create mode 100644 tests/dist/transport/test_funcs.cpp diff --git a/include/faabric/util/bytes.h b/include/faabric/util/bytes.h index 609952cad..676bf68ed 100644 --- a/include/faabric/util/bytes.h +++ b/include/faabric/util/bytes.h @@ -12,6 +12,8 @@ std::vector stringToBytes(const std::string& str); std::string bytesToString(const std::vector& bytes); +std::string formatByteArrayToIntString(const std::vector&bytes); + void trimTrailingZeros(std::vector& vectorIn); int safeCopyToBuffer(const std::vector& dataIn, diff --git a/src/util/bytes.cpp b/src/util/bytes.cpp index 74d7d73ff..43666b242 100644 --- a/src/util/bytes.cpp +++ b/src/util/bytes.cpp @@ -1,5 +1,6 @@ #include +#include #include namespace faabric::util { @@ -72,4 +73,21 @@ std::string bytesToString(const std::vector& bytes) return result; } + +std::string formatByteArrayToIntString(const std::vector& bytes) +{ + std::stringstream ss; + + ss << "["; + for (int i = 0; i < bytes.size(); i++) { + ss << (int)bytes.at(i); + + if (i < bytes.size() - 1) { + ss << ", "; + } + } + ss << "]"; + + return ss.str(); +} } diff --git a/tests/dist/CMakeLists.txt b/tests/dist/CMakeLists.txt index d55b65443..d24e4b6b9 100644 --- a/tests/dist/CMakeLists.txt +++ b/tests/dist/CMakeLists.txt @@ -12,6 +12,7 @@ add_library(faabric_dist_tests_lib DistTestExecutor.h DistTestExecutor.cpp scheduler/functions.cpp + transport/functions.cpp ) target_link_libraries(faabric_dist_tests_lib faabric_test_utils) diff --git a/tests/dist/init.cpp b/tests/dist/init.cpp index 0a04a55d4..caffcb260 100644 --- a/tests/dist/init.cpp +++ b/tests/dist/init.cpp @@ -9,5 +9,6 @@ void initDistTests() SPDLOG_INFO("Registering distributed test server functions"); tests::registerSchedulerTestFunctions(); + tests::registerTransportTestFunctions(); } } diff --git a/tests/dist/init.h b/tests/dist/init.h index 0a7455916..54018f219 100644 --- a/tests/dist/init.h +++ b/tests/dist/init.h @@ -6,5 +6,6 @@ void initDistTests(); // Specific test functions void registerSchedulerTestFunctions(); +void registerTransportTestFunctions(); } diff --git a/tests/dist/transport/functions.cpp b/tests/dist/transport/functions.cpp new file mode 100644 index 000000000..f94a80a9b --- /dev/null +++ b/tests/dist/transport/functions.cpp @@ -0,0 +1,77 @@ +#include "faabric_utils.h" +#include + +#include "DistTestExecutor.h" +#include "init.h" + +#include +#include +#include + +using namespace faabric::util; + +namespace tests { + +int handlePointToPointFunction( + faabric::scheduler::Executor* exec, + int threadPoolIdx, + int msgIdx, + std::shared_ptr req) +{ + faabric::Message& msg = req->mutable_messages()->at(msgIdx); + + uint8_t appIdx = (uint8_t)msg.appindex(); + + faabric::transport::PointToPointBroker& broker = + faabric::transport::getPointToPointBroker(); + + // Start by receiving a kick-off message from the master (to make sure the + // mappings have been broadcasted) + std::vector kickOffData = + broker.recvMessage(msg.appid(), 0, appIdx); + + // Check data received + std::vector expectedKickOffData = { 0, 1, 2 }; + if (kickOffData != expectedKickOffData) { + SPDLOG_ERROR("Point-to-point kick-off not as expected {} != {}", + formatByteArrayToIntString(kickOffData), + formatByteArrayToIntString(expectedKickOffData)); + return 1; + } + + // Send to next index in ring and recv from previous in ring. + uint8_t minIdx = 1; + uint8_t maxIdx = 3; + uint8_t sendToIdx = appIdx < maxIdx ? appIdx + 1 : minIdx; + uint8_t recvFromIdx = appIdx > minIdx ? appIdx - 1 : maxIdx; + + // Send a series of our own index, expect to receive the same from other + // senders + std::vector sendData(10, appIdx); + std::vector expectedRecvData(10, recvFromIdx); + + // Do the sending + broker.sendMessage( + msg.appid(), appIdx, sendToIdx, sendData.data(), sendData.size()); + + // Do the receiving + std::vector actualRecvData = + broker.recvMessage(msg.appid(), recvFromIdx, appIdx); + + // Check data is as expected + if (actualRecvData != expectedRecvData) { + SPDLOG_ERROR("Point-to-point recv data not as expected {} != {}", + formatByteArrayToIntString(actualRecvData), + formatByteArrayToIntString(expectedRecvData)); + return 1; + } + + return 0; +} + +void registerTransportTestFunctions() +{ + registerDistTestExecutorCallback( + "ptp", "simple", handlePointToPointFunction); +} +} diff --git a/tests/dist/transport/test_funcs.cpp b/tests/dist/transport/test_funcs.cpp new file mode 100644 index 000000000..778221a2d --- /dev/null +++ b/tests/dist/transport/test_funcs.cpp @@ -0,0 +1,78 @@ +#include "faabric_utils.h" +#include + +#include "fixtures.h" +#include "init.h" + +#include +#include +#include +#include +#include + +namespace tests { + +TEST_CASE_METHOD(DistTestsFixture, + "Test point-to-point messaging on multiple hosts", + "[ptp]") +{ + // Set up this host's resources + // Make sure some functions execute remotely, some locally + int nLocalSlots = 1; + int nFuncs = 3; + + faabric::HostResources res; + res.set_slots(nLocalSlots); + sch.setThisHostResources(res); + + // Set up batch request + std::shared_ptr req = + faabric::util::batchExecFactory("ptp", "simple", nFuncs); + + // Double check app id + int appId = req->messages().at(0).appid(); + REQUIRE(appId > 0); + + faabric::transport::PointToPointBroker& broker = + faabric::transport::getPointToPointBroker(); + + std::vector expectedHosts = { getMasterIP(), + getWorkerIP(), + getWorkerIP() }; + + // Set up individual messages + // Note that this thread is acting as app index 0 + for (int i = 0; i < nFuncs; i++) { + faabric::Message& msg = req->mutable_messages()->at(i); + + msg.set_appindex(i + 1); + + // Register function locations to where we assume they'll be executed + // (we'll confirm this is the case after scheduling) + broker.setHostForReceiver( + msg.appid(), msg.appindex(), expectedHosts.at(i)); + } + + // Call the functions + std::vector actualHosts = sch.callFunctions(req); + REQUIRE(actualHosts == expectedHosts); + + // Broadcast mappings to other hosts + broker.broadcastMappings(appId); + + // Send kick-off message to all functions + std::vector kickOffData = { 0, 1, 2 }; + for (int i = 0; i < nFuncs; i++) { + broker.sendMessage( + appId, 0, i + 1, kickOffData.data(), kickOffData.size()); + } + + // Check other functions executed successfully + for (int i = 0; i < nFuncs; i++) { + faabric::Message& m = req->mutable_messages()->at(i); + + sch.getFunctionResult(m.id(), 2000); + REQUIRE(m.returnvalue() == 0); + } +} +} diff --git a/tests/test/util/test_bytes.cpp b/tests/test/util/test_bytes.cpp index a10659d7e..f9573a572 100644 --- a/tests/test/util/test_bytes.cpp +++ b/tests/test/util/test_bytes.cpp @@ -137,4 +137,25 @@ TEST_CASE("Test integer encoding to/from bytes", "[util]") REQUIRE_THROWS_AS(readBytesOf(buffer, offset, &r1), std::range_error); } +TEST_CASE("Test format byte array to string", "[util]") +{ + std::vector bytesIn; + std::string expectedString; + + SECTION("Empty") { expectedString = "[]"; } + + SECTION("Non-empty") + { + bytesIn = { 0, 1, 2, 3, 4, 5, 6, 7 }; + expectedString = "[0, 1, 2, 3, 4, 5, 6, 7]"; + } + + SECTION("Larger int values") + { + bytesIn = { 23, 9, 100 }; + expectedString = "[23, 9, 100]"; + } + + REQUIRE(formatByteArrayToIntString(bytesIn) == expectedString); +} } From f3e86ccc9dfcaa602229beee25f9bbf42575e1c4 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Fri, 8 Oct 2021 16:24:01 +0000 Subject: [PATCH 19/22] Fix up point-to-point dist test --- src/util/func.cpp | 4 ++++ tests/dist/fixtures.h | 14 +++++++++-- ...test_funcs.cpp => test_point_to_point.cpp} | 4 ++++ tests/test/transport/test_point_to_point.cpp | 23 +++++++++++++------ tests/utils/fixtures.h | 5 ---- 5 files changed, 36 insertions(+), 14 deletions(-) rename tests/dist/transport/{test_funcs.cpp => test_point_to_point.cpp} (92%) diff --git a/src/util/func.cpp b/src/util/func.cpp index 42deb4206..a5df10e2b 100644 --- a/src/util/func.cpp +++ b/src/util/func.cpp @@ -65,8 +65,12 @@ std::shared_ptr batchExecFactory( { auto req = batchExecFactory(); + uint32_t appId = faabric::util::generateGid(); + + // TODO - make sure the app ID is the same! for (int i = 0; i < count; i++) { *req->add_messages() = messageFactory(user, function); + req->mutable_messages()->at(i).set_appid(appId); } return req; diff --git a/tests/dist/fixtures.h b/tests/dist/fixtures.h index 0ae349aa1..5bbb62c5c 100644 --- a/tests/dist/fixtures.h +++ b/tests/dist/fixtures.h @@ -13,12 +13,14 @@ class DistTestsFixture : public SchedulerTestFixture , public ConfTestFixture , public SnapshotTestFixture + , public PointToPointTestFixture { public: DistTestsFixture() { - // Get other hosts - std::string thisHost = conf.endpointHost; + // Make sure the host list is up to date + sch.addHostToGlobalSet(getMasterIP()); + sch.addHostToGlobalSet(getWorkerIP()); // Set up executor std::shared_ptr fac = @@ -26,6 +28,14 @@ class DistTestsFixture faabric::scheduler::setExecutorFactory(fac); } + ~DistTestsFixture() + { + // Clear thread-local cache for this main thread + faabric::transport::PointToPointBroker& broker = + faabric::transport::getPointToPointBroker(); + broker.resetThreadLocalCache(); + } + std::string getWorkerIP() { if (workerIP.empty()) { diff --git a/tests/dist/transport/test_funcs.cpp b/tests/dist/transport/test_point_to_point.cpp similarity index 92% rename from tests/dist/transport/test_funcs.cpp rename to tests/dist/transport/test_point_to_point.cpp index 778221a2d..495e78dac 100644 --- a/tests/dist/transport/test_funcs.cpp +++ b/tests/dist/transport/test_point_to_point.cpp @@ -16,6 +16,10 @@ TEST_CASE_METHOD(DistTestsFixture, "Test point-to-point messaging on multiple hosts", "[ptp]") { + std::set actualAvailable = sch.getAvailableHosts(); + std::set expectedAvailable = { getMasterIP(), getWorkerIP() }; + REQUIRE(actualAvailable == expectedAvailable); + // Set up this host's resources // Make sure some functions execute remotely, some locally int nLocalSlots = 1; diff --git a/tests/test/transport/test_point_to_point.cpp b/tests/test/transport/test_point_to_point.cpp index 1f9417aff..d750f0806 100644 --- a/tests/test/transport/test_point_to_point.cpp +++ b/tests/test/transport/test_point_to_point.cpp @@ -15,16 +15,25 @@ using namespace faabric::util; namespace tests { -class PointToPointSchedulerFixture +class PointToPointClientServerFixture : public PointToPointTestFixture , SchedulerTestFixture { public: - PointToPointSchedulerFixture() {} - ~PointToPointSchedulerFixture() {} + PointToPointClientServerFixture() + : cli(LOCALHOST) + { + server.start(); + } + + ~PointToPointClientServerFixture() { server.stop(); } + + protected: + faabric::transport::PointToPointClient cli; + faabric::transport::PointToPointServer server; }; -TEST_CASE_METHOD(PointToPointSchedulerFixture, +TEST_CASE_METHOD(PointToPointClientServerFixture, "Test set and get point-to-point hosts", "[transport][ptp]") { @@ -73,7 +82,7 @@ TEST_CASE_METHOD(PointToPointSchedulerFixture, REQUIRE(broker.getHostForReceiver(appIdB, idxB2) == hostC); } -TEST_CASE_METHOD(PointToPointSchedulerFixture, +TEST_CASE_METHOD(PointToPointClientServerFixture, "Test sending point-to-point mappings via broker", "[transport][ptp]") { @@ -163,7 +172,7 @@ TEST_CASE_METHOD(PointToPointSchedulerFixture, } } -TEST_CASE_METHOD(PointToPointSchedulerFixture, +TEST_CASE_METHOD(PointToPointClientServerFixture, "Test sending point-to-point mappings from client", "[transport][ptp]") { @@ -207,7 +216,7 @@ TEST_CASE_METHOD(PointToPointSchedulerFixture, REQUIRE(broker.getHostForReceiver(appIdB, idxB1) == hostA); } -TEST_CASE_METHOD(PointToPointSchedulerFixture, +TEST_CASE_METHOD(PointToPointClientServerFixture, "Test send and receive point-to-point messages", "[transport][ptp]") { diff --git a/tests/utils/fixtures.h b/tests/utils/fixtures.h index b748747dc..1df91f33e 100644 --- a/tests/utils/fixtures.h +++ b/tests/utils/fixtures.h @@ -274,11 +274,9 @@ class PointToPointTestFixture public: PointToPointTestFixture() : broker(faabric::transport::getPointToPointBroker()) - , cli(LOCALHOST) { faabric::util::setMockMode(false); broker.clear(); - server.start(); } ~PointToPointTestFixture() @@ -289,14 +287,11 @@ class PointToPointTestFixture faabric::transport::clearSentMessages(); - server.stop(); broker.clear(); faabric::util::setMockMode(false); } protected: faabric::transport::PointToPointBroker& broker; - faabric::transport::PointToPointServer server; - faabric::transport::PointToPointClient cli; }; } From fb5aaecfa83e1550909ad6d851021adf4bf88c36 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Fri, 8 Oct 2021 16:24:16 +0000 Subject: [PATCH 20/22] Formatting --- include/faabric/util/bytes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/faabric/util/bytes.h b/include/faabric/util/bytes.h index 676bf68ed..ef47afeeb 100644 --- a/include/faabric/util/bytes.h +++ b/include/faabric/util/bytes.h @@ -12,7 +12,7 @@ std::vector stringToBytes(const std::string& str); std::string bytesToString(const std::vector& bytes); -std::string formatByteArrayToIntString(const std::vector&bytes); +std::string formatByteArrayToIntString(const std::vector& bytes); void trimTrailingZeros(std::vector& vectorIn); From cf6dabe61b961cd16cf37c3a920f8b60866705e9 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Fri, 8 Oct 2021 16:45:50 +0000 Subject: [PATCH 21/22] Tidy-up --- src/transport/PointToPointServer.cpp | 2 +- src/util/func.cpp | 3 +-- tests/dist/fixtures.h | 8 +------- tests/test/transport/test_point_to_point.cpp | 6 +++--- tests/test/util/test_func.cpp | 21 ++++++++++++++++++++ tests/utils/fixtures.h | 1 - 6 files changed, 27 insertions(+), 14 deletions(-) diff --git a/src/transport/PointToPointServer.cpp b/src/transport/PointToPointServer.cpp index 790df0584..d9a16efbf 100644 --- a/src/transport/PointToPointServer.cpp +++ b/src/transport/PointToPointServer.cpp @@ -1,6 +1,6 @@ -#include "faabric/transport/PointToPointCall.h" #include #include +#include #include #include #include diff --git a/src/util/func.cpp b/src/util/func.cpp index a5df10e2b..0bef897d7 100644 --- a/src/util/func.cpp +++ b/src/util/func.cpp @@ -65,9 +65,8 @@ std::shared_ptr batchExecFactory( { auto req = batchExecFactory(); + // Force the messages to have the same app ID uint32_t appId = faabric::util::generateGid(); - - // TODO - make sure the app ID is the same! for (int i = 0; i < count; i++) { *req->add_messages() = messageFactory(user, function); req->mutable_messages()->at(i).set_appid(appId); diff --git a/tests/dist/fixtures.h b/tests/dist/fixtures.h index 5bbb62c5c..f34abbb4f 100644 --- a/tests/dist/fixtures.h +++ b/tests/dist/fixtures.h @@ -28,13 +28,7 @@ class DistTestsFixture faabric::scheduler::setExecutorFactory(fac); } - ~DistTestsFixture() - { - // Clear thread-local cache for this main thread - faabric::transport::PointToPointBroker& broker = - faabric::transport::getPointToPointBroker(); - broker.resetThreadLocalCache(); - } + ~DistTestsFixture() {} std::string getWorkerIP() { diff --git a/tests/test/transport/test_point_to_point.cpp b/tests/test/transport/test_point_to_point.cpp index d750f0806..a4ce27d3b 100644 --- a/tests/test/transport/test_point_to_point.cpp +++ b/tests/test/transport/test_point_to_point.cpp @@ -1,12 +1,12 @@ #include -#include "faabric/proto/faabric.pb.h" -#include "faabric/scheduler/Scheduler.h" #include "faabric_utils.h" #include -#include +#include +#include +#include #include #include diff --git a/tests/test/util/test_func.cpp b/tests/test/util/test_func.cpp index 29bacc02b..2d381a820 100644 --- a/tests/test/util/test_func.cpp +++ b/tests/test/util/test_func.cpp @@ -30,6 +30,27 @@ TEST_CASE("Test message factory shared", "[util]") REQUIRE(!msg->resultkey().empty()); } +TEST_CASE("Test batch exec factory", "[util]") +{ + int nMessages = 4; + std::shared_ptr req = + faabric::util::batchExecFactory("demo", "echo", nMessages); + + REQUIRE(req->messages().size() == nMessages); + + REQUIRE(req->id() > 0); + + // Expect all messages to have the same app ID by default + int appId = req->messages().at(0).appid(); + REQUIRE(appId > 0); + + for (const auto& m : req->messages()) { + REQUIRE(m.appid() == appId); + REQUIRE(m.user() == "demo"); + REQUIRE(m.function() == "echo"); + } +} + TEST_CASE("Test adding ids to message", "[util]") { faabric::Message msgA; diff --git a/tests/utils/fixtures.h b/tests/utils/fixtures.h index 1df91f33e..65e490cf8 100644 --- a/tests/utils/fixtures.h +++ b/tests/utils/fixtures.h @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include From 8697736b6e119bb1d8a2ed38a754eaf5749c9873 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Wed, 13 Oct 2021 16:40:16 +0000 Subject: [PATCH 22/22] Review comments --- .../faabric/transport/PointToPointBroker.h | 3 ++ .../faabric/transport/PointToPointServer.h | 2 -- src/transport/CMakeLists.txt | 4 +-- src/transport/PointToPointBroker.cpp | 31 ++++++++++++++----- src/transport/PointToPointServer.cpp | 22 +++++-------- 5 files changed, 37 insertions(+), 25 deletions(-) diff --git a/include/faabric/transport/PointToPointBroker.h b/include/faabric/transport/PointToPointBroker.h index 164c094e3..19717867e 100644 --- a/include/faabric/transport/PointToPointBroker.h +++ b/include/faabric/transport/PointToPointBroker.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -42,6 +43,8 @@ class PointToPointBroker std::unordered_map> appIdxs; std::unordered_map mappings; + std::shared_ptr getClient(const std::string& host); + faabric::scheduler::Scheduler& sch; }; diff --git a/include/faabric/transport/PointToPointServer.h b/include/faabric/transport/PointToPointServer.h index 3a7d80df9..2c961b908 100644 --- a/include/faabric/transport/PointToPointServer.h +++ b/include/faabric/transport/PointToPointServer.h @@ -22,8 +22,6 @@ class PointToPointServer final : public MessageEndpointServer void onWorkerStop() override; - void doSendMessage(const uint8_t* buffer, size_t bufferSize); - std::unique_ptr doRecvMappings( const uint8_t* buffer, size_t bufferSize); diff --git a/src/transport/CMakeLists.txt b/src/transport/CMakeLists.txt index b7d12de35..a9ffb8684 100644 --- a/src/transport/CMakeLists.txt +++ b/src/transport/CMakeLists.txt @@ -11,9 +11,9 @@ set(HEADERS "${FAABRIC_INCLUDE_DIR}/faabric/transport/MessageEndpointClient.h" "${FAABRIC_INCLUDE_DIR}/faabric/transport/MessageEndpointServer.h" "${FAABRIC_INCLUDE_DIR}/faabric/transport/MpiMessageEndpoint.h" + "${FAABRIC_INCLUDE_DIR}/faabric/transport/PointToPointBroker.h" "${FAABRIC_INCLUDE_DIR}/faabric/transport/PointToPointClient.h" "${FAABRIC_INCLUDE_DIR}/faabric/transport/PointToPointServer.h" - "${FAABRIC_INCLUDE_DIR}/faabric/transport/PointToPointBroker.h" ) set(LIB_FILES @@ -23,9 +23,9 @@ set(LIB_FILES MessageEndpointClient.cpp MessageEndpointServer.cpp MpiMessageEndpoint.cpp + PointToPointBroker.cpp PointToPointClient.cpp PointToPointServer.cpp - PointToPointBroker.cpp ${HEADERS} ) diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp index e0d8e02b3..b5bd600df 100644 --- a/src/transport/PointToPointBroker.cpp +++ b/src/transport/PointToPointBroker.cpp @@ -22,6 +22,10 @@ thread_local std:: unordered_map> sendEndpoints; +thread_local std::unordered_map> + clients; + std::string getPointToPointKey(int appId, int sendIdx, int recvIdx) { return fmt::format("{}-{}-{}", appId, sendIdx, recvIdx); @@ -110,8 +114,9 @@ void PointToPointBroker::sendMappings(int appId, const std::string& host) indexes.size(), appId, host); - PointToPointClient cli(host); - cli.sendMappings(msg); + + auto cli = getClient(host); + cli->sendMappings(msg); } std::set PointToPointBroker::getIdxsRegisteredForApp(int appId) @@ -149,7 +154,7 @@ void PointToPointBroker::sendMessage(int appId, sendEndpoints[label]->send(buffer, bufferSize); } else { - PointToPointClient cli(host); + auto cli = getClient(host); faabric::PointToPointMessage msg; msg.set_appid(appId); msg.set_sendidx(sendIdx); @@ -162,7 +167,7 @@ void PointToPointBroker::sendMessage(int appId, recvIdx, host); - cli.sendMessage(msg); + cli->sendMessage(msg); } } @@ -188,6 +193,19 @@ std::vector PointToPointBroker::recvMessage(int appId, return messageData.dataCopy(); } +std::shared_ptr PointToPointBroker::getClient( + const std::string& host) +{ + // Note - this map is thread-local so no locking required + if (clients.find(host) == clients.end()) { + clients[host] = std::make_shared(host); + + SPDLOG_TRACE("Created new point-to-point client {}", host); + } + + return clients[host]; +} + void PointToPointBroker::clear() { faabric::util::SharedLock lock(brokerMutex); @@ -198,12 +216,11 @@ void PointToPointBroker::clear() void PointToPointBroker::resetThreadLocalCache() { - auto tid = (pid_t)syscall(SYS_gettid); - SPDLOG_TRACE("Resetting point-to-point thread-local cache for thread {}", - tid); + SPDLOG_TRACE("Resetting point-to-point thread-local cache"); sendEndpoints.clear(); recvEndpoints.clear(); + clients.clear(); } PointToPointBroker& getPointToPointBroker() diff --git a/src/transport/PointToPointServer.cpp b/src/transport/PointToPointServer.cpp index d9a16efbf..581167a0c 100644 --- a/src/transport/PointToPointServer.cpp +++ b/src/transport/PointToPointServer.cpp @@ -25,7 +25,14 @@ void PointToPointServer::doAsyncRecv(int header, { switch (header) { case (faabric::transport::PointToPointCall::MESSAGE): { - doSendMessage(buffer, bufferSize); + PARSE_MSG(faabric::PointToPointMessage, buffer, bufferSize) + + // Send the message locally to the downstream socket + reg.sendMessage(msg.appid(), + msg.sendidx(), + msg.recvidx(), + BYTES_CONST(msg.data().c_str()), + msg.data().size()); break; } default: { @@ -35,17 +42,6 @@ void PointToPointServer::doAsyncRecv(int header, } } -void PointToPointServer::doSendMessage(const uint8_t* buffer, size_t bufferSize) -{ - PARSE_MSG(faabric::PointToPointMessage, buffer, bufferSize) - - reg.sendMessage(msg.appid(), - msg.sendidx(), - msg.recvidx(), - BYTES_CONST(msg.data().c_str()), - msg.data().size()); -} - std::unique_ptr PointToPointServer::doSyncRecv( int header, const uint8_t* buffer, @@ -68,8 +64,6 @@ std::unique_ptr PointToPointServer::doRecvMappings( { PARSE_MSG(faabric::PointToPointMappings, buffer, bufferSize) - PointToPointBroker& reg = getPointToPointBroker(); - for (const auto& m : msg.mappings()) { reg.setHostForReceiver(m.appid(), m.recvidx(), m.host()); }