From 9fdfcef10de0b7a581278bfaa724f6be0696940f Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Mon, 1 Nov 2021 13:04:49 +0000 Subject: [PATCH 01/31] Started fixes for locks --- .../faabric/transport/PointToPointBroker.h | 6 +- src/transport/PointToPointBroker.cpp | 169 ++++++++++-------- 2 files changed, 93 insertions(+), 82 deletions(-) diff --git a/include/faabric/transport/PointToPointBroker.h b/include/faabric/transport/PointToPointBroker.h index cb4bd33c8..705becfe7 100644 --- a/include/faabric/transport/PointToPointBroker.h +++ b/include/faabric/transport/PointToPointBroker.h @@ -30,6 +30,8 @@ class PointToPointGroup static void addGroup(int appId, int groupId, int groupSize); + static void addGroupIfNotExists(int appId, int groupId, int groupSize); + static void clear(); PointToPointGroup(int appId, int groupIdIn, int groupSizeIn); @@ -77,10 +79,6 @@ class PointToPointGroup std::queue lockWaiters; void notifyLocked(int groupIdx); - - void masterLock(int groupIdx, bool recursive); - - void masterUnlock(int groupIdx, bool recursive); }; class PointToPointBroker diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp index 38e9e4921..4093bec32 100644 --- a/src/transport/PointToPointBroker.cpp +++ b/src/transport/PointToPointBroker.cpp @@ -22,6 +22,8 @@ namespace faabric::transport { static std::unordered_map> groups; +static std::shared_mutex groupsMutex; + // NOTE: Keeping 0MQ sockets in TLS is usually a bad idea, as they _must_ be // closed before the global context. However, in this case it's worth it // to cache the sockets across messages, as otherwise we'd be creating and @@ -76,13 +78,30 @@ std::shared_ptr PointToPointGroup::getGroup(int groupId) bool PointToPointGroup::groupExists(int groupId) { + faabric::util::SharedLock lock(groupsMutex); return groups.find(groupId) != groups.end(); } void PointToPointGroup::addGroup(int appId, int groupId, int groupSize) { - groups.emplace(std::make_pair( - groupId, std::make_shared(appId, groupId, groupSize))); + faabric::util::FullLock lock(groupsMutex); + + if (groups.find(groupId) == groups.end()) { + groups.emplace(std::make_pair( + groupId, + std::make_shared(appId, groupId, groupSize))); + } +} + +void PointToPointGroup::addGroupIfNotExists(int appId, + int groupId, + int groupSize) +{ + if (groupExists(groupId)) { + return; + } + + addGroup(appId, groupId, groupSize); } void PointToPointGroup::clear() @@ -106,7 +125,44 @@ void PointToPointGroup::lock(int groupIdx, bool recursive) ptpBroker.getHostForReceiver(groupId, POINT_TO_POINT_MASTER_IDX); if (host == conf.endpointHost) { - masterLock(groupIdx, recursive); + bool acquiredLock = false; + { + faabric::util::UniqueLock lock(mx); + if (recursive) { + bool isFree = recursiveLockOwners.empty(); + + bool lockOwnedByThisIdx = + !isFree && (recursiveLockOwners.top() == groupIdx); + + if (isFree || lockOwnedByThisIdx) { + // Recursive and either free, or already locked by this idx + recursiveLockOwners.push(groupIdx); + acquiredLock = true; + } + } else if (lockOwnerIdx == NO_LOCK_OWNER_IDX) { + // Non-recursive and free + lockOwnerIdx = groupIdx; + acquiredLock = true; + } + } + + if (acquiredLock) { + SPDLOG_TRACE("Group idx {}, locked {} (recursive {})", + groupIdx, + groupId, + recursive); + + notifyLocked(groupIdx); + } else { + SPDLOG_TRACE( + "Group idx {}, lock {} already locked({} waiters, recursive {})", + groupIdx, + groupId, + lockWaiters.size(), + recursive); + + lockWaiters.push(groupIdx); + } } else { auto cli = getClient(host); faabric::PointToPointMessage msg; @@ -121,54 +177,10 @@ void PointToPointGroup::lock(int groupIdx, bool recursive) host); cli->groupLock(appId, groupId, groupIdx, recursive); - - // Await ptp response - ptpBroker.recvMessage(groupId, POINT_TO_POINT_MASTER_IDX, groupIdx); } -} - -void PointToPointGroup::masterLock(int groupIdx, bool recursive) -{ - SPDLOG_TRACE("Master lock {}:{}", groupId, groupIdx); - bool success = false; - { - faabric::util::UniqueLock lock(mx); - if (recursive) { - bool isFree = recursiveLockOwners.empty(); - - bool lockOwnedByThisIdx = - !isFree && (recursiveLockOwners.top() == groupIdx); - - if (isFree || lockOwnedByThisIdx) { - // Recursive and either free, or already locked by this idx - SPDLOG_TRACE("Group idx {} recursively locked {} ({})", - groupIdx, - groupId, - lockWaiters.size()); - recursiveLockOwners.push(groupIdx); - success = true; - } else { - SPDLOG_TRACE("Group idx {} unable to recursively lock {} ({})", - groupIdx, - groupId, - lockWaiters.size()); - } - } else if (lockOwnerIdx == NO_LOCK_OWNER_IDX) { - // Non-recursive and free - SPDLOG_TRACE("Group idx {} locked {}", groupIdx, groupId); - lockOwnerIdx = groupIdx; - success = true; - } else { - // Unable to lock, wait in queue - SPDLOG_TRACE("Group idx {} unable to lock {}", groupIdx, groupId); - lockWaiters.push(groupIdx); - } - } - - if (success) { - notifyLocked(groupIdx); - } + // Await ptp response + ptpBroker.recvMessage(groupId, POINT_TO_POINT_MASTER_IDX, groupIdx); } void PointToPointGroup::localLock() @@ -188,7 +200,35 @@ void PointToPointGroup::unlock(int groupIdx, bool recursive) ptpBroker.getHostForReceiver(groupId, POINT_TO_POINT_MASTER_IDX); if (host == conf.endpointHost) { - masterUnlock(groupIdx, recursive); + SPDLOG_TRACE("Group idx {} unlocking {} ({} waiters, recursive {})", + groupIdx, + groupId, + lockWaiters.size(), + recursive); + + faabric::util::UniqueLock lock(mx); + + if (recursive) { + recursiveLockOwners.pop(); + + if (!recursiveLockOwners.empty()) { + return; + } + + if (!lockWaiters.empty()) { + recursiveLockOwners.push(lockWaiters.front()); + notifyLocked(lockWaiters.front()); + lockWaiters.pop(); + } + } else { + lockOwnerIdx = NO_LOCK_OWNER_IDX; + + if (!lockWaiters.empty()) { + lockOwnerIdx = lockWaiters.front(); + notifyLocked(lockWaiters.front()); + lockWaiters.pop(); + } + } } else { auto cli = getClient(host); faabric::PointToPointMessage msg; @@ -196,7 +236,7 @@ void PointToPointGroup::unlock(int groupIdx, bool recursive) msg.set_sendidx(groupIdx); msg.set_recvidx(POINT_TO_POINT_MASTER_IDX); - SPDLOG_TRACE("Remote lock {}:{}:{} to {}", + SPDLOG_TRACE("Remote unlock {}:{}:{} to {}", groupId, groupIdx, POINT_TO_POINT_MASTER_IDX, @@ -206,33 +246,6 @@ void PointToPointGroup::unlock(int groupIdx, bool recursive) } } -void PointToPointGroup::masterUnlock(int groupIdx, bool recursive) -{ - faabric::util::UniqueLock lock(mx); - - if (recursive) { - recursiveLockOwners.pop(); - - if (!recursiveLockOwners.empty()) { - return; - } - - if (!lockWaiters.empty()) { - recursiveLockOwners.push(lockWaiters.front()); - notifyLocked(lockWaiters.front()); - lockWaiters.pop(); - } - } else { - lockOwnerIdx = NO_LOCK_OWNER_IDX; - - if (!lockWaiters.empty()) { - lockOwnerIdx = lockWaiters.front(); - notifyLocked(lockWaiters.front()); - lockWaiters.pop(); - } - } -} - void PointToPointGroup::localUnlock() { localMx.unlock(); From a14a1bc4f2958f63c962f8b714eef9be3d96b00f Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Mon, 1 Nov 2021 14:52:27 +0000 Subject: [PATCH 02/31] Clearing up groups --- .../faabric/transport/PointToPointBroker.h | 6 ++++ src/scheduler/Executor.cpp | 15 ++++++++ src/transport/PointToPointBroker.cpp | 36 +++++++++++++++++++ 3 files changed, 57 insertions(+) diff --git a/include/faabric/transport/PointToPointBroker.h b/include/faabric/transport/PointToPointBroker.h index 705becfe7..3b985e431 100644 --- a/include/faabric/transport/PointToPointBroker.h +++ b/include/faabric/transport/PointToPointBroker.h @@ -26,12 +26,16 @@ class PointToPointGroup public: static std::shared_ptr getGroup(int groupId); + static std::shared_ptr getOrAwaitGroup(int groupId); + static bool groupExists(int groupId); static void addGroup(int appId, int groupId, int groupSize); static void addGroupIfNotExists(int appId, int groupId, int groupSize); + static void clearGroup(int groupId); + static void clear(); PointToPointGroup(int appId, int groupIdIn, int groupSizeIn); @@ -106,6 +110,8 @@ class PointToPointBroker std::vector recvMessage(int groupId, int sendIdx, int recvIdx); + void clearGroup(int groupId); + void clear(); void resetThreadLocalCache(); diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp index 0a2c77c67..41a637cb7 100644 --- a/src/scheduler/Executor.cpp +++ b/src/scheduler/Executor.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -253,6 +254,13 @@ void Executor::threadPoolThread(int threadPoolIdx) threadPoolIdx, oldTaskCount - 1); + // If this messages was part of a point-to-point group, we must make + // sure its thread-local state is cleared to allow sockets to be + // destructed. + if (msg.groupid() > 0) { + faabric::transport::getPointToPointBroker().resetThreadLocalCache(); + } + // Handle snapshot diffs _before_ we reset the executor if (isLastInBatch && task.needsSnapshotPush) { // Get diffs between original snapshot and after execution @@ -283,6 +291,13 @@ void Executor::threadPoolThread(int threadPoolIdx) reset(msg); } + // We need to remove the point-to-point group associated with this + // message completely. + if (msg.groupid() > 0) { + faabric::transport::getPointToPointBroker().clearGroup( + msg.groupid()); + } + releaseClaim(); } diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp index 4093bec32..30e1766f6 100644 --- a/src/transport/PointToPointBroker.cpp +++ b/src/transport/PointToPointBroker.cpp @@ -76,6 +76,14 @@ std::shared_ptr PointToPointGroup::getGroup(int groupId) return groups.at(groupId); } +std::shared_ptr PointToPointGroup::getOrAwaitGroup( + int groupId) +{ + getPointToPointBroker().waitForMappingsOnThisHost(groupId); + + return getGroup(groupId); +} + bool PointToPointGroup::groupExists(int groupId) { faabric::util::SharedLock lock(groupsMutex); @@ -104,6 +112,11 @@ void PointToPointGroup::addGroupIfNotExists(int appId, addGroup(appId, groupId, groupSize); } +void PointToPointGroup::clearGroup(int groupId) +{ + groups.erase(groupId); +} + void PointToPointGroup::clear() { groups.clear(); @@ -533,6 +546,29 @@ std::vector PointToPointBroker::recvMessage(int groupId, return messageData.dataCopy(); } +void PointToPointBroker::clearGroup(int groupId) +{ + SPDLOG_TRACE("Clearing point-to-point group {}", groupId); + + faabric::util::SharedLock lock(brokerMutex); + + std::set idxs = getIdxsRegisteredForGroup(groupId); + for (auto idxA : idxs) { + for (auto idxB : idxs) { + std::string label = getPointToPointKey(groupId, idxA, idxB); + mappings.erase(label); + } + } + + groupIdIdxsMap.erase(groupId); + + PointToPointGroup::clearGroup(groupId); + + groupMappingMutexes.erase(groupId); + groupMappingsFlags.erase(groupId); + groupMappingCvs.erase(groupId); +} + void PointToPointBroker::clear() { faabric::util::SharedLock lock(brokerMutex); From b6a0c2e2bd7fdc4ac900e329a564a90c6c2e3898 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Mon, 1 Nov 2021 18:09:45 +0000 Subject: [PATCH 03/31] Experiments with waiters/ maps --- cmake/ExternalProjects.cmake | 3 + .../faabric/transport/PointToPointBroker.h | 9 +-- include/faabric/util/locks.h | 35 ++++++++++ include/faabric/util/map.h | 69 +++++++++++++++++++ src/transport/PointToPointBroker.cpp | 65 +++++++---------- 5 files changed, 135 insertions(+), 46 deletions(-) create mode 100644 include/faabric/util/map.h diff --git a/cmake/ExternalProjects.cmake b/cmake/ExternalProjects.cmake index 7f16a103b..0ccfe9bab 100644 --- a/cmake/ExternalProjects.cmake +++ b/cmake/ExternalProjects.cmake @@ -29,6 +29,7 @@ conan_cmake_configure( cppzmq/4.8.1 flatbuffers/2.0.0 hiredis/1.0.2 + parallel-hashmap/1.33 protobuf/3.17.1 rapidjson/cci.20200410 spdlog/1.9.2 @@ -81,6 +82,7 @@ find_package(cpprestsdk REQUIRED) find_package(cppzmq REQUIRED) find_package(fmt REQUIRED) find_package(hiredis REQUIRED) +find_package(phmap REQUIRED) find_package(spdlog REQUIRED) # Pistache - Conan version is out of date and doesn't support clang @@ -132,6 +134,7 @@ target_link_libraries(faabric_common_dependencies INTERFACE cppzmq::cppzmq flatbuffers::flatbuffers hiredis::hiredis + phmap::phmap pistache::pistache protobuf::libprotobuf RapidJSON::RapidJSON diff --git a/include/faabric/transport/PointToPointBroker.h b/include/faabric/transport/PointToPointBroker.h index 3b985e431..fdfa2da86 100644 --- a/include/faabric/transport/PointToPointBroker.h +++ b/include/faabric/transport/PointToPointBroker.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -117,16 +118,16 @@ class PointToPointBroker void resetThreadLocalCache(); private: + faabric::util::SystemConfig& conf; + std::shared_mutex brokerMutex; std::unordered_map> groupIdIdxsMap; std::unordered_map mappings; - std::unordered_map groupMappingsFlags; - std::unordered_map groupMappingMutexes; - std::unordered_map groupMappingCvs; + std::unordered_map groupFlags; - faabric::util::SystemConfig& conf; + faabric::util::FlagWaiter& getGroupFlag(int groupId); }; PointToPointBroker& getPointToPointBroker(); diff --git a/include/faabric/util/locks.h b/include/faabric/util/locks.h index ed9037763..3e10c0c62 100644 --- a/include/faabric/util/locks.h +++ b/include/faabric/util/locks.h @@ -1,5 +1,9 @@ #pragma once +#include + +#include +#include #include #include @@ -7,4 +11,35 @@ namespace faabric::util { typedef std::unique_lock UniqueLock; typedef std::unique_lock FullLock; typedef std::shared_lock SharedLock; + +class FlagWaiter +{ + public: + void waitOnFlag() + { + // Check + if (flag.load()) { + return; + } + + // Wait for group to be enabled + UniqueLock lock(flagMx); + if (!cv.wait_for(lock, std::chrono::milliseconds(10000), [this] { + return flag.load(); + })) { + + SPDLOG_ERROR("Timed out waiting for flag"); + throw std::runtime_error("Timed out waiting for flag"); + } + } + + void setFlag(bool value) { flag.store(value); } + + bool getValue() { return flag.load(); } + + private: + std::mutex flagMx; + std::condition_variable cv; + std::atomic flag; +}; } diff --git a/include/faabric/util/map.h b/include/faabric/util/map.h new file mode 100644 index 000000000..638603591 --- /dev/null +++ b/include/faabric/util/map.h @@ -0,0 +1,69 @@ +#pragma once + +#include + +#include + +namespace faabric::util { + +// See documentation for phmap: https://greg7mdp.github.io/parallel-hashmap/ +// Specifically section titled: "Using the intrinsic parallelism ..." + +template +using SimpleMap = + phmap::parallel_flat_hash_map, + phmap::priv::hash_default_eq, + std::allocator>, + 4>; + +template +using ParallelMap = + phmap::parallel_flat_hash_map, + phmap::priv::hash_default_eq, + std::allocator>, + 4, + std::mutex>; + +template +class SlowParallelMap +{ + public: + V& at(const K& key) { return map.at(key); } + + V& getOrInsert(const K& key) + { + getOrInsert(key, [key](std::unordered_map& m) { m[key]; }); + } + + V& getOrInsert( + const K& key, + const std::function)>& insertOp) + { + if (map.find(key) == map.end()) { + FullLock lock(mx); + if (map.find(key) == map.end()) { + insertOp(); + } + } + + { + SharedLock lock(mx); + return map.at(key); + } + } + + void insert(const K& key, const V& value) { map.insert(key, value); } + + void erase(const K& key) { map.erase(key); } + + void clear() { map.clear(); } + + private: + std::shared_mutex mx; + std::unordered_map map; +}; +} diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp index 30e1766f6..820316875 100644 --- a/src/transport/PointToPointBroker.cpp +++ b/src/transport/PointToPointBroker.cpp @@ -390,19 +390,10 @@ PointToPointBroker::setUpLocalMappingsFromSchedulingDecision( decision.appId, groupId, decision.nFunctions); } - { - // Lock this group - faabric::util::UniqueLock lock(groupMappingMutexes[groupId]); - - SPDLOG_TRACE( - "Enabling point-to-point mapping for {}:{}", decision.appId, groupId); + SPDLOG_TRACE( + "Enabling point-to-point mapping for {}:{}", decision.appId, groupId); - // Enable the group - groupMappingsFlags[groupId] = true; - - // Notify waiters - groupMappingCvs[groupId].notify_all(); - } + getGroupFlag(groupId).setFlag(true); return hosts; } @@ -440,33 +431,27 @@ void PointToPointBroker::setAndSendMappingsFromSchedulingDecision( } } +faabric::util::FlagWaiter& PointToPointBroker::getGroupFlag(int groupId) +{ + if (groupFlags.find(groupId) == groupFlags.end()) { + faabric::util::FullLock lock(brokerMutex); + if (groupFlags.find(groupId) == groupFlags.end()) { + return groupFlags[groupId]; + } + } + + { + faabric::util::SharedLock lock(brokerMutex); + return groupFlags.at(groupId); + } +} + void PointToPointBroker::waitForMappingsOnThisHost(int groupId) { + faabric::util::FlagWaiter &waiter = getGroupFlag(groupId); // Check if it's been enabled - if (!groupMappingsFlags[groupId]) { - - // Lock this group - faabric::util::UniqueLock lock(groupMappingMutexes[groupId]); - - // Check again - if (!groupMappingsFlags[groupId]) { - // Wait for group to be enabled - auto timePoint = std::chrono::system_clock::now() + - std::chrono::milliseconds(MAPPING_TIMEOUT_MS); - - if (!groupMappingCvs[groupId].wait_until( - lock, timePoint, [this, groupId] { - return groupMappingsFlags[groupId]; - })) { - - SPDLOG_ERROR("Timed out waiting for group mappings {}", - groupId); - throw std::runtime_error( - "Timed out waiting for group mappings"); - } - - SPDLOG_TRACE("Point-to-point mappings for {} ready", groupId); - } + if (!waiter.getValue()) { + waiter.waitOnFlag(); } } @@ -564,9 +549,7 @@ void PointToPointBroker::clearGroup(int groupId) PointToPointGroup::clearGroup(groupId); - groupMappingMutexes.erase(groupId); - groupMappingsFlags.erase(groupId); - groupMappingCvs.erase(groupId); + groupFlags.erase(groupId); } void PointToPointBroker::clear() @@ -578,9 +561,7 @@ void PointToPointBroker::clear() PointToPointGroup::clear(); - groupMappingMutexes.clear(); - groupMappingsFlags.clear(); - groupMappingCvs.clear(); + groupFlags.clear(); } void PointToPointBroker::resetThreadLocalCache() From 64dbd924699f71a1350af939167d8db097195fc9 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Tue, 2 Nov 2021 10:01:50 +0000 Subject: [PATCH 04/31] Add test for flag waiter --- cmake/ExternalProjects.cmake | 7 +++ include/faabric/util/locks.h | 19 +++++-- src/transport/PointToPointBroker.cpp | 7 +-- tests/test/state/test_state.cpp | 6 -- tests/test/util/test_locks.cpp | 82 ++++++++++++++++++++++++++++ 5 files changed, 107 insertions(+), 14 deletions(-) create mode 100644 tests/test/util/test_locks.cpp diff --git a/cmake/ExternalProjects.cmake b/cmake/ExternalProjects.cmake index 0ccfe9bab..b2e469a81 100644 --- a/cmake/ExternalProjects.cmake +++ b/cmake/ExternalProjects.cmake @@ -29,6 +29,7 @@ conan_cmake_configure( cppzmq/4.8.1 flatbuffers/2.0.0 hiredis/1.0.2 + libcds/2.3.3 parallel-hashmap/1.33 protobuf/3.17.1 rapidjson/cci.20200410 @@ -82,6 +83,7 @@ find_package(cpprestsdk REQUIRED) find_package(cppzmq REQUIRED) find_package(fmt REQUIRED) find_package(hiredis REQUIRED) +find_package(LibCDS REQUIRED) find_package(phmap REQUIRED) find_package(spdlog REQUIRED) @@ -122,9 +124,11 @@ add_library(zstd::libzstd_static ALIAS libzstd_static) # Group all external dependencies into a convenient virtual CMake library add_library(faabric_common_dependencies INTERFACE) + target_include_directories(faabric_common_dependencies INTERFACE ${FAABRIC_INCLUDE_DIR} ) + target_link_libraries(faabric_common_dependencies INTERFACE Boost::Boost Boost::filesystem @@ -134,6 +138,7 @@ target_link_libraries(faabric_common_dependencies INTERFACE cppzmq::cppzmq flatbuffers::flatbuffers hiredis::hiredis + LibCDS::LibCDS phmap::phmap pistache::pistache protobuf::libprotobuf @@ -142,7 +147,9 @@ target_link_libraries(faabric_common_dependencies INTERFACE Threads::Threads zstd::libzstd_static ) + target_compile_definitions(faabric_common_dependencies INTERFACE FMT_DEPRECATED= # Suppress warnings about use of deprecated api by spdlog ) + add_library(faabric::common_dependencies ALIAS faabric_common_dependencies) diff --git a/include/faabric/util/locks.h b/include/faabric/util/locks.h index 3e10c0c62..8f14525b3 100644 --- a/include/faabric/util/locks.h +++ b/include/faabric/util/locks.h @@ -7,6 +7,8 @@ #include #include +#define DEFAULT_FLAG_WAIT_MS 10000 + namespace faabric::util { typedef std::unique_lock UniqueLock; typedef std::unique_lock FullLock; @@ -15,6 +17,10 @@ typedef std::shared_lock SharedLock; class FlagWaiter { public: + FlagWaiter(int timeoutMsIn = DEFAULT_FLAG_WAIT_MS) + : timeoutMs(timeoutMsIn) + {} + void waitOnFlag() { // Check @@ -24,7 +30,7 @@ class FlagWaiter // Wait for group to be enabled UniqueLock lock(flagMx); - if (!cv.wait_for(lock, std::chrono::milliseconds(10000), [this] { + if (!cv.wait_for(lock, std::chrono::milliseconds(timeoutMs), [this] { return flag.load(); })) { @@ -33,11 +39,16 @@ class FlagWaiter } } - void setFlag(bool value) { flag.store(value); } - - bool getValue() { return flag.load(); } + void setFlag(bool value) + { + UniqueLock lock(flagMx); + flag.store(value); + cv.notify_all(); + } private: + int timeoutMs; + std::mutex flagMx; std::condition_variable cv; std::atomic flag; diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp index 820316875..c0e5aeda5 100644 --- a/src/transport/PointToPointBroker.cpp +++ b/src/transport/PointToPointBroker.cpp @@ -448,11 +448,10 @@ faabric::util::FlagWaiter& PointToPointBroker::getGroupFlag(int groupId) void PointToPointBroker::waitForMappingsOnThisHost(int groupId) { - faabric::util::FlagWaiter &waiter = getGroupFlag(groupId); + faabric::util::FlagWaiter& waiter = getGroupFlag(groupId); + // Check if it's been enabled - if (!waiter.getValue()) { - waiter.waitOnFlag(); - } + waiter.waitOnFlag(); } std::set PointToPointBroker::getIdxsRegisteredForGroup(int groupId) diff --git a/tests/test/state/test_state.cpp b/tests/test/state/test_state.cpp index af9e7aa3b..40cc5be4a 100644 --- a/tests/test/state/test_state.cpp +++ b/tests/test/state/test_state.cpp @@ -114,12 +114,6 @@ class StateServerTestFixture std::vector actual(values.size(), 0); setDummyData(values); - // Get, with optional pull - int nMessages = 1; - if (doPull) { - nMessages = 2; - } - // Initial pull const std::shared_ptr& localKv = getLocalKv(); localKv->pull(); diff --git a/tests/test/util/test_locks.cpp b/tests/test/util/test_locks.cpp new file mode 100644 index 000000000..839adbf85 --- /dev/null +++ b/tests/test/util/test_locks.cpp @@ -0,0 +1,82 @@ +#include +#include + +#include +#include +#include + +using namespace faabric::util; + +namespace tests { + +TEST_CASE("Test wait flag", "[util]") +{ + int nThreads = 10; + + FlagWaiter flagA; + FlagWaiter flagB; + + std::shared_ptr latchA1 = Latch::create(nThreads + 1); + std::shared_ptr latchB1 = Latch::create(nThreads + 1); + + std::shared_ptr latchA2 = Latch::create(nThreads + 1); + std::shared_ptr latchB2 = Latch::create(nThreads + 1); + + std::vector expectedUnset(nThreads, 0); + std::vector expectedSet; + + std::vector threadsA; + std::vector threadsB; + + std::vector resultsA(nThreads, 0); + std::vector resultsB(nThreads, 0); + + for (int i = 0; i < nThreads; i++) { + expectedSet.push_back(i); + + threadsA.emplace_back([&flagA, &resultsA, &latchA1, &latchA2, i] { + latchA1->wait(); + flagA.waitOnFlag(); + resultsA[i] = i; + latchA2->wait(); + }); + + threadsB.emplace_back([&flagB, &resultsB, &latchB1, &latchB2, i] { + latchB1->wait(); + flagB.waitOnFlag(); + resultsB[i] = i; + latchB2->wait(); + }); + } + + // Check no results are set initially + latchA1->wait(); + latchB1->wait(); + REQUIRE(resultsA == expectedUnset); + REQUIRE(resultsB == expectedUnset); + + // Set one flag, await latch and check + flagA.setFlag(true); + latchA2->wait(); + REQUIRE(resultsA == expectedSet); + REQUIRE(resultsB == expectedUnset); + + // Set other flag, await latch and check + flagB.setFlag(true); + latchB2->wait(); + REQUIRE(resultsA == expectedSet); + REQUIRE(resultsB == expectedSet); + + for (auto& t : threadsA) { + if (t.joinable()) { + t.join(); + } + } + + for (auto& t : threadsB) { + if (t.joinable()) { + t.join(); + } + } +} +} From 9d4570d82bc6460b29af60e262c40fdb25d58a2f Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Tue, 2 Nov 2021 11:25:54 +0000 Subject: [PATCH 05/31] Avoid scheduling multiple threads on same thread as 0th group --- include/faabric/scheduler/Scheduler.h | 1 + src/scheduler/Executor.cpp | 58 +++++++++++++++++++-------- src/transport/PointToPointBroker.cpp | 6 +++ 3 files changed, 49 insertions(+), 16 deletions(-) diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index f95219959..dc7b070d2 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -88,6 +88,7 @@ class Executor std::mutex threadsMutex; std::vector> threadPoolThreads; std::vector> deadThreads; + std::set availablePoolThreads; std::vector> threadTaskQueues; diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp index 41a637cb7..d3545d5d9 100644 --- a/src/scheduler/Executor.cpp +++ b/src/scheduler/Executor.cpp @@ -45,6 +45,11 @@ Executor::Executor(faabric::Message& msg) // Set an ID for this Executor id = conf.endpointHost + "_" + std::to_string(faabric::util::generateGid()); SPDLOG_DEBUG("Starting executor {}", id); + + // Mark all thread pool threads as available + for (int i = 0; i < threadPoolSize; i++) { + availablePoolThreads.insert(i); + } } Executor::~Executor() {} @@ -152,17 +157,35 @@ void Executor::executeTasks(std::vector msgIdxs, bool skipReset = isMaster && isThreads; // Iterate through and invoke tasks + // Rules for thread pool scheduling: + // - Keep index zero free if on master, as this is likely to be the + // main thread which is blocking on others + // - If part of a messaging group, give the 0th idx its own thread, as + // it's likely to block waiting for the others + if (isThreads && isMaster) { + availablePoolThreads.erase(0); + } + + bool isInPtpGroup = firstMsg.groupid() > 0; + + std::set::iterator it = availablePoolThreads.begin(); for (int msgIdx : msgIdxs) { const faabric::Message& msg = req->messages().at(msgIdx); + int threadPoolIdx = -1; - // If executing threads, we must always keep thread pool index zero - // free, as this may be executing the function that spawned them - int threadPoolIdx; - if (isThreads) { - assert(threadPoolSize > 1); - threadPoolIdx = (msg.appidx() % (threadPoolSize - 1)) + 1; + if (isInPtpGroup && msg.groupidx() == 0) { + // If zeroth in a group, need to reserve a thread pool idx + threadPoolIdx = *availablePoolThreads.begin(); + availablePoolThreads.erase(threadPoolIdx); + it = availablePoolThreads.begin(); } else { - threadPoolIdx = msg.appidx() % threadPoolSize; + if (it == availablePoolThreads.end()) { + // Reset iterator to start + it = availablePoolThreads.begin(); + } + + threadPoolIdx = *it; + it++; } // Enqueue the task @@ -254,13 +277,6 @@ void Executor::threadPoolThread(int threadPoolIdx) threadPoolIdx, oldTaskCount - 1); - // If this messages was part of a point-to-point group, we must make - // sure its thread-local state is cleared to allow sockets to be - // destructed. - if (msg.groupid() > 0) { - faabric::transport::getPointToPointBroker().resetThreadLocalCache(); - } - // Handle snapshot diffs _before_ we reset the executor if (isLastInBatch && task.needsSnapshotPush) { // Get diffs between original snapshot and after execution @@ -294,13 +310,23 @@ void Executor::threadPoolThread(int threadPoolIdx) // We need to remove the point-to-point group associated with this // message completely. if (msg.groupid() > 0) { - faabric::transport::getPointToPointBroker().clearGroup( - msg.groupid()); + faabric::transport::PointToPointBroker& broker = + faabric::transport::getPointToPointBroker(); + + broker.clearGroup(msg.groupid()); + + broker.resetThreadLocalCache(); } releaseClaim(); } + // Make sure this thread index is available for scheduling + { + faabric::util::UniqueLock lock(threadsMutex); + availablePoolThreads.insert(threadPoolIdx); + } + // Vacate the slot occupied by this task. This must be done after // releasing the claim on this executor, otherwise the scheduler may try // to schedule another function and be unable to reuse this executor. diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp index c0e5aeda5..0fc9d20dc 100644 --- a/src/transport/PointToPointBroker.cpp +++ b/src/transport/PointToPointBroker.cpp @@ -305,10 +305,16 @@ void PointToPointGroup::notify(int groupIdx) { if (groupIdx == POINT_TO_POINT_MASTER_IDX) { for (int i = 1; i < groupSize; i++) { + SPDLOG_TRACE( + "Master group {} waiting for notify from index {}", groupId, i); + ptpBroker.recvMessage(groupId, i, POINT_TO_POINT_MASTER_IDX); + + SPDLOG_TRACE("Master group {} notified by index {}", groupId, i); } } else { std::vector data(1, 0); + SPDLOG_TRACE("Notifying group {} from index {}", groupId, groupIdx); ptpBroker.sendMessage(groupId, groupIdx, POINT_TO_POINT_MASTER_IDX, From 249daf2ab77736545f64ec741d78c6d5271f8a6f Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Tue, 2 Nov 2021 12:52:36 +0000 Subject: [PATCH 06/31] Overloading thread pool --- .../faabric/transport/PointToPointBroker.h | 2 +- src/scheduler/Executor.cpp | 80 ++++++++++--------- src/transport/PointToPointBroker.cpp | 7 +- 3 files changed, 48 insertions(+), 41 deletions(-) diff --git a/include/faabric/transport/PointToPointBroker.h b/include/faabric/transport/PointToPointBroker.h index fdfa2da86..df8ea81e8 100644 --- a/include/faabric/transport/PointToPointBroker.h +++ b/include/faabric/transport/PointToPointBroker.h @@ -115,7 +115,7 @@ class PointToPointBroker void clear(); - void resetThreadLocalCache(); + void resetThreadLocalCache(bool soft=false); private: faabric::util::SystemConfig& conf; diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp index d3545d5d9..53476a0fe 100644 --- a/src/scheduler/Executor.cpp +++ b/src/scheduler/Executor.cpp @@ -156,41 +156,42 @@ void Executor::executeTasks(std::vector msgIdxs, // original function call will cause a reset bool skipReset = isMaster && isThreads; - // Iterate through and invoke tasks - // Rules for thread pool scheduling: - // - Keep index zero free if on master, as this is likely to be the - // main thread which is blocking on others - // - If part of a messaging group, give the 0th idx its own thread, as - // it's likely to block waiting for the others - if (isThreads && isMaster) { - availablePoolThreads.erase(0); - } - - bool isInPtpGroup = firstMsg.groupid() > 0; - - std::set::iterator it = availablePoolThreads.begin(); + // Iterate through and invoke tasks. By default, we allocate tasks + // one-to-one with thread pool threads. Only once the pool is exhausted do + // we start overallocating. for (int msgIdx : msgIdxs) { const faabric::Message& msg = req->messages().at(msgIdx); + int threadPoolIdx = -1; + if (availablePoolThreads.empty()) { + // Here all threads are still executing, so we have to overload. + // If any tasks are blocking we risk a deadlock, and can no longer + // guarantee the application will finish. + // In general if we're on the master host and this is a thread, we + // should avoid the zeroth and first pool threads as they are likely + // to be the main thread and the zeroth in the communication group, + // so will be blocking. + if (isThreads && isMaster) { + assert(threadPoolSize > 2); + threadPoolIdx = (msg.appidx() % (threadPoolSize - 2)) + 2; + } else { + threadPoolIdx = msg.appidx() % threadPoolSize; + } - if (isInPtpGroup && msg.groupidx() == 0) { - // If zeroth in a group, need to reserve a thread pool idx + SPDLOG_WARN("Overloaded app index {} to thread {}", + msg.appidx(), + threadPoolIdx); + } else { + // Take next from those that are available threadPoolIdx = *availablePoolThreads.begin(); availablePoolThreads.erase(threadPoolIdx); - it = availablePoolThreads.begin(); - } else { - if (it == availablePoolThreads.end()) { - // Reset iterator to start - it = availablePoolThreads.begin(); - } - threadPoolIdx = *it; - it++; + SPDLOG_TRACE("Assigned app index {} to thread {}", + msg.appidx(), + threadPoolIdx); } // Enqueue the task - SPDLOG_TRACE( - "Assigning app index {} to thread {}", msg.appidx(), threadPoolIdx); threadTaskQueues[threadPoolIdx].enqueue(ExecutorTask( msgIdx, req, batchCounter, needsSnapshotPush, skipReset)); @@ -207,6 +208,8 @@ void Executor::threadPoolThread(int threadPoolIdx) SPDLOG_DEBUG("Thread pool thread {}:{} starting up", id, threadPoolIdx); auto& sch = faabric::scheduler::getScheduler(); + faabric::transport::PointToPointBroker& broker = + faabric::transport::getPointToPointBroker(); const auto& conf = faabric::util::getSystemConfig(); bool selfShutdown = false; @@ -296,6 +299,17 @@ void Executor::threadPoolThread(int threadPoolIdx) faabric::util::resetDirtyTracking(); } + // In all threads if we've finished with a ptp group, we need to tidy up + // to ensure all sockets can be destructed + if (msg.groupid() > 0) { + broker.resetThreadLocalCache(true); + + // Remove the group from this host if last in batch + if (isLastInBatch) { + broker.clearGroup(msg.groupid()); + } + } + // If this batch is finished, reset the executor and release its claim. // Note that we have to release the claim _after_ resetting, otherwise // the executor won't be ready for reuse. @@ -307,21 +321,10 @@ void Executor::threadPoolThread(int threadPoolIdx) reset(msg); } - // We need to remove the point-to-point group associated with this - // message completely. - if (msg.groupid() > 0) { - faabric::transport::PointToPointBroker& broker = - faabric::transport::getPointToPointBroker(); - - broker.clearGroup(msg.groupid()); - - broker.resetThreadLocalCache(); - } - releaseClaim(); } - // Make sure this thread index is available for scheduling + // Return this thread index to the pool available for scheduling { faabric::util::UniqueLock lock(threadsMutex); availablePoolThreads.insert(threadPoolIdx); @@ -374,8 +377,9 @@ void Executor::threadPoolThread(int threadPoolIdx) } // We have to clean up TLS here as this should be the last use of the - // scheduler from this thread + // scheduler and point-to-point broker from this thread sch.resetThreadLocalCache(); + broker.resetThreadLocalCache(); } bool Executor::tryClaim() diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp index 0fc9d20dc..6fa56debb 100644 --- a/src/transport/PointToPointBroker.cpp +++ b/src/transport/PointToPointBroker.cpp @@ -569,13 +569,16 @@ void PointToPointBroker::clear() groupFlags.clear(); } -void PointToPointBroker::resetThreadLocalCache() +void PointToPointBroker::resetThreadLocalCache(bool soft) { SPDLOG_TRACE("Resetting point-to-point thread-local cache"); sendEndpoints.clear(); recvEndpoints.clear(); - clients.clear(); + + if (!soft) { + clients.clear(); + } } PointToPointBroker& getPointToPointBroker() From 6fca18b0b96e7763daadfc8e218ebee361a14596 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Tue, 2 Nov 2021 13:52:20 +0000 Subject: [PATCH 07/31] Moved unrelated changes to separate PR --- cmake/ExternalProjects.cmake | 10 ----- include/faabric/util/map.h | 69 --------------------------------- tests/test/state/test_state.cpp | 6 +++ 3 files changed, 6 insertions(+), 79 deletions(-) delete mode 100644 include/faabric/util/map.h diff --git a/cmake/ExternalProjects.cmake b/cmake/ExternalProjects.cmake index b2e469a81..7f16a103b 100644 --- a/cmake/ExternalProjects.cmake +++ b/cmake/ExternalProjects.cmake @@ -29,8 +29,6 @@ conan_cmake_configure( cppzmq/4.8.1 flatbuffers/2.0.0 hiredis/1.0.2 - libcds/2.3.3 - parallel-hashmap/1.33 protobuf/3.17.1 rapidjson/cci.20200410 spdlog/1.9.2 @@ -83,8 +81,6 @@ find_package(cpprestsdk REQUIRED) find_package(cppzmq REQUIRED) find_package(fmt REQUIRED) find_package(hiredis REQUIRED) -find_package(LibCDS REQUIRED) -find_package(phmap REQUIRED) find_package(spdlog REQUIRED) # Pistache - Conan version is out of date and doesn't support clang @@ -124,11 +120,9 @@ add_library(zstd::libzstd_static ALIAS libzstd_static) # Group all external dependencies into a convenient virtual CMake library add_library(faabric_common_dependencies INTERFACE) - target_include_directories(faabric_common_dependencies INTERFACE ${FAABRIC_INCLUDE_DIR} ) - target_link_libraries(faabric_common_dependencies INTERFACE Boost::Boost Boost::filesystem @@ -138,8 +132,6 @@ target_link_libraries(faabric_common_dependencies INTERFACE cppzmq::cppzmq flatbuffers::flatbuffers hiredis::hiredis - LibCDS::LibCDS - phmap::phmap pistache::pistache protobuf::libprotobuf RapidJSON::RapidJSON @@ -147,9 +139,7 @@ target_link_libraries(faabric_common_dependencies INTERFACE Threads::Threads zstd::libzstd_static ) - target_compile_definitions(faabric_common_dependencies INTERFACE FMT_DEPRECATED= # Suppress warnings about use of deprecated api by spdlog ) - add_library(faabric::common_dependencies ALIAS faabric_common_dependencies) diff --git a/include/faabric/util/map.h b/include/faabric/util/map.h deleted file mode 100644 index 638603591..000000000 --- a/include/faabric/util/map.h +++ /dev/null @@ -1,69 +0,0 @@ -#pragma once - -#include - -#include - -namespace faabric::util { - -// See documentation for phmap: https://greg7mdp.github.io/parallel-hashmap/ -// Specifically section titled: "Using the intrinsic parallelism ..." - -template -using SimpleMap = - phmap::parallel_flat_hash_map, - phmap::priv::hash_default_eq, - std::allocator>, - 4>; - -template -using ParallelMap = - phmap::parallel_flat_hash_map, - phmap::priv::hash_default_eq, - std::allocator>, - 4, - std::mutex>; - -template -class SlowParallelMap -{ - public: - V& at(const K& key) { return map.at(key); } - - V& getOrInsert(const K& key) - { - getOrInsert(key, [key](std::unordered_map& m) { m[key]; }); - } - - V& getOrInsert( - const K& key, - const std::function)>& insertOp) - { - if (map.find(key) == map.end()) { - FullLock lock(mx); - if (map.find(key) == map.end()) { - insertOp(); - } - } - - { - SharedLock lock(mx); - return map.at(key); - } - } - - void insert(const K& key, const V& value) { map.insert(key, value); } - - void erase(const K& key) { map.erase(key); } - - void clear() { map.clear(); } - - private: - std::shared_mutex mx; - std::unordered_map map; -}; -} diff --git a/tests/test/state/test_state.cpp b/tests/test/state/test_state.cpp index 40cc5be4a..af9e7aa3b 100644 --- a/tests/test/state/test_state.cpp +++ b/tests/test/state/test_state.cpp @@ -114,6 +114,12 @@ class StateServerTestFixture std::vector actual(values.size(), 0); setDummyData(values); + // Get, with optional pull + int nMessages = 1; + if (doPull) { + nMessages = 2; + } + // Initial pull const std::shared_ptr& localKv = getLocalKv(); localKv->pull(); From ea11f7a41a246bf61059489c3f64a48b3d28e92c Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Tue, 2 Nov 2021 14:24:39 +0000 Subject: [PATCH 08/31] Add locks.cpp --- .../faabric/transport/PointToPointBroker.h | 4 +-- include/faabric/util/locks.h | 33 +++---------------- src/scheduler/Executor.cpp | 2 +- src/transport/PointToPointBroker.cpp | 4 +-- src/util/CMakeLists.txt | 1 + src/util/locks.cpp | 33 +++++++++++++++++++ 6 files changed, 44 insertions(+), 33 deletions(-) create mode 100644 src/util/locks.cpp diff --git a/include/faabric/transport/PointToPointBroker.h b/include/faabric/transport/PointToPointBroker.h index df8ea81e8..459d63543 100644 --- a/include/faabric/transport/PointToPointBroker.h +++ b/include/faabric/transport/PointToPointBroker.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include @@ -115,7 +115,7 @@ class PointToPointBroker void clear(); - void resetThreadLocalCache(bool soft=false); + void resetThreadLocalCache(bool hard = true); private: faabric::util::SystemConfig& conf; diff --git a/include/faabric/util/locks.h b/include/faabric/util/locks.h index 8f14525b3..1ab7e9fad 100644 --- a/include/faabric/util/locks.h +++ b/include/faabric/util/locks.h @@ -17,34 +17,11 @@ typedef std::shared_lock SharedLock; class FlagWaiter { public: - FlagWaiter(int timeoutMsIn = DEFAULT_FLAG_WAIT_MS) - : timeoutMs(timeoutMsIn) - {} - - void waitOnFlag() - { - // Check - if (flag.load()) { - return; - } - - // Wait for group to be enabled - UniqueLock lock(flagMx); - if (!cv.wait_for(lock, std::chrono::milliseconds(timeoutMs), [this] { - return flag.load(); - })) { - - SPDLOG_ERROR("Timed out waiting for flag"); - throw std::runtime_error("Timed out waiting for flag"); - } - } - - void setFlag(bool value) - { - UniqueLock lock(flagMx); - flag.store(value); - cv.notify_all(); - } + FlagWaiter(int timeoutMsIn = DEFAULT_FLAG_WAIT_MS); + + void waitOnFlag(); + + void setFlag(bool value); private: int timeoutMs; diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp index 53476a0fe..5159b7467 100644 --- a/src/scheduler/Executor.cpp +++ b/src/scheduler/Executor.cpp @@ -302,7 +302,7 @@ void Executor::threadPoolThread(int threadPoolIdx) // In all threads if we've finished with a ptp group, we need to tidy up // to ensure all sockets can be destructed if (msg.groupid() > 0) { - broker.resetThreadLocalCache(true); + broker.resetThreadLocalCache(false); // Remove the group from this host if last in batch if (isLastInBatch) { diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp index 6fa56debb..eecf6ed51 100644 --- a/src/transport/PointToPointBroker.cpp +++ b/src/transport/PointToPointBroker.cpp @@ -569,14 +569,14 @@ void PointToPointBroker::clear() groupFlags.clear(); } -void PointToPointBroker::resetThreadLocalCache(bool soft) +void PointToPointBroker::resetThreadLocalCache(bool hard) { SPDLOG_TRACE("Resetting point-to-point thread-local cache"); sendEndpoints.clear(); recvEndpoints.clear(); - if (!soft) { + if (hard) { clients.clear(); } } diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index bee41ed71..f7d6e1242 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -12,6 +12,7 @@ faabric_lib(util gids.cpp json.cpp latch.cpp + locks.cpp logging.cpp memory.cpp network.cpp diff --git a/src/util/locks.cpp b/src/util/locks.cpp new file mode 100644 index 000000000..14064effa --- /dev/null +++ b/src/util/locks.cpp @@ -0,0 +1,33 @@ +#include + +namespace faabric::util { + +FlagWaiter::FlagWaiter(int timeoutMsIn) + : timeoutMs(timeoutMsIn) +{} + +void FlagWaiter::waitOnFlag() +{ + // Check + if (flag.load()) { + return; + } + + // Wait for flag to be set + UniqueLock lock(flagMx); + if (!cv.wait_for(lock, std::chrono::milliseconds(timeoutMs), [this] { + return flag.load(); + })) { + + SPDLOG_ERROR("Timed out waiting for flag"); + throw std::runtime_error("Timed out waiting for flag"); + } +} + +void FlagWaiter::setFlag(bool value) +{ + UniqueLock lock(flagMx); + flag.store(value); + cv.notify_all(); +} +} From 314226fd0e6413c494928e21ad8a1470e75d25d9 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Tue, 2 Nov 2021 17:05:10 +0000 Subject: [PATCH 09/31] Only lock on group when it exists --- src/snapshot/SnapshotServer.cpp | 28 +++++++++------------------- src/util/memory.cpp | 2 ++ 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/src/snapshot/SnapshotServer.cpp b/src/snapshot/SnapshotServer.cpp index 7cb778be7..95d02f71f 100644 --- a/src/snapshot/SnapshotServer.cpp +++ b/src/snapshot/SnapshotServer.cpp @@ -71,10 +71,9 @@ std::unique_ptr SnapshotServer::recvPushSnapshot( throw std::runtime_error("Received snapshot with zero size"); } - SPDLOG_DEBUG("Receiving snapshot {} (size {}, lock {})", + SPDLOG_DEBUG("Receiving snapshot {} (size {})", r->key()->c_str(), - r->contents()->size(), - r->groupid()); + r->contents()->size()); faabric::snapshot::SnapshotRegistry& reg = faabric::snapshot::getSnapshotRegistry(); @@ -83,12 +82,6 @@ std::unique_ptr SnapshotServer::recvPushSnapshot( faabric::util::SnapshotData data; data.size = r->contents()->size(); - // Lock the function group if necessary - if (r->groupid() > 0) { - faabric::transport::PointToPointGroup::getGroup(r->groupid()) - ->localLock(); - } - // TODO - avoid this copy by changing server superclass to allow subclasses // to provide a buffer to receive data. // TODO - work out snapshot ownership here, how do we know when to delete @@ -99,12 +92,6 @@ std::unique_ptr SnapshotServer::recvPushSnapshot( reg.takeSnapshot(r->key()->str(), data, true); - // Unlock the application - if (r->groupid() > 0) { - faabric::transport::PointToPointGroup::getGroup(r->groupid()) - ->localUnlock(); - } - // Send response return std::make_unique(); } @@ -127,6 +114,7 @@ SnapshotServer::recvPushSnapshotDiffs(const uint8_t* buffer, size_t bufferSize) { const SnapshotDiffPushRequest* r = flatbuffers::GetRoot(buffer); + int groupId = r->groupid(); SPDLOG_DEBUG( "Applying {} diffs to snapshot {}", r->chunks()->size(), r->key()->str()); @@ -136,8 +124,9 @@ SnapshotServer::recvPushSnapshotDiffs(const uint8_t* buffer, size_t bufferSize) faabric::snapshot::getSnapshotRegistry(); faabric::util::SnapshotData& snap = reg.getSnapshot(r->key()->str()); - // Lock the function group - if (r->groupid() > 0) { + // Lock the function group if it exists + if (groupId > 0 && + faabric::transport::PointToPointGroup::groupExists(groupId)) { faabric::transport::PointToPointGroup::getGroup(r->groupid()) ->localLock(); } @@ -202,8 +191,9 @@ SnapshotServer::recvPushSnapshotDiffs(const uint8_t* buffer, size_t bufferSize) } } - // Unlock - if (r->groupid() > 0) { + // Unlock group if exists + if (groupId > 0 && + faabric::transport::PointToPointGroup::groupExists(groupId)) { faabric::transport::PointToPointGroup::getGroup(r->groupid()) ->localUnlock(); } diff --git a/src/util/memory.cpp b/src/util/memory.cpp index 3855aac23..de3b9c95f 100644 --- a/src/util/memory.cpp +++ b/src/util/memory.cpp @@ -82,6 +82,8 @@ AlignedChunk getPageAlignedChunk(long offset, long length) void resetDirtyTracking() { + SPDLOG_DEBUG("Resetting dirty tracking"); + FILE* fd = fopen(CLEAR_REFS, "w"); if (fd == nullptr) { SPDLOG_ERROR("Could not open clear_refs ({})", strerror(errno)); From bb5af80f4f148f34b3ef47df244db6edaeae320e Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Thu, 4 Nov 2021 13:46:54 +0000 Subject: [PATCH 10/31] Add custom merges --- include/faabric/util/snapshot.h | 126 +++++++++++++++++++++++-- src/snapshot/SnapshotServer.cpp | 2 +- src/util/snapshot.cpp | 157 +++++++++++++------------------- 3 files changed, 181 insertions(+), 104 deletions(-) diff --git a/include/faabric/util/snapshot.h b/include/faabric/util/snapshot.h index e0d195f6f..e1bb1f7b4 100644 --- a/include/faabric/util/snapshot.h +++ b/include/faabric/util/snapshot.h @@ -7,6 +7,7 @@ #include #include +#include namespace faabric::util { @@ -27,14 +28,6 @@ enum SnapshotMergeOperation Min }; -struct SnapshotMergeRegion -{ - uint32_t offset = 0; - size_t length = 0; - SnapshotDataType dataType = SnapshotDataType::Raw; - SnapshotMergeOperation operation = SnapshotMergeOperation::Overwrite; -}; - class SnapshotDiff { public: @@ -44,6 +37,8 @@ class SnapshotDiff size_t size = 0; const uint8_t* data = nullptr; + bool noChange = false; + SnapshotDiff() = default; SnapshotDiff(SnapshotDataType dataTypeIn, @@ -67,6 +62,105 @@ class SnapshotDiff } }; +class SnapshotMergeRegion +{ + public: + uint32_t offset = 0; + size_t length = 0; + SnapshotDataType dataType = SnapshotDataType::Raw; + SnapshotMergeOperation operation = SnapshotMergeOperation::Overwrite; + + SnapshotDiff toDiff(const uint8_t* original, const uint8_t* updated) + { + const uint8_t* updatedValue = updated + offset; + const uint8_t* originalValue = original + offset; + + SnapshotDiff diff(offset, updatedValue, length); + diff.dataType = dataType; + diff.operation = operation; + + // Modify diff data for certain operations + switch (dataType) { + case (SnapshotDataType::Int): { + int originalInt = + *(reinterpret_cast(originalValue)); + int updatedInt = *(reinterpret_cast(updatedValue)); + + if (originalInt == updatedInt) { + diff.size = 0; + diff.noChange = true; + return diff; + } + + switch (operation) { + case (SnapshotMergeOperation::Sum): { + // Sums must send the value to be _added_, and + // not the final result + updatedInt -= originalInt; + break; + } + case (SnapshotMergeOperation::Subtract): { + // Subtractions must send the value to be + // subtracted, not the result + updatedInt = originalInt - updatedInt; + break; + } + case (SnapshotMergeOperation::Product): { + // Products must send the value to be + // multiplied, not the result + updatedInt /= originalInt; + break; + } + case (SnapshotMergeOperation::Max): + case (SnapshotMergeOperation::Min): + // Min and max don't need to change + break; + default: { + SPDLOG_ERROR("Unhandled integer merge operation: {}", + operation); + throw std::runtime_error( + "Unhandled integer merge operation"); + } + } + + // TODO - somehow avoid casting away the const here? + // Modify the memory in-place here + std::memcpy( + (uint8_t*)updatedValue, BYTES(&updatedInt), sizeof(int32_t)); + + break; + } + case (SnapshotDataType::Raw): { + switch (operation) { + case (SnapshotMergeOperation::Ignore): { + break; + } + case (SnapshotMergeOperation::Overwrite): { + // TODO - how can we make this fine-grained? + // Default behaviour + break; + } + default: { + SPDLOG_ERROR("Unhandled raw merge operation: {}", + operation); + throw std::runtime_error( + "Unhandled raw merge operation"); + } + } + + break; + } + default: { + SPDLOG_ERROR("Merge region for unhandled data type: {}", + dataType); + throw std::runtime_error( + "Merge region for unhandled data type"); + } + } + return diff; + } +}; + class SnapshotData { public: @@ -86,10 +180,22 @@ class SnapshotData SnapshotDataType dataType, SnapshotMergeOperation operation); + void setIsCustomMerge(bool value); + + bool isCustomMerge(); + private: - // Note - we care about the order of this map, as we iterate through it in - // order of offsets + // Note - we care about the order of this map, as we iterate through it + // in order of offsets std::map mergeRegions; + + bool _isCustomMerge = false; + + std::vector getCustomDiffs(const uint8_t* updated, + size_t updatedSize); + + std::vector getStandardDiffs(const uint8_t* updated, + size_t updatedSize); }; std::string snapshotDataTypeStr(SnapshotDataType dt); diff --git a/src/snapshot/SnapshotServer.cpp b/src/snapshot/SnapshotServer.cpp index 95d02f71f..fe0df2e5f 100644 --- a/src/snapshot/SnapshotServer.cpp +++ b/src/snapshot/SnapshotServer.cpp @@ -131,7 +131,7 @@ SnapshotServer::recvPushSnapshotDiffs(const uint8_t* buffer, size_t bufferSize) ->localLock(); } - // Apply diffs to snapshot + // Iterate through the chunks passed in the request for (const auto* r : *r->chunks()) { uint8_t* dest = snap.data + r->offset(); switch (r->dataType()) { diff --git a/src/util/snapshot.cpp b/src/util/snapshot.cpp index f49e99a55..ae0c2cab3 100644 --- a/src/util/snapshot.cpp +++ b/src/util/snapshot.cpp @@ -36,6 +36,57 @@ std::vector SnapshotData::getDirtyPages() std::vector SnapshotData::getChangeDiffs(const uint8_t* updated, size_t updatedSize) +{ + for (auto& m : mergeRegions) { + SPDLOG_TRACE("{} {} merge region at {} {}-{}", + snapshotDataTypeStr(m.second.dataType), + snapshotMergeOpStr(m.second.operation), + m.first, + m.second.offset, + m.second.offset + m.second.length); + } + + std::vector diffs; + if (_isCustomMerge) { + diffs = getCustomDiffs(updated, updatedSize); + } else { + diffs = getStandardDiffs(updated, updatedSize); + } + + return diffs; +} + +std::vector SnapshotData::getCustomDiffs(const uint8_t* updated, + size_t updatedSize) +{ + SPDLOG_TRACE("Doing custom diff with {} merge regions", + mergeRegions.size()); + + // Here we ignore all diffs except for those specified in merge regions + std::vector diffs; + for (auto& mergePair : mergeRegions) { + // Get the diff for this merge region + SnapshotMergeRegion region = mergePair.second; + SnapshotDiff diff = region.toDiff(data, updated); + + if (diff.noChange) { + SPDLOG_TRACE("No diff for {} {} merge region at {} {}-{}", + snapshotDataTypeStr(mergePair.second.dataType), + snapshotMergeOpStr(mergePair.second.operation), + mergePair.first, + mergePair.second.offset, + mergePair.second.offset + mergePair.second.length); + continue; + } + + diffs.push_back(diff); + } + + return diffs; +} + +std::vector SnapshotData::getStandardDiffs(const uint8_t* updated, + size_t updatedSize) { // Work out which pages have changed size_t nThisPages = getRequiredHostPages(size); @@ -47,15 +98,6 @@ std::vector SnapshotData::getChangeDiffs(const uint8_t* updated, dirtyPageNumbers.size(), mergeRegions.size()); - for (auto& m : mergeRegions) { - SPDLOG_TRACE("{} {} merge region at {} {}-{}", - snapshotDataTypeStr(m.second.dataType), - snapshotMergeOpStr(m.second.operation), - m.first, - m.second.offset, - m.second.offset + m.second.length); - } - // Get iterator over merge regions std::map::iterator mergeIt = mergeRegions.begin(); @@ -130,91 +172,9 @@ std::vector SnapshotData::getChangeDiffs(const uint8_t* updated, diffInProgress = false; } + // Get the diff for this merge region SnapshotMergeRegion region = mergeIt->second; - - // Set up the diff - const uint8_t* updatedValue = updated + region.offset; - const uint8_t* originalValue = data + region.offset; - - SnapshotDiff diff(region.offset, updatedValue, region.length); - diff.dataType = region.dataType; - diff.operation = region.operation; - - // Modify diff data for certain operations - switch (region.dataType) { - case (SnapshotDataType::Int): { - int originalInt = - *(reinterpret_cast(originalValue)); - int updatedInt = - *(reinterpret_cast(updatedValue)); - - switch (region.operation) { - case (SnapshotMergeOperation::Sum): { - // Sums must send the value to be _added_, and - // not the final result - updatedInt -= originalInt; - break; - } - case (SnapshotMergeOperation::Subtract): { - // Subtractions must send the value to be - // subtracted, not the result - updatedInt = originalInt - updatedInt; - break; - } - case (SnapshotMergeOperation::Product): { - // Products must send the value to be - // multiplied, not the result - updatedInt /= originalInt; - break; - } - case (SnapshotMergeOperation::Max): - case (SnapshotMergeOperation::Min): - // Min and max don't need to change - break; - default: { - SPDLOG_ERROR( - "Unhandled integer merge operation: {}", - region.operation); - throw std::runtime_error( - "Unhandled integer merge operation"); - } - } - - // TODO - somehow avoid casting away the const here? - // Modify the memory in-place here - std::memcpy((uint8_t*)updatedValue, - BYTES(&updatedInt), - sizeof(int32_t)); - - break; - } - case (SnapshotDataType::Raw): { - switch (region.operation) { - case (SnapshotMergeOperation::Ignore): { - break; - } - case (SnapshotMergeOperation::Overwrite): { - // Default behaviour - break; - } - default: { - SPDLOG_ERROR( - "Unhandled raw merge operation: {}", - region.operation); - throw std::runtime_error( - "Unhandled raw merge operation"); - } - } - - break; - } - default: { - SPDLOG_ERROR("Merge region for unhandled data type: {}", - region.dataType); - throw std::runtime_error( - "Merge region for unhandled data type"); - } - } + SnapshotDiff diff = region.toDiff(data, updated); SPDLOG_TRACE("Diff at {} falls in {} {} merge region {}-{}", pageOffset + b, @@ -314,11 +274,22 @@ void SnapshotData::addMergeRegion(uint32_t offset, .length = length, .dataType = dataType, .operation = operation }; + // Locking as this may be called in bursts by multiple threads faabric::util::UniqueLock lock(snapMx); mergeRegions[offset] = region; } +void SnapshotData::setIsCustomMerge(bool value) +{ + _isCustomMerge = value; +} + +bool SnapshotData::isCustomMerge() +{ + return _isCustomMerge; +} + std::string snapshotDataTypeStr(SnapshotDataType dt) { switch (dt) { From 2cd38e019c361fd92983a53b6f6a7d64b39385a0 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Thu, 4 Nov 2021 18:09:17 +0000 Subject: [PATCH 11/31] Overhaul of snapshot diffing --- include/faabric/util/snapshot.h | 98 +------- src/util/snapshot.cpp | 395 +++++++++++++++----------------- 2 files changed, 184 insertions(+), 309 deletions(-) diff --git a/include/faabric/util/snapshot.h b/include/faabric/util/snapshot.h index e1bb1f7b4..725eaeb25 100644 --- a/include/faabric/util/snapshot.h +++ b/include/faabric/util/snapshot.h @@ -70,95 +70,9 @@ class SnapshotMergeRegion SnapshotDataType dataType = SnapshotDataType::Raw; SnapshotMergeOperation operation = SnapshotMergeOperation::Overwrite; - SnapshotDiff toDiff(const uint8_t* original, const uint8_t* updated) - { - const uint8_t* updatedValue = updated + offset; - const uint8_t* originalValue = original + offset; - - SnapshotDiff diff(offset, updatedValue, length); - diff.dataType = dataType; - diff.operation = operation; - - // Modify diff data for certain operations - switch (dataType) { - case (SnapshotDataType::Int): { - int originalInt = - *(reinterpret_cast(originalValue)); - int updatedInt = *(reinterpret_cast(updatedValue)); - - if (originalInt == updatedInt) { - diff.size = 0; - diff.noChange = true; - return diff; - } - - switch (operation) { - case (SnapshotMergeOperation::Sum): { - // Sums must send the value to be _added_, and - // not the final result - updatedInt -= originalInt; - break; - } - case (SnapshotMergeOperation::Subtract): { - // Subtractions must send the value to be - // subtracted, not the result - updatedInt = originalInt - updatedInt; - break; - } - case (SnapshotMergeOperation::Product): { - // Products must send the value to be - // multiplied, not the result - updatedInt /= originalInt; - break; - } - case (SnapshotMergeOperation::Max): - case (SnapshotMergeOperation::Min): - // Min and max don't need to change - break; - default: { - SPDLOG_ERROR("Unhandled integer merge operation: {}", - operation); - throw std::runtime_error( - "Unhandled integer merge operation"); - } - } - - // TODO - somehow avoid casting away the const here? - // Modify the memory in-place here - std::memcpy( - (uint8_t*)updatedValue, BYTES(&updatedInt), sizeof(int32_t)); - - break; - } - case (SnapshotDataType::Raw): { - switch (operation) { - case (SnapshotMergeOperation::Ignore): { - break; - } - case (SnapshotMergeOperation::Overwrite): { - // TODO - how can we make this fine-grained? - // Default behaviour - break; - } - default: { - SPDLOG_ERROR("Unhandled raw merge operation: {}", - operation); - throw std::runtime_error( - "Unhandled raw merge operation"); - } - } - - break; - } - default: { - SPDLOG_ERROR("Merge region for unhandled data type: {}", - dataType); - throw std::runtime_error( - "Merge region for unhandled data type"); - } - } - return diff; - } + void addDiffs(std::vector& diffs, + const uint8_t* original, + const uint8_t* updated); }; class SnapshotData @@ -180,17 +94,13 @@ class SnapshotData SnapshotDataType dataType, SnapshotMergeOperation operation); - void setIsCustomMerge(bool value); - - bool isCustomMerge(); + void addMergeRegion(SnapshotMergeRegion& region); private: // Note - we care about the order of this map, as we iterate through it // in order of offsets std::map mergeRegions; - bool _isCustomMerge = false; - std::vector getCustomDiffs(const uint8_t* updated, size_t updatedSize); diff --git a/src/util/snapshot.cpp b/src/util/snapshot.cpp index ae0c2cab3..e5a2751bf 100644 --- a/src/util/snapshot.cpp +++ b/src/util/snapshot.cpp @@ -37,229 +37,63 @@ std::vector SnapshotData::getDirtyPages() std::vector SnapshotData::getChangeDiffs(const uint8_t* updated, size_t updatedSize) { - for (auto& m : mergeRegions) { - SPDLOG_TRACE("{} {} merge region at {} {}-{}", - snapshotDataTypeStr(m.second.dataType), - snapshotMergeOpStr(m.second.operation), - m.first, - m.second.offset, - m.second.offset + m.second.length); - } - - std::vector diffs; - if (_isCustomMerge) { - diffs = getCustomDiffs(updated, updatedSize); - } else { - diffs = getStandardDiffs(updated, updatedSize); - } - - return diffs; -} - -std::vector SnapshotData::getCustomDiffs(const uint8_t* updated, - size_t updatedSize) -{ - SPDLOG_TRACE("Doing custom diff with {} merge regions", - mergeRegions.size()); - - // Here we ignore all diffs except for those specified in merge regions std::vector diffs; - for (auto& mergePair : mergeRegions) { - // Get the diff for this merge region - SnapshotMergeRegion region = mergePair.second; - SnapshotDiff diff = region.toDiff(data, updated); - - if (diff.noChange) { - SPDLOG_TRACE("No diff for {} {} merge region at {} {}-{}", - snapshotDataTypeStr(mergePair.second.dataType), - snapshotMergeOpStr(mergePair.second.operation), - mergePair.first, - mergePair.second.offset, - mergePair.second.offset + mergePair.second.length); - continue; - } - - diffs.push_back(diff); + SPDLOG_DEBUG("No merge regions set, thus no diffs"); + if (mergeRegions.empty()) { + return diffs; } - return diffs; -} - -std::vector SnapshotData::getStandardDiffs(const uint8_t* updated, - size_t updatedSize) -{ - // Work out which pages have changed + // Work out which pages have changed (these will be sorted) size_t nThisPages = getRequiredHostPages(size); std::vector dirtyPageNumbers = getDirtyPageNumbers(updated, nThisPages); - SPDLOG_TRACE("Diffing {} pages with {} changed pages and {} merge regions", - nThisPages, - dirtyPageNumbers.size(), - mergeRegions.size()); - - // Get iterator over merge regions + // Iterate through each dirty page, work out if there's an overlapping merge + // region, tell that region to add their diffs to the list std::map::iterator mergeIt = mergeRegions.begin(); - // Get byte-wise diffs _within_ the dirty pages - // - // NOTE - if raw diffs cover page boundaries, they will be split into - // multiple diffs, each of which is page-aligned. - // We can be relatively confident that variables will be page-aligned so - // this shouldn't be a problem. - // - // Merge regions support crossing page boundaries. - // - // For each byte we encounter have the following possible scenarios: - // - // 1. the byte is dirty, and is the start of a new diff - // 2. the byte is dirty, but the byte before was also dirty, so we - // are inside a diff - // 3. the byte is not dirty but the previous one was, so we've reached the - // end of a diff - // 4. the last byte of the page is dirty, so we've also come to the end of - // a diff - // 5. the byte is dirty, but is within a special merge region, in which - // case we need to add a diff for that whole region, then skip - // to the next byte after that region - std::vector diffs; for (int i : dirtyPageNumbers) { - int pageOffset = i * HOST_PAGE_SIZE; - - bool diffInProgress = false; - int diffStart = 0; - int offset = pageOffset; - for (int b = 0; b < HOST_PAGE_SIZE; b++) { - offset = pageOffset + b; - bool isDirtyByte = *(data + offset) != *(updated + offset); - - // Skip any merge regions we've passed - while (mergeIt != mergeRegions.end() && - offset >= - (mergeIt->second.offset + mergeIt->second.length)) { - SnapshotMergeRegion region = mergeIt->second; - SPDLOG_TRACE("At offset {}, past region {} {} {}-{}", - offset, - snapshotDataTypeStr(region.dataType), - snapshotMergeOpStr(region.operation), - region.offset, - region.offset + region.length); - - ++mergeIt; - } + int pageStart = i * HOST_PAGE_SIZE; + int pageEnd = pageStart + HOST_PAGE_SIZE; - // Check if we're in a merge region - bool isInMergeRegion = - mergeIt != mergeRegions.end() && - offset >= mergeIt->second.offset && - offset < (mergeIt->second.offset + mergeIt->second.length); - - if (isDirtyByte && isInMergeRegion) { - // If we've entered a merge region with a diff in progress, we - // need to close it off - if (diffInProgress) { - diffs.emplace_back( - diffStart, updated + diffStart, offset - diffStart); - - SPDLOG_TRACE( - "Finished {} {} diff between {}-{} before merge region", - snapshotDataTypeStr(diffs.back().dataType), - snapshotMergeOpStr(diffs.back().operation), - diffs.back().offset, - diffs.back().offset + diffs.back().size); - - diffInProgress = false; - } + SPDLOG_TRACE("Checking dirty page {} at {}-{}", i, pageStart, pageEnd); - // Get the diff for this merge region - SnapshotMergeRegion region = mergeIt->second; - SnapshotDiff diff = region.toDiff(data, updated); + // Skip any merge regions we've passed + while (mergeIt != mergeRegions.end() && + (mergeIt->second.offset < pageStart)) { + SPDLOG_TRACE("Gone past {} {} merge region at {}-{}", + snapshotDataTypeStr(mergeIt->second.dataType), + snapshotMergeOpStr(mergeIt->second.operation), + mergeIt->second.offset, + mergeIt->second.offset + mergeIt->second.length); - SPDLOG_TRACE("Diff at {} falls in {} {} merge region {}-{}", - pageOffset + b, - snapshotDataTypeStr(region.dataType), - snapshotMergeOpStr(region.operation), - region.offset, - region.offset + region.length); - - // Add the diff to the list - if (diff.operation != SnapshotMergeOperation::Ignore) { - diffs.emplace_back(diff); - } - - // Work out the offset where this region ends - int regionEndOffset = - (region.offset - pageOffset) + region.length; - - if (regionEndOffset < HOST_PAGE_SIZE) { - // Skip over this region, still more offsets left in this - // page - SPDLOG_TRACE( - "{} {} merge region {}-{} finished. Skipping to {}", - snapshotDataTypeStr(region.dataType), - snapshotMergeOpStr(region.operation), - region.offset, - region.offset + region.length, - pageOffset + regionEndOffset); - - // Bump the loop variable to the end of this region (note - // that the loop itself will increment onto the next). - b = regionEndOffset - 1; - } else { - // Merge region extends over this page, move onto next - SPDLOG_TRACE( - "{} {} merge region {}-{} over page boundary {} ({}-{})", - snapshotDataTypeStr(region.dataType), - snapshotMergeOpStr(region.operation), - region.offset, - region.offset + region.length, - i, - pageOffset, - pageOffset + HOST_PAGE_SIZE); - - break; - } - } else if (isDirtyByte && !diffInProgress) { - // Diff starts here if it's different and diff not in progress - diffInProgress = true; - diffStart = offset; - - SPDLOG_TRACE("Started Raw Overwrite diff at {}", diffStart); - } else if (!isDirtyByte && diffInProgress) { - // Diff ends if it's not different and diff is in progress - diffInProgress = false; - diffs.emplace_back( - diffStart, updated + diffStart, offset - diffStart); - - SPDLOG_TRACE("Finished {} {} diff between {}-{}", - snapshotDataTypeStr(diffs.back().dataType), - snapshotMergeOpStr(diffs.back().operation), - diffs.back().offset, - diffs.back().offset + diffs.back().size); - } + ++mergeIt; } - // If we've reached the end of this page with a diff in progress, we - // need to close it off - if (diffInProgress) { - offset++; - - diffs.emplace_back( - diffStart, updated + diffStart, offset - diffStart); - - SPDLOG_TRACE("Found {} {} diff between {}-{} at end of page", - snapshotDataTypeStr(diffs.back().dataType), - snapshotMergeOpStr(diffs.back().operation), - diffs.back().offset, - diffs.back().offset + diffs.back().size); + if (mergeIt == mergeRegions.end()) { + // Done if no more merge regions left + SPDLOG_TRACE("No more merge regions left"); + break; } - } - // If comparison has more pages than the original, add another diff - // containing all the new pages - if (updatedSize > size) { - diffs.emplace_back(size, updated + size, updatedSize - size); + // For each merge region that overlaps this dirty page, get it to add + // its diffs, and move onto the next one + // TODO - make this more efficient by passing in dirty pages to merge + // regions + while (mergeIt != mergeRegions.end() && + ((mergeIt->second.offset > pageStart) && + (mergeIt->second.offset + mergeIt->second.length) < pageEnd)) { + + SPDLOG_TRACE("Overlap with {} {} merge region at {}-{}", + snapshotDataTypeStr(mergeIt->second.dataType), + snapshotMergeOpStr(mergeIt->second.operation), + mergeIt->second.offset, + mergeIt->second.offset + mergeIt->second.length); + + mergeIt->second.addDiffs(diffs, data, updated); + mergeIt++; + } } return diffs; @@ -275,19 +109,14 @@ void SnapshotData::addMergeRegion(uint32_t offset, .dataType = dataType, .operation = operation }; - // Locking as this may be called in bursts by multiple threads - faabric::util::UniqueLock lock(snapMx); - mergeRegions[offset] = region; -} - -void SnapshotData::setIsCustomMerge(bool value) -{ - _isCustomMerge = value; + addMergeRegion(region); } -bool SnapshotData::isCustomMerge() +void SnapshotData::addMergeRegion(SnapshotMergeRegion& region) { - return _isCustomMerge; + // Locking as this may be called in bursts by multiple threads + faabric::util::UniqueLock lock(snapMx); + mergeRegions[region.offset] = region; } std::string snapshotDataTypeStr(SnapshotDataType dt) @@ -336,4 +165,140 @@ std::string snapshotMergeOpStr(SnapshotMergeOperation op) } } } + +void SnapshotMergeRegion::addDiffs(std::vector& diffs, + const uint8_t* original, + const uint8_t* updated) +{ + // Skip an ignore + if (operation == SnapshotMergeOperation::Ignore) { + return; + } + + const uint8_t* updatedValue = updated + offset; + const uint8_t* originalValue = original + offset; + + switch (dataType) { + case (SnapshotDataType::Int): { + // Get the original and update values + int originalInt = *(reinterpret_cast(originalValue)); + int updatedInt = *(reinterpret_cast(updatedValue)); + + // Skip if no change + if (originalInt == updatedInt) { + return; + } + + // Add the diff + SnapshotDiff diff(offset, updatedValue, length); + diff.dataType = dataType; + diff.operation = operation; + diffs.emplace_back(diff); + + SPDLOG_TRACE("Adding {} {} diff at {}-{}", + snapshotDataTypeStr(dataType), + snapshotMergeOpStr(operation), + offset, + offset + length); + + // Potentially modify the original in place depending on the + // operation + switch (operation) { + case (SnapshotMergeOperation::Sum): { + // Sums must send the value to be _added_, and + // not the final result + updatedInt -= originalInt; + break; + } + case (SnapshotMergeOperation::Subtract): { + // Subtractions must send the value to be + // subtracted, not the result + updatedInt = originalInt - updatedInt; + break; + } + case (SnapshotMergeOperation::Product): { + // Products must send the value to be + // multiplied, not the result + updatedInt /= originalInt; + break; + } + case (SnapshotMergeOperation::Max): + case (SnapshotMergeOperation::Min): + // Min and max don't need to change + break; + default: { + SPDLOG_ERROR("Unhandled integer merge operation: {}", + operation); + throw std::runtime_error( + "Unhandled integer merge operation"); + } + } + + // TODO - somehow avoid casting away the const here? + // Modify the memory in-place here + std::memcpy( + (uint8_t*)updatedValue, BYTES(&updatedInt), sizeof(int32_t)); + + break; + } + case (SnapshotDataType::Raw): { + switch (operation) { + case (SnapshotMergeOperation::Overwrite): { + // Add subsections of diffs only for the bytes that + // have changed + bool diffInProgress = false; + int diffStart = 0; + for (int b = 0; b < length; b++) { + bool isDirtyByte = + *(original + offset) != *(updated + offset); + + if (isDirtyByte && !diffInProgress) { + // Diff starts here if it's different and diff + // not in progress + diffInProgress = true; + diffStart = b; + } else if (!isDirtyByte && diffInProgress) { + // Diff ends if it's not different and diff is + // in progress + SPDLOG_TRACE("Adding {} {} diff at {}-{}", + snapshotDataTypeStr(dataType), + snapshotMergeOpStr(operation), + offset, + offset + length); + + diffInProgress = false; + diffs.emplace_back( + diffStart, updated + diffStart, b - diffStart); + } + } + + // If we've reached the end of this region with a diff + // in progress, we need to close it off + if (diffInProgress) { + SPDLOG_TRACE("Adding {} {} diff at {}-{}", + snapshotDataTypeStr(dataType), + snapshotMergeOpStr(operation), + offset, + offset + length); + + diffs.emplace_back(diffStart, + updated + diffStart, + (length + 1) - diffStart); + } + } + default: { + SPDLOG_ERROR("Unhandled raw merge operation: {}", + operation); + throw std::runtime_error("Unhandled raw merge operation"); + } + } + + break; + } + default: { + SPDLOG_ERROR("Merge region for unhandled data type: {}", dataType); + throw std::runtime_error("Merge region for unhandled data type"); + } + } +} } From db284929dd2d0eb436ff1355189099e39d2e0edf Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Thu, 4 Nov 2021 18:34:00 +0000 Subject: [PATCH 12/31] Move spurious logging statement --- src/util/snapshot.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/snapshot.cpp b/src/util/snapshot.cpp index e5a2751bf..89140947a 100644 --- a/src/util/snapshot.cpp +++ b/src/util/snapshot.cpp @@ -38,8 +38,8 @@ std::vector SnapshotData::getChangeDiffs(const uint8_t* updated, size_t updatedSize) { std::vector diffs; - SPDLOG_DEBUG("No merge regions set, thus no diffs"); if (mergeRegions.empty()) { + SPDLOG_DEBUG("No merge regions set, thus no diffs"); return diffs; } From 8ce837535cb3034f452ba862defe00bcb5986302 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Fri, 5 Nov 2021 12:14:25 +0000 Subject: [PATCH 13/31] More snapshotting fixes --- include/faabric/util/snapshot.h | 12 +---- src/snapshot/SnapshotServer.cpp | 27 +++++++----- src/util/snapshot.cpp | 77 ++++++++++++++++++++++++--------- 3 files changed, 75 insertions(+), 41 deletions(-) diff --git a/include/faabric/util/snapshot.h b/include/faabric/util/snapshot.h index 725eaeb25..a005dbae3 100644 --- a/include/faabric/util/snapshot.h +++ b/include/faabric/util/snapshot.h @@ -53,13 +53,6 @@ class SnapshotDiff data = dataIn; size = sizeIn; } - - SnapshotDiff(uint32_t offsetIn, const uint8_t* dataIn, size_t sizeIn) - { - offset = offsetIn; - data = dataIn; - size = sizeIn; - } }; class SnapshotMergeRegion @@ -92,9 +85,8 @@ class SnapshotData void addMergeRegion(uint32_t offset, size_t length, SnapshotDataType dataType, - SnapshotMergeOperation operation); - - void addMergeRegion(SnapshotMergeRegion& region); + SnapshotMergeOperation operation, + bool overwrite = false); private: // Note - we care about the order of this map, as we iterate through it diff --git a/src/snapshot/SnapshotServer.cpp b/src/snapshot/SnapshotServer.cpp index fe0df2e5f..bc6783783 100644 --- a/src/snapshot/SnapshotServer.cpp +++ b/src/snapshot/SnapshotServer.cpp @@ -132,18 +132,25 @@ SnapshotServer::recvPushSnapshotDiffs(const uint8_t* buffer, size_t bufferSize) } // Iterate through the chunks passed in the request - for (const auto* r : *r->chunks()) { - uint8_t* dest = snap.data + r->offset(); - switch (r->dataType()) { + for (const auto* chunk : *r->chunks()) { + uint8_t* dest = snap.data + chunk->offset(); + + SPDLOG_TRACE("Applying snapshot diff to {} at {}-{}", + r->key()->str(), + chunk->offset(), + chunk->offset() + chunk->data()->size()); + + switch (chunk->dataType()) { case (faabric::util::SnapshotDataType::Raw): { - switch (r->mergeOp()) { + switch (chunk->mergeOp()) { case (faabric::util::SnapshotMergeOperation::Overwrite): { - std::memcpy(dest, r->data()->data(), r->data()->size()); + std::memcpy( + dest, chunk->data()->data(), chunk->data()->size()); break; } default: { SPDLOG_ERROR("Unsupported raw merge operation: {}", - r->mergeOp()); + chunk->mergeOp()); throw std::runtime_error( "Unsupported raw merge operation"); } @@ -152,9 +159,9 @@ SnapshotServer::recvPushSnapshotDiffs(const uint8_t* buffer, size_t bufferSize) } case (faabric::util::SnapshotDataType::Int): { const auto* value = - reinterpret_cast(r->data()->data()); + reinterpret_cast(chunk->data()->data()); auto* destValue = reinterpret_cast(dest); - switch (r->mergeOp()) { + switch (chunk->mergeOp()) { case (faabric::util::SnapshotMergeOperation::Sum): { *destValue += *value; break; @@ -177,7 +184,7 @@ SnapshotServer::recvPushSnapshotDiffs(const uint8_t* buffer, size_t bufferSize) } default: { SPDLOG_ERROR("Unsupported int merge operation: {}", - r->mergeOp()); + chunk->mergeOp()); throw std::runtime_error( "Unsupported int merge operation"); } @@ -185,7 +192,7 @@ SnapshotServer::recvPushSnapshotDiffs(const uint8_t* buffer, size_t bufferSize) break; } default: { - SPDLOG_ERROR("Unsupported data type: {}", r->dataType()); + SPDLOG_ERROR("Unsupported data type: {}", chunk->dataType()); throw std::runtime_error("Unsupported merge data type"); } } diff --git a/src/util/snapshot.cpp b/src/util/snapshot.cpp index 89140947a..3b84796a9 100644 --- a/src/util/snapshot.cpp +++ b/src/util/snapshot.cpp @@ -26,7 +26,11 @@ std::vector SnapshotData::getDirtyPages() std::vector diffs; for (int i : dirtyPageNumbers) { uint32_t offset = i * HOST_PAGE_SIZE; - diffs.emplace_back(offset, data + offset, HOST_PAGE_SIZE); + diffs.emplace_back(SnapshotDataType::Raw, + SnapshotMergeOperation::Overwrite, + offset, + data + offset, + HOST_PAGE_SIZE); } SPDLOG_DEBUG("Snapshot has {}/{} dirty pages", diffs.size(), nPages); @@ -43,6 +47,14 @@ std::vector SnapshotData::getChangeDiffs(const uint8_t* updated, return diffs; } + for (const auto& mr : mergeRegions) { + SPDLOG_TRACE("Merge region {} {} at {}-{}", + snapshotDataTypeStr(mr.second.dataType), + snapshotMergeOpStr(mr.second.operation), + mr.second.offset, + mr.second.offset + mr.second.length); + } + // Work out which pages have changed (these will be sorted) size_t nThisPages = getRequiredHostPages(size); std::vector dirtyPageNumbers = @@ -82,8 +94,8 @@ std::vector SnapshotData::getChangeDiffs(const uint8_t* updated, // TODO - make this more efficient by passing in dirty pages to merge // regions while (mergeIt != mergeRegions.end() && - ((mergeIt->second.offset > pageStart) && - (mergeIt->second.offset + mergeIt->second.length) < pageEnd)) { + (mergeIt->second.offset > pageStart && + mergeIt->second.offset < pageEnd)) { SPDLOG_TRACE("Overlap with {} {} merge region at {}-{}", snapshotDataTypeStr(mergeIt->second.dataType), @@ -102,20 +114,39 @@ std::vector SnapshotData::getChangeDiffs(const uint8_t* updated, void SnapshotData::addMergeRegion(uint32_t offset, size_t length, SnapshotDataType dataType, - SnapshotMergeOperation operation) + SnapshotMergeOperation operation, + bool overwrite) { SnapshotMergeRegion region{ .offset = offset, .length = length, .dataType = dataType, .operation = operation }; - addMergeRegion(region); -} - -void SnapshotData::addMergeRegion(SnapshotMergeRegion& region) -{ // Locking as this may be called in bursts by multiple threads faabric::util::UniqueLock lock(snapMx); + + if (mergeRegions.find(region.offset) != mergeRegions.end()) { + if (!overwrite) { + SPDLOG_ERROR("Attempting to overwrite existing merge region at {} " + "with {} {} at {}-{}", + region.offset, + snapshotDataTypeStr(dataType), + snapshotMergeOpStr(operation), + region.offset, + region.offset + length); + + throw std::runtime_error("Not able to overwrite merge region"); + } + + SPDLOG_TRACE( + "Overwriting existing merge region at {} with {} {} at {}-{}", + region.offset, + snapshotDataTypeStr(dataType), + snapshotMergeOpStr(operation), + region.offset, + region.offset + length); + } + mergeRegions[region.offset] = region; } @@ -190,10 +221,8 @@ void SnapshotMergeRegion::addDiffs(std::vector& diffs, } // Add the diff - SnapshotDiff diff(offset, updatedValue, length); - diff.dataType = dataType; - diff.operation = operation; - diffs.emplace_back(diff); + diffs.emplace_back( + dataType, operation, offset, updatedValue, length); SPDLOG_TRACE("Adding {} {} diff at {}-{}", snapshotDataTypeStr(dataType), @@ -248,9 +277,8 @@ void SnapshotMergeRegion::addDiffs(std::vector& diffs, // have changed bool diffInProgress = false; int diffStart = 0; - for (int b = 0; b < length; b++) { - bool isDirtyByte = - *(original + offset) != *(updated + offset); + for (int b = offset; b < offset + length; b++) { + bool isDirtyByte = *(original + b) != *(updated + b); if (isDirtyByte && !diffInProgress) { // Diff starts here if it's different and diff @@ -260,15 +288,19 @@ void SnapshotMergeRegion::addDiffs(std::vector& diffs, } else if (!isDirtyByte && diffInProgress) { // Diff ends if it's not different and diff is // in progress + int diffLength = b - diffStart; SPDLOG_TRACE("Adding {} {} diff at {}-{}", snapshotDataTypeStr(dataType), snapshotMergeOpStr(operation), - offset, - offset + length); + offset + diffStart, + offset + diffStart + diffLength); diffInProgress = false; - diffs.emplace_back( - diffStart, updated + diffStart, b - diffStart); + diffs.emplace_back(dataType, + operation, + diffStart, + updated + diffStart, + diffLength); } } @@ -281,10 +313,13 @@ void SnapshotMergeRegion::addDiffs(std::vector& diffs, offset, offset + length); - diffs.emplace_back(diffStart, + diffs.emplace_back(dataType, + operation, + diffStart, updated + diffStart, (length + 1) - diffStart); } + break; } default: { SPDLOG_ERROR("Unhandled raw merge operation: {}", From 627209b48b6afbc3503bd84c1f2306ae0c42b7d7 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Fri, 5 Nov 2021 14:50:42 +0000 Subject: [PATCH 14/31] Rearrange scheduler logic --- include/faabric/scheduler/Scheduler.h | 9 +- src/scheduler/Scheduler.cpp | 348 +++++++++++++------------- src/snapshot/SnapshotServer.cpp | 3 + 3 files changed, 184 insertions(+), 176 deletions(-) diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index dc7b070d2..c76c6d5f3 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -205,6 +205,8 @@ class Scheduler std::promise>> localResults; + std::unordered_map> pushedSnapshotsMap; + std::mutex localResultsMutex; // ---- Clients ---- @@ -234,13 +236,6 @@ class Scheduler std::vector getUnregisteredHosts(const std::string& funcStr, bool noCache = false); - int scheduleFunctionsOnHost( - const std::string& host, - std::shared_ptr req, - faabric::util::SchedulingDecision& decision, - int offset, - faabric::util::SnapshotData* snapshot); - // ---- Accounting and debugging ---- std::vector recordedMessagesAll; std::vector recordedMessagesLocal; diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 263ac7d87..f1fd4cc37 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -224,15 +224,12 @@ faabric::util::SchedulingDecision Scheduler::callFunctions( throw std::runtime_error("Message with no master host"); } - // Set up scheduling decision - SchedulingDecision decision(firstMsg.appid(), firstMsg.groupid()); - // TODO - more granular locking, this is conservative faabric::util::FullLock lock(mx); // If we're not the master host, we need to forward the request back to the // master host. This will only happen if a nested batch execution happens. - std::vector localMessageIdxs; + SchedulingDecision decision(firstMsg.appid(), firstMsg.groupid()); if (!forceLocal && masterHost != thisHost) { SPDLOG_DEBUG( "Forwarding {} {} back to master {}", nMessages, funcStr, masterHost); @@ -242,98 +239,50 @@ faabric::util::SchedulingDecision Scheduler::callFunctions( return decision; } + // ------------------------------------------- + // HOST DECISIONS + // ------------------------------------------- + std::vector hosts; if (forceLocal) { // We're forced to execute locally here so we do all the messages for (int i = 0; i < nMessages; i++) { - localMessageIdxs.emplace_back(i); - decision.addMessage(thisHost, req->messages().at(i)); + hosts.push_back(thisHost); } } else { // At this point we know we're the master host, and we've not been // asked to force full local execution. - // Get a list of other registered hosts - std::set& thisRegisteredHosts = registeredHosts[funcStr]; - - // For threads/ processes we need to have a snapshot key and be - // ready to push the snapshot to other hosts. - // We also have to broadcast the latest snapshots to all registered - // hosts, regardless of whether they're going to execute a function. - // This ensures everything is up to date, and we don't have to - // maintain different records of which hosts hold which updates. - faabric::util::SnapshotData snapshotData; - std::string snapshotKey = firstMsg.snapshotkey(); - bool snapshotNeeded = - req->type() == req->THREADS || req->type() == req->PROCESSES; - - if (snapshotNeeded) { - if (snapshotKey.empty()) { - SPDLOG_ERROR("No snapshot provided for {}", funcStr); - throw std::runtime_error( - "Empty snapshot for distributed threads/ processes"); - } - - snapshotData = - faabric::snapshot::getSnapshotRegistry().getSnapshot(snapshotKey); - - if (!thisRegisteredHosts.empty()) { - std::vector snapshotDiffs = - snapshotData.getDirtyPages(); - - // Do the snapshot diff pushing - if (!snapshotDiffs.empty()) { - for (const auto& h : thisRegisteredHosts) { - SPDLOG_DEBUG("Pushing {} snapshot diffs for {} to {}", - snapshotDiffs.size(), - funcStr, - h); - SnapshotClient& c = getSnapshotClient(h); - c.pushSnapshotDiffs( - snapshotKey, firstMsg.groupid(), snapshotDiffs); - } - } - - // Now reset the dirty page tracking, as we want the next batch - // of diffs to contain everything from now on (including the - // updates sent back from all the threads) - SPDLOG_DEBUG("Resetting dirty tracking after pushing diffs {}", - funcStr); - faabric::util::resetDirtyTracking(); - } - } - // Work out how many we can handle locally - int nLocally; - { - int slots = thisHostResources.slots(); + int slots = thisHostResources.slots(); - // Work out available cores, flooring at zero - int available = - slots - this->thisHostUsedSlots.load(std::memory_order_acquire); - available = std::max(available, 0); + // Work out available cores, flooring at zero + int available = + slots - this->thisHostUsedSlots.load(std::memory_order_acquire); + available = std::max(available, 0); - // Claim as many as we can - nLocally = std::min(available, nMessages); - } + // Claim as many as we can + int nLocally = std::min(available, nMessages); // Add those that can be executed locally - if (nLocally > 0) { - SPDLOG_DEBUG( - "Executing {}/{} {} locally", nLocally, nMessages, funcStr); - for (int i = 0; i < nLocally; i++) { - localMessageIdxs.emplace_back(i); - decision.addMessage(thisHost, req->messages().at(i)); - } + for (int i = 0; i < nLocally; i++) { + hosts.push_back(thisHost); } // If some are left, we need to distribute int offset = nLocally; if (offset < nMessages) { // Schedule first to already registered hosts + std::set& thisRegisteredHosts = + registeredHosts[funcStr]; + for (const auto& h : thisRegisteredHosts) { - int nOnThisHost = scheduleFunctionsOnHost( - h, req, decision, offset, &snapshotData); + int remainder = nMessages - offset; + + // Work out resources on this host + faabric::HostResources r = getHostResources(h); + int available = r.slots() - r.usedslots(); + int nOnThisHost = std::min(available, remainder); offset += nOnThisHost; if (offset >= nMessages) { break; @@ -346,15 +295,14 @@ faabric::util::SchedulingDecision Scheduler::callFunctions( std::vector unregisteredHosts = getUnregisteredHosts(funcStr); - for (auto& h : unregisteredHosts) { - // Skip if this host - if (h == thisHost) { - continue; - } + for (const auto& h : unregisteredHosts) { + int remainder = nMessages - offset; + + // Work out resources on this host + faabric::HostResources r = getHostResources(h); + int available = r.slots() - r.usedslots(); - // Schedule functions on the host - int nOnThisHost = scheduleFunctionsOnHost( - h, req, decision, offset, &snapshotData); + int nOnThisHost = std::min(available, remainder); // Register the host if it's exected a function if (nOnThisHost > 0) { @@ -378,8 +326,7 @@ faabric::util::SchedulingDecision Scheduler::callFunctions( funcStr); for (; offset < nMessages; offset++) { - localMessageIdxs.emplace_back(offset); - decision.addMessage(thisHost, req->messages().at(offset)); + hosts.push_back(thisHost); } } @@ -387,6 +334,9 @@ faabric::util::SchedulingDecision Scheduler::callFunctions( assert(offset == nMessages); } + // Sanity check + assert(hosts.size() == nMessages); + // Register thread results if necessary if (isThreads) { for (const auto& m : req->messages()) { @@ -394,58 +344,146 @@ faabric::util::SchedulingDecision Scheduler::callFunctions( } } - // Schedule messages locally if necessary. For threads we only need one - // executor, for anything else we want one Executor per function in flight - if (!localMessageIdxs.empty()) { - // Update slots - this->thisHostUsedSlots.fetch_add((int32_t)localMessageIdxs.size(), - std::memory_order_acquire); - - if (isThreads) { - // Threads use the existing executor. We assume there's only one - // running at a time. - std::vector>& thisExecutors = - executors[funcStr]; - - std::shared_ptr e = nullptr; - if (thisExecutors.empty()) { - // Create executor if not exists - e = claimExecutor(firstMsg, lock); - } else if (thisExecutors.size() == 1) { - // Use existing executor if exists - e = thisExecutors.back(); + // Set up decision + for (int i = 0; i < hosts.size(); i++) { + decision.addMessage(hosts.at(i), req->messages().at(i)); + } + + // ------------------------------------------- + // POINT-TO-POINT + // ------------------------------------------- + + // Send out point-to-point mappings if necessary (unless being forced to + // execute locally, in which case they will be transmitted from the + // master) + if (!forceLocal && (firstMsg.groupid() > 0)) { + broker.setAndSendMappingsFromSchedulingDecision(decision); + } + + // ------------------------------------------- + // SNAPSHOTS + // ------------------------------------------- + + // Push out snapshot diffs to registered hosts. We have to do this to + // *all* hosts, regardless of whether they will be executing functions. + // This greatly simplifies the reasoning about which hosts hold which + // diffs. + std::string snapshotKey = firstMsg.snapshotkey(); + if (!snapshotKey.empty()) { + for (const auto& host : getFunctionRegisteredHosts(firstMsg)) { + SnapshotClient& c = getSnapshotClient(host); + faabric::util::SnapshotData snapshotData = + faabric::snapshot::getSnapshotRegistry().getSnapshot(snapshotKey); + + // See if we've already pushed this snapshot to the given host, + // if so, just push the diffs + if (pushedSnapshotsMap[snapshotKey].contains(host)) { + std::vector snapshotDiffs = + snapshotData.getDirtyPages(); + c.pushSnapshotDiffs( + snapshotKey, firstMsg.groupid(), snapshotDiffs); } else { - SPDLOG_ERROR("Found {} executors for threaded function {}", - thisExecutors.size(), - funcStr); - throw std::runtime_error( - "Expected only one executor for threaded function"); + c.pushSnapshot(snapshotKey, firstMsg.groupid(), snapshotData); + pushedSnapshotsMap[snapshotKey].insert(host); } + } + } - assert(e != nullptr); + // Now reset the dirty page tracking just before we start executing + SPDLOG_DEBUG("Resetting dirty tracking after pushing diffs {}", funcStr); + faabric::util::resetDirtyTracking(); - // Execute the tasks - e->executeTasks(localMessageIdxs, req); - } else { - // Non-threads require one executor per task - for (auto i : localMessageIdxs) { - faabric::Message& localMsg = req->mutable_messages()->at(i); - if (localMsg.executeslocally()) { - faabric::util::UniqueLock resultsLock(localResultsMutex); - localResults.insert( - { localMsg.id(), - std::promise>() }); - } - std::shared_ptr e = claimExecutor(localMsg, lock); - e->executeTasks({ i }, req); + // ------------------------------------------- + // EXECTUION + // ------------------------------------------- + + std::set uniqueHosts(hosts.begin(), hosts.end()); + + // Iterate through unique hosts + for (const std::string& host : uniqueHosts) { + // Work out which indexes are scheduled on this host + std::vector thisHostIdxs; + for (int i = 0; i < hosts.size(); i++) { + if (hosts.at(i) == host) { + thisHostIdxs.push_back(i); } } - } - // Send out point-to-point mappings if necessary (unless being forced to - // execute locally, in which case they will be transmitted from the master) - if (!forceLocal && (firstMsg.groupid() > 0)) { - broker.setAndSendMappingsFromSchedulingDecision(decision); + if (host == thisHost) { + // ------------------------------------------- + // LOCAL EXECTUION + // ------------------------------------------- + // For threads we only need one executor, for anything else we want + // one Executor per function in flight. + + // Update slots + this->thisHostUsedSlots.fetch_add(thisHostIdxs.size(), + std::memory_order_acquire); + + if (isThreads) { + // Threads use the existing executor. We assume there's only + // one running at a time. + std::vector>& thisExecutors = + executors[funcStr]; + + std::shared_ptr e = nullptr; + if (thisExecutors.empty()) { + // Create executor if not exists + e = claimExecutor(firstMsg, lock); + } else if (thisExecutors.size() == 1) { + // Use existing executor if exists + e = thisExecutors.back(); + } else { + SPDLOG_ERROR("Found {} executors for threaded function {}", + thisExecutors.size(), + funcStr); + throw std::runtime_error( + "Expected only one executor for threaded function"); + } + + assert(e != nullptr); + + // Execute the tasks + e->executeTasks(thisHostIdxs, req); + } else { + // Non-threads require one executor per task + for (auto i : thisHostIdxs) { + faabric::Message& localMsg = req->mutable_messages()->at(i); + if (localMsg.executeslocally()) { + faabric::util::UniqueLock resultsLock( + localResultsMutex); + localResults.insert( + { localMsg.id(), + std::promise< + std::unique_ptr>() }); + } + std::shared_ptr e = claimExecutor(localMsg, lock); + e->executeTasks({ i }, req); + } + } + } else { + // ------------------------------------------- + // REMOTE EXECTUION + // ------------------------------------------- + + // Set up new request + std::shared_ptr hostRequest = + faabric::util::batchExecFactory(); + hostRequest->set_snapshotkey(req->snapshotkey()); + hostRequest->set_type(req->type()); + hostRequest->set_subtype(req->subtype()); + hostRequest->set_contextdata(req->contextdata()); + + // Add messages + for (auto msgIdx : thisHostIdxs) { + auto* newMsg = hostRequest->add_messages(); + *newMsg = req->messages().at(msgIdx); + newMsg->set_executeslocally(false); + decision.addMessage(host, req->messages().at(msgIdx)); + } + + getFunctionCallClient(host).executeFunctions(hostRequest); + } } // Records for tests @@ -511,7 +549,7 @@ void Scheduler::broadcastSnapshotDelete(const faabric::Message& msg, int Scheduler::scheduleFunctionsOnHost( const std::string& host, std::shared_ptr req, - SchedulingDecision& decision, + std::vector hosts, int offset, faabric::util::SnapshotData* snapshot) { @@ -531,35 +569,6 @@ int Scheduler::scheduleFunctionsOnHost( return 0; } - // Set up new request - std::shared_ptr hostRequest = - faabric::util::batchExecFactory(); - hostRequest->set_snapshotkey(req->snapshotkey()); - hostRequest->set_type(req->type()); - hostRequest->set_subtype(req->subtype()); - hostRequest->set_contextdata(req->contextdata()); - - // Add messages - int nOnThisHost = std::min(available, remainder); - for (int i = offset; i < (offset + nOnThisHost); i++) { - auto* newMsg = hostRequest->add_messages(); - *newMsg = req->messages().at(i); - newMsg->set_executeslocally(false); - decision.addMessage(host, req->messages().at(i)); - } - - SPDLOG_DEBUG( - "Sending {}/{} {} to {}", nOnThisHost, nMessages, funcStr, host); - - // Handle snapshots - std::string snapshotKey = firstMsg.snapshotkey(); - if (snapshot != nullptr && !snapshotKey.empty()) { - SnapshotClient& c = getSnapshotClient(host); - c.pushSnapshot(snapshotKey, firstMsg.groupid(), *snapshot); - } - - getFunctionCallClient(host).executeFunctions(hostRequest); - return nOnThisHost; } @@ -646,8 +655,8 @@ std::shared_ptr Scheduler::claimExecutor( int nExecutors = thisExecutors.size(); SPDLOG_DEBUG( "Scaling {} from {} -> {}", funcStr, nExecutors, nExecutors + 1); - // Spinning up a new executor can be lengthy, allow other things to run - // in parallel + // Spinning up a new executor can be lengthy, allow other things + // to run in parallel schedulerLock.unlock(); auto executor = factory->createExecutor(msg); schedulerLock.lock(); @@ -711,7 +720,8 @@ void Scheduler::setFunctionResult(faabric::Message& msg) if (it != localResults.end()) { it->second.set_value(std::make_unique(msg)); } - // Sync messages can't have their results read twice, so skip redis + // Sync messages can't have their results read twice, so skip + // redis if (!msg.isasync()) { return; } @@ -729,8 +739,8 @@ void Scheduler::setFunctionResult(faabric::Message& msg) void Scheduler::registerThread(uint32_t msgId) { - // Here we need to ensure the promise is registered locally so callers can - // start waiting + // Here we need to ensure the promise is registered locally so + // callers can start waiting threadResults[msgId]; } @@ -818,13 +828,13 @@ faabric::Message Scheduler::getFunctionResult(unsigned int messageId, faabric::Message msgResult; if (isBlocking) { - // Blocking version will throw an exception when timing out which is - // handled by the caller. + // Blocking version will throw an exception when timing out + // which is handled by the caller. std::vector result = redis.dequeueBytes(resultKey, timeoutMs); msgResult.ParseFromArray(result.data(), (int)result.size()); } else { - // Non-blocking version will tolerate empty responses, therefore we - // handle the exception here + // Non-blocking version will tolerate empty responses, therefore + // we handle the exception here std::vector result; try { result = redis.dequeueBytes(resultKey, timeoutMs); diff --git a/src/snapshot/SnapshotServer.cpp b/src/snapshot/SnapshotServer.cpp index bc6783783..108e3c5d6 100644 --- a/src/snapshot/SnapshotServer.cpp +++ b/src/snapshot/SnapshotServer.cpp @@ -205,6 +205,9 @@ SnapshotServer::recvPushSnapshotDiffs(const uint8_t* buffer, size_t bufferSize) ->localUnlock(); } + // Reset dirty tracking having applied diffs + SPDLOG_DEBUG("Resetting dirty page tracking having applied diffs"); + // Send response return std::make_unique(); } From e2b8f3abbcc1e84ffdfd2366b477c4ece530501e Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Fri, 5 Nov 2021 15:21:33 +0000 Subject: [PATCH 15/31] Fix indexing in scheduler loop --- src/scheduler/Scheduler.cpp | 122 +++++++++++++++++++----------------- 1 file changed, 63 insertions(+), 59 deletions(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index f1fd4cc37..7febf86fb 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -117,6 +117,7 @@ void Scheduler::reset() availableHostsCache.clear(); registeredHosts.clear(); threadResults.clear(); + pushedSnapshotsMap.clear(); // Records recordedMessagesAll.clear(); @@ -268,40 +269,39 @@ faabric::util::SchedulingDecision Scheduler::callFunctions( hosts.push_back(thisHost); } - // If some are left, we need to distribute - int offset = nLocally; - if (offset < nMessages) { - // Schedule first to already registered hosts + // If some are left, we need to distribute. + // First try and do so on already registered hosts. + int remainder = nMessages - nLocally; + if (remainder > 0) { std::set& thisRegisteredHosts = registeredHosts[funcStr]; for (const auto& h : thisRegisteredHosts) { - int remainder = nMessages - offset; - // Work out resources on this host faabric::HostResources r = getHostResources(h); int available = r.slots() - r.usedslots(); - int nOnThisHost = std::min(available, remainder); - offset += nOnThisHost; - if (offset >= nMessages) { + + for (int i = 0; i < nOnThisHost; i++) { + hosts.push_back(h); + } + + remainder -= nOnThisHost; + if (remainder <= 0) { break; } } } - // Now schedule to unregistered hosts if there are some left - if (offset < nMessages) { + // Now schedule to unregistered hosts if there are messages left + if (remainder > 0) { std::vector unregisteredHosts = getUnregisteredHosts(funcStr); for (const auto& h : unregisteredHosts) { - int remainder = nMessages - offset; - // Work out resources on this host faabric::HostResources r = getHostResources(h); int available = r.slots() - r.usedslots(); - int nOnThisHost = std::min(available, remainder); // Register the host if it's exected a function @@ -310,33 +310,41 @@ faabric::util::SchedulingDecision Scheduler::callFunctions( registeredHosts[funcStr].insert(h); } - offset += nOnThisHost; - if (offset >= nMessages) { + for (int i = 0; i < nOnThisHost; i++) { + hosts.push_back(h); + } + + remainder -= nOnThisHost; + if (remainder <= 0) { break; } } } // At this point there's no more capacity in the system, so we - // just need to execute locally - if (offset < nMessages) { - SPDLOG_DEBUG("Overloading {}/{} {} locally", - nMessages - offset, - nMessages, - funcStr); + // just need to overload locally + if (remainder > 0) { + SPDLOG_DEBUG( + "Overloading {}/{} {} locally", remainder, nMessages, funcStr); - for (; offset < nMessages; offset++) { + for (int i = 0; i < remainder; i++) { hosts.push_back(thisHost); } } - - // Sanity check - assert(offset == nMessages); } // Sanity check assert(hosts.size() == nMessages); + // Set up decision + for (int i = 0; i < hosts.size(); i++) { + decision.addMessage(hosts.at(i), req->messages().at(i)); + } + + // ------------------------------------------- + // THREADS + // ------------------------------------------- + // Register thread results if necessary if (isThreads) { for (const auto& m : req->messages()) { @@ -344,13 +352,8 @@ faabric::util::SchedulingDecision Scheduler::callFunctions( } } - // Set up decision - for (int i = 0; i < hosts.size(); i++) { - decision.addMessage(hosts.at(i), req->messages().at(i)); - } - // ------------------------------------------- - // POINT-TO-POINT + // POINT-TO-POINT MESSAGING // ------------------------------------------- // Send out point-to-point mappings if necessary (unless being forced to @@ -397,10 +400,18 @@ faabric::util::SchedulingDecision Scheduler::callFunctions( // EXECTUION // ------------------------------------------- + // NOTE: we want to schedule things on this host _last_, otherwise functions + // may start executing before all messages have been dispatched, thus + // slowing the remaining scheduling. + // TODO - clearner way to do this? std::set uniqueHosts(hosts.begin(), hosts.end()); + uniqueHosts.erase(thisHost); + std::vector orderedHosts(uniqueHosts.begin(), + uniqueHosts.end()); + orderedHosts.push_back(thisHost); // Iterate through unique hosts - for (const std::string& host : uniqueHosts) { + for (const std::string& host : orderedHosts) { // Work out which indexes are scheduled on this host std::vector thisHostIdxs; for (int i = 0; i < hosts.size(); i++) { @@ -416,6 +427,18 @@ faabric::util::SchedulingDecision Scheduler::callFunctions( // For threads we only need one executor, for anything else we want // one Executor per function in flight. + if (thisHostIdxs.empty()) { + SPDLOG_DEBUG("Not scheduling any calls to {} out of {} locally", + funcStr, + nMessages); + continue; + } + + SPDLOG_DEBUG("Scheduling {}/{} calls to {} locally", + thisHostIdxs.size(), + nMessages, + funcStr); + // Update slots this->thisHostUsedSlots.fetch_add(thisHostIdxs.size(), std::memory_order_acquire); @@ -466,6 +489,12 @@ faabric::util::SchedulingDecision Scheduler::callFunctions( // REMOTE EXECTUION // ------------------------------------------- + SPDLOG_DEBUG("Scheduling {}/{} calls to {} on {}", + thisHostIdxs.size(), + nMessages, + funcStr, + host); + // Set up new request std::shared_ptr hostRequest = faabric::util::batchExecFactory(); @@ -482,6 +511,7 @@ faabric::util::SchedulingDecision Scheduler::callFunctions( decision.addMessage(host, req->messages().at(msgIdx)); } + // Dispatch the calls getFunctionCallClient(host).executeFunctions(hostRequest); } } @@ -546,32 +576,6 @@ void Scheduler::broadcastSnapshotDelete(const faabric::Message& msg, } } -int Scheduler::scheduleFunctionsOnHost( - const std::string& host, - std::shared_ptr req, - std::vector hosts, - int offset, - faabric::util::SnapshotData* snapshot) -{ - const faabric::Message& firstMsg = req->messages().at(0); - std::string funcStr = faabric::util::funcToString(firstMsg, false); - - int nMessages = req->messages_size(); - int remainder = nMessages - offset; - - // Work out how many we can put on the host - faabric::HostResources r = getHostResources(host); - int available = r.slots() - r.usedslots(); - - // Drop out if none available - if (available <= 0) { - SPDLOG_DEBUG("Not scheduling {} on {}, no resources", funcStr, host); - return 0; - } - - return nOnThisHost; -} - void Scheduler::callFunction(faabric::Message& msg, bool forceLocal) { // TODO - avoid this copy From d9cca418992c7b017c4960942cf5b13540d1eebb Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Fri, 5 Nov 2021 15:53:13 +0000 Subject: [PATCH 16/31] Avoid pushing snapshots to master host --- src/scheduler/Scheduler.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 7febf86fb..38ddb1085 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -299,6 +299,11 @@ faabric::util::SchedulingDecision Scheduler::callFunctions( getUnregisteredHosts(funcStr); for (const auto& h : unregisteredHosts) { + // Skip if this host + if (h == thisHost) { + continue; + } + // Work out resources on this host faabric::HostResources r = getHostResources(h); int available = r.slots() - r.usedslots(); From a32b7a38300ca3bb24591106bd8c1209f50d2097 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Fri, 5 Nov 2021 16:36:32 +0000 Subject: [PATCH 17/31] Remove last snapshot stuff --- include/faabric/scheduler/Scheduler.h | 2 -- src/scheduler/Executor.cpp | 22 ++++------------------ 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index c76c6d5f3..4813c2819 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -81,8 +81,6 @@ class Executor uint32_t threadPoolSize = 0; private: - std::string lastSnapshot; - std::atomic claimed = false; std::mutex threadsMutex; diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp index 5159b7467..5fb9c2d31 100644 --- a/src/scheduler/Executor.cpp +++ b/src/scheduler/Executor.cpp @@ -88,8 +88,6 @@ void Executor::finish() // Reset variables boundMessage.Clear(); - lastSnapshot = ""; - claimed = false; threadPoolThreads.clear(); @@ -113,8 +111,7 @@ void Executor::executeTasks(std::vector msgIdxs, faabric::util::UniqueLock lock(threadsMutex); // Restore if necessary. If we're executing threads on the master host we - // assume we don't need to restore, but for everything else we do. If we've - // already restored from this snapshot, we don't do so again. + // assume we don't need to restore, but for everything else we do. faabric::Message& firstMsg = req->mutable_messages()->at(0); std::string snapshotKey = firstMsg.snapshotkey(); std::string thisHost = faabric::util::getSystemConfig().endpointHost; @@ -122,21 +119,10 @@ void Executor::executeTasks(std::vector msgIdxs, bool isMaster = firstMsg.masterhost() == thisHost; bool isThreads = req->type() == faabric::BatchExecuteRequest::THREADS; bool isSnapshot = !snapshotKey.empty(); - bool alreadyRestored = snapshotKey == lastSnapshot; - if (isSnapshot && !alreadyRestored) { - if ((!isMaster && isThreads) || !isThreads) { - SPDLOG_DEBUG("Restoring {} from snapshot {}", funcStr, snapshotKey); - lastSnapshot = snapshotKey; - restore(firstMsg); - } else { - SPDLOG_DEBUG("Skipping snapshot restore on master {} [{}]", - funcStr, - snapshotKey); - } - } else if (isSnapshot) { - SPDLOG_DEBUG( - "Skipping already restored snapshot {} [{}]", funcStr, snapshotKey); + if (isSnapshot && !isMaster) { + SPDLOG_DEBUG("Restoring {} from snapshot {}", funcStr, snapshotKey); + restore(firstMsg); } // Reset dirty page tracking if we're executing threads. From 4b6eb322b41a28878ed8935594dc8861fb1fd033 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Mon, 8 Nov 2021 08:48:30 +0000 Subject: [PATCH 18/31] Add scheduler hints --- include/faabric/scheduler/Scheduler.h | 9 ++++ src/scheduler/Scheduler.cpp | 78 +++++++++++++++++++-------- 2 files changed, 64 insertions(+), 23 deletions(-) diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index 4813c2819..0e7414fcb 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -104,6 +104,10 @@ class Scheduler std::shared_ptr req, bool forceLocal = false); + faabric::util::SchedulingDecision callFunctions( + std::shared_ptr req, + faabric::util::SchedulingDecision& hint); + void reset(); void resetThreadLocalCache(); @@ -227,6 +231,11 @@ class Scheduler std::unordered_map> registeredHosts; + faabric::util::SchedulingDecision doCallFunctions( + std::shared_ptr req, + faabric::util::SchedulingDecision& decision, + faabric::util::FullLock& lock); + std::shared_ptr claimExecutor( faabric::Message& msg, faabric::util::FullLock& schedulerLock); diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 38ddb1085..00a6a9ba8 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -1,3 +1,4 @@ +#include "faabric/util/locks.h" #include #include #include @@ -210,10 +211,6 @@ faabric::util::SchedulingDecision Scheduler::callFunctions( std::shared_ptr req, bool forceLocal) { - // Extract properties of the request - int nMessages = req->messages_size(); - bool isThreads = req->type() == faabric::BatchExecuteRequest::THREADS; - // Note, we assume all the messages are for the same function and have the // same master host faabric::Message& firstMsg = req->mutable_messages()->at(0); @@ -225,25 +222,24 @@ faabric::util::SchedulingDecision Scheduler::callFunctions( throw std::runtime_error("Message with no master host"); } - // TODO - more granular locking, this is conservative - faabric::util::FullLock lock(mx); - // If we're not the master host, we need to forward the request back to the // master host. This will only happen if a nested batch execution happens. SchedulingDecision decision(firstMsg.appid(), firstMsg.groupid()); if (!forceLocal && masterHost != thisHost) { - SPDLOG_DEBUG( - "Forwarding {} {} back to master {}", nMessages, funcStr, masterHost); + SPDLOG_DEBUG("Forwarding {} back to master {}", funcStr, masterHost); getFunctionCallClient(masterHost).executeFunctions(req); decision.returnHost = masterHost; return decision; } + faabric::util::FullLock lock(mx); + // ------------------------------------------- - // HOST DECISIONS + // DECISION BUILDING // ------------------------------------------- std::vector hosts; + int nMessages = req->messages_size(); if (forceLocal) { // We're forced to execute locally here so we do all the messages for (int i = 0; i < nMessages; i++) { @@ -346,9 +342,53 @@ faabric::util::SchedulingDecision Scheduler::callFunctions( decision.addMessage(hosts.at(i), req->messages().at(i)); } + // Pass decision as hint + return doCallFunctions(req, decision, lock); +} + +faabric::util::SchedulingDecision Scheduler::callFunctions( + std::shared_ptr req, + faabric::util::SchedulingDecision& hint) +{ + faabric::util::FullLock lock(mx); + return doCallFunctions(req, hint, lock); +} + +faabric::util::SchedulingDecision Scheduler::doCallFunctions( + std::shared_ptr req, + faabric::util::SchedulingDecision& decision, + faabric::util::FullLock& lock) +{ + faabric::Message& firstMsg = req->mutable_messages()->at(0); + std::string funcStr = faabric::util::funcToString(firstMsg, false); + int nMessages = req->messages_size(); + + // NOTE: we want to schedule things on this host _last_, otherwise functions + // may start executing before all messages have been dispatched, thus + // slowing the remaining scheduling. + // Therefore we want to create a list of unique hosts, with this host last. + std::vector orderedHosts; + { + std::set uniqueHosts(decision.hosts.begin(), + decision.hosts.end()); + bool hasFunctionsOnThisHost = uniqueHosts.contains(thisHost); + + if (hasFunctionsOnThisHost) { + uniqueHosts.erase(thisHost); + } + + std::vector orderedHosts(uniqueHosts.begin(), + uniqueHosts.end()); + + if (hasFunctionsOnThisHost) { + orderedHosts.push_back(thisHost); + } + } + // ------------------------------------------- // THREADS // ------------------------------------------- + bool isThreads = req->type() == faabric::BatchExecuteRequest::THREADS; // Register thread results if necessary if (isThreads) { @@ -364,7 +404,9 @@ faabric::util::SchedulingDecision Scheduler::callFunctions( // Send out point-to-point mappings if necessary (unless being forced to // execute locally, in which case they will be transmitted from the // master) - if (!forceLocal && (firstMsg.groupid() > 0)) { + bool isOnlyThisHost = + orderedHosts.size() == 1 && orderedHosts.front() == thisHost; + if (!isOnlyThisHost && (firstMsg.groupid() > 0)) { broker.setAndSendMappingsFromSchedulingDecision(decision); } @@ -405,22 +447,12 @@ faabric::util::SchedulingDecision Scheduler::callFunctions( // EXECTUION // ------------------------------------------- - // NOTE: we want to schedule things on this host _last_, otherwise functions - // may start executing before all messages have been dispatched, thus - // slowing the remaining scheduling. - // TODO - clearner way to do this? - std::set uniqueHosts(hosts.begin(), hosts.end()); - uniqueHosts.erase(thisHost); - std::vector orderedHosts(uniqueHosts.begin(), - uniqueHosts.end()); - orderedHosts.push_back(thisHost); - // Iterate through unique hosts for (const std::string& host : orderedHosts) { // Work out which indexes are scheduled on this host std::vector thisHostIdxs; - for (int i = 0; i < hosts.size(); i++) { - if (hosts.at(i) == host) { + for (int i = 0; i < decision.hosts.size(); i++) { + if (decision.hosts.at(i) == host) { thisHostIdxs.push_back(i); } } From 983dfaaffc26ce9e9643a4730d9cc0bc813e1c8f Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Mon, 8 Nov 2021 09:06:07 +0000 Subject: [PATCH 19/31] Fix bug in host ordering --- src/scheduler/Scheduler.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 00a6a9ba8..e1f1942ce 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -1,4 +1,3 @@ -#include "faabric/util/locks.h" #include #include #include @@ -9,11 +8,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include @@ -307,7 +308,6 @@ faabric::util::SchedulingDecision Scheduler::callFunctions( // Register the host if it's exected a function if (nOnThisHost > 0) { - SPDLOG_DEBUG("Registering {} for {}", h, funcStr); registeredHosts[funcStr].insert(h); } @@ -377,8 +377,7 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( uniqueHosts.erase(thisHost); } - std::vector orderedHosts(uniqueHosts.begin(), - uniqueHosts.end()); + orderedHosts = std::vector(uniqueHosts.begin(), uniqueHosts.end()); if (hasFunctionsOnThisHost) { orderedHosts.push_back(thisHost); From ce8a8708b680e1a1c09f185a12922b5ac6ce86f2 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Mon, 8 Nov 2021 17:31:38 +0000 Subject: [PATCH 20/31] Sort out reusing ptp messages --- src/scheduler/Executor.cpp | 8 ++++---- src/scheduler/Scheduler.cpp | 36 +++++++++++++++++++++--------------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp index 5fb9c2d31..86c30c4ac 100644 --- a/src/scheduler/Executor.cpp +++ b/src/scheduler/Executor.cpp @@ -288,12 +288,12 @@ void Executor::threadPoolThread(int threadPoolIdx) // In all threads if we've finished with a ptp group, we need to tidy up // to ensure all sockets can be destructed if (msg.groupid() > 0) { - broker.resetThreadLocalCache(false); + // broker.resetThreadLocalCache(false); // Remove the group from this host if last in batch - if (isLastInBatch) { - broker.clearGroup(msg.groupid()); - } + // if (isLastInBatch) { + // broker.clearGroup(msg.groupid()); + // } } // If this batch is finished, reset the executor and release its claim. diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index e1f1942ce..194d1180f 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -342,6 +342,17 @@ faabric::util::SchedulingDecision Scheduler::callFunctions( decision.addMessage(hosts.at(i), req->messages().at(i)); } + // ------------------------------------------- + // POINT-TO-POINT MESSAGING + // ------------------------------------------- + + // Send out point-to-point mappings if necessary (unless being forced to + // execute locally, in which case they will be transmitted from the + // master) + if (!forceLocal && (firstMsg.groupid() > 0)) { + broker.setAndSendMappingsFromSchedulingDecision(decision); + } + // Pass decision as hint return doCallFunctions(req, decision, lock); } @@ -363,6 +374,15 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( std::string funcStr = faabric::util::funcToString(firstMsg, false); int nMessages = req->messages_size(); + if (decision.hosts.size() != nMessages) { + SPDLOG_ERROR( + "Passed decision for {} with {} messages, but request has {}", + funcStr, + decision.hosts.size(), + nMessages); + throw std::runtime_error("Invalid scheduler hint for messages"); + } + // NOTE: we want to schedule things on this host _last_, otherwise functions // may start executing before all messages have been dispatched, thus // slowing the remaining scheduling. @@ -396,19 +416,6 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( } } - // ------------------------------------------- - // POINT-TO-POINT MESSAGING - // ------------------------------------------- - - // Send out point-to-point mappings if necessary (unless being forced to - // execute locally, in which case they will be transmitted from the - // master) - bool isOnlyThisHost = - orderedHosts.size() == 1 && orderedHosts.front() == thisHost; - if (!isOnlyThisHost && (firstMsg.groupid() > 0)) { - broker.setAndSendMappingsFromSchedulingDecision(decision); - } - // ------------------------------------------- // SNAPSHOTS // ------------------------------------------- @@ -446,7 +453,7 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( // EXECTUION // ------------------------------------------- - // Iterate through unique hosts + // Iterate through unique hosts and dispatch messages for (const std::string& host : orderedHosts) { // Work out which indexes are scheduled on this host std::vector thisHostIdxs; @@ -544,7 +551,6 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( auto* newMsg = hostRequest->add_messages(); *newMsg = req->messages().at(msgIdx); newMsg->set_executeslocally(false); - decision.addMessage(host, req->messages().at(msgIdx)); } // Dispatch the calls From 41118f9a95dfa59cdd941651d581fe185df3c8f7 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Mon, 8 Nov 2021 17:59:07 +0000 Subject: [PATCH 21/31] Naming --- include/faabric/transport/PointToPointBroker.h | 2 +- include/faabric/transport/PointToPointServer.h | 2 +- src/scheduler/Executor.cpp | 11 ----------- src/transport/PointToPointBroker.cpp | 12 ++++-------- src/transport/PointToPointServer.cpp | 9 +++++---- 5 files changed, 11 insertions(+), 25 deletions(-) diff --git a/include/faabric/transport/PointToPointBroker.h b/include/faabric/transport/PointToPointBroker.h index 459d63543..a596a1fb5 100644 --- a/include/faabric/transport/PointToPointBroker.h +++ b/include/faabric/transport/PointToPointBroker.h @@ -115,7 +115,7 @@ class PointToPointBroker void clear(); - void resetThreadLocalCache(bool hard = true); + void resetThreadLocalCache(); private: faabric::util::SystemConfig& conf; diff --git a/include/faabric/transport/PointToPointServer.h b/include/faabric/transport/PointToPointServer.h index 29a15e77c..d62fe6b76 100644 --- a/include/faabric/transport/PointToPointServer.h +++ b/include/faabric/transport/PointToPointServer.h @@ -11,7 +11,7 @@ class PointToPointServer final : public MessageEndpointServer PointToPointServer(); private: - PointToPointBroker& reg; + PointToPointBroker& broker; void doAsyncRecv(int header, const uint8_t* buffer, diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp index 86c30c4ac..9c504b049 100644 --- a/src/scheduler/Executor.cpp +++ b/src/scheduler/Executor.cpp @@ -285,17 +285,6 @@ void Executor::threadPoolThread(int threadPoolIdx) faabric::util::resetDirtyTracking(); } - // In all threads if we've finished with a ptp group, we need to tidy up - // to ensure all sockets can be destructed - if (msg.groupid() > 0) { - // broker.resetThreadLocalCache(false); - - // Remove the group from this host if last in batch - // if (isLastInBatch) { - // broker.clearGroup(msg.groupid()); - // } - } - // If this batch is finished, reset the executor and release its claim. // Note that we have to release the claim _after_ resetting, otherwise // the executor won't be ready for reuse. diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp index eecf6ed51..448b94429 100644 --- a/src/transport/PointToPointBroker.cpp +++ b/src/transport/PointToPointBroker.cpp @@ -540,7 +540,7 @@ void PointToPointBroker::clearGroup(int groupId) { SPDLOG_TRACE("Clearing point-to-point group {}", groupId); - faabric::util::SharedLock lock(brokerMutex); + faabric::util::FullLock lock(brokerMutex); std::set idxs = getIdxsRegisteredForGroup(groupId); for (auto idxA : idxs) { @@ -559,7 +559,7 @@ void PointToPointBroker::clearGroup(int groupId) void PointToPointBroker::clear() { - faabric::util::SharedLock lock(brokerMutex); + faabric::util::FullLock lock(brokerMutex); groupIdIdxsMap.clear(); mappings.clear(); @@ -569,16 +569,12 @@ void PointToPointBroker::clear() groupFlags.clear(); } -void PointToPointBroker::resetThreadLocalCache(bool hard) +void PointToPointBroker::resetThreadLocalCache() { SPDLOG_TRACE("Resetting point-to-point thread-local cache"); - sendEndpoints.clear(); recvEndpoints.clear(); - - if (hard) { - clients.clear(); - } + clients.clear(); } PointToPointBroker& getPointToPointBroker() diff --git a/src/transport/PointToPointServer.cpp b/src/transport/PointToPointServer.cpp index 7791f0b0f..e2dd6fa80 100644 --- a/src/transport/PointToPointServer.cpp +++ b/src/transport/PointToPointServer.cpp @@ -16,7 +16,7 @@ PointToPointServer::PointToPointServer() POINT_TO_POINT_SYNC_PORT, POINT_TO_POINT_INPROC_LABEL, faabric::util::getSystemConfig().pointToPointServerThreads) - , reg(getPointToPointBroker()) + , broker(getPointToPointBroker()) {} void PointToPointServer::doAsyncRecv(int header, @@ -28,7 +28,7 @@ void PointToPointServer::doAsyncRecv(int header, PARSE_MSG(faabric::PointToPointMessage, buffer, bufferSize) // Send the message locally to the downstream socket - reg.sendMessage(msg.groupid(), + broker.sendMessage(msg.groupid(), msg.sendidx(), msg.recvidx(), BYTES_CONST(msg.data().c_str()), @@ -85,7 +85,7 @@ std::unique_ptr PointToPointServer::doRecvMappings( SPDLOG_DEBUG("Receiving {} point-to-point mappings", decision.nFunctions); - reg.setUpLocalMappingsFromSchedulingDecision(decision); + broker.setUpLocalMappingsFromSchedulingDecision(decision); return std::make_unique(); } @@ -121,6 +121,7 @@ void PointToPointServer::recvGroupUnlock(const uint8_t* buffer, void PointToPointServer::onWorkerStop() { // Clear any thread-local cached sockets - reg.resetThreadLocalCache(); + broker.resetThreadLocalCache(); + broker.clear(); } } From 52b3877b9f01331ba8e39e43f5e148877ec05eeb Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Tue, 9 Nov 2021 10:26:37 +0000 Subject: [PATCH 22/31] Fixing up tests --- include/faabric/util/snapshot.h | 6 - src/scheduler/Executor.cpp | 4 +- src/scheduler/Scheduler.cpp | 4 + src/util/snapshot.cpp | 4 +- tests/test/scheduler/test_executor.cpp | 42 ++++-- tests/test/scheduler/test_scheduler.cpp | 34 +++-- .../snapshot/test_snapshot_client_server.cpp | 121 ++++++++++-------- tests/test/util/test_snapshot.cpp | 48 +++++-- 8 files changed, 168 insertions(+), 95 deletions(-) diff --git a/include/faabric/util/snapshot.h b/include/faabric/util/snapshot.h index a005dbae3..6cef3d21b 100644 --- a/include/faabric/util/snapshot.h +++ b/include/faabric/util/snapshot.h @@ -92,12 +92,6 @@ class SnapshotData // Note - we care about the order of this map, as we iterate through it // in order of offsets std::map mergeRegions; - - std::vector getCustomDiffs(const uint8_t* updated, - size_t updatedSize); - - std::vector getStandardDiffs(const uint8_t* updated, - size_t updatedSize); }; std::string snapshotDataTypeStr(SnapshotDataType dt); diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp index 9c504b049..2069b071e 100644 --- a/src/scheduler/Executor.cpp +++ b/src/scheduler/Executor.cpp @@ -135,7 +135,7 @@ void Executor::executeTasks(std::vector msgIdxs, // Set up shared counter for this batch of tasks auto batchCounter = - std::make_shared>(req->messages_size()); + std::make_shared>(msgIdxs.size()); // Work out if we should skip the reset after this batch. This only needs to // happen when we're executing threads on the master host, in which case the @@ -144,7 +144,7 @@ void Executor::executeTasks(std::vector msgIdxs, // Iterate through and invoke tasks. By default, we allocate tasks // one-to-one with thread pool threads. Only once the pool is exhausted do - // we start overallocating. + // we start overloading for (int msgIdx : msgIdxs) { const faabric::Message& msg = req->messages().at(msgIdx); diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 194d1180f..53d9114d2 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -515,6 +515,7 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( // Non-threads require one executor per task for (auto i : thisHostIdxs) { faabric::Message& localMsg = req->mutable_messages()->at(i); + if (localMsg.executeslocally()) { faabric::util::UniqueLock resultsLock( localResultsMutex); @@ -523,6 +524,7 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( std::promise< std::unique_ptr>() }); } + std::shared_ptr e = claimExecutor(localMsg, lock); e->executeTasks({ i }, req); } @@ -762,10 +764,12 @@ void Scheduler::setFunctionResult(faabric::Message& msg) if (msg.executeslocally()) { faabric::util::UniqueLock resultsLock(localResultsMutex); + auto it = localResults.find(msg.id()); if (it != localResults.end()) { it->second.set_value(std::make_unique(msg)); } + // Sync messages can't have their results read twice, so skip // redis if (!msg.isasync()) { diff --git a/src/util/snapshot.cpp b/src/util/snapshot.cpp index 3b84796a9..772870be4 100644 --- a/src/util/snapshot.cpp +++ b/src/util/snapshot.cpp @@ -92,9 +92,9 @@ std::vector SnapshotData::getChangeDiffs(const uint8_t* updated, // For each merge region that overlaps this dirty page, get it to add // its diffs, and move onto the next one // TODO - make this more efficient by passing in dirty pages to merge - // regions + // regions so that they avoid unnecessary work if they're large. while (mergeIt != mergeRegions.end() && - (mergeIt->second.offset > pageStart && + (mergeIt->second.offset >= pageStart && mergeIt->second.offset < pageEnd)) { SPDLOG_TRACE("Overlap with {} {} merge region at {}-{}", diff --git a/tests/test/scheduler/test_executor.cpp b/tests/test/scheduler/test_executor.cpp index 4dbf2983b..918b7d4ea 100644 --- a/tests/test/scheduler/test_executor.cpp +++ b/tests/test/scheduler/test_executor.cpp @@ -19,6 +19,7 @@ #include using namespace faabric::scheduler; +using namespace faabric::util; namespace tests { @@ -203,13 +204,30 @@ class TestExecutor final : public Executor if (msg.function() == "snap-check") { // Modify a page of the dummy memory uint8_t pageIdx = threadPoolIdx; - SPDLOG_DEBUG("TestExecutor modifying page {} of memory", pageIdx); - uint8_t* offsetPtr = - dummyMemory + (pageIdx * faabric::util::HOST_PAGE_SIZE); - std::vector data = { pageIdx, - (uint8_t)(pageIdx + 1), - (uint8_t)(pageIdx + 2) }; + faabric::util::SnapshotData& snapData = + faabric::snapshot::getSnapshotRegistry().getSnapshot( + msg.snapshotkey()); + + // Avoid writing a zero here as the memory is already zeroed hence + // it's not a change + std::vector data = { (uint8_t)(pageIdx + 1), + (uint8_t)(pageIdx + 2), + (uint8_t)(pageIdx + 3) }; + + // Set up a merge region that should catch the diff + size_t offset = (pageIdx * faabric::util::HOST_PAGE_SIZE); + snapData.addMergeRegion(offset, + data.size() + 10, + SnapshotDataType::Raw, + SnapshotMergeOperation::Overwrite); + + SPDLOG_DEBUG("TestExecutor modifying page {} of memory ({}-{})", + pageIdx, + offset, + offset + data.size()); + + uint8_t* offsetPtr = dummyMemory + offset; std::memcpy(offsetPtr, data.data(), data.size()); } @@ -718,7 +736,7 @@ TEST_CASE_METHOD(TestExecutorFixture, faabric::util::setMockMode(true); std::string otherHost = "other"; - // Set up a load of messages executing with a different master host + // Set up some messages executing with a different master host std::vector messageIds; for (int i = 0; i < nThreads; i++) { faabric::Message& msg = req->mutable_messages()->at(i); @@ -750,13 +768,11 @@ TEST_CASE_METHOD(TestExecutorFixture, for (int i = 0; i < diffList.size(); i++) { // Check offset and data (according to logic defined in the dummy // executor) - uint8_t pageIndex = i + 1; - REQUIRE(diffList.at(i).offset == - pageIndex * faabric::util::HOST_PAGE_SIZE); + REQUIRE(diffList.at(i).offset == i * faabric::util::HOST_PAGE_SIZE); - std::vector expected = { pageIndex, - (uint8_t)(pageIndex + 1), - (uint8_t)(pageIndex + 2) }; + std::vector expected = { (uint8_t)(i + 1), + (uint8_t)(i + 2), + (uint8_t)(i + 3) }; std::vector actual(diffList.at(i).data, diffList.at(i).data + 3); diff --git a/tests/test/scheduler/test_scheduler.cpp b/tests/test/scheduler/test_scheduler.cpp index edf9f83b8..9f27c5c0e 100644 --- a/tests/test/scheduler/test_scheduler.cpp +++ b/tests/test/scheduler/test_scheduler.cpp @@ -811,28 +811,36 @@ TEST_CASE_METHOD(SlowExecutorFixture, TEST_CASE_METHOD(DummyExecutorFixture, "Test executor reuse", "[scheduler]") { - faabric::Message msgA = faabric::util::messageFactory("foo", "bar"); - faabric::Message msgB = faabric::util::messageFactory("foo", "bar"); - faabric::Message msgC = faabric::util::messageFactory("foo", "bar"); - faabric::Message msgD = faabric::util::messageFactory("foo", "bar"); + std::shared_ptr reqA = + faabric::util::batchExecFactory("foo", "bar", 2); + std::shared_ptr reqB = + faabric::util::batchExecFactory("foo", "bar", 2); + + faabric::Message& msgA = reqA->mutable_messages()->at(0); + faabric::Message& msgB = reqB->mutable_messages()->at(0); // Execute a couple of functions - sch.callFunction(msgA); - sch.callFunction(msgB); - sch.getFunctionResult(msgA.id(), SHORT_TEST_TIMEOUT_MS); - sch.getFunctionResult(msgB.id(), SHORT_TEST_TIMEOUT_MS); + sch.callFunctions(reqA); + for (const auto& m : reqA->messages()) { + faabric::Message res = + sch.getFunctionResult(m.id(), SHORT_TEST_TIMEOUT_MS); + REQUIRE(res.returnvalue() == 0); + } // Check executor count REQUIRE(sch.getFunctionExecutorCount(msgA) == 2); - // Submit a couple more functions - sch.callFunction(msgC); - sch.callFunction(msgD); - sch.getFunctionResult(msgC.id(), SHORT_TEST_TIMEOUT_MS); - sch.getFunctionResult(msgD.id(), SHORT_TEST_TIMEOUT_MS); + // Execute a couple more functions + sch.callFunctions(reqB); + for (const auto& m : reqB->messages()) { + faabric::Message res = + sch.getFunctionResult(m.id(), SHORT_TEST_TIMEOUT_MS); + REQUIRE(res.returnvalue() == 0); + } // Check executor count is still the same REQUIRE(sch.getFunctionExecutorCount(msgA) == 2); + REQUIRE(sch.getFunctionExecutorCount(msgB) == 2); } TEST_CASE_METHOD(DummyExecutorFixture, diff --git a/tests/test/snapshot/test_snapshot_client_server.cpp b/tests/test/snapshot/test_snapshot_client_server.cpp index fa86b703e..bd0764f6e 100644 --- a/tests/test/snapshot/test_snapshot_client_server.cpp +++ b/tests/test/snapshot/test_snapshot_client_server.cpp @@ -17,6 +17,8 @@ #include #include +using namespace faabric::util; + namespace tests { class SnapshotClientServerFixture @@ -40,8 +42,8 @@ class SnapshotClientServerFixture void setUpFunctionGroup(int appId, int groupId) { - faabric::util::SchedulingDecision decision(appId, groupId); - faabric::Message msg = faabric::util::messageFactory("foo", "bar"); + SchedulingDecision decision(appId, groupId); + faabric::Message msg = messageFactory("foo", "bar"); msg.set_appid(appId); msg.set_groupid(groupId); @@ -71,8 +73,8 @@ TEST_CASE_METHOD(SnapshotClientServerFixture, // Prepare some snapshot data std::string snapKeyA = "foo"; std::string snapKeyB = "bar"; - faabric::util::SnapshotData snapA; - faabric::util::SnapshotData snapB; + SnapshotData snapA; + SnapshotData snapB; size_t snapSizeA = 1024; size_t snapSizeB = 500; snapA.size = snapSizeA; @@ -97,8 +99,8 @@ TEST_CASE_METHOD(SnapshotClientServerFixture, // Check snapshots created in registry REQUIRE(reg.getSnapshotCount() == 2); - const faabric::util::SnapshotData& actualA = reg.getSnapshot(snapKeyA); - const faabric::util::SnapshotData& actualB = reg.getSnapshot(snapKeyB); + const SnapshotData& actualA = reg.getSnapshot(snapKeyA); + const SnapshotData& actualB = reg.getSnapshot(snapKeyB); REQUIRE(actualA.size == snapA.size); REQUIRE(actualB.size == snapB.size); @@ -110,8 +112,7 @@ TEST_CASE_METHOD(SnapshotClientServerFixture, REQUIRE(actualDataB == dataB); } -void checkDiffsApplied(const uint8_t* snapBase, - std::vector diffs) +void checkDiffsApplied(const uint8_t* snapBase, std::vector diffs) { for (const auto& d : diffs) { std::vector actual(snapBase + d.offset, @@ -127,7 +128,7 @@ TEST_CASE_METHOD(SnapshotClientServerFixture, "Test push snapshot diffs", "[snapshot]") { - std::string thisHost = faabric::util::getSystemConfig().endpointHost; + std::string thisHost = getSystemConfig().endpointHost; // One request with no group, another with a group we must initialise int appId = 111; @@ -137,25 +138,37 @@ TEST_CASE_METHOD(SnapshotClientServerFixture, setUpFunctionGroup(appId, groupIdB); // Set up a snapshot - std::string snapKey = std::to_string(faabric::util::generateGid()); - faabric::util::SnapshotData snap = takeSnapshot(snapKey, 5, true); + std::string snapKey = std::to_string(generateGid()); + SnapshotData snap = takeSnapshot(snapKey, 5, true); // Set up some diffs std::vector diffDataA1 = { 0, 1, 2, 3 }; std::vector diffDataA2 = { 4, 5, 6 }; std::vector diffDataB = { 7, 7, 8, 8, 8 }; - std::vector diffsA; - std::vector diffsB; + std::vector diffsA; + std::vector diffsB; + + SnapshotDiff diffA1(SnapshotDataType::Raw, + SnapshotMergeOperation::Overwrite, + 5, + diffDataA1.data(), + diffDataA1.size()); + + SnapshotDiff diffA2(SnapshotDataType::Raw, + SnapshotMergeOperation::Overwrite, + 2 * HOST_PAGE_SIZE, + diffDataA2.data(), + diffDataA2.size()); - faabric::util::SnapshotDiff diffA1(5, diffDataA1.data(), diffDataA1.size()); - faabric::util::SnapshotDiff diffA2( - 2 * faabric::util::HOST_PAGE_SIZE, diffDataA2.data(), diffDataA2.size()); diffsA = { diffA1, diffA2 }; cli.pushSnapshotDiffs(snapKey, groupIdA, diffsA); - faabric::util::SnapshotDiff diffB( - 3 * faabric::util::HOST_PAGE_SIZE, diffDataB.data(), diffDataB.size()); + SnapshotDiff diffB(SnapshotDataType::Raw, + SnapshotMergeOperation::Overwrite, + 3 * HOST_PAGE_SIZE, + diffDataB.data(), + diffDataB.size()); diffsB = { diffB }; cli.pushSnapshotDiffs(snapKey, groupIdB, diffsB); @@ -171,12 +184,12 @@ TEST_CASE_METHOD(SnapshotClientServerFixture, "[snapshot]") { // Set up a snapshot - std::string snapKey = std::to_string(faabric::util::generateGid()); - faabric::util::SnapshotData snap = takeSnapshot(snapKey, 5, false); + std::string snapKey = std::to_string(generateGid()); + SnapshotData snap = takeSnapshot(snapKey, 5, false); // Set up a couple of ints in the snapshot int offsetA1 = 5; - int offsetA2 = 2 * faabric::util::HOST_PAGE_SIZE; + int offsetA2 = 2 * HOST_PAGE_SIZE; int baseA1 = 25; int baseA2 = 60; @@ -189,22 +202,26 @@ TEST_CASE_METHOD(SnapshotClientServerFixture, int diffIntA1 = 123; int diffIntA2 = 345; - std::vector intDataA1 = - faabric::util::valueToBytes(diffIntA1); - std::vector intDataA2 = - faabric::util::valueToBytes(diffIntA2); + std::vector intDataA1 = valueToBytes(diffIntA1); + std::vector intDataA2 = valueToBytes(diffIntA2); - std::vector diffs; + std::vector diffs; - faabric::util::SnapshotDiff diffA1( - offsetA1, intDataA1.data(), intDataA1.size()); - diffA1.operation = faabric::util::SnapshotMergeOperation::Sum; - diffA1.dataType = faabric::util::SnapshotDataType::Int; + SnapshotDiff diffA1(SnapshotDataType::Raw, + SnapshotMergeOperation::Overwrite, + offsetA1, + intDataA1.data(), + intDataA1.size()); + diffA1.operation = SnapshotMergeOperation::Sum; + diffA1.dataType = SnapshotDataType::Int; - faabric::util::SnapshotDiff diffA2( - offsetA2, intDataA2.data(), intDataA2.size()); - diffA2.operation = faabric::util::SnapshotMergeOperation::Sum; - diffA2.dataType = faabric::util::SnapshotDataType::Int; + SnapshotDiff diffA2(SnapshotDataType::Raw, + SnapshotMergeOperation::Overwrite, + offsetA2, + intDataA2.data(), + intDataA2.size()); + diffA2.operation = SnapshotMergeOperation::Sum; + diffA2.dataType = SnapshotDataType::Int; diffs = { diffA1, diffA2 }; cli.pushSnapshotDiffs(snapKey, 0, diffs); @@ -221,22 +238,20 @@ TEST_CASE_METHOD(SnapshotClientServerFixture, "[snapshot]") { // Set up a snapshot - std::string snapKey = std::to_string(faabric::util::generateGid()); - faabric::util::SnapshotData snap = takeSnapshot(snapKey, 5, false); + std::string snapKey = std::to_string(generateGid()); + SnapshotData snap = takeSnapshot(snapKey, 5, false); int offset = 5; std::vector originalData; std::vector diffData; std::vector expectedData; - faabric::util::SnapshotMergeOperation operation = - faabric::util::SnapshotMergeOperation::Overwrite; - faabric::util::SnapshotDataType dataType = - faabric::util::SnapshotDataType::Raw; + SnapshotMergeOperation operation = SnapshotMergeOperation::Overwrite; + SnapshotDataType dataType = SnapshotDataType::Raw; SECTION("Integer") { - dataType = faabric::util::SnapshotDataType::Int; + dataType = SnapshotDataType::Int; int original = 0; int diff = 0; int expected = 0; @@ -247,7 +262,7 @@ TEST_CASE_METHOD(SnapshotClientServerFixture, diff = 10; expected = 110; - operation = faabric::util::SnapshotMergeOperation::Sum; + operation = SnapshotMergeOperation::Sum; } SECTION("Subtract") @@ -256,7 +271,7 @@ TEST_CASE_METHOD(SnapshotClientServerFixture, diff = 10; expected = 90; - operation = faabric::util::SnapshotMergeOperation::Subtract; + operation = SnapshotMergeOperation::Subtract; } SECTION("Product") @@ -265,7 +280,7 @@ TEST_CASE_METHOD(SnapshotClientServerFixture, diff = 20; expected = 200; - operation = faabric::util::SnapshotMergeOperation::Product; + operation = SnapshotMergeOperation::Product; } SECTION("Min") @@ -284,7 +299,7 @@ TEST_CASE_METHOD(SnapshotClientServerFixture, expected = 10; } - operation = faabric::util::SnapshotMergeOperation::Min; + operation = SnapshotMergeOperation::Min; } SECTION("Max") @@ -303,22 +318,26 @@ TEST_CASE_METHOD(SnapshotClientServerFixture, expected = 20; } - operation = faabric::util::SnapshotMergeOperation::Max; + operation = SnapshotMergeOperation::Max; } - originalData = faabric::util::valueToBytes(original); - diffData = faabric::util::valueToBytes(diff); - expectedData = faabric::util::valueToBytes(expected); + originalData = valueToBytes(original); + diffData = valueToBytes(diff); + expectedData = valueToBytes(expected); } // Put original data in place std::memcpy(snap.data + offset, originalData.data(), originalData.size()); - faabric::util::SnapshotDiff diff(offset, diffData.data(), diffData.size()); + SnapshotDiff diff(SnapshotDataType::Raw, + SnapshotMergeOperation::Overwrite, + offset, + diffData.data(), + diffData.size()); diff.operation = operation; diff.dataType = dataType; - std::vector diffs = { diff }; + std::vector diffs = { diff }; cli.pushSnapshotDiffs(snapKey, 0, diffs); // Check data is as expected diff --git a/tests/test/util/test_snapshot.cpp b/tests/test/util/test_snapshot.cpp index 2a3701ae0..cf182c66b 100644 --- a/tests/test/util/test_snapshot.cpp +++ b/tests/test/util/test_snapshot.cpp @@ -562,10 +562,26 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture, "Test cross-page ignores", "[util]") sharedMem[ignoreOffsetB + ignoreLengthB - 1] = (uint8_t)1; std::vector expectedDiffs = { - { offsetA, dataA.data(), dataA.size() }, - { offsetB, dataB.data(), dataB.size() }, - { offsetC, dataC.data(), dataC.size() }, - { offsetD, dataD.data(), dataD.size() }, + { SnapshotDataType::Raw, + SnapshotMergeOperation::Overwrite, + offsetA, + dataA.data(), + dataA.size() }, + { SnapshotDataType::Raw, + SnapshotMergeOperation::Overwrite, + offsetB, + dataB.data(), + dataB.size() }, + { SnapshotDataType::Raw, + SnapshotMergeOperation::Overwrite, + offsetC, + dataC.data(), + dataC.size() }, + { SnapshotDataType::Raw, + SnapshotMergeOperation::Overwrite, + offsetD, + dataD.data(), + dataD.size() }, }; // Check number of diffs @@ -601,10 +617,26 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture, std::memcpy(sharedMem + offsetD, dataD.data(), dataD.size()); std::vector expectedDiffs = { - { offsetA, dataA.data(), dataA.size() }, - { offsetB, dataB.data(), dataB.size() }, - { offsetC, dataC.data(), dataC.size() }, - { offsetD, dataD.data(), dataD.size() }, + { SnapshotDataType::Raw, + SnapshotMergeOperation::Overwrite, + offsetA, + dataA.data(), + dataA.size() }, + { SnapshotDataType::Raw, + SnapshotMergeOperation::Overwrite, + offsetB, + dataB.data(), + dataB.size() }, + { SnapshotDataType::Raw, + SnapshotMergeOperation::Overwrite, + offsetC, + dataC.data(), + dataC.size() }, + { SnapshotDataType::Raw, + SnapshotMergeOperation::Overwrite, + offsetD, + dataD.data(), + dataD.size() }, }; // Check number of diffs From d472329ba94011edc8a1cd77815a8a1002e7ee5d Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Tue, 9 Nov 2021 12:08:16 +0000 Subject: [PATCH 23/31] Continuing test fixes --- src/scheduler/Executor.cpp | 3 +- src/util/snapshot.cpp | 70 +++++++---- tests/test/snapshot/test_snapshot_diffs.cpp | 123 +++++++++++++++----- tests/test/util/test_snapshot.cpp | 16 +-- 4 files changed, 153 insertions(+), 59 deletions(-) diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp index 2069b071e..120a505a7 100644 --- a/src/scheduler/Executor.cpp +++ b/src/scheduler/Executor.cpp @@ -134,8 +134,7 @@ void Executor::executeTasks(std::vector msgIdxs, } // Set up shared counter for this batch of tasks - auto batchCounter = - std::make_shared>(msgIdxs.size()); + auto batchCounter = std::make_shared>(msgIdxs.size()); // Work out if we should skip the reset after this batch. This only needs to // happen when we're executing threads on the master host, in which case the diff --git a/src/util/snapshot.cpp b/src/util/snapshot.cpp index 772870be4..0d7028ad5 100644 --- a/src/util/snapshot.cpp +++ b/src/util/snapshot.cpp @@ -56,7 +56,7 @@ std::vector SnapshotData::getChangeDiffs(const uint8_t* updated, } // Work out which pages have changed (these will be sorted) - size_t nThisPages = getRequiredHostPages(size); + size_t nThisPages = getRequiredHostPages(updatedSize); std::vector dirtyPageNumbers = getDirtyPageNumbers(updated, nThisPages); @@ -97,13 +97,21 @@ std::vector SnapshotData::getChangeDiffs(const uint8_t* updated, (mergeIt->second.offset >= pageStart && mergeIt->second.offset < pageEnd)) { - SPDLOG_TRACE("Overlap with {} {} merge region at {}-{}", - snapshotDataTypeStr(mergeIt->second.dataType), - snapshotMergeOpStr(mergeIt->second.operation), - mergeIt->second.offset, - mergeIt->second.offset + mergeIt->second.length); + uint8_t* original = data; + + // If we're outside the range of the original data, pass a nullptr + if (mergeIt->second.offset > size) { + SPDLOG_TRACE( + "Checking {} {} merge region {}-{} outside original snapshot", + snapshotDataTypeStr(mergeIt->second.dataType), + snapshotMergeOpStr(mergeIt->second.operation), + mergeIt->second.offset, + mergeIt->second.offset + mergeIt->second.length); + + original = nullptr; + } - mergeIt->second.addDiffs(diffs, data, updated); + mergeIt->second.addDiffs(diffs, original, updated); mergeIt++; } } @@ -206,15 +214,26 @@ void SnapshotMergeRegion::addDiffs(std::vector& diffs, return; } - const uint8_t* updatedValue = updated + offset; - const uint8_t* originalValue = original + offset; + SPDLOG_TRACE("Checking for {} {} merge region at {}-{}", + snapshotDataTypeStr(dataType), + snapshotMergeOpStr(operation), + offset, + offset + length); switch (dataType) { case (SnapshotDataType::Int): { - // Get the original and update values - int originalInt = *(reinterpret_cast(originalValue)); + // Check if the value has changed + const uint8_t* updatedValue = updated + offset; int updatedInt = *(reinterpret_cast(updatedValue)); + if (original == nullptr) { + throw std::runtime_error( + "Do not support int operations outside original snapshot"); + } + + const uint8_t* originalValue = original + offset; + int originalInt = *(reinterpret_cast(originalValue)); + // Skip if no change if (originalInt == updatedInt) { return; @@ -277,9 +296,16 @@ void SnapshotMergeRegion::addDiffs(std::vector& diffs, // have changed bool diffInProgress = false; int diffStart = 0; - for (int b = offset; b < offset + length; b++) { - bool isDirtyByte = *(original + b) != *(updated + b); + for (int b = offset; b <= offset + length; b++) { + bool isDirtyByte = false; + + if (original == nullptr) { + isDirtyByte = true; + } else { + isDirtyByte = *(original + b) != *(updated + b); + } + SPDLOG_TRACE("BYTE {} dirty {}", b, isDirtyByte); if (isDirtyByte && !diffInProgress) { // Diff starts here if it's different and diff // not in progress @@ -292,8 +318,8 @@ void SnapshotMergeRegion::addDiffs(std::vector& diffs, SPDLOG_TRACE("Adding {} {} diff at {}-{}", snapshotDataTypeStr(dataType), snapshotMergeOpStr(operation), - offset + diffStart, - offset + diffStart + diffLength); + diffStart, + diffStart + diffLength); diffInProgress = false; diffs.emplace_back(dataType, @@ -307,17 +333,19 @@ void SnapshotMergeRegion::addDiffs(std::vector& diffs, // If we've reached the end of this region with a diff // in progress, we need to close it off if (diffInProgress) { - SPDLOG_TRACE("Adding {} {} diff at {}-{}", - snapshotDataTypeStr(dataType), - snapshotMergeOpStr(operation), - offset, - offset + length); + int finalDiffLength = (offset + length) - diffStart + 1; + SPDLOG_TRACE( + "Adding {} {} diff at {}-{} (end of region)", + snapshotDataTypeStr(dataType), + snapshotMergeOpStr(operation), + diffStart, + diffStart + finalDiffLength); diffs.emplace_back(dataType, operation, diffStart, updated + diffStart, - (length + 1) - diffStart); + finalDiffLength); } break; } diff --git a/tests/test/snapshot/test_snapshot_diffs.cpp b/tests/test/snapshot/test_snapshot_diffs.cpp index 2269d8ba7..6b4a056e3 100644 --- a/tests/test/snapshot/test_snapshot_diffs.cpp +++ b/tests/test/snapshot/test_snapshot_diffs.cpp @@ -15,10 +15,39 @@ void checkSnapshotDiff(int offset, SnapshotDiff& actual) { REQUIRE(offset == actual.offset); + REQUIRE(actual.size > 0); + REQUIRE(actual.data != nullptr); + std::vector actualData(actual.data, actual.data + actual.size); REQUIRE(data == actualData); } +TEST_CASE_METHOD(SnapshotTestFixture, + "Test no snapshot diffs if no merge regions", + "[snapshot]") +{ + std::string snapKey = "foobar123"; + int snapPages = 5; + SnapshotData snap = takeSnapshot(snapKey, snapPages, true); + + int sharedMemPages = 8; + size_t sharedMemSize = sharedMemPages * HOST_PAGE_SIZE; + uint8_t* sharedMem = allocatePages(sharedMemPages); + + reg.mapSnapshot(snapKey, sharedMem); + + // Make various changes + sharedMem[0] = 1; + sharedMem[2 * HOST_PAGE_SIZE] = 1; + sharedMem[3 * HOST_PAGE_SIZE + 10] = 1; + sharedMem[8 * HOST_PAGE_SIZE - 20] = 1; + + // Check there are no diffs + std::vector changeDiffs = + snap.getChangeDiffs(sharedMem, sharedMemSize); + REQUIRE(changeDiffs.empty()); +} + TEST_CASE_METHOD(SnapshotTestFixture, "Test snapshot diffs", "[snapshot]") { std::string snapKey = "foobar123"; @@ -37,28 +66,69 @@ TEST_CASE_METHOD(SnapshotTestFixture, "Test snapshot diffs", "[snapshot]") // Reset dirty tracking faabric::util::resetDirtyTracking(); - // Set up some chunks of data to write into the memory + // Single change, single merge region std::vector dataA = { 1, 2, 3, 4 }; - std::vector dataB = { 4, 5, 6 }; - std::vector dataC = { 7, 6, 5, 4, 3 }; - std::vector dataD = { 1, 1, 1, 1 }; - - // Set up some offsets, both on and over page boundaries int offsetA = HOST_PAGE_SIZE; - int offsetB = HOST_PAGE_SIZE + 20; - int offsetC = 2 * HOST_PAGE_SIZE - 2; - int offsetD = 3 * HOST_PAGE_SIZE - dataD.size(); - - // Write the data std::memcpy(sharedMem + offsetA, dataA.data(), dataA.size()); - std::memcpy(sharedMem + offsetB, dataB.data(), dataB.size()); + + snap.addMergeRegion(offsetA, + dataA.size(), + SnapshotDataType::Raw, + SnapshotMergeOperation::Overwrite); + + // NOTE - deliberately add merge regions out of order + // Diff starting in merge region and overlapping the end + std::vector dataC = { 7, 6, 5, 4, 3, 2, 1 }; + std::vector expectedDataC = { 7, 6, 5, 4, 3 }; + int offsetC = 2 * HOST_PAGE_SIZE; std::memcpy(sharedMem + offsetC, dataC.data(), dataC.size()); + + int regionOffsetC = offsetC - 3; + snap.addMergeRegion(regionOffsetC, + dataC.size(), + SnapshotDataType::Raw, + SnapshotMergeOperation::Overwrite); + + // Two changes in single merge region + std::vector dataB1 = { 4, 5, 6 }; + std::vector dataB2 = { 7, 6, 5 }; + int offsetB1 = HOST_PAGE_SIZE + 10; + int offsetB2 = HOST_PAGE_SIZE + 16; + std::memcpy(sharedMem + offsetB1, dataB1.data(), dataB1.size()); + std::memcpy(sharedMem + offsetB2, dataB2.data(), dataB2.size()); + + snap.addMergeRegion(offsetB1, + (offsetB2 - offsetB1) + dataB2.size() + 10, + SnapshotDataType::Raw, + SnapshotMergeOperation::Overwrite); + + // Merge region within a change + std::vector dataD = { 1, 1, 2, 2, 3, 3, 4 }; + std::vector expectedDataD = { 2, 2, 3, 3 }; + int offsetD = 3 * HOST_PAGE_SIZE - dataD.size(); std::memcpy(sharedMem + offsetD, dataD.data(), dataD.size()); - // Write the data to the region that exceeds the size of the original - std::vector dataExtra( - (sharedMemPages - snapPages) * HOST_PAGE_SIZE, 5); - std::memcpy(sharedMem + snapSize, dataExtra.data(), dataExtra.size()); + int regionOffsetD = offsetD + 2; + int regionSizeD = dataD.size() - 4; + snap.addMergeRegion(regionOffsetD, + regionSizeD, + SnapshotDataType::Raw, + SnapshotMergeOperation::Overwrite); + + // Write some data to the region that exceeds the size of the original, then + // add a merge region larger than it. Anything outside the original snapshot + // should be marked as changed. + std::vector dataExtra = { 2, 2, 2 }; + std::vector expectedDataExtra = { 0, 0, 2, 2, 2, 0, 0, 0 }; + int extraOffset = snapSize + HOST_PAGE_SIZE + 10; + std::memcpy(sharedMem + extraOffset, dataExtra.data(), dataExtra.size()); + + int extraRegionOffset = extraOffset - 2; + int extraRegionSize = dataExtra.size() + 4; + snap.addMergeRegion(extraRegionOffset, + extraRegionSize, + SnapshotDataType::Raw, + SnapshotMergeOperation::Overwrite); // Include an offset which doesn't change the data, but will register a // dirty page @@ -73,25 +143,20 @@ TEST_CASE_METHOD(SnapshotTestFixture, "Test snapshot diffs", "[snapshot]") // Check shared memory does have dirty pages (including the non-change) std::vector sharedDirtyPages = getDirtyPageNumbers(sharedMem, sharedMemPages); - std::vector expected = { 1, 2, 3, 5, 6, 7 }; + std::vector expected = { 1, 2, 3, 6 }; REQUIRE(sharedDirtyPages == expected); - // Check change diffs note that diffs across page boundaries will be split - // into two + // Check we have the right number of diffs std::vector changeDiffs = snap.getChangeDiffs(sharedMem, sharedMemSize); - REQUIRE(changeDiffs.size() == 6); - // One chunk will be split over 2 pages - std::vector dataCPart1 = { 7, 6 }; - std::vector dataCPart2 = { 5, 4, 3 }; - int offsetC2 = 2 * HOST_PAGE_SIZE; + REQUIRE(changeDiffs.size() == 6); checkSnapshotDiff(offsetA, dataA, changeDiffs.at(0)); - checkSnapshotDiff(offsetB, dataB, changeDiffs.at(1)); - checkSnapshotDiff(offsetC, dataCPart1, changeDiffs.at(2)); - checkSnapshotDiff(offsetC2, dataCPart2, changeDiffs.at(3)); - checkSnapshotDiff(offsetD, dataD, changeDiffs.at(4)); - checkSnapshotDiff(snapSize, dataExtra, changeDiffs.at(5)); + checkSnapshotDiff(offsetB1, dataB1, changeDiffs.at(1)); + checkSnapshotDiff(offsetB2, dataB2, changeDiffs.at(2)); + checkSnapshotDiff(offsetC, expectedDataC, changeDiffs.at(3)); + checkSnapshotDiff(regionOffsetD, expectedDataD, changeDiffs.at(4)); + checkSnapshotDiff(extraRegionOffset, expectedDataExtra, changeDiffs.at(5)); } } diff --git a/tests/test/util/test_snapshot.cpp b/tests/test/util/test_snapshot.cpp index cf182c66b..b86724c6e 100644 --- a/tests/test/util/test_snapshot.cpp +++ b/tests/test/util/test_snapshot.cpp @@ -66,7 +66,7 @@ class SnapshotMergeTestFixture : public SnapshotTestFixture TEST_CASE_METHOD(SnapshotMergeTestFixture, "Detailed test snapshot merge regions with ints", - "[util]") + "[snapshot][util]") { std::string snapKey = "foobar123"; int snapPages = 5; @@ -177,7 +177,7 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture, TEST_CASE_METHOD(SnapshotMergeTestFixture, "Test edge-cases of snapshot merge regions", - "[util]") + "[snapshot][util]") { // Region edge cases: // - start @@ -286,7 +286,7 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture, TEST_CASE_METHOD(SnapshotMergeTestFixture, "Test snapshot merge regions", - "[util]") + "[snapshot][util]") { std::string snapKey = "foobar123"; int snapPages = 5; @@ -435,7 +435,7 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture, TEST_CASE_METHOD(SnapshotMergeTestFixture, "Test invalid snapshot merges", - "[util]") + "[snapshot][util]") { std::string snapKey = "foobar123"; int snapPages = 3; @@ -500,7 +500,9 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture, deallocatePages(snap.data, snapPages); } -TEST_CASE_METHOD(SnapshotMergeTestFixture, "Test cross-page ignores", "[util]") +TEST_CASE_METHOD(SnapshotMergeTestFixture, + "Test cross-page ignores", + "[snapshot][util]") { int snapPages = 6; size_t sharedMemSize = snapPages * HOST_PAGE_SIZE; @@ -593,7 +595,7 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture, "Test cross-page ignores", "[util]") TEST_CASE_METHOD(SnapshotMergeTestFixture, "Test fine-grained byte-wise diffs", - "[util]") + "[snapshot][util]") { int snapPages = 3; size_t sharedMemSize = snapPages * HOST_PAGE_SIZE; @@ -648,7 +650,7 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture, TEST_CASE_METHOD(SnapshotMergeTestFixture, "Test mix of applicable and non-applicable merge regions", - "[util]") + "[snapshot][util]") { int snapPages = 6; size_t sharedMemSize = snapPages * HOST_PAGE_SIZE; From e46fe940e13d5cac8835e96be4492cbd91311023 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Tue, 9 Nov 2021 12:23:57 +0000 Subject: [PATCH 24/31] Remove ignore regions --- include/faabric/util/snapshot.h | 1 - src/util/snapshot.cpp | 8 --- tests/test/util/test_snapshot.cpp | 93 ------------------------------- 3 files changed, 102 deletions(-) diff --git a/include/faabric/util/snapshot.h b/include/faabric/util/snapshot.h index 6cef3d21b..6d0c2748e 100644 --- a/include/faabric/util/snapshot.h +++ b/include/faabric/util/snapshot.h @@ -20,7 +20,6 @@ enum SnapshotDataType enum SnapshotMergeOperation { Overwrite, - Ignore, Sum, Product, Subtract, diff --git a/src/util/snapshot.cpp b/src/util/snapshot.cpp index 0d7028ad5..8763a53b7 100644 --- a/src/util/snapshot.cpp +++ b/src/util/snapshot.cpp @@ -177,9 +177,6 @@ std::string snapshotDataTypeStr(SnapshotDataType dt) std::string snapshotMergeOpStr(SnapshotMergeOperation op) { switch (op) { - case (SnapshotMergeOperation::Ignore): { - return "Ignore"; - } case (SnapshotMergeOperation::Max): { return "Max"; } @@ -209,11 +206,6 @@ void SnapshotMergeRegion::addDiffs(std::vector& diffs, const uint8_t* original, const uint8_t* updated) { - // Skip an ignore - if (operation == SnapshotMergeOperation::Ignore) { - return; - } - SPDLOG_TRACE("Checking for {} {} merge region at {}-{}", snapshotDataTypeStr(dataType), snapshotMergeOpStr(operation), diff --git a/tests/test/util/test_snapshot.cpp b/tests/test/util/test_snapshot.cpp index b86724c6e..9b7224ab4 100644 --- a/tests/test/util/test_snapshot.cpp +++ b/tests/test/util/test_snapshot.cpp @@ -500,99 +500,6 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture, deallocatePages(snap.data, snapPages); } -TEST_CASE_METHOD(SnapshotMergeTestFixture, - "Test cross-page ignores", - "[snapshot][util]") -{ - int snapPages = 6; - size_t sharedMemSize = snapPages * HOST_PAGE_SIZE; - uint8_t* sharedMem = setUpSnapshot(snapPages); - - // Add ignore regions that cover multiple pages - int ignoreOffsetA = HOST_PAGE_SIZE + 100; - int ignoreLengthA = 2 * HOST_PAGE_SIZE; - snap.addMergeRegion(ignoreOffsetA, - ignoreLengthA, - faabric::util::SnapshotDataType::Raw, - faabric::util::SnapshotMergeOperation::Ignore); - - int ignoreOffsetB = sharedMemSize - (HOST_PAGE_SIZE + 10); - int ignoreLengthB = 30; - snap.addMergeRegion(ignoreOffsetB, - ignoreLengthB, - faabric::util::SnapshotDataType::Raw, - faabric::util::SnapshotMergeOperation::Ignore); - - // Add some modifications that will cause diffs, and some should be ignored - std::vector dataA(10, 1); - std::vector dataB(1, 1); - std::vector dataC(1, 1); - std::vector dataD(3, 1); - - // Not ignored, start of memory - uint32_t offsetA = 0; - std::memcpy(sharedMem + offsetA, dataA.data(), dataA.size()); - - // Not ignored, just before first ignore region - uint32_t offsetB = ignoreOffsetA - 1; - std::memcpy(sharedMem + offsetB, dataB.data(), dataB.size()); - - // Not ignored, just after first ignore region - uint32_t offsetC = ignoreOffsetA + ignoreLengthA; - std::memcpy(sharedMem + offsetC, dataC.data(), dataC.size()); - - // Not ignored, just before second ignore region - uint32_t offsetD = ignoreOffsetB - 4; - std::memcpy(sharedMem + offsetD, dataD.data(), dataD.size()); - - // Ignored, start of first ignore region - sharedMem[ignoreOffsetA] = (uint8_t)1; - - // Ignored, just before page boundary in ignore region - sharedMem[(2 * HOST_PAGE_SIZE) - 1] = (uint8_t)1; - - // Ignored, just after page boundary in ignore region - sharedMem[(2 * HOST_PAGE_SIZE)] = (uint8_t)1; - - // Deliberately don't put any changes after the next page boundary to check - // that it rolls over properly - - // Ignored, just inside second region - sharedMem[ignoreOffsetB + 2] = (uint8_t)1; - - // Ignored, end of second region - sharedMem[ignoreOffsetB + ignoreLengthB - 1] = (uint8_t)1; - - std::vector expectedDiffs = { - { SnapshotDataType::Raw, - SnapshotMergeOperation::Overwrite, - offsetA, - dataA.data(), - dataA.size() }, - { SnapshotDataType::Raw, - SnapshotMergeOperation::Overwrite, - offsetB, - dataB.data(), - dataB.size() }, - { SnapshotDataType::Raw, - SnapshotMergeOperation::Overwrite, - offsetC, - dataC.data(), - dataC.size() }, - { SnapshotDataType::Raw, - SnapshotMergeOperation::Overwrite, - offsetD, - dataD.data(), - dataD.size() }, - }; - - // Check number of diffs - std::vector actualDiffs = - snap.getChangeDiffs(sharedMem, sharedMemSize); - - checkDiffs(actualDiffs, expectedDiffs); -} - TEST_CASE_METHOD(SnapshotMergeTestFixture, "Test fine-grained byte-wise diffs", "[snapshot][util]") From dd7b6b833daf7c930101479bd90981e3b709ab21 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Tue, 9 Nov 2021 15:37:38 +0000 Subject: [PATCH 25/31] Add distributed locking test --- src/transport/PointToPointBroker.cpp | 81 ++++++++++------ tests/dist/transport/functions.cpp | 98 ++++++++++++++++++++ tests/dist/transport/test_coordination.cpp | 34 +++++++ tests/test/transport/test_point_to_point.cpp | 2 - tests/test/util/test_snapshot.cpp | 91 ++++++++---------- 5 files changed, 227 insertions(+), 79 deletions(-) create mode 100644 tests/dist/transport/test_coordination.cpp diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp index 448b94429..cb4c4ec2a 100644 --- a/src/transport/PointToPointBroker.cpp +++ b/src/transport/PointToPointBroker.cpp @@ -134,24 +134,25 @@ PointToPointGroup::PointToPointGroup(int appIdIn, void PointToPointGroup::lock(int groupIdx, bool recursive) { - std::string host = + std::string masterHost = ptpBroker.getHostForReceiver(groupId, POINT_TO_POINT_MASTER_IDX); + std::string lockerHost = ptpBroker.getHostForReceiver(groupId, groupIdx); - if (host == conf.endpointHost) { + bool masterIsLocal = masterHost == conf.endpointHost; + bool lockerIsLocal = lockerHost == conf.endpointHost; + + // If we're on the master, we need to try and acquire the lock, otherwise we + // send a remote request + if (masterIsLocal) { bool acquiredLock = false; { faabric::util::UniqueLock lock(mx); - if (recursive) { - bool isFree = recursiveLockOwners.empty(); - bool lockOwnedByThisIdx = - !isFree && (recursiveLockOwners.top() == groupIdx); - - if (isFree || lockOwnedByThisIdx) { - // Recursive and either free, or already locked by this idx - recursiveLockOwners.push(groupIdx); - acquiredLock = true; - } + if (recursive && (recursiveLockOwners.empty() || + recursiveLockOwners.top() == groupIdx)) { + // Recursive and either free, or already locked by this idx + recursiveLockOwners.push(groupIdx); + acquiredLock = true; } else if (lockOwnerIdx == NO_LOCK_OWNER_IDX) { // Non-recursive and free lockOwnerIdx = groupIdx; @@ -159,25 +160,52 @@ void PointToPointGroup::lock(int groupIdx, bool recursive) } } - if (acquiredLock) { - SPDLOG_TRACE("Group idx {}, locked {} (recursive {})", + if (acquiredLock && lockerIsLocal) { + // Nothing to do now + SPDLOG_TRACE("Group idx {} ({}), locally locked {} (recursive {})", + groupIdx, + lockerHost, + groupId, + recursive); + + } else if (acquiredLock) { + SPDLOG_TRACE("Group idx {} ({}), remotely locked {} (recursive {})", groupIdx, + lockerHost, groupId, recursive); + // Notify remote locker that they've acquired the lock notifyLocked(groupIdx); } else { - SPDLOG_TRACE( - "Group idx {}, lock {} already locked({} waiters, recursive {})", - groupIdx, - groupId, - lockWaiters.size(), - recursive); - + // Need to wait to get the lock lockWaiters.push(groupIdx); + + // Wait here if local, otherwise the remote end will pick up the + // message + if (lockerIsLocal) { + SPDLOG_TRACE( + "Group idx {} ({}), locally awaiting lock {} (recursive {})", + groupIdx, + lockerHost, + groupId, + recursive); + + ptpBroker.recvMessage( + groupId, POINT_TO_POINT_MASTER_IDX, groupIdx); + } else { + // Notify remote locker that they've acquired the lock + SPDLOG_TRACE( + "Group idx {} ({}), remotely awaiting lock {} (recursive {})", + groupIdx, + lockerHost, + groupId, + masterHost, + recursive); + } } } else { - auto cli = getClient(host); + auto cli = getClient(masterHost); faabric::PointToPointMessage msg; msg.set_groupid(groupId); msg.set_sendidx(groupIdx); @@ -187,13 +215,14 @@ void PointToPointGroup::lock(int groupIdx, bool recursive) groupId, groupIdx, POINT_TO_POINT_MASTER_IDX, - host); + masterHost); + // Send the remote request and await the message saying it's been + // acquired cli->groupLock(appId, groupId, groupIdx, recursive); - } - // Await ptp response - ptpBroker.recvMessage(groupId, POINT_TO_POINT_MASTER_IDX, groupIdx); + ptpBroker.recvMessage(groupId, POINT_TO_POINT_MASTER_IDX, groupIdx); + } } void PointToPointGroup::localLock() diff --git a/tests/dist/transport/functions.cpp b/tests/dist/transport/functions.cpp index 559fc75cd..f145e7d14 100644 --- a/tests/dist/transport/functions.cpp +++ b/tests/dist/transport/functions.cpp @@ -1,6 +1,10 @@ #include #include "DistTestExecutor.h" +#include "faabric/scheduler/Scheduler.h" +#include "faabric/util/func.h" +#include "faabric/util/gids.h" +#include "faabric/util/scheduling.h" #include "faabric_utils.h" #include "init.h" @@ -9,6 +13,7 @@ #include #include +using namespace faabric::transport; using namespace faabric::util; namespace tests { @@ -57,6 +62,94 @@ int handlePointToPointFunction( return 0; } +int handleDistributedLock(faabric::scheduler::Executor* exec, + int threadPoolIdx, + int msgIdx, + std::shared_ptr req) +{ + // We need sufficient concurrency here to show up bugs every time + int nWorkers = 10; + int nLoops = 30; + + std::string sharedStateKey = "dist-lock-test"; + + faabric::Message& msg = req->mutable_messages()->at(msgIdx); + + faabric::state::State& state = state::getGlobalState(); + std::shared_ptr stateKv = + state.getKV(msg.user(), sharedStateKey, sizeof(int32_t)); + + if (msg.function() == "lock") { + int initialValue = 0; + int groupId = faabric::util::generateGid(); + + stateKv->set(BYTES(&initialValue)); + + std::shared_ptr nestedReq = + faabric::util::batchExecFactory("ptp", "lock-worker", nWorkers); + for (int i = 0; i < nWorkers; i++) { + faabric::Message& m = nestedReq->mutable_messages()->at(i); + m.set_groupid(groupId); + m.set_groupidx(i); + } + + faabric::scheduler::Scheduler& sch = faabric::scheduler::getScheduler(); + faabric::util::SchedulingDecision decision = + sch.callFunctions(nestedReq); + + // Await results + bool success = true; + for (int msgId : decision.messageIds) { + faabric::Message res = sch.getFunctionResult(msgId, 30000); + if (res.returnvalue() != 0) { + success = false; + } + } + + int finalValue = *(int*)stateKv->get(); + int expectedValue = nWorkers * nLoops; + if (finalValue != expectedValue) { + SPDLOG_ERROR("Distributed lock test failed: {} != {}", + finalValue, + expectedValue); + success = false; + } else { + SPDLOG_ERROR("Distributed lock succeeded, result {}", finalValue); + } + + return success ? 0 : 1; + } + + // Here we want to do something that will mess up if the locking isn't + // working properly, so we perform incremental updates to a bit of shared + // state using a global lock in a tight loop. + std::shared_ptr group = + faabric::transport::PointToPointGroup::getGroup(msg.groupid()); + + for (int i = 0; i < nLoops; i++) { + // Get the lock + group->lock(msg.groupidx(), false); + + // Pull the value + stateKv->pull(); + int* originalValue = (int*)stateKv->get(); + + // Short sleep + int sleepTimeMs = std::rand() % 50; + SLEEP_MS(sleepTimeMs); + + // Increment and push + int newValue = *originalValue + 1; + stateKv->set(BYTES(&newValue)); + stateKv->pushFull(); + + // Unlock + group->unlock(msg.groupidx(), false); + } + + return 0; +} + class DistributedCoordinationTestRunner { public: @@ -322,6 +415,11 @@ void registerTransportTestFunctions() registerDistTestExecutorCallback( "ptp", "barrier-worker", handleDistributedBarrierWorker); + registerDistTestExecutorCallback("ptp", "lock", handleDistributedLock); + + registerDistTestExecutorCallback( + "ptp", "lock-worker", handleDistributedLock); + registerDistTestExecutorCallback("ptp", "notify", handleDistributedNotify); registerDistTestExecutorCallback( diff --git a/tests/dist/transport/test_coordination.cpp b/tests/dist/transport/test_coordination.cpp new file mode 100644 index 000000000..f25608883 --- /dev/null +++ b/tests/dist/transport/test_coordination.cpp @@ -0,0 +1,34 @@ +#include + +#include "faabric_utils.h" +#include "fixtures.h" +#include "init.h" + +#include +#include +#include +#include +#include +#include + +namespace tests { + +TEST_CASE_METHOD(DistTestsFixture, "Test distributed lock", "[ptp][transport]") +{ + // Set up this host's resources + int nLocalSlots = 5; + faabric::HostResources res; + res.set_slots(nLocalSlots); + sch.setThisHostResources(res); + + // Set up the request + std::shared_ptr req = + faabric::util::batchExecFactory("ptp", "lock", 1); + + sch.callFunctions(req); + + faabric::Message& m = req->mutable_messages()->at(0); + faabric::Message result = sch.getFunctionResult(m.id(), 30000); + REQUIRE(result.returnvalue() == 0); +} +} diff --git a/tests/test/transport/test_point_to_point.cpp b/tests/test/transport/test_point_to_point.cpp index 01d58340d..2782cdfac 100644 --- a/tests/test/transport/test_point_to_point.cpp +++ b/tests/test/transport/test_point_to_point.cpp @@ -353,8 +353,6 @@ TEST_CASE_METHOD(PointToPointClientServerFixture, server.setRequestLatch(); cli.groupLock(appId, groupId, groupIdx, recursive); server.awaitRequestLatch(); - - broker.recvMessage(groupId, POINT_TO_POINT_MASTER_IDX, groupIdx); } REQUIRE(group->getLockOwner(recursive) == groupIdx); diff --git a/tests/test/util/test_snapshot.cpp b/tests/test/util/test_snapshot.cpp index 9b7224ab4..2f5e06007 100644 --- a/tests/test/util/test_snapshot.cpp +++ b/tests/test/util/test_snapshot.cpp @@ -141,37 +141,28 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture, REQUIRE(*intBOriginal == originalValueB); // Check diffs themselves - REQUIRE(actualDiffs.size() == 3); + REQUIRE(actualDiffs.size() == 2); SnapshotDiff diffA = actualDiffs.at(0); SnapshotDiff diffB = actualDiffs.at(1); - SnapshotDiff diffOther = actualDiffs.at(2); REQUIRE(diffA.offset == intAOffset); REQUIRE(diffB.offset == intBOffset); - REQUIRE(diffOther.offset == otherOffset); REQUIRE(diffA.operation == SnapshotMergeOperation::Sum); REQUIRE(diffB.operation == SnapshotMergeOperation::Sum); - REQUIRE(diffOther.operation == SnapshotMergeOperation::Overwrite); REQUIRE(diffA.dataType == SnapshotDataType::Int); REQUIRE(diffB.dataType == SnapshotDataType::Int); - REQUIRE(diffOther.dataType == SnapshotDataType::Raw); REQUIRE(diffA.size == sizeof(int32_t)); REQUIRE(diffB.size == sizeof(int32_t)); - REQUIRE(diffOther.size == otherData.size()); // Check that original values have been subtracted from final values for // sums REQUIRE(*(int*)diffA.data == sumValueA); REQUIRE(*(int*)diffB.data == sumValueB); - std::vector actualOtherData(diffOther.data, - diffOther.data + diffOther.size); - REQUIRE(actualOtherData == otherData); - deallocatePages(snap.data, snapPages); } @@ -307,8 +298,6 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture, faabric::util::SnapshotMergeOperation::Overwrite; size_t dataLength = 0; - int expectedNumDiffs = 1; - SECTION("Integer") { int originalValue = 0; @@ -370,25 +359,16 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture, SECTION("Raw") { - dataLength = 2 * sizeof(int32_t); + dataLength = 100; originalData = std::vector(dataLength, 3); - updatedData = originalData; - expectedData = originalData; + updatedData = std::vector(dataLength, 4); + expectedData = updatedData; dataType = faabric::util::SnapshotDataType::Raw; - operation = faabric::util::SnapshotMergeOperation::Overwrite; - SECTION("Ignore") + SECTION("Overwrite") { - operation = faabric::util::SnapshotMergeOperation::Ignore; - - // Scatter some modifications through the updated data, to make sure - // none are picked up - updatedData[0] = 1; - updatedData[sizeof(int32_t) - 2] = 1; - updatedData[sizeof(int32_t) + 10] = 1; - - expectedNumDiffs = 0; + operation = faabric::util::SnapshotMergeOperation::Overwrite; } } @@ -417,18 +397,15 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture, std::vector actualDiffs = snap.getChangeDiffs(sharedMem, sharedMemSize); - // Check number of diffs - REQUIRE(actualDiffs.size() == expectedNumDiffs); + // Check diff + REQUIRE(actualDiffs.size() == 1); + std::vector expectedDiffs = { { dataType, + operation, + offset, + expectedData.data(), + expectedData.size() } }; - if (expectedNumDiffs == 1) { - std::vector expectedDiffs = { { dataType, - operation, - offset, - expectedData.data(), - expectedData.size() } }; - - checkDiffs(actualDiffs, expectedDiffs); - } + checkDiffs(actualDiffs, expectedDiffs); deallocatePages(snap.data, snapPages); } @@ -548,6 +525,12 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture, dataD.size() }, }; + // Add a single merge region for all the changes + snap.addMergeRegion(0, + offsetD + dataD.size() + 20, + SnapshotDataType::Raw, + SnapshotMergeOperation::Overwrite); + // Check number of diffs std::vector actualDiffs = snap.getChangeDiffs(sharedMem, sharedMemSize); @@ -564,32 +547,33 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture, uint8_t* sharedMem = setUpSnapshot(snapPages); // Add a couple of merge regions on each page, which should be skipped as - // they won't cover any changes + // they won't overlap any changes for (int i = 0; i < snapPages; i++) { - // Ignore - int iOff = i * HOST_PAGE_SIZE; - snap.addMergeRegion(iOff, + // Overwrite + int skippedOverwriteOffset = i * HOST_PAGE_SIZE; + snap.addMergeRegion(skippedOverwriteOffset, 10, faabric::util::SnapshotDataType::Raw, - faabric::util::SnapshotMergeOperation::Ignore); + faabric::util::SnapshotMergeOperation::Overwrite); // Sum - int sOff = ((i + 1) * HOST_PAGE_SIZE) - (2 * sizeof(int32_t)); - snap.addMergeRegion(sOff, + int skippedSumOffset = + ((i + 1) * HOST_PAGE_SIZE) - (2 * sizeof(int32_t)); + snap.addMergeRegion(skippedSumOffset, sizeof(int32_t), faabric::util::SnapshotDataType::Int, faabric::util::SnapshotMergeOperation::Sum); } - // Add an ignore region that should take effect, along with a corresponding - // change to be ignored - uint32_t ignoreA = (2 * HOST_PAGE_SIZE) + 2; - snap.addMergeRegion(ignoreA, + // Add an overwrite region that should take effect + uint32_t overwriteAOffset = (2 * HOST_PAGE_SIZE) + 20; + snap.addMergeRegion(overwriteAOffset, 20, faabric::util::SnapshotDataType::Raw, - faabric::util::SnapshotMergeOperation::Ignore); - std::vector dataA(10, 1); - std::memcpy(sharedMem + ignoreA, dataA.data(), dataA.size()); + faabric::util::SnapshotMergeOperation::Overwrite); + std::vector overwriteData(10, 1); + std::memcpy( + sharedMem + overwriteAOffset, overwriteData.data(), overwriteData.size()); // Add a sum region and data that should also take effect uint32_t sumOffset = (4 * HOST_PAGE_SIZE) + 100; @@ -605,6 +589,11 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture, // Check diffs std::vector expectedDiffs = { + { faabric::util::SnapshotDataType::Raw, + faabric::util::SnapshotMergeOperation::Overwrite, + overwriteAOffset, + BYTES(overwriteData.data()), + overwriteData.size() }, { faabric::util::SnapshotDataType::Int, faabric::util::SnapshotMergeOperation::Sum, sumOffset, From ebc21eb427f2928f5c91d5d274635313cff670b4 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Tue, 9 Nov 2021 15:42:44 +0000 Subject: [PATCH 26/31] Formatting --- src/transport/PointToPointServer.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transport/PointToPointServer.cpp b/src/transport/PointToPointServer.cpp index e2dd6fa80..32db8cac7 100644 --- a/src/transport/PointToPointServer.cpp +++ b/src/transport/PointToPointServer.cpp @@ -29,10 +29,10 @@ void PointToPointServer::doAsyncRecv(int header, // Send the message locally to the downstream socket broker.sendMessage(msg.groupid(), - msg.sendidx(), - msg.recvidx(), - BYTES_CONST(msg.data().c_str()), - msg.data().size()); + msg.sendidx(), + msg.recvidx(), + BYTES_CONST(msg.data().c_str()), + msg.data().size()); break; } case faabric::transport::PointToPointCall::LOCK_GROUP: { From 11473c824b0d0acd6e582d8b819325196e0b9135 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Wed, 10 Nov 2021 08:06:25 +0000 Subject: [PATCH 27/31] Fix dist tests and remove unnecessary unit test --- src/scheduler/Executor.cpp | 9 ++- src/util/snapshot.cpp | 6 ++ tests/dist/scheduler/functions.cpp | 65 ++++++++++++++----- tests/test/scheduler/test_executor.cpp | 88 -------------------------- 4 files changed, 60 insertions(+), 108 deletions(-) diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp index 120a505a7..a8a1d27fd 100644 --- a/src/scheduler/Executor.cpp +++ b/src/scheduler/Executor.cpp @@ -163,9 +163,9 @@ void Executor::executeTasks(std::vector msgIdxs, threadPoolIdx = msg.appidx() % threadPoolSize; } - SPDLOG_WARN("Overloaded app index {} to thread {}", - msg.appidx(), - threadPoolIdx); + SPDLOG_DEBUG("Overloaded app index {} to thread {}", + msg.appidx(), + threadPoolIdx); } else { // Take next from those that are available threadPoolIdx = *availablePoolThreads.begin(); @@ -274,6 +274,9 @@ void Executor::threadPoolThread(int threadPoolIdx) faabric::snapshot::getSnapshotRegistry().getSnapshot( msg.snapshotkey()); + SPDLOG_TRACE("Diffing pre and post execution snapshots for {}", + msg.snapshotkey()); + std::vector diffs = snapshotPreExecution.getChangeDiffs(snapshotPostExecution.data, snapshotPostExecution.size); diff --git a/src/util/snapshot.cpp b/src/util/snapshot.cpp index 8763a53b7..595061220 100644 --- a/src/util/snapshot.cpp +++ b/src/util/snapshot.cpp @@ -153,6 +153,12 @@ void SnapshotData::addMergeRegion(uint32_t offset, snapshotMergeOpStr(operation), region.offset, region.offset + length); + } else { + SPDLOG_DEBUG("Adding new {} {} merge region at {}-{}", + snapshotDataTypeStr(dataType), + snapshotMergeOpStr(operation), + region.offset, + region.offset + length); } mergeRegions[region.offset] = region; diff --git a/tests/dist/scheduler/functions.cpp b/tests/dist/scheduler/functions.cpp index 28436f7f4..a91ad0405 100644 --- a/tests/dist/scheduler/functions.cpp +++ b/tests/dist/scheduler/functions.cpp @@ -14,6 +14,7 @@ #include #include #include +#include namespace tests { @@ -60,20 +61,31 @@ int handleFakeDiffsFunction(faabric::scheduler::Executor* exec, { faabric::Message& msg = req->mutable_messages()->at(msgIdx); - faabric::util::SnapshotData snap = exec->snapshot(); - std::string msgInput = msg.inputdata(); std::string snapshotKey = msg.snapshotkey(); - // Modify the executor's memory + faabric::snapshot::SnapshotRegistry& reg = + faabric::snapshot::getSnapshotRegistry(); + + faabric::util::SnapshotData& originalSnap = reg.getSnapshot(snapshotKey); + faabric::util::SnapshotData updatedSnap = exec->snapshot(); + + // Add a single merge region to catch both diffs + int offsetA = 10; + int offsetB = 100; std::vector inputBytes = faabric::util::stringToBytes(msgInput); - std::vector keyBytes = faabric::util::stringToBytes(snapshotKey); - uint32_t offsetA = 10; - uint32_t offsetB = 100; + originalSnap.addMergeRegion( + 0, + offsetB + inputBytes.size() + 10, + faabric::util::SnapshotDataType::Raw, + faabric::util::SnapshotMergeOperation::Overwrite); - std::memcpy(snap.data + offsetA, keyBytes.data(), keyBytes.size()); - std::memcpy(snap.data + offsetB, inputBytes.data(), inputBytes.size()); + // Modify the executor's memory + std::vector keyBytes = faabric::util::stringToBytes(snapshotKey); + std::memcpy(updatedSnap.data + offsetA, keyBytes.data(), keyBytes.size()); + std::memcpy( + updatedSnap.data + offsetB, inputBytes.data(), inputBytes.size()); return 123; } @@ -89,6 +101,9 @@ int handleFakeDiffsThreadedFunction( std::string snapshotKey = "fake-diffs-threaded-snap"; std::string msgInput = msg.inputdata(); + faabric::snapshot::SnapshotRegistry& reg = + faabric::snapshot::getSnapshotRegistry(); + // This function creates a snapshot, then spawns some child threads that // will modify the shared memory. It then awaits the results and checks that // the modifications are synced back to the original host. @@ -104,8 +119,6 @@ int handleFakeDiffsThreadedFunction( snap.data = snapMemory; snap.size = snapSize; - faabric::snapshot::SnapshotRegistry& reg = - faabric::snapshot::getSnapshotRegistry(); reg.takeSnapshot(snapshotKey, snap); auto req = @@ -121,7 +134,7 @@ int handleFakeDiffsThreadedFunction( // Make a small modification to a page that will also be edited by // the child thread to make sure it's not overwritten std::vector localChange(3, i); - uint32_t offset = 2 * i * faabric::util::HOST_PAGE_SIZE; + int offset = 2 * i * faabric::util::HOST_PAGE_SIZE; std::memcpy( snapMemory + offset, localChange.data(), localChange.size()); } @@ -157,7 +170,7 @@ int handleFakeDiffsThreadedFunction( for (int i = 0; i < nThreads; i++) { // Check local modifications std::vector expectedLocal(3, i); - uint32_t localOffset = 2 * i * faabric::util::HOST_PAGE_SIZE; + int localOffset = 2 * i * faabric::util::HOST_PAGE_SIZE; std::vector actualLocal(snapMemory + localOffset, snapMemory + localOffset + expectedLocal.size()); @@ -168,7 +181,7 @@ int handleFakeDiffsThreadedFunction( } // Check remote modifications - uint32_t offset = 2 * i * faabric::util::HOST_PAGE_SIZE + 10; + int offset = 2 * i * faabric::util::HOST_PAGE_SIZE + 10; std::string expectedData("thread_" + std::to_string(i)); auto* charPtr = reinterpret_cast(snapMemory + offset); std::string actual(charPtr); @@ -185,15 +198,33 @@ int handleFakeDiffsThreadedFunction( } } else { + // This is the code that will be executed by the remote threads. + // Add a merge region to catch the modification int idx = msg.appidx(); - uint32_t offset = 2 * idx * faabric::util::HOST_PAGE_SIZE + 10; - // Modify the executor's memory + int regionOffset = 2 * idx * faabric::util::HOST_PAGE_SIZE; + int changeOffset = regionOffset + 10; + + // Get the input data std::vector inputBytes = faabric::util::stringToBytes(msgInput); - faabric::util::SnapshotData snap = exec->snapshot(); - std::memcpy(snap.data + offset, inputBytes.data(), inputBytes.size()); + faabric::util::SnapshotData& originalSnap = + reg.getSnapshot(snapshotKey); + faabric::util::SnapshotData updatedSnap = exec->snapshot(); + + // Make sure it's captured by the region + int regionLength = 20 + inputBytes.size(); + originalSnap.addMergeRegion( + regionOffset, + regionLength, + faabric::util::SnapshotDataType::Raw, + faabric::util::SnapshotMergeOperation::Overwrite); + + // Now modify the memory + std::memcpy(updatedSnap.data + changeOffset, + inputBytes.data(), + inputBytes.size()); return 0; } diff --git a/tests/test/scheduler/test_executor.cpp b/tests/test/scheduler/test_executor.cpp index 918b7d4ea..b1c74ed72 100644 --- a/tests/test/scheduler/test_executor.cpp +++ b/tests/test/scheduler/test_executor.cpp @@ -448,94 +448,6 @@ TEST_CASE_METHOD(TestExecutorFixture, } } -TEST_CASE_METHOD(TestExecutorFixture, - "Test executing remote chained threads", - "[executor]") -{ - faabric::util::setMockMode(true); - - std::string thisHost = conf.endpointHost; - - // Add other host to available hosts - std::string otherHost = "other"; - sch.addHostToGlobalSet(otherHost); - - // Make sure we have only enough resources to execute the initial function - faabric::HostResources res; - res.set_slots(1); - sch.setThisHostResources(res); - - // Set up other host to have some resources - faabric::HostResources resOther; - resOther.set_slots(20); - faabric::scheduler::queueResourceResponse(otherHost, resOther); - - // Background thread to execute main function and await results - int nThreads = 8; - auto latch = faabric::util::Latch::create(2); - std::thread t([&latch, nThreads] { - std::shared_ptr req = - faabric::util::batchExecFactory("dummy", "thread-check", 1); - faabric::Message& msg = req->mutable_messages()->at(0); - msg.set_inputdata(std::to_string(nThreads)); - - auto& sch = faabric::scheduler::getScheduler(); - sch.callFunctions(req, false); - - latch->wait(); - - faabric::Message res = sch.getFunctionResult(msg.id(), 2000); - assert(res.returnvalue() == 0); - }); - - // Wait for the remote thread request to have been submitted - auto reqs = faabric::scheduler::getBatchRequests(); - REQUIRE_RETRY(reqs = faabric::scheduler::getBatchRequests(), - reqs.size() == 1); - std::string actualHost = reqs.at(0).first; - REQUIRE(actualHost == otherHost); - - std::shared_ptr distReq = reqs.at(0).second; - REQUIRE(distReq->messages().size() == nThreads); - faabric::Message firstMsg = distReq->messages().at(0); - - // Check restore hasn't been called (as we're the master) - REQUIRE(restoreCount == 0); - - // Check the snapshot has been pushed to the other host - auto snapPushes = faabric::snapshot::getSnapshotPushes(); - REQUIRE(snapPushes.size() == 1); - REQUIRE(snapPushes.at(0).first == otherHost); - - // Now execute request on this host as if we were on the other host - conf.endpointHost = otherHost; - sch.callFunctions(distReq, true); - - // Check restore has been called as we're no longer master - REQUIRE(restoreCount == 1); - - // Wait for the results - auto results = faabric::snapshot::getThreadResults(); - REQUIRE_RETRY(results = faabric::snapshot::getThreadResults(), - results.size() == nThreads); - - // Reset the host config for this host - conf.endpointHost = thisHost; - - // Process the thread results - for (auto& r : results) { - REQUIRE(r.first == thisHost); - auto args = r.second; - sch.setThreadResultLocally(args.first, args.second); - } - - // Rejoin the background thread - latch->wait(); - if (t.joinable()) { - t.join(); - } -} - TEST_CASE_METHOD(TestExecutorFixture, "Test thread results returned on non-master", "[executor]") From 0d2708e425a9c978ceb557ff5aa70bb3ec7e1d96 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Wed, 10 Nov 2021 08:35:15 +0000 Subject: [PATCH 28/31] Clearer error message when insufficient pool threads --- src/scheduler/Executor.cpp | 11 ++++++++++- tests/test/scheduler/test_executor.cpp | 6 +++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp index a8a1d27fd..49812a55c 100644 --- a/src/scheduler/Executor.cpp +++ b/src/scheduler/Executor.cpp @@ -157,7 +157,16 @@ void Executor::executeTasks(std::vector msgIdxs, // to be the main thread and the zeroth in the communication group, // so will be blocking. if (isThreads && isMaster) { - assert(threadPoolSize > 2); + if (threadPoolSize <= 2) { + SPDLOG_ERROR( + "Insufficient pool threads ({}) to overload {} idx {}", + availablePoolThreads.size(), + funcStr, + msg.appidx()); + + throw std::runtime_error("Insufficient pool threads"); + } + threadPoolIdx = (msg.appidx() % (threadPoolSize - 2)) + 2; } else { threadPoolIdx = msg.appidx() % threadPoolSize; diff --git a/tests/test/scheduler/test_executor.cpp b/tests/test/scheduler/test_executor.cpp index b1c74ed72..573a08522 100644 --- a/tests/test/scheduler/test_executor.cpp +++ b/tests/test/scheduler/test_executor.cpp @@ -806,6 +806,8 @@ TEST_CASE_METHOD(TestExecutorFixture, { faabric::util::setMockMode(true); + conf.overrideCpuCount = 4; + std::string hostOverride = conf.endpointHost; int nMessages = 1; faabric::BatchExecuteRequest::BatchExecuteType requestType = @@ -846,8 +848,6 @@ TEST_CASE_METHOD(TestExecutorFixture, // As we're faking a non-master execution results will be sent back to // the fake master so we can't wait on them, thus have to sleep - SLEEP_MS(1000); - - REQUIRE(resetCount == expectedResets); + REQUIRE_RETRY({}, resetCount == expectedResets); } } From 4a76b0beb5c84b12a43de0e8add4ca7e32dc306d Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Wed, 10 Nov 2021 09:24:08 +0000 Subject: [PATCH 29/31] Bump cores in failing tests --- src/scheduler/Executor.cpp | 2 +- tests/test/scheduler/test_scheduler.cpp | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp index 49812a55c..b94606709 100644 --- a/src/scheduler/Executor.cpp +++ b/src/scheduler/Executor.cpp @@ -160,7 +160,7 @@ void Executor::executeTasks(std::vector msgIdxs, if (threadPoolSize <= 2) { SPDLOG_ERROR( "Insufficient pool threads ({}) to overload {} idx {}", - availablePoolThreads.size(), + threadPoolSize, funcStr, msg.appidx()); diff --git a/tests/test/scheduler/test_scheduler.cpp b/tests/test/scheduler/test_scheduler.cpp index 9f27c5c0e..69c683119 100644 --- a/tests/test/scheduler/test_scheduler.cpp +++ b/tests/test/scheduler/test_scheduler.cpp @@ -191,6 +191,10 @@ TEST_CASE_METHOD(SlowExecutorFixture, "Test batch scheduling", "[scheduler]") int32_t expectedSubType; std::string expectedContextData; + int thisCores = 5; + faabric::util::SystemConfig& conf = faabric::util::getSystemConfig(); + conf.overrideCpuCount = thisCores; + SECTION("Threads") { execMode = faabric::BatchExecuteRequest::THREADS; @@ -232,7 +236,7 @@ TEST_CASE_METHOD(SlowExecutorFixture, "Test batch scheduling", "[scheduler]") // Mock everything faabric::util::setMockMode(true); - std::string thisHost = faabric::util::getSystemConfig().endpointHost; + std::string thisHost = conf.endpointHost; // Set up another host std::string otherHost = "beta"; @@ -240,7 +244,6 @@ TEST_CASE_METHOD(SlowExecutorFixture, "Test batch scheduling", "[scheduler]") int nCallsOne = 10; int nCallsTwo = 5; - int thisCores = 5; int otherCores = 11; int nCallsOffloadedOne = nCallsOne - thisCores; @@ -390,6 +393,9 @@ TEST_CASE_METHOD(SlowExecutorFixture, "Test overloaded scheduler", "[scheduler]") { + faabric::util::SystemConfig& conf = faabric::util::getSystemConfig(); + conf.overrideCpuCount = 5; + faabric::util::setMockMode(true); faabric::BatchExecuteRequest::BatchExecuteType execMode; From ef52533ee5f6110db6289b48f9821a4b85fc064c Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Wed, 10 Nov 2021 14:08:38 +0000 Subject: [PATCH 30/31] Factor out scheduler decision making --- include/faabric/scheduler/Scheduler.h | 4 +++ src/scheduler/Scheduler.cpp | 44 +++++++++++++++------------ src/transport/PointToPointBroker.cpp | 4 +++ 3 files changed, 33 insertions(+), 19 deletions(-) diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index 0e7414fcb..48da69b86 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -231,6 +231,10 @@ class Scheduler std::unordered_map> registeredHosts; + faabric::util::SchedulingDecision makeSchedulingDecision( + std::shared_ptr req, + bool forceLocal); + faabric::util::SchedulingDecision doCallFunctions( std::shared_ptr req, faabric::util::SchedulingDecision& decision, diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 53d9114d2..946700919 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -215,7 +215,6 @@ faabric::util::SchedulingDecision Scheduler::callFunctions( // Note, we assume all the messages are for the same function and have the // same master host faabric::Message& firstMsg = req->mutable_messages()->at(0); - std::string funcStr = faabric::util::funcToString(firstMsg, false); std::string masterHost = firstMsg.masterhost(); if (masterHost.empty()) { std::string funcStrWithId = faabric::util::funcToString(firstMsg, true); @@ -225,22 +224,40 @@ faabric::util::SchedulingDecision Scheduler::callFunctions( // If we're not the master host, we need to forward the request back to the // master host. This will only happen if a nested batch execution happens. - SchedulingDecision decision(firstMsg.appid(), firstMsg.groupid()); if (!forceLocal && masterHost != thisHost) { + std::string funcStr = faabric::util::funcToString(firstMsg, false); SPDLOG_DEBUG("Forwarding {} back to master {}", funcStr, masterHost); getFunctionCallClient(masterHost).executeFunctions(req); + SchedulingDecision decision(firstMsg.appid(), firstMsg.groupid()); decision.returnHost = masterHost; return decision; } faabric::util::FullLock lock(mx); - // ------------------------------------------- - // DECISION BUILDING - // ------------------------------------------- - std::vector hosts; + SchedulingDecision decision = makeSchedulingDecision(req, forceLocal); + + // Send out point-to-point mappings if necessary (unless being forced to + // execute locally, in which case they will be transmitted from the + // master) + if (!forceLocal && (firstMsg.groupid() > 0)) { + broker.setAndSendMappingsFromSchedulingDecision(decision); + } + + // Pass decision as hint + return doCallFunctions(req, decision, lock); +} + +faabric::util::SchedulingDecision Scheduler::makeSchedulingDecision( + std::shared_ptr req, + bool forceLocal) +{ int nMessages = req->messages_size(); + faabric::Message& firstMsg = req->mutable_messages()->at(0); + std::string funcStr = faabric::util::funcToString(firstMsg, false); + + std::vector hosts; if (forceLocal) { // We're forced to execute locally here so we do all the messages for (int i = 0; i < nMessages; i++) { @@ -338,23 +355,12 @@ faabric::util::SchedulingDecision Scheduler::callFunctions( assert(hosts.size() == nMessages); // Set up decision + SchedulingDecision decision(firstMsg.appid(), firstMsg.groupid()); for (int i = 0; i < hosts.size(); i++) { decision.addMessage(hosts.at(i), req->messages().at(i)); } - // ------------------------------------------- - // POINT-TO-POINT MESSAGING - // ------------------------------------------- - - // Send out point-to-point mappings if necessary (unless being forced to - // execute locally, in which case they will be transmitted from the - // master) - if (!forceLocal && (firstMsg.groupid() > 0)) { - broker.setAndSendMappingsFromSchedulingDecision(decision); - } - - // Pass decision as hint - return doCallFunctions(req, decision, lock); + return decision; } faabric::util::SchedulingDecision Scheduler::callFunctions( diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp index cb4c4ec2a..ae19bcb33 100644 --- a/src/transport/PointToPointBroker.cpp +++ b/src/transport/PointToPointBroker.cpp @@ -114,11 +114,15 @@ void PointToPointGroup::addGroupIfNotExists(int appId, void PointToPointGroup::clearGroup(int groupId) { + faabric::util::FullLock lock(groupsMutex); + groups.erase(groupId); } void PointToPointGroup::clear() { + faabric::util::FullLock lock(groupsMutex); + groups.clear(); } From 5af97fdb1b5f6687fdc4726732d42321276fd615 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Wed, 10 Nov 2021 15:00:41 +0000 Subject: [PATCH 31/31] Fix locking bug --- src/transport/PointToPointBroker.cpp | 2 +- .../transport/test_point_to_point_groups.cpp | 84 ++++++++++++++----- 2 files changed, 65 insertions(+), 21 deletions(-) diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp index ae19bcb33..4d9b23fad 100644 --- a/src/transport/PointToPointBroker.cpp +++ b/src/transport/PointToPointBroker.cpp @@ -157,7 +157,7 @@ void PointToPointGroup::lock(int groupIdx, bool recursive) // Recursive and either free, or already locked by this idx recursiveLockOwners.push(groupIdx); acquiredLock = true; - } else if (lockOwnerIdx == NO_LOCK_OWNER_IDX) { + } else if (!recursive && (lockOwnerIdx == NO_LOCK_OWNER_IDX)) { // Non-recursive and free lockOwnerIdx = groupIdx; acquiredLock = true; diff --git a/tests/test/transport/test_point_to_point_groups.cpp b/tests/test/transport/test_point_to_point_groups.cpp index d0c4bfb33..757120b17 100644 --- a/tests/test/transport/test_point_to_point_groups.cpp +++ b/tests/test/transport/test_point_to_point_groups.cpp @@ -164,41 +164,85 @@ TEST_CASE_METHOD(PointToPointGroupFixture, } TEST_CASE_METHOD(PointToPointGroupFixture, - "Test local locking and unlocking", + "Test locking and unlocking", "[ptp][transport]") { - std::atomic sharedInt = 0; int appId = 123; int groupId = 234; - // Arbitrary group size, local locks don't care - auto group = setUpGroup(appId, groupId, 3); + int nThreads = 4; + int nLoops = 50; - group->localLock(); + auto group = setUpGroup(appId, groupId, nThreads); - std::thread tA([&group, &sharedInt] { - group->localLock(); + std::atomic success = true; - assert(sharedInt == 99); - sharedInt = 88; + int criticalVar = 1; - group->localUnlock(); - }); + bool useLocal = false; + bool recursive = false; + SECTION("Local-only") { useLocal = true; } - // Main thread sleep for a while, make sure the other can't run and update - // the counter - SLEEP_MS(1000); + SECTION("Distributed version non-recursive") + { + useLocal = false; + recursive = false; + } - REQUIRE(sharedInt == 0); - sharedInt.store(99); + SECTION("Distributed version recursive") + { + useLocal = false; + recursive = true; + } - group->localUnlock(); + // Create high contention on the critical var that will be detected if + // locking isn't working. + std::vector threads; + for (int i = 0; i < nThreads; i++) { + threads.emplace_back( + [useLocal, recursive, i, nLoops, &group, &criticalVar, &success] { + if (useLocal) { + group->localLock(); + } else { + group->lock(i, recursive); + } + + // Check that while in this critical section, no changes from + // other threads are visible + + criticalVar = 2; + for (int j = 0; j < nLoops; j++) { + // Set the var + criticalVar = i; + + // Sleep a bit + int sleepTimeMs = std::rand() % 30; + SLEEP_MS(sleepTimeMs); + + // Check the var is unchanged by others + if (criticalVar != i) { + SPDLOG_ERROR("Inner loop testing locking got {} != {}", + criticalVar, + i); + success = false; + } + } + + if (useLocal) { + group->localUnlock(); + } else { + group->unlock(i, recursive); + } + }); + } - if (tA.joinable()) { - tA.join(); + for (auto& t : threads) { + if (t.joinable()) { + t.join(); + } } - REQUIRE(sharedInt == 88); + REQUIRE(success); } TEST_CASE_METHOD(PointToPointGroupFixture,