From 9fdfcef10de0b7a581278bfaa724f6be0696940f Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Mon, 1 Nov 2021 13:04:49 +0000
Subject: [PATCH 01/31] Started fixes for locks

---
 .../faabric/transport/PointToPointBroker.h    |   6 +-
 src/transport/PointToPointBroker.cpp          | 169 ++++++++++--------
 2 files changed, 93 insertions(+), 82 deletions(-)
diff --git a/include/faabric/transport/PointToPointBroker.h b/include/faabric/transport/PointToPointBroker.h
index cb4bd33c8..705becfe7 100644
--- a/include/faabric/transport/PointToPointBroker.h
+++ b/include/faabric/transport/PointToPointBroker.h
@@ -30,6 +30,8 @@ class PointToPointGroup
 
     static void addGroup(int appId, int groupId, int groupSize);
 
+    static void addGroupIfNotExists(int appId, int groupId, int groupSize);
+
     static void clear();
 
     PointToPointGroup(int appId, int groupIdIn, int groupSizeIn);
@@ -77,10 +79,6 @@ class PointToPointGroup
     std::queue<int> lockWaiters;
 
     void notifyLocked(int groupIdx);
-
-    void masterLock(int groupIdx, bool recursive);
-
-    void masterUnlock(int groupIdx, bool recursive);
 };
 
 class PointToPointBroker
diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp
index 38e9e4921..4093bec32 100644
--- a/src/transport/PointToPointBroker.cpp
+++ b/src/transport/PointToPointBroker.cpp
@@ -22,6 +22,8 @@ namespace faabric::transport {
 
 static std::unordered_map<int, std::shared_ptr<PointToPointGroup>> groups;
 
+static std::shared_mutex groupsMutex;
+
 // NOTE: Keeping 0MQ sockets in TLS is usually a bad idea, as they _must_ be
 // closed before the global context. However, in this case it's worth it
 // to cache the sockets across messages, as otherwise we'd be creating and
@@ -76,13 +78,30 @@ std::shared_ptr<PointToPointGroup> PointToPointGroup::getGroup(int groupId)
 
 bool PointToPointGroup::groupExists(int groupId)
 {
+    faabric::util::SharedLock lock(groupsMutex);
     return groups.find(groupId) != groups.end();
 }
 
 void PointToPointGroup::addGroup(int appId, int groupId, int groupSize)
 {
-    groups.emplace(std::make_pair(
-      groupId, std::make_shared<PointToPointGroup>(appId, groupId, groupSize)));
+    faabric::util::FullLock lock(groupsMutex);
+
+    if (groups.find(groupId) == groups.end()) {
+        groups.emplace(std::make_pair(
+          groupId,
+          std::make_shared<PointToPointGroup>(appId, groupId, groupSize)));
+    }
+}
+
+void PointToPointGroup::addGroupIfNotExists(int appId,
+                                            int groupId,
+                                            int groupSize)
+{
+    if (groupExists(groupId)) {
+        return;
+    }
+
+    addGroup(appId, groupId, groupSize);
 }
 
 void PointToPointGroup::clear()
@@ -106,7 +125,44 @@ void PointToPointGroup::lock(int groupIdx, bool recursive)
       ptpBroker.getHostForReceiver(groupId, POINT_TO_POINT_MASTER_IDX);
 
     if (host == conf.endpointHost) {
-        masterLock(groupIdx, recursive);
+        bool acquiredLock = false;
+        {
+            faabric::util::UniqueLock lock(mx);
+            if (recursive) {
+                bool isFree = recursiveLockOwners.empty();
+
+                bool lockOwnedByThisIdx =
+                  !isFree && (recursiveLockOwners.top() == groupIdx);
+
+                if (isFree || lockOwnedByThisIdx) {
+                    // Recursive and either free, or already locked by this idx
+                    recursiveLockOwners.push(groupIdx);
+                    acquiredLock = true;
+                }
+            } else if (lockOwnerIdx == NO_LOCK_OWNER_IDX) {
+                // Non-recursive and free
+                lockOwnerIdx = groupIdx;
+                acquiredLock = true;
+            }
+        }
+
+        if (acquiredLock) {
+            SPDLOG_TRACE("Group idx {}, locked {} (recursive {})",
+                         groupIdx,
+                         groupId,
+                         recursive);
+
+            notifyLocked(groupIdx);
+        } else {
+            SPDLOG_TRACE(
+              "Group idx {}, lock {} already locked({} waiters, recursive {})",
+              groupIdx,
+              groupId,
+              lockWaiters.size(),
+              recursive);
+
+            lockWaiters.push(groupIdx);
+        }
     } else {
         auto cli = getClient(host);
         faabric::PointToPointMessage msg;
@@ -121,54 +177,10 @@ void PointToPointGroup::lock(int groupIdx, bool recursive)
                      host);
 
         cli->groupLock(appId, groupId, groupIdx, recursive);
-
-        // Await ptp response
-        ptpBroker.recvMessage(groupId, POINT_TO_POINT_MASTER_IDX, groupIdx);
     }
-}
-
-void PointToPointGroup::masterLock(int groupIdx, bool recursive)
-{
-    SPDLOG_TRACE("Master lock {}:{}", groupId, groupIdx);
 
-    bool success = false;
-    {
-        faabric::util::UniqueLock lock(mx);
-        if (recursive) {
-            bool isFree = recursiveLockOwners.empty();
-
-            bool lockOwnedByThisIdx =
-              !isFree && (recursiveLockOwners.top() == groupIdx);
-
-            if (isFree || lockOwnedByThisIdx) {
-                // Recursive and either free, or already locked by this idx
-                SPDLOG_TRACE("Group idx {} recursively locked {} ({})",
-                             groupIdx,
-                             groupId,
-                             lockWaiters.size());
-                recursiveLockOwners.push(groupIdx);
-                success = true;
-            } else {
-                SPDLOG_TRACE("Group idx {} unable to recursively lock {} ({})",
-                             groupIdx,
-                             groupId,
-                             lockWaiters.size());
-            }
-        } else if (lockOwnerIdx == NO_LOCK_OWNER_IDX) {
-            // Non-recursive and free
-            SPDLOG_TRACE("Group idx {} locked {}", groupIdx, groupId);
-            lockOwnerIdx = groupIdx;
-            success = true;
-        } else {
-            // Unable to lock, wait in queue
-            SPDLOG_TRACE("Group idx {} unable to lock {}", groupIdx, groupId);
-            lockWaiters.push(groupIdx);
-        }
-    }
-
-    if (success) {
-        notifyLocked(groupIdx);
-    }
+    // Await ptp response
+    ptpBroker.recvMessage(groupId, POINT_TO_POINT_MASTER_IDX, groupIdx);
 }
 
 void PointToPointGroup::localLock()
@@ -188,7 +200,35 @@ void PointToPointGroup::unlock(int groupIdx, bool recursive)
       ptpBroker.getHostForReceiver(groupId, POINT_TO_POINT_MASTER_IDX);
 
     if (host == conf.endpointHost) {
-        masterUnlock(groupIdx, recursive);
+        SPDLOG_TRACE("Group idx {} unlocking {} ({} waiters, recursive {})",
+                     groupIdx,
+                     groupId,
+                     lockWaiters.size(),
+                     recursive);
+
+        faabric::util::UniqueLock lock(mx);
+
+        if (recursive) {
+            recursiveLockOwners.pop();
+
+            if (!recursiveLockOwners.empty()) {
+                return;
+            }
+
+            if (!lockWaiters.empty()) {
+                recursiveLockOwners.push(lockWaiters.front());
+                notifyLocked(lockWaiters.front());
+                lockWaiters.pop();
+            }
+        } else {
+            lockOwnerIdx = NO_LOCK_OWNER_IDX;
+
+            if (!lockWaiters.empty()) {
+                lockOwnerIdx = lockWaiters.front();
+                notifyLocked(lockWaiters.front());
+                lockWaiters.pop();
+            }
+        }
     } else {
         auto cli = getClient(host);
         faabric::PointToPointMessage msg;
@@ -196,7 +236,7 @@ void PointToPointGroup::unlock(int groupIdx, bool recursive)
         msg.set_sendidx(groupIdx);
         msg.set_recvidx(POINT_TO_POINT_MASTER_IDX);
 
-        SPDLOG_TRACE("Remote lock {}:{}:{} to {}",
+        SPDLOG_TRACE("Remote unlock {}:{}:{} to {}",
                      groupId,
                      groupIdx,
                      POINT_TO_POINT_MASTER_IDX,
@@ -206,33 +246,6 @@ void PointToPointGroup::unlock(int groupIdx, bool recursive)
     }
 }
 
-void PointToPointGroup::masterUnlock(int groupIdx, bool recursive)
-{
-    faabric::util::UniqueLock lock(mx);
-
-    if (recursive) {
-        recursiveLockOwners.pop();
-
-        if (!recursiveLockOwners.empty()) {
-            return;
-        }
-
-        if (!lockWaiters.empty()) {
-            recursiveLockOwners.push(lockWaiters.front());
-            notifyLocked(lockWaiters.front());
-            lockWaiters.pop();
-        }
-    } else {
-        lockOwnerIdx = NO_LOCK_OWNER_IDX;
-
-        if (!lockWaiters.empty()) {
-            lockOwnerIdx = lockWaiters.front();
-            notifyLocked(lockWaiters.front());
-            lockWaiters.pop();
-        }
-    }
-}
-
 void PointToPointGroup::localUnlock()
 {
     localMx.unlock();

From a14a1bc4f2958f63c962f8b714eef9be3d96b00f Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Mon, 1 Nov 2021 14:52:27 +0000
Subject: [PATCH 02/31] Clearing up groups

---
 .../faabric/transport/PointToPointBroker.h    |  6 ++++
 src/scheduler/Executor.cpp                    | 15 ++++++++
 src/transport/PointToPointBroker.cpp          | 36 +++++++++++++++++++
 3 files changed, 57 insertions(+)

diff --git a/include/faabric/transport/PointToPointBroker.h b/include/faabric/transport/PointToPointBroker.h
index 705becfe7..3b985e431 100644
--- a/include/faabric/transport/PointToPointBroker.h
+++ b/include/faabric/transport/PointToPointBroker.h
@@ -26,12 +26,16 @@ class PointToPointGroup
   public:
     static std::shared_ptr<PointToPointGroup> getGroup(int groupId);
 
+    static std::shared_ptr<PointToPointGroup> getOrAwaitGroup(int groupId);
+
     static bool groupExists(int groupId);
 
     static void addGroup(int appId, int groupId, int groupSize);
 
     static void addGroupIfNotExists(int appId, int groupId, int groupSize);
 
+    static void clearGroup(int groupId);
+
     static void clear();
 
     PointToPointGroup(int appId, int groupIdIn, int groupSizeIn);
@@ -106,6 +110,8 @@ class PointToPointBroker
 
     std::vector<uint8_t> recvMessage(int groupId, int sendIdx, int recvIdx);
 
+    void clearGroup(int groupId);
+
     void clear();
 
     void resetThreadLocalCache();
diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp
index 0a2c77c67..41a637cb7 100644
--- a/src/scheduler/Executor.cpp
+++ b/src/scheduler/Executor.cpp
@@ -2,6 +2,7 @@
 #include <faabric/scheduler/Scheduler.h>
 #include <faabric/snapshot/SnapshotRegistry.h>
 #include <faabric/state/State.h>
+#include <faabric/transport/PointToPointBroker.h>
 #include <faabric/util/clock.h>
 #include <faabric/util/config.h>
 #include <faabric/util/environment.h>
@@ -253,6 +254,13 @@ void Executor::threadPoolThread(int threadPoolIdx)
                      threadPoolIdx,
                      oldTaskCount - 1);
 
+        // If this messages was part of a point-to-point group, we must make
+        // sure its thread-local state is cleared to allow sockets to be
+        // destructed.
+        if (msg.groupid() > 0) {
+            faabric::transport::getPointToPointBroker().resetThreadLocalCache();
+        }
+
         // Handle snapshot diffs _before_ we reset the executor
         if (isLastInBatch && task.needsSnapshotPush) {
             // Get diffs between original snapshot and after execution
@@ -283,6 +291,13 @@ void Executor::threadPoolThread(int threadPoolIdx)
                 reset(msg);
             }
 
+            // We need to remove the point-to-point group associated with this
+            // message completely.
+            if (msg.groupid() > 0) {
+                faabric::transport::getPointToPointBroker().clearGroup(
+                  msg.groupid());
+            }
+
             releaseClaim();
         }
 
diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp
index 4093bec32..30e1766f6 100644
--- a/src/transport/PointToPointBroker.cpp
+++ b/src/transport/PointToPointBroker.cpp
@@ -76,6 +76,14 @@ std::shared_ptr<PointToPointGroup> PointToPointGroup::getGroup(int groupId)
     return groups.at(groupId);
 }
 
+std::shared_ptr<PointToPointGroup> PointToPointGroup::getOrAwaitGroup(
+  int groupId)
+{
+    getPointToPointBroker().waitForMappingsOnThisHost(groupId);
+
+    return getGroup(groupId);
+}
+
 bool PointToPointGroup::groupExists(int groupId)
 {
     faabric::util::SharedLock lock(groupsMutex);
@@ -104,6 +112,11 @@ void PointToPointGroup::addGroupIfNotExists(int appId,
     addGroup(appId, groupId, groupSize);
 }
 
+void PointToPointGroup::clearGroup(int groupId)
+{
+    groups.erase(groupId);
+}
+
 void PointToPointGroup::clear()
 {
     groups.clear();
@@ -533,6 +546,29 @@ std::vector<uint8_t> PointToPointBroker::recvMessage(int groupId,
     return messageData.dataCopy();
 }
 
+void PointToPointBroker::clearGroup(int groupId)
+{
+    SPDLOG_TRACE("Clearing point-to-point group {}", groupId);
+
+    faabric::util::SharedLock lock(brokerMutex);
+
+    std::set<int> idxs = getIdxsRegisteredForGroup(groupId);
+    for (auto idxA : idxs) {
+        for (auto idxB : idxs) {
+            std::string label = getPointToPointKey(groupId, idxA, idxB);
+            mappings.erase(label);
+        }
+    }
+
+    groupIdIdxsMap.erase(groupId);
+
+    PointToPointGroup::clearGroup(groupId);
+
+    groupMappingMutexes.erase(groupId);
+    groupMappingsFlags.erase(groupId);
+    groupMappingCvs.erase(groupId);
+}
+
 void PointToPointBroker::clear()
 {
     faabric::util::SharedLock lock(brokerMutex);

From b6a0c2e2bd7fdc4ac900e329a564a90c6c2e3898 Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Mon, 1 Nov 2021 18:09:45 +0000
Subject: [PATCH 03/31] Experiments with waiters/ maps

---
 cmake/ExternalProjects.cmake                  |  3 +
 .../faabric/transport/PointToPointBroker.h    |  9 +--
 include/faabric/util/locks.h                  | 35 ++++++++++
 include/faabric/util/map.h                    | 69 +++++++++++++++++++
 src/transport/PointToPointBroker.cpp          | 65 +++++++----------
 5 files changed, 135 insertions(+), 46 deletions(-)
 create mode 100644 include/faabric/util/map.h

diff --git a/cmake/ExternalProjects.cmake b/cmake/ExternalProjects.cmake
index 7f16a103b..0ccfe9bab 100644
--- a/cmake/ExternalProjects.cmake
+++ b/cmake/ExternalProjects.cmake
@@ -29,6 +29,7 @@ conan_cmake_configure(
         cppzmq/4.8.1
         flatbuffers/2.0.0
         hiredis/1.0.2
+        parallel-hashmap/1.33
         protobuf/3.17.1
         rapidjson/cci.20200410
         spdlog/1.9.2
@@ -81,6 +82,7 @@ find_package(cpprestsdk REQUIRED)
 find_package(cppzmq REQUIRED)
 find_package(fmt REQUIRED)
 find_package(hiredis REQUIRED)
+find_package(phmap REQUIRED)
 find_package(spdlog REQUIRED)
 
 # Pistache - Conan version is out of date and doesn't support clang
@@ -132,6 +134,7 @@ target_link_libraries(faabric_common_dependencies INTERFACE
     cppzmq::cppzmq
     flatbuffers::flatbuffers
     hiredis::hiredis
+    phmap::phmap
     pistache::pistache
     protobuf::libprotobuf
     RapidJSON::RapidJSON
diff --git a/include/faabric/transport/PointToPointBroker.h b/include/faabric/transport/PointToPointBroker.h
index 3b985e431..fdfa2da86 100644
--- a/include/faabric/transport/PointToPointBroker.h
+++ b/include/faabric/transport/PointToPointBroker.h
@@ -2,6 +2,7 @@
 
 #include <faabric/transport/PointToPointClient.h>
 #include <faabric/util/config.h>
+#include <faabric/util/map.h>
 #include <faabric/util/scheduling.h>
 
 #include <condition_variable>
@@ -117,16 +118,16 @@ class PointToPointBroker
     void resetThreadLocalCache();
 
   private:
+    faabric::util::SystemConfig& conf;
+
     std::shared_mutex brokerMutex;
 
     std::unordered_map<int, std::set<int>> groupIdIdxsMap;
     std::unordered_map<std::string, std::string> mappings;
 
-    std::unordered_map<int, bool> groupMappingsFlags;
-    std::unordered_map<int, std::mutex> groupMappingMutexes;
-    std::unordered_map<int, std::condition_variable> groupMappingCvs;
+    std::unordered_map<int, faabric::util::FlagWaiter> groupFlags;
 
-    faabric::util::SystemConfig& conf;
+    faabric::util::FlagWaiter& getGroupFlag(int groupId);
 };
 
 PointToPointBroker& getPointToPointBroker();
diff --git a/include/faabric/util/locks.h b/include/faabric/util/locks.h
index ed9037763..3e10c0c62 100644
--- a/include/faabric/util/locks.h
+++ b/include/faabric/util/locks.h
@@ -1,5 +1,9 @@
 #pragma once
 
+#include <faabric/util/logging.h>
+
+#include <atomic>
+#include <condition_variable>
 #include <mutex>
 #include <shared_mutex>
 
@@ -7,4 +11,35 @@ namespace faabric::util {
 typedef std::unique_lock<std::mutex> UniqueLock;
 typedef std::unique_lock<std::shared_mutex> FullLock;
 typedef std::shared_lock<std::shared_mutex> SharedLock;
+
+class FlagWaiter
+{
+  public:
+    void waitOnFlag()
+    {
+        // Check
+        if (flag.load()) {
+            return;
+        }
+
+        // Wait for group to be enabled
+        UniqueLock lock(flagMx);
+        if (!cv.wait_for(lock, std::chrono::milliseconds(10000), [this] {
+                return flag.load();
+            })) {
+
+            SPDLOG_ERROR("Timed out waiting for flag");
+            throw std::runtime_error("Timed out waiting for flag");
+        }
+    }
+
+    void setFlag(bool value) { flag.store(value); }
+
+    bool getValue() { return flag.load(); }
+
+  private:
+    std::mutex flagMx;
+    std::condition_variable cv;
+    std::atomic<bool> flag;
+};
 }
diff --git a/include/faabric/util/map.h b/include/faabric/util/map.h
new file mode 100644
index 000000000..638603591
--- /dev/null
+++ b/include/faabric/util/map.h
@@ -0,0 +1,69 @@
+#pragma once
+
+#include <faabric/util/locks.h>
+
+#include <parallel_hashmap/phmap.h>
+
+namespace faabric::util {
+
+// See documentation for phmap: https://greg7mdp.github.io/parallel-hashmap/
+// Specifically section titled: "Using the intrinsic parallelism ..."
+
+template<typename K, typename V>
+using SimpleMap =
+  phmap::parallel_flat_hash_map<K,
+                                V,
+                                phmap::priv::hash_default_hash<K>,
+                                phmap::priv::hash_default_eq<K>,
+                                std::allocator<std::pair<const K, V>>,
+                                4>;
+
+template<typename K, typename V>
+using ParallelMap =
+  phmap::parallel_flat_hash_map<K,
+                                V,
+                                phmap::priv::hash_default_hash<K>,
+                                phmap::priv::hash_default_eq<K>,
+                                std::allocator<std::pair<const K, V>>,
+                                4,
+                                std::mutex>;
+
+template<typename K, typename V>
+class SlowParallelMap
+{
+  public:
+    V& at(const K& key) { return map.at(key); }
+
+    V& getOrInsert(const K& key)
+    {
+        getOrInsert(key, [key](std::unordered_map<K, V>& m) { m[key]; });
+    }
+
+    V& getOrInsert(
+      const K& key,
+      const std::function<void(std::unordered_map<K, V>)>& insertOp)
+    {
+        if (map.find(key) == map.end()) {
+            FullLock lock(mx);
+            if (map.find(key) == map.end()) {
+                insertOp();
+            }
+        }
+
+        {
+            SharedLock lock(mx);
+            return map.at(key);
+        }
+    }
+
+    void insert(const K& key, const V& value) { map.insert(key, value); }
+
+    void erase(const K& key) { map.erase(key); }
+
+    void clear() { map.clear(); }
+
+  private:
+    std::shared_mutex mx;
+    std::unordered_map<K, V> map;
+};
+}
diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp
index 30e1766f6..820316875 100644
--- a/src/transport/PointToPointBroker.cpp
+++ b/src/transport/PointToPointBroker.cpp
@@ -390,19 +390,10 @@ PointToPointBroker::setUpLocalMappingsFromSchedulingDecision(
           decision.appId, groupId, decision.nFunctions);
     }
 
-    {
-        // Lock this group
-        faabric::util::UniqueLock lock(groupMappingMutexes[groupId]);
-
-        SPDLOG_TRACE(
-          "Enabling point-to-point mapping for {}:{}", decision.appId, groupId);
+    SPDLOG_TRACE(
+      "Enabling point-to-point mapping for {}:{}", decision.appId, groupId);
 
-        // Enable the group
-        groupMappingsFlags[groupId] = true;
-
-        // Notify waiters
-        groupMappingCvs[groupId].notify_all();
-    }
+    getGroupFlag(groupId).setFlag(true);
 
     return hosts;
 }
@@ -440,33 +431,27 @@ void PointToPointBroker::setAndSendMappingsFromSchedulingDecision(
     }
 }
 
+faabric::util::FlagWaiter& PointToPointBroker::getGroupFlag(int groupId)
+{
+    if (groupFlags.find(groupId) == groupFlags.end()) {
+        faabric::util::FullLock lock(brokerMutex);
+        if (groupFlags.find(groupId) == groupFlags.end()) {
+            return groupFlags[groupId];
+        }
+    }
+
+    {
+        faabric::util::SharedLock lock(brokerMutex);
+        return groupFlags.at(groupId);
+    }
+}
+
 void PointToPointBroker::waitForMappingsOnThisHost(int groupId)
 {
+    faabric::util::FlagWaiter &waiter = getGroupFlag(groupId);
     // Check if it's been enabled
-    if (!groupMappingsFlags[groupId]) {
-
-        // Lock this group
-        faabric::util::UniqueLock lock(groupMappingMutexes[groupId]);
-
-        // Check again
-        if (!groupMappingsFlags[groupId]) {
-            // Wait for group to be enabled
-            auto timePoint = std::chrono::system_clock::now() +
-                             std::chrono::milliseconds(MAPPING_TIMEOUT_MS);
-
-            if (!groupMappingCvs[groupId].wait_until(
-                  lock, timePoint, [this, groupId] {
-                      return groupMappingsFlags[groupId];
-                  })) {
-
-                SPDLOG_ERROR("Timed out waiting for group mappings {}",
-                             groupId);
-                throw std::runtime_error(
-                  "Timed out waiting for group mappings");
-            }
-
-            SPDLOG_TRACE("Point-to-point mappings for {} ready", groupId);
-        }
+    if (!waiter.getValue()) {
+        waiter.waitOnFlag();
     }
 }
 
@@ -564,9 +549,7 @@ void PointToPointBroker::clearGroup(int groupId)
 
     PointToPointGroup::clearGroup(groupId);
 
-    groupMappingMutexes.erase(groupId);
-    groupMappingsFlags.erase(groupId);
-    groupMappingCvs.erase(groupId);
+    groupFlags.erase(groupId);
 }
 
 void PointToPointBroker::clear()
@@ -578,9 +561,7 @@ void PointToPointBroker::clear()
 
     PointToPointGroup::clear();
 
-    groupMappingMutexes.clear();
-    groupMappingsFlags.clear();
-    groupMappingCvs.clear();
+    groupFlags.clear();
 }
 
 void PointToPointBroker::resetThreadLocalCache()

From 64dbd924699f71a1350af939167d8db097195fc9 Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Tue, 2 Nov 2021 10:01:50 +0000
Subject: [PATCH 04/31] Add test for flag waiter

---
 cmake/ExternalProjects.cmake         |  7 +++
 include/faabric/util/locks.h         | 19 +++++--
 src/transport/PointToPointBroker.cpp |  7 +--
 tests/test/state/test_state.cpp      |  6 --
 tests/test/util/test_locks.cpp       | 82 ++++++++++++++++++++++++++++
 5 files changed, 107 insertions(+), 14 deletions(-)
 create mode 100644 tests/test/util/test_locks.cpp

diff --git a/cmake/ExternalProjects.cmake b/cmake/ExternalProjects.cmake
index 0ccfe9bab..b2e469a81 100644
--- a/cmake/ExternalProjects.cmake
+++ b/cmake/ExternalProjects.cmake
@@ -29,6 +29,7 @@ conan_cmake_configure(
         cppzmq/4.8.1
         flatbuffers/2.0.0
         hiredis/1.0.2
+        libcds/2.3.3
         parallel-hashmap/1.33
         protobuf/3.17.1
         rapidjson/cci.20200410
@@ -82,6 +83,7 @@ find_package(cpprestsdk REQUIRED)
 find_package(cppzmq REQUIRED)
 find_package(fmt REQUIRED)
 find_package(hiredis REQUIRED)
+find_package(LibCDS REQUIRED)
 find_package(phmap REQUIRED)
 find_package(spdlog REQUIRED)
 
@@ -122,9 +124,11 @@ add_library(zstd::libzstd_static ALIAS libzstd_static)
 
 # Group all external dependencies into a convenient virtual CMake library
 add_library(faabric_common_dependencies INTERFACE)
+
 target_include_directories(faabric_common_dependencies INTERFACE
     ${FAABRIC_INCLUDE_DIR}
 )
+
 target_link_libraries(faabric_common_dependencies INTERFACE
     Boost::Boost
     Boost::filesystem
@@ -134,6 +138,7 @@ target_link_libraries(faabric_common_dependencies INTERFACE
     cppzmq::cppzmq
     flatbuffers::flatbuffers
     hiredis::hiredis
+    LibCDS::LibCDS
     phmap::phmap
     pistache::pistache
     protobuf::libprotobuf
@@ -142,7 +147,9 @@ target_link_libraries(faabric_common_dependencies INTERFACE
     Threads::Threads
     zstd::libzstd_static
 )
+
 target_compile_definitions(faabric_common_dependencies INTERFACE
     FMT_DEPRECATED= # Suppress warnings about use of deprecated api by spdlog
 )
+
 add_library(faabric::common_dependencies ALIAS faabric_common_dependencies)
diff --git a/include/faabric/util/locks.h b/include/faabric/util/locks.h
index 3e10c0c62..8f14525b3 100644
--- a/include/faabric/util/locks.h
+++ b/include/faabric/util/locks.h
@@ -7,6 +7,8 @@
 #include <mutex>
 #include <shared_mutex>
 
+#define DEFAULT_FLAG_WAIT_MS 10000
+
 namespace faabric::util {
 typedef std::unique_lock<std::mutex> UniqueLock;
 typedef std::unique_lock<std::shared_mutex> FullLock;
@@ -15,6 +17,10 @@ typedef std::shared_lock<std::shared_mutex> SharedLock;
 class FlagWaiter
 {
   public:
+    FlagWaiter(int timeoutMsIn = DEFAULT_FLAG_WAIT_MS)
+      : timeoutMs(timeoutMsIn)
+    {}
+
     void waitOnFlag()
     {
         // Check
@@ -24,7 +30,7 @@ class FlagWaiter
 
         // Wait for group to be enabled
         UniqueLock lock(flagMx);
-        if (!cv.wait_for(lock, std::chrono::milliseconds(10000), [this] {
+        if (!cv.wait_for(lock, std::chrono::milliseconds(timeoutMs), [this] {
                 return flag.load();
             })) {
 
@@ -33,11 +39,16 @@ class FlagWaiter
         }
     }
 
-    void setFlag(bool value) { flag.store(value); }
-
-    bool getValue() { return flag.load(); }
+    void setFlag(bool value)
+    {
+        UniqueLock lock(flagMx);
+        flag.store(value);
+        cv.notify_all();
+    }
 
   private:
+    int timeoutMs;
+
     std::mutex flagMx;
     std::condition_variable cv;
     std::atomic<bool> flag;
diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp
index 820316875..c0e5aeda5 100644
--- a/src/transport/PointToPointBroker.cpp
+++ b/src/transport/PointToPointBroker.cpp
@@ -448,11 +448,10 @@ faabric::util::FlagWaiter& PointToPointBroker::getGroupFlag(int groupId)
 
 void PointToPointBroker::waitForMappingsOnThisHost(int groupId)
 {
-    faabric::util::FlagWaiter &waiter = getGroupFlag(groupId);
+    faabric::util::FlagWaiter& waiter = getGroupFlag(groupId);
+
     // Check if it's been enabled
-    if (!waiter.getValue()) {
-        waiter.waitOnFlag();
-    }
+    waiter.waitOnFlag();
 }
 
 std::set<int> PointToPointBroker::getIdxsRegisteredForGroup(int groupId)
diff --git a/tests/test/state/test_state.cpp b/tests/test/state/test_state.cpp
index af9e7aa3b..40cc5be4a 100644
--- a/tests/test/state/test_state.cpp
+++ b/tests/test/state/test_state.cpp
@@ -114,12 +114,6 @@ class StateServerTestFixture
         std::vector<uint8_t> actual(values.size(), 0);
         setDummyData(values);
 
-        // Get, with optional pull
-        int nMessages = 1;
-        if (doPull) {
-            nMessages = 2;
-        }
-
         // Initial pull
         const std::shared_ptr<state::StateKeyValue>& localKv = getLocalKv();
         localKv->pull();
diff --git a/tests/test/util/test_locks.cpp b/tests/test/util/test_locks.cpp
new file mode 100644
index 000000000..839adbf85
--- /dev/null
+++ b/tests/test/util/test_locks.cpp
@@ -0,0 +1,82 @@
+#include <catch2/catch.hpp>
+#include <faabric/util/latch.h>
+
+#include <faabric/util/latch.h>
+#include <faabric/util/locks.h>
+#include <faabric/util/macros.h>
+
+using namespace faabric::util;
+
+namespace tests {
+
+TEST_CASE("Test wait flag", "[util]")
+{
+    int nThreads = 10;
+
+    FlagWaiter flagA;
+    FlagWaiter flagB;
+
+    std::shared_ptr<Latch> latchA1 = Latch::create(nThreads + 1);
+    std::shared_ptr<Latch> latchB1 = Latch::create(nThreads + 1);
+
+    std::shared_ptr<Latch> latchA2 = Latch::create(nThreads + 1);
+    std::shared_ptr<Latch> latchB2 = Latch::create(nThreads + 1);
+
+    std::vector<int> expectedUnset(nThreads, 0);
+    std::vector<int> expectedSet;
+
+    std::vector<std::thread> threadsA;
+    std::vector<std::thread> threadsB;
+
+    std::vector<int> resultsA(nThreads, 0);
+    std::vector<int> resultsB(nThreads, 0);
+
+    for (int i = 0; i < nThreads; i++) {
+        expectedSet.push_back(i);
+
+        threadsA.emplace_back([&flagA, &resultsA, &latchA1, &latchA2, i] {
+            latchA1->wait();
+            flagA.waitOnFlag();
+            resultsA[i] = i;
+            latchA2->wait();
+        });
+
+        threadsB.emplace_back([&flagB, &resultsB, &latchB1, &latchB2, i] {
+            latchB1->wait();
+            flagB.waitOnFlag();
+            resultsB[i] = i;
+            latchB2->wait();
+        });
+    }
+
+    // Check no results are set initially
+    latchA1->wait();
+    latchB1->wait();
+    REQUIRE(resultsA == expectedUnset);
+    REQUIRE(resultsB == expectedUnset);
+
+    // Set one flag, await latch and check
+    flagA.setFlag(true);
+    latchA2->wait();
+    REQUIRE(resultsA == expectedSet);
+    REQUIRE(resultsB == expectedUnset);
+
+    // Set other flag, await latch and check
+    flagB.setFlag(true);
+    latchB2->wait();
+    REQUIRE(resultsA == expectedSet);
+    REQUIRE(resultsB == expectedSet);
+
+    for (auto& t : threadsA) {
+        if (t.joinable()) {
+            t.join();
+        }
+    }
+
+    for (auto& t : threadsB) {
+        if (t.joinable()) {
+            t.join();
+        }
+    }
+}
+}

From 9d4570d82bc6460b29af60e262c40fdb25d58a2f Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Tue, 2 Nov 2021 11:25:54 +0000
Subject: [PATCH 05/31] Avoid scheduling multiple threads on same thread as 0th
 group

---
 include/faabric/scheduler/Scheduler.h |  1 +
 src/scheduler/Executor.cpp            | 58 +++++++++++++++++++--------
 src/transport/PointToPointBroker.cpp  |  6 +++
 3 files changed, 49 insertions(+), 16 deletions(-)

diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h
index f95219959..dc7b070d2 100644
--- a/include/faabric/scheduler/Scheduler.h
+++ b/include/faabric/scheduler/Scheduler.h
@@ -88,6 +88,7 @@ class Executor
     std::mutex threadsMutex;
     std::vector<std::shared_ptr<std::thread>> threadPoolThreads;
     std::vector<std::shared_ptr<std::thread>> deadThreads;
+    std::set<int> availablePoolThreads;
 
     std::vector<faabric::util::Queue<ExecutorTask>> threadTaskQueues;
 
diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp
index 41a637cb7..d3545d5d9 100644
--- a/src/scheduler/Executor.cpp
+++ b/src/scheduler/Executor.cpp
@@ -45,6 +45,11 @@ Executor::Executor(faabric::Message& msg)
     // Set an ID for this Executor
     id = conf.endpointHost + "_" + std::to_string(faabric::util::generateGid());
     SPDLOG_DEBUG("Starting executor {}", id);
+
+    // Mark all thread pool threads as available
+    for (int i = 0; i < threadPoolSize; i++) {
+        availablePoolThreads.insert(i);
+    }
 }
 
 Executor::~Executor() {}
@@ -152,17 +157,35 @@ void Executor::executeTasks(std::vector<int> msgIdxs,
     bool skipReset = isMaster && isThreads;
 
     // Iterate through and invoke tasks
+    // Rules for thread pool scheduling:
+    // - Keep index zero free if on master, as this is likely to be the
+    //   main thread which is blocking on others
+    // - If part of a messaging group, give the 0th idx its own thread, as
+    //   it's likely to block waiting for the others
+    if (isThreads && isMaster) {
+        availablePoolThreads.erase(0);
+    }
+
+    bool isInPtpGroup = firstMsg.groupid() > 0;
+
+    std::set<int>::iterator it = availablePoolThreads.begin();
     for (int msgIdx : msgIdxs) {
         const faabric::Message& msg = req->messages().at(msgIdx);
+        int threadPoolIdx = -1;
 
-        // If executing threads, we must always keep thread pool index zero
-        // free, as this may be executing the function that spawned them
-        int threadPoolIdx;
-        if (isThreads) {
-            assert(threadPoolSize > 1);
-            threadPoolIdx = (msg.appidx() % (threadPoolSize - 1)) + 1;
+        if (isInPtpGroup && msg.groupidx() == 0) {
+            // If zeroth in a group, need to reserve a thread pool idx
+            threadPoolIdx = *availablePoolThreads.begin();
+            availablePoolThreads.erase(threadPoolIdx);
+            it = availablePoolThreads.begin();
         } else {
-            threadPoolIdx = msg.appidx() % threadPoolSize;
+            if (it == availablePoolThreads.end()) {
+                // Reset iterator to start
+                it = availablePoolThreads.begin();
+            }
+
+            threadPoolIdx = *it;
+            it++;
         }
 
         // Enqueue the task
@@ -254,13 +277,6 @@ void Executor::threadPoolThread(int threadPoolIdx)
                      threadPoolIdx,
                      oldTaskCount - 1);
 
-        // If this messages was part of a point-to-point group, we must make
-        // sure its thread-local state is cleared to allow sockets to be
-        // destructed.
-        if (msg.groupid() > 0) {
-            faabric::transport::getPointToPointBroker().resetThreadLocalCache();
-        }
-
         // Handle snapshot diffs _before_ we reset the executor
         if (isLastInBatch && task.needsSnapshotPush) {
             // Get diffs between original snapshot and after execution
@@ -294,13 +310,23 @@ void Executor::threadPoolThread(int threadPoolIdx)
             // We need to remove the point-to-point group associated with this
             // message completely.
             if (msg.groupid() > 0) {
-                faabric::transport::getPointToPointBroker().clearGroup(
-                  msg.groupid());
+                faabric::transport::PointToPointBroker& broker =
+                  faabric::transport::getPointToPointBroker();
+
+                broker.clearGroup(msg.groupid());
+
+                broker.resetThreadLocalCache();
             }
 
             releaseClaim();
         }
 
+        // Make sure this thread index is available for scheduling
+        {
+            faabric::util::UniqueLock lock(threadsMutex);
+            availablePoolThreads.insert(threadPoolIdx);
+        }
+
         // Vacate the slot occupied by this task. This must be done after
         // releasing the claim on this executor, otherwise the scheduler may try
         // to schedule another function and be unable to reuse this executor.
diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp
index c0e5aeda5..0fc9d20dc 100644
--- a/src/transport/PointToPointBroker.cpp
+++ b/src/transport/PointToPointBroker.cpp
@@ -305,10 +305,16 @@ void PointToPointGroup::notify(int groupIdx)
 {
     if (groupIdx == POINT_TO_POINT_MASTER_IDX) {
         for (int i = 1; i < groupSize; i++) {
+            SPDLOG_TRACE(
+              "Master group {} waiting for notify from index {}", groupId, i);
+
             ptpBroker.recvMessage(groupId, i, POINT_TO_POINT_MASTER_IDX);
+
+            SPDLOG_TRACE("Master group {} notified by index {}", groupId, i);
         }
     } else {
         std::vector<uint8_t> data(1, 0);
+        SPDLOG_TRACE("Notifying group {} from index {}", groupId, groupIdx);
         ptpBroker.sendMessage(groupId,
                               groupIdx,
                               POINT_TO_POINT_MASTER_IDX,

From 249daf2ab77736545f64ec741d78c6d5271f8a6f Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Tue, 2 Nov 2021 12:52:36 +0000
Subject: [PATCH 06/31] Overloading thread pool

---
 .../faabric/transport/PointToPointBroker.h    |  2 +-
 src/scheduler/Executor.cpp                    | 80 ++++++++++---------
 src/transport/PointToPointBroker.cpp          |  7 +-
 3 files changed, 48 insertions(+), 41 deletions(-)

diff --git a/include/faabric/transport/PointToPointBroker.h b/include/faabric/transport/PointToPointBroker.h
index fdfa2da86..df8ea81e8 100644
--- a/include/faabric/transport/PointToPointBroker.h
+++ b/include/faabric/transport/PointToPointBroker.h
@@ -115,7 +115,7 @@ class PointToPointBroker
 
     void clear();
 
-    void resetThreadLocalCache();
+    void resetThreadLocalCache(bool soft=false);
 
   private:
     faabric::util::SystemConfig& conf;
diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp
index d3545d5d9..53476a0fe 100644
--- a/src/scheduler/Executor.cpp
+++ b/src/scheduler/Executor.cpp
@@ -156,41 +156,42 @@ void Executor::executeTasks(std::vector<int> msgIdxs,
     // original function call will cause a reset
     bool skipReset = isMaster && isThreads;
 
-    // Iterate through and invoke tasks
-    // Rules for thread pool scheduling:
-    // - Keep index zero free if on master, as this is likely to be the
-    //   main thread which is blocking on others
-    // - If part of a messaging group, give the 0th idx its own thread, as
-    //   it's likely to block waiting for the others
-    if (isThreads && isMaster) {
-        availablePoolThreads.erase(0);
-    }
-
-    bool isInPtpGroup = firstMsg.groupid() > 0;
-
-    std::set<int>::iterator it = availablePoolThreads.begin();
+    // Iterate through and invoke tasks. By default, we allocate tasks
+    // one-to-one with thread pool threads. Only once the pool is exhausted do
+    // we start overallocating.
     for (int msgIdx : msgIdxs) {
         const faabric::Message& msg = req->messages().at(msgIdx);
+
         int threadPoolIdx = -1;
+        if (availablePoolThreads.empty()) {
+            // Here all threads are still executing, so we have to overload.
+            // If any tasks are blocking we risk a deadlock, and can no longer
+            // guarantee the application will finish.
+            // In general if we're on the master host and this is a thread, we
+            // should avoid the zeroth and first pool threads as they are likely
+            // to be the main thread and the zeroth in the communication group,
+            // so will be blocking.
+            if (isThreads && isMaster) {
+                assert(threadPoolSize > 2);
+                threadPoolIdx = (msg.appidx() % (threadPoolSize - 2)) + 2;
+            } else {
+                threadPoolIdx = msg.appidx() % threadPoolSize;
+            }
 
-        if (isInPtpGroup && msg.groupidx() == 0) {
-            // If zeroth in a group, need to reserve a thread pool idx
+            SPDLOG_WARN("Overloaded app index {} to thread {}",
+                        msg.appidx(),
+                        threadPoolIdx);
+        } else {
+            // Take next from those that are available
             threadPoolIdx = *availablePoolThreads.begin();
             availablePoolThreads.erase(threadPoolIdx);
-            it = availablePoolThreads.begin();
-        } else {
-            if (it == availablePoolThreads.end()) {
-                // Reset iterator to start
-                it = availablePoolThreads.begin();
-            }
 
-            threadPoolIdx = *it;
-            it++;
+            SPDLOG_TRACE("Assigned app index {} to thread {}",
+                         msg.appidx(),
+                         threadPoolIdx);
         }
 
         // Enqueue the task
-        SPDLOG_TRACE(
-          "Assigning app index {} to thread {}", msg.appidx(), threadPoolIdx);
         threadTaskQueues[threadPoolIdx].enqueue(ExecutorTask(
           msgIdx, req, batchCounter, needsSnapshotPush, skipReset));
 
@@ -207,6 +208,8 @@ void Executor::threadPoolThread(int threadPoolIdx)
     SPDLOG_DEBUG("Thread pool thread {}:{} starting up", id, threadPoolIdx);
 
     auto& sch = faabric::scheduler::getScheduler();
+    faabric::transport::PointToPointBroker& broker =
+      faabric::transport::getPointToPointBroker();
     const auto& conf = faabric::util::getSystemConfig();
 
     bool selfShutdown = false;
@@ -296,6 +299,17 @@ void Executor::threadPoolThread(int threadPoolIdx)
             faabric::util::resetDirtyTracking();
         }
 
+        // In all threads if we've finished with a ptp group, we need to tidy up
+        // to ensure all sockets can be destructed
+        if (msg.groupid() > 0) {
+            broker.resetThreadLocalCache(true);
+
+            // Remove the group from this host if last in batch
+            if (isLastInBatch) {
+                broker.clearGroup(msg.groupid());
+            }
+        }
+
         // If this batch is finished, reset the executor and release its claim.
         // Note that we have to release the claim _after_ resetting, otherwise
         // the executor won't be ready for reuse.
@@ -307,21 +321,10 @@ void Executor::threadPoolThread(int threadPoolIdx)
                 reset(msg);
             }
 
-            // We need to remove the point-to-point group associated with this
-            // message completely.
-            if (msg.groupid() > 0) {
-                faabric::transport::PointToPointBroker& broker =
-                  faabric::transport::getPointToPointBroker();
-
-                broker.clearGroup(msg.groupid());
-
-                broker.resetThreadLocalCache();
-            }
-
             releaseClaim();
         }
 
-        // Make sure this thread index is available for scheduling
+        // Return this thread index to the pool available for scheduling
         {
             faabric::util::UniqueLock lock(threadsMutex);
             availablePoolThreads.insert(threadPoolIdx);
@@ -374,8 +377,9 @@ void Executor::threadPoolThread(int threadPoolIdx)
     }
 
     // We have to clean up TLS here as this should be the last use of the
-    // scheduler from this thread
+    // scheduler and point-to-point broker from this thread
     sch.resetThreadLocalCache();
+    broker.resetThreadLocalCache();
 }
 
 bool Executor::tryClaim()
diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp
index 0fc9d20dc..6fa56debb 100644
--- a/src/transport/PointToPointBroker.cpp
+++ b/src/transport/PointToPointBroker.cpp
@@ -569,13 +569,16 @@ void PointToPointBroker::clear()
     groupFlags.clear();
 }
 
-void PointToPointBroker::resetThreadLocalCache()
+void PointToPointBroker::resetThreadLocalCache(bool soft)
 {
     SPDLOG_TRACE("Resetting point-to-point thread-local cache");
 
     sendEndpoints.clear();
     recvEndpoints.clear();
-    clients.clear();
+
+    if (!soft) {
+        clients.clear();
+    }
 }
 
 PointToPointBroker& getPointToPointBroker()

From 6fca18b0b96e7763daadfc8e218ebee361a14596 Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Tue, 2 Nov 2021 13:52:20 +0000
Subject: [PATCH 07/31] Moved unrelated changes to separate PR

---
 cmake/ExternalProjects.cmake    | 10 -----
 include/faabric/util/map.h      | 69 ---------------------------------
 tests/test/state/test_state.cpp |  6 +++
 3 files changed, 6 insertions(+), 79 deletions(-)
 delete mode 100644 include/faabric/util/map.h

diff --git a/cmake/ExternalProjects.cmake b/cmake/ExternalProjects.cmake
index b2e469a81..7f16a103b 100644
--- a/cmake/ExternalProjects.cmake
+++ b/cmake/ExternalProjects.cmake
@@ -29,8 +29,6 @@ conan_cmake_configure(
         cppzmq/4.8.1
         flatbuffers/2.0.0
         hiredis/1.0.2
-        libcds/2.3.3
-        parallel-hashmap/1.33
         protobuf/3.17.1
         rapidjson/cci.20200410
         spdlog/1.9.2
@@ -83,8 +81,6 @@ find_package(cpprestsdk REQUIRED)
 find_package(cppzmq REQUIRED)
 find_package(fmt REQUIRED)
 find_package(hiredis REQUIRED)
-find_package(LibCDS REQUIRED)
-find_package(phmap REQUIRED)
 find_package(spdlog REQUIRED)
 
 # Pistache - Conan version is out of date and doesn't support clang
@@ -124,11 +120,9 @@ add_library(zstd::libzstd_static ALIAS libzstd_static)
 
 # Group all external dependencies into a convenient virtual CMake library
 add_library(faabric_common_dependencies INTERFACE)
-
 target_include_directories(faabric_common_dependencies INTERFACE
     ${FAABRIC_INCLUDE_DIR}
 )
-
 target_link_libraries(faabric_common_dependencies INTERFACE
     Boost::Boost
     Boost::filesystem
@@ -138,8 +132,6 @@ target_link_libraries(faabric_common_dependencies INTERFACE
     cppzmq::cppzmq
     flatbuffers::flatbuffers
     hiredis::hiredis
-    LibCDS::LibCDS
-    phmap::phmap
     pistache::pistache
     protobuf::libprotobuf
     RapidJSON::RapidJSON
@@ -147,9 +139,7 @@ target_link_libraries(faabric_common_dependencies INTERFACE
     Threads::Threads
     zstd::libzstd_static
 )
-
 target_compile_definitions(faabric_common_dependencies INTERFACE
     FMT_DEPRECATED= # Suppress warnings about use of deprecated api by spdlog
 )
-
 add_library(faabric::common_dependencies ALIAS faabric_common_dependencies)
diff --git a/include/faabric/util/map.h b/include/faabric/util/map.h
deleted file mode 100644
index 638603591..000000000
--- a/include/faabric/util/map.h
+++ /dev/null
@@ -1,69 +0,0 @@
-#pragma once
-
-#include <faabric/util/locks.h>
-
-#include <parallel_hashmap/phmap.h>
-
-namespace faabric::util {
-
-// See documentation for phmap: https://greg7mdp.github.io/parallel-hashmap/
-// Specifically section titled: "Using the intrinsic parallelism ..."
-
-template<typename K, typename V>
-using SimpleMap =
-  phmap::parallel_flat_hash_map<K,
-                                V,
-                                phmap::priv::hash_default_hash<K>,
-                                phmap::priv::hash_default_eq<K>,
-                                std::allocator<std::pair<const K, V>>,
-                                4>;
-
-template<typename K, typename V>
-using ParallelMap =
-  phmap::parallel_flat_hash_map<K,
-                                V,
-                                phmap::priv::hash_default_hash<K>,
-                                phmap::priv::hash_default_eq<K>,
-                                std::allocator<std::pair<const K, V>>,
-                                4,
-                                std::mutex>;
-
-template<typename K, typename V>
-class SlowParallelMap
-{
-  public:
-    V& at(const K& key) { return map.at(key); }
-
-    V& getOrInsert(const K& key)
-    {
-        getOrInsert(key, [key](std::unordered_map<K, V>& m) { m[key]; });
-    }
-
-    V& getOrInsert(
-      const K& key,
-      const std::function<void(std::unordered_map<K, V>)>& insertOp)
-    {
-        if (map.find(key) == map.end()) {
-            FullLock lock(mx);
-            if (map.find(key) == map.end()) {
-                insertOp();
-            }
-        }
-
-        {
-            SharedLock lock(mx);
-            return map.at(key);
-        }
-    }
-
-    void insert(const K& key, const V& value) { map.insert(key, value); }
-
-    void erase(const K& key) { map.erase(key); }
-
-    void clear() { map.clear(); }
-
-  private:
-    std::shared_mutex mx;
-    std::unordered_map<K, V> map;
-};
-}
diff --git a/tests/test/state/test_state.cpp b/tests/test/state/test_state.cpp
index 40cc5be4a..af9e7aa3b 100644
--- a/tests/test/state/test_state.cpp
+++ b/tests/test/state/test_state.cpp
@@ -114,6 +114,12 @@ class StateServerTestFixture
         std::vector<uint8_t> actual(values.size(), 0);
         setDummyData(values);
 
+        // Get, with optional pull
+        int nMessages = 1;
+        if (doPull) {
+            nMessages = 2;
+        }
+
         // Initial pull
         const std::shared_ptr<state::StateKeyValue>& localKv = getLocalKv();
         localKv->pull();

From ea11f7a41a246bf61059489c3f64a48b3d28e92c Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Tue, 2 Nov 2021 14:24:39 +0000
Subject: [PATCH 08/31] Add locks.cpp

---
 .../faabric/transport/PointToPointBroker.h    |  4 +--
 include/faabric/util/locks.h                  | 33 +++----------------
 src/scheduler/Executor.cpp                    |  2 +-
 src/transport/PointToPointBroker.cpp          |  4 +--
 src/util/CMakeLists.txt                       |  1 +
 src/util/locks.cpp                            | 33 +++++++++++++++++++
 6 files changed, 44 insertions(+), 33 deletions(-)
 create mode 100644 src/util/locks.cpp

diff --git a/include/faabric/transport/PointToPointBroker.h b/include/faabric/transport/PointToPointBroker.h
index df8ea81e8..459d63543 100644
--- a/include/faabric/transport/PointToPointBroker.h
+++ b/include/faabric/transport/PointToPointBroker.h
@@ -2,7 +2,7 @@
 
 #include <faabric/transport/PointToPointClient.h>
 #include <faabric/util/config.h>
-#include <faabric/util/map.h>
+#include <faabric/util/locks.h>
 #include <faabric/util/scheduling.h>
 
 #include <condition_variable>
@@ -115,7 +115,7 @@ class PointToPointBroker
 
     void clear();
 
-    void resetThreadLocalCache(bool soft=false);
+    void resetThreadLocalCache(bool hard = true);
 
   private:
     faabric::util::SystemConfig& conf;
diff --git a/include/faabric/util/locks.h b/include/faabric/util/locks.h
index 8f14525b3..1ab7e9fad 100644
--- a/include/faabric/util/locks.h
+++ b/include/faabric/util/locks.h
@@ -17,34 +17,11 @@ typedef std::shared_lock<std::shared_mutex> SharedLock;
 class FlagWaiter
 {
   public:
-    FlagWaiter(int timeoutMsIn = DEFAULT_FLAG_WAIT_MS)
-      : timeoutMs(timeoutMsIn)
-    {}
-
-    void waitOnFlag()
-    {
-        // Check
-        if (flag.load()) {
-            return;
-        }
-
-        // Wait for group to be enabled
-        UniqueLock lock(flagMx);
-        if (!cv.wait_for(lock, std::chrono::milliseconds(timeoutMs), [this] {
-                return flag.load();
-            })) {
-
-            SPDLOG_ERROR("Timed out waiting for flag");
-            throw std::runtime_error("Timed out waiting for flag");
-        }
-    }
-
-    void setFlag(bool value)
-    {
-        UniqueLock lock(flagMx);
-        flag.store(value);
-        cv.notify_all();
-    }
+    FlagWaiter(int timeoutMsIn = DEFAULT_FLAG_WAIT_MS);
+
+    void waitOnFlag();
+
+    void setFlag(bool value);
 
   private:
     int timeoutMs;
diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp
index 53476a0fe..5159b7467 100644
--- a/src/scheduler/Executor.cpp
+++ b/src/scheduler/Executor.cpp
@@ -302,7 +302,7 @@ void Executor::threadPoolThread(int threadPoolIdx)
         // In all threads if we've finished with a ptp group, we need to tidy up
         // to ensure all sockets can be destructed
         if (msg.groupid() > 0) {
-            broker.resetThreadLocalCache(true);
+            broker.resetThreadLocalCache(false);
 
             // Remove the group from this host if last in batch
             if (isLastInBatch) {
diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp
index 6fa56debb..eecf6ed51 100644
--- a/src/transport/PointToPointBroker.cpp
+++ b/src/transport/PointToPointBroker.cpp
@@ -569,14 +569,14 @@ void PointToPointBroker::clear()
     groupFlags.clear();
 }
 
-void PointToPointBroker::resetThreadLocalCache(bool soft)
+void PointToPointBroker::resetThreadLocalCache(bool hard)
 {
     SPDLOG_TRACE("Resetting point-to-point thread-local cache");
 
     sendEndpoints.clear();
     recvEndpoints.clear();
 
-    if (!soft) {
+    if (hard) {
         clients.clear();
     }
 }
diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt
index bee41ed71..f7d6e1242 100644
--- a/src/util/CMakeLists.txt
+++ b/src/util/CMakeLists.txt
@@ -12,6 +12,7 @@ faabric_lib(util
     gids.cpp
     json.cpp
     latch.cpp
+    locks.cpp
     logging.cpp
     memory.cpp
     network.cpp
diff --git a/src/util/locks.cpp b/src/util/locks.cpp
new file mode 100644
index 000000000..14064effa
--- /dev/null
+++ b/src/util/locks.cpp
@@ -0,0 +1,33 @@
+#include <faabric/util/locks.h>
+
+namespace faabric::util {
+
+FlagWaiter::FlagWaiter(int timeoutMsIn)
+  : timeoutMs(timeoutMsIn)
+{}
+
+void FlagWaiter::waitOnFlag()
+{
+    // Check
+    if (flag.load()) {
+        return;
+    }
+
+    // Wait for flag to be set
+    UniqueLock lock(flagMx);
+    if (!cv.wait_for(lock, std::chrono::milliseconds(timeoutMs), [this] {
+            return flag.load();
+        })) {
+
+        SPDLOG_ERROR("Timed out waiting for flag");
+        throw std::runtime_error("Timed out waiting for flag");
+    }
+}
+
+void FlagWaiter::setFlag(bool value)
+{
+    UniqueLock lock(flagMx);
+    flag.store(value);
+    cv.notify_all();
+}
+}

From 314226fd0e6413c494928e21ad8a1470e75d25d9 Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Tue, 2 Nov 2021 17:05:10 +0000
Subject: [PATCH 09/31] Only lock on group when it exists

---
 src/snapshot/SnapshotServer.cpp | 28 +++++++++-------------------
 src/util/memory.cpp             |  2 ++
 2 files changed, 11 insertions(+), 19 deletions(-)

diff --git a/src/snapshot/SnapshotServer.cpp b/src/snapshot/SnapshotServer.cpp
index 7cb778be7..95d02f71f 100644
--- a/src/snapshot/SnapshotServer.cpp
+++ b/src/snapshot/SnapshotServer.cpp
@@ -71,10 +71,9 @@ std::unique_ptr<google::protobuf::Message> SnapshotServer::recvPushSnapshot(
         throw std::runtime_error("Received snapshot with zero size");
     }
 
-    SPDLOG_DEBUG("Receiving snapshot {} (size {}, lock {})",
+    SPDLOG_DEBUG("Receiving snapshot {} (size {})",
                  r->key()->c_str(),
-                 r->contents()->size(),
-                 r->groupid());
+                 r->contents()->size());
 
     faabric::snapshot::SnapshotRegistry& reg =
       faabric::snapshot::getSnapshotRegistry();
@@ -83,12 +82,6 @@ std::unique_ptr<google::protobuf::Message> SnapshotServer::recvPushSnapshot(
     faabric::util::SnapshotData data;
     data.size = r->contents()->size();
 
-    // Lock the function group if necessary
-    if (r->groupid() > 0) {
-        faabric::transport::PointToPointGroup::getGroup(r->groupid())
-          ->localLock();
-    }
-
     // TODO - avoid this copy by changing server superclass to allow subclasses
     // to provide a buffer to receive data.
     // TODO - work out snapshot ownership here, how do we know when to delete
@@ -99,12 +92,6 @@ std::unique_ptr<google::protobuf::Message> SnapshotServer::recvPushSnapshot(
 
     reg.takeSnapshot(r->key()->str(), data, true);
 
-    // Unlock the application
-    if (r->groupid() > 0) {
-        faabric::transport::PointToPointGroup::getGroup(r->groupid())
-          ->localUnlock();
-    }
-
     // Send response
     return std::make_unique<faabric::EmptyResponse>();
 }
@@ -127,6 +114,7 @@ SnapshotServer::recvPushSnapshotDiffs(const uint8_t* buffer, size_t bufferSize)
 {
     const SnapshotDiffPushRequest* r =
       flatbuffers::GetRoot<SnapshotDiffPushRequest>(buffer);
+    int groupId = r->groupid();
 
     SPDLOG_DEBUG(
       "Applying {} diffs to snapshot {}", r->chunks()->size(), r->key()->str());
@@ -136,8 +124,9 @@ SnapshotServer::recvPushSnapshotDiffs(const uint8_t* buffer, size_t bufferSize)
       faabric::snapshot::getSnapshotRegistry();
     faabric::util::SnapshotData& snap = reg.getSnapshot(r->key()->str());
 
-    // Lock the function group
-    if (r->groupid() > 0) {
+    // Lock the function group if it exists
+    if (groupId > 0 &&
+        faabric::transport::PointToPointGroup::groupExists(groupId)) {
         faabric::transport::PointToPointGroup::getGroup(r->groupid())
           ->localLock();
     }
@@ -202,8 +191,9 @@ SnapshotServer::recvPushSnapshotDiffs(const uint8_t* buffer, size_t bufferSize)
         }
     }
 
-    // Unlock
-    if (r->groupid() > 0) {
+    // Unlock group if exists
+    if (groupId > 0 &&
+        faabric::transport::PointToPointGroup::groupExists(groupId)) {
         faabric::transport::PointToPointGroup::getGroup(r->groupid())
           ->localUnlock();
     }
diff --git a/src/util/memory.cpp b/src/util/memory.cpp
index 3855aac23..de3b9c95f 100644
--- a/src/util/memory.cpp
+++ b/src/util/memory.cpp
@@ -82,6 +82,8 @@ AlignedChunk getPageAlignedChunk(long offset, long length)
 
 void resetDirtyTracking()
 {
+    SPDLOG_DEBUG("Resetting dirty tracking");
+
     FILE* fd = fopen(CLEAR_REFS, "w");
     if (fd == nullptr) {
         SPDLOG_ERROR("Could not open clear_refs ({})", strerror(errno));

From bb5af80f4f148f34b3ef47df244db6edaeae320e Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Thu, 4 Nov 2021 13:46:54 +0000
Subject: [PATCH 10/31] Add custom merges

---
 include/faabric/util/snapshot.h | 126 +++++++++++++++++++++++--
 src/snapshot/SnapshotServer.cpp |   2 +-
 src/util/snapshot.cpp           | 157 +++++++++++++-------------------
 3 files changed, 181 insertions(+), 104 deletions(-)

diff --git a/include/faabric/util/snapshot.h b/include/faabric/util/snapshot.h
index e0d195f6f..e1bb1f7b4 100644
--- a/include/faabric/util/snapshot.h
+++ b/include/faabric/util/snapshot.h
@@ -7,6 +7,7 @@
 #include <vector>
 
 #include <faabric/util/logging.h>
+#include <faabric/util/macros.h>
 
 namespace faabric::util {
 
@@ -27,14 +28,6 @@ enum SnapshotMergeOperation
     Min
 };
 
-struct SnapshotMergeRegion
-{
-    uint32_t offset = 0;
-    size_t length = 0;
-    SnapshotDataType dataType = SnapshotDataType::Raw;
-    SnapshotMergeOperation operation = SnapshotMergeOperation::Overwrite;
-};
-
 class SnapshotDiff
 {
   public:
@@ -44,6 +37,8 @@ class SnapshotDiff
     size_t size = 0;
     const uint8_t* data = nullptr;
 
+    bool noChange = false;
+
     SnapshotDiff() = default;
 
     SnapshotDiff(SnapshotDataType dataTypeIn,
@@ -67,6 +62,105 @@ class SnapshotDiff
     }
 };
 
+class SnapshotMergeRegion
+{
+  public:
+    uint32_t offset = 0;
+    size_t length = 0;
+    SnapshotDataType dataType = SnapshotDataType::Raw;
+    SnapshotMergeOperation operation = SnapshotMergeOperation::Overwrite;
+
+    SnapshotDiff toDiff(const uint8_t* original, const uint8_t* updated)
+    {
+        const uint8_t* updatedValue = updated + offset;
+        const uint8_t* originalValue = original + offset;
+
+        SnapshotDiff diff(offset, updatedValue, length);
+        diff.dataType = dataType;
+        diff.operation = operation;
+
+        // Modify diff data for certain operations
+        switch (dataType) {
+            case (SnapshotDataType::Int): {
+                int originalInt =
+                  *(reinterpret_cast<const int*>(originalValue));
+                int updatedInt = *(reinterpret_cast<const int*>(updatedValue));
+
+                if (originalInt == updatedInt) {
+                    diff.size = 0;
+                    diff.noChange = true;
+                    return diff;
+                }
+
+                switch (operation) {
+                    case (SnapshotMergeOperation::Sum): {
+                        // Sums must send the value to be _added_, and
+                        // not the final result
+                        updatedInt -= originalInt;
+                        break;
+                    }
+                    case (SnapshotMergeOperation::Subtract): {
+                        // Subtractions must send the value to be
+                        // subtracted, not the result
+                        updatedInt = originalInt - updatedInt;
+                        break;
+                    }
+                    case (SnapshotMergeOperation::Product): {
+                        // Products must send the value to be
+                        // multiplied, not the result
+                        updatedInt /= originalInt;
+                        break;
+                    }
+                    case (SnapshotMergeOperation::Max):
+                    case (SnapshotMergeOperation::Min):
+                        // Min and max don't need to change
+                        break;
+                    default: {
+                        SPDLOG_ERROR("Unhandled integer merge operation: {}",
+                                     operation);
+                        throw std::runtime_error(
+                          "Unhandled integer merge operation");
+                    }
+                }
+
+                // TODO - somehow avoid casting away the const here?
+                // Modify the memory in-place here
+                std::memcpy(
+                  (uint8_t*)updatedValue, BYTES(&updatedInt), sizeof(int32_t));
+
+                break;
+            }
+            case (SnapshotDataType::Raw): {
+                switch (operation) {
+                    case (SnapshotMergeOperation::Ignore): {
+                        break;
+                    }
+                    case (SnapshotMergeOperation::Overwrite): {
+                        // TODO - how can we make this fine-grained?
+                        // Default behaviour
+                        break;
+                    }
+                    default: {
+                        SPDLOG_ERROR("Unhandled raw merge operation: {}",
+                                     operation);
+                        throw std::runtime_error(
+                          "Unhandled raw merge operation");
+                    }
+                }
+
+                break;
+            }
+            default: {
+                SPDLOG_ERROR("Merge region for unhandled data type: {}",
+                             dataType);
+                throw std::runtime_error(
+                  "Merge region for unhandled data type");
+            }
+        }
+        return diff;
+    }
+};
+
 class SnapshotData
 {
   public:
@@ -86,10 +180,22 @@ class SnapshotData
                         SnapshotDataType dataType,
                         SnapshotMergeOperation operation);
 
+    void setIsCustomMerge(bool value);
+
+    bool isCustomMerge();
+
   private:
-    // Note - we care about the order of this map, as we iterate through it in
-    // order of offsets
+    // Note - we care about the order of this map, as we iterate through it
+    // in order of offsets
     std::map<uint32_t, SnapshotMergeRegion> mergeRegions;
+
+    bool _isCustomMerge = false;
+
+    std::vector<SnapshotDiff> getCustomDiffs(const uint8_t* updated,
+                                             size_t updatedSize);
+
+    std::vector<SnapshotDiff> getStandardDiffs(const uint8_t* updated,
+                                               size_t updatedSize);
 };
 
 std::string snapshotDataTypeStr(SnapshotDataType dt);
diff --git a/src/snapshot/SnapshotServer.cpp b/src/snapshot/SnapshotServer.cpp
index 95d02f71f..fe0df2e5f 100644
--- a/src/snapshot/SnapshotServer.cpp
+++ b/src/snapshot/SnapshotServer.cpp
@@ -131,7 +131,7 @@ SnapshotServer::recvPushSnapshotDiffs(const uint8_t* buffer, size_t bufferSize)
           ->localLock();
     }
 
-    // Apply diffs to snapshot
+    // Iterate through the chunks passed in the request
     for (const auto* r : *r->chunks()) {
         uint8_t* dest = snap.data + r->offset();
         switch (r->dataType()) {
diff --git a/src/util/snapshot.cpp b/src/util/snapshot.cpp
index f49e99a55..ae0c2cab3 100644
--- a/src/util/snapshot.cpp
+++ b/src/util/snapshot.cpp
@@ -36,6 +36,57 @@ std::vector<SnapshotDiff> SnapshotData::getDirtyPages()
 
 std::vector<SnapshotDiff> SnapshotData::getChangeDiffs(const uint8_t* updated,
                                                        size_t updatedSize)
+{
+    for (auto& m : mergeRegions) {
+        SPDLOG_TRACE("{} {} merge region at {} {}-{}",
+                     snapshotDataTypeStr(m.second.dataType),
+                     snapshotMergeOpStr(m.second.operation),
+                     m.first,
+                     m.second.offset,
+                     m.second.offset + m.second.length);
+    }
+
+    std::vector<SnapshotDiff> diffs;
+    if (_isCustomMerge) {
+        diffs = getCustomDiffs(updated, updatedSize);
+    } else {
+        diffs = getStandardDiffs(updated, updatedSize);
+    }
+
+    return diffs;
+}
+
+std::vector<SnapshotDiff> SnapshotData::getCustomDiffs(const uint8_t* updated,
+                                                       size_t updatedSize)
+{
+    SPDLOG_TRACE("Doing custom diff with {} merge regions",
+                 mergeRegions.size());
+
+    // Here we ignore all diffs except for those specified in merge regions
+    std::vector<SnapshotDiff> diffs;
+    for (auto& mergePair : mergeRegions) {
+        // Get the diff for this merge region
+        SnapshotMergeRegion region = mergePair.second;
+        SnapshotDiff diff = region.toDiff(data, updated);
+
+        if (diff.noChange) {
+            SPDLOG_TRACE("No diff for {} {} merge region at {} {}-{}",
+                         snapshotDataTypeStr(mergePair.second.dataType),
+                         snapshotMergeOpStr(mergePair.second.operation),
+                         mergePair.first,
+                         mergePair.second.offset,
+                         mergePair.second.offset + mergePair.second.length);
+            continue;
+        }
+
+        diffs.push_back(diff);
+    }
+
+    return diffs;
+}
+
+std::vector<SnapshotDiff> SnapshotData::getStandardDiffs(const uint8_t* updated,
+                                                         size_t updatedSize)
 {
     // Work out which pages have changed
     size_t nThisPages = getRequiredHostPages(size);
@@ -47,15 +98,6 @@ std::vector<SnapshotDiff> SnapshotData::getChangeDiffs(const uint8_t* updated,
                  dirtyPageNumbers.size(),
                  mergeRegions.size());
 
-    for (auto& m : mergeRegions) {
-        SPDLOG_TRACE("{} {} merge region at {} {}-{}",
-                     snapshotDataTypeStr(m.second.dataType),
-                     snapshotMergeOpStr(m.second.operation),
-                     m.first,
-                     m.second.offset,
-                     m.second.offset + m.second.length);
-    }
-
     // Get iterator over merge regions
     std::map<uint32_t, SnapshotMergeRegion>::iterator mergeIt =
       mergeRegions.begin();
@@ -130,91 +172,9 @@ std::vector<SnapshotDiff> SnapshotData::getChangeDiffs(const uint8_t* updated,
                     diffInProgress = false;
                 }
 
+                // Get the diff for this merge region
                 SnapshotMergeRegion region = mergeIt->second;
-
-                // Set up the diff
-                const uint8_t* updatedValue = updated + region.offset;
-                const uint8_t* originalValue = data + region.offset;
-
-                SnapshotDiff diff(region.offset, updatedValue, region.length);
-                diff.dataType = region.dataType;
-                diff.operation = region.operation;
-
-                // Modify diff data for certain operations
-                switch (region.dataType) {
-                    case (SnapshotDataType::Int): {
-                        int originalInt =
-                          *(reinterpret_cast<const int*>(originalValue));
-                        int updatedInt =
-                          *(reinterpret_cast<const int*>(updatedValue));
-
-                        switch (region.operation) {
-                            case (SnapshotMergeOperation::Sum): {
-                                // Sums must send the value to be _added_, and
-                                // not the final result
-                                updatedInt -= originalInt;
-                                break;
-                            }
-                            case (SnapshotMergeOperation::Subtract): {
-                                // Subtractions must send the value to be
-                                // subtracted, not the result
-                                updatedInt = originalInt - updatedInt;
-                                break;
-                            }
-                            case (SnapshotMergeOperation::Product): {
-                                // Products must send the value to be
-                                // multiplied, not the result
-                                updatedInt /= originalInt;
-                                break;
-                            }
-                            case (SnapshotMergeOperation::Max):
-                            case (SnapshotMergeOperation::Min):
-                                // Min and max don't need to change
-                                break;
-                            default: {
-                                SPDLOG_ERROR(
-                                  "Unhandled integer merge operation: {}",
-                                  region.operation);
-                                throw std::runtime_error(
-                                  "Unhandled integer merge operation");
-                            }
-                        }
-
-                        // TODO - somehow avoid casting away the const here?
-                        // Modify the memory in-place here
-                        std::memcpy((uint8_t*)updatedValue,
-                                    BYTES(&updatedInt),
-                                    sizeof(int32_t));
-
-                        break;
-                    }
-                    case (SnapshotDataType::Raw): {
-                        switch (region.operation) {
-                            case (SnapshotMergeOperation::Ignore): {
-                                break;
-                            }
-                            case (SnapshotMergeOperation::Overwrite): {
-                                // Default behaviour
-                                break;
-                            }
-                            default: {
-                                SPDLOG_ERROR(
-                                  "Unhandled raw merge operation: {}",
-                                  region.operation);
-                                throw std::runtime_error(
-                                  "Unhandled raw merge operation");
-                            }
-                        }
-
-                        break;
-                    }
-                    default: {
-                        SPDLOG_ERROR("Merge region for unhandled data type: {}",
-                                     region.dataType);
-                        throw std::runtime_error(
-                          "Merge region for unhandled data type");
-                    }
-                }
+                SnapshotDiff diff = region.toDiff(data, updated);
 
                 SPDLOG_TRACE("Diff at {} falls in {} {} merge region {}-{}",
                              pageOffset + b,
@@ -314,11 +274,22 @@ void SnapshotData::addMergeRegion(uint32_t offset,
                                 .length = length,
                                 .dataType = dataType,
                                 .operation = operation };
+
     // Locking as this may be called in bursts by multiple threads
     faabric::util::UniqueLock lock(snapMx);
     mergeRegions[offset] = region;
 }
 
+void SnapshotData::setIsCustomMerge(bool value)
+{
+    _isCustomMerge = value;
+}
+
+bool SnapshotData::isCustomMerge()
+{
+    return _isCustomMerge;
+}
+
 std::string snapshotDataTypeStr(SnapshotDataType dt)
 {
     switch (dt) {

From 2cd38e019c361fd92983a53b6f6a7d64b39385a0 Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Thu, 4 Nov 2021 18:09:17 +0000
Subject: [PATCH 11/31] Overhaul of snapshot diffing

---
 include/faabric/util/snapshot.h |  98 +-------
 src/util/snapshot.cpp           | 395 +++++++++++++++-----------------
 2 files changed, 184 insertions(+), 309 deletions(-)

diff --git a/include/faabric/util/snapshot.h b/include/faabric/util/snapshot.h
index e1bb1f7b4..725eaeb25 100644
--- a/include/faabric/util/snapshot.h
+++ b/include/faabric/util/snapshot.h
@@ -70,95 +70,9 @@ class SnapshotMergeRegion
     SnapshotDataType dataType = SnapshotDataType::Raw;
     SnapshotMergeOperation operation = SnapshotMergeOperation::Overwrite;
 
-    SnapshotDiff toDiff(const uint8_t* original, const uint8_t* updated)
-    {
-        const uint8_t* updatedValue = updated + offset;
-        const uint8_t* originalValue = original + offset;
-
-        SnapshotDiff diff(offset, updatedValue, length);
-        diff.dataType = dataType;
-        diff.operation = operation;
-
-        // Modify diff data for certain operations
-        switch (dataType) {
-            case (SnapshotDataType::Int): {
-                int originalInt =
-                  *(reinterpret_cast<const int*>(originalValue));
-                int updatedInt = *(reinterpret_cast<const int*>(updatedValue));
-
-                if (originalInt == updatedInt) {
-                    diff.size = 0;
-                    diff.noChange = true;
-                    return diff;
-                }
-
-                switch (operation) {
-                    case (SnapshotMergeOperation::Sum): {
-                        // Sums must send the value to be _added_, and
-                        // not the final result
-                        updatedInt -= originalInt;
-                        break;
-                    }
-                    case (SnapshotMergeOperation::Subtract): {
-                        // Subtractions must send the value to be
-                        // subtracted, not the result
-                        updatedInt = originalInt - updatedInt;
-                        break;
-                    }
-                    case (SnapshotMergeOperation::Product): {
-                        // Products must send the value to be
-                        // multiplied, not the result
-                        updatedInt /= originalInt;
-                        break;
-                    }
-                    case (SnapshotMergeOperation::Max):
-                    case (SnapshotMergeOperation::Min):
-                        // Min and max don't need to change
-                        break;
-                    default: {
-                        SPDLOG_ERROR("Unhandled integer merge operation: {}",
-                                     operation);
-                        throw std::runtime_error(
-                          "Unhandled integer merge operation");
-                    }
-                }
-
-                // TODO - somehow avoid casting away the const here?
-                // Modify the memory in-place here
-                std::memcpy(
-                  (uint8_t*)updatedValue, BYTES(&updatedInt), sizeof(int32_t));
-
-                break;
-            }
-            case (SnapshotDataType::Raw): {
-                switch (operation) {
-                    case (SnapshotMergeOperation::Ignore): {
-                        break;
-                    }
-                    case (SnapshotMergeOperation::Overwrite): {
-                        // TODO - how can we make this fine-grained?
-                        // Default behaviour
-                        break;
-                    }
-                    default: {
-                        SPDLOG_ERROR("Unhandled raw merge operation: {}",
-                                     operation);
-                        throw std::runtime_error(
-                          "Unhandled raw merge operation");
-                    }
-                }
-
-                break;
-            }
-            default: {
-                SPDLOG_ERROR("Merge region for unhandled data type: {}",
-                             dataType);
-                throw std::runtime_error(
-                  "Merge region for unhandled data type");
-            }
-        }
-        return diff;
-    }
+    void addDiffs(std::vector<SnapshotDiff>& diffs,
+                  const uint8_t* original,
+                  const uint8_t* updated);
 };
 
 class SnapshotData
@@ -180,17 +94,13 @@ class SnapshotData
                         SnapshotDataType dataType,
                         SnapshotMergeOperation operation);
 
-    void setIsCustomMerge(bool value);
-
-    bool isCustomMerge();
+    void addMergeRegion(SnapshotMergeRegion& region);
 
   private:
     // Note - we care about the order of this map, as we iterate through it
     // in order of offsets
     std::map<uint32_t, SnapshotMergeRegion> mergeRegions;
 
-    bool _isCustomMerge = false;
-
     std::vector<SnapshotDiff> getCustomDiffs(const uint8_t* updated,
                                              size_t updatedSize);
 
diff --git a/src/util/snapshot.cpp b/src/util/snapshot.cpp
index ae0c2cab3..e5a2751bf 100644
--- a/src/util/snapshot.cpp
+++ b/src/util/snapshot.cpp
@@ -37,229 +37,63 @@ std::vector<SnapshotDiff> SnapshotData::getDirtyPages()
 std::vector<SnapshotDiff> SnapshotData::getChangeDiffs(const uint8_t* updated,
                                                        size_t updatedSize)
 {
-    for (auto& m : mergeRegions) {
-        SPDLOG_TRACE("{} {} merge region at {} {}-{}",
-                     snapshotDataTypeStr(m.second.dataType),
-                     snapshotMergeOpStr(m.second.operation),
-                     m.first,
-                     m.second.offset,
-                     m.second.offset + m.second.length);
-    }
-
-    std::vector<SnapshotDiff> diffs;
-    if (_isCustomMerge) {
-        diffs = getCustomDiffs(updated, updatedSize);
-    } else {
-        diffs = getStandardDiffs(updated, updatedSize);
-    }
-
-    return diffs;
-}
-
-std::vector<SnapshotDiff> SnapshotData::getCustomDiffs(const uint8_t* updated,
-                                                       size_t updatedSize)
-{
-    SPDLOG_TRACE("Doing custom diff with {} merge regions",
-                 mergeRegions.size());
-
-    // Here we ignore all diffs except for those specified in merge regions
     std::vector<SnapshotDiff> diffs;
-    for (auto& mergePair : mergeRegions) {
-        // Get the diff for this merge region
-        SnapshotMergeRegion region = mergePair.second;
-        SnapshotDiff diff = region.toDiff(data, updated);
-
-        if (diff.noChange) {
-            SPDLOG_TRACE("No diff for {} {} merge region at {} {}-{}",
-                         snapshotDataTypeStr(mergePair.second.dataType),
-                         snapshotMergeOpStr(mergePair.second.operation),
-                         mergePair.first,
-                         mergePair.second.offset,
-                         mergePair.second.offset + mergePair.second.length);
-            continue;
-        }
-
-        diffs.push_back(diff);
+    SPDLOG_DEBUG("No merge regions set, thus no diffs");
+    if (mergeRegions.empty()) {
+        return diffs;
     }
 
-    return diffs;
-}
-
-std::vector<SnapshotDiff> SnapshotData::getStandardDiffs(const uint8_t* updated,
-                                                         size_t updatedSize)
-{
-    // Work out which pages have changed
+    // Work out which pages have changed (these will be sorted)
     size_t nThisPages = getRequiredHostPages(size);
     std::vector<int> dirtyPageNumbers =
       getDirtyPageNumbers(updated, nThisPages);
 
-    SPDLOG_TRACE("Diffing {} pages with {} changed pages and {} merge regions",
-                 nThisPages,
-                 dirtyPageNumbers.size(),
-                 mergeRegions.size());
-
-    // Get iterator over merge regions
+    // Iterate through each dirty page, work out if there's an overlapping merge
+    // region, tell that region to add their diffs to the list
     std::map<uint32_t, SnapshotMergeRegion>::iterator mergeIt =
       mergeRegions.begin();
 
-    // Get byte-wise diffs _within_ the dirty pages
-    //
-    // NOTE - if raw diffs cover page boundaries, they will be split into
-    // multiple diffs, each of which is page-aligned.
-    // We can be relatively confident that variables will be page-aligned so
-    // this shouldn't be a problem.
-    //
-    // Merge regions support crossing page boundaries.
-    //
-    // For each byte we encounter have the following possible scenarios:
-    //
-    // 1. the byte is dirty, and is the start of a new diff
-    // 2. the byte is dirty, but the byte before was also dirty, so we
-    // are inside a diff
-    // 3. the byte is not dirty but the previous one was, so we've reached the
-    // end of a diff
-    // 4. the last byte of the page is dirty, so we've also come to the end of
-    // a diff
-    // 5. the byte is dirty, but is within a special merge region, in which
-    // case we need to add a diff for that whole region, then skip
-    // to the next byte after that region
-    std::vector<SnapshotDiff> diffs;
     for (int i : dirtyPageNumbers) {
-        int pageOffset = i * HOST_PAGE_SIZE;
-
-        bool diffInProgress = false;
-        int diffStart = 0;
-        int offset = pageOffset;
-        for (int b = 0; b < HOST_PAGE_SIZE; b++) {
-            offset = pageOffset + b;
-            bool isDirtyByte = *(data + offset) != *(updated + offset);
-
-            // Skip any merge regions we've passed
-            while (mergeIt != mergeRegions.end() &&
-                   offset >=
-                     (mergeIt->second.offset + mergeIt->second.length)) {
-                SnapshotMergeRegion region = mergeIt->second;
-                SPDLOG_TRACE("At offset {}, past region {} {} {}-{}",
-                             offset,
-                             snapshotDataTypeStr(region.dataType),
-                             snapshotMergeOpStr(region.operation),
-                             region.offset,
-                             region.offset + region.length);
-
-                ++mergeIt;
-            }
+        int pageStart = i * HOST_PAGE_SIZE;
+        int pageEnd = pageStart + HOST_PAGE_SIZE;
 
-            // Check if we're in a merge region
-            bool isInMergeRegion =
-              mergeIt != mergeRegions.end() &&
-              offset >= mergeIt->second.offset &&
-              offset < (mergeIt->second.offset + mergeIt->second.length);
-
-            if (isDirtyByte && isInMergeRegion) {
-                // If we've entered a merge region with a diff in progress, we
-                // need to close it off
-                if (diffInProgress) {
-                    diffs.emplace_back(
-                      diffStart, updated + diffStart, offset - diffStart);
-
-                    SPDLOG_TRACE(
-                      "Finished {} {} diff between {}-{} before merge region",
-                      snapshotDataTypeStr(diffs.back().dataType),
-                      snapshotMergeOpStr(diffs.back().operation),
-                      diffs.back().offset,
-                      diffs.back().offset + diffs.back().size);
-
-                    diffInProgress = false;
-                }
+        SPDLOG_TRACE("Checking dirty page {} at {}-{}", i, pageStart, pageEnd);
 
-                // Get the diff for this merge region
-                SnapshotMergeRegion region = mergeIt->second;
-                SnapshotDiff diff = region.toDiff(data, updated);
+        // Skip any merge regions we've passed
+        while (mergeIt != mergeRegions.end() &&
+               (mergeIt->second.offset < pageStart)) {
+            SPDLOG_TRACE("Gone past {} {} merge region at {}-{}",
+                         snapshotDataTypeStr(mergeIt->second.dataType),
+                         snapshotMergeOpStr(mergeIt->second.operation),
+                         mergeIt->second.offset,
+                         mergeIt->second.offset + mergeIt->second.length);
 
-                SPDLOG_TRACE("Diff at {} falls in {} {} merge region {}-{}",
-                             pageOffset + b,
-                             snapshotDataTypeStr(region.dataType),
-                             snapshotMergeOpStr(region.operation),
-                             region.offset,
-                             region.offset + region.length);
-
-                // Add the diff to the list
-                if (diff.operation != SnapshotMergeOperation::Ignore) {
-                    diffs.emplace_back(diff);
-                }
-
-                // Work out the offset where this region ends
-                int regionEndOffset =
-                  (region.offset - pageOffset) + region.length;
-
-                if (regionEndOffset < HOST_PAGE_SIZE) {
-                    // Skip over this region, still more offsets left in this
-                    // page
-                    SPDLOG_TRACE(
-                      "{} {} merge region {}-{} finished. Skipping to {}",
-                      snapshotDataTypeStr(region.dataType),
-                      snapshotMergeOpStr(region.operation),
-                      region.offset,
-                      region.offset + region.length,
-                      pageOffset + regionEndOffset);
-
-                    // Bump the loop variable to the end of this region (note
-                    // that the loop itself will increment onto the next).
-                    b = regionEndOffset - 1;
-                } else {
-                    // Merge region extends over this page, move onto next
-                    SPDLOG_TRACE(
-                      "{} {} merge region {}-{} over page boundary {} ({}-{})",
-                      snapshotDataTypeStr(region.dataType),
-                      snapshotMergeOpStr(region.operation),
-                      region.offset,
-                      region.offset + region.length,
-                      i,
-                      pageOffset,
-                      pageOffset + HOST_PAGE_SIZE);
-
-                    break;
-                }
-            } else if (isDirtyByte && !diffInProgress) {
-                // Diff starts here if it's different and diff not in progress
-                diffInProgress = true;
-                diffStart = offset;
-
-                SPDLOG_TRACE("Started Raw Overwrite diff at {}", diffStart);
-            } else if (!isDirtyByte && diffInProgress) {
-                // Diff ends if it's not different and diff is in progress
-                diffInProgress = false;
-                diffs.emplace_back(
-                  diffStart, updated + diffStart, offset - diffStart);
-
-                SPDLOG_TRACE("Finished {} {} diff between {}-{}",
-                             snapshotDataTypeStr(diffs.back().dataType),
-                             snapshotMergeOpStr(diffs.back().operation),
-                             diffs.back().offset,
-                             diffs.back().offset + diffs.back().size);
-            }
+            ++mergeIt;
         }
 
-        // If we've reached the end of this page with a diff in progress, we
-        // need to close it off
-        if (diffInProgress) {
-            offset++;
-
-            diffs.emplace_back(
-              diffStart, updated + diffStart, offset - diffStart);
-
-            SPDLOG_TRACE("Found {} {} diff between {}-{} at end of page",
-                         snapshotDataTypeStr(diffs.back().dataType),
-                         snapshotMergeOpStr(diffs.back().operation),
-                         diffs.back().offset,
-                         diffs.back().offset + diffs.back().size);
+        if (mergeIt == mergeRegions.end()) {
+            // Done if no more merge regions left
+            SPDLOG_TRACE("No more merge regions left");
+            break;
         }
-    }
 
-    // If comparison has more pages than the original, add another diff
-    // containing all the new pages
-    if (updatedSize > size) {
-        diffs.emplace_back(size, updated + size, updatedSize - size);
+        // For each merge region that overlaps this dirty page, get it to add
+        // its diffs, and move onto the next one
+        // TODO - make this more efficient by passing in dirty pages to merge
+        // regions
+        while (mergeIt != mergeRegions.end() &&
+               ((mergeIt->second.offset > pageStart) &&
+                (mergeIt->second.offset + mergeIt->second.length) < pageEnd)) {
+
+            SPDLOG_TRACE("Overlap with {} {} merge region at {}-{}",
+                         snapshotDataTypeStr(mergeIt->second.dataType),
+                         snapshotMergeOpStr(mergeIt->second.operation),
+                         mergeIt->second.offset,
+                         mergeIt->second.offset + mergeIt->second.length);
+
+            mergeIt->second.addDiffs(diffs, data, updated);
+            mergeIt++;
+        }
     }
 
     return diffs;
@@ -275,19 +109,14 @@ void SnapshotData::addMergeRegion(uint32_t offset,
                                 .dataType = dataType,
                                 .operation = operation };
 
-    // Locking as this may be called in bursts by multiple threads
-    faabric::util::UniqueLock lock(snapMx);
-    mergeRegions[offset] = region;
-}
-
-void SnapshotData::setIsCustomMerge(bool value)
-{
-    _isCustomMerge = value;
+    addMergeRegion(region);
 }
 
-bool SnapshotData::isCustomMerge()
+void SnapshotData::addMergeRegion(SnapshotMergeRegion& region)
 {
-    return _isCustomMerge;
+    // Locking as this may be called in bursts by multiple threads
+    faabric::util::UniqueLock lock(snapMx);
+    mergeRegions[region.offset] = region;
 }
 
 std::string snapshotDataTypeStr(SnapshotDataType dt)
@@ -336,4 +165,140 @@ std::string snapshotMergeOpStr(SnapshotMergeOperation op)
         }
     }
 }
+
+void SnapshotMergeRegion::addDiffs(std::vector<SnapshotDiff>& diffs,
+                                   const uint8_t* original,
+                                   const uint8_t* updated)
+{
+    // Skip an ignore
+    if (operation == SnapshotMergeOperation::Ignore) {
+        return;
+    }
+
+    const uint8_t* updatedValue = updated + offset;
+    const uint8_t* originalValue = original + offset;
+
+    switch (dataType) {
+        case (SnapshotDataType::Int): {
+            // Get the original and update values
+            int originalInt = *(reinterpret_cast<const int*>(originalValue));
+            int updatedInt = *(reinterpret_cast<const int*>(updatedValue));
+
+            // Skip if no change
+            if (originalInt == updatedInt) {
+                return;
+            }
+
+            // Add the diff
+            SnapshotDiff diff(offset, updatedValue, length);
+            diff.dataType = dataType;
+            diff.operation = operation;
+            diffs.emplace_back(diff);
+
+            SPDLOG_TRACE("Adding {} {} diff at {}-{}",
+                         snapshotDataTypeStr(dataType),
+                         snapshotMergeOpStr(operation),
+                         offset,
+                         offset + length);
+
+            // Potentially modify the original in place depending on the
+            // operation
+            switch (operation) {
+                case (SnapshotMergeOperation::Sum): {
+                    // Sums must send the value to be _added_, and
+                    // not the final result
+                    updatedInt -= originalInt;
+                    break;
+                }
+                case (SnapshotMergeOperation::Subtract): {
+                    // Subtractions must send the value to be
+                    // subtracted, not the result
+                    updatedInt = originalInt - updatedInt;
+                    break;
+                }
+                case (SnapshotMergeOperation::Product): {
+                    // Products must send the value to be
+                    // multiplied, not the result
+                    updatedInt /= originalInt;
+                    break;
+                }
+                case (SnapshotMergeOperation::Max):
+                case (SnapshotMergeOperation::Min):
+                    // Min and max don't need to change
+                    break;
+                default: {
+                    SPDLOG_ERROR("Unhandled integer merge operation: {}",
+                                 operation);
+                    throw std::runtime_error(
+                      "Unhandled integer merge operation");
+                }
+            }
+
+            // TODO - somehow avoid casting away the const here?
+            // Modify the memory in-place here
+            std::memcpy(
+              (uint8_t*)updatedValue, BYTES(&updatedInt), sizeof(int32_t));
+
+            break;
+        }
+        case (SnapshotDataType::Raw): {
+            switch (operation) {
+                case (SnapshotMergeOperation::Overwrite): {
+                    // Add subsections of diffs only for the bytes that
+                    // have changed
+                    bool diffInProgress = false;
+                    int diffStart = 0;
+                    for (int b = 0; b < length; b++) {
+                        bool isDirtyByte =
+                          *(original + offset) != *(updated + offset);
+
+                        if (isDirtyByte && !diffInProgress) {
+                            // Diff starts here if it's different and diff
+                            // not in progress
+                            diffInProgress = true;
+                            diffStart = b;
+                        } else if (!isDirtyByte && diffInProgress) {
+                            // Diff ends if it's not different and diff is
+                            // in progress
+                            SPDLOG_TRACE("Adding {} {} diff at {}-{}",
+                                         snapshotDataTypeStr(dataType),
+                                         snapshotMergeOpStr(operation),
+                                         offset,
+                                         offset + length);
+
+                            diffInProgress = false;
+                            diffs.emplace_back(
+                              diffStart, updated + diffStart, b - diffStart);
+                        }
+                    }
+
+                    // If we've reached the end of this region with a diff
+                    // in progress, we need to close it off
+                    if (diffInProgress) {
+                        SPDLOG_TRACE("Adding {} {} diff at {}-{}",
+                                     snapshotDataTypeStr(dataType),
+                                     snapshotMergeOpStr(operation),
+                                     offset,
+                                     offset + length);
+
+                        diffs.emplace_back(diffStart,
+                                           updated + diffStart,
+                                           (length + 1) - diffStart);
+                    }
+                }
+                default: {
+                    SPDLOG_ERROR("Unhandled raw merge operation: {}",
+                                 operation);
+                    throw std::runtime_error("Unhandled raw merge operation");
+                }
+            }
+
+            break;
+        }
+        default: {
+            SPDLOG_ERROR("Merge region for unhandled data type: {}", dataType);
+            throw std::runtime_error("Merge region for unhandled data type");
+        }
+    }
+}
 }

From db284929dd2d0eb436ff1355189099e39d2e0edf Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Thu, 4 Nov 2021 18:34:00 +0000
Subject: [PATCH 12/31] Move spurious logging statement

---
 src/util/snapshot.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/util/snapshot.cpp b/src/util/snapshot.cpp
index e5a2751bf..89140947a 100644
--- a/src/util/snapshot.cpp
+++ b/src/util/snapshot.cpp
@@ -38,8 +38,8 @@ std::vector<SnapshotDiff> SnapshotData::getChangeDiffs(const uint8_t* updated,
                                                        size_t updatedSize)
 {
     std::vector<SnapshotDiff> diffs;
-    SPDLOG_DEBUG("No merge regions set, thus no diffs");
     if (mergeRegions.empty()) {
+        SPDLOG_DEBUG("No merge regions set, thus no diffs");
         return diffs;
     }
 

From 8ce837535cb3034f452ba862defe00bcb5986302 Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Fri, 5 Nov 2021 12:14:25 +0000
Subject: [PATCH 13/31] More snapshotting fixes

---
 include/faabric/util/snapshot.h | 12 +----
 src/snapshot/SnapshotServer.cpp | 27 +++++++-----
 src/util/snapshot.cpp           | 77 ++++++++++++++++++++++++---------
 3 files changed, 75 insertions(+), 41 deletions(-)

diff --git a/include/faabric/util/snapshot.h b/include/faabric/util/snapshot.h
index 725eaeb25..a005dbae3 100644
--- a/include/faabric/util/snapshot.h
+++ b/include/faabric/util/snapshot.h
@@ -53,13 +53,6 @@ class SnapshotDiff
         data = dataIn;
         size = sizeIn;
     }
-
-    SnapshotDiff(uint32_t offsetIn, const uint8_t* dataIn, size_t sizeIn)
-    {
-        offset = offsetIn;
-        data = dataIn;
-        size = sizeIn;
-    }
 };
 
 class SnapshotMergeRegion
@@ -92,9 +85,8 @@ class SnapshotData
     void addMergeRegion(uint32_t offset,
                         size_t length,
                         SnapshotDataType dataType,
-                        SnapshotMergeOperation operation);
-
-    void addMergeRegion(SnapshotMergeRegion& region);
+                        SnapshotMergeOperation operation,
+                        bool overwrite = false);
 
   private:
     // Note - we care about the order of this map, as we iterate through it
diff --git a/src/snapshot/SnapshotServer.cpp b/src/snapshot/SnapshotServer.cpp
index fe0df2e5f..bc6783783 100644
--- a/src/snapshot/SnapshotServer.cpp
+++ b/src/snapshot/SnapshotServer.cpp
@@ -132,18 +132,25 @@ SnapshotServer::recvPushSnapshotDiffs(const uint8_t* buffer, size_t bufferSize)
     }
 
     // Iterate through the chunks passed in the request
-    for (const auto* r : *r->chunks()) {
-        uint8_t* dest = snap.data + r->offset();
-        switch (r->dataType()) {
+    for (const auto* chunk : *r->chunks()) {
+        uint8_t* dest = snap.data + chunk->offset();
+
+        SPDLOG_TRACE("Applying snapshot diff to {} at {}-{}",
+                     r->key()->str(),
+                     chunk->offset(),
+                     chunk->offset() + chunk->data()->size());
+
+        switch (chunk->dataType()) {
             case (faabric::util::SnapshotDataType::Raw): {
-                switch (r->mergeOp()) {
+                switch (chunk->mergeOp()) {
                     case (faabric::util::SnapshotMergeOperation::Overwrite): {
-                        std::memcpy(dest, r->data()->data(), r->data()->size());
+                        std::memcpy(
+                          dest, chunk->data()->data(), chunk->data()->size());
                         break;
                     }
                     default: {
                         SPDLOG_ERROR("Unsupported raw merge operation: {}",
-                                     r->mergeOp());
+                                     chunk->mergeOp());
                         throw std::runtime_error(
                           "Unsupported raw merge operation");
                     }
@@ -152,9 +159,9 @@ SnapshotServer::recvPushSnapshotDiffs(const uint8_t* buffer, size_t bufferSize)
             }
             case (faabric::util::SnapshotDataType::Int): {
                 const auto* value =
-                  reinterpret_cast<const int32_t*>(r->data()->data());
+                  reinterpret_cast<const int32_t*>(chunk->data()->data());
                 auto* destValue = reinterpret_cast<int32_t*>(dest);
-                switch (r->mergeOp()) {
+                switch (chunk->mergeOp()) {
                     case (faabric::util::SnapshotMergeOperation::Sum): {
                         *destValue += *value;
                         break;
@@ -177,7 +184,7 @@ SnapshotServer::recvPushSnapshotDiffs(const uint8_t* buffer, size_t bufferSize)
                     }
                     default: {
                         SPDLOG_ERROR("Unsupported int merge operation: {}",
-                                     r->mergeOp());
+                                     chunk->mergeOp());
                         throw std::runtime_error(
                           "Unsupported int merge operation");
                     }
@@ -185,7 +192,7 @@ SnapshotServer::recvPushSnapshotDiffs(const uint8_t* buffer, size_t bufferSize)
                 break;
             }
             default: {
-                SPDLOG_ERROR("Unsupported data type: {}", r->dataType());
+                SPDLOG_ERROR("Unsupported data type: {}", chunk->dataType());
                 throw std::runtime_error("Unsupported merge data type");
             }
         }
diff --git a/src/util/snapshot.cpp b/src/util/snapshot.cpp
index 89140947a..3b84796a9 100644
--- a/src/util/snapshot.cpp
+++ b/src/util/snapshot.cpp
@@ -26,7 +26,11 @@ std::vector<SnapshotDiff> SnapshotData::getDirtyPages()
     std::vector<SnapshotDiff> diffs;
     for (int i : dirtyPageNumbers) {
         uint32_t offset = i * HOST_PAGE_SIZE;
-        diffs.emplace_back(offset, data + offset, HOST_PAGE_SIZE);
+        diffs.emplace_back(SnapshotDataType::Raw,
+                           SnapshotMergeOperation::Overwrite,
+                           offset,
+                           data + offset,
+                           HOST_PAGE_SIZE);
     }
 
     SPDLOG_DEBUG("Snapshot has {}/{} dirty pages", diffs.size(), nPages);
@@ -43,6 +47,14 @@ std::vector<SnapshotDiff> SnapshotData::getChangeDiffs(const uint8_t* updated,
         return diffs;
     }
 
+    for (const auto& mr : mergeRegions) {
+        SPDLOG_TRACE("Merge region {} {} at {}-{}",
+                     snapshotDataTypeStr(mr.second.dataType),
+                     snapshotMergeOpStr(mr.second.operation),
+                     mr.second.offset,
+                     mr.second.offset + mr.second.length);
+    }
+
     // Work out which pages have changed (these will be sorted)
     size_t nThisPages = getRequiredHostPages(size);
     std::vector<int> dirtyPageNumbers =
@@ -82,8 +94,8 @@ std::vector<SnapshotDiff> SnapshotData::getChangeDiffs(const uint8_t* updated,
         // TODO - make this more efficient by passing in dirty pages to merge
         // regions
         while (mergeIt != mergeRegions.end() &&
-               ((mergeIt->second.offset > pageStart) &&
-                (mergeIt->second.offset + mergeIt->second.length) < pageEnd)) {
+               (mergeIt->second.offset > pageStart &&
+                mergeIt->second.offset < pageEnd)) {
 
             SPDLOG_TRACE("Overlap with {} {} merge region at {}-{}",
                          snapshotDataTypeStr(mergeIt->second.dataType),
@@ -102,20 +114,39 @@ std::vector<SnapshotDiff> SnapshotData::getChangeDiffs(const uint8_t* updated,
 void SnapshotData::addMergeRegion(uint32_t offset,
                                   size_t length,
                                   SnapshotDataType dataType,
-                                  SnapshotMergeOperation operation)
+                                  SnapshotMergeOperation operation,
+                                  bool overwrite)
 {
     SnapshotMergeRegion region{ .offset = offset,
                                 .length = length,
                                 .dataType = dataType,
                                 .operation = operation };
 
-    addMergeRegion(region);
-}
-
-void SnapshotData::addMergeRegion(SnapshotMergeRegion& region)
-{
     // Locking as this may be called in bursts by multiple threads
     faabric::util::UniqueLock lock(snapMx);
+
+    if (mergeRegions.find(region.offset) != mergeRegions.end()) {
+        if (!overwrite) {
+            SPDLOG_ERROR("Attempting to overwrite existing merge region at {} "
+                         "with {} {} at {}-{}",
+                         region.offset,
+                         snapshotDataTypeStr(dataType),
+                         snapshotMergeOpStr(operation),
+                         region.offset,
+                         region.offset + length);
+
+            throw std::runtime_error("Not able to overwrite merge region");
+        }
+
+        SPDLOG_TRACE(
+          "Overwriting existing merge region at {} with {} {} at {}-{}",
+          region.offset,
+          snapshotDataTypeStr(dataType),
+          snapshotMergeOpStr(operation),
+          region.offset,
+          region.offset + length);
+    }
+
     mergeRegions[region.offset] = region;
 }
 
@@ -190,10 +221,8 @@ void SnapshotMergeRegion::addDiffs(std::vector<SnapshotDiff>& diffs,
             }
 
             // Add the diff
-            SnapshotDiff diff(offset, updatedValue, length);
-            diff.dataType = dataType;
-            diff.operation = operation;
-            diffs.emplace_back(diff);
+            diffs.emplace_back(
+              dataType, operation, offset, updatedValue, length);
 
             SPDLOG_TRACE("Adding {} {} diff at {}-{}",
                          snapshotDataTypeStr(dataType),
@@ -248,9 +277,8 @@ void SnapshotMergeRegion::addDiffs(std::vector<SnapshotDiff>& diffs,
                     // have changed
                     bool diffInProgress = false;
                     int diffStart = 0;
-                    for (int b = 0; b < length; b++) {
-                        bool isDirtyByte =
-                          *(original + offset) != *(updated + offset);
+                    for (int b = offset; b < offset + length; b++) {
+                        bool isDirtyByte = *(original + b) != *(updated + b);
 
                         if (isDirtyByte && !diffInProgress) {
                             // Diff starts here if it's different and diff
@@ -260,15 +288,19 @@ void SnapshotMergeRegion::addDiffs(std::vector<SnapshotDiff>& diffs,
                         } else if (!isDirtyByte && diffInProgress) {
                             // Diff ends if it's not different and diff is
                             // in progress
+                            int diffLength = b - diffStart;
                             SPDLOG_TRACE("Adding {} {} diff at {}-{}",
                                          snapshotDataTypeStr(dataType),
                                          snapshotMergeOpStr(operation),
-                                         offset,
-                                         offset + length);
+                                         offset + diffStart,
+                                         offset + diffStart + diffLength);
 
                             diffInProgress = false;
-                            diffs.emplace_back(
-                              diffStart, updated + diffStart, b - diffStart);
+                            diffs.emplace_back(dataType,
+                                               operation,
+                                               diffStart,
+                                               updated + diffStart,
+                                               diffLength);
                         }
                     }
 
@@ -281,10 +313,13 @@ void SnapshotMergeRegion::addDiffs(std::vector<SnapshotDiff>& diffs,
                                      offset,
                                      offset + length);
 
-                        diffs.emplace_back(diffStart,
+                        diffs.emplace_back(dataType,
+                                           operation,
+                                           diffStart,
                                            updated + diffStart,
                                            (length + 1) - diffStart);
                     }
+                    break;
                 }
                 default: {
                     SPDLOG_ERROR("Unhandled raw merge operation: {}",

From 627209b48b6afbc3503bd84c1f2306ae0c42b7d7 Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Fri, 5 Nov 2021 14:50:42 +0000
Subject: [PATCH 14/31] Rearrange scheduler logic

---
 include/faabric/scheduler/Scheduler.h |   9 +-
 src/scheduler/Scheduler.cpp           | 348 +++++++++++++-------------
 src/snapshot/SnapshotServer.cpp       |   3 +
 3 files changed, 184 insertions(+), 176 deletions(-)

diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h
index dc7b070d2..c76c6d5f3 100644
--- a/include/faabric/scheduler/Scheduler.h
+++ b/include/faabric/scheduler/Scheduler.h
@@ -205,6 +205,8 @@ class Scheduler
                        std::promise<std::unique_ptr<faabric::Message>>>
       localResults;
 
+    std::unordered_map<std::string, std::set<std::string>> pushedSnapshotsMap;
+
     std::mutex localResultsMutex;
 
     // ---- Clients ----
@@ -234,13 +236,6 @@ class Scheduler
     std::vector<std::string> getUnregisteredHosts(const std::string& funcStr,
                                                   bool noCache = false);
 
-    int scheduleFunctionsOnHost(
-      const std::string& host,
-      std::shared_ptr<faabric::BatchExecuteRequest> req,
-      faabric::util::SchedulingDecision& decision,
-      int offset,
-      faabric::util::SnapshotData* snapshot);
-
     // ---- Accounting and debugging ----
     std::vector<faabric::Message> recordedMessagesAll;
     std::vector<faabric::Message> recordedMessagesLocal;
diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp
index 263ac7d87..f1fd4cc37 100644
--- a/src/scheduler/Scheduler.cpp
+++ b/src/scheduler/Scheduler.cpp
@@ -224,15 +224,12 @@ faabric::util::SchedulingDecision Scheduler::callFunctions(
         throw std::runtime_error("Message with no master host");
     }
 
-    // Set up scheduling decision
-    SchedulingDecision decision(firstMsg.appid(), firstMsg.groupid());
-
     // TODO - more granular locking, this is conservative
     faabric::util::FullLock lock(mx);
 
     // If we're not the master host, we need to forward the request back to the
     // master host. This will only happen if a nested batch execution happens.
-    std::vector<int> localMessageIdxs;
+    SchedulingDecision decision(firstMsg.appid(), firstMsg.groupid());
     if (!forceLocal && masterHost != thisHost) {
         SPDLOG_DEBUG(
           "Forwarding {} {} back to master {}", nMessages, funcStr, masterHost);
@@ -242,98 +239,50 @@ faabric::util::SchedulingDecision Scheduler::callFunctions(
         return decision;
     }
 
+    // -------------------------------------------
+    // HOST DECISIONS
+    // -------------------------------------------
+    std::vector<std::string> hosts;
     if (forceLocal) {
         // We're forced to execute locally here so we do all the messages
         for (int i = 0; i < nMessages; i++) {
-            localMessageIdxs.emplace_back(i);
-            decision.addMessage(thisHost, req->messages().at(i));
+            hosts.push_back(thisHost);
         }
     } else {
         // At this point we know we're the master host, and we've not been
         // asked to force full local execution.
 
-        // Get a list of other registered hosts
-        std::set<std::string>& thisRegisteredHosts = registeredHosts[funcStr];
-
-        // For threads/ processes we need to have a snapshot key and be
-        // ready to push the snapshot to other hosts.
-        // We also have to broadcast the latest snapshots to all registered
-        // hosts, regardless of whether they're going to execute a function.
-        // This ensures everything is up to date, and we don't have to
-        // maintain different records of which hosts hold which updates.
-        faabric::util::SnapshotData snapshotData;
-        std::string snapshotKey = firstMsg.snapshotkey();
-        bool snapshotNeeded =
-          req->type() == req->THREADS || req->type() == req->PROCESSES;
-
-        if (snapshotNeeded) {
-            if (snapshotKey.empty()) {
-                SPDLOG_ERROR("No snapshot provided for {}", funcStr);
-                throw std::runtime_error(
-                  "Empty snapshot for distributed threads/ processes");
-            }
-
-            snapshotData =
-              faabric::snapshot::getSnapshotRegistry().getSnapshot(snapshotKey);
-
-            if (!thisRegisteredHosts.empty()) {
-                std::vector<faabric::util::SnapshotDiff> snapshotDiffs =
-                  snapshotData.getDirtyPages();
-
-                // Do the snapshot diff pushing
-                if (!snapshotDiffs.empty()) {
-                    for (const auto& h : thisRegisteredHosts) {
-                        SPDLOG_DEBUG("Pushing {} snapshot diffs for {} to {}",
-                                     snapshotDiffs.size(),
-                                     funcStr,
-                                     h);
-                        SnapshotClient& c = getSnapshotClient(h);
-                        c.pushSnapshotDiffs(
-                          snapshotKey, firstMsg.groupid(), snapshotDiffs);
-                    }
-                }
-
-                // Now reset the dirty page tracking, as we want the next batch
-                // of diffs to contain everything from now on (including the
-                // updates sent back from all the threads)
-                SPDLOG_DEBUG("Resetting dirty tracking after pushing diffs {}",
-                             funcStr);
-                faabric::util::resetDirtyTracking();
-            }
-        }
-
         // Work out how many we can handle locally
-        int nLocally;
-        {
-            int slots = thisHostResources.slots();
+        int slots = thisHostResources.slots();
 
-            // Work out available cores, flooring at zero
-            int available =
-              slots - this->thisHostUsedSlots.load(std::memory_order_acquire);
-            available = std::max<int>(available, 0);
+        // Work out available cores, flooring at zero
+        int available =
+          slots - this->thisHostUsedSlots.load(std::memory_order_acquire);
+        available = std::max<int>(available, 0);
 
-            // Claim as many as we can
-            nLocally = std::min<int>(available, nMessages);
-        }
+        // Claim as many as we can
+        int nLocally = std::min<int>(available, nMessages);
 
         // Add those that can be executed locally
-        if (nLocally > 0) {
-            SPDLOG_DEBUG(
-              "Executing {}/{} {} locally", nLocally, nMessages, funcStr);
-            for (int i = 0; i < nLocally; i++) {
-                localMessageIdxs.emplace_back(i);
-                decision.addMessage(thisHost, req->messages().at(i));
-            }
+        for (int i = 0; i < nLocally; i++) {
+            hosts.push_back(thisHost);
         }
 
         // If some are left, we need to distribute
         int offset = nLocally;
         if (offset < nMessages) {
             // Schedule first to already registered hosts
+            std::set<std::string>& thisRegisteredHosts =
+              registeredHosts[funcStr];
+
             for (const auto& h : thisRegisteredHosts) {
-                int nOnThisHost = scheduleFunctionsOnHost(
-                  h, req, decision, offset, &snapshotData);
+                int remainder = nMessages - offset;
+
+                // Work out resources on this host
+                faabric::HostResources r = getHostResources(h);
+                int available = r.slots() - r.usedslots();
 
+                int nOnThisHost = std::min(available, remainder);
                 offset += nOnThisHost;
                 if (offset >= nMessages) {
                     break;
@@ -346,15 +295,14 @@ faabric::util::SchedulingDecision Scheduler::callFunctions(
             std::vector<std::string> unregisteredHosts =
               getUnregisteredHosts(funcStr);
 
-            for (auto& h : unregisteredHosts) {
-                // Skip if this host
-                if (h == thisHost) {
-                    continue;
-                }
+            for (const auto& h : unregisteredHosts) {
+                int remainder = nMessages - offset;
+
+                // Work out resources on this host
+                faabric::HostResources r = getHostResources(h);
+                int available = r.slots() - r.usedslots();
 
-                // Schedule functions on the host
-                int nOnThisHost = scheduleFunctionsOnHost(
-                  h, req, decision, offset, &snapshotData);
+                int nOnThisHost = std::min(available, remainder);
 
                 // Register the host if it's exected a function
                 if (nOnThisHost > 0) {
@@ -378,8 +326,7 @@ faabric::util::SchedulingDecision Scheduler::callFunctions(
                          funcStr);
 
             for (; offset < nMessages; offset++) {
-                localMessageIdxs.emplace_back(offset);
-                decision.addMessage(thisHost, req->messages().at(offset));
+                hosts.push_back(thisHost);
             }
         }
 
@@ -387,6 +334,9 @@ faabric::util::SchedulingDecision Scheduler::callFunctions(
         assert(offset == nMessages);
     }
 
+    // Sanity check
+    assert(hosts.size() == nMessages);
+
     // Register thread results if necessary
     if (isThreads) {
         for (const auto& m : req->messages()) {
@@ -394,58 +344,146 @@ faabric::util::SchedulingDecision Scheduler::callFunctions(
         }
     }
 
-    // Schedule messages locally if necessary. For threads we only need one
-    // executor, for anything else we want one Executor per function in flight
-    if (!localMessageIdxs.empty()) {
-        // Update slots
-        this->thisHostUsedSlots.fetch_add((int32_t)localMessageIdxs.size(),
-                                          std::memory_order_acquire);
-
-        if (isThreads) {
-            // Threads use the existing executor. We assume there's only one
-            // running at a time.
-            std::vector<std::shared_ptr<Executor>>& thisExecutors =
-              executors[funcStr];
-
-            std::shared_ptr<Executor> e = nullptr;
-            if (thisExecutors.empty()) {
-                // Create executor if not exists
-                e = claimExecutor(firstMsg, lock);
-            } else if (thisExecutors.size() == 1) {
-                // Use existing executor if exists
-                e = thisExecutors.back();
+    // Set up decision
+    for (int i = 0; i < hosts.size(); i++) {
+        decision.addMessage(hosts.at(i), req->messages().at(i));
+    }
+
+    // -------------------------------------------
+    // POINT-TO-POINT
+    // -------------------------------------------
+
+    // Send out point-to-point mappings if necessary (unless being forced to
+    // execute locally, in which case they will be transmitted from the
+    // master)
+    if (!forceLocal && (firstMsg.groupid() > 0)) {
+        broker.setAndSendMappingsFromSchedulingDecision(decision);
+    }
+
+    // -------------------------------------------
+    // SNAPSHOTS
+    // -------------------------------------------
+
+    // Push out snapshot diffs to registered hosts. We have to do this to
+    // *all* hosts, regardless of whether they will be executing functions.
+    // This greatly simplifies the reasoning about which hosts hold which
+    // diffs.
+    std::string snapshotKey = firstMsg.snapshotkey();
+    if (!snapshotKey.empty()) {
+        for (const auto& host : getFunctionRegisteredHosts(firstMsg)) {
+            SnapshotClient& c = getSnapshotClient(host);
+            faabric::util::SnapshotData snapshotData =
+              faabric::snapshot::getSnapshotRegistry().getSnapshot(snapshotKey);
+
+            // See if we've already pushed this snapshot to the given host,
+            // if so, just push the diffs
+            if (pushedSnapshotsMap[snapshotKey].contains(host)) {
+                std::vector<faabric::util::SnapshotDiff> snapshotDiffs =
+                  snapshotData.getDirtyPages();
+                c.pushSnapshotDiffs(
+                  snapshotKey, firstMsg.groupid(), snapshotDiffs);
             } else {
-                SPDLOG_ERROR("Found {} executors for threaded function {}",
-                             thisExecutors.size(),
-                             funcStr);
-                throw std::runtime_error(
-                  "Expected only one executor for threaded function");
+                c.pushSnapshot(snapshotKey, firstMsg.groupid(), snapshotData);
+                pushedSnapshotsMap[snapshotKey].insert(host);
             }
+        }
+    }
 
-            assert(e != nullptr);
+    // Now reset the dirty page tracking just before we start executing
+    SPDLOG_DEBUG("Resetting dirty tracking after pushing diffs {}", funcStr);
+    faabric::util::resetDirtyTracking();
 
-            // Execute the tasks
-            e->executeTasks(localMessageIdxs, req);
-        } else {
-            // Non-threads require one executor per task
-            for (auto i : localMessageIdxs) {
-                faabric::Message& localMsg = req->mutable_messages()->at(i);
-                if (localMsg.executeslocally()) {
-                    faabric::util::UniqueLock resultsLock(localResultsMutex);
-                    localResults.insert(
-                      { localMsg.id(),
-                        std::promise<std::unique_ptr<faabric::Message>>() });
-                }
-                std::shared_ptr<Executor> e = claimExecutor(localMsg, lock);
-                e->executeTasks({ i }, req);
+    // -------------------------------------------
+    // EXECTUION
+    // -------------------------------------------
+
+    std::set<std::string> uniqueHosts(hosts.begin(), hosts.end());
+
+    // Iterate through unique hosts
+    for (const std::string& host : uniqueHosts) {
+        // Work out which indexes are scheduled on this host
+        std::vector<int> thisHostIdxs;
+        for (int i = 0; i < hosts.size(); i++) {
+            if (hosts.at(i) == host) {
+                thisHostIdxs.push_back(i);
             }
         }
-    }
 
-    // Send out point-to-point mappings if necessary (unless being forced to
-    // execute locally, in which case they will be transmitted from the master)
-    if (!forceLocal && (firstMsg.groupid() > 0)) {
-        broker.setAndSendMappingsFromSchedulingDecision(decision);
+        if (host == thisHost) {
+            // -------------------------------------------
+            // LOCAL EXECTUION
+            // -------------------------------------------
+            // For threads we only need one executor, for anything else we want
+            // one Executor per function in flight.
+
+            // Update slots
+            this->thisHostUsedSlots.fetch_add(thisHostIdxs.size(),
+                                              std::memory_order_acquire);
+
+            if (isThreads) {
+                // Threads use the existing executor. We assume there's only
+                // one running at a time.
+                std::vector<std::shared_ptr<Executor>>& thisExecutors =
+                  executors[funcStr];
+
+                std::shared_ptr<Executor> e = nullptr;
+                if (thisExecutors.empty()) {
+                    // Create executor if not exists
+                    e = claimExecutor(firstMsg, lock);
+                } else if (thisExecutors.size() == 1) {
+                    // Use existing executor if exists
+                    e = thisExecutors.back();
+                } else {
+                    SPDLOG_ERROR("Found {} executors for threaded function {}",
+                                 thisExecutors.size(),
+                                 funcStr);
+                    throw std::runtime_error(
+                      "Expected only one executor for threaded function");
+                }
+
+                assert(e != nullptr);
+
+                // Execute the tasks
+                e->executeTasks(thisHostIdxs, req);
+            } else {
+                // Non-threads require one executor per task
+                for (auto i : thisHostIdxs) {
+                    faabric::Message& localMsg = req->mutable_messages()->at(i);
+                    if (localMsg.executeslocally()) {
+                        faabric::util::UniqueLock resultsLock(
+                          localResultsMutex);
+                        localResults.insert(
+                          { localMsg.id(),
+                            std::promise<
+                              std::unique_ptr<faabric::Message>>() });
+                    }
+                    std::shared_ptr<Executor> e = claimExecutor(localMsg, lock);
+                    e->executeTasks({ i }, req);
+                }
+            }
+        } else {
+            // -------------------------------------------
+            // REMOTE EXECTUION
+            // -------------------------------------------
+
+            // Set up new request
+            std::shared_ptr<faabric::BatchExecuteRequest> hostRequest =
+              faabric::util::batchExecFactory();
+            hostRequest->set_snapshotkey(req->snapshotkey());
+            hostRequest->set_type(req->type());
+            hostRequest->set_subtype(req->subtype());
+            hostRequest->set_contextdata(req->contextdata());
+
+            // Add messages
+            for (auto msgIdx : thisHostIdxs) {
+                auto* newMsg = hostRequest->add_messages();
+                *newMsg = req->messages().at(msgIdx);
+                newMsg->set_executeslocally(false);
+                decision.addMessage(host, req->messages().at(msgIdx));
+            }
+
+            getFunctionCallClient(host).executeFunctions(hostRequest);
+        }
     }
 
     // Records for tests
@@ -511,7 +549,7 @@ void Scheduler::broadcastSnapshotDelete(const faabric::Message& msg,
 int Scheduler::scheduleFunctionsOnHost(
   const std::string& host,
   std::shared_ptr<faabric::BatchExecuteRequest> req,
-  SchedulingDecision& decision,
+  std::vector<std::string> hosts,
   int offset,
   faabric::util::SnapshotData* snapshot)
 {
@@ -531,35 +569,6 @@ int Scheduler::scheduleFunctionsOnHost(
         return 0;
     }
 
-    // Set up new request
-    std::shared_ptr<faabric::BatchExecuteRequest> hostRequest =
-      faabric::util::batchExecFactory();
-    hostRequest->set_snapshotkey(req->snapshotkey());
-    hostRequest->set_type(req->type());
-    hostRequest->set_subtype(req->subtype());
-    hostRequest->set_contextdata(req->contextdata());
-
-    // Add messages
-    int nOnThisHost = std::min<int>(available, remainder);
-    for (int i = offset; i < (offset + nOnThisHost); i++) {
-        auto* newMsg = hostRequest->add_messages();
-        *newMsg = req->messages().at(i);
-        newMsg->set_executeslocally(false);
-        decision.addMessage(host, req->messages().at(i));
-    }
-
-    SPDLOG_DEBUG(
-      "Sending {}/{} {} to {}", nOnThisHost, nMessages, funcStr, host);
-
-    // Handle snapshots
-    std::string snapshotKey = firstMsg.snapshotkey();
-    if (snapshot != nullptr && !snapshotKey.empty()) {
-        SnapshotClient& c = getSnapshotClient(host);
-        c.pushSnapshot(snapshotKey, firstMsg.groupid(), *snapshot);
-    }
-
-    getFunctionCallClient(host).executeFunctions(hostRequest);
-
     return nOnThisHost;
 }
 
@@ -646,8 +655,8 @@ std::shared_ptr<Executor> Scheduler::claimExecutor(
         int nExecutors = thisExecutors.size();
         SPDLOG_DEBUG(
           "Scaling {} from {} -> {}", funcStr, nExecutors, nExecutors + 1);
-        // Spinning up a new executor can be lengthy, allow other things to run
-        // in parallel
+        // Spinning up a new executor can be lengthy, allow other things
+        // to run in parallel
         schedulerLock.unlock();
         auto executor = factory->createExecutor(msg);
         schedulerLock.lock();
@@ -711,7 +720,8 @@ void Scheduler::setFunctionResult(faabric::Message& msg)
         if (it != localResults.end()) {
             it->second.set_value(std::make_unique<faabric::Message>(msg));
         }
-        // Sync messages can't have their results read twice, so skip redis
+        // Sync messages can't have their results read twice, so skip
+        // redis
         if (!msg.isasync()) {
             return;
         }
@@ -729,8 +739,8 @@ void Scheduler::setFunctionResult(faabric::Message& msg)
 
 void Scheduler::registerThread(uint32_t msgId)
 {
-    // Here we need to ensure the promise is registered locally so callers can
-    // start waiting
+    // Here we need to ensure the promise is registered locally so
+    // callers can start waiting
     threadResults[msgId];
 }
 
@@ -818,13 +828,13 @@ faabric::Message Scheduler::getFunctionResult(unsigned int messageId,
     faabric::Message msgResult;
 
     if (isBlocking) {
-        // Blocking version will throw an exception when timing out which is
-        // handled by the caller.
+        // Blocking version will throw an exception when timing out
+        // which is handled by the caller.
         std::vector<uint8_t> result = redis.dequeueBytes(resultKey, timeoutMs);
         msgResult.ParseFromArray(result.data(), (int)result.size());
     } else {
-        // Non-blocking version will tolerate empty responses, therefore we
-        // handle the exception here
+        // Non-blocking version will tolerate empty responses, therefore
+        // we handle the exception here
         std::vector<uint8_t> result;
         try {
             result = redis.dequeueBytes(resultKey, timeoutMs);
diff --git a/src/snapshot/SnapshotServer.cpp b/src/snapshot/SnapshotServer.cpp
index bc6783783..108e3c5d6 100644
--- a/src/snapshot/SnapshotServer.cpp
+++ b/src/snapshot/SnapshotServer.cpp
@@ -205,6 +205,9 @@ SnapshotServer::recvPushSnapshotDiffs(const uint8_t* buffer, size_t bufferSize)
           ->localUnlock();
     }
 
+    // Reset dirty tracking having applied diffs
+    SPDLOG_DEBUG("Resetting dirty page tracking having applied diffs");
+
     // Send response
     return std::make_unique<faabric::EmptyResponse>();
 }

From e2b8f3abbcc1e84ffdfd2366b477c4ece530501e Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Fri, 5 Nov 2021 15:21:33 +0000
Subject: [PATCH 15/31] Fix indexing in scheduler loop

---
 src/scheduler/Scheduler.cpp | 122 +++++++++++++++++++-----------------
 1 file changed, 63 insertions(+), 59 deletions(-)

diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp
index f1fd4cc37..7febf86fb 100644
--- a/src/scheduler/Scheduler.cpp
+++ b/src/scheduler/Scheduler.cpp
@@ -117,6 +117,7 @@ void Scheduler::reset()
     availableHostsCache.clear();
     registeredHosts.clear();
     threadResults.clear();
+    pushedSnapshotsMap.clear();
 
     // Records
     recordedMessagesAll.clear();
@@ -268,40 +269,39 @@ faabric::util::SchedulingDecision Scheduler::callFunctions(
             hosts.push_back(thisHost);
         }
 
-        // If some are left, we need to distribute
-        int offset = nLocally;
-        if (offset < nMessages) {
-            // Schedule first to already registered hosts
+        // If some are left, we need to distribute.
+        // First try and do so on already registered hosts.
+        int remainder = nMessages - nLocally;
+        if (remainder > 0) {
             std::set<std::string>& thisRegisteredHosts =
               registeredHosts[funcStr];
 
             for (const auto& h : thisRegisteredHosts) {
-                int remainder = nMessages - offset;
-
                 // Work out resources on this host
                 faabric::HostResources r = getHostResources(h);
                 int available = r.slots() - r.usedslots();
-
                 int nOnThisHost = std::min(available, remainder);
-                offset += nOnThisHost;
-                if (offset >= nMessages) {
+
+                for (int i = 0; i < nOnThisHost; i++) {
+                    hosts.push_back(h);
+                }
+
+                remainder -= nOnThisHost;
+                if (remainder <= 0) {
                     break;
                 }
             }
         }
 
-        // Now schedule to unregistered hosts if there are some left
-        if (offset < nMessages) {
+        // Now schedule to unregistered hosts if there are messages left
+        if (remainder > 0) {
             std::vector<std::string> unregisteredHosts =
               getUnregisteredHosts(funcStr);
 
             for (const auto& h : unregisteredHosts) {
-                int remainder = nMessages - offset;
-
                 // Work out resources on this host
                 faabric::HostResources r = getHostResources(h);
                 int available = r.slots() - r.usedslots();
-
                 int nOnThisHost = std::min(available, remainder);
 
                 // Register the host if it's exected a function
@@ -310,33 +310,41 @@ faabric::util::SchedulingDecision Scheduler::callFunctions(
                     registeredHosts[funcStr].insert(h);
                 }
 
-                offset += nOnThisHost;
-                if (offset >= nMessages) {
+                for (int i = 0; i < nOnThisHost; i++) {
+                    hosts.push_back(h);
+                }
+
+                remainder -= nOnThisHost;
+                if (remainder <= 0) {
                     break;
                 }
             }
         }
 
         // At this point there's no more capacity in the system, so we
-        // just need to execute locally
-        if (offset < nMessages) {
-            SPDLOG_DEBUG("Overloading {}/{} {} locally",
-                         nMessages - offset,
-                         nMessages,
-                         funcStr);
+        // just need to overload locally
+        if (remainder > 0) {
+            SPDLOG_DEBUG(
+              "Overloading {}/{} {} locally", remainder, nMessages, funcStr);
 
-            for (; offset < nMessages; offset++) {
+            for (int i = 0; i < remainder; i++) {
                 hosts.push_back(thisHost);
             }
         }
-
-        // Sanity check
-        assert(offset == nMessages);
     }
 
     // Sanity check
     assert(hosts.size() == nMessages);
 
+    // Set up decision
+    for (int i = 0; i < hosts.size(); i++) {
+        decision.addMessage(hosts.at(i), req->messages().at(i));
+    }
+
+    // -------------------------------------------
+    // THREADS
+    // -------------------------------------------
+
     // Register thread results if necessary
     if (isThreads) {
         for (const auto& m : req->messages()) {
@@ -344,13 +352,8 @@ faabric::util::SchedulingDecision Scheduler::callFunctions(
         }
     }
 
-    // Set up decision
-    for (int i = 0; i < hosts.size(); i++) {
-        decision.addMessage(hosts.at(i), req->messages().at(i));
-    }
-
     // -------------------------------------------
-    // POINT-TO-POINT
+    // POINT-TO-POINT MESSAGING
     // -------------------------------------------
 
     // Send out point-to-point mappings if necessary (unless being forced to
@@ -397,10 +400,18 @@ faabric::util::SchedulingDecision Scheduler::callFunctions(
     // EXECTUION
     // -------------------------------------------
 
+    // NOTE: we want to schedule things on this host _last_, otherwise functions
+    // may start executing before all messages have been dispatched, thus
+    // slowing the remaining scheduling.
+    // TODO - clearner way to do this?
     std::set<std::string> uniqueHosts(hosts.begin(), hosts.end());
+    uniqueHosts.erase(thisHost);
+    std::vector<std::string> orderedHosts(uniqueHosts.begin(),
+                                          uniqueHosts.end());
+    orderedHosts.push_back(thisHost);
 
     // Iterate through unique hosts
-    for (const std::string& host : uniqueHosts) {
+    for (const std::string& host : orderedHosts) {
         // Work out which indexes are scheduled on this host
         std::vector<int> thisHostIdxs;
         for (int i = 0; i < hosts.size(); i++) {
@@ -416,6 +427,18 @@ faabric::util::SchedulingDecision Scheduler::callFunctions(
             // For threads we only need one executor, for anything else we want
             // one Executor per function in flight.
 
+            if (thisHostIdxs.empty()) {
+                SPDLOG_DEBUG("Not scheduling any calls to {} out of {} locally",
+                             funcStr,
+                             nMessages);
+                continue;
+            }
+
+            SPDLOG_DEBUG("Scheduling {}/{} calls to {} locally",
+                         thisHostIdxs.size(),
+                         nMessages,
+                         funcStr);
+
             // Update slots
             this->thisHostUsedSlots.fetch_add(thisHostIdxs.size(),
                                               std::memory_order_acquire);
@@ -466,6 +489,12 @@ faabric::util::SchedulingDecision Scheduler::callFunctions(
             // REMOTE EXECTUION
             // -------------------------------------------
 
+            SPDLOG_DEBUG("Scheduling {}/{} calls to {} on {}",
+                         thisHostIdxs.size(),
+                         nMessages,
+                         funcStr,
+                         host);
+
             // Set up new request
             std::shared_ptr<faabric::BatchExecuteRequest> hostRequest =
               faabric::util::batchExecFactory();
@@ -482,6 +511,7 @@ faabric::util::SchedulingDecision Scheduler::callFunctions(
                 decision.addMessage(host, req->messages().at(msgIdx));
             }
 
+            // Dispatch the calls
             getFunctionCallClient(host).executeFunctions(hostRequest);
         }
     }
@@ -546,32 +576,6 @@ void Scheduler::broadcastSnapshotDelete(const faabric::Message& msg,
     }
 }
 
-int Scheduler::scheduleFunctionsOnHost(
-  const std::string& host,
-  std::shared_ptr<faabric::BatchExecuteRequest> req,
-  std::vector<std::string> hosts,
-  int offset,
-  faabric::util::SnapshotData* snapshot)
-{
-    const faabric::Message& firstMsg = req->messages().at(0);
-    std::string funcStr = faabric::util::funcToString(firstMsg, false);
-
-    int nMessages = req->messages_size();
-    int remainder = nMessages - offset;
-
-    // Work out how many we can put on the host
-    faabric::HostResources r = getHostResources(host);
-    int available = r.slots() - r.usedslots();
-
-    // Drop out if none available
-    if (available <= 0) {
-        SPDLOG_DEBUG("Not scheduling {} on {}, no resources", funcStr, host);
-        return 0;
-    }
-
-    return nOnThisHost;
-}
-
 void Scheduler::callFunction(faabric::Message& msg, bool forceLocal)
 {
     // TODO - avoid this copy

From d9cca418992c7b017c4960942cf5b13540d1eebb Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Fri, 5 Nov 2021 15:53:13 +0000
Subject: [PATCH 16/31] Avoid pushing snapshots to master host

---
 src/scheduler/Scheduler.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp
index 7febf86fb..38ddb1085 100644
--- a/src/scheduler/Scheduler.cpp
+++ b/src/scheduler/Scheduler.cpp
@@ -299,6 +299,11 @@ faabric::util::SchedulingDecision Scheduler::callFunctions(
               getUnregisteredHosts(funcStr);
 
             for (const auto& h : unregisteredHosts) {
+                // Skip if this host
+                if (h == thisHost) {
+                    continue;
+                }
+
                 // Work out resources on this host
                 faabric::HostResources r = getHostResources(h);
                 int available = r.slots() - r.usedslots();

From a32b7a38300ca3bb24591106bd8c1209f50d2097 Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Fri, 5 Nov 2021 16:36:32 +0000
Subject: [PATCH 17/31] Remove last snapshot stuff

---
 include/faabric/scheduler/Scheduler.h |  2 --
 src/scheduler/Executor.cpp            | 22 ++++------------------
 2 files changed, 4 insertions(+), 20 deletions(-)

diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h
index c76c6d5f3..4813c2819 100644
--- a/include/faabric/scheduler/Scheduler.h
+++ b/include/faabric/scheduler/Scheduler.h
@@ -81,8 +81,6 @@ class Executor
     uint32_t threadPoolSize = 0;
 
   private:
-    std::string lastSnapshot;
-
     std::atomic<bool> claimed = false;
 
     std::mutex threadsMutex;
diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp
index 5159b7467..5fb9c2d31 100644
--- a/src/scheduler/Executor.cpp
+++ b/src/scheduler/Executor.cpp
@@ -88,8 +88,6 @@ void Executor::finish()
     // Reset variables
     boundMessage.Clear();
 
-    lastSnapshot = "";
-
     claimed = false;
 
     threadPoolThreads.clear();
@@ -113,8 +111,7 @@ void Executor::executeTasks(std::vector<int> msgIdxs,
     faabric::util::UniqueLock lock(threadsMutex);
 
     // Restore if necessary. If we're executing threads on the master host we
-    // assume we don't need to restore, but for everything else we do. If we've
-    // already restored from this snapshot, we don't do so again.
+    // assume we don't need to restore, but for everything else we do.
     faabric::Message& firstMsg = req->mutable_messages()->at(0);
     std::string snapshotKey = firstMsg.snapshotkey();
     std::string thisHost = faabric::util::getSystemConfig().endpointHost;
@@ -122,21 +119,10 @@ void Executor::executeTasks(std::vector<int> msgIdxs,
     bool isMaster = firstMsg.masterhost() == thisHost;
     bool isThreads = req->type() == faabric::BatchExecuteRequest::THREADS;
     bool isSnapshot = !snapshotKey.empty();
-    bool alreadyRestored = snapshotKey == lastSnapshot;
 
-    if (isSnapshot && !alreadyRestored) {
-        if ((!isMaster && isThreads) || !isThreads) {
-            SPDLOG_DEBUG("Restoring {} from snapshot {}", funcStr, snapshotKey);
-            lastSnapshot = snapshotKey;
-            restore(firstMsg);
-        } else {
-            SPDLOG_DEBUG("Skipping snapshot restore on master {} [{}]",
-                         funcStr,
-                         snapshotKey);
-        }
-    } else if (isSnapshot) {
-        SPDLOG_DEBUG(
-          "Skipping already restored snapshot {} [{}]", funcStr, snapshotKey);
+    if (isSnapshot && !isMaster) {
+        SPDLOG_DEBUG("Restoring {} from snapshot {}", funcStr, snapshotKey);
+        restore(firstMsg);
     }
 
     // Reset dirty page tracking if we're executing threads.

From 4b6eb322b41a28878ed8935594dc8861fb1fd033 Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Mon, 8 Nov 2021 08:48:30 +0000
Subject: [PATCH 18/31] Add scheduler hints

---
 include/faabric/scheduler/Scheduler.h |  9 ++++
 src/scheduler/Scheduler.cpp           | 78 +++++++++++++++++++--------
 2 files changed, 64 insertions(+), 23 deletions(-)

diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h
index 4813c2819..0e7414fcb 100644
--- a/include/faabric/scheduler/Scheduler.h
+++ b/include/faabric/scheduler/Scheduler.h
@@ -104,6 +104,10 @@ class Scheduler
       std::shared_ptr<faabric::BatchExecuteRequest> req,
       bool forceLocal = false);
 
+    faabric::util::SchedulingDecision callFunctions(
+      std::shared_ptr<faabric::BatchExecuteRequest> req,
+      faabric::util::SchedulingDecision& hint);
+
     void reset();
 
     void resetThreadLocalCache();
@@ -227,6 +231,11 @@ class Scheduler
 
     std::unordered_map<std::string, std::set<std::string>> registeredHosts;
 
+    faabric::util::SchedulingDecision doCallFunctions(
+      std::shared_ptr<faabric::BatchExecuteRequest> req,
+      faabric::util::SchedulingDecision& decision,
+      faabric::util::FullLock& lock);
+
     std::shared_ptr<Executor> claimExecutor(
       faabric::Message& msg,
       faabric::util::FullLock& schedulerLock);
diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp
index 38ddb1085..00a6a9ba8 100644
--- a/src/scheduler/Scheduler.cpp
+++ b/src/scheduler/Scheduler.cpp
@@ -1,3 +1,4 @@
+#include "faabric/util/locks.h"
 #include <faabric/proto/faabric.pb.h>
 #include <faabric/redis/Redis.h>
 #include <faabric/scheduler/ExecutorFactory.h>
@@ -210,10 +211,6 @@ faabric::util::SchedulingDecision Scheduler::callFunctions(
   std::shared_ptr<faabric::BatchExecuteRequest> req,
   bool forceLocal)
 {
-    // Extract properties of the request
-    int nMessages = req->messages_size();
-    bool isThreads = req->type() == faabric::BatchExecuteRequest::THREADS;
-
     // Note, we assume all the messages are for the same function and have the
     // same master host
     faabric::Message& firstMsg = req->mutable_messages()->at(0);
@@ -225,25 +222,24 @@ faabric::util::SchedulingDecision Scheduler::callFunctions(
         throw std::runtime_error("Message with no master host");
     }
 
-    // TODO - more granular locking, this is conservative
-    faabric::util::FullLock lock(mx);
-
     // If we're not the master host, we need to forward the request back to the
     // master host. This will only happen if a nested batch execution happens.
     SchedulingDecision decision(firstMsg.appid(), firstMsg.groupid());
     if (!forceLocal && masterHost != thisHost) {
-        SPDLOG_DEBUG(
-          "Forwarding {} {} back to master {}", nMessages, funcStr, masterHost);
+        SPDLOG_DEBUG("Forwarding {} back to master {}", funcStr, masterHost);
 
         getFunctionCallClient(masterHost).executeFunctions(req);
         decision.returnHost = masterHost;
         return decision;
     }
 
+    faabric::util::FullLock lock(mx);
+
     // -------------------------------------------
-    // HOST DECISIONS
+    // DECISION BUILDING
     // -------------------------------------------
     std::vector<std::string> hosts;
+    int nMessages = req->messages_size();
     if (forceLocal) {
         // We're forced to execute locally here so we do all the messages
         for (int i = 0; i < nMessages; i++) {
@@ -346,9 +342,53 @@ faabric::util::SchedulingDecision Scheduler::callFunctions(
         decision.addMessage(hosts.at(i), req->messages().at(i));
     }
 
+    // Pass decision as hint
+    return doCallFunctions(req, decision, lock);
+}
+
+faabric::util::SchedulingDecision Scheduler::callFunctions(
+  std::shared_ptr<faabric::BatchExecuteRequest> req,
+  faabric::util::SchedulingDecision& hint)
+{
+    faabric::util::FullLock lock(mx);
+    return doCallFunctions(req, hint, lock);
+}
+
+faabric::util::SchedulingDecision Scheduler::doCallFunctions(
+  std::shared_ptr<faabric::BatchExecuteRequest> req,
+  faabric::util::SchedulingDecision& decision,
+  faabric::util::FullLock& lock)
+{
+    faabric::Message& firstMsg = req->mutable_messages()->at(0);
+    std::string funcStr = faabric::util::funcToString(firstMsg, false);
+    int nMessages = req->messages_size();
+
+    // NOTE: we want to schedule things on this host _last_, otherwise functions
+    // may start executing before all messages have been dispatched, thus
+    // slowing the remaining scheduling.
+    // Therefore we want to create a list of unique hosts, with this host last.
+    std::vector<std::string> orderedHosts;
+    {
+        std::set<std::string> uniqueHosts(decision.hosts.begin(),
+                                          decision.hosts.end());
+        bool hasFunctionsOnThisHost = uniqueHosts.contains(thisHost);
+
+        if (hasFunctionsOnThisHost) {
+            uniqueHosts.erase(thisHost);
+        }
+
+        std::vector<std::string> orderedHosts(uniqueHosts.begin(),
+                                              uniqueHosts.end());
+
+        if (hasFunctionsOnThisHost) {
+            orderedHosts.push_back(thisHost);
+        }
+    }
+
     // -------------------------------------------
     // THREADS
     // -------------------------------------------
+    bool isThreads = req->type() == faabric::BatchExecuteRequest::THREADS;
 
     // Register thread results if necessary
     if (isThreads) {
@@ -364,7 +404,9 @@ faabric::util::SchedulingDecision Scheduler::callFunctions(
     // Send out point-to-point mappings if necessary (unless being forced to
     // execute locally, in which case they will be transmitted from the
     // master)
-    if (!forceLocal && (firstMsg.groupid() > 0)) {
+    bool isOnlyThisHost =
+      orderedHosts.size() == 1 && orderedHosts.front() == thisHost;
+    if (!isOnlyThisHost && (firstMsg.groupid() > 0)) {
         broker.setAndSendMappingsFromSchedulingDecision(decision);
     }
 
@@ -405,22 +447,12 @@ faabric::util::SchedulingDecision Scheduler::callFunctions(
     // EXECTUION
     // -------------------------------------------
 
-    // NOTE: we want to schedule things on this host _last_, otherwise functions
-    // may start executing before all messages have been dispatched, thus
-    // slowing the remaining scheduling.
-    // TODO - clearner way to do this?
-    std::set<std::string> uniqueHosts(hosts.begin(), hosts.end());
-    uniqueHosts.erase(thisHost);
-    std::vector<std::string> orderedHosts(uniqueHosts.begin(),
-                                          uniqueHosts.end());
-    orderedHosts.push_back(thisHost);
-
     // Iterate through unique hosts
     for (const std::string& host : orderedHosts) {
         // Work out which indexes are scheduled on this host
         std::vector<int> thisHostIdxs;
-        for (int i = 0; i < hosts.size(); i++) {
-            if (hosts.at(i) == host) {
+        for (int i = 0; i < decision.hosts.size(); i++) {
+            if (decision.hosts.at(i) == host) {
                 thisHostIdxs.push_back(i);
             }
         }

From 983dfaaffc26ce9e9643a4730d9cc0bc813e1c8f Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Mon, 8 Nov 2021 09:06:07 +0000
Subject: [PATCH 19/31] Fix bug in host ordering

---
 src/scheduler/Scheduler.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp
index 00a6a9ba8..e1f1942ce 100644
--- a/src/scheduler/Scheduler.cpp
+++ b/src/scheduler/Scheduler.cpp
@@ -1,4 +1,3 @@
-#include "faabric/util/locks.h"
 #include <faabric/proto/faabric.pb.h>
 #include <faabric/redis/Redis.h>
 #include <faabric/scheduler/ExecutorFactory.h>
@@ -9,11 +8,13 @@
 #include <faabric/transport/PointToPointBroker.h>
 #include <faabric/util/environment.h>
 #include <faabric/util/func.h>
+#include <faabric/util/locks.h>
 #include <faabric/util/logging.h>
 #include <faabric/util/memory.h>
 #include <faabric/util/random.h>
 #include <faabric/util/scheduling.h>
 #include <faabric/util/snapshot.h>
+#include <faabric/util/string_tools.h>
 #include <faabric/util/testing.h>
 #include <faabric/util/timing.h>
 
@@ -307,7 +308,6 @@ faabric::util::SchedulingDecision Scheduler::callFunctions(
 
                 // Register the host if it's exected a function
                 if (nOnThisHost > 0) {
-                    SPDLOG_DEBUG("Registering {} for {}", h, funcStr);
                     registeredHosts[funcStr].insert(h);
                 }
 
@@ -377,8 +377,7 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions(
             uniqueHosts.erase(thisHost);
         }
 
-        std::vector<std::string> orderedHosts(uniqueHosts.begin(),
-                                              uniqueHosts.end());
+        orderedHosts = std::vector(uniqueHosts.begin(), uniqueHosts.end());
 
         if (hasFunctionsOnThisHost) {
             orderedHosts.push_back(thisHost);

From ce8a8708b680e1a1c09f185a12922b5ac6ce86f2 Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Mon, 8 Nov 2021 17:31:38 +0000
Subject: [PATCH 20/31] Sort out reusing ptp messages

---
 src/scheduler/Executor.cpp  |  8 ++++----
 src/scheduler/Scheduler.cpp | 36 +++++++++++++++++++++---------------
 2 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp
index 5fb9c2d31..86c30c4ac 100644
--- a/src/scheduler/Executor.cpp
+++ b/src/scheduler/Executor.cpp
@@ -288,12 +288,12 @@ void Executor::threadPoolThread(int threadPoolIdx)
         // In all threads if we've finished with a ptp group, we need to tidy up
         // to ensure all sockets can be destructed
         if (msg.groupid() > 0) {
-            broker.resetThreadLocalCache(false);
+            // broker.resetThreadLocalCache(false);
 
             // Remove the group from this host if last in batch
-            if (isLastInBatch) {
-                broker.clearGroup(msg.groupid());
-            }
+            // if (isLastInBatch) {
+            //     broker.clearGroup(msg.groupid());
+            // }
         }
 
         // If this batch is finished, reset the executor and release its claim.
diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp
index e1f1942ce..194d1180f 100644
--- a/src/scheduler/Scheduler.cpp
+++ b/src/scheduler/Scheduler.cpp
@@ -342,6 +342,17 @@ faabric::util::SchedulingDecision Scheduler::callFunctions(
         decision.addMessage(hosts.at(i), req->messages().at(i));
     }
 
+    // -------------------------------------------
+    // POINT-TO-POINT MESSAGING
+    // -------------------------------------------
+
+    // Send out point-to-point mappings if necessary (unless being forced to
+    // execute locally, in which case they will be transmitted from the
+    // master)
+    if (!forceLocal && (firstMsg.groupid() > 0)) {
+        broker.setAndSendMappingsFromSchedulingDecision(decision);
+    }
+
     // Pass decision as hint
     return doCallFunctions(req, decision, lock);
 }
@@ -363,6 +374,15 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions(
     std::string funcStr = faabric::util::funcToString(firstMsg, false);
     int nMessages = req->messages_size();
 
+    if (decision.hosts.size() != nMessages) {
+        SPDLOG_ERROR(
+          "Passed decision for {} with {} messages, but request has {}",
+          funcStr,
+          decision.hosts.size(),
+          nMessages);
+        throw std::runtime_error("Invalid scheduler hint for messages");
+    }
+
     // NOTE: we want to schedule things on this host _last_, otherwise functions
     // may start executing before all messages have been dispatched, thus
     // slowing the remaining scheduling.
@@ -396,19 +416,6 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions(
         }
     }
 
-    // -------------------------------------------
-    // POINT-TO-POINT MESSAGING
-    // -------------------------------------------
-
-    // Send out point-to-point mappings if necessary (unless being forced to
-    // execute locally, in which case they will be transmitted from the
-    // master)
-    bool isOnlyThisHost =
-      orderedHosts.size() == 1 && orderedHosts.front() == thisHost;
-    if (!isOnlyThisHost && (firstMsg.groupid() > 0)) {
-        broker.setAndSendMappingsFromSchedulingDecision(decision);
-    }
-
     // -------------------------------------------
     // SNAPSHOTS
     // -------------------------------------------
@@ -446,7 +453,7 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions(
     // EXECTUION
     // -------------------------------------------
 
-    // Iterate through unique hosts
+    // Iterate through unique hosts and dispatch messages
     for (const std::string& host : orderedHosts) {
         // Work out which indexes are scheduled on this host
         std::vector<int> thisHostIdxs;
@@ -544,7 +551,6 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions(
                 auto* newMsg = hostRequest->add_messages();
                 *newMsg = req->messages().at(msgIdx);
                 newMsg->set_executeslocally(false);
-                decision.addMessage(host, req->messages().at(msgIdx));
             }
 
             // Dispatch the calls

From 41118f9a95dfa59cdd941651d581fe185df3c8f7 Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Mon, 8 Nov 2021 17:59:07 +0000
Subject: [PATCH 21/31] Naming

---
 include/faabric/transport/PointToPointBroker.h |  2 +-
 include/faabric/transport/PointToPointServer.h |  2 +-
 src/scheduler/Executor.cpp                     | 11 -----------
 src/transport/PointToPointBroker.cpp           | 12 ++++--------
 src/transport/PointToPointServer.cpp           |  9 +++++----
 5 files changed, 11 insertions(+), 25 deletions(-)

diff --git a/include/faabric/transport/PointToPointBroker.h b/include/faabric/transport/PointToPointBroker.h
index 459d63543..a596a1fb5 100644
--- a/include/faabric/transport/PointToPointBroker.h
+++ b/include/faabric/transport/PointToPointBroker.h
@@ -115,7 +115,7 @@ class PointToPointBroker
 
     void clear();
 
-    void resetThreadLocalCache(bool hard = true);
+    void resetThreadLocalCache();
 
   private:
     faabric::util::SystemConfig& conf;
diff --git a/include/faabric/transport/PointToPointServer.h b/include/faabric/transport/PointToPointServer.h
index 29a15e77c..d62fe6b76 100644
--- a/include/faabric/transport/PointToPointServer.h
+++ b/include/faabric/transport/PointToPointServer.h
@@ -11,7 +11,7 @@ class PointToPointServer final : public MessageEndpointServer
     PointToPointServer();
 
   private:
-    PointToPointBroker& reg;
+    PointToPointBroker& broker;
 
     void doAsyncRecv(int header,
                      const uint8_t* buffer,
diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp
index 86c30c4ac..9c504b049 100644
--- a/src/scheduler/Executor.cpp
+++ b/src/scheduler/Executor.cpp
@@ -285,17 +285,6 @@ void Executor::threadPoolThread(int threadPoolIdx)
             faabric::util::resetDirtyTracking();
         }
 
-        // In all threads if we've finished with a ptp group, we need to tidy up
-        // to ensure all sockets can be destructed
-        if (msg.groupid() > 0) {
-            // broker.resetThreadLocalCache(false);
-
-            // Remove the group from this host if last in batch
-            // if (isLastInBatch) {
-            //     broker.clearGroup(msg.groupid());
-            // }
-        }
-
         // If this batch is finished, reset the executor and release its claim.
         // Note that we have to release the claim _after_ resetting, otherwise
         // the executor won't be ready for reuse.
diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp
index eecf6ed51..448b94429 100644
--- a/src/transport/PointToPointBroker.cpp
+++ b/src/transport/PointToPointBroker.cpp
@@ -540,7 +540,7 @@ void PointToPointBroker::clearGroup(int groupId)
 {
     SPDLOG_TRACE("Clearing point-to-point group {}", groupId);
 
-    faabric::util::SharedLock lock(brokerMutex);
+    faabric::util::FullLock lock(brokerMutex);
 
     std::set<int> idxs = getIdxsRegisteredForGroup(groupId);
     for (auto idxA : idxs) {
@@ -559,7 +559,7 @@ void PointToPointBroker::clearGroup(int groupId)
 
 void PointToPointBroker::clear()
 {
-    faabric::util::SharedLock lock(brokerMutex);
+    faabric::util::FullLock lock(brokerMutex);
 
     groupIdIdxsMap.clear();
     mappings.clear();
@@ -569,16 +569,12 @@ void PointToPointBroker::clear()
     groupFlags.clear();
 }
 
-void PointToPointBroker::resetThreadLocalCache(bool hard)
+void PointToPointBroker::resetThreadLocalCache()
 {
     SPDLOG_TRACE("Resetting point-to-point thread-local cache");
-
     sendEndpoints.clear();
     recvEndpoints.clear();
-
-    if (hard) {
-        clients.clear();
-    }
+    clients.clear();
 }
 
 PointToPointBroker& getPointToPointBroker()
diff --git a/src/transport/PointToPointServer.cpp b/src/transport/PointToPointServer.cpp
index 7791f0b0f..e2dd6fa80 100644
--- a/src/transport/PointToPointServer.cpp
+++ b/src/transport/PointToPointServer.cpp
@@ -16,7 +16,7 @@ PointToPointServer::PointToPointServer()
       POINT_TO_POINT_SYNC_PORT,
       POINT_TO_POINT_INPROC_LABEL,
       faabric::util::getSystemConfig().pointToPointServerThreads)
-  , reg(getPointToPointBroker())
+  , broker(getPointToPointBroker())
 {}
 
 void PointToPointServer::doAsyncRecv(int header,
@@ -28,7 +28,7 @@ void PointToPointServer::doAsyncRecv(int header,
             PARSE_MSG(faabric::PointToPointMessage, buffer, bufferSize)
 
             // Send the message locally to the downstream socket
-            reg.sendMessage(msg.groupid(),
+            broker.sendMessage(msg.groupid(),
                             msg.sendidx(),
                             msg.recvidx(),
                             BYTES_CONST(msg.data().c_str()),
@@ -85,7 +85,7 @@ std::unique_ptr<google::protobuf::Message> PointToPointServer::doRecvMappings(
 
     SPDLOG_DEBUG("Receiving {} point-to-point mappings", decision.nFunctions);
 
-    reg.setUpLocalMappingsFromSchedulingDecision(decision);
+    broker.setUpLocalMappingsFromSchedulingDecision(decision);
 
     return std::make_unique<faabric::EmptyResponse>();
 }
@@ -121,6 +121,7 @@ void PointToPointServer::recvGroupUnlock(const uint8_t* buffer,
 void PointToPointServer::onWorkerStop()
 {
     // Clear any thread-local cached sockets
-    reg.resetThreadLocalCache();
+    broker.resetThreadLocalCache();
+    broker.clear();
 }
 }

From 52b3877b9f01331ba8e39e43f5e148877ec05eeb Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Tue, 9 Nov 2021 10:26:37 +0000
Subject: [PATCH 22/31] Fixing up tests

---
 include/faabric/util/snapshot.h               |   6 -
 src/scheduler/Executor.cpp                    |   4 +-
 src/scheduler/Scheduler.cpp                   |   4 +
 src/util/snapshot.cpp                         |   4 +-
 tests/test/scheduler/test_executor.cpp        |  42 ++++--
 tests/test/scheduler/test_scheduler.cpp       |  34 +++--
 .../snapshot/test_snapshot_client_server.cpp  | 121 ++++++++++--------
 tests/test/util/test_snapshot.cpp             |  48 +++++--
 8 files changed, 168 insertions(+), 95 deletions(-)

diff --git a/include/faabric/util/snapshot.h b/include/faabric/util/snapshot.h
index a005dbae3..6cef3d21b 100644
--- a/include/faabric/util/snapshot.h
+++ b/include/faabric/util/snapshot.h
@@ -92,12 +92,6 @@ class SnapshotData
     // Note - we care about the order of this map, as we iterate through it
     // in order of offsets
     std::map<uint32_t, SnapshotMergeRegion> mergeRegions;
-
-    std::vector<SnapshotDiff> getCustomDiffs(const uint8_t* updated,
-                                             size_t updatedSize);
-
-    std::vector<SnapshotDiff> getStandardDiffs(const uint8_t* updated,
-                                               size_t updatedSize);
 };
 
 std::string snapshotDataTypeStr(SnapshotDataType dt);
diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp
index 9c504b049..2069b071e 100644
--- a/src/scheduler/Executor.cpp
+++ b/src/scheduler/Executor.cpp
@@ -135,7 +135,7 @@ void Executor::executeTasks(std::vector<int> msgIdxs,
 
     // Set up shared counter for this batch of tasks
     auto batchCounter =
-      std::make_shared<std::atomic<int>>(req->messages_size());
+      std::make_shared<std::atomic<int>>(msgIdxs.size());
 
     // Work out if we should skip the reset after this batch. This only needs to
     // happen when we're executing threads on the master host, in which case the
@@ -144,7 +144,7 @@ void Executor::executeTasks(std::vector<int> msgIdxs,
 
     // Iterate through and invoke tasks. By default, we allocate tasks
     // one-to-one with thread pool threads. Only once the pool is exhausted do
-    // we start overallocating.
+    // we start overloading
     for (int msgIdx : msgIdxs) {
         const faabric::Message& msg = req->messages().at(msgIdx);
 
diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp
index 194d1180f..53d9114d2 100644
--- a/src/scheduler/Scheduler.cpp
+++ b/src/scheduler/Scheduler.cpp
@@ -515,6 +515,7 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions(
                 // Non-threads require one executor per task
                 for (auto i : thisHostIdxs) {
                     faabric::Message& localMsg = req->mutable_messages()->at(i);
+
                     if (localMsg.executeslocally()) {
                         faabric::util::UniqueLock resultsLock(
                           localResultsMutex);
@@ -523,6 +524,7 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions(
                             std::promise<
                               std::unique_ptr<faabric::Message>>() });
                     }
+
                     std::shared_ptr<Executor> e = claimExecutor(localMsg, lock);
                     e->executeTasks({ i }, req);
                 }
@@ -762,10 +764,12 @@ void Scheduler::setFunctionResult(faabric::Message& msg)
 
     if (msg.executeslocally()) {
         faabric::util::UniqueLock resultsLock(localResultsMutex);
+
         auto it = localResults.find(msg.id());
         if (it != localResults.end()) {
             it->second.set_value(std::make_unique<faabric::Message>(msg));
         }
+
         // Sync messages can't have their results read twice, so skip
         // redis
         if (!msg.isasync()) {
diff --git a/src/util/snapshot.cpp b/src/util/snapshot.cpp
index 3b84796a9..772870be4 100644
--- a/src/util/snapshot.cpp
+++ b/src/util/snapshot.cpp
@@ -92,9 +92,9 @@ std::vector<SnapshotDiff> SnapshotData::getChangeDiffs(const uint8_t* updated,
         // For each merge region that overlaps this dirty page, get it to add
         // its diffs, and move onto the next one
         // TODO - make this more efficient by passing in dirty pages to merge
-        // regions
+        // regions so that they avoid unnecessary work if they're large.
         while (mergeIt != mergeRegions.end() &&
-               (mergeIt->second.offset > pageStart &&
+               (mergeIt->second.offset >= pageStart &&
                 mergeIt->second.offset < pageEnd)) {
 
             SPDLOG_TRACE("Overlap with {} {} merge region at {}-{}",
diff --git a/tests/test/scheduler/test_executor.cpp b/tests/test/scheduler/test_executor.cpp
index 4dbf2983b..918b7d4ea 100644
--- a/tests/test/scheduler/test_executor.cpp
+++ b/tests/test/scheduler/test_executor.cpp
@@ -19,6 +19,7 @@
 #include <faabric/util/testing.h>
 
 using namespace faabric::scheduler;
+using namespace faabric::util;
 
 namespace tests {
 
@@ -203,13 +204,30 @@ class TestExecutor final : public Executor
         if (msg.function() == "snap-check") {
             // Modify a page of the dummy memory
             uint8_t pageIdx = threadPoolIdx;
-            SPDLOG_DEBUG("TestExecutor modifying page {} of memory", pageIdx);
-            uint8_t* offsetPtr =
-              dummyMemory + (pageIdx * faabric::util::HOST_PAGE_SIZE);
 
-            std::vector<uint8_t> data = { pageIdx,
-                                          (uint8_t)(pageIdx + 1),
-                                          (uint8_t)(pageIdx + 2) };
+            faabric::util::SnapshotData& snapData =
+              faabric::snapshot::getSnapshotRegistry().getSnapshot(
+                msg.snapshotkey());
+
+            // Avoid writing a zero here as the memory is already zeroed hence
+            // it's not a change
+            std::vector<uint8_t> data = { (uint8_t)(pageIdx + 1),
+                                          (uint8_t)(pageIdx + 2),
+                                          (uint8_t)(pageIdx + 3) };
+
+            // Set up a merge region that should catch the diff
+            size_t offset = (pageIdx * faabric::util::HOST_PAGE_SIZE);
+            snapData.addMergeRegion(offset,
+                                    data.size() + 10,
+                                    SnapshotDataType::Raw,
+                                    SnapshotMergeOperation::Overwrite);
+
+            SPDLOG_DEBUG("TestExecutor modifying page {} of memory ({}-{})",
+                         pageIdx,
+                         offset,
+                         offset + data.size());
+
+            uint8_t* offsetPtr = dummyMemory + offset;
             std::memcpy(offsetPtr, data.data(), data.size());
         }
 
@@ -718,7 +736,7 @@ TEST_CASE_METHOD(TestExecutorFixture,
     faabric::util::setMockMode(true);
     std::string otherHost = "other";
 
-    // Set up a load of messages executing with a different master host
+    // Set up some messages executing with a different master host
     std::vector<uint32_t> messageIds;
     for (int i = 0; i < nThreads; i++) {
         faabric::Message& msg = req->mutable_messages()->at(i);
@@ -750,13 +768,11 @@ TEST_CASE_METHOD(TestExecutorFixture,
     for (int i = 0; i < diffList.size(); i++) {
         // Check offset and data (according to logic defined in the dummy
         // executor)
-        uint8_t pageIndex = i + 1;
-        REQUIRE(diffList.at(i).offset ==
-                pageIndex * faabric::util::HOST_PAGE_SIZE);
+        REQUIRE(diffList.at(i).offset == i * faabric::util::HOST_PAGE_SIZE);
 
-        std::vector<uint8_t> expected = { pageIndex,
-                                          (uint8_t)(pageIndex + 1),
-                                          (uint8_t)(pageIndex + 2) };
+        std::vector<uint8_t> expected = { (uint8_t)(i + 1),
+                                          (uint8_t)(i + 2),
+                                          (uint8_t)(i + 3) };
 
         std::vector<uint8_t> actual(diffList.at(i).data,
                                     diffList.at(i).data + 3);
diff --git a/tests/test/scheduler/test_scheduler.cpp b/tests/test/scheduler/test_scheduler.cpp
index edf9f83b8..9f27c5c0e 100644
--- a/tests/test/scheduler/test_scheduler.cpp
+++ b/tests/test/scheduler/test_scheduler.cpp
@@ -811,28 +811,36 @@ TEST_CASE_METHOD(SlowExecutorFixture,
 
 TEST_CASE_METHOD(DummyExecutorFixture, "Test executor reuse", "[scheduler]")
 {
-    faabric::Message msgA = faabric::util::messageFactory("foo", "bar");
-    faabric::Message msgB = faabric::util::messageFactory("foo", "bar");
-    faabric::Message msgC = faabric::util::messageFactory("foo", "bar");
-    faabric::Message msgD = faabric::util::messageFactory("foo", "bar");
+    std::shared_ptr<faabric::BatchExecuteRequest> reqA =
+      faabric::util::batchExecFactory("foo", "bar", 2);
+    std::shared_ptr<faabric::BatchExecuteRequest> reqB =
+      faabric::util::batchExecFactory("foo", "bar", 2);
+
+    faabric::Message& msgA = reqA->mutable_messages()->at(0);
+    faabric::Message& msgB = reqB->mutable_messages()->at(0);
 
     // Execute a couple of functions
-    sch.callFunction(msgA);
-    sch.callFunction(msgB);
-    sch.getFunctionResult(msgA.id(), SHORT_TEST_TIMEOUT_MS);
-    sch.getFunctionResult(msgB.id(), SHORT_TEST_TIMEOUT_MS);
+    sch.callFunctions(reqA);
+    for (const auto& m : reqA->messages()) {
+        faabric::Message res =
+          sch.getFunctionResult(m.id(), SHORT_TEST_TIMEOUT_MS);
+        REQUIRE(res.returnvalue() == 0);
+    }
 
     // Check executor count
     REQUIRE(sch.getFunctionExecutorCount(msgA) == 2);
 
-    // Submit a couple more functions
-    sch.callFunction(msgC);
-    sch.callFunction(msgD);
-    sch.getFunctionResult(msgC.id(), SHORT_TEST_TIMEOUT_MS);
-    sch.getFunctionResult(msgD.id(), SHORT_TEST_TIMEOUT_MS);
+    // Execute a couple more functions
+    sch.callFunctions(reqB);
+    for (const auto& m : reqB->messages()) {
+        faabric::Message res =
+          sch.getFunctionResult(m.id(), SHORT_TEST_TIMEOUT_MS);
+        REQUIRE(res.returnvalue() == 0);
+    }
 
     // Check executor count is still the same
     REQUIRE(sch.getFunctionExecutorCount(msgA) == 2);
+    REQUIRE(sch.getFunctionExecutorCount(msgB) == 2);
 }
 
 TEST_CASE_METHOD(DummyExecutorFixture,
diff --git a/tests/test/snapshot/test_snapshot_client_server.cpp b/tests/test/snapshot/test_snapshot_client_server.cpp
index fa86b703e..bd0764f6e 100644
--- a/tests/test/snapshot/test_snapshot_client_server.cpp
+++ b/tests/test/snapshot/test_snapshot_client_server.cpp
@@ -17,6 +17,8 @@
 #include <faabric/util/snapshot.h>
 #include <faabric/util/testing.h>
 
+using namespace faabric::util;
+
 namespace tests {
 
 class SnapshotClientServerFixture
@@ -40,8 +42,8 @@ class SnapshotClientServerFixture
 
     void setUpFunctionGroup(int appId, int groupId)
     {
-        faabric::util::SchedulingDecision decision(appId, groupId);
-        faabric::Message msg = faabric::util::messageFactory("foo", "bar");
+        SchedulingDecision decision(appId, groupId);
+        faabric::Message msg = messageFactory("foo", "bar");
         msg.set_appid(appId);
         msg.set_groupid(groupId);
 
@@ -71,8 +73,8 @@ TEST_CASE_METHOD(SnapshotClientServerFixture,
     // Prepare some snapshot data
     std::string snapKeyA = "foo";
     std::string snapKeyB = "bar";
-    faabric::util::SnapshotData snapA;
-    faabric::util::SnapshotData snapB;
+    SnapshotData snapA;
+    SnapshotData snapB;
     size_t snapSizeA = 1024;
     size_t snapSizeB = 500;
     snapA.size = snapSizeA;
@@ -97,8 +99,8 @@ TEST_CASE_METHOD(SnapshotClientServerFixture,
 
     // Check snapshots created in registry
     REQUIRE(reg.getSnapshotCount() == 2);
-    const faabric::util::SnapshotData& actualA = reg.getSnapshot(snapKeyA);
-    const faabric::util::SnapshotData& actualB = reg.getSnapshot(snapKeyB);
+    const SnapshotData& actualA = reg.getSnapshot(snapKeyA);
+    const SnapshotData& actualB = reg.getSnapshot(snapKeyB);
 
     REQUIRE(actualA.size == snapA.size);
     REQUIRE(actualB.size == snapB.size);
@@ -110,8 +112,7 @@ TEST_CASE_METHOD(SnapshotClientServerFixture,
     REQUIRE(actualDataB == dataB);
 }
 
-void checkDiffsApplied(const uint8_t* snapBase,
-                       std::vector<faabric::util::SnapshotDiff> diffs)
+void checkDiffsApplied(const uint8_t* snapBase, std::vector<SnapshotDiff> diffs)
 {
     for (const auto& d : diffs) {
         std::vector<uint8_t> actual(snapBase + d.offset,
@@ -127,7 +128,7 @@ TEST_CASE_METHOD(SnapshotClientServerFixture,
                  "Test push snapshot diffs",
                  "[snapshot]")
 {
-    std::string thisHost = faabric::util::getSystemConfig().endpointHost;
+    std::string thisHost = getSystemConfig().endpointHost;
 
     // One request with no group, another with a group we must initialise
     int appId = 111;
@@ -137,25 +138,37 @@ TEST_CASE_METHOD(SnapshotClientServerFixture,
     setUpFunctionGroup(appId, groupIdB);
 
     // Set up a snapshot
-    std::string snapKey = std::to_string(faabric::util::generateGid());
-    faabric::util::SnapshotData snap = takeSnapshot(snapKey, 5, true);
+    std::string snapKey = std::to_string(generateGid());
+    SnapshotData snap = takeSnapshot(snapKey, 5, true);
 
     // Set up some diffs
     std::vector<uint8_t> diffDataA1 = { 0, 1, 2, 3 };
     std::vector<uint8_t> diffDataA2 = { 4, 5, 6 };
     std::vector<uint8_t> diffDataB = { 7, 7, 8, 8, 8 };
 
-    std::vector<faabric::util::SnapshotDiff> diffsA;
-    std::vector<faabric::util::SnapshotDiff> diffsB;
+    std::vector<SnapshotDiff> diffsA;
+    std::vector<SnapshotDiff> diffsB;
+
+    SnapshotDiff diffA1(SnapshotDataType::Raw,
+                        SnapshotMergeOperation::Overwrite,
+                        5,
+                        diffDataA1.data(),
+                        diffDataA1.size());
+
+    SnapshotDiff diffA2(SnapshotDataType::Raw,
+                        SnapshotMergeOperation::Overwrite,
+                        2 * HOST_PAGE_SIZE,
+                        diffDataA2.data(),
+                        diffDataA2.size());
 
-    faabric::util::SnapshotDiff diffA1(5, diffDataA1.data(), diffDataA1.size());
-    faabric::util::SnapshotDiff diffA2(
-      2 * faabric::util::HOST_PAGE_SIZE, diffDataA2.data(), diffDataA2.size());
     diffsA = { diffA1, diffA2 };
     cli.pushSnapshotDiffs(snapKey, groupIdA, diffsA);
 
-    faabric::util::SnapshotDiff diffB(
-      3 * faabric::util::HOST_PAGE_SIZE, diffDataB.data(), diffDataB.size());
+    SnapshotDiff diffB(SnapshotDataType::Raw,
+                       SnapshotMergeOperation::Overwrite,
+                       3 * HOST_PAGE_SIZE,
+                       diffDataB.data(),
+                       diffDataB.size());
     diffsB = { diffB };
     cli.pushSnapshotDiffs(snapKey, groupIdB, diffsB);
 
@@ -171,12 +184,12 @@ TEST_CASE_METHOD(SnapshotClientServerFixture,
                  "[snapshot]")
 {
     // Set up a snapshot
-    std::string snapKey = std::to_string(faabric::util::generateGid());
-    faabric::util::SnapshotData snap = takeSnapshot(snapKey, 5, false);
+    std::string snapKey = std::to_string(generateGid());
+    SnapshotData snap = takeSnapshot(snapKey, 5, false);
 
     // Set up a couple of ints in the snapshot
     int offsetA1 = 5;
-    int offsetA2 = 2 * faabric::util::HOST_PAGE_SIZE;
+    int offsetA2 = 2 * HOST_PAGE_SIZE;
     int baseA1 = 25;
     int baseA2 = 60;
 
@@ -189,22 +202,26 @@ TEST_CASE_METHOD(SnapshotClientServerFixture,
     int diffIntA1 = 123;
     int diffIntA2 = 345;
 
-    std::vector<uint8_t> intDataA1 =
-      faabric::util::valueToBytes<int>(diffIntA1);
-    std::vector<uint8_t> intDataA2 =
-      faabric::util::valueToBytes<int>(diffIntA2);
+    std::vector<uint8_t> intDataA1 = valueToBytes<int>(diffIntA1);
+    std::vector<uint8_t> intDataA2 = valueToBytes<int>(diffIntA2);
 
-    std::vector<faabric::util::SnapshotDiff> diffs;
+    std::vector<SnapshotDiff> diffs;
 
-    faabric::util::SnapshotDiff diffA1(
-      offsetA1, intDataA1.data(), intDataA1.size());
-    diffA1.operation = faabric::util::SnapshotMergeOperation::Sum;
-    diffA1.dataType = faabric::util::SnapshotDataType::Int;
+    SnapshotDiff diffA1(SnapshotDataType::Raw,
+                        SnapshotMergeOperation::Overwrite,
+                        offsetA1,
+                        intDataA1.data(),
+                        intDataA1.size());
+    diffA1.operation = SnapshotMergeOperation::Sum;
+    diffA1.dataType = SnapshotDataType::Int;
 
-    faabric::util::SnapshotDiff diffA2(
-      offsetA2, intDataA2.data(), intDataA2.size());
-    diffA2.operation = faabric::util::SnapshotMergeOperation::Sum;
-    diffA2.dataType = faabric::util::SnapshotDataType::Int;
+    SnapshotDiff diffA2(SnapshotDataType::Raw,
+                        SnapshotMergeOperation::Overwrite,
+                        offsetA2,
+                        intDataA2.data(),
+                        intDataA2.size());
+    diffA2.operation = SnapshotMergeOperation::Sum;
+    diffA2.dataType = SnapshotDataType::Int;
 
     diffs = { diffA1, diffA2 };
     cli.pushSnapshotDiffs(snapKey, 0, diffs);
@@ -221,22 +238,20 @@ TEST_CASE_METHOD(SnapshotClientServerFixture,
                  "[snapshot]")
 {
     // Set up a snapshot
-    std::string snapKey = std::to_string(faabric::util::generateGid());
-    faabric::util::SnapshotData snap = takeSnapshot(snapKey, 5, false);
+    std::string snapKey = std::to_string(generateGid());
+    SnapshotData snap = takeSnapshot(snapKey, 5, false);
 
     int offset = 5;
     std::vector<uint8_t> originalData;
     std::vector<uint8_t> diffData;
     std::vector<uint8_t> expectedData;
 
-    faabric::util::SnapshotMergeOperation operation =
-      faabric::util::SnapshotMergeOperation::Overwrite;
-    faabric::util::SnapshotDataType dataType =
-      faabric::util::SnapshotDataType::Raw;
+    SnapshotMergeOperation operation = SnapshotMergeOperation::Overwrite;
+    SnapshotDataType dataType = SnapshotDataType::Raw;
 
     SECTION("Integer")
     {
-        dataType = faabric::util::SnapshotDataType::Int;
+        dataType = SnapshotDataType::Int;
         int original = 0;
         int diff = 0;
         int expected = 0;
@@ -247,7 +262,7 @@ TEST_CASE_METHOD(SnapshotClientServerFixture,
             diff = 10;
             expected = 110;
 
-            operation = faabric::util::SnapshotMergeOperation::Sum;
+            operation = SnapshotMergeOperation::Sum;
         }
 
         SECTION("Subtract")
@@ -256,7 +271,7 @@ TEST_CASE_METHOD(SnapshotClientServerFixture,
             diff = 10;
             expected = 90;
 
-            operation = faabric::util::SnapshotMergeOperation::Subtract;
+            operation = SnapshotMergeOperation::Subtract;
         }
 
         SECTION("Product")
@@ -265,7 +280,7 @@ TEST_CASE_METHOD(SnapshotClientServerFixture,
             diff = 20;
             expected = 200;
 
-            operation = faabric::util::SnapshotMergeOperation::Product;
+            operation = SnapshotMergeOperation::Product;
         }
 
         SECTION("Min")
@@ -284,7 +299,7 @@ TEST_CASE_METHOD(SnapshotClientServerFixture,
                 expected = 10;
             }
 
-            operation = faabric::util::SnapshotMergeOperation::Min;
+            operation = SnapshotMergeOperation::Min;
         }
 
         SECTION("Max")
@@ -303,22 +318,26 @@ TEST_CASE_METHOD(SnapshotClientServerFixture,
                 expected = 20;
             }
 
-            operation = faabric::util::SnapshotMergeOperation::Max;
+            operation = SnapshotMergeOperation::Max;
         }
 
-        originalData = faabric::util::valueToBytes<int>(original);
-        diffData = faabric::util::valueToBytes<int>(diff);
-        expectedData = faabric::util::valueToBytes<int>(expected);
+        originalData = valueToBytes<int>(original);
+        diffData = valueToBytes<int>(diff);
+        expectedData = valueToBytes<int>(expected);
     }
 
     // Put original data in place
     std::memcpy(snap.data + offset, originalData.data(), originalData.size());
 
-    faabric::util::SnapshotDiff diff(offset, diffData.data(), diffData.size());
+    SnapshotDiff diff(SnapshotDataType::Raw,
+                      SnapshotMergeOperation::Overwrite,
+                      offset,
+                      diffData.data(),
+                      diffData.size());
     diff.operation = operation;
     diff.dataType = dataType;
 
-    std::vector<faabric::util::SnapshotDiff> diffs = { diff };
+    std::vector<SnapshotDiff> diffs = { diff };
     cli.pushSnapshotDiffs(snapKey, 0, diffs);
 
     // Check data is as expected
diff --git a/tests/test/util/test_snapshot.cpp b/tests/test/util/test_snapshot.cpp
index 2a3701ae0..cf182c66b 100644
--- a/tests/test/util/test_snapshot.cpp
+++ b/tests/test/util/test_snapshot.cpp
@@ -562,10 +562,26 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture, "Test cross-page ignores", "[util]")
     sharedMem[ignoreOffsetB + ignoreLengthB - 1] = (uint8_t)1;
 
     std::vector<SnapshotDiff> expectedDiffs = {
-        { offsetA, dataA.data(), dataA.size() },
-        { offsetB, dataB.data(), dataB.size() },
-        { offsetC, dataC.data(), dataC.size() },
-        { offsetD, dataD.data(), dataD.size() },
+        { SnapshotDataType::Raw,
+          SnapshotMergeOperation::Overwrite,
+          offsetA,
+          dataA.data(),
+          dataA.size() },
+        { SnapshotDataType::Raw,
+          SnapshotMergeOperation::Overwrite,
+          offsetB,
+          dataB.data(),
+          dataB.size() },
+        { SnapshotDataType::Raw,
+          SnapshotMergeOperation::Overwrite,
+          offsetC,
+          dataC.data(),
+          dataC.size() },
+        { SnapshotDataType::Raw,
+          SnapshotMergeOperation::Overwrite,
+          offsetD,
+          dataD.data(),
+          dataD.size() },
     };
 
     // Check number of diffs
@@ -601,10 +617,26 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture,
     std::memcpy(sharedMem + offsetD, dataD.data(), dataD.size());
 
     std::vector<SnapshotDiff> expectedDiffs = {
-        { offsetA, dataA.data(), dataA.size() },
-        { offsetB, dataB.data(), dataB.size() },
-        { offsetC, dataC.data(), dataC.size() },
-        { offsetD, dataD.data(), dataD.size() },
+        { SnapshotDataType::Raw,
+          SnapshotMergeOperation::Overwrite,
+          offsetA,
+          dataA.data(),
+          dataA.size() },
+        { SnapshotDataType::Raw,
+          SnapshotMergeOperation::Overwrite,
+          offsetB,
+          dataB.data(),
+          dataB.size() },
+        { SnapshotDataType::Raw,
+          SnapshotMergeOperation::Overwrite,
+          offsetC,
+          dataC.data(),
+          dataC.size() },
+        { SnapshotDataType::Raw,
+          SnapshotMergeOperation::Overwrite,
+          offsetD,
+          dataD.data(),
+          dataD.size() },
     };
 
     // Check number of diffs

From d472329ba94011edc8a1cd77815a8a1002e7ee5d Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Tue, 9 Nov 2021 12:08:16 +0000
Subject: [PATCH 23/31] Continuing test fixes

---
 src/scheduler/Executor.cpp                  |   3 +-
 src/util/snapshot.cpp                       |  70 +++++++----
 tests/test/snapshot/test_snapshot_diffs.cpp | 123 +++++++++++++++-----
 tests/test/util/test_snapshot.cpp           |  16 +--
 4 files changed, 153 insertions(+), 59 deletions(-)

diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp
index 2069b071e..120a505a7 100644
--- a/src/scheduler/Executor.cpp
+++ b/src/scheduler/Executor.cpp
@@ -134,8 +134,7 @@ void Executor::executeTasks(std::vector<int> msgIdxs,
     }
 
     // Set up shared counter for this batch of tasks
-    auto batchCounter =
-      std::make_shared<std::atomic<int>>(msgIdxs.size());
+    auto batchCounter = std::make_shared<std::atomic<int>>(msgIdxs.size());
 
     // Work out if we should skip the reset after this batch. This only needs to
     // happen when we're executing threads on the master host, in which case the
diff --git a/src/util/snapshot.cpp b/src/util/snapshot.cpp
index 772870be4..0d7028ad5 100644
--- a/src/util/snapshot.cpp
+++ b/src/util/snapshot.cpp
@@ -56,7 +56,7 @@ std::vector<SnapshotDiff> SnapshotData::getChangeDiffs(const uint8_t* updated,
     }
 
     // Work out which pages have changed (these will be sorted)
-    size_t nThisPages = getRequiredHostPages(size);
+    size_t nThisPages = getRequiredHostPages(updatedSize);
     std::vector<int> dirtyPageNumbers =
       getDirtyPageNumbers(updated, nThisPages);
 
@@ -97,13 +97,21 @@ std::vector<SnapshotDiff> SnapshotData::getChangeDiffs(const uint8_t* updated,
                (mergeIt->second.offset >= pageStart &&
                 mergeIt->second.offset < pageEnd)) {
 
-            SPDLOG_TRACE("Overlap with {} {} merge region at {}-{}",
-                         snapshotDataTypeStr(mergeIt->second.dataType),
-                         snapshotMergeOpStr(mergeIt->second.operation),
-                         mergeIt->second.offset,
-                         mergeIt->second.offset + mergeIt->second.length);
+            uint8_t* original = data;
+
+            // If we're outside the range of the original data, pass a nullptr
+            if (mergeIt->second.offset > size) {
+                SPDLOG_TRACE(
+                  "Checking {} {} merge region {}-{} outside original snapshot",
+                  snapshotDataTypeStr(mergeIt->second.dataType),
+                  snapshotMergeOpStr(mergeIt->second.operation),
+                  mergeIt->second.offset,
+                  mergeIt->second.offset + mergeIt->second.length);
+
+                original = nullptr;
+            }
 
-            mergeIt->second.addDiffs(diffs, data, updated);
+            mergeIt->second.addDiffs(diffs, original, updated);
             mergeIt++;
         }
     }
@@ -206,15 +214,26 @@ void SnapshotMergeRegion::addDiffs(std::vector<SnapshotDiff>& diffs,
         return;
     }
 
-    const uint8_t* updatedValue = updated + offset;
-    const uint8_t* originalValue = original + offset;
+    SPDLOG_TRACE("Checking for {} {} merge region at {}-{}",
+                 snapshotDataTypeStr(dataType),
+                 snapshotMergeOpStr(operation),
+                 offset,
+                 offset + length);
 
     switch (dataType) {
         case (SnapshotDataType::Int): {
-            // Get the original and update values
-            int originalInt = *(reinterpret_cast<const int*>(originalValue));
+            // Check if the value has changed
+            const uint8_t* updatedValue = updated + offset;
             int updatedInt = *(reinterpret_cast<const int*>(updatedValue));
 
+            if (original == nullptr) {
+                throw std::runtime_error(
+                  "Do not support int operations outside original snapshot");
+            }
+
+            const uint8_t* originalValue = original + offset;
+            int originalInt = *(reinterpret_cast<const int*>(originalValue));
+
             // Skip if no change
             if (originalInt == updatedInt) {
                 return;
@@ -277,9 +296,16 @@ void SnapshotMergeRegion::addDiffs(std::vector<SnapshotDiff>& diffs,
                     // have changed
                     bool diffInProgress = false;
                     int diffStart = 0;
-                    for (int b = offset; b < offset + length; b++) {
-                        bool isDirtyByte = *(original + b) != *(updated + b);
+                    for (int b = offset; b <= offset + length; b++) {
+                        bool isDirtyByte = false;
+
+                        if (original == nullptr) {
+                            isDirtyByte = true;
+                        } else {
+                            isDirtyByte = *(original + b) != *(updated + b);
+                        }
 
+                        SPDLOG_TRACE("BYTE {} dirty {}", b, isDirtyByte);
                         if (isDirtyByte && !diffInProgress) {
                             // Diff starts here if it's different and diff
                             // not in progress
@@ -292,8 +318,8 @@ void SnapshotMergeRegion::addDiffs(std::vector<SnapshotDiff>& diffs,
                             SPDLOG_TRACE("Adding {} {} diff at {}-{}",
                                          snapshotDataTypeStr(dataType),
                                          snapshotMergeOpStr(operation),
-                                         offset + diffStart,
-                                         offset + diffStart + diffLength);
+                                         diffStart,
+                                         diffStart + diffLength);
 
                             diffInProgress = false;
                             diffs.emplace_back(dataType,
@@ -307,17 +333,19 @@ void SnapshotMergeRegion::addDiffs(std::vector<SnapshotDiff>& diffs,
                     // If we've reached the end of this region with a diff
                     // in progress, we need to close it off
                     if (diffInProgress) {
-                        SPDLOG_TRACE("Adding {} {} diff at {}-{}",
-                                     snapshotDataTypeStr(dataType),
-                                     snapshotMergeOpStr(operation),
-                                     offset,
-                                     offset + length);
+                        int finalDiffLength = (offset + length) - diffStart + 1;
+                        SPDLOG_TRACE(
+                          "Adding {} {} diff at {}-{} (end of region)",
+                          snapshotDataTypeStr(dataType),
+                          snapshotMergeOpStr(operation),
+                          diffStart,
+                          diffStart + finalDiffLength);
 
                         diffs.emplace_back(dataType,
                                            operation,
                                            diffStart,
                                            updated + diffStart,
-                                           (length + 1) - diffStart);
+                                           finalDiffLength);
                     }
                     break;
                 }
diff --git a/tests/test/snapshot/test_snapshot_diffs.cpp b/tests/test/snapshot/test_snapshot_diffs.cpp
index 2269d8ba7..6b4a056e3 100644
--- a/tests/test/snapshot/test_snapshot_diffs.cpp
+++ b/tests/test/snapshot/test_snapshot_diffs.cpp
@@ -15,10 +15,39 @@ void checkSnapshotDiff(int offset,
                        SnapshotDiff& actual)
 {
     REQUIRE(offset == actual.offset);
+    REQUIRE(actual.size > 0);
+    REQUIRE(actual.data != nullptr);
+
     std::vector<uint8_t> actualData(actual.data, actual.data + actual.size);
     REQUIRE(data == actualData);
 }
 
+TEST_CASE_METHOD(SnapshotTestFixture,
+                 "Test no snapshot diffs if no merge regions",
+                 "[snapshot]")
+{
+    std::string snapKey = "foobar123";
+    int snapPages = 5;
+    SnapshotData snap = takeSnapshot(snapKey, snapPages, true);
+
+    int sharedMemPages = 8;
+    size_t sharedMemSize = sharedMemPages * HOST_PAGE_SIZE;
+    uint8_t* sharedMem = allocatePages(sharedMemPages);
+
+    reg.mapSnapshot(snapKey, sharedMem);
+
+    // Make various changes
+    sharedMem[0] = 1;
+    sharedMem[2 * HOST_PAGE_SIZE] = 1;
+    sharedMem[3 * HOST_PAGE_SIZE + 10] = 1;
+    sharedMem[8 * HOST_PAGE_SIZE - 20] = 1;
+
+    // Check there are no diffs
+    std::vector<SnapshotDiff> changeDiffs =
+      snap.getChangeDiffs(sharedMem, sharedMemSize);
+    REQUIRE(changeDiffs.empty());
+}
+
 TEST_CASE_METHOD(SnapshotTestFixture, "Test snapshot diffs", "[snapshot]")
 {
     std::string snapKey = "foobar123";
@@ -37,28 +66,69 @@ TEST_CASE_METHOD(SnapshotTestFixture, "Test snapshot diffs", "[snapshot]")
     // Reset dirty tracking
     faabric::util::resetDirtyTracking();
 
-    // Set up some chunks of data to write into the memory
+    // Single change, single merge region
     std::vector<uint8_t> dataA = { 1, 2, 3, 4 };
-    std::vector<uint8_t> dataB = { 4, 5, 6 };
-    std::vector<uint8_t> dataC = { 7, 6, 5, 4, 3 };
-    std::vector<uint8_t> dataD = { 1, 1, 1, 1 };
-
-    // Set up some offsets, both on and over page boundaries
     int offsetA = HOST_PAGE_SIZE;
-    int offsetB = HOST_PAGE_SIZE + 20;
-    int offsetC = 2 * HOST_PAGE_SIZE - 2;
-    int offsetD = 3 * HOST_PAGE_SIZE - dataD.size();
-
-    // Write the data
     std::memcpy(sharedMem + offsetA, dataA.data(), dataA.size());
-    std::memcpy(sharedMem + offsetB, dataB.data(), dataB.size());
+
+    snap.addMergeRegion(offsetA,
+                        dataA.size(),
+                        SnapshotDataType::Raw,
+                        SnapshotMergeOperation::Overwrite);
+
+    // NOTE - deliberately add merge regions out of order
+    // Diff starting in merge region and overlapping the end
+    std::vector<uint8_t> dataC = { 7, 6, 5, 4, 3, 2, 1 };
+    std::vector<uint8_t> expectedDataC = { 7, 6, 5, 4, 3 };
+    int offsetC = 2 * HOST_PAGE_SIZE;
     std::memcpy(sharedMem + offsetC, dataC.data(), dataC.size());
+
+    int regionOffsetC = offsetC - 3;
+    snap.addMergeRegion(regionOffsetC,
+                        dataC.size(),
+                        SnapshotDataType::Raw,
+                        SnapshotMergeOperation::Overwrite);
+
+    // Two changes in single merge region
+    std::vector<uint8_t> dataB1 = { 4, 5, 6 };
+    std::vector<uint8_t> dataB2 = { 7, 6, 5 };
+    int offsetB1 = HOST_PAGE_SIZE + 10;
+    int offsetB2 = HOST_PAGE_SIZE + 16;
+    std::memcpy(sharedMem + offsetB1, dataB1.data(), dataB1.size());
+    std::memcpy(sharedMem + offsetB2, dataB2.data(), dataB2.size());
+
+    snap.addMergeRegion(offsetB1,
+                        (offsetB2 - offsetB1) + dataB2.size() + 10,
+                        SnapshotDataType::Raw,
+                        SnapshotMergeOperation::Overwrite);
+
+    // Merge region within a change
+    std::vector<uint8_t> dataD = { 1, 1, 2, 2, 3, 3, 4 };
+    std::vector<uint8_t> expectedDataD = { 2, 2, 3, 3 };
+    int offsetD = 3 * HOST_PAGE_SIZE - dataD.size();
     std::memcpy(sharedMem + offsetD, dataD.data(), dataD.size());
 
-    // Write the data to the region that exceeds the size of the original
-    std::vector<uint8_t> dataExtra(
-      (sharedMemPages - snapPages) * HOST_PAGE_SIZE, 5);
-    std::memcpy(sharedMem + snapSize, dataExtra.data(), dataExtra.size());
+    int regionOffsetD = offsetD + 2;
+    int regionSizeD = dataD.size() - 4;
+    snap.addMergeRegion(regionOffsetD,
+                        regionSizeD,
+                        SnapshotDataType::Raw,
+                        SnapshotMergeOperation::Overwrite);
+
+    // Write some data to the region that exceeds the size of the original, then
+    // add a merge region larger than it. Anything outside the original snapshot
+    // should be marked as changed.
+    std::vector<uint8_t> dataExtra = { 2, 2, 2 };
+    std::vector<uint8_t> expectedDataExtra = { 0, 0, 2, 2, 2, 0, 0, 0 };
+    int extraOffset = snapSize + HOST_PAGE_SIZE + 10;
+    std::memcpy(sharedMem + extraOffset, dataExtra.data(), dataExtra.size());
+
+    int extraRegionOffset = extraOffset - 2;
+    int extraRegionSize = dataExtra.size() + 4;
+    snap.addMergeRegion(extraRegionOffset,
+                        extraRegionSize,
+                        SnapshotDataType::Raw,
+                        SnapshotMergeOperation::Overwrite);
 
     // Include an offset which doesn't change the data, but will register a
     // dirty page
@@ -73,25 +143,20 @@ TEST_CASE_METHOD(SnapshotTestFixture, "Test snapshot diffs", "[snapshot]")
     // Check shared memory does have dirty pages (including the non-change)
     std::vector<int> sharedDirtyPages =
       getDirtyPageNumbers(sharedMem, sharedMemPages);
-    std::vector<int> expected = { 1, 2, 3, 5, 6, 7 };
+    std::vector<int> expected = { 1, 2, 3, 6 };
     REQUIRE(sharedDirtyPages == expected);
 
-    // Check change diffs note that diffs across page boundaries will be split
-    // into two
+    // Check we have the right number of diffs
     std::vector<SnapshotDiff> changeDiffs =
       snap.getChangeDiffs(sharedMem, sharedMemSize);
-    REQUIRE(changeDiffs.size() == 6);
 
-    // One chunk will be split over 2 pages
-    std::vector<uint8_t> dataCPart1 = { 7, 6 };
-    std::vector<uint8_t> dataCPart2 = { 5, 4, 3 };
-    int offsetC2 = 2 * HOST_PAGE_SIZE;
+    REQUIRE(changeDiffs.size() == 6);
 
     checkSnapshotDiff(offsetA, dataA, changeDiffs.at(0));
-    checkSnapshotDiff(offsetB, dataB, changeDiffs.at(1));
-    checkSnapshotDiff(offsetC, dataCPart1, changeDiffs.at(2));
-    checkSnapshotDiff(offsetC2, dataCPart2, changeDiffs.at(3));
-    checkSnapshotDiff(offsetD, dataD, changeDiffs.at(4));
-    checkSnapshotDiff(snapSize, dataExtra, changeDiffs.at(5));
+    checkSnapshotDiff(offsetB1, dataB1, changeDiffs.at(1));
+    checkSnapshotDiff(offsetB2, dataB2, changeDiffs.at(2));
+    checkSnapshotDiff(offsetC, expectedDataC, changeDiffs.at(3));
+    checkSnapshotDiff(regionOffsetD, expectedDataD, changeDiffs.at(4));
+    checkSnapshotDiff(extraRegionOffset, expectedDataExtra, changeDiffs.at(5));
 }
 }
diff --git a/tests/test/util/test_snapshot.cpp b/tests/test/util/test_snapshot.cpp
index cf182c66b..b86724c6e 100644
--- a/tests/test/util/test_snapshot.cpp
+++ b/tests/test/util/test_snapshot.cpp
@@ -66,7 +66,7 @@ class SnapshotMergeTestFixture : public SnapshotTestFixture
 
 TEST_CASE_METHOD(SnapshotMergeTestFixture,
                  "Detailed test snapshot merge regions with ints",
-                 "[util]")
+                 "[snapshot][util]")
 {
     std::string snapKey = "foobar123";
     int snapPages = 5;
@@ -177,7 +177,7 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture,
 
 TEST_CASE_METHOD(SnapshotMergeTestFixture,
                  "Test edge-cases of snapshot merge regions",
-                 "[util]")
+                 "[snapshot][util]")
 {
     // Region edge cases:
     // - start
@@ -286,7 +286,7 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture,
 
 TEST_CASE_METHOD(SnapshotMergeTestFixture,
                  "Test snapshot merge regions",
-                 "[util]")
+                 "[snapshot][util]")
 {
     std::string snapKey = "foobar123";
     int snapPages = 5;
@@ -435,7 +435,7 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture,
 
 TEST_CASE_METHOD(SnapshotMergeTestFixture,
                  "Test invalid snapshot merges",
-                 "[util]")
+                 "[snapshot][util]")
 {
     std::string snapKey = "foobar123";
     int snapPages = 3;
@@ -500,7 +500,9 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture,
     deallocatePages(snap.data, snapPages);
 }
 
-TEST_CASE_METHOD(SnapshotMergeTestFixture, "Test cross-page ignores", "[util]")
+TEST_CASE_METHOD(SnapshotMergeTestFixture,
+                 "Test cross-page ignores",
+                 "[snapshot][util]")
 {
     int snapPages = 6;
     size_t sharedMemSize = snapPages * HOST_PAGE_SIZE;
@@ -593,7 +595,7 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture, "Test cross-page ignores", "[util]")
 
 TEST_CASE_METHOD(SnapshotMergeTestFixture,
                  "Test fine-grained byte-wise diffs",
-                 "[util]")
+                 "[snapshot][util]")
 {
     int snapPages = 3;
     size_t sharedMemSize = snapPages * HOST_PAGE_SIZE;
@@ -648,7 +650,7 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture,
 
 TEST_CASE_METHOD(SnapshotMergeTestFixture,
                  "Test mix of applicable and non-applicable merge regions",
-                 "[util]")
+                 "[snapshot][util]")
 {
     int snapPages = 6;
     size_t sharedMemSize = snapPages * HOST_PAGE_SIZE;

From e46fe940e13d5cac8835e96be4492cbd91311023 Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Tue, 9 Nov 2021 12:23:57 +0000
Subject: [PATCH 24/31] Remove ignore regions

---
 include/faabric/util/snapshot.h   |  1 -
 src/util/snapshot.cpp             |  8 ---
 tests/test/util/test_snapshot.cpp | 93 -------------------------------
 3 files changed, 102 deletions(-)

diff --git a/include/faabric/util/snapshot.h b/include/faabric/util/snapshot.h
index 6cef3d21b..6d0c2748e 100644
--- a/include/faabric/util/snapshot.h
+++ b/include/faabric/util/snapshot.h
@@ -20,7 +20,6 @@ enum SnapshotDataType
 enum SnapshotMergeOperation
 {
     Overwrite,
-    Ignore,
     Sum,
     Product,
     Subtract,
diff --git a/src/util/snapshot.cpp b/src/util/snapshot.cpp
index 0d7028ad5..8763a53b7 100644
--- a/src/util/snapshot.cpp
+++ b/src/util/snapshot.cpp
@@ -177,9 +177,6 @@ std::string snapshotDataTypeStr(SnapshotDataType dt)
 std::string snapshotMergeOpStr(SnapshotMergeOperation op)
 {
     switch (op) {
-        case (SnapshotMergeOperation::Ignore): {
-            return "Ignore";
-        }
         case (SnapshotMergeOperation::Max): {
             return "Max";
         }
@@ -209,11 +206,6 @@ void SnapshotMergeRegion::addDiffs(std::vector<SnapshotDiff>& diffs,
                                    const uint8_t* original,
                                    const uint8_t* updated)
 {
-    // Skip an ignore
-    if (operation == SnapshotMergeOperation::Ignore) {
-        return;
-    }
-
     SPDLOG_TRACE("Checking for {} {} merge region at {}-{}",
                  snapshotDataTypeStr(dataType),
                  snapshotMergeOpStr(operation),
diff --git a/tests/test/util/test_snapshot.cpp b/tests/test/util/test_snapshot.cpp
index b86724c6e..9b7224ab4 100644
--- a/tests/test/util/test_snapshot.cpp
+++ b/tests/test/util/test_snapshot.cpp
@@ -500,99 +500,6 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture,
     deallocatePages(snap.data, snapPages);
 }
 
-TEST_CASE_METHOD(SnapshotMergeTestFixture,
-                 "Test cross-page ignores",
-                 "[snapshot][util]")
-{
-    int snapPages = 6;
-    size_t sharedMemSize = snapPages * HOST_PAGE_SIZE;
-    uint8_t* sharedMem = setUpSnapshot(snapPages);
-
-    // Add ignore regions that cover multiple pages
-    int ignoreOffsetA = HOST_PAGE_SIZE + 100;
-    int ignoreLengthA = 2 * HOST_PAGE_SIZE;
-    snap.addMergeRegion(ignoreOffsetA,
-                        ignoreLengthA,
-                        faabric::util::SnapshotDataType::Raw,
-                        faabric::util::SnapshotMergeOperation::Ignore);
-
-    int ignoreOffsetB = sharedMemSize - (HOST_PAGE_SIZE + 10);
-    int ignoreLengthB = 30;
-    snap.addMergeRegion(ignoreOffsetB,
-                        ignoreLengthB,
-                        faabric::util::SnapshotDataType::Raw,
-                        faabric::util::SnapshotMergeOperation::Ignore);
-
-    // Add some modifications that will cause diffs, and some should be ignored
-    std::vector<uint8_t> dataA(10, 1);
-    std::vector<uint8_t> dataB(1, 1);
-    std::vector<uint8_t> dataC(1, 1);
-    std::vector<uint8_t> dataD(3, 1);
-
-    // Not ignored, start of memory
-    uint32_t offsetA = 0;
-    std::memcpy(sharedMem + offsetA, dataA.data(), dataA.size());
-
-    // Not ignored, just before first ignore region
-    uint32_t offsetB = ignoreOffsetA - 1;
-    std::memcpy(sharedMem + offsetB, dataB.data(), dataB.size());
-
-    // Not ignored, just after first ignore region
-    uint32_t offsetC = ignoreOffsetA + ignoreLengthA;
-    std::memcpy(sharedMem + offsetC, dataC.data(), dataC.size());
-
-    // Not ignored, just before second ignore region
-    uint32_t offsetD = ignoreOffsetB - 4;
-    std::memcpy(sharedMem + offsetD, dataD.data(), dataD.size());
-
-    // Ignored, start of first ignore region
-    sharedMem[ignoreOffsetA] = (uint8_t)1;
-
-    // Ignored, just before page boundary in ignore region
-    sharedMem[(2 * HOST_PAGE_SIZE) - 1] = (uint8_t)1;
-
-    // Ignored, just after page boundary in ignore region
-    sharedMem[(2 * HOST_PAGE_SIZE)] = (uint8_t)1;
-
-    // Deliberately don't put any changes after the next page boundary to check
-    // that it rolls over properly
-
-    // Ignored, just inside second region
-    sharedMem[ignoreOffsetB + 2] = (uint8_t)1;
-
-    // Ignored, end of second region
-    sharedMem[ignoreOffsetB + ignoreLengthB - 1] = (uint8_t)1;
-
-    std::vector<SnapshotDiff> expectedDiffs = {
-        { SnapshotDataType::Raw,
-          SnapshotMergeOperation::Overwrite,
-          offsetA,
-          dataA.data(),
-          dataA.size() },
-        { SnapshotDataType::Raw,
-          SnapshotMergeOperation::Overwrite,
-          offsetB,
-          dataB.data(),
-          dataB.size() },
-        { SnapshotDataType::Raw,
-          SnapshotMergeOperation::Overwrite,
-          offsetC,
-          dataC.data(),
-          dataC.size() },
-        { SnapshotDataType::Raw,
-          SnapshotMergeOperation::Overwrite,
-          offsetD,
-          dataD.data(),
-          dataD.size() },
-    };
-
-    // Check number of diffs
-    std::vector<SnapshotDiff> actualDiffs =
-      snap.getChangeDiffs(sharedMem, sharedMemSize);
-
-    checkDiffs(actualDiffs, expectedDiffs);
-}
-
 TEST_CASE_METHOD(SnapshotMergeTestFixture,
                  "Test fine-grained byte-wise diffs",
                  "[snapshot][util]")

From dd7b6b833daf7c930101479bd90981e3b709ab21 Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Tue, 9 Nov 2021 15:37:38 +0000
Subject: [PATCH 25/31] Add distributed locking test

---
 src/transport/PointToPointBroker.cpp         | 81 ++++++++++------
 tests/dist/transport/functions.cpp           | 98 ++++++++++++++++++++
 tests/dist/transport/test_coordination.cpp   | 34 +++++++
 tests/test/transport/test_point_to_point.cpp |  2 -
 tests/test/util/test_snapshot.cpp            | 91 ++++++++----------
 5 files changed, 227 insertions(+), 79 deletions(-)
 create mode 100644 tests/dist/transport/test_coordination.cpp

diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp
index 448b94429..cb4c4ec2a 100644
--- a/src/transport/PointToPointBroker.cpp
+++ b/src/transport/PointToPointBroker.cpp
@@ -134,24 +134,25 @@ PointToPointGroup::PointToPointGroup(int appIdIn,
 
 void PointToPointGroup::lock(int groupIdx, bool recursive)
 {
-    std::string host =
+    std::string masterHost =
       ptpBroker.getHostForReceiver(groupId, POINT_TO_POINT_MASTER_IDX);
+    std::string lockerHost = ptpBroker.getHostForReceiver(groupId, groupIdx);
 
-    if (host == conf.endpointHost) {
+    bool masterIsLocal = masterHost == conf.endpointHost;
+    bool lockerIsLocal = lockerHost == conf.endpointHost;
+
+    // If we're on the master, we need to try and acquire the lock, otherwise we
+    // send a remote request
+    if (masterIsLocal) {
         bool acquiredLock = false;
         {
             faabric::util::UniqueLock lock(mx);
-            if (recursive) {
-                bool isFree = recursiveLockOwners.empty();
 
-                bool lockOwnedByThisIdx =
-                  !isFree && (recursiveLockOwners.top() == groupIdx);
-
-                if (isFree || lockOwnedByThisIdx) {
-                    // Recursive and either free, or already locked by this idx
-                    recursiveLockOwners.push(groupIdx);
-                    acquiredLock = true;
-                }
+            if (recursive && (recursiveLockOwners.empty() ||
+                              recursiveLockOwners.top() == groupIdx)) {
+                // Recursive and either free, or already locked by this idx
+                recursiveLockOwners.push(groupIdx);
+                acquiredLock = true;
             } else if (lockOwnerIdx == NO_LOCK_OWNER_IDX) {
                 // Non-recursive and free
                 lockOwnerIdx = groupIdx;
@@ -159,25 +160,52 @@ void PointToPointGroup::lock(int groupIdx, bool recursive)
             }
         }
 
-        if (acquiredLock) {
-            SPDLOG_TRACE("Group idx {}, locked {} (recursive {})",
+        if (acquiredLock && lockerIsLocal) {
+            // Nothing to do now
+            SPDLOG_TRACE("Group idx {} ({}), locally locked {} (recursive {})",
+                         groupIdx,
+                         lockerHost,
+                         groupId,
+                         recursive);
+
+        } else if (acquiredLock) {
+            SPDLOG_TRACE("Group idx {} ({}), remotely locked {} (recursive {})",
                          groupIdx,
+                         lockerHost,
                          groupId,
                          recursive);
 
+            // Notify remote locker that they've acquired the lock
             notifyLocked(groupIdx);
         } else {
-            SPDLOG_TRACE(
-              "Group idx {}, lock {} already locked({} waiters, recursive {})",
-              groupIdx,
-              groupId,
-              lockWaiters.size(),
-              recursive);
-
+            // Need to wait to get the lock
             lockWaiters.push(groupIdx);
+
+            // Wait here if local, otherwise the remote end will pick up the
+            // message
+            if (lockerIsLocal) {
+                SPDLOG_TRACE(
+                  "Group idx {} ({}), locally awaiting lock {} (recursive {})",
+                  groupIdx,
+                  lockerHost,
+                  groupId,
+                  recursive);
+
+                ptpBroker.recvMessage(
+                  groupId, POINT_TO_POINT_MASTER_IDX, groupIdx);
+            } else {
+                // Notify remote locker that they've acquired the lock
+                SPDLOG_TRACE(
+                  "Group idx {} ({}), remotely awaiting lock {} (recursive {})",
+                  groupIdx,
+                  lockerHost,
+                  groupId,
+                  masterHost,
+                  recursive);
+            }
         }
     } else {
-        auto cli = getClient(host);
+        auto cli = getClient(masterHost);
         faabric::PointToPointMessage msg;
         msg.set_groupid(groupId);
         msg.set_sendidx(groupIdx);
@@ -187,13 +215,14 @@ void PointToPointGroup::lock(int groupIdx, bool recursive)
                      groupId,
                      groupIdx,
                      POINT_TO_POINT_MASTER_IDX,
-                     host);
+                     masterHost);
 
+        // Send the remote request and await the message saying it's been
+        // acquired
         cli->groupLock(appId, groupId, groupIdx, recursive);
-    }
 
-    // Await ptp response
-    ptpBroker.recvMessage(groupId, POINT_TO_POINT_MASTER_IDX, groupIdx);
+        ptpBroker.recvMessage(groupId, POINT_TO_POINT_MASTER_IDX, groupIdx);
+    }
 }
 
 void PointToPointGroup::localLock()
diff --git a/tests/dist/transport/functions.cpp b/tests/dist/transport/functions.cpp
index 559fc75cd..f145e7d14 100644
--- a/tests/dist/transport/functions.cpp
+++ b/tests/dist/transport/functions.cpp
@@ -1,6 +1,10 @@
 #include <catch2/catch.hpp>
 
 #include "DistTestExecutor.h"
+#include "faabric/scheduler/Scheduler.h"
+#include "faabric/util/func.h"
+#include "faabric/util/gids.h"
+#include "faabric/util/scheduling.h"
 #include "faabric_utils.h"
 #include "init.h"
 
@@ -9,6 +13,7 @@
 #include <faabric/util/bytes.h>
 #include <faabric/util/string_tools.h>
 
+using namespace faabric::transport;
 using namespace faabric::util;
 
 namespace tests {
@@ -57,6 +62,94 @@ int handlePointToPointFunction(
     return 0;
 }
 
+int handleDistributedLock(faabric::scheduler::Executor* exec,
+                          int threadPoolIdx,
+                          int msgIdx,
+                          std::shared_ptr<faabric::BatchExecuteRequest> req)
+{
+    // We need sufficient concurrency here to show up bugs every time
+    int nWorkers = 10;
+    int nLoops = 30;
+
+    std::string sharedStateKey = "dist-lock-test";
+
+    faabric::Message& msg = req->mutable_messages()->at(msgIdx);
+
+    faabric::state::State& state = state::getGlobalState();
+    std::shared_ptr<faabric::state::StateKeyValue> stateKv =
+      state.getKV(msg.user(), sharedStateKey, sizeof(int32_t));
+
+    if (msg.function() == "lock") {
+        int initialValue = 0;
+        int groupId = faabric::util::generateGid();
+
+        stateKv->set(BYTES(&initialValue));
+
+        std::shared_ptr<faabric::BatchExecuteRequest> nestedReq =
+          faabric::util::batchExecFactory("ptp", "lock-worker", nWorkers);
+        for (int i = 0; i < nWorkers; i++) {
+            faabric::Message& m = nestedReq->mutable_messages()->at(i);
+            m.set_groupid(groupId);
+            m.set_groupidx(i);
+        }
+
+        faabric::scheduler::Scheduler& sch = faabric::scheduler::getScheduler();
+        faabric::util::SchedulingDecision decision =
+          sch.callFunctions(nestedReq);
+
+        // Await results
+        bool success = true;
+        for (int msgId : decision.messageIds) {
+            faabric::Message res = sch.getFunctionResult(msgId, 30000);
+            if (res.returnvalue() != 0) {
+                success = false;
+            }
+        }
+
+        int finalValue = *(int*)stateKv->get();
+        int expectedValue = nWorkers * nLoops;
+        if (finalValue != expectedValue) {
+            SPDLOG_ERROR("Distributed lock test failed: {} != {}",
+                         finalValue,
+                         expectedValue);
+            success = false;
+        } else {
+            SPDLOG_ERROR("Distributed lock succeeded, result {}", finalValue);
+        }
+
+        return success ? 0 : 1;
+    }
+
+    // Here we want to do something that will mess up if the locking isn't
+    // working properly, so we perform incremental updates to a bit of shared
+    // state using a global lock in a tight loop.
+    std::shared_ptr<PointToPointGroup> group =
+      faabric::transport::PointToPointGroup::getGroup(msg.groupid());
+
+    for (int i = 0; i < nLoops; i++) {
+        // Get the lock
+        group->lock(msg.groupidx(), false);
+
+        // Pull the value
+        stateKv->pull();
+        int* originalValue = (int*)stateKv->get();
+
+        // Short sleep
+        int sleepTimeMs = std::rand() % 50;
+        SLEEP_MS(sleepTimeMs);
+
+        // Increment and push
+        int newValue = *originalValue + 1;
+        stateKv->set(BYTES(&newValue));
+        stateKv->pushFull();
+
+        // Unlock
+        group->unlock(msg.groupidx(), false);
+    }
+
+    return 0;
+}
+
 class DistributedCoordinationTestRunner
 {
   public:
@@ -322,6 +415,11 @@ void registerTransportTestFunctions()
     registerDistTestExecutorCallback(
       "ptp", "barrier-worker", handleDistributedBarrierWorker);
 
+    registerDistTestExecutorCallback("ptp", "lock", handleDistributedLock);
+
+    registerDistTestExecutorCallback(
+      "ptp", "lock-worker", handleDistributedLock);
+
     registerDistTestExecutorCallback("ptp", "notify", handleDistributedNotify);
 
     registerDistTestExecutorCallback(
diff --git a/tests/dist/transport/test_coordination.cpp b/tests/dist/transport/test_coordination.cpp
new file mode 100644
index 000000000..f25608883
--- /dev/null
+++ b/tests/dist/transport/test_coordination.cpp
@@ -0,0 +1,34 @@
+#include <catch2/catch.hpp>
+
+#include "faabric_utils.h"
+#include "fixtures.h"
+#include "init.h"
+
+#include <faabric/proto/faabric.pb.h>
+#include <faabric/scheduler/Scheduler.h>
+#include <faabric/util/config.h>
+#include <faabric/util/func.h>
+#include <faabric/util/logging.h>
+#include <faabric/util/scheduling.h>
+
+namespace tests {
+
+TEST_CASE_METHOD(DistTestsFixture, "Test distributed lock", "[ptp][transport]")
+{
+    // Set up this host's resources
+    int nLocalSlots = 5;
+    faabric::HostResources res;
+    res.set_slots(nLocalSlots);
+    sch.setThisHostResources(res);
+
+    // Set up the request
+    std::shared_ptr<faabric::BatchExecuteRequest> req =
+      faabric::util::batchExecFactory("ptp", "lock", 1);
+
+    sch.callFunctions(req);
+
+    faabric::Message& m = req->mutable_messages()->at(0);
+    faabric::Message result = sch.getFunctionResult(m.id(), 30000);
+    REQUIRE(result.returnvalue() == 0);
+}
+}
diff --git a/tests/test/transport/test_point_to_point.cpp b/tests/test/transport/test_point_to_point.cpp
index 01d58340d..2782cdfac 100644
--- a/tests/test/transport/test_point_to_point.cpp
+++ b/tests/test/transport/test_point_to_point.cpp
@@ -353,8 +353,6 @@ TEST_CASE_METHOD(PointToPointClientServerFixture,
         server.setRequestLatch();
         cli.groupLock(appId, groupId, groupIdx, recursive);
         server.awaitRequestLatch();
-
-        broker.recvMessage(groupId, POINT_TO_POINT_MASTER_IDX, groupIdx);
     }
 
     REQUIRE(group->getLockOwner(recursive) == groupIdx);
diff --git a/tests/test/util/test_snapshot.cpp b/tests/test/util/test_snapshot.cpp
index 9b7224ab4..2f5e06007 100644
--- a/tests/test/util/test_snapshot.cpp
+++ b/tests/test/util/test_snapshot.cpp
@@ -141,37 +141,28 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture,
     REQUIRE(*intBOriginal == originalValueB);
 
     // Check diffs themselves
-    REQUIRE(actualDiffs.size() == 3);
+    REQUIRE(actualDiffs.size() == 2);
 
     SnapshotDiff diffA = actualDiffs.at(0);
     SnapshotDiff diffB = actualDiffs.at(1);
-    SnapshotDiff diffOther = actualDiffs.at(2);
 
     REQUIRE(diffA.offset == intAOffset);
     REQUIRE(diffB.offset == intBOffset);
-    REQUIRE(diffOther.offset == otherOffset);
 
     REQUIRE(diffA.operation == SnapshotMergeOperation::Sum);
     REQUIRE(diffB.operation == SnapshotMergeOperation::Sum);
-    REQUIRE(diffOther.operation == SnapshotMergeOperation::Overwrite);
 
     REQUIRE(diffA.dataType == SnapshotDataType::Int);
     REQUIRE(diffB.dataType == SnapshotDataType::Int);
-    REQUIRE(diffOther.dataType == SnapshotDataType::Raw);
 
     REQUIRE(diffA.size == sizeof(int32_t));
     REQUIRE(diffB.size == sizeof(int32_t));
-    REQUIRE(diffOther.size == otherData.size());
 
     // Check that original values have been subtracted from final values for
     // sums
     REQUIRE(*(int*)diffA.data == sumValueA);
     REQUIRE(*(int*)diffB.data == sumValueB);
 
-    std::vector<uint8_t> actualOtherData(diffOther.data,
-                                         diffOther.data + diffOther.size);
-    REQUIRE(actualOtherData == otherData);
-
     deallocatePages(snap.data, snapPages);
 }
 
@@ -307,8 +298,6 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture,
       faabric::util::SnapshotMergeOperation::Overwrite;
     size_t dataLength = 0;
 
-    int expectedNumDiffs = 1;
-
     SECTION("Integer")
     {
         int originalValue = 0;
@@ -370,25 +359,16 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture,
 
     SECTION("Raw")
     {
-        dataLength = 2 * sizeof(int32_t);
+        dataLength = 100;
         originalData = std::vector<uint8_t>(dataLength, 3);
-        updatedData = originalData;
-        expectedData = originalData;
+        updatedData = std::vector<uint8_t>(dataLength, 4);
+        expectedData = updatedData;
 
         dataType = faabric::util::SnapshotDataType::Raw;
-        operation = faabric::util::SnapshotMergeOperation::Overwrite;
 
-        SECTION("Ignore")
+        SECTION("Overwrite")
         {
-            operation = faabric::util::SnapshotMergeOperation::Ignore;
-
-            // Scatter some modifications through the updated data, to make sure
-            // none are picked up
-            updatedData[0] = 1;
-            updatedData[sizeof(int32_t) - 2] = 1;
-            updatedData[sizeof(int32_t) + 10] = 1;
-
-            expectedNumDiffs = 0;
+            operation = faabric::util::SnapshotMergeOperation::Overwrite;
         }
     }
 
@@ -417,18 +397,15 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture,
     std::vector<SnapshotDiff> actualDiffs =
       snap.getChangeDiffs(sharedMem, sharedMemSize);
 
-    // Check number of diffs
-    REQUIRE(actualDiffs.size() == expectedNumDiffs);
+    // Check diff
+    REQUIRE(actualDiffs.size() == 1);
+    std::vector<SnapshotDiff> expectedDiffs = { { dataType,
+                                                  operation,
+                                                  offset,
+                                                  expectedData.data(),
+                                                  expectedData.size() } };
 
-    if (expectedNumDiffs == 1) {
-        std::vector<SnapshotDiff> expectedDiffs = { { dataType,
-                                                      operation,
-                                                      offset,
-                                                      expectedData.data(),
-                                                      expectedData.size() } };
-
-        checkDiffs(actualDiffs, expectedDiffs);
-    }
+    checkDiffs(actualDiffs, expectedDiffs);
 
     deallocatePages(snap.data, snapPages);
 }
@@ -548,6 +525,12 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture,
           dataD.size() },
     };
 
+    // Add a single merge region for all the changes
+    snap.addMergeRegion(0,
+                        offsetD + dataD.size() + 20,
+                        SnapshotDataType::Raw,
+                        SnapshotMergeOperation::Overwrite);
+
     // Check number of diffs
     std::vector<SnapshotDiff> actualDiffs =
       snap.getChangeDiffs(sharedMem, sharedMemSize);
@@ -564,32 +547,33 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture,
     uint8_t* sharedMem = setUpSnapshot(snapPages);
 
     // Add a couple of merge regions on each page, which should be skipped as
-    // they won't cover any changes
+    // they won't overlap any changes
     for (int i = 0; i < snapPages; i++) {
-        // Ignore
-        int iOff = i * HOST_PAGE_SIZE;
-        snap.addMergeRegion(iOff,
+        // Overwrite
+        int skippedOverwriteOffset = i * HOST_PAGE_SIZE;
+        snap.addMergeRegion(skippedOverwriteOffset,
                             10,
                             faabric::util::SnapshotDataType::Raw,
-                            faabric::util::SnapshotMergeOperation::Ignore);
+                            faabric::util::SnapshotMergeOperation::Overwrite);
 
         // Sum
-        int sOff = ((i + 1) * HOST_PAGE_SIZE) - (2 * sizeof(int32_t));
-        snap.addMergeRegion(sOff,
+        int skippedSumOffset =
+          ((i + 1) * HOST_PAGE_SIZE) - (2 * sizeof(int32_t));
+        snap.addMergeRegion(skippedSumOffset,
                             sizeof(int32_t),
                             faabric::util::SnapshotDataType::Int,
                             faabric::util::SnapshotMergeOperation::Sum);
     }
 
-    // Add an ignore region that should take effect, along with a corresponding
-    // change to be ignored
-    uint32_t ignoreA = (2 * HOST_PAGE_SIZE) + 2;
-    snap.addMergeRegion(ignoreA,
+    // Add an overwrite region that should take effect
+    uint32_t overwriteAOffset = (2 * HOST_PAGE_SIZE) + 20;
+    snap.addMergeRegion(overwriteAOffset,
                         20,
                         faabric::util::SnapshotDataType::Raw,
-                        faabric::util::SnapshotMergeOperation::Ignore);
-    std::vector<uint8_t> dataA(10, 1);
-    std::memcpy(sharedMem + ignoreA, dataA.data(), dataA.size());
+                        faabric::util::SnapshotMergeOperation::Overwrite);
+    std::vector<uint8_t> overwriteData(10, 1);
+    std::memcpy(
+      sharedMem + overwriteAOffset, overwriteData.data(), overwriteData.size());
 
     // Add a sum region and data that should also take effect
     uint32_t sumOffset = (4 * HOST_PAGE_SIZE) + 100;
@@ -605,6 +589,11 @@ TEST_CASE_METHOD(SnapshotMergeTestFixture,
 
     // Check diffs
     std::vector<SnapshotDiff> expectedDiffs = {
+        { faabric::util::SnapshotDataType::Raw,
+          faabric::util::SnapshotMergeOperation::Overwrite,
+          overwriteAOffset,
+          BYTES(overwriteData.data()),
+          overwriteData.size() },
         { faabric::util::SnapshotDataType::Int,
           faabric::util::SnapshotMergeOperation::Sum,
           sumOffset,

From ebc21eb427f2928f5c91d5d274635313cff670b4 Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Tue, 9 Nov 2021 15:42:44 +0000
Subject: [PATCH 26/31] Formatting

---
 src/transport/PointToPointServer.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transport/PointToPointServer.cpp b/src/transport/PointToPointServer.cpp
index e2dd6fa80..32db8cac7 100644
--- a/src/transport/PointToPointServer.cpp
+++ b/src/transport/PointToPointServer.cpp
@@ -29,10 +29,10 @@ void PointToPointServer::doAsyncRecv(int header,
 
             // Send the message locally to the downstream socket
             broker.sendMessage(msg.groupid(),
-                            msg.sendidx(),
-                            msg.recvidx(),
-                            BYTES_CONST(msg.data().c_str()),
-                            msg.data().size());
+                               msg.sendidx(),
+                               msg.recvidx(),
+                               BYTES_CONST(msg.data().c_str()),
+                               msg.data().size());
             break;
         }
         case faabric::transport::PointToPointCall::LOCK_GROUP: {

From 11473c824b0d0acd6e582d8b819325196e0b9135 Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Wed, 10 Nov 2021 08:06:25 +0000
Subject: [PATCH 27/31] Fix dist tests and remove unnecessary unit test

---
 src/scheduler/Executor.cpp             |  9 ++-
 src/util/snapshot.cpp                  |  6 ++
 tests/dist/scheduler/functions.cpp     | 65 ++++++++++++++-----
 tests/test/scheduler/test_executor.cpp | 88 --------------------------
 4 files changed, 60 insertions(+), 108 deletions(-)

diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp
index 120a505a7..a8a1d27fd 100644
--- a/src/scheduler/Executor.cpp
+++ b/src/scheduler/Executor.cpp
@@ -163,9 +163,9 @@ void Executor::executeTasks(std::vector<int> msgIdxs,
                 threadPoolIdx = msg.appidx() % threadPoolSize;
             }
 
-            SPDLOG_WARN("Overloaded app index {} to thread {}",
-                        msg.appidx(),
-                        threadPoolIdx);
+            SPDLOG_DEBUG("Overloaded app index {} to thread {}",
+                         msg.appidx(),
+                         threadPoolIdx);
         } else {
             // Take next from those that are available
             threadPoolIdx = *availablePoolThreads.begin();
@@ -274,6 +274,9 @@ void Executor::threadPoolThread(int threadPoolIdx)
               faabric::snapshot::getSnapshotRegistry().getSnapshot(
                 msg.snapshotkey());
 
+            SPDLOG_TRACE("Diffing pre and post execution snapshots for {}",
+                         msg.snapshotkey());
+
             std::vector<faabric::util::SnapshotDiff> diffs =
               snapshotPreExecution.getChangeDiffs(snapshotPostExecution.data,
                                                   snapshotPostExecution.size);
diff --git a/src/util/snapshot.cpp b/src/util/snapshot.cpp
index 8763a53b7..595061220 100644
--- a/src/util/snapshot.cpp
+++ b/src/util/snapshot.cpp
@@ -153,6 +153,12 @@ void SnapshotData::addMergeRegion(uint32_t offset,
           snapshotMergeOpStr(operation),
           region.offset,
           region.offset + length);
+    } else {
+        SPDLOG_DEBUG("Adding new {} {} merge region at {}-{}",
+                     snapshotDataTypeStr(dataType),
+                     snapshotMergeOpStr(operation),
+                     region.offset,
+                     region.offset + length);
     }
 
     mergeRegions[region.offset] = region;
diff --git a/tests/dist/scheduler/functions.cpp b/tests/dist/scheduler/functions.cpp
index 28436f7f4..a91ad0405 100644
--- a/tests/dist/scheduler/functions.cpp
+++ b/tests/dist/scheduler/functions.cpp
@@ -14,6 +14,7 @@
 #include <faabric/util/func.h>
 #include <faabric/util/logging.h>
 #include <faabric/util/memory.h>
+#include <faabric/util/snapshot.h>
 
 namespace tests {
 
@@ -60,20 +61,31 @@ int handleFakeDiffsFunction(faabric::scheduler::Executor* exec,
 {
     faabric::Message& msg = req->mutable_messages()->at(msgIdx);
 
-    faabric::util::SnapshotData snap = exec->snapshot();
-
     std::string msgInput = msg.inputdata();
     std::string snapshotKey = msg.snapshotkey();
 
-    // Modify the executor's memory
+    faabric::snapshot::SnapshotRegistry& reg =
+      faabric::snapshot::getSnapshotRegistry();
+
+    faabric::util::SnapshotData& originalSnap = reg.getSnapshot(snapshotKey);
+    faabric::util::SnapshotData updatedSnap = exec->snapshot();
+
+    // Add a single merge region to catch both diffs
+    int offsetA = 10;
+    int offsetB = 100;
     std::vector<uint8_t> inputBytes = faabric::util::stringToBytes(msgInput);
-    std::vector<uint8_t> keyBytes = faabric::util::stringToBytes(snapshotKey);
 
-    uint32_t offsetA = 10;
-    uint32_t offsetB = 100;
+    originalSnap.addMergeRegion(
+      0,
+      offsetB + inputBytes.size() + 10,
+      faabric::util::SnapshotDataType::Raw,
+      faabric::util::SnapshotMergeOperation::Overwrite);
 
-    std::memcpy(snap.data + offsetA, keyBytes.data(), keyBytes.size());
-    std::memcpy(snap.data + offsetB, inputBytes.data(), inputBytes.size());
+    // Modify the executor's memory
+    std::vector<uint8_t> keyBytes = faabric::util::stringToBytes(snapshotKey);
+    std::memcpy(updatedSnap.data + offsetA, keyBytes.data(), keyBytes.size());
+    std::memcpy(
+      updatedSnap.data + offsetB, inputBytes.data(), inputBytes.size());
 
     return 123;
 }
@@ -89,6 +101,9 @@ int handleFakeDiffsThreadedFunction(
     std::string snapshotKey = "fake-diffs-threaded-snap";
     std::string msgInput = msg.inputdata();
 
+    faabric::snapshot::SnapshotRegistry& reg =
+      faabric::snapshot::getSnapshotRegistry();
+
     // This function creates a snapshot, then spawns some child threads that
     // will modify the shared memory. It then awaits the results and checks that
     // the modifications are synced back to the original host.
@@ -104,8 +119,6 @@ int handleFakeDiffsThreadedFunction(
         snap.data = snapMemory;
         snap.size = snapSize;
 
-        faabric::snapshot::SnapshotRegistry& reg =
-          faabric::snapshot::getSnapshotRegistry();
         reg.takeSnapshot(snapshotKey, snap);
 
         auto req =
@@ -121,7 +134,7 @@ int handleFakeDiffsThreadedFunction(
             // Make a small modification to a page that will also be edited by
             // the child thread to make sure it's not overwritten
             std::vector<uint8_t> localChange(3, i);
-            uint32_t offset = 2 * i * faabric::util::HOST_PAGE_SIZE;
+            int offset = 2 * i * faabric::util::HOST_PAGE_SIZE;
             std::memcpy(
               snapMemory + offset, localChange.data(), localChange.size());
         }
@@ -157,7 +170,7 @@ int handleFakeDiffsThreadedFunction(
         for (int i = 0; i < nThreads; i++) {
             // Check local modifications
             std::vector<uint8_t> expectedLocal(3, i);
-            uint32_t localOffset = 2 * i * faabric::util::HOST_PAGE_SIZE;
+            int localOffset = 2 * i * faabric::util::HOST_PAGE_SIZE;
             std::vector<uint8_t> actualLocal(snapMemory + localOffset,
                                              snapMemory + localOffset +
                                                expectedLocal.size());
@@ -168,7 +181,7 @@ int handleFakeDiffsThreadedFunction(
             }
 
             // Check remote modifications
-            uint32_t offset = 2 * i * faabric::util::HOST_PAGE_SIZE + 10;
+            int offset = 2 * i * faabric::util::HOST_PAGE_SIZE + 10;
             std::string expectedData("thread_" + std::to_string(i));
             auto* charPtr = reinterpret_cast<char*>(snapMemory + offset);
             std::string actual(charPtr);
@@ -185,15 +198,33 @@ int handleFakeDiffsThreadedFunction(
         }
 
     } else {
+        // This is the code that will be executed by the remote threads.
+        // Add a merge region to catch the modification
         int idx = msg.appidx();
-        uint32_t offset = 2 * idx * faabric::util::HOST_PAGE_SIZE + 10;
 
-        // Modify the executor's memory
+        int regionOffset = 2 * idx * faabric::util::HOST_PAGE_SIZE;
+        int changeOffset = regionOffset + 10;
+
+        // Get the input data
         std::vector<uint8_t> inputBytes =
           faabric::util::stringToBytes(msgInput);
 
-        faabric::util::SnapshotData snap = exec->snapshot();
-        std::memcpy(snap.data + offset, inputBytes.data(), inputBytes.size());
+        faabric::util::SnapshotData& originalSnap =
+          reg.getSnapshot(snapshotKey);
+        faabric::util::SnapshotData updatedSnap = exec->snapshot();
+
+        // Make sure it's captured by the region
+        int regionLength = 20 + inputBytes.size();
+        originalSnap.addMergeRegion(
+          regionOffset,
+          regionLength,
+          faabric::util::SnapshotDataType::Raw,
+          faabric::util::SnapshotMergeOperation::Overwrite);
+
+        // Now modify the memory
+        std::memcpy(updatedSnap.data + changeOffset,
+                    inputBytes.data(),
+                    inputBytes.size());
 
         return 0;
     }
diff --git a/tests/test/scheduler/test_executor.cpp b/tests/test/scheduler/test_executor.cpp
index 918b7d4ea..b1c74ed72 100644
--- a/tests/test/scheduler/test_executor.cpp
+++ b/tests/test/scheduler/test_executor.cpp
@@ -448,94 +448,6 @@ TEST_CASE_METHOD(TestExecutorFixture,
     }
 }
 
-TEST_CASE_METHOD(TestExecutorFixture,
-                 "Test executing remote chained threads",
-                 "[executor]")
-{
-    faabric::util::setMockMode(true);
-
-    std::string thisHost = conf.endpointHost;
-
-    // Add other host to available hosts
-    std::string otherHost = "other";
-    sch.addHostToGlobalSet(otherHost);
-
-    // Make sure we have only enough resources to execute the initial function
-    faabric::HostResources res;
-    res.set_slots(1);
-    sch.setThisHostResources(res);
-
-    // Set up other host to have some resources
-    faabric::HostResources resOther;
-    resOther.set_slots(20);
-    faabric::scheduler::queueResourceResponse(otherHost, resOther);
-
-    // Background thread to execute main function and await results
-    int nThreads = 8;
-    auto latch = faabric::util::Latch::create(2);
-    std::thread t([&latch, nThreads] {
-        std::shared_ptr<BatchExecuteRequest> req =
-          faabric::util::batchExecFactory("dummy", "thread-check", 1);
-        faabric::Message& msg = req->mutable_messages()->at(0);
-        msg.set_inputdata(std::to_string(nThreads));
-
-        auto& sch = faabric::scheduler::getScheduler();
-        sch.callFunctions(req, false);
-
-        latch->wait();
-
-        faabric::Message res = sch.getFunctionResult(msg.id(), 2000);
-        assert(res.returnvalue() == 0);
-    });
-
-    // Wait for the remote thread request to have been submitted
-    auto reqs = faabric::scheduler::getBatchRequests();
-    REQUIRE_RETRY(reqs = faabric::scheduler::getBatchRequests(),
-                  reqs.size() == 1);
-    std::string actualHost = reqs.at(0).first;
-    REQUIRE(actualHost == otherHost);
-
-    std::shared_ptr<faabric::BatchExecuteRequest> distReq = reqs.at(0).second;
-    REQUIRE(distReq->messages().size() == nThreads);
-    faabric::Message firstMsg = distReq->messages().at(0);
-
-    // Check restore hasn't been called (as we're the master)
-    REQUIRE(restoreCount == 0);
-
-    // Check the snapshot has been pushed to the other host
-    auto snapPushes = faabric::snapshot::getSnapshotPushes();
-    REQUIRE(snapPushes.size() == 1);
-    REQUIRE(snapPushes.at(0).first == otherHost);
-
-    // Now execute request on this host as if we were on the other host
-    conf.endpointHost = otherHost;
-    sch.callFunctions(distReq, true);
-
-    // Check restore has been called as we're no longer master
-    REQUIRE(restoreCount == 1);
-
-    // Wait for the results
-    auto results = faabric::snapshot::getThreadResults();
-    REQUIRE_RETRY(results = faabric::snapshot::getThreadResults(),
-                  results.size() == nThreads);
-
-    // Reset the host config for this host
-    conf.endpointHost = thisHost;
-
-    // Process the thread results
-    for (auto& r : results) {
-        REQUIRE(r.first == thisHost);
-        auto args = r.second;
-        sch.setThreadResultLocally(args.first, args.second);
-    }
-
-    // Rejoin the background thread
-    latch->wait();
-    if (t.joinable()) {
-        t.join();
-    }
-}
-
 TEST_CASE_METHOD(TestExecutorFixture,
                  "Test thread results returned on non-master",
                  "[executor]")

From 0d2708e425a9c978ceb557ff5aa70bb3ec7e1d96 Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Wed, 10 Nov 2021 08:35:15 +0000
Subject: [PATCH 28/31] Clearer error message when insufficient pool threads

---
 src/scheduler/Executor.cpp             | 11 ++++++++++-
 tests/test/scheduler/test_executor.cpp |  6 +++---
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp
index a8a1d27fd..49812a55c 100644
--- a/src/scheduler/Executor.cpp
+++ b/src/scheduler/Executor.cpp
@@ -157,7 +157,16 @@ void Executor::executeTasks(std::vector<int> msgIdxs,
             // to be the main thread and the zeroth in the communication group,
             // so will be blocking.
             if (isThreads && isMaster) {
-                assert(threadPoolSize > 2);
+                if (threadPoolSize <= 2) {
+                    SPDLOG_ERROR(
+                      "Insufficient pool threads ({}) to overload {} idx {}",
+                      availablePoolThreads.size(),
+                      funcStr,
+                      msg.appidx());
+
+                    throw std::runtime_error("Insufficient pool threads");
+                }
+
                 threadPoolIdx = (msg.appidx() % (threadPoolSize - 2)) + 2;
             } else {
                 threadPoolIdx = msg.appidx() % threadPoolSize;
diff --git a/tests/test/scheduler/test_executor.cpp b/tests/test/scheduler/test_executor.cpp
index b1c74ed72..573a08522 100644
--- a/tests/test/scheduler/test_executor.cpp
+++ b/tests/test/scheduler/test_executor.cpp
@@ -806,6 +806,8 @@ TEST_CASE_METHOD(TestExecutorFixture,
 {
     faabric::util::setMockMode(true);
 
+    conf.overrideCpuCount = 4;
+
     std::string hostOverride = conf.endpointHost;
     int nMessages = 1;
     faabric::BatchExecuteRequest::BatchExecuteType requestType =
@@ -846,8 +848,6 @@ TEST_CASE_METHOD(TestExecutorFixture,
 
     // As we're faking a non-master execution results will be sent back to
     // the fake master so we can't wait on them, thus have to sleep
-    SLEEP_MS(1000);
-
-    REQUIRE(resetCount == expectedResets);
+    REQUIRE_RETRY({}, resetCount == expectedResets);
 }
 }

From 4a76b0beb5c84b12a43de0e8add4ca7e32dc306d Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Wed, 10 Nov 2021 09:24:08 +0000
Subject: [PATCH 29/31] Bump cores in failing tests

---
 src/scheduler/Executor.cpp              |  2 +-
 tests/test/scheduler/test_scheduler.cpp | 10 ++++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp
index 49812a55c..b94606709 100644
--- a/src/scheduler/Executor.cpp
+++ b/src/scheduler/Executor.cpp
@@ -160,7 +160,7 @@ void Executor::executeTasks(std::vector<int> msgIdxs,
                 if (threadPoolSize <= 2) {
                     SPDLOG_ERROR(
                       "Insufficient pool threads ({}) to overload {} idx {}",
-                      availablePoolThreads.size(),
+                      threadPoolSize,
                       funcStr,
                       msg.appidx());
 
diff --git a/tests/test/scheduler/test_scheduler.cpp b/tests/test/scheduler/test_scheduler.cpp
index 9f27c5c0e..69c683119 100644
--- a/tests/test/scheduler/test_scheduler.cpp
+++ b/tests/test/scheduler/test_scheduler.cpp
@@ -191,6 +191,10 @@ TEST_CASE_METHOD(SlowExecutorFixture, "Test batch scheduling", "[scheduler]")
     int32_t expectedSubType;
     std::string expectedContextData;
 
+    int thisCores = 5;
+    faabric::util::SystemConfig& conf = faabric::util::getSystemConfig();
+    conf.overrideCpuCount = thisCores;
+
     SECTION("Threads")
     {
         execMode = faabric::BatchExecuteRequest::THREADS;
@@ -232,7 +236,7 @@ TEST_CASE_METHOD(SlowExecutorFixture, "Test batch scheduling", "[scheduler]")
     // Mock everything
     faabric::util::setMockMode(true);
 
-    std::string thisHost = faabric::util::getSystemConfig().endpointHost;
+    std::string thisHost = conf.endpointHost;
 
     // Set up another host
     std::string otherHost = "beta";
@@ -240,7 +244,6 @@ TEST_CASE_METHOD(SlowExecutorFixture, "Test batch scheduling", "[scheduler]")
 
     int nCallsOne = 10;
     int nCallsTwo = 5;
-    int thisCores = 5;
     int otherCores = 11;
     int nCallsOffloadedOne = nCallsOne - thisCores;
 
@@ -390,6 +393,9 @@ TEST_CASE_METHOD(SlowExecutorFixture,
                  "Test overloaded scheduler",
                  "[scheduler]")
 {
+    faabric::util::SystemConfig& conf = faabric::util::getSystemConfig();
+    conf.overrideCpuCount = 5;
+
     faabric::util::setMockMode(true);
 
     faabric::BatchExecuteRequest::BatchExecuteType execMode;

From ef52533ee5f6110db6289b48f9821a4b85fc064c Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Wed, 10 Nov 2021 14:08:38 +0000
Subject: [PATCH 30/31] Factor out scheduler decision making

---
 include/faabric/scheduler/Scheduler.h |  4 +++
 src/scheduler/Scheduler.cpp           | 44 +++++++++++++++------------
 src/transport/PointToPointBroker.cpp  |  4 +++
 3 files changed, 33 insertions(+), 19 deletions(-)

diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h
index 0e7414fcb..48da69b86 100644
--- a/include/faabric/scheduler/Scheduler.h
+++ b/include/faabric/scheduler/Scheduler.h
@@ -231,6 +231,10 @@ class Scheduler
 
     std::unordered_map<std::string, std::set<std::string>> registeredHosts;
 
+    faabric::util::SchedulingDecision makeSchedulingDecision(
+      std::shared_ptr<faabric::BatchExecuteRequest> req,
+      bool forceLocal);
+
     faabric::util::SchedulingDecision doCallFunctions(
       std::shared_ptr<faabric::BatchExecuteRequest> req,
       faabric::util::SchedulingDecision& decision,
diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp
index 53d9114d2..946700919 100644
--- a/src/scheduler/Scheduler.cpp
+++ b/src/scheduler/Scheduler.cpp
@@ -215,7 +215,6 @@ faabric::util::SchedulingDecision Scheduler::callFunctions(
     // Note, we assume all the messages are for the same function and have the
     // same master host
     faabric::Message& firstMsg = req->mutable_messages()->at(0);
-    std::string funcStr = faabric::util::funcToString(firstMsg, false);
     std::string masterHost = firstMsg.masterhost();
     if (masterHost.empty()) {
         std::string funcStrWithId = faabric::util::funcToString(firstMsg, true);
@@ -225,22 +224,40 @@ faabric::util::SchedulingDecision Scheduler::callFunctions(
 
     // If we're not the master host, we need to forward the request back to the
     // master host. This will only happen if a nested batch execution happens.
-    SchedulingDecision decision(firstMsg.appid(), firstMsg.groupid());
     if (!forceLocal && masterHost != thisHost) {
+        std::string funcStr = faabric::util::funcToString(firstMsg, false);
         SPDLOG_DEBUG("Forwarding {} back to master {}", funcStr, masterHost);
 
         getFunctionCallClient(masterHost).executeFunctions(req);
+        SchedulingDecision decision(firstMsg.appid(), firstMsg.groupid());
         decision.returnHost = masterHost;
         return decision;
     }
 
     faabric::util::FullLock lock(mx);
 
-    // -------------------------------------------
-    // DECISION BUILDING
-    // -------------------------------------------
-    std::vector<std::string> hosts;
+    SchedulingDecision decision = makeSchedulingDecision(req, forceLocal);
+
+    // Send out point-to-point mappings if necessary (unless being forced to
+    // execute locally, in which case they will be transmitted from the
+    // master)
+    if (!forceLocal && (firstMsg.groupid() > 0)) {
+        broker.setAndSendMappingsFromSchedulingDecision(decision);
+    }
+
+    // Pass decision as hint
+    return doCallFunctions(req, decision, lock);
+}
+
+faabric::util::SchedulingDecision Scheduler::makeSchedulingDecision(
+  std::shared_ptr<faabric::BatchExecuteRequest> req,
+  bool forceLocal)
+{
     int nMessages = req->messages_size();
+    faabric::Message& firstMsg = req->mutable_messages()->at(0);
+    std::string funcStr = faabric::util::funcToString(firstMsg, false);
+
+    std::vector<std::string> hosts;
     if (forceLocal) {
         // We're forced to execute locally here so we do all the messages
         for (int i = 0; i < nMessages; i++) {
@@ -338,23 +355,12 @@ faabric::util::SchedulingDecision Scheduler::callFunctions(
     assert(hosts.size() == nMessages);
 
     // Set up decision
+    SchedulingDecision decision(firstMsg.appid(), firstMsg.groupid());
     for (int i = 0; i < hosts.size(); i++) {
         decision.addMessage(hosts.at(i), req->messages().at(i));
     }
 
-    // -------------------------------------------
-    // POINT-TO-POINT MESSAGING
-    // -------------------------------------------
-
-    // Send out point-to-point mappings if necessary (unless being forced to
-    // execute locally, in which case they will be transmitted from the
-    // master)
-    if (!forceLocal && (firstMsg.groupid() > 0)) {
-        broker.setAndSendMappingsFromSchedulingDecision(decision);
-    }
-
-    // Pass decision as hint
-    return doCallFunctions(req, decision, lock);
+    return decision;
 }
 
 faabric::util::SchedulingDecision Scheduler::callFunctions(
diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp
index cb4c4ec2a..ae19bcb33 100644
--- a/src/transport/PointToPointBroker.cpp
+++ b/src/transport/PointToPointBroker.cpp
@@ -114,11 +114,15 @@ void PointToPointGroup::addGroupIfNotExists(int appId,
 
 void PointToPointGroup::clearGroup(int groupId)
 {
+    faabric::util::FullLock lock(groupsMutex);
+
     groups.erase(groupId);
 }
 
 void PointToPointGroup::clear()
 {
+    faabric::util::FullLock lock(groupsMutex);
+
     groups.clear();
 }
 

From 5af97fdb1b5f6687fdc4726732d42321276fd615 Mon Sep 17 00:00:00 2001
From: Simon Shillaker <scs17@ic.ac.uk>
Date: Wed, 10 Nov 2021 15:00:41 +0000
Subject: [PATCH 31/31] Fix locking bug

---
 src/transport/PointToPointBroker.cpp          |  2 +-
 .../transport/test_point_to_point_groups.cpp  | 84 ++++++++++++++-----
 2 files changed, 65 insertions(+), 21 deletions(-)

diff --git a/src/transport/PointToPointBroker.cpp b/src/transport/PointToPointBroker.cpp
index ae19bcb33..4d9b23fad 100644
--- a/src/transport/PointToPointBroker.cpp
+++ b/src/transport/PointToPointBroker.cpp
@@ -157,7 +157,7 @@ void PointToPointGroup::lock(int groupIdx, bool recursive)
                 // Recursive and either free, or already locked by this idx
                 recursiveLockOwners.push(groupIdx);
                 acquiredLock = true;
-            } else if (lockOwnerIdx == NO_LOCK_OWNER_IDX) {
+            } else if (!recursive && (lockOwnerIdx == NO_LOCK_OWNER_IDX)) {
                 // Non-recursive and free
                 lockOwnerIdx = groupIdx;
                 acquiredLock = true;
diff --git a/tests/test/transport/test_point_to_point_groups.cpp b/tests/test/transport/test_point_to_point_groups.cpp
index d0c4bfb33..757120b17 100644
--- a/tests/test/transport/test_point_to_point_groups.cpp
+++ b/tests/test/transport/test_point_to_point_groups.cpp
@@ -164,41 +164,85 @@ TEST_CASE_METHOD(PointToPointGroupFixture,
 }
 
 TEST_CASE_METHOD(PointToPointGroupFixture,
-                 "Test local locking and unlocking",
+                 "Test locking and unlocking",
                  "[ptp][transport]")
 {
-    std::atomic<int> sharedInt = 0;
     int appId = 123;
     int groupId = 234;
 
-    // Arbitrary group size, local locks don't care
-    auto group = setUpGroup(appId, groupId, 3);
+    int nThreads = 4;
+    int nLoops = 50;
 
-    group->localLock();
+    auto group = setUpGroup(appId, groupId, nThreads);
 
-    std::thread tA([&group, &sharedInt] {
-        group->localLock();
+    std::atomic<bool> success = true;
 
-        assert(sharedInt == 99);
-        sharedInt = 88;
+    int criticalVar = 1;
 
-        group->localUnlock();
-    });
+    bool useLocal = false;
+    bool recursive = false;
+    SECTION("Local-only") { useLocal = true; }
 
-    // Main thread sleep for a while, make sure the other can't run and update
-    // the counter
-    SLEEP_MS(1000);
+    SECTION("Distributed version non-recursive")
+    {
+        useLocal = false;
+        recursive = false;
+    }
 
-    REQUIRE(sharedInt == 0);
-    sharedInt.store(99);
+    SECTION("Distributed version recursive")
+    {
+        useLocal = false;
+        recursive = true;
+    }
 
-    group->localUnlock();
+    // Create high contention on the critical var that will be detected if
+    // locking isn't working.
+    std::vector<std::thread> threads;
+    for (int i = 0; i < nThreads; i++) {
+        threads.emplace_back(
+          [useLocal, recursive, i, nLoops, &group, &criticalVar, &success] {
+              if (useLocal) {
+                  group->localLock();
+              } else {
+                  group->lock(i, recursive);
+              }
+
+              // Check that while in this critical section, no changes from
+              // other threads are visible
+
+              criticalVar = 2;
+              for (int j = 0; j < nLoops; j++) {
+                  // Set the var
+                  criticalVar = i;
+
+                  // Sleep a bit
+                  int sleepTimeMs = std::rand() % 30;
+                  SLEEP_MS(sleepTimeMs);
+
+                  // Check the var is unchanged by others
+                  if (criticalVar != i) {
+                      SPDLOG_ERROR("Inner loop testing locking got {} != {}",
+                                   criticalVar,
+                                   i);
+                      success = false;
+                  }
+              }
+
+              if (useLocal) {
+                  group->localUnlock();
+              } else {
+                  group->unlock(i, recursive);
+              }
+          });
+    }
 
-    if (tA.joinable()) {
-        tA.join();
+    for (auto& t : threads) {
+        if (t.joinable()) {
+            t.join();
+        }
     }
 
-    REQUIRE(sharedInt == 88);
+    REQUIRE(success);
 }
 
 TEST_CASE_METHOD(PointToPointGroupFixture,