Scheduler hints and fixes for distributed coordination bugs #169

Merged: 31 commits, Nov 10, 2021

Commits
9fdfcef: Started fixes for locks (Shillaker, Nov 1, 2021)
a14a1bc: Clearing up groups (Shillaker, Nov 1, 2021)
b6a0c2e: Experiments with waiters/maps (Shillaker, Nov 1, 2021)
64dbd92: Add test for flag waiter (Shillaker, Nov 2, 2021)
9d4570d: Avoid scheduling multiple threads on same thread as 0th group (Shillaker, Nov 2, 2021)
249daf2: Overloading thread pool (Shillaker, Nov 2, 2021)
6fca18b: Moved unrelated changes to separate PR (Shillaker, Nov 2, 2021)
ea11f7a: Add locks.cpp (Shillaker, Nov 2, 2021)
314226f: Only lock on group when it exists (Shillaker, Nov 2, 2021)
bb5af80: Add custom merges (Shillaker, Nov 4, 2021)
2cd38e0: Overhaul of snapshot diffing (Shillaker, Nov 4, 2021)
db28492: Move spurious logging statement (Shillaker, Nov 4, 2021)
8ce8375: More snapshotting fixes (Shillaker, Nov 5, 2021)
627209b: Rearrange scheduler logic (Shillaker, Nov 5, 2021)
e2b8f3a: Fix indexing in scheduler loop (Shillaker, Nov 5, 2021)
d9cca41: Avoid pushing snapshots to master host (Shillaker, Nov 5, 2021)
a32b7a3: Remove last snapshot stuff (Shillaker, Nov 5, 2021)
4b6eb32: Add scheduler hints (Shillaker, Nov 8, 2021)
983dfaa: Fix bug in host ordering (Shillaker, Nov 8, 2021)
ce8a870: Sort out reusing ptp messages (Shillaker, Nov 8, 2021)
41118f9: Naming (Shillaker, Nov 8, 2021)
52b3877: Fixing up tests (Shillaker, Nov 9, 2021)
d472329: Continuing test fixes (Shillaker, Nov 9, 2021)
e46fe94: Remove ignore regions (Shillaker, Nov 9, 2021)
dd7b6b8: Add distributed locking test (Shillaker, Nov 9, 2021)
ebc21eb: Formatting (Shillaker, Nov 9, 2021)
11473c8: Fix dist tests and remove unnecessary unit test (Shillaker, Nov 10, 2021)
0d2708e: Clearer error message when insufficient pool threads (Shillaker, Nov 10, 2021)
4a76b0b: Bump cores in failing tests (Shillaker, Nov 10, 2021)
ef52533: Factor out scheduler decision making (Shillaker, Nov 10, 2021)
5af97fd: Fix locking bug (Shillaker, Nov 10, 2021)
21 changes: 12 additions & 9 deletions include/faabric/scheduler/Scheduler.h
@@ -81,13 +81,12 @@ class Executor
uint32_t threadPoolSize = 0;

private:
std::string lastSnapshot;

std::atomic<bool> claimed = false;

std::mutex threadsMutex;
std::vector<std::shared_ptr<std::thread>> threadPoolThreads;
std::vector<std::shared_ptr<std::thread>> deadThreads;
std::set<int> availablePoolThreads;

std::vector<faabric::util::Queue<ExecutorTask>> threadTaskQueues;

@@ -105,6 +104,10 @@ class Scheduler
std::shared_ptr<faabric::BatchExecuteRequest> req,
bool forceLocal = false);

faabric::util::SchedulingDecision callFunctions(
std::shared_ptr<faabric::BatchExecuteRequest> req,
faabric::util::SchedulingDecision& hint);

void reset();

void resetThreadLocalCache();
@@ -204,6 +207,8 @@ class Scheduler
std::promise<std::unique_ptr<faabric::Message>>>
localResults;

std::unordered_map<std::string, std::set<std::string>> pushedSnapshotsMap;

std::mutex localResultsMutex;

// ---- Clients ----
@@ -226,20 +231,18 @@

std::unordered_map<std::string, std::set<std::string>> registeredHosts;

faabric::util::SchedulingDecision doCallFunctions(
std::shared_ptr<faabric::BatchExecuteRequest> req,
faabric::util::SchedulingDecision& decision,
faabric::util::FullLock& lock);

std::shared_ptr<Executor> claimExecutor(
faabric::Message& msg,
faabric::util::FullLock& schedulerLock);

std::vector<std::string> getUnregisteredHosts(const std::string& funcStr,
bool noCache = false);

int scheduleFunctionsOnHost(
const std::string& host,
std::shared_ptr<faabric::BatchExecuteRequest> req,
faabric::util::SchedulingDecision& decision,
int offset,
faabric::util::SnapshotData* snapshot);

// ---- Accounting and debugging ----
std::vector<faabric::Message> recordedMessagesAll;
std::vector<faabric::Message> recordedMessagesLocal;
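The key addition in this header is the callFunctions overload taking a SchedulingDecision hint, with the placement logic factored into the private doCallFunctions (replacing the old scheduleFunctionsOnHost path). A minimal sketch of how a caller might drive it, assuming SchedulingDecision takes the app id in its constructor and exposes an addMessage(host, msg) helper (neither is shown in this diff):

// Sketch only; the SchedulingDecision constructor and addMessage signature
// are assumptions, not taken from this PR
#include <faabric/scheduler/Scheduler.h>
#include <faabric/util/scheduling.h>

faabric::util::SchedulingDecision callWithHint(
  std::shared_ptr<faabric::BatchExecuteRequest> req,
  const std::vector<std::string>& hosts)
{
    auto& sch = faabric::scheduler::getScheduler();

    // Pre-build the placement we want: one host per message in the batch
    faabric::util::SchedulingDecision hint(req->messages().at(0).appid());
    for (int i = 0; i < req->messages_size(); i++) {
        hint.addMessage(hosts.at(i), req->messages().at(i));
    }

    // The scheduler records and executes against the hinted placement
    // rather than computing its own
    return sch.callFunctions(req, hint);
}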
21 changes: 13 additions & 8 deletions include/faabric/transport/PointToPointBroker.h
@@ -2,6 +2,7 @@

#include <faabric/transport/PointToPointClient.h>
#include <faabric/util/config.h>
#include <faabric/util/locks.h>
#include <faabric/util/scheduling.h>

#include <condition_variable>
@@ -26,10 +27,16 @@ class PointToPointGroup
public:
static std::shared_ptr<PointToPointGroup> getGroup(int groupId);

static std::shared_ptr<PointToPointGroup> getOrAwaitGroup(int groupId);

static bool groupExists(int groupId);

static void addGroup(int appId, int groupId, int groupSize);

static void addGroupIfNotExists(int appId, int groupId, int groupSize);

static void clearGroup(int groupId);

static void clear();

PointToPointGroup(int appId, int groupIdIn, int groupSizeIn);
@@ -77,10 +84,6 @@ class PointToPointGroup
std::queue<int> lockWaiters;

void notifyLocked(int groupIdx);

void masterLock(int groupIdx, bool recursive);

void masterUnlock(int groupIdx, bool recursive);
};

class PointToPointBroker
@@ -108,21 +111,23 @@ class PointToPointBroker

std::vector<uint8_t> recvMessage(int groupId, int sendIdx, int recvIdx);

void clearGroup(int groupId);

void clear();

void resetThreadLocalCache();

private:
faabric::util::SystemConfig& conf;

std::shared_mutex brokerMutex;

std::unordered_map<int, std::set<int>> groupIdIdxsMap;
std::unordered_map<std::string, std::string> mappings;

std::unordered_map<int, bool> groupMappingsFlags;
std::unordered_map<int, std::mutex> groupMappingMutexes;
std::unordered_map<int, std::condition_variable> groupMappingCvs;
std::unordered_map<int, faabric::util::FlagWaiter> groupFlags;

faabric::util::SystemConfig& conf;
faabric::util::FlagWaiter& getGroupFlag(int groupId);
};

PointToPointBroker& getPointToPointBroker();
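The new static helpers getOrAwaitGroup and addGroupIfNotExists remove a race where a worker could query a group before its mapping had arrived. A short sketch of the intended call pattern on the two sides; the surrounding function names are illustrative, not from the diff:

#include <faabric/transport/PointToPointBroker.h>

// Receiving side of a mapping: register the group idempotently, so repeated
// mappings for the same group are harmless
void onGroupMapping(int appId, int groupId, int groupSize)
{
    faabric::transport::PointToPointGroup::addGroupIfNotExists(
      appId, groupId, groupSize);
}

// Worker side: block until the group exists instead of racing the mapping
// message (the diff suggests this is backed by a per-group FlagWaiter)
void useGroup(int groupId)
{
    auto group =
      faabric::transport::PointToPointGroup::getOrAwaitGroup(groupId);

    // Group-level coordination is now safe to use from here
}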
2 changes: 1 addition & 1 deletion include/faabric/transport/PointToPointServer.h
@@ -11,7 +11,7 @@ class PointToPointServer final : public MessageEndpointServer
PointToPointServer();

private:
PointToPointBroker& reg;
PointToPointBroker& broker;
Comment from the PR author (Shillaker):
This used to be called a "registry" hence the reg name, but it was a bit confusing.


void doAsyncRecv(int header,
const uint8_t* buffer,
23 changes: 23 additions & 0 deletions include/faabric/util/locks.h
@@ -1,10 +1,33 @@
#pragma once

#include <faabric/util/logging.h>

#include <atomic>
#include <condition_variable>
#include <mutex>
#include <shared_mutex>

#define DEFAULT_FLAG_WAIT_MS 10000

namespace faabric::util {
typedef std::unique_lock<std::mutex> UniqueLock;
typedef std::unique_lock<std::shared_mutex> FullLock;
typedef std::shared_lock<std::shared_mutex> SharedLock;

class FlagWaiter
{
public:
FlagWaiter(int timeoutMsIn = DEFAULT_FLAG_WAIT_MS);

void waitOnFlag();

void setFlag(bool value);

private:
int timeoutMs;

std::mutex flagMx;
std::condition_variable cv;
std::atomic<bool> flag;
};
}
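FlagWaiter is the small primitive behind the broker's per-group waiting: one thread blocks on waitOnFlag until another calls setFlag, with a timeout so a lost notification cannot hang the caller. A plausible implementation sketch, assuming the real body (added in locks.cpp by this PR, not shown here) follows the usual mutex/condition-variable pattern:

// Hypothetical sketch of the FlagWaiter implementation; the real definition
// lives in src/util/locks.cpp, which is not part of this header diff
#include <faabric/util/locks.h>

#include <chrono>
#include <stdexcept>

namespace faabric::util {

FlagWaiter::FlagWaiter(int timeoutMsIn)
  : timeoutMs(timeoutMsIn)
  , flag(false)
{}

void FlagWaiter::waitOnFlag()
{
    // Fast path: flag already set, nothing to wait for
    if (flag.load()) {
        return;
    }

    // Slow path: wait on the condition variable, bounded by the timeout so
    // a missed notification cannot block the caller forever
    UniqueLock lock(flagMx);
    if (!cv.wait_for(lock, std::chrono::milliseconds(timeoutMs), [this] {
            return flag.load();
        })) {
        SPDLOG_ERROR("Timed out waiting for flag");
        throw std::runtime_error("Timed out waiting for flag");
    }
}

void FlagWaiter::setFlag(bool value)
{
    UniqueLock lock(flagMx);
    flag.store(value);
    cv.notify_all();
}
}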
42 changes: 25 additions & 17 deletions include/faabric/util/snapshot.h
@@ -7,6 +7,7 @@
#include <vector>

#include <faabric/util/logging.h>
#include <faabric/util/macros.h>

namespace faabric::util {

@@ -27,14 +28,6 @@ enum SnapshotMergeOperation
Min
};

struct SnapshotMergeRegion
{
uint32_t offset = 0;
size_t length = 0;
SnapshotDataType dataType = SnapshotDataType::Raw;
SnapshotMergeOperation operation = SnapshotMergeOperation::Overwrite;
};

class SnapshotDiff
{
public:
@@ -44,6 +37,8 @@ class SnapshotDiff
size_t size = 0;
const uint8_t* data = nullptr;

bool noChange = false;

SnapshotDiff() = default;

SnapshotDiff(SnapshotDataType dataTypeIn,
@@ -58,13 +53,19 @@
data = dataIn;
size = sizeIn;
}
};

SnapshotDiff(uint32_t offsetIn, const uint8_t* dataIn, size_t sizeIn)
{
offset = offsetIn;
data = dataIn;
size = sizeIn;
}
class SnapshotMergeRegion
{
public:
uint32_t offset = 0;
size_t length = 0;
SnapshotDataType dataType = SnapshotDataType::Raw;
SnapshotMergeOperation operation = SnapshotMergeOperation::Overwrite;

void addDiffs(std::vector<SnapshotDiff>& diffs,
const uint8_t* original,
const uint8_t* updated);
};
Comment from the PR author (Shillaker):
Moved this class down the file as it now depends on the SnapshotDiff class which is declared above.


class SnapshotData
@@ -84,12 +85,19 @@
void addMergeRegion(uint32_t offset,
size_t length,
SnapshotDataType dataType,
SnapshotMergeOperation operation);
SnapshotMergeOperation operation,
bool overwrite = false);

private:
// Note - we care about the order of this map, as we iterate through it in
// order of offsets
// Note - we care about the order of this map, as we iterate through it
// in order of offsets
std::map<uint32_t, SnapshotMergeRegion> mergeRegions;

std::vector<SnapshotDiff> getCustomDiffs(const uint8_t* updated,
size_t updatedSize);

std::vector<SnapshotDiff> getStandardDiffs(const uint8_t* updated,
size_t updatedSize);
};

std::string snapshotDataTypeStr(SnapshotDataType dt);
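Moving addDiffs onto SnapshotMergeRegion is the core of the snapshot diffing overhaul: each region compares the original and updated memory and emits typed diffs. A simplified sketch of an Int/Sum region only, assuming Int and Sum enum values exist and that the SnapshotDiff constructor takes (dataType, operation, offset, data, size) as the truncated declaration above suggests; how the merged value is actually encoded for transport is not shown in this diff:

// Simplified sketch covering only an assumed Int/Sum case; Raw regions and
// the other merge operations are omitted
void SnapshotMergeRegion::addDiffs(std::vector<SnapshotDiff>& diffs,
                                   const uint8_t* original,
                                   const uint8_t* updated)
{
    if (dataType != SnapshotDataType::Int ||
        operation != SnapshotMergeOperation::Sum) {
        return;
    }

    int originalValue = *reinterpret_cast<const int*>(original + offset);
    int updatedValue = *reinterpret_cast<const int*>(updated + offset);

    if (originalValue == updatedValue) {
        // No change in this region, so nothing to send
        return;
    }

    // Emit a typed diff so the receiver can apply the Sum operation to its
    // own copy rather than blindly overwriting the bytes
    diffs.emplace_back(
      dataType, operation, offset, updated + offset, sizeof(int));
}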
78 changes: 49 additions & 29 deletions src/scheduler/Executor.cpp
@@ -2,6 +2,7 @@
#include <faabric/scheduler/Scheduler.h>
#include <faabric/snapshot/SnapshotRegistry.h>
#include <faabric/state/State.h>
#include <faabric/transport/PointToPointBroker.h>
#include <faabric/util/clock.h>
#include <faabric/util/config.h>
#include <faabric/util/environment.h>
@@ -44,6 +45,11 @@ Executor::Executor(faabric::Message& msg)
// Set an ID for this Executor
id = conf.endpointHost + "_" + std::to_string(faabric::util::generateGid());
SPDLOG_DEBUG("Starting executor {}", id);

// Mark all thread pool threads as available
for (int i = 0; i < threadPoolSize; i++) {
availablePoolThreads.insert(i);
}
}

Executor::~Executor() {}
@@ -82,8 +88,6 @@ void Executor::finish()
// Reset variables
boundMessage.Clear();

lastSnapshot = "";

claimed = false;

threadPoolThreads.clear();
@@ -107,30 +111,18 @@ void Executor::executeTasks(std::vector<int> msgIdxs,
faabric::util::UniqueLock lock(threadsMutex);

// Restore if necessary. If we're executing threads on the master host we
// assume we don't need to restore, but for everything else we do. If we've
// already restored from this snapshot, we don't do so again.
// assume we don't need to restore, but for everything else we do.
faabric::Message& firstMsg = req->mutable_messages()->at(0);
std::string snapshotKey = firstMsg.snapshotkey();
std::string thisHost = faabric::util::getSystemConfig().endpointHost;

bool isMaster = firstMsg.masterhost() == thisHost;
bool isThreads = req->type() == faabric::BatchExecuteRequest::THREADS;
bool isSnapshot = !snapshotKey.empty();
bool alreadyRestored = snapshotKey == lastSnapshot;

if (isSnapshot && !alreadyRestored) {
if ((!isMaster && isThreads) || !isThreads) {
SPDLOG_DEBUG("Restoring {} from snapshot {}", funcStr, snapshotKey);
lastSnapshot = snapshotKey;
restore(firstMsg);
} else {
SPDLOG_DEBUG("Skipping snapshot restore on master {} [{}]",
funcStr,
snapshotKey);
}
} else if (isSnapshot) {
SPDLOG_DEBUG(
"Skipping already restored snapshot {} [{}]", funcStr, snapshotKey);
if (isSnapshot && !isMaster) {
SPDLOG_DEBUG("Restoring {} from snapshot {}", funcStr, snapshotKey);
restore(firstMsg);
}

// Reset dirty page tracking if we're executing threads.
@@ -150,23 +142,42 @@
// original function call will cause a reset
bool skipReset = isMaster && isThreads;

// Iterate through and invoke tasks
// Iterate through and invoke tasks. By default, we allocate tasks
// one-to-one with thread pool threads. Only once the pool is exhausted do
// we start overallocating.
for (int msgIdx : msgIdxs) {
const faabric::Message& msg = req->messages().at(msgIdx);

// If executing threads, we must always keep thread pool index zero
// free, as this may be executing the function that spawned them
int threadPoolIdx;
if (isThreads) {
assert(threadPoolSize > 1);
threadPoolIdx = (msg.appidx() % (threadPoolSize - 1)) + 1;
int threadPoolIdx = -1;
if (availablePoolThreads.empty()) {
// Here all threads are still executing, so we have to overload.
// If any tasks are blocking we risk a deadlock, and can no longer
// guarantee the application will finish.
// In general if we're on the master host and this is a thread, we
// should avoid the zeroth and first pool threads as they are likely
Review comment from a collaborator:
I have read about this in the PR description, and this seems to me a risky game to be playing.

How likely, i.e. when, would such an overload happen?

Reply from the PR author (Shillaker, Nov 10, 2021):
Overloads like this would only happen when there aren't enough resources available, so they're unlikely. When the system is overloaded it can either keep accepting functions and do its best to execute them, or start rejecting requests with an error. Faabric does the former and starts queueing, as this would then trigger the underlying cluster to scale out in a "real" deployment.

This specific bit of code is only related to threading, i.e. when an application spawns more threads than there are cores in the system. Well written multi-threaded applications ought to request the level of parallelism available in the environment, at which point the system can specify an appropriate limit that will avoid this scenario (as is the case with OpenMP).

This behaviour is covered in a few tests so although the code shouldn't be triggered in a real deployment, it is still tested.

// to be the main thread and the zeroth in the communication group,
// so will be blocking.
if (isThreads && isMaster) {
assert(threadPoolSize > 2);
threadPoolIdx = (msg.appidx() % (threadPoolSize - 2)) + 2;
} else {
threadPoolIdx = msg.appidx() % threadPoolSize;
}

SPDLOG_WARN("Overloaded app index {} to thread {}",
msg.appidx(),
threadPoolIdx);
} else {
threadPoolIdx = msg.appidx() % threadPoolSize;
// Take next from those that are available
threadPoolIdx = *availablePoolThreads.begin();
availablePoolThreads.erase(threadPoolIdx);

SPDLOG_TRACE("Assigned app index {} to thread {}",
msg.appidx(),
threadPoolIdx);
}

// Enqueue the task
SPDLOG_TRACE(
"Assigning app index {} to thread {}", msg.appidx(), threadPoolIdx);
threadTaskQueues[threadPoolIdx].enqueue(ExecutorTask(
msgIdx, req, batchCounter, needsSnapshotPush, skipReset));

@@ -183,6 +194,8 @@ void Executor::threadPoolThread(int threadPoolIdx)
SPDLOG_DEBUG("Thread pool thread {}:{} starting up", id, threadPoolIdx);

auto& sch = faabric::scheduler::getScheduler();
faabric::transport::PointToPointBroker& broker =
faabric::transport::getPointToPointBroker();
const auto& conf = faabric::util::getSystemConfig();

bool selfShutdown = false;
@@ -286,6 +299,12 @@
releaseClaim();
}

// Return this thread index to the pool available for scheduling
{
faabric::util::UniqueLock lock(threadsMutex);
availablePoolThreads.insert(threadPoolIdx);
}

// Vacate the slot occupied by this task. This must be done after
// releasing the claim on this executor, otherwise the scheduler may try
// to schedule another function and be unable to reuse this executor.
@@ -333,8 +352,9 @@
}

// We have to clean up TLS here as this should be the last use of the
// scheduler from this thread
// scheduler and point-to-point broker from this thread
sch.resetThreadLocalCache();
broker.resetThreadLocalCache();
}

bool Executor::tryClaim()
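Taken together, the Executor changes make thread-pool allocation explicit: indices are claimed from availablePoolThreads in executeTasks and returned in threadPoolThread, and only when the set is empty does the executor fall back to the overload policy discussed in the review thread above. A condensed, standalone illustration of that claim/release lifecycle (not faabric code, just the same std::set-plus-mutex idea under a single lock):

// Condensed illustration of the pool-index lifecycle introduced in this PR
#include <mutex>
#include <set>

class PoolIndexAllocator
{
  public:
    explicit PoolIndexAllocator(int poolSize)
    {
        // Mark every pool thread as available, mirroring the Executor ctor
        for (int i = 0; i < poolSize; i++) {
            available.insert(i);
        }
    }

    // Claim the lowest free index, or report that the pool is exhausted
    bool tryClaim(int& idxOut)
    {
        std::lock_guard<std::mutex> lock(mx);
        if (available.empty()) {
            return false; // caller falls back to the overload policy
        }
        idxOut = *available.begin();
        available.erase(idxOut);
        return true;
    }

    // Return the index once the task has finished executing
    void release(int idx)
    {
        std::lock_guard<std::mutex> lock(mx);
        available.insert(idx);
    }

  private:
    std::mutex mx;
    std::set<int> available;
};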