From 867f41f3681cc11caaee42cbefcd657f3629df50 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Fri, 7 Jan 2022 12:48:09 +0000 Subject: [PATCH 01/26] proto: add field to set migration check period --- src/proto/faabric.proto | 3 +++ src/util/json.cpp | 7 +++++++ tests/test/util/test_json.cpp | 2 ++ 3 files changed, 12 insertions(+) diff --git a/src/proto/faabric.proto b/src/proto/faabric.proto index 7982d34f4..50fc13dcf 100644 --- a/src/proto/faabric.proto +++ b/src/proto/faabric.proto @@ -164,6 +164,9 @@ message Message { bool recordExecGraph = 41; map intExecGraphDetails = 42; map execGraphDetails = 43; + + // Function migration + int32 migrationCheckPeriod = 44; } // --------------------------------------------- diff --git a/src/util/json.cpp b/src/util/json.cpp index 84b6da020..a4e2f2904 100644 --- a/src/util/json.cpp +++ b/src/util/json.cpp @@ -228,6 +228,10 @@ std::string messageToJson(const faabric::Message& msg) } } + if (msg.migrationcheckperiod() > 0) { + d.AddMember("migration_check_period", msg.migrationcheckperiod(), a); + } + StringBuffer sb; Writer writer(sb); d.Accept(writer); @@ -439,6 +443,9 @@ faabric::Message jsonToMessage(const std::string& jsonIn) msgIntMap[it.first] = it.second; } + msg.set_migrationcheckperiod( + getIntFromJson(d, "migration_check_period", 0)); + PROF_END(jsonDecode) return msg; diff --git a/tests/test/util/test_json.cpp b/tests/test/util/test_json.cpp index f8ac1766d..b38b82948 100644 --- a/tests/test/util/test_json.cpp +++ b/tests/test/util/test_json.cpp @@ -47,6 +47,8 @@ TEST_CASE("Test message to JSON round trip", "[util]") auto& intMap = *msg.mutable_intexecgraphdetails(); intMap["foo"] = 0; + msg.set_migrationcheckperiod(33); + SECTION("Dodgy characters") { msg.set_inputdata("[0], %$ 2233 9"); } SECTION("Bytes") From 4eef2d348a90568355fedcb6e63e96a8e0e11c6b Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Fri, 7 Jan 2022 13:15:23 +0000 Subject: [PATCH 02/26] scheduler: add function migration thread that sleeps for 
a fixed amount of time, and a simple test. --- .../scheduler/FunctionMigrationThread.h | 26 ++++++++ include/faabric/scheduler/Scheduler.h | 5 ++ src/scheduler/CMakeLists.txt | 1 + src/scheduler/FunctionMigrationThread.cpp | 59 +++++++++++++++++++ src/scheduler/Scheduler.cpp | 5 ++ .../scheduler/test_function_migration.cpp | 34 +++++++++++ 6 files changed, 130 insertions(+) create mode 100644 include/faabric/scheduler/FunctionMigrationThread.h create mode 100644 src/scheduler/FunctionMigrationThread.cpp create mode 100644 tests/test/scheduler/test_function_migration.cpp diff --git a/include/faabric/scheduler/FunctionMigrationThread.h b/include/faabric/scheduler/FunctionMigrationThread.h new file mode 100644 index 000000000..876ca917f --- /dev/null +++ b/include/faabric/scheduler/FunctionMigrationThread.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include +#include + +namespace faabric::scheduler { +class FunctionMigrationThread +{ + public: + // Start a background thread that, every wake up period, will check if there + // are migration opportunities for in-flight apps that have opted in to + // being checked for migrations. 
+ void start(int wakeUpPeriodSecondsIn); + + void stop(); + + int wakeUpPeriodSeconds; + + private: + std::unique_ptr workThread = nullptr; + std::mutex mx; + std::condition_variable mustStopCv; + std::atomic isShutdown; +}; +} diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index 58b84a623..70743576d 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -220,6 +220,11 @@ class Scheduler ExecGraph getFunctionExecGraph(unsigned int msgId); + // ---------------------------------- + // Function Migration + // ---------------------------------- + void checkForMigrationOpportunities(); + private: std::string thisHost; diff --git a/src/scheduler/CMakeLists.txt b/src/scheduler/CMakeLists.txt index f3772c10a..f1ced9340 100644 --- a/src/scheduler/CMakeLists.txt +++ b/src/scheduler/CMakeLists.txt @@ -4,6 +4,7 @@ faabric_lib(scheduler Executor.cpp FunctionCallClient.cpp FunctionCallServer.cpp + FunctionMigrationThread.cpp MpiContext.cpp MpiMessageBuffer.cpp MpiWorld.cpp diff --git a/src/scheduler/FunctionMigrationThread.cpp b/src/scheduler/FunctionMigrationThread.cpp new file mode 100644 index 000000000..805f7f8a8 --- /dev/null +++ b/src/scheduler/FunctionMigrationThread.cpp @@ -0,0 +1,59 @@ +#include +#include +#include +#include + +namespace faabric::scheduler { +void FunctionMigrationThread::start(int wakeUpPeriodSecondsIn) +{ + wakeUpPeriodSeconds = wakeUpPeriodSecondsIn; + + // Main work loop + workThread = std::make_unique([&] { + // As we only check for migration opportunities every (possibly user- + // defined) timeout, we also support stopping the main thread through + // a condition variable. 
+ while (!isShutdown.load(std::memory_order_acquire)) { + faabric::util::UniqueLock lock(mx); + + if (isShutdown.load(std::memory_order_acquire)) { + break; + } + + std::cv_status returnVal = mustStopCv.wait_for( + lock, std::chrono::milliseconds(wakeUpPeriodSeconds * 1000)); + + // If we hit the timeout it means we have not been notified to + // reset or stop. Thus we check for migration oportunities. + if (returnVal == std::cv_status::timeout) { + SPDLOG_DEBUG("Checking for migration oportunities"); + faabric::scheduler::getScheduler() + .checkForMigrationOpportunities(); + } + }; + + SPDLOG_DEBUG("Exiting main function migration thread loop"); + }); +} + +void FunctionMigrationThread::stop() +{ + if (workThread == nullptr) { + return; + } + + { + faabric::util::UniqueLock lock(mx); + + // We set the flag _before_ we notify and after we acquire the lock. + // Therefore, either we check the flag (before going to sleep) or are + // woken by the notification. + isShutdown.store(true, std::memory_order_release); + mustStopCv.notify_one(); + } + + if (workThread->joinable()) { + workThread->join(); + } +} +} diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 6dfcbb072..c4af4fbe0 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -1129,4 +1129,9 @@ ExecGraphNode Scheduler::getFunctionExecGraphNode(unsigned int messageId) return node; } + +void Scheduler::checkForMigrationOpportunities() +{ + SPDLOG_INFO("Not implemented"); +} } diff --git a/tests/test/scheduler/test_function_migration.cpp b/tests/test/scheduler/test_function_migration.cpp new file mode 100644 index 000000000..d56db92e5 --- /dev/null +++ b/tests/test/scheduler/test_function_migration.cpp @@ -0,0 +1,34 @@ +#include + +#include +#include + +#include +#include + +using namespace faabric::scheduler; + +namespace tests { +class FunctionMigrationTestFixture : public SchedulerTestFixture +{ + public: + FunctionMigrationTestFixture() { 
faabric::util::setMockMode(true); } + + ~FunctionMigrationTestFixture() { faabric::util::setMockMode(false); } + + protected: + FunctionMigrationThread migrationThread; +}; + +TEST_CASE_METHOD(FunctionMigrationTestFixture, + "Test starting and stopping the function migration thread", + "[scheduler]") +{ + int wakeUpPeriodSeconds = 2; + migrationThread.start(wakeUpPeriodSeconds); + + SLEEP_MS(SHORT_TEST_TIMEOUT_MS); + + migrationThread.stop(); +} +} From 0a971c8b4c066b1766e2ccf07e2b15a099e97f66 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Fri, 7 Jan 2022 20:20:33 +0000 Subject: [PATCH 03/26] scheduler: add logic for check for migration opportunities method --- include/faabric/scheduler/Scheduler.h | 16 +- include/faabric/util/scheduling.h | 12 + src/proto/faabric.proto | 17 ++ src/scheduler/Scheduler.cpp | 169 ++++++++++- tests/test/scheduler/test_executor.cpp | 17 ++ .../scheduler/test_function_migration.cpp | 274 +++++++++++++++++- 6 files changed, 500 insertions(+), 5 deletions(-) diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index 70743576d..d7b9efa35 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -22,6 +22,10 @@ namespace faabric::scheduler { +typedef std::pair, + std::shared_ptr> + InFlightPair; + class Scheduler; Scheduler& getScheduler(); @@ -223,7 +227,12 @@ class Scheduler // ---------------------------------- // Function Migration // ---------------------------------- - void checkForMigrationOpportunities(); + void checkForMigrationOpportunities( + faabric::util::MigrationStrategy = + faabric::util::MigrationStrategy::BIN_PACK); + + std::shared_ptr canAppBeMigrated( + uint32_t appId); private: std::string thisHost; @@ -295,6 +304,11 @@ class Scheduler // ---- Point-to-point ---- faabric::transport::PointToPointBroker& broker; + + // ---- Function migration ---- + std::unordered_map inFlightRequests; + std::unordered_map> + pendingMigrations; }; } diff 
--git a/include/faabric/util/scheduling.h b/include/faabric/util/scheduling.h index 0f2a6a64d..2905d8862 100644 --- a/include/faabric/util/scheduling.h +++ b/include/faabric/util/scheduling.h @@ -54,4 +54,16 @@ enum SchedulingTopologyHint FORCE_LOCAL, NEVER_ALONE }; + +// Migration strategies help the scheduler decide wether the scheduling decision +// for a batch request could be changed with the new set of available resources. +// - BIN_PACK: sort hosts by the number of functions from the batch they are +// running. Bin-pack batches in increasing order to hosts in +// decreasing order. +// - EMPTY_HOSTS: pack batches in increasing order to empty hosts. +enum MigrationStrategy +{ + BIN_PACK, + EMPTY_HOSTS +}; } diff --git a/src/proto/faabric.proto b/src/proto/faabric.proto index 50fc13dcf..7382b9fe7 100644 --- a/src/proto/faabric.proto +++ b/src/proto/faabric.proto @@ -245,3 +245,20 @@ message PointToPointMappings { repeated PointToPointMapping mappings = 3; } + +// --------------------------------------------- +// FUNCTION MIGRATIONS +// --------------------------------------------- + +message PendingMigrations { + int32 appId = 1; + int32 groupId = 2; + + message PendingMigration { + int32 messageId = 1; + string srcHost = 2; + string dstHost = 3; + } + + repeated PendingMigration migrations = 3; +} diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index c4af4fbe0..1697f1d68 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -137,6 +137,10 @@ void Scheduler::reset() threadResults.clear(); pushedSnapshotsMap.clear(); + // Reset function migration tracking + inFlightRequests.clear(); + pendingMigrations.clear(); + // Records recordedMessagesAll.clear(); recordedMessagesLocal.clear(); @@ -452,6 +456,25 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( throw std::runtime_error("Invalid scheduler hint for messages"); } + // Record in-flight request if function desires to be migrated + if 
(firstMsg.migrationcheckperiod() > 0) { + auto decisionPtr = + std::make_shared(decision); + inFlightRequests[decision.appId] = std::make_pair(req, decisionPtr); + /* + if (inFlightRequests.size() == 1) { + functionMigrationThread.start(firstMsg.migrationcheckperiod()); + } else if (firstMsg.migrationcheckperiod() != + functionMigrationThread.wakeUpPeriodSeconds) { + SPDLOG_WARN("Ignoring migration check period as the migration" + "thread was initialised with a different one." + "(provided: {}, current: {})", + firstMsg.migrationcheckperiod(), + functionMigrationThread.wakeUpPeriodSeconds); + } + */ + } + // NOTE: we want to schedule things on this host _last_, otherwise functions // may start executing before all messages have been dispatched, thus // slowing the remaining scheduling. @@ -889,6 +912,20 @@ void Scheduler::setFunctionResult(faabric::Message& msg) // Write the successful result to the result queue std::vector inputData = faabric::util::messageToBytes(msg); redis.publishSchedulerResult(key, msg.statuskey(), inputData); + + // Remove the app from in-flight map if still there, and this host is the + // master host for the message + if (msg.masterhost() == thisHost) { + faabric::util::FullLock lock(mx); + + inFlightRequests.erase(msg.appid()); + pendingMigrations.erase(msg.appid()); + // If there are no more apps to track, stop the thread checking for + // migration opportunities + if (inFlightRequests.size() == 0) { + // functionMigrationThread.stop(); + } + } } void Scheduler::registerThread(uint32_t msgId) @@ -1130,8 +1167,136 @@ ExecGraphNode Scheduler::getFunctionExecGraphNode(unsigned int messageId) return node; } -void Scheduler::checkForMigrationOpportunities() +void Scheduler::checkForMigrationOpportunities( + faabric::util::MigrationStrategy migrationStrategy) { - SPDLOG_INFO("Not implemented"); + // Vector to cache all migrations we have to do, and update the shared map + // at the very end just once. 
This is because we need a unique lock to write + // to the shared map, but the rest of this method can do with a shared lock. + std::vector> + tmpPendingMigrations; + + { + faabric::util::SharedLock lock(mx); + + // For each in-flight request that has opted in to be migrated, + // check if there is an opportunity to migrate + for (const auto& app : inFlightRequests) { + auto req = app.second.first; + auto originalDecision = *app.second.second; + + // If we have already recorded a pending migration for this req, + // skip + if (canAppBeMigrated(originalDecision.appId) != nullptr) { + continue; + } + + faabric::PendingMigrations msg; + msg.set_appid(originalDecision.appId); + // TODO - generate a new groupId here for processes to wait on + // during the migration? msg.set_groupid(); + + if (migrationStrategy == + faabric::util::MigrationStrategy::BIN_PACK) { + // We assume the batch was originally scheduled using + // bin-packing, thus the scheduling decision has at the begining + // (left) the hosts with the most allocated requests, and at the + // end (right) the hosts with the fewest. To check for migration + // oportunities, we compare a pointer to the possible + // destination of the migration (left), with one to the possible + // source of the migration (right). NOTE - this is a slight + // simplification, but makes the code simpler. + auto left = originalDecision.hosts.begin(); + auto right = originalDecision.hosts.end() - 1; + faabric::HostResources r = (*left == thisHost) + ? 
getThisHostResources() + : getHostResources(*left); + auto nAvailable = [&r]() -> int { + return r.slots() - r.usedslots(); + }; + auto claimSlot = [&r]() { + int currentUsedSlots = r.usedslots(); + SPDLOG_INFO("Old slots: {} - New slots: {}", + currentUsedSlots, + currentUsedSlots + 1); + r.set_usedslots(currentUsedSlots + 1); + }; + while (left < right) { + // If both pointers point to the same host, no migration + // opportunity, and must check another possible source of + // the migration + if (*left == *right) { + --right; + continue; + } + + // If the left pointer (possible destination of the + // migration) is out of available resources, no migration + // opportunity, and must check another possible destination + // of migration + if (nAvailable() == 0) { + auto oldHost = *left; + ++left; + if (*left != oldHost) { + r = (*left == thisHost) ? getThisHostResources() + : getHostResources(*left); + } + continue; + } + + // If each pointer points to a request scheduled in a + // different host, and the possible destination has slots, + // there is a migration opportunity + auto* migration = msg.add_migrations(); + auto msgIdPtr = + originalDecision.messageIds.begin() + + std::distance(originalDecision.hosts.begin(), right); + migration->set_messageid(*msgIdPtr); + migration->set_srchost(*right); + migration->set_dsthost(*left); + // Decrement by one the availability, and check for more + // possible sources of migration + claimSlot(); + --right; + } + } else { + SPDLOG_ERROR("Unrecognised migration strategy: {}", + migrationStrategy); + throw std::runtime_error("Unrecognised migration strategy."); + } + + if (msg.migrations_size() > 0) { + tmpPendingMigrations.emplace_back( + std::make_shared(msg)); + SPDLOG_DEBUG("Detected migration opportunity for app: {}", + msg.appid()); + } else { + SPDLOG_DEBUG("No migration opportunity detected for app: {}", + msg.appid()); + } + } + } + + // Finally, store all the pending migrations in the shared map acquiring + // a 
unique lock. + if (tmpPendingMigrations.size() > 0) { + faabric::util::FullLock lock(mx); + for (auto msgPtr : tmpPendingMigrations) { + SPDLOG_INFO("Adding app: {}", msgPtr->appid()); + pendingMigrations[msgPtr->appid()] = std::move(msgPtr); + } + } +} + +std::shared_ptr Scheduler::canAppBeMigrated( + uint32_t appId) +{ + faabric::util::SharedLock lock(mx); + + if (pendingMigrations.find(appId) == pendingMigrations.end()) { + return nullptr; + } + + return pendingMigrations[appId]; } } diff --git a/tests/test/scheduler/test_executor.cpp b/tests/test/scheduler/test_executor.cpp index 3af1a230b..c456ef282 100644 --- a/tests/test/scheduler/test_executor.cpp +++ b/tests/test/scheduler/test_executor.cpp @@ -201,6 +201,23 @@ int32_t TestExecutor::executeTask( throw std::runtime_error("This is a test error"); } + if (msg.function() == "sleep") { + // Sleep for sufficiently more than the check period + int timeToSleepMs = SHORT_TEST_TIMEOUT_MS; + if (!msg.inputdata().empty()) { + timeToSleepMs = std::stoi(msg.inputdata()); + } + SPDLOG_DEBUG("Sleep test function going to sleep for {} ms", + timeToSleepMs); + SLEEP_MS(timeToSleepMs); + SPDLOG_DEBUG("Sleep test function waking up"); + + msg.set_outputdata( + fmt::format("Migration test function {} executed", msg.id())); + + return 0; + } + if (reqOrig->type() == faabric::BatchExecuteRequest::THREADS) { SPDLOG_DEBUG("TestExecutor executing simple thread {}", msg.id()); return msg.id() / 100; diff --git a/tests/test/scheduler/test_function_migration.cpp b/tests/test/scheduler/test_function_migration.cpp index d56db92e5..7d2090a08 100644 --- a/tests/test/scheduler/test_function_migration.cpp +++ b/tests/test/scheduler/test_function_migration.cpp @@ -4,6 +4,8 @@ #include #include +#include +#include #include using namespace faabric::scheduler; @@ -12,12 +14,65 @@ namespace tests { class FunctionMigrationTestFixture : public SchedulerTestFixture { public: - FunctionMigrationTestFixture() { faabric::util::setMockMode(true); } + 
FunctionMigrationTestFixture() + { + faabric::util::setMockMode(true); - ~FunctionMigrationTestFixture() { faabric::util::setMockMode(false); } + std::shared_ptr fac = + std::make_shared(); + setExecutorFactory(fac); + } + + ~FunctionMigrationTestFixture() + { + sch.clearRecordedMessages(); + faabric::util::setMockMode(false); + + // Remove all hosts from global set + for (const std::string& host : sch.getAvailableHosts()) { + sch.removeHostFromGlobalSet(host); + } + } protected: FunctionMigrationThread migrationThread; + std::string masterHost = faabric::util::getSystemConfig().endpointHost; + + // Helper method to set the available hosts and slots per host prior to + // making a scheduling decision + void setHostResources(std::vector registeredHosts, + std::vector slotsPerHost, + std::vector usedSlotsPerHost) + { + assert(registeredHosts.size() == slotsPerHost.size()); + auto& sch = faabric::scheduler::getScheduler(); + sch.clearRecordedMessages(); + + for (int i = 0; i < registeredHosts.size(); i++) { + faabric::HostResources resources; + resources.set_slots(slotsPerHost.at(i)); + resources.set_usedslots(usedSlotsPerHost.at(i)); + + sch.addHostToGlobalSet(registeredHosts.at(i)); + + // If setting resources for the master host, update the scheduler. 
+ // Otherwise, queue the resource response + if (i == 0) { + sch.setThisHostResources(resources); + } else { + faabric::scheduler::queueResourceResponse(registeredHosts.at(i), + resources); + } + } + } + + void updateLocalResources(int slots, int usedSlots) + { + faabric::HostResources r; + r.set_slots(slots); + r.set_usedslots(usedSlots); + sch.setThisHostResources(r); + } }; TEST_CASE_METHOD(FunctionMigrationTestFixture, @@ -31,4 +86,219 @@ TEST_CASE_METHOD(FunctionMigrationTestFixture, migrationThread.stop(); } + +TEST_CASE_METHOD( + FunctionMigrationTestFixture, + "Test function migration thread only works if set in the message", + "[scheduler]") +{ + // First set resources before calling the functions: one will be allocated + // locally, another one in the remote host + std::vector hosts = { masterHost, "hostA" }; + std::vector slots = { 1, 1 }; + std::vector usedSlots = { 0, 0 }; + setHostResources(hosts, slots, usedSlots); + + // The sleep function sleeps for a set timeout before returning + auto req = faabric::util::batchExecFactory("foo", "sleep", 2); + int timeToSleep = SHORT_TEST_TIMEOUT_MS; + req->mutable_messages()->at(0).set_inputdata(std::to_string(timeToSleep)); + uint32_t appId = req->messages().at(0).appid(); + + std::shared_ptr expectedMigrations; + SECTION("Migration not enabled") { expectedMigrations = nullptr; } + + SECTION("Migration enabled") + { + // Set to a non-zero value so that migration is enabled + req->mutable_messages()->at(0).set_migrationcheckperiod(2); + + // Build expected result + faabric::PendingMigrations expected; + expected.set_appid(appId); + auto* migration = expected.add_migrations(); + migration->set_messageid(req->messages().at(1).id()); + migration->set_srchost(hosts.at(1)); + migration->set_dsthost(hosts.at(0)); + expectedMigrations = + std::make_shared(expected); + } + + auto decision = sch.callFunctions(req); + + // Update host resources so that a migration opportunity appears, but will + // only be detected if 
migration check period is set. + updateLocalResources(2, 1); + + sch.checkForMigrationOpportunities(); + + auto actualMigrations = sch.canAppBeMigrated(appId); + if (expectedMigrations == nullptr) { + REQUIRE(actualMigrations == expectedMigrations); + } else { + REQUIRE(actualMigrations->appid() == expectedMigrations->appid()); + REQUIRE(actualMigrations->migrations_size() == + expectedMigrations->migrations_size()); + for (int i = 0; i < actualMigrations->migrations_size(); i++) { + auto actual = actualMigrations->mutable_migrations()->at(i); + auto expected = expectedMigrations->mutable_migrations()->at(i); + REQUIRE(actual.messageid() == expected.messageid()); + REQUIRE(actual.srchost() == expected.srchost()); + REQUIRE(actual.dsthost() == expected.dsthost()); + } + } + + faabric::Message res = + sch.getFunctionResult(req->messages().at(0).id(), 2 * timeToSleep); + REQUIRE(res.returnvalue() == 0); + + // Check that after the result is set, the app can't be migrated no more + sch.checkForMigrationOpportunities(); + REQUIRE(sch.canAppBeMigrated(appId) == nullptr); +} + +TEST_CASE_METHOD( + FunctionMigrationTestFixture, + "Test function migration thread detects migration opportunities", + "[scheduler]") +{ + std::vector hosts = { masterHost, "hostA" }; + std::vector slots = { 1, 1 }; + std::vector usedSlots = { 0, 0 }; + setHostResources(hosts, slots, usedSlots); + + auto req = faabric::util::batchExecFactory("foo", "sleep", 2); + int timeToSleep = SHORT_TEST_TIMEOUT_MS; + req->mutable_messages()->at(0).set_inputdata(std::to_string(timeToSleep)); + req->mutable_messages()->at(0).set_migrationcheckperiod(2); + uint32_t appId = req->messages().at(0).appid(); + + auto decision = sch.callFunctions(req); + + std::shared_ptr expectedMigrations; + + // As we don't update the available resources, no migration opportunities + // will appear, even though we are checking for them + SECTION("Can not migrate") { expectedMigrations = nullptr; } + + SECTION("Can migrate") + { + 
// Update host resources so that a migration opportunity appears + updateLocalResources(2, 1); + + // Build expected result + faabric::PendingMigrations expected; + expected.set_appid(appId); + auto* migration = expected.add_migrations(); + migration->set_messageid(req->messages().at(1).id()); + migration->set_srchost(hosts.at(1)); + migration->set_dsthost(hosts.at(0)); + expectedMigrations = + std::make_shared(expected); + } + + sch.checkForMigrationOpportunities(); + + auto actualMigrations = sch.canAppBeMigrated(appId); + if (expectedMigrations == nullptr) { + REQUIRE(actualMigrations == expectedMigrations); + } else { + REQUIRE(actualMigrations->appid() == expectedMigrations->appid()); + REQUIRE(actualMigrations->migrations_size() == + expectedMigrations->migrations_size()); + for (int i = 0; i < actualMigrations->migrations_size(); i++) { + auto actual = actualMigrations->mutable_migrations()->at(i); + auto expected = expectedMigrations->mutable_migrations()->at(i); + REQUIRE(actual.messageid() == expected.messageid()); + REQUIRE(actual.srchost() == expected.srchost()); + REQUIRE(actual.dsthost() == expected.dsthost()); + } + } + + faabric::Message res = + sch.getFunctionResult(req->messages().at(0).id(), 2 * timeToSleep); + REQUIRE(res.returnvalue() == 0); + + // Check that after the result is set, the app can't be migrated no more + sch.checkForMigrationOpportunities(); + REQUIRE(sch.canAppBeMigrated(appId) == nullptr); +} + +TEST_CASE_METHOD( + FunctionMigrationTestFixture, + "Test detecting migration opportunities with several hosts and requests", + "[scheduler]") +{ + // First set resources before calling the functions: one will be allocated + // locally, another one in the remote host + std::vector hosts = { masterHost, "hostA", "hostB", "hostC" }; + std::vector slots = { 1, 1, 1, 1 }; + std::vector usedSlots = { 0, 0, 0, 0 }; + setHostResources(hosts, slots, usedSlots); + + auto req = faabric::util::batchExecFactory("foo", "sleep", 4); + int timeToSleep 
= SHORT_TEST_TIMEOUT_MS; + req->mutable_messages()->at(0).set_inputdata(std::to_string(timeToSleep)); + req->mutable_messages()->at(0).set_migrationcheckperiod(2); + uint32_t appId = req->messages().at(0).appid(); + + auto decision = sch.callFunctions(req); + + // Set up expectations + std::shared_ptr expectedMigrations; + SECTION("Can not migrate") { expectedMigrations = nullptr; } + + SECTION("Can migrate") + { + // Update host resources so that two migration opportunities appear in + // different hosts. + std::vector newSlots = { 2, 2, 1, 1 }; + std::vector newUsedSlots = { 1, 1, 1, 1 }; + setHostResources(hosts, newSlots, newUsedSlots); + + // Build expected result + faabric::PendingMigrations expected; + expected.set_appid(appId); + // Migrate last message (scheduled to last host) to first host. This + // fills up the first host. + auto* migration1 = expected.add_migrations(); + migration1->set_messageid(req->messages().at(3).id()); + migration1->set_srchost(hosts.at(3)); + migration1->set_dsthost(hosts.at(0)); + // Migrate penultimate message (scheduled to penultimate host) to second + // host. This fills up the second host. 
+ auto* migration2 = expected.add_migrations(); + migration2->set_messageid(req->messages().at(2).id()); + migration2->set_srchost(hosts.at(2)); + migration2->set_dsthost(hosts.at(1)); + expectedMigrations = + std::make_shared(expected); + } + + sch.checkForMigrationOpportunities(); + + auto actualMigrations = sch.canAppBeMigrated(appId); + if (expectedMigrations == nullptr) { + REQUIRE(actualMigrations == expectedMigrations); + } else { + REQUIRE(actualMigrations->appid() == expectedMigrations->appid()); + REQUIRE(actualMigrations->migrations_size() == + expectedMigrations->migrations_size()); + for (int i = 0; i < actualMigrations->migrations_size(); i++) { + auto actual = actualMigrations->mutable_migrations()->at(i); + auto expected = expectedMigrations->mutable_migrations()->at(i); + REQUIRE(actual.messageid() == expected.messageid()); + REQUIRE(actual.srchost() == expected.srchost()); + REQUIRE(actual.dsthost() == expected.dsthost()); + } + } + + faabric::Message res = + sch.getFunctionResult(req->messages().at(0).id(), 2 * timeToSleep); + REQUIRE(res.returnvalue() == 0); + + // Check that after the result is set, the app can't be migrated no more + sch.checkForMigrationOpportunities(); + REQUIRE(sch.canAppBeMigrated(appId) == nullptr); +} } From ff175dfb6c247a058a99fdd68655ae8ed8f9ec32 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Sat, 8 Jan 2022 15:44:41 +0000 Subject: [PATCH 04/26] scheduler: start/stop migration thread and fix race condition when removing from in-flight map --- include/faabric/scheduler/Scheduler.h | 2 ++ src/scheduler/Scheduler.cpp | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index d7b9efa35..2d78f15dd 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -306,6 +307,7 @@ class Scheduler 
faabric::transport::PointToPointBroker& broker; // ---- Function migration ---- + FunctionMigrationThread functionMigrationThread; std::unordered_map inFlightRequests; std::unordered_map> pendingMigrations; diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 1697f1d68..663fc2ac0 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -138,6 +138,7 @@ void Scheduler::reset() pushedSnapshotsMap.clear(); // Reset function migration tracking + functionMigrationThread.stop(); inFlightRequests.clear(); pendingMigrations.clear(); @@ -461,18 +462,17 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( auto decisionPtr = std::make_shared(decision); inFlightRequests[decision.appId] = std::make_pair(req, decisionPtr); - /* if (inFlightRequests.size() == 1) { functionMigrationThread.start(firstMsg.migrationcheckperiod()); } else if (firstMsg.migrationcheckperiod() != functionMigrationThread.wakeUpPeriodSeconds) { - SPDLOG_WARN("Ignoring migration check period as the migration" - "thread was initialised with a different one." - "(provided: {}, current: {})", + SPDLOG_WARN("Ignoring migration check period for app {} as the" + "migration thread is already running with a different" + " check period (provided: {}, current: {})", + firstMsg.appid(), firstMsg.migrationcheckperiod(), functionMigrationThread.wakeUpPeriodSeconds); } - */ } // NOTE: we want to schedule things on this host _last_, otherwise functions @@ -909,10 +909,6 @@ void Scheduler::setFunctionResult(faabric::Message& msg) throw std::runtime_error("Result key empty. 
Cannot publish result"); } - // Write the successful result to the result queue - std::vector inputData = faabric::util::messageToBytes(msg); - redis.publishSchedulerResult(key, msg.statuskey(), inputData); - // Remove the app from in-flight map if still there, and this host is the // master host for the message if (msg.masterhost() == thisHost) { @@ -923,9 +919,13 @@ void Scheduler::setFunctionResult(faabric::Message& msg) // If there are no more apps to track, stop the thread checking for // migration opportunities if (inFlightRequests.size() == 0) { - // functionMigrationThread.stop(); + functionMigrationThread.stop(); } } + + // Write the successful result to the result queue + std::vector inputData = faabric::util::messageToBytes(msg); + redis.publishSchedulerResult(key, msg.statuskey(), inputData); } void Scheduler::registerThread(uint32_t msgId) From aa9fcc8ac305b8856c9db5308eb28ed516a33763 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Sat, 8 Jan 2022 19:15:08 +0000 Subject: [PATCH 05/26] tests: add test for function migration thread and fix bug in thread shutdown --- include/faabric/scheduler/Scheduler.h | 2 +- src/scheduler/FunctionMigrationThread.cpp | 20 +++- src/scheduler/Scheduler.cpp | 21 ++-- .../scheduler/test_function_migration.cpp | 110 +++++++++++++++--- 4 files changed, 121 insertions(+), 32 deletions(-) diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index 2d78f15dd..200b805f6 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -228,7 +228,7 @@ class Scheduler // ---------------------------------- // Function Migration // ---------------------------------- - void checkForMigrationOpportunities( + bool checkForMigrationOpportunities( faabric::util::MigrationStrategy = faabric::util::MigrationStrategy::BIN_PACK); diff --git a/src/scheduler/FunctionMigrationThread.cpp b/src/scheduler/FunctionMigrationThread.cpp index 805f7f8a8..a1d35755d 100644 --- 
a/src/scheduler/FunctionMigrationThread.cpp +++ b/src/scheduler/FunctionMigrationThread.cpp @@ -6,7 +6,9 @@ namespace faabric::scheduler { void FunctionMigrationThread::start(int wakeUpPeriodSecondsIn) { + // Initialise wake up period and shutdown variable wakeUpPeriodSeconds = wakeUpPeriodSecondsIn; + isShutdown.store(false, std::memory_order_release); // Main work loop workThread = std::make_unique([&] { @@ -24,11 +26,17 @@ void FunctionMigrationThread::start(int wakeUpPeriodSecondsIn) lock, std::chrono::milliseconds(wakeUpPeriodSeconds * 1000)); // If we hit the timeout it means we have not been notified to - // reset or stop. Thus we check for migration oportunities. + // stop. Thus we check for migration opportunities. if (returnVal == std::cv_status::timeout) { - SPDLOG_DEBUG("Checking for migration oportunities"); - faabric::scheduler::getScheduler() - .checkForMigrationOpportunities(); + SPDLOG_DEBUG( + "Migration thread checking for migration opportunities"); + // If there are no more apps in-flight to be checked-for, the + // scheduler will return false, so we can shut down + bool shutdown = faabric::scheduler::getScheduler() + .checkForMigrationOpportunities(); + if (shutdown) { + isShutdown.store(true, std::memory_order_release); + } } }; @@ -42,7 +50,7 @@ void FunctionMigrationThread::stop() return; } - { + if (!isShutdown.load(std::memory_order_acquire)) { faabric::util::UniqueLock lock(mx); // We set the flag _before_ we notify and after we acquire the lock. 
@@ -55,5 +63,7 @@ void FunctionMigrationThread::stop() if (workThread->joinable()) { workThread->join(); } + + workThread = nullptr; } } diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 663fc2ac0..6f27e3e8b 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -916,11 +916,6 @@ void Scheduler::setFunctionResult(faabric::Message& msg) inFlightRequests.erase(msg.appid()); pendingMigrations.erase(msg.appid()); - // If there are no more apps to track, stop the thread checking for - // migration opportunities - if (inFlightRequests.size() == 0) { - functionMigrationThread.stop(); - } } // Write the successful result to the result queue @@ -1167,7 +1162,7 @@ ExecGraphNode Scheduler::getFunctionExecGraphNode(unsigned int messageId) return node; } -void Scheduler::checkForMigrationOpportunities( +bool Scheduler::checkForMigrationOpportunities( faabric::util::MigrationStrategy migrationStrategy) { // Vector to cache all migrations we have to do, and update the shared map @@ -1179,6 +1174,11 @@ void Scheduler::checkForMigrationOpportunities( { faabric::util::SharedLock lock(mx); + // If no in-flight requests, stop the background thread + if (inFlightRequests.size() == 0) { + return true; + } + // For each in-flight request that has opted in to be migrated, // check if there is an opportunity to migrate for (const auto& app : inFlightRequests) { @@ -1188,6 +1188,9 @@ void Scheduler::checkForMigrationOpportunities( // If we have already recorded a pending migration for this req, // skip if (canAppBeMigrated(originalDecision.appId) != nullptr) { + SPDLOG_TRACE("Skipping app {} as migration opportunity has " + "already been recorded", + originalDecision.appId); continue; } @@ -1216,9 +1219,6 @@ void Scheduler::checkForMigrationOpportunities( }; auto claimSlot = [&r]() { int currentUsedSlots = r.usedslots(); - SPDLOG_INFO("Old slots: {} - New slots: {}", - currentUsedSlots, - currentUsedSlots + 1); 
r.set_usedslots(currentUsedSlots + 1); }; while (left < right) { @@ -1282,10 +1282,11 @@ void Scheduler::checkForMigrationOpportunities( if (tmpPendingMigrations.size() > 0) { faabric::util::FullLock lock(mx); for (auto msgPtr : tmpPendingMigrations) { - SPDLOG_INFO("Adding app: {}", msgPtr->appid()); pendingMigrations[msgPtr->appid()] = std::move(msgPtr); } } + + return false; } std::shared_ptr Scheduler::canAppBeMigrated( diff --git a/tests/test/scheduler/test_function_migration.cpp b/tests/test/scheduler/test_function_migration.cpp index 7d2090a08..022b9cb49 100644 --- a/tests/test/scheduler/test_function_migration.cpp +++ b/tests/test/scheduler/test_function_migration.cpp @@ -25,7 +25,6 @@ class FunctionMigrationTestFixture : public SchedulerTestFixture ~FunctionMigrationTestFixture() { - sch.clearRecordedMessages(); faabric::util::setMockMode(false); // Remove all hosts from global set @@ -89,7 +88,7 @@ TEST_CASE_METHOD(FunctionMigrationTestFixture, TEST_CASE_METHOD( FunctionMigrationTestFixture, - "Test function migration thread only works if set in the message", + "Test migration oportunities are only detected if set in the message", "[scheduler]") { // First set resources before calling the functions: one will be allocated @@ -105,6 +104,7 @@ TEST_CASE_METHOD( req->mutable_messages()->at(0).set_inputdata(std::to_string(timeToSleep)); uint32_t appId = req->messages().at(0).appid(); + // Build expected pending migrations std::shared_ptr expectedMigrations; SECTION("Migration not enabled") { expectedMigrations = nullptr; } @@ -157,10 +157,9 @@ TEST_CASE_METHOD( REQUIRE(sch.canAppBeMigrated(appId) == nullptr); } -TEST_CASE_METHOD( - FunctionMigrationTestFixture, - "Test function migration thread detects migration opportunities", - "[scheduler]") +TEST_CASE_METHOD(FunctionMigrationTestFixture, + "Test checking for migration opportunities", + "[scheduler]") { std::vector hosts = { masterHost, "hostA" }; std::vector slots = { 1, 1 }; @@ -170,9 +169,12 @@ 
TEST_CASE_METHOD( auto req = faabric::util::batchExecFactory("foo", "sleep", 2); int timeToSleep = SHORT_TEST_TIMEOUT_MS; req->mutable_messages()->at(0).set_inputdata(std::to_string(timeToSleep)); - req->mutable_messages()->at(0).set_migrationcheckperiod(2); uint32_t appId = req->messages().at(0).appid(); + // By setting the check period to a non-zero value, we are effectively + // opting in to be considered for migration + req->mutable_messages()->at(0).set_migrationcheckperiod(2); + auto decision = sch.callFunctions(req); std::shared_ptr expectedMigrations; @@ -226,11 +228,11 @@ TEST_CASE_METHOD( TEST_CASE_METHOD( FunctionMigrationTestFixture, - "Test detecting migration opportunities with several hosts and requests", + "Test detecting migration opportunities for several messages and hosts", "[scheduler]") { - // First set resources before calling the functions: one will be allocated - // locally, another one in the remote host + // First set resources before calling the functions: one request will be + // allocated to each host std::vector hosts = { masterHost, "hostA", "hostB", "hostC" }; std::vector slots = { 1, 1, 1, 1 }; std::vector usedSlots = { 0, 0, 0, 0 }; @@ -239,9 +241,11 @@ TEST_CASE_METHOD( auto req = faabric::util::batchExecFactory("foo", "sleep", 4); int timeToSleep = SHORT_TEST_TIMEOUT_MS; req->mutable_messages()->at(0).set_inputdata(std::to_string(timeToSleep)); - req->mutable_messages()->at(0).set_migrationcheckperiod(2); uint32_t appId = req->messages().at(0).appid(); + // Opt in to be considered for migration + req->mutable_messages()->at(0).set_migrationcheckperiod(2); + auto decision = sch.callFunctions(req); // Set up expectations @@ -256,17 +260,17 @@ TEST_CASE_METHOD( std::vector newUsedSlots = { 1, 1, 1, 1 }; setHostResources(hosts, newSlots, newUsedSlots); - // Build expected result + // Build expected result: two migrations faabric::PendingMigrations expected; expected.set_appid(appId); - // Migrate last message (scheduled to last 
host) to first host. This - // fills up the first host. + // Migration 1: migrate last message (originally scheduled to last host) + // to first host. This fills up the first host. auto* migration1 = expected.add_migrations(); migration1->set_messageid(req->messages().at(3).id()); migration1->set_srchost(hosts.at(3)); migration1->set_dsthost(hosts.at(0)); - // Migrate penultimate message (scheduled to penultimate host) to second - // host. This fills up the second host. + // Migration 2: migrate penultimate message (originally scheduled to + // penultimate host) to second host. This fills up the second host. auto* migration2 = expected.add_migrations(); migration2->set_messageid(req->messages().at(2).id()); migration2->set_srchost(hosts.at(2)); @@ -301,4 +305,78 @@ TEST_CASE_METHOD( sch.checkForMigrationOpportunities(); REQUIRE(sch.canAppBeMigrated(appId) == nullptr); } + +TEST_CASE_METHOD( + FunctionMigrationTestFixture, + "Test function migration thread detects migration opportunities", + "[scheduler]") +{ + std::vector hosts = { masterHost, "hostA" }; + std::vector slots = { 1, 1 }; + std::vector usedSlots = { 0, 0 }; + setHostResources(hosts, slots, usedSlots); + + auto req = faabric::util::batchExecFactory("foo", "sleep", 2); + int checkPeriodSecs = 1; + int timeToSleep = 4 * checkPeriodSecs * 1000; + req->mutable_messages()->at(0).set_inputdata(std::to_string(timeToSleep)); + uint32_t appId = req->messages().at(0).appid(); + + // Opt in to be migrated + req->mutable_messages()->at(0).set_migrationcheckperiod(checkPeriodSecs); + + auto decision = sch.callFunctions(req); + + std::shared_ptr expectedMigrations; + + SECTION("Can not migrate") { expectedMigrations = nullptr; } + + // As we don't update the available resources, no migration opportunities + // will appear, even though we are checking for them + SECTION("Can migrate") + { + // Update host resources so that a migration opportunity appears + updateLocalResources(2, 1); + + // Build expected result + 
faabric::PendingMigrations expected; + expected.set_appid(appId); + auto* migration = expected.add_migrations(); + migration->set_messageid(req->messages().at(1).id()); + migration->set_srchost(hosts.at(1)); + migration->set_dsthost(hosts.at(0)); + expectedMigrations = + std::make_shared(expected); + } + + // Instead of directly calling the scheduler function to check for migration + // opportunites, sleep for enough time (twice the check period) so that a + // migration is detected by the background thread. + SLEEP_MS(2 * checkPeriodSecs * 1000); + + auto actualMigrations = sch.canAppBeMigrated(appId); + if (expectedMigrations == nullptr) { + REQUIRE(actualMigrations == expectedMigrations); + } else { + REQUIRE(actualMigrations->appid() == expectedMigrations->appid()); + REQUIRE(actualMigrations->migrations_size() == + expectedMigrations->migrations_size()); + for (int i = 0; i < actualMigrations->migrations_size(); i++) { + auto actual = actualMigrations->mutable_migrations()->at(i); + auto expected = expectedMigrations->mutable_migrations()->at(i); + REQUIRE(actual.messageid() == expected.messageid()); + REQUIRE(actual.srchost() == expected.srchost()); + REQUIRE(actual.dsthost() == expected.dsthost()); + } + } + + faabric::Message res = + sch.getFunctionResult(req->messages().at(0).id(), 2 * timeToSleep); + REQUIRE(res.returnvalue() == 0); + + // Check that after the result is set, the app can't be migrated no more + sch.checkForMigrationOpportunities(); + REQUIRE(sch.canAppBeMigrated(appId) == nullptr); +} + } From 4efe4ca8b441f73d645597137a310a3fa044c1fb Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Sun, 9 Jan 2022 14:52:53 +0000 Subject: [PATCH 06/26] util: add method to deep copy a faabric message + test --- include/faabric/util/message.h | 7 ++++ src/util/CMakeLists.txt | 1 + src/util/message.cpp | 63 ++++++++++++++++++++++++++++++++ tests/test/util/test_message.cpp | 51 ++++++++++++++++++++++++++ tests/utils/message_utils.cpp | 2 + 5 files changed, 
124 insertions(+) create mode 100644 include/faabric/util/message.h create mode 100644 src/util/message.cpp create mode 100644 tests/test/util/test_message.cpp diff --git a/include/faabric/util/message.h b/include/faabric/util/message.h new file mode 100644 index 000000000..dfa802228 --- /dev/null +++ b/include/faabric/util/message.h @@ -0,0 +1,7 @@ +#pragma once + +#include + +namespace faabric::util { +void copyMessage(const faabric::Message* src, faabric::Message* dst); +} diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index e891fd668..43569cbc5 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -17,6 +17,7 @@ faabric_lib(util locks.cpp logging.cpp memory.cpp + message.cpp network.cpp queue.cpp random.cpp diff --git a/src/util/message.cpp b/src/util/message.cpp new file mode 100644 index 000000000..4cd19dd35 --- /dev/null +++ b/src/util/message.cpp @@ -0,0 +1,63 @@ +#include + +namespace faabric::util { +void copyMessage(const faabric::Message* src, faabric::Message* dst) +{ + dst->set_id(src->id()); + dst->set_appid(src->appid()); + dst->set_appidx(src->appidx()); + dst->set_masterhost(src->masterhost()); + + dst->set_type(src->type()); + dst->set_user(src->user()); + dst->set_function(src->function()); + + dst->set_inputdata(src->inputdata()); + dst->set_outputdata(src->outputdata()); + + dst->set_funcptr(src->funcptr()); + dst->set_returnvalue(src->returnvalue()); + + dst->set_snapshotkey(src->snapshotkey()); + + dst->set_timestamp(src->timestamp()); + dst->set_resultkey(src->resultkey()); + dst->set_executeslocally(src->executeslocally()); + dst->set_statuskey(src->statuskey()); + + dst->set_executedhost(src->executedhost()); + dst->set_finishtimestamp(src->finishtimestamp()); + + dst->set_isasync(src->isasync()); + dst->set_ispython(src->ispython()); + dst->set_isstatusrequest(src->isstatusrequest()); + dst->set_isexecgraphrequest(src->isexecgraphrequest()); + + dst->set_pythonuser(src->pythonuser()); + 
dst->set_pythonfunction(src->pythonfunction()); + dst->set_pythonentry(src->pythonentry()); + + dst->set_groupid(src->groupid()); + dst->set_groupidx(src->groupidx()); + dst->set_groupsize(src->groupsize()); + + dst->set_ismpi(src->ismpi()); + dst->set_mpiworldid(src->mpiworldid()); + dst->set_mpirank(src->mpirank()); + dst->set_mpiworldsize(src->mpiworldsize()); + + dst->set_cmdline(src->cmdline()); + + dst->set_recordexecgraph(src->recordexecgraph()); + + dst->set_issgx(src->issgx()); + + dst->set_sgxsid(src->sgxsid()); + dst->set_sgxnonce(src->sgxnonce()); + dst->set_sgxtag(src->sgxtag()); + dst->set_sgxpolicy(src->sgxpolicy()); + dst->set_sgxresult(src->sgxresult()); + + dst->set_migrationcheckperiod(src->migrationcheckperiod()); +} +} diff --git a/tests/test/util/test_message.cpp b/tests/test/util/test_message.cpp new file mode 100644 index 000000000..3627582eb --- /dev/null +++ b/tests/test/util/test_message.cpp @@ -0,0 +1,51 @@ +#include + +#include + +#include + +using namespace faabric::util; + +namespace tests { +TEST_CASE("Test copying faabric message", "[util]") +{ + faabric::Message msg; + msg.set_type(faabric::Message_MessageType_FLUSH); + msg.set_user("user 1"); + msg.set_function("great function"); + msg.set_executedhost("blah.host.blah"); + msg.set_finishtimestamp(123456543); + + msg.set_pythonuser("py user"); + msg.set_pythonfunction("py func"); + msg.set_pythonentry("py entry"); + + msg.set_isasync(true); + msg.set_ispython(true); + msg.set_isstatusrequest(true); + msg.set_isexecgraphrequest(true); + + msg.set_ismpi(true); + msg.set_mpiworldid(1234); + msg.set_mpirank(5678); + msg.set_mpiworldsize(33); + + msg.set_cmdline("some cmdline"); + + msg.set_issgx(true); + msg.set_sgxsid("test sid string"); + msg.set_sgxnonce("test nonce string"); + msg.set_sgxtag("test tag string"); + msg.set_sgxpolicy("test policy string"); + msg.set_sgxresult("test result string"); + + msg.set_recordexecgraph(true); + + msg.set_migrationcheckperiod(33); + + 
faabric::Message msgCopy; + copyMessage(&msg, &msgCopy); + + checkMessageEquality(msg, msgCopy); +} +} diff --git a/tests/utils/message_utils.cpp b/tests/utils/message_utils.cpp index 58924958c..d3189ec18 100644 --- a/tests/utils/message_utils.cpp +++ b/tests/utils/message_utils.cpp @@ -52,5 +52,7 @@ void checkMessageEquality(const faabric::Message& msgA, checkMessageMapEquality(msgA.execgraphdetails(), msgB.execgraphdetails()); checkMessageMapEquality(msgA.intexecgraphdetails(), msgB.intexecgraphdetails()); + + REQUIRE(msgA.migrationcheckperiod() == msgB.migrationcheckperiod()); } } From 13560b34a2051ac6a23e7f4317c20e4589f30b4d Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Sun, 9 Jan 2022 14:54:46 +0000 Subject: [PATCH 07/26] function call server: add call to add a pending migration to remote hosts --- include/faabric/scheduler/FunctionCallApi.h | 3 +- .../faabric/scheduler/FunctionCallClient.h | 6 + .../faabric/scheduler/FunctionCallServer.h | 4 + include/faabric/scheduler/Scheduler.h | 16 +- src/proto/faabric.proto | 2 +- src/scheduler/FunctionCallClient.cpp | 28 ++ src/scheduler/FunctionCallServer.cpp | 16 + src/scheduler/FunctionMigrationThread.cpp | 9 +- src/scheduler/Scheduler.cpp | 279 +++++++++++------- .../scheduler/test_function_migration.cpp | 175 +++++------ 10 files changed, 315 insertions(+), 223 deletions(-) diff --git a/include/faabric/scheduler/FunctionCallApi.h b/include/faabric/scheduler/FunctionCallApi.h index eb9b83449..53c037ffc 100644 --- a/include/faabric/scheduler/FunctionCallApi.h +++ b/include/faabric/scheduler/FunctionCallApi.h @@ -7,6 +7,7 @@ enum FunctionCalls ExecuteFunctions = 1, Flush = 2, Unregister = 3, - GetResources = 4 + GetResources = 4, + AddPendingMigration = 5 }; } diff --git a/include/faabric/scheduler/FunctionCallClient.h b/include/faabric/scheduler/FunctionCallClient.h index 921139c86..117a6430c 100644 --- a/include/faabric/scheduler/FunctionCallClient.h +++ b/include/faabric/scheduler/FunctionCallClient.h @@ 
-22,6 +22,9 @@ getBatchRequests(); std::vector> getResourceRequests(); +std::vector>> +getAddPendingMigrationRequests(); + std::vector> getUnregisterRequests(); @@ -42,6 +45,9 @@ class FunctionCallClient : public faabric::transport::MessageEndpointClient faabric::HostResources getResources(); + void sendAddPendingMigration( + std::shared_ptr req); + void executeFunctions(std::shared_ptr req); void unregister(faabric::UnregisterRequest& req); diff --git a/include/faabric/scheduler/FunctionCallServer.h b/include/faabric/scheduler/FunctionCallServer.h index 23227c43c..39403b31e 100644 --- a/include/faabric/scheduler/FunctionCallServer.h +++ b/include/faabric/scheduler/FunctionCallServer.h @@ -29,6 +29,10 @@ class FunctionCallServer final const uint8_t* buffer, size_t bufferSize); + std::unique_ptr recvAddPendingMigration( + const uint8_t* buffer, + size_t bufferSize); + void recvExecuteFunctions(const uint8_t* buffer, size_t bufferSize); void recvUnregister(const uint8_t* buffer, size_t bufferSize); diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index 200b805f6..56ff78396 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -228,13 +228,15 @@ class Scheduler // ---------------------------------- // Function Migration // ---------------------------------- - bool checkForMigrationOpportunities( - faabric::util::MigrationStrategy = - faabric::util::MigrationStrategy::BIN_PACK); + void checkForMigrationOpportunities(); std::shared_ptr canAppBeMigrated( uint32_t appId); + void addPendingMigration(std::shared_ptr msg); + + void removePendingMigration(uint32_t appId); + private: std::string thisHost; @@ -311,6 +313,14 @@ class Scheduler std::unordered_map inFlightRequests; std::unordered_map> pendingMigrations; + + std::vector> + doCheckForMigrationOpportunities( + faabric::util::MigrationStrategy migrationStrategy = + faabric::util::MigrationStrategy::BIN_PACK); + + void 
broadcastAddPendingMigration( + std::shared_ptr pendingMigrations); }; } diff --git a/src/proto/faabric.proto b/src/proto/faabric.proto index 7382b9fe7..d6f4e17a1 100644 --- a/src/proto/faabric.proto +++ b/src/proto/faabric.proto @@ -255,7 +255,7 @@ message PendingMigrations { int32 groupId = 2; message PendingMigration { - int32 messageId = 1; + Message msg = 1; string srcHost = 2; string dstHost = 3; } diff --git a/src/scheduler/FunctionCallClient.cpp b/src/scheduler/FunctionCallClient.cpp index db7490ef5..5c67d2406 100644 --- a/src/scheduler/FunctionCallClient.cpp +++ b/src/scheduler/FunctionCallClient.cpp @@ -28,6 +28,10 @@ static std::unordered_map> queuedResourceResponses; +static std::vector< + std::pair>> + addPendingMigrationRequests; + static std::vector> unregisterRequests; @@ -57,6 +61,13 @@ std::vector> getResourceRequests() return resourceRequests; } +std::vector>> +getAddPendingMigrationRequests() +{ + faabric::util::UniqueLock lock(mockMutex); + return addPendingMigrationRequests; +} + std::vector> getUnregisterRequests() { @@ -76,6 +87,7 @@ void clearMockRequests() functionCalls.clear(); batchMessages.clear(); resourceRequests.clear(); + addPendingMigrationRequests.clear(); unregisterRequests.clear(); for (auto& p : queuedResourceResponses) { @@ -128,6 +140,22 @@ faabric::HostResources FunctionCallClient::getResources() return response; } +void FunctionCallClient::sendAddPendingMigration( + std::shared_ptr req) +{ + faabric::PendingMigrations request; + faabric::EmptyResponse response; + + if (faabric::util::isMockMode()) { + faabric::util::UniqueLock lock(mockMutex); + addPendingMigrationRequests.emplace_back(host, req); + } else { + syncSend(faabric::scheduler::FunctionCalls::AddPendingMigration, + req.get(), + &response); + } +} + void FunctionCallClient::executeFunctions( const std::shared_ptr req) { diff --git a/src/scheduler/FunctionCallServer.cpp b/src/scheduler/FunctionCallServer.cpp index 2e0604baf..e686e834e 100644 --- 
a/src/scheduler/FunctionCallServer.cpp +++ b/src/scheduler/FunctionCallServer.cpp @@ -49,6 +49,9 @@ std::unique_ptr FunctionCallServer::doSyncRecv( case faabric::scheduler::FunctionCalls::GetResources: { return recvGetResources(buffer, bufferSize); } + case faabric::scheduler::FunctionCalls::AddPendingMigration: { + return recvAddPendingMigration(buffer, bufferSize); + } default: { throw std::runtime_error( fmt::format("Unrecognized sync call header: {}", header)); @@ -100,4 +103,17 @@ std::unique_ptr FunctionCallServer::recvGetResources( scheduler.getThisHostResources()); return response; } + +std::unique_ptr +FunctionCallServer::recvAddPendingMigration(const uint8_t* buffer, + size_t bufferSize) +{ + PARSE_MSG(faabric::PendingMigrations, buffer, bufferSize); + + auto msgPtr = std::make_shared(msg); + + scheduler.addPendingMigration(msgPtr); + + return std::make_unique(); +} } diff --git a/src/scheduler/FunctionMigrationThread.cpp b/src/scheduler/FunctionMigrationThread.cpp index a1d35755d..a13274464 100644 --- a/src/scheduler/FunctionMigrationThread.cpp +++ b/src/scheduler/FunctionMigrationThread.cpp @@ -30,13 +30,8 @@ void FunctionMigrationThread::start(int wakeUpPeriodSecondsIn) if (returnVal == std::cv_status::timeout) { SPDLOG_DEBUG( "Migration thread checking for migration opportunities"); - // If there are no more apps in-flight to be checked-for, the - // scheduler will return false, so we can shut down - bool shutdown = faabric::scheduler::getScheduler() - .checkForMigrationOpportunities(); - if (shutdown) { - isShutdown.store(true, std::memory_order_release); - } + faabric::scheduler::getScheduler() + .checkForMigrationOpportunities(); } }; diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 6f27e3e8b..9133a838f 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -120,6 +121,8 @@ void Scheduler::reset() } 
deadExecutorsList.clear(); + functionMigrationThread.stop(); + faabric::util::FullLock lock(mx); executors.clear(); deadExecutors.clear(); @@ -138,7 +141,6 @@ void Scheduler::reset() pushedSnapshotsMap.clear(); // Reset function migration tracking - functionMigrationThread.stop(); inFlightRequests.clear(); pendingMigrations.clear(); @@ -458,6 +460,10 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( } // Record in-flight request if function desires to be migrated + // NOTE: ideally, instead of allowing the applications to specify a check + // period, we would have a default one (overwritable through an env. + // variable), and apps would just opt in/out of being migrated. We set + // the actual check period instead to ease with experiments. if (firstMsg.migrationcheckperiod() > 0) { auto decisionPtr = std::make_shared(decision); @@ -912,10 +918,7 @@ void Scheduler::setFunctionResult(faabric::Message& msg) // Remove the app from in-flight map if still there, and this host is the // master host for the message if (msg.masterhost() == thisHost) { - faabric::util::FullLock lock(mx); - - inFlightRequests.erase(msg.appid()); - pendingMigrations.erase(msg.appid()); + removePendingMigration(msg.appid()); } // Write the successful result to the result queue @@ -1162,131 +1165,48 @@ ExecGraphNode Scheduler::getFunctionExecGraphNode(unsigned int messageId) return node; } -bool Scheduler::checkForMigrationOpportunities( - faabric::util::MigrationStrategy migrationStrategy) +void Scheduler::checkForMigrationOpportunities() { - // Vector to cache all migrations we have to do, and update the shared map - // at the very end just once. This is because we need a unique lock to write - // to the shared map, but the rest of this method can do with a shared lock. 
std::vector> tmpPendingMigrations; { + // Acquire a shared lock to read from the in-flight requests map faabric::util::SharedLock lock(mx); - // If no in-flight requests, stop the background thread - if (inFlightRequests.size() == 0) { - return true; - } - - // For each in-flight request that has opted in to be migrated, - // check if there is an opportunity to migrate - for (const auto& app : inFlightRequests) { - auto req = app.second.first; - auto originalDecision = *app.second.second; - - // If we have already recorded a pending migration for this req, - // skip - if (canAppBeMigrated(originalDecision.appId) != nullptr) { - SPDLOG_TRACE("Skipping app {} as migration opportunity has " - "already been recorded", - originalDecision.appId); - continue; - } - - faabric::PendingMigrations msg; - msg.set_appid(originalDecision.appId); - // TODO - generate a new groupId here for processes to wait on - // during the migration? msg.set_groupid(); - - if (migrationStrategy == - faabric::util::MigrationStrategy::BIN_PACK) { - // We assume the batch was originally scheduled using - // bin-packing, thus the scheduling decision has at the begining - // (left) the hosts with the most allocated requests, and at the - // end (right) the hosts with the fewest. To check for migration - // oportunities, we compare a pointer to the possible - // destination of the migration (left), with one to the possible - // source of the migration (right). NOTE - this is a slight - // simplification, but makes the code simpler. - auto left = originalDecision.hosts.begin(); - auto right = originalDecision.hosts.end() - 1; - faabric::HostResources r = (*left == thisHost) - ? 
getThisHostResources() - : getHostResources(*left); - auto nAvailable = [&r]() -> int { - return r.slots() - r.usedslots(); - }; - auto claimSlot = [&r]() { - int currentUsedSlots = r.usedslots(); - r.set_usedslots(currentUsedSlots + 1); - }; - while (left < right) { - // If both pointers point to the same host, no migration - // opportunity, and must check another possible source of - // the migration - if (*left == *right) { - --right; - continue; - } - - // If the left pointer (possible destination of the - // migration) is out of available resources, no migration - // opportunity, and must check another possible destination - // of migration - if (nAvailable() == 0) { - auto oldHost = *left; - ++left; - if (*left != oldHost) { - r = (*left == thisHost) ? getThisHostResources() - : getHostResources(*left); - } - continue; - } - - // If each pointer points to a request scheduled in a - // different host, and the possible destination has slots, - // there is a migration opportunity - auto* migration = msg.add_migrations(); - auto msgIdPtr = - originalDecision.messageIds.begin() + - std::distance(originalDecision.hosts.begin(), right); - migration->set_messageid(*msgIdPtr); - migration->set_srchost(*right); - migration->set_dsthost(*left); - // Decrement by one the availability, and check for more - // possible sources of migration - claimSlot(); - --right; - } - } else { - SPDLOG_ERROR("Unrecognised migration strategy: {}", - migrationStrategy); - throw std::runtime_error("Unrecognised migration strategy."); - } - - if (msg.migrations_size() > 0) { - tmpPendingMigrations.emplace_back( - std::make_shared(msg)); - SPDLOG_DEBUG("Detected migration opportunity for app: {}", - msg.appid()); - } else { - SPDLOG_DEBUG("No migration opportunity detected for app: {}", - msg.appid()); - } - } + tmpPendingMigrations = doCheckForMigrationOpportunities(); } - // Finally, store all the pending migrations in the shared map acquiring - // a unique lock. 
+ // If we find migration opportunites if (tmpPendingMigrations.size() > 0) { + // Acquire full lock to write to the pending migrations map faabric::util::FullLock lock(mx); + for (auto msgPtr : tmpPendingMigrations) { + // First, broadcast the pending migrations to other hosts + broadcastAddPendingMigration(msgPtr); + // Second, update our local records pendingMigrations[msgPtr->appid()] = std::move(msgPtr); } } +} - return false; +void Scheduler::broadcastAddPendingMigration( + std::shared_ptr pendingMigrations) +{ + // Get all hosts for the to-be migrated app + auto msg = pendingMigrations->migrations().at(0).msg(); + std::string funcStr = faabric::util::funcToString(msg, false); + std::set& thisRegisteredHosts = registeredHosts[funcStr]; + + // Remove this host from the set + registeredHosts.erase(thisHost); + + // Send pending migrations to all involved hosts + for (auto& otherHost : thisRegisteredHosts) { + getFunctionCallClient(otherHost).sendAddPendingMigration( + pendingMigrations); + } } std::shared_ptr Scheduler::canAppBeMigrated( @@ -1300,4 +1220,135 @@ std::shared_ptr Scheduler::canAppBeMigrated( return pendingMigrations[appId]; } + +void Scheduler::addPendingMigration( + std::shared_ptr pMigration) +{ + faabric::util::FullLock lock(mx); + + auto msg = pMigration->migrations().at(0).msg(); + if (pendingMigrations.find(msg.appid()) != pendingMigrations.end()) { + SPDLOG_ERROR("Received remote request to add a pending migration for " + "app {}, but already recorded another migration request" + " for the same app.", + msg.appid()); + throw std::runtime_error("Remote request for app already there"); + } + + pendingMigrations[msg.appid()] = pMigration; +} + +void Scheduler::removePendingMigration(uint32_t appId) +{ + faabric::util::FullLock lock(mx); + + inFlightRequests.erase(appId); + pendingMigrations.erase(appId); +} + +std::vector> +Scheduler::doCheckForMigrationOpportunities( + faabric::util::MigrationStrategy migrationStrategy) +{ + std::vector> + 
pendingMigrationsVec; + + // For each in-flight request that has opted in to be migrated, + // check if there is an opportunity to migrate + for (const auto& app : inFlightRequests) { + auto req = app.second.first; + auto originalDecision = *app.second.second; + + // If we have already recorded a pending migration for this req, + // skip + if (canAppBeMigrated(originalDecision.appId) != nullptr) { + SPDLOG_TRACE("Skipping app {} as migration opportunity has " + "already been recorded", + originalDecision.appId); + continue; + } + + faabric::PendingMigrations msg; + msg.set_appid(originalDecision.appId); + + if (migrationStrategy == faabric::util::MigrationStrategy::BIN_PACK) { + // We assume the batch was originally scheduled using + // bin-packing, thus the scheduling decision has at the begining + // (left) the hosts with the most allocated requests, and at the + // end (right) the hosts with the fewest. To check for migration + // oportunities, we compare a pointer to the possible + // destination of the migration (left), with one to the possible + // source of the migration (right). NOTE - this is a slight + // simplification, but makes the code simpler. + auto left = originalDecision.hosts.begin(); + auto right = originalDecision.hosts.end() - 1; + faabric::HostResources r = (*left == thisHost) + ? 
getThisHostResources() + : getHostResources(*left); + auto nAvailable = [&r]() -> int { + return r.slots() - r.usedslots(); + }; + auto claimSlot = [&r]() { + int currentUsedSlots = r.usedslots(); + r.set_usedslots(currentUsedSlots + 1); + }; + while (left < right) { + // If both pointers point to the same host, no migration + // opportunity, and must check another possible source of + // the migration + if (*left == *right) { + --right; + continue; + } + + // If the left pointer (possible destination of the + // migration) is out of available resources, no migration + // opportunity, and must check another possible destination + // of migration + if (nAvailable() == 0) { + auto oldHost = *left; + ++left; + if (*left != oldHost) { + r = (*left == thisHost) ? getThisHostResources() + : getHostResources(*left); + } + continue; + } + + // If each pointer points to a request scheduled in a + // different host, and the possible destination has slots, + // there is a migration opportunity + auto* migration = msg.add_migrations(); + migration->set_srchost(*right); + migration->set_dsthost(*left); + + faabric::Message* msgPtr = + &(*(req->mutable_messages()->begin() + + std::distance(originalDecision.hosts.begin(), right))); + auto* migrationMsgPtr = migration->mutable_msg(); + faabric::util::copyMessage(msgPtr, migrationMsgPtr); + // Decrement by one the availability, and check for more + // possible sources of migration + claimSlot(); + --right; + } + } else { + SPDLOG_ERROR("Unrecognised migration strategy: {}", + migrationStrategy); + throw std::runtime_error("Unrecognised migration strategy."); + } + + if (msg.migrations_size() > 0) { + pendingMigrationsVec.emplace_back( + std::make_shared(msg)); + SPDLOG_DEBUG("Detected migration opportunity for app: {}", + msg.appid()); + } else { + SPDLOG_DEBUG("No migration opportunity detected for app: {}", + msg.appid()); + } + } + + return pendingMigrationsVec; +} } diff --git a/tests/test/scheduler/test_function_migration.cpp 
b/tests/test/scheduler/test_function_migration.cpp index 022b9cb49..5f612e379 100644 --- a/tests/test/scheduler/test_function_migration.cpp +++ b/tests/test/scheduler/test_function_migration.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include using namespace faabric::scheduler; @@ -72,6 +73,62 @@ class FunctionMigrationTestFixture : public SchedulerTestFixture r.set_usedslots(usedSlots); sch.setThisHostResources(r); } + + std::shared_ptr + buildPendingMigrationsExpectation( + std::shared_ptr req, + std::vector hosts, + std::vector> migrations) + { + faabric::PendingMigrations expected; + expected.set_appid(req->messages().at(0).appid()); + + for (auto pair : migrations) { + auto* migration = expected.add_migrations(); + auto* migrationMsg = migration->mutable_msg(); + faabric::util::copyMessage(&req->mutable_messages()->at(pair.first), + migrationMsg); + migration->set_srchost(hosts.at(pair.first)); + migration->set_dsthost(hosts.at(pair.second)); + } + + return std::make_shared(expected); + } + + void checkPendingMigrationsExpectation( + std::shared_ptr expectedMigrations, + std::shared_ptr actualMigrations, + std::vector hosts) + { + if (expectedMigrations == nullptr) { + REQUIRE(actualMigrations == expectedMigrations); + } else { + // Check actual migration matches expectation + REQUIRE(actualMigrations->appid() == expectedMigrations->appid()); + REQUIRE(actualMigrations->migrations_size() == + expectedMigrations->migrations_size()); + for (int i = 0; i < actualMigrations->migrations_size(); i++) { + auto actual = actualMigrations->mutable_migrations()->at(i); + auto expected = expectedMigrations->mutable_migrations()->at(i); + REQUIRE(actual.msg().id() == expected.msg().id()); + REQUIRE(actual.srchost() == expected.srchost()); + REQUIRE(actual.dsthost() == expected.dsthost()); + } + + // Check we have sent a message to all other hosts with the pending + // migration + auto pendingRequests = getAddPendingMigrationRequests(); + 
REQUIRE(pendingRequests.size() == hosts.size() - 1); + for (auto& pendingReq : getAddPendingMigrationRequests()) { + std::string host = pendingReq.first; + std::shared_ptr migration = + pendingReq.second; + auto it = std::find(hosts.begin(), hosts.end(), host); + REQUIRE(it != hosts.end()); + REQUIRE(migration == actualMigrations); + } + } + } }; TEST_CASE_METHOD(FunctionMigrationTestFixture, @@ -98,7 +155,8 @@ TEST_CASE_METHOD( std::vector usedSlots = { 0, 0 }; setHostResources(hosts, slots, usedSlots); - // The sleep function sleeps for a set timeout before returning + // Second, prepare the request we will migrate in-flight. + // NOTE: the sleep function sleeps for a set timeout before returning. auto req = faabric::util::batchExecFactory("foo", "sleep", 2); int timeToSleep = SHORT_TEST_TIMEOUT_MS; req->mutable_messages()->at(0).set_inputdata(std::to_string(timeToSleep)); @@ -113,15 +171,10 @@ TEST_CASE_METHOD( // Set to a non-zero value so that migration is enabled req->mutable_messages()->at(0).set_migrationcheckperiod(2); - // Build expected result - faabric::PendingMigrations expected; - expected.set_appid(appId); - auto* migration = expected.add_migrations(); - migration->set_messageid(req->messages().at(1).id()); - migration->set_srchost(hosts.at(1)); - migration->set_dsthost(hosts.at(0)); + // Build expected migrations + std::vector> migrations = { { 1, 0 } }; expectedMigrations = - std::make_shared(expected); + buildPendingMigrationsExpectation(req, hosts, migrations); } auto decision = sch.callFunctions(req); @@ -133,20 +186,8 @@ TEST_CASE_METHOD( sch.checkForMigrationOpportunities(); auto actualMigrations = sch.canAppBeMigrated(appId); - if (expectedMigrations == nullptr) { - REQUIRE(actualMigrations == expectedMigrations); - } else { - REQUIRE(actualMigrations->appid() == expectedMigrations->appid()); - REQUIRE(actualMigrations->migrations_size() == - expectedMigrations->migrations_size()); - for (int i = 0; i < actualMigrations->migrations_size(); 
i++) { - auto actual = actualMigrations->mutable_migrations()->at(i); - auto expected = expectedMigrations->mutable_migrations()->at(i); - REQUIRE(actual.messageid() == expected.messageid()); - REQUIRE(actual.srchost() == expected.srchost()); - REQUIRE(actual.dsthost() == expected.dsthost()); - } - } + checkPendingMigrationsExpectation( + expectedMigrations, actualMigrations, hosts); faabric::Message res = sch.getFunctionResult(req->messages().at(0).id(), 2 * timeToSleep); @@ -188,34 +229,17 @@ TEST_CASE_METHOD(FunctionMigrationTestFixture, // Update host resources so that a migration opportunity appears updateLocalResources(2, 1); - // Build expected result - faabric::PendingMigrations expected; - expected.set_appid(appId); - auto* migration = expected.add_migrations(); - migration->set_messageid(req->messages().at(1).id()); - migration->set_srchost(hosts.at(1)); - migration->set_dsthost(hosts.at(0)); + // Build expected migrations + std::vector> migrations = { { 1, 0 } }; expectedMigrations = - std::make_shared(expected); + buildPendingMigrationsExpectation(req, hosts, migrations); } sch.checkForMigrationOpportunities(); auto actualMigrations = sch.canAppBeMigrated(appId); - if (expectedMigrations == nullptr) { - REQUIRE(actualMigrations == expectedMigrations); - } else { - REQUIRE(actualMigrations->appid() == expectedMigrations->appid()); - REQUIRE(actualMigrations->migrations_size() == - expectedMigrations->migrations_size()); - for (int i = 0; i < actualMigrations->migrations_size(); i++) { - auto actual = actualMigrations->mutable_migrations()->at(i); - auto expected = expectedMigrations->mutable_migrations()->at(i); - REQUIRE(actual.messageid() == expected.messageid()); - REQUIRE(actual.srchost() == expected.srchost()); - REQUIRE(actual.dsthost() == expected.dsthost()); - } - } + checkPendingMigrationsExpectation( + expectedMigrations, actualMigrations, hosts); faabric::Message res = sch.getFunctionResult(req->messages().at(0).id(), 2 * timeToSleep); @@ 
-261,41 +285,16 @@ TEST_CASE_METHOD( setHostResources(hosts, newSlots, newUsedSlots); // Build expected result: two migrations - faabric::PendingMigrations expected; - expected.set_appid(appId); - // Migration 1: migrate last message (originally scheduled to last host) - // to first host. This fills up the first host. - auto* migration1 = expected.add_migrations(); - migration1->set_messageid(req->messages().at(3).id()); - migration1->set_srchost(hosts.at(3)); - migration1->set_dsthost(hosts.at(0)); - // Migration 2: migrate penultimate message (originally scheduled to - // penultimate host) to second host. This fills up the second host. - auto* migration2 = expected.add_migrations(); - migration2->set_messageid(req->messages().at(2).id()); - migration2->set_srchost(hosts.at(2)); - migration2->set_dsthost(hosts.at(1)); + std::vector> migrations = { { 3, 0 }, { 2, 1 } }; expectedMigrations = - std::make_shared(expected); + buildPendingMigrationsExpectation(req, hosts, migrations); } sch.checkForMigrationOpportunities(); auto actualMigrations = sch.canAppBeMigrated(appId); - if (expectedMigrations == nullptr) { - REQUIRE(actualMigrations == expectedMigrations); - } else { - REQUIRE(actualMigrations->appid() == expectedMigrations->appid()); - REQUIRE(actualMigrations->migrations_size() == - expectedMigrations->migrations_size()); - for (int i = 0; i < actualMigrations->migrations_size(); i++) { - auto actual = actualMigrations->mutable_migrations()->at(i); - auto expected = expectedMigrations->mutable_migrations()->at(i); - REQUIRE(actual.messageid() == expected.messageid()); - REQUIRE(actual.srchost() == expected.srchost()); - REQUIRE(actual.dsthost() == expected.dsthost()); - } - } + checkPendingMigrationsExpectation( + expectedMigrations, actualMigrations, hosts); faabric::Message res = sch.getFunctionResult(req->messages().at(0).id(), 2 * timeToSleep); @@ -338,15 +337,10 @@ TEST_CASE_METHOD( // Update host resources so that a migration opportunity appears 
updateLocalResources(2, 1); - // Build expected result - faabric::PendingMigrations expected; - expected.set_appid(appId); - auto* migration = expected.add_migrations(); - migration->set_messageid(req->messages().at(1).id()); - migration->set_srchost(hosts.at(1)); - migration->set_dsthost(hosts.at(0)); + // Build expected migrations + std::vector> migrations = { { 1, 0 } }; expectedMigrations = - std::make_shared(expected); + buildPendingMigrationsExpectation(req, hosts, migrations); } // Instead of directly calling the scheduler function to check for migration @@ -355,20 +349,8 @@ TEST_CASE_METHOD( SLEEP_MS(2 * checkPeriodSecs * 1000); auto actualMigrations = sch.canAppBeMigrated(appId); - if (expectedMigrations == nullptr) { - REQUIRE(actualMigrations == expectedMigrations); - } else { - REQUIRE(actualMigrations->appid() == expectedMigrations->appid()); - REQUIRE(actualMigrations->migrations_size() == - expectedMigrations->migrations_size()); - for (int i = 0; i < actualMigrations->migrations_size(); i++) { - auto actual = actualMigrations->mutable_migrations()->at(i); - auto expected = expectedMigrations->mutable_migrations()->at(i); - REQUIRE(actual.messageid() == expected.messageid()); - REQUIRE(actual.srchost() == expected.srchost()); - REQUIRE(actual.dsthost() == expected.dsthost()); - } - } + checkPendingMigrationsExpectation( + expectedMigrations, actualMigrations, hosts); faabric::Message res = sch.getFunctionResult(req->messages().at(0).id(), 2 * timeToSleep); @@ -378,5 +360,4 @@ TEST_CASE_METHOD( sch.checkForMigrationOpportunities(); REQUIRE(sch.canAppBeMigrated(appId) == nullptr); } - } From 0b02abaf7f3b495b4d71047e764eb8f8c2b2d9ad Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Sun, 9 Jan 2022 15:03:48 +0000 Subject: [PATCH 08/26] tests: add further testing --- tests/test/scheduler/test_executor.cpp | 1 - .../scheduler/test_function_migration.cpp | 25 +++++++++++++++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git 
a/tests/test/scheduler/test_executor.cpp b/tests/test/scheduler/test_executor.cpp index c456ef282..1c0e812c9 100644 --- a/tests/test/scheduler/test_executor.cpp +++ b/tests/test/scheduler/test_executor.cpp @@ -202,7 +202,6 @@ int32_t TestExecutor::executeTask( } if (msg.function() == "sleep") { - // Sleep for sufficiently more than the check period int timeToSleepMs = SHORT_TEST_TIMEOUT_MS; if (!msg.inputdata().empty()) { timeToSleepMs = std::stoi(msg.inputdata()); diff --git a/tests/test/scheduler/test_function_migration.cpp b/tests/test/scheduler/test_function_migration.cpp index 5f612e379..6c1c4f62d 100644 --- a/tests/test/scheduler/test_function_migration.cpp +++ b/tests/test/scheduler/test_function_migration.cpp @@ -328,10 +328,10 @@ TEST_CASE_METHOD( std::shared_ptr expectedMigrations; - SECTION("Can not migrate") { expectedMigrations = nullptr; } - // As we don't update the available resources, no migration opportunities // will appear, even though we are checking for them + SECTION("Can not migrate") { expectedMigrations = nullptr; } + SECTION("Can migrate") { // Update host resources so that a migration opportunity appears @@ -360,4 +360,25 @@ TEST_CASE_METHOD( sch.checkForMigrationOpportunities(); REQUIRE(sch.canAppBeMigrated(appId) == nullptr); } + +TEST_CASE_METHOD(FunctionMigrationTestFixture, + "Test adding and removing pending migrations manually", + "[scheduler]") +{ + auto req = faabric::util::batchExecFactory("foo", "sleep", 2); + uint32_t appId = req->messages().at(0).appid(); + std::vector hosts = { masterHost, "hostA" }; + std::vector> migrations = { { 1, 0 } }; + auto expectedMigrations = + buildPendingMigrationsExpectation(req, hosts, migrations); + + // Add migration manually + REQUIRE(sch.canAppBeMigrated(appId) == nullptr); + sch.addPendingMigration(expectedMigrations); + REQUIRE(sch.canAppBeMigrated(appId) == expectedMigrations); + + // Remove migration manually + sch.removePendingMigration(appId); + REQUIRE(sch.canAppBeMigrated(appId) == 
nullptr); +} } From 1b98e6b0e5eba0efbc9e90415c890a23fec29f02 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Mon, 10 Jan 2022 11:56:20 +0000 Subject: [PATCH 09/26] mpi: add migration points, link with executor, and tests --- include/faabric/scheduler/MpiWorld.h | 13 ++ include/faabric/scheduler/Scheduler.h | 5 + src/scheduler/Executor.cpp | 23 ++++ src/scheduler/MpiWorld.cpp | 116 +++++++++++++++++- src/scheduler/Scheduler.cpp | 41 ++++++- .../scheduler/test_function_migration.cpp | 99 ++++++++++++++- 6 files changed, 287 insertions(+), 10 deletions(-) diff --git a/include/faabric/scheduler/MpiWorld.h b/include/faabric/scheduler/MpiWorld.h index 4165cd4c8..56a1c01f3 100644 --- a/include/faabric/scheduler/MpiWorld.h +++ b/include/faabric/scheduler/MpiWorld.h @@ -31,6 +31,9 @@ std::vector getMpiHostsToRanksMessages(); std::vector> getMpiMockedMessages( int sendRank); +std::vector> +getMpiMockedPendingMigrations(); + typedef faabric::util::FixedCapacityQueue> InMemoryMpiQueue; @@ -271,6 +274,16 @@ class MpiWorld int recvRank, int batchSize = 0); + /* Function Migration */ + + void tryMigrate(int thisRank); + + void prepareMigration( + int thisRank, + std::shared_ptr pendingMigrations); + + void finishMigration(int thisRank); + /* Helper methods */ void checkRanksRange(int sendRank, int recvRank); diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index 56ff78396..ffbe3c361 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -82,6 +82,9 @@ class Executor faabric::Message& msg, bool createIfNotExists = false); + void doMigration( + std::shared_ptr pendingMigrations); + protected: virtual void restore(const std::string& snapshotKey); @@ -101,6 +104,8 @@ class Executor uint32_t threadPoolSize = 0; + void migrateFunction(const faabric::Message& msg, const std::string& host); + private: std::atomic claimed = false; diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp 
index 7576c6deb..ba272afa2 100644 --- a/src/scheduler/Executor.cpp +++ b/src/scheduler/Executor.cpp @@ -735,6 +735,29 @@ void Executor::releaseClaim() claimed.store(false); } +void Executor::doMigration( + std::shared_ptr pendingMigrations) +{ + for (int i = 0; i < pendingMigrations->migrations_size(); i++) { + auto m = pendingMigrations->mutable_migrations()->at(i); + if (m.msg().id() == boundMessage.id()) { + migrateFunction(m.msg(), m.dsthost()); + // TODO: terminate current executing thread + } + } +} + +void Executor::migrateFunction(const faabric::Message& msg, + const std::string& host) +{ + SPDLOG_INFO("Executor received request to migrate message {} from host {}" + " to host {}", + msg.id(), + msg.executedhost(), + host); + SPDLOG_ERROR("Executor::migrate() not implemented"); +} + // ------------------------------------------ // HOOKS // ------------------------------------------ diff --git a/src/scheduler/MpiWorld.cpp b/src/scheduler/MpiWorld.cpp index 6fac56a87..874bd3f85 100644 --- a/src/scheduler/MpiWorld.cpp +++ b/src/scheduler/MpiWorld.cpp @@ -53,6 +53,9 @@ static std::vector rankMessages; static std::map>> mpiMockedMessages; +static std::vector> + mpiMockedPendingMigrations; + std::vector getMpiHostsToRanksMessages() { faabric::util::UniqueLock lock(mockMutex); @@ -66,6 +69,13 @@ std::vector> getMpiMockedMessages( return mpiMockedMessages[sendRank]; } +std::vector> +getMpiMockedPendingMigrations() +{ + faabric::util::UniqueLock lock(mockMutex); + return mpiMockedPendingMigrations; +} + MpiWorld::MpiWorld() : thisHost(faabric::util::getSystemConfig().endpointHost) , basePort(faabric::util::getSystemConfig().mpiBasePort) @@ -222,10 +232,16 @@ void MpiWorld::create(faabric::Message& call, int newId, int newSize) msg.set_mpiworldid(id); msg.set_mpirank(i + 1); msg.set_mpiworldsize(size); - // Log chained functions to generate execution graphs - if (thisRankMsg != nullptr && thisRankMsg->recordexecgraph()) { - sch.logChainedFunction(call.id(), msg.id()); 
- msg.set_recordexecgraph(true); + if (thisRankMsg != nullptr) { + // Set message fields to allow for function migration + msg.set_appid(thisRankMsg->appid()); + msg.set_inputdata(thisRankMsg->inputdata()); + msg.set_migrationcheckperiod(thisRankMsg->migrationcheckperiod()); + // Log chained functions to generate execution graphs + if (thisRankMsg->recordexecgraph()) { + sch.logChainedFunction(call.id(), msg.id()); + msg.set_recordexecgraph(true); + } } } @@ -329,6 +345,7 @@ void MpiWorld::destroy() // Clear structures used for mocking rankMessages.clear(); mpiMockedMessages.clear(); + mpiMockedPendingMigrations.clear(); } void MpiWorld::initialiseFromMsg(faabric::Message& msg) @@ -1290,6 +1307,11 @@ void MpiWorld::allReduce(int rank, // Second, 0 broadcasts the result to all ranks broadcast( 0, rank, recvBuffer, datatype, count, faabric::MPIMessage::ALLREDUCE); + + // All reduce triggers a migration point in MPI + if (thisRankMsg != nullptr && thisRankMsg->migrationcheckperiod() > 0) { + tryMigrate(rank); + } } void MpiWorld::op_reduce(faabric_op_t* operation, @@ -1537,6 +1559,11 @@ void MpiWorld::barrier(int thisRank) broadcast( 0, thisRank, nullptr, MPI_INT, 0, faabric::MPIMessage::BARRIER_DONE); SPDLOG_TRACE("MPI - barrier done {}", thisRank); + + // Barrier triggers a migration point + if (thisRankMsg != nullptr && thisRankMsg->migrationcheckperiod() > 0) { + tryMigrate(thisRank); + } } std::shared_ptr MpiWorld::getLocalQueue(int sendRank, @@ -1732,4 +1759,85 @@ void MpiWorld::checkRanksRange(int sendRank, int recvRank) throw std::runtime_error("Recv rank outside range"); } } + +// This method will either (i) do nothing or (ii) run the whole migration. +// Note that every rank will call this method. 
+void MpiWorld::tryMigrate(int thisRank) +{ + auto pendingMigrations = + getScheduler().canAppBeMigrated(thisRankMsg->appid()); + + if (pendingMigrations == nullptr) { + return; + } + + prepareMigration(thisRank, pendingMigrations); + if (faabric::util::isMockMode()) { + // When mocking in the tests, the call to get the current executing + // executor may fail + faabric::util::UniqueLock lock(mockMutex); + mpiMockedPendingMigrations.push_back(pendingMigrations); + } else { + // An executing thread may never return from the next call. However, + // the restored executing thread will know it is an MPI function, its + // rank, and its world id, and will therefore call the method to finish + // the migration from the snapshot server. + getExecutingExecutor()->doMigration(pendingMigrations); + } + finishMigration(thisRank); +} + +// Once per host we modify the per-world accounting of rank-to-hosts mappings +// and the corresponding ports for cross-host messaging. +void MpiWorld::prepareMigration( + int thisRank, + std::shared_ptr pendingMigrations) +{ + // Check that there are no pending asynchronous messages to send and receive + for (auto umb : unackedMessageBuffers) { + if (umb->size() > 0) { + SPDLOG_ERROR("Trying to migrate MPI application (id: {}) but rank" + " {} has {} pending async messages to receive", + thisRankMsg->appid(), + thisRank, + umb->size()); + throw std::runtime_error( + "Migrating with pending async messages is not supported"); + } + } + + if (!iSendRequests.empty()) { + SPDLOG_ERROR("Trying to migrate MPI application (id: {}) but rank" + " {} has {} pending async send messages to acknowledge", + thisRankMsg->appid(), + thisRank, + iSendRequests.size()); + throw std::runtime_error( + "Migrating with pending async messages is not supported"); + } + + if (thisRank == localLeader) { + for (int i = 0; i < pendingMigrations->migrations_size(); i++) { + auto m = pendingMigrations->mutable_migrations()->at(i); + assert(hostForRank.at(m.msg().mpirank()) 
== m.srchost()); + hostForRank.at(m.msg().mpirank()) = m.dsthost(); + } + + // Reset the internal mappings. + initLocalBasePorts(hostForRank); + initLocalRemoteLeaders(); + } +} + +// This method will be called from two different points: +// (i) snapshot server when restoring an MPI function after migration +// (ii) after running the migration from a rank that is not migrated +void MpiWorld::finishMigration(int thisRank) +{ + if (thisRank == localLeader) { + getScheduler().removePendingMigration(thisRankMsg->appid()); + } + + barrier(thisRank); +} } diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 9133a838f..8e2173ddd 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -465,10 +465,43 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( // variable), and apps would just opt in/out of being migrated. We set // the actual check period instead to ease with experiments. if (firstMsg.migrationcheckperiod() > 0) { - auto decisionPtr = - std::make_shared(decision); - inFlightRequests[decision.appId] = std::make_pair(req, decisionPtr); - if (inFlightRequests.size() == 1) { + bool startMigrationThread = inFlightRequests.size() == 0; + + if (inFlightRequests.find(decision.appId) != inFlightRequests.end()) { + // MPI applications are made up of two different requests: the + // original one (with one message) and the second one (with + // world size - 1 messages) created during world creation time. + // Thus, to correctly track migration opportunities we must merge + // both. We append the batch request to the original one (instead + // of the other way around) not to affect the rest of this methods + // functionality. 
+ if (firstMsg.ismpi()) { + startMigrationThread = false; + auto originalReq = inFlightRequests[decision.appId].first; + auto originalDecision = inFlightRequests[decision.appId].second; + assert(req->messages_size() == firstMsg.mpiworldsize() - 1); + for (int i = 0; i < firstMsg.mpiworldsize() - 1; i++) { + // Append message to original request + auto* newMsgPtr = originalReq->add_messages(); + faabric::util::copyMessage(&req->messages().at(i), + newMsgPtr); + // Append message to original decision + originalDecision->addMessage(decision.hosts.at(i), + req->messages().at(i)); + } + } else { + SPDLOG_ERROR("There is already an in-flight request for app {}", + firstMsg.appid()); + throw std::runtime_error("App already in-flight"); + } + } else { + auto decisionPtr = + std::make_shared(decision); + inFlightRequests[decision.appId] = std::make_pair(req, decisionPtr); + } + + // Decide whether we have to start the migration thread or not + if (startMigrationThread) { functionMigrationThread.start(firstMsg.migrationcheckperiod()); } else if (firstMsg.migrationcheckperiod() != functionMigrationThread.wakeUpPeriodSeconds) { diff --git a/tests/test/scheduler/test_function_migration.cpp b/tests/test/scheduler/test_function_migration.cpp index 6c1c4f62d..ad5d690ac 100644 --- a/tests/test/scheduler/test_function_migration.cpp +++ b/tests/test/scheduler/test_function_migration.cpp @@ -98,7 +98,8 @@ class FunctionMigrationTestFixture : public SchedulerTestFixture void checkPendingMigrationsExpectation( std::shared_ptr expectedMigrations, std::shared_ptr actualMigrations, - std::vector hosts) + std::vector hosts, + bool skipMsgIdCheck = false) { if (expectedMigrations == nullptr) { REQUIRE(actualMigrations == expectedMigrations); @@ -110,7 +111,9 @@ class FunctionMigrationTestFixture : public SchedulerTestFixture for (int i = 0; i < actualMigrations->migrations_size(); i++) { auto actual = actualMigrations->mutable_migrations()->at(i); auto expected = 
expectedMigrations->mutable_migrations()->at(i); - REQUIRE(actual.msg().id() == expected.msg().id()); + if (!skipMsgIdCheck) { + REQUIRE(actual.msg().id() == expected.msg().id()); + } REQUIRE(actual.srchost() == expected.srchost()); REQUIRE(actual.dsthost() == expected.dsthost()); } @@ -381,4 +384,96 @@ TEST_CASE_METHOD(FunctionMigrationTestFixture, sch.removePendingMigration(appId); REQUIRE(sch.canAppBeMigrated(appId) == nullptr); } + +TEST_CASE_METHOD(FunctionMigrationTestFixture, + "Test MPI function migration points", + "[scheduler]") +{ + // Set up host resources + std::vector hosts = { masterHost, "hostA" }; + std::vector slots = { 2, 2 }; + std::vector usedSlots = { 0, 0 }; + setHostResources(hosts, slots, usedSlots); + + // Clear MPI registries + getMpiWorldRegistry().clear(); + + auto req = faabric::util::batchExecFactory("mpi", "sleep", 1); + int checkPeriodSecs = 1; + int timeToSleep = 4 * checkPeriodSecs * 1000; + + int worldId = 123; + int worldSize = 4; + auto* firstMsg = req->mutable_messages(0); + firstMsg->set_inputdata(std::to_string(timeToSleep)); + firstMsg->set_ismpi(true); + firstMsg->set_mpiworldsize(worldSize); + firstMsg->set_mpiworldid(worldId); + firstMsg->set_migrationcheckperiod(checkPeriodSecs); + uint32_t appId = req->messages().at(0).appid(); + + // Call function that will just sleep + auto decision = sch.callFunctions(req); + + // Manually create the world, and trigger a second function invocation in + // the remote host + MpiWorld world; + world.create(*firstMsg, worldId, worldSize); + + // Update host resources so that a migration opportunity appears + updateLocalResources(4, 2); + + // Build expected migrations + std::shared_ptr expectedMigrations; + // NOTE: we need to add to the original request the ones that will be + // chained by MPI (this is only needed to build the expectation). 
+ for (int i = 0; i < worldSize - 1; i++) { + auto* msg = req->add_messages(); + msg->set_appid(firstMsg->appid()); + } + std::vector> migrations = { { 1, 0 }, { 1, 0 } }; + expectedMigrations = + buildPendingMigrationsExpectation(req, hosts, migrations); + + // Instead of directly calling the scheduler function to check for migration + // opportunites, sleep for enough time (twice the check period) so that a + // migration is detected by the background thread. + SLEEP_MS(2 * checkPeriodSecs * 1000); + + // When checking that a migration has taken place in MPI, we skip the msg + // id check, as part of the request is build by the runtime, and therefore + // we don't have access to the actual messages scheduled (thus can't build + // an expectation with the right id) + auto actualMigrations = sch.canAppBeMigrated(appId); + checkPendingMigrationsExpectation( + expectedMigrations, actualMigrations, hosts, true); + + // Check that certain MPI calls actually do the migration + SECTION("MPI barrier triggers a migration point") { world.barrier(0); } + + SECTION("MPI all reduce triggers a migration point") + { + std::vector messageData = { 0, 1, 2 }; + world.allReduce(0, + BYTES(messageData.data()), + BYTES(messageData.data()), + MPI_INT, + messageData.size(), + MPI_SUM); + } + + // When performing the migration, MPI will remove it from the pending + // migrations map + REQUIRE(sch.canAppBeMigrated(appId) == nullptr); + checkPendingMigrationsExpectation( + expectedMigrations, getMpiMockedPendingMigrations().front(), hosts, true); + + faabric::Message res = + sch.getFunctionResult(firstMsg->id(), 2 * timeToSleep); + REQUIRE(res.returnvalue() == 0); + + // Clean up + world.destroy(); +} + } From 6308962121eff4732553962069279cfc1c7157dc Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Mon, 10 Jan 2022 12:14:07 +0000 Subject: [PATCH 10/26] tests: fix data race --- .../test/scheduler/test_function_migration.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 
deletions(-) diff --git a/tests/test/scheduler/test_function_migration.cpp b/tests/test/scheduler/test_function_migration.cpp index ad5d690ac..257c7ed53 100644 --- a/tests/test/scheduler/test_function_migration.cpp +++ b/tests/test/scheduler/test_function_migration.cpp @@ -425,15 +425,17 @@ TEST_CASE_METHOD(FunctionMigrationTestFixture, // Build expected migrations std::shared_ptr expectedMigrations; - // NOTE: we need to add to the original request the ones that will be + // We need to add to the original request the ones that will be // chained by MPI (this is only needed to build the expectation). - for (int i = 0; i < worldSize - 1; i++) { - auto* msg = req->add_messages(); - msg->set_appid(firstMsg->appid()); + // NOTE: we do it in a copy of the original request, as otherwise TSAN + // complains about a data race. + auto reqCopy = faabric::util::batchExecFactory("mpi", "sleep", worldSize); + for (int i = 0; i < worldSize; i++) { + reqCopy->mutable_messages(i)->set_appid(firstMsg->appid()); } std::vector> migrations = { { 1, 0 }, { 1, 0 } }; expectedMigrations = - buildPendingMigrationsExpectation(req, hosts, migrations); + buildPendingMigrationsExpectation(reqCopy, hosts, migrations); // Instead of directly calling the scheduler function to check for migration // opportunites, sleep for enough time (twice the check period) so that a @@ -441,9 +443,8 @@ TEST_CASE_METHOD(FunctionMigrationTestFixture, SLEEP_MS(2 * checkPeriodSecs * 1000); // When checking that a migration has taken place in MPI, we skip the msg - // id check, as part of the request is build by the runtime, and therefore - // we don't have access to the actual messages scheduled (thus can't build - // an expectation with the right id) + // id check. Part of the request is built by the runtime, and therefore + // we don't have access to the actual messages scheduled. 
auto actualMigrations = sch.canAppBeMigrated(appId); checkPendingMigrationsExpectation( expectedMigrations, actualMigrations, hosts, true); From 51921003ff2754beee57e57774d722f9cf144f0c Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Mon, 10 Jan 2022 15:18:53 +0000 Subject: [PATCH 11/26] pr: re-factor methods as suggested in the pr comments --- include/faabric/scheduler/FunctionCallApi.h | 2 +- .../faabric/scheduler/FunctionCallClient.h | 5 ++-- .../faabric/scheduler/FunctionCallServer.h | 2 +- .../scheduler/FunctionMigrationThread.h | 6 ++-- include/faabric/scheduler/Scheduler.h | 4 +-- src/scheduler/FunctionCallClient.cpp | 19 +++++++----- src/scheduler/FunctionCallServer.cpp | 8 ++--- src/scheduler/MpiWorld.cpp | 2 +- src/scheduler/Scheduler.cpp | 10 +++---- .../scheduler/test_function_migration.cpp | 30 +++++++++---------- 10 files changed, 45 insertions(+), 43 deletions(-) diff --git a/include/faabric/scheduler/FunctionCallApi.h b/include/faabric/scheduler/FunctionCallApi.h index 53c037ffc..c95aede16 100644 --- a/include/faabric/scheduler/FunctionCallApi.h +++ b/include/faabric/scheduler/FunctionCallApi.h @@ -8,6 +8,6 @@ enum FunctionCalls Flush = 2, Unregister = 3, GetResources = 4, - AddPendingMigration = 5 + PendingMigrations = 5 }; } diff --git a/include/faabric/scheduler/FunctionCallClient.h b/include/faabric/scheduler/FunctionCallClient.h index 117a6430c..399f10082 100644 --- a/include/faabric/scheduler/FunctionCallClient.h +++ b/include/faabric/scheduler/FunctionCallClient.h @@ -23,7 +23,7 @@ std::vector> getResourceRequests(); std::vector>> -getAddPendingMigrationRequests(); +getPendingMigrationsRequests(); std::vector> getUnregisterRequests(); @@ -45,8 +45,7 @@ class FunctionCallClient : public faabric::transport::MessageEndpointClient faabric::HostResources getResources(); - void sendAddPendingMigration( - std::shared_ptr req); + void sendPendingMigrations(std::shared_ptr req); void executeFunctions(std::shared_ptr req); diff --git 
a/include/faabric/scheduler/FunctionCallServer.h b/include/faabric/scheduler/FunctionCallServer.h index 39403b31e..aaacf6b1a 100644 --- a/include/faabric/scheduler/FunctionCallServer.h +++ b/include/faabric/scheduler/FunctionCallServer.h @@ -29,7 +29,7 @@ class FunctionCallServer final const uint8_t* buffer, size_t bufferSize); - std::unique_ptr recvAddPendingMigration( + std::unique_ptr recvPendingMigrations( const uint8_t* buffer, size_t bufferSize); diff --git a/include/faabric/scheduler/FunctionMigrationThread.h b/include/faabric/scheduler/FunctionMigrationThread.h index 876ca917f..a945e432b 100644 --- a/include/faabric/scheduler/FunctionMigrationThread.h +++ b/include/faabric/scheduler/FunctionMigrationThread.h @@ -5,12 +5,12 @@ #include namespace faabric::scheduler { +// Start a background thread that, every wake up period, will check if there +// are migration opportunities for in-flight apps that have opted in to +// being checked for migrations. class FunctionMigrationThread { public: - // Start a background thread that, every wake up period, will check if there - // are migration opportunities for in-flight apps that have opted in to - // being checked for migrations. 
void start(int wakeUpPeriodSecondsIn); void stop(); diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index ffbe3c361..a3961389f 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -235,7 +235,7 @@ class Scheduler // ---------------------------------- void checkForMigrationOpportunities(); - std::shared_ptr canAppBeMigrated( + std::shared_ptr getPendingAppMigrations( uint32_t appId); void addPendingMigration(std::shared_ptr msg); @@ -324,7 +324,7 @@ class Scheduler faabric::util::MigrationStrategy migrationStrategy = faabric::util::MigrationStrategy::BIN_PACK); - void broadcastAddPendingMigration( + void broadcastPendingMigrations( std::shared_ptr pendingMigrations); }; diff --git a/src/scheduler/FunctionCallClient.cpp b/src/scheduler/FunctionCallClient.cpp index 5c67d2406..ad01260d6 100644 --- a/src/scheduler/FunctionCallClient.cpp +++ b/src/scheduler/FunctionCallClient.cpp @@ -30,7 +30,7 @@ static std::unordered_map>> - addPendingMigrationRequests; + pendingMigrationsRequests; static std::vector> unregisterRequests; @@ -62,10 +62,10 @@ std::vector> getResourceRequests() } std::vector>> -getAddPendingMigrationRequests() +getPendingMigrationsRequests() { faabric::util::UniqueLock lock(mockMutex); - return addPendingMigrationRequests; + return pendingMigrationsRequests; } std::vector> @@ -87,7 +87,7 @@ void clearMockRequests() functionCalls.clear(); batchMessages.clear(); resourceRequests.clear(); - addPendingMigrationRequests.clear(); + pendingMigrationsRequests.clear(); unregisterRequests.clear(); for (auto& p : queuedResourceResponses) { @@ -140,17 +140,20 @@ faabric::HostResources FunctionCallClient::getResources() return response; } -void FunctionCallClient::sendAddPendingMigration( - std::shared_ptr req) +// This function call is used by the master host of an application to let know +// other hosts running functions of the same application that a migration +// opportunity 
has been found. +void FunctionCallClient::sendPendingMigrations( + std::shared_ptr req) { faabric::PendingMigrations request; faabric::EmptyResponse response; if (faabric::util::isMockMode()) { faabric::util::UniqueLock lock(mockMutex); - addPendingMigrationRequests.emplace_back(host, req); + pendingMigrationsRequests.emplace_back(host, req); } else { - syncSend(faabric::scheduler::FunctionCalls::AddPendingMigration, + syncSend(faabric::scheduler::FunctionCalls::PendingMigrations, req.get(), &response); } diff --git a/src/scheduler/FunctionCallServer.cpp b/src/scheduler/FunctionCallServer.cpp index e686e834e..550dc5ded 100644 --- a/src/scheduler/FunctionCallServer.cpp +++ b/src/scheduler/FunctionCallServer.cpp @@ -49,8 +49,8 @@ std::unique_ptr FunctionCallServer::doSyncRecv( case faabric::scheduler::FunctionCalls::GetResources: { return recvGetResources(buffer, bufferSize); } - case faabric::scheduler::FunctionCalls::AddPendingMigration: { - return recvAddPendingMigration(buffer, bufferSize); + case faabric::scheduler::FunctionCalls::PendingMigrations: { + return recvPendingMigrations(buffer, bufferSize); } default: { throw std::runtime_error( @@ -105,8 +105,8 @@ std::unique_ptr FunctionCallServer::recvGetResources( } std::unique_ptr -FunctionCallServer::recvAddPendingMigration(const uint8_t* buffer, - size_t bufferSize) +FunctionCallServer::recvPendingMigrations(const uint8_t* buffer, + size_t bufferSize) { PARSE_MSG(faabric::PendingMigrations, buffer, bufferSize); diff --git a/src/scheduler/MpiWorld.cpp b/src/scheduler/MpiWorld.cpp index 874bd3f85..e34da32b1 100644 --- a/src/scheduler/MpiWorld.cpp +++ b/src/scheduler/MpiWorld.cpp @@ -1765,7 +1765,7 @@ void MpiWorld::checkRanksRange(int sendRank, int recvRank) void MpiWorld::tryMigrate(int thisRank) { auto pendingMigrations = - getScheduler().canAppBeMigrated(thisRankMsg->appid()); + getScheduler().getPendingAppMigrations(thisRankMsg->appid()); if (pendingMigrations == nullptr) { return; diff --git 
a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 8e2173ddd..24c51957b 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -1217,14 +1217,14 @@ void Scheduler::checkForMigrationOpportunities() for (auto msgPtr : tmpPendingMigrations) { // First, broadcast the pending migrations to other hosts - broadcastAddPendingMigration(msgPtr); + broadcastPendingMigrations(msgPtr); // Second, update our local records pendingMigrations[msgPtr->appid()] = std::move(msgPtr); } } } -void Scheduler::broadcastAddPendingMigration( +void Scheduler::broadcastPendingMigrations( std::shared_ptr pendingMigrations) { // Get all hosts for the to-be migrated app @@ -1237,12 +1237,12 @@ void Scheduler::broadcastAddPendingMigration( // Send pending migrations to all involved hosts for (auto& otherHost : thisRegisteredHosts) { - getFunctionCallClient(otherHost).sendAddPendingMigration( + getFunctionCallClient(otherHost).sendPendingMigrations( pendingMigrations); } } -std::shared_ptr Scheduler::canAppBeMigrated( +std::shared_ptr Scheduler::getPendingAppMigrations( uint32_t appId) { faabric::util::SharedLock lock(mx); @@ -1294,7 +1294,7 @@ Scheduler::doCheckForMigrationOpportunities( // If we have already recorded a pending migration for this req, // skip - if (canAppBeMigrated(originalDecision.appId) != nullptr) { + if (getPendingAppMigrations(originalDecision.appId) != nullptr) { SPDLOG_TRACE("Skipping app {} as migration opportunity has " "already been recorded", originalDecision.appId); diff --git a/tests/test/scheduler/test_function_migration.cpp b/tests/test/scheduler/test_function_migration.cpp index 257c7ed53..c3535a923 100644 --- a/tests/test/scheduler/test_function_migration.cpp +++ b/tests/test/scheduler/test_function_migration.cpp @@ -120,9 +120,9 @@ class FunctionMigrationTestFixture : public SchedulerTestFixture // Check we have sent a message to all other hosts with the pending // migration - auto pendingRequests = 
getAddPendingMigrationRequests(); + auto pendingRequests = getPendingMigrationsRequests(); REQUIRE(pendingRequests.size() == hosts.size() - 1); - for (auto& pendingReq : getAddPendingMigrationRequests()) { + for (auto& pendingReq : getPendingMigrationsRequests()) { std::string host = pendingReq.first; std::shared_ptr migration = pendingReq.second; @@ -188,7 +188,7 @@ TEST_CASE_METHOD( sch.checkForMigrationOpportunities(); - auto actualMigrations = sch.canAppBeMigrated(appId); + auto actualMigrations = sch.getPendingAppMigrations(appId); checkPendingMigrationsExpectation( expectedMigrations, actualMigrations, hosts); @@ -198,7 +198,7 @@ TEST_CASE_METHOD( // Check that after the result is set, the app can't be migrated no more sch.checkForMigrationOpportunities(); - REQUIRE(sch.canAppBeMigrated(appId) == nullptr); + REQUIRE(sch.getPendingAppMigrations(appId) == nullptr); } TEST_CASE_METHOD(FunctionMigrationTestFixture, @@ -240,7 +240,7 @@ TEST_CASE_METHOD(FunctionMigrationTestFixture, sch.checkForMigrationOpportunities(); - auto actualMigrations = sch.canAppBeMigrated(appId); + auto actualMigrations = sch.getPendingAppMigrations(appId); checkPendingMigrationsExpectation( expectedMigrations, actualMigrations, hosts); @@ -250,7 +250,7 @@ TEST_CASE_METHOD(FunctionMigrationTestFixture, // Check that after the result is set, the app can't be migrated no more sch.checkForMigrationOpportunities(); - REQUIRE(sch.canAppBeMigrated(appId) == nullptr); + REQUIRE(sch.getPendingAppMigrations(appId) == nullptr); } TEST_CASE_METHOD( @@ -295,7 +295,7 @@ TEST_CASE_METHOD( sch.checkForMigrationOpportunities(); - auto actualMigrations = sch.canAppBeMigrated(appId); + auto actualMigrations = sch.getPendingAppMigrations(appId); checkPendingMigrationsExpectation( expectedMigrations, actualMigrations, hosts); @@ -305,7 +305,7 @@ TEST_CASE_METHOD( // Check that after the result is set, the app can't be migrated no more sch.checkForMigrationOpportunities(); - 
REQUIRE(sch.canAppBeMigrated(appId) == nullptr); + REQUIRE(sch.getPendingAppMigrations(appId) == nullptr); } TEST_CASE_METHOD( @@ -351,7 +351,7 @@ TEST_CASE_METHOD( // migration is detected by the background thread. SLEEP_MS(2 * checkPeriodSecs * 1000); - auto actualMigrations = sch.canAppBeMigrated(appId); + auto actualMigrations = sch.getPendingAppMigrations(appId); checkPendingMigrationsExpectation( expectedMigrations, actualMigrations, hosts); @@ -361,7 +361,7 @@ TEST_CASE_METHOD( // Check that after the result is set, the app can't be migrated no more sch.checkForMigrationOpportunities(); - REQUIRE(sch.canAppBeMigrated(appId) == nullptr); + REQUIRE(sch.getPendingAppMigrations(appId) == nullptr); } TEST_CASE_METHOD(FunctionMigrationTestFixture, @@ -376,13 +376,13 @@ TEST_CASE_METHOD(FunctionMigrationTestFixture, buildPendingMigrationsExpectation(req, hosts, migrations); // Add migration manually - REQUIRE(sch.canAppBeMigrated(appId) == nullptr); + REQUIRE(sch.getPendingAppMigrations(appId) == nullptr); sch.addPendingMigration(expectedMigrations); - REQUIRE(sch.canAppBeMigrated(appId) == expectedMigrations); + REQUIRE(sch.getPendingAppMigrations(appId) == expectedMigrations); // Remove migration manually sch.removePendingMigration(appId); - REQUIRE(sch.canAppBeMigrated(appId) == nullptr); + REQUIRE(sch.getPendingAppMigrations(appId) == nullptr); } TEST_CASE_METHOD(FunctionMigrationTestFixture, @@ -445,7 +445,7 @@ TEST_CASE_METHOD(FunctionMigrationTestFixture, // When checking that a migration has taken place in MPI, we skip the msg // id check. Part of the request is build by the runtime, and therefore // we don't have access to the actual messages scheduled. 
- auto actualMigrations = sch.canAppBeMigrated(appId); + auto actualMigrations = sch.getPendingAppMigrations(appId); checkPendingMigrationsExpectation( expectedMigrations, actualMigrations, hosts, true); @@ -465,7 +465,7 @@ TEST_CASE_METHOD(FunctionMigrationTestFixture, // When performing the migration, MPI will remove it from the pending // migrations map - REQUIRE(sch.canAppBeMigrated(appId) == nullptr); + REQUIRE(sch.getPendingAppMigrations(appId) == nullptr); checkPendingMigrationsExpectation( expectedMigrations, getMpiMockedPendingMigrations().front(), hosts, true); From b3b50d55b89983b900ac542f43bba0df940bc653 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Mon, 10 Jan 2022 15:47:10 +0000 Subject: [PATCH 12/26] executor: throw exception to properly shutdown executing task --- include/faabric/scheduler/Scheduler.h | 10 +++++++++- src/scheduler/Executor.cpp | 12 ++++++++++-- src/scheduler/MpiWorld.cpp | 13 +++++++++++-- 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index a3961389f..8a1edcc7e 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -23,6 +23,14 @@ namespace faabric::scheduler { +class ExecutorMigratedException : public faabric::util::FaabricException +{ + public: + explicit ExecutorMigratedException(std::string message) + : FaabricException(std::move(message)) + {} +}; + typedef std::pair, std::shared_ptr> InFlightPair; @@ -82,7 +90,7 @@ class Executor faabric::Message& msg, bool createIfNotExists = false); - void doMigration( + bool doMigration( std::shared_ptr pendingMigrations); protected: diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp index ba272afa2..8aeb8d313 100644 --- a/src/scheduler/Executor.cpp +++ b/src/scheduler/Executor.cpp @@ -538,6 +538,12 @@ void Executor::threadPoolThread(int threadPoolIdx) "Task {} threw exception. 
What: {}", msg.id(), ex.what()); SPDLOG_ERROR(errorMessage); msg.set_outputdata(errorMessage); + } catch (const faabric::scheduler::ExecutorMigratedException& ex) { + SPDLOG_TRACE("Task {} has been migrated.", msg.id()); + + returnValue = 0; + msg.set_outputdata( + "The execution of this message has been migrated"); } // Handle thread-local diffing for every thread @@ -735,16 +741,18 @@ void Executor::releaseClaim() claimed.store(false); } -void Executor::doMigration( +bool Executor::doMigration( std::shared_ptr pendingMigrations) { for (int i = 0; i < pendingMigrations->migrations_size(); i++) { auto m = pendingMigrations->mutable_migrations()->at(i); if (m.msg().id() == boundMessage.id()) { migrateFunction(m.msg(), m.dsthost()); - // TODO: terminate current executing thread + return true; } } + + return false; } void Executor::migrateFunction(const faabric::Message& msg, diff --git a/src/scheduler/MpiWorld.cpp b/src/scheduler/MpiWorld.cpp index e34da32b1..a563fe33a 100644 --- a/src/scheduler/MpiWorld.cpp +++ b/src/scheduler/MpiWorld.cpp @@ -1772,6 +1772,8 @@ void MpiWorld::tryMigrate(int thisRank) } prepareMigration(thisRank, pendingMigrations); + + bool mustShutdown = false; if (faabric::util::isMockMode()) { // When mocking in the tests, the call to get the current executing // executor may fail @@ -1782,9 +1784,16 @@ void MpiWorld::tryMigrate(int thisRank) // the restored executing thread will know it is an MPI function, its // rank, and its world id, and will therefore call the method to finish // the migration from the snapshot server. 
- getExecutingExecutor()->doMigration(pendingMigrations); + mustShutdown = getExecutingExecutor()->doMigration(pendingMigrations); + } + + if (mustShutdown) { + destroy(); + throw faabric::scheduler::ExecutorMigratedException( + "Executor has been migrated"); + } else { + finishMigration(thisRank); } - finishMigration(thisRank); } // Once per host we modify the per-world accounting of rank-to-hosts mappings From fe07c6f755c04dc02dfc5b5526ebc2a0ed0b9fde Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Mon, 10 Jan 2022 17:05:42 +0000 Subject: [PATCH 13/26] executor: close the loop to alllow for function migration --- include/faabric/scheduler/Scheduler.h | 16 +++++++++------- src/proto/faabric.proto | 1 + src/scheduler/Executor.cpp | 27 +++++++++++++++++++++------ src/scheduler/Scheduler.cpp | 12 +++++++++++- 4 files changed, 42 insertions(+), 14 deletions(-) diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index 8a1edcc7e..2350746e3 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -250,6 +250,15 @@ class Scheduler void removePendingMigration(uint32_t appId); + // ---------------------------------- + // Clients + // ---------------------------------- + faabric::scheduler::FunctionCallClient& getFunctionCallClient( + const std::string& otherHost); + + faabric::snapshot::SnapshotClient& getSnapshotClient( + const std::string& otherHost); + private: std::string thisHost; @@ -274,13 +283,6 @@ class Scheduler std::mutex localResultsMutex; - // ---- Clients ---- - faabric::scheduler::FunctionCallClient& getFunctionCallClient( - const std::string& otherHost); - - faabric::snapshot::SnapshotClient& getSnapshotClient( - const std::string& otherHost); - // ---- Host resources and hosts ---- faabric::HostResources thisHostResources; std::atomic thisHostUsedSlots = 0; diff --git a/src/proto/faabric.proto b/src/proto/faabric.proto index d6f4e17a1..e295e038c 100644 --- 
a/src/proto/faabric.proto +++ b/src/proto/faabric.proto @@ -25,6 +25,7 @@ message BatchExecuteRequest { FUNCTIONS = 0; THREADS = 1; PROCESSES = 2; + MIGRATION = 3; } BatchExecuteType type = 2; diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp index 8aeb8d313..f778134f4 100644 --- a/src/scheduler/Executor.cpp +++ b/src/scheduler/Executor.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -758,12 +759,26 @@ bool Executor::doMigration( void Executor::migrateFunction(const faabric::Message& msg, const std::string& host) { - SPDLOG_INFO("Executor received request to migrate message {} from host {}" - " to host {}", - msg.id(), - msg.executedhost(), - host); - SPDLOG_ERROR("Executor::migrate() not implemented"); + SPDLOG_DEBUG("Executor received request to migrate message {} from host {}" + " to host {}", + msg.id(), + msg.executedhost(), + host); + + // Take snapshot and push to recepient host + // TODO - could we just push the snapshot diffs here? 
+ auto snap = std::make_shared(getMemoryView()); + std::string snapKey = fmt::format("{}:{}", + faabric::util::funcToString(msg, false), + faabric::util::generateGid()); + sch.getSnapshotClient(host).pushSnapshot(snapKey, snap); + + // Create request execution for migrated function + auto req = faabric::util::batchExecFactory(msg.user(), msg.function(), 1); + faabric::util::copyMessage(&msg, req->mutable_messages(0)); + req->set_type(faabric::BatchExecuteRequest::MIGRATION); + req->mutable_messages(0)->set_snapshotkey(snapKey); + sch.getFunctionCallClient(host).executeFunctions(req); } // ------------------------------------------ diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 24c51957b..8f7ed22f2 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -550,6 +550,7 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( // ------------------------------------------- // SNAPSHOTS // ------------------------------------------- + bool isMigration = req->type() == faabric::BatchExecuteRequest::MIGRATION; // Push out snapshot diffs to registered hosts. We have to do this to // *all* hosts, regardless of whether they will be executing functions. 
@@ -571,7 +572,7 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( snapshotKey = firstMsg.snapshotkey(); } - if (!snapshotKey.empty()) { + if (!snapshotKey.empty() && !isMigration) { auto snap = faabric::snapshot::getSnapshotRegistry().getSnapshot(snapshotKey); @@ -591,6 +592,15 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( } } + // Now reset the tracking on the snapshot before we start executing + snap->clearTrackedChanges(); + } else if (isMigration) { + // If we are executing a migrated function, we don't need to distribute + // the snapshot to other hosts, as this snapshot is specific to the + // to-be-restored function + auto snap = + faabric::snapshot::getSnapshotRegistry().getSnapshot(snapshotKey); + // Now reset the tracking on the snapshot before we start executing snap->clearTrackedChanges(); } From a45cc005fa32e4ae9cab46fdedcf38a7025ec78a Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Mon, 10 Jan 2022 17:24:04 +0000 Subject: [PATCH 14/26] scheduler: add UNDERFULL scheduling topology hint --- include/faabric/util/scheduling.h | 7 +++- src/scheduler/MpiWorld.cpp | 12 ++++--- src/scheduler/Scheduler.cpp | 3 ++ .../scheduler/test_scheduling_decisions.cpp | 36 +++++++++++++++++++ 4 files changed, 53 insertions(+), 5 deletions(-) diff --git a/include/faabric/util/scheduling.h b/include/faabric/util/scheduling.h index 2905d8862..71bab4f8c 100644 --- a/include/faabric/util/scheduling.h +++ b/include/faabric/util/scheduling.h @@ -46,13 +46,18 @@ class SchedulingDecision // requests in a batch. // - NORMAL: bin-packs requests to slots in hosts starting from the master // host, and overloadds the master if it runs out of resources. +// - FORCE_LOCAL: force local execution irrespective of the available +// resources. // - NEVER_ALONE: never allocates a single (non-master) request to a host // without other requests of the batch. 
+// - UNDERFULL: schedule up to 50% of the master hosts' capacity to force +// migration opportunities to appear. enum SchedulingTopologyHint { NORMAL, FORCE_LOCAL, - NEVER_ALONE + NEVER_ALONE, + UNDERFULL, }; // Migration strategies help the scheduler decide wether the scheduling decision diff --git a/src/scheduler/MpiWorld.cpp b/src/scheduler/MpiWorld.cpp index a563fe33a..f23051736 100644 --- a/src/scheduler/MpiWorld.cpp +++ b/src/scheduler/MpiWorld.cpp @@ -247,11 +247,15 @@ void MpiWorld::create(faabric::Message& call, int newId, int newSize) std::vector executedAt; if (size > 1) { - // Send the init messages (note that message i corresponds to rank i+1) - // By default, we use the NEVER_ALONE policy for MPI executions to - // minimise cross-host messaging + // 10/01/22 - We add this check to force different scheduling behaviour + // if running migration experiments. The check can be deleted + // afterwards. + bool mustUnderfull = + thisRankMsg != nullptr && thisRankMsg->migrationcheckperiod() > 0; faabric::util::SchedulingDecision decision = sch.callFunctions( - req, faabric::util::SchedulingTopologyHint::NEVER_ALONE); + req, + mustUnderfull ? 
faabric::util::SchedulingTopologyHint::UNDERFULL + : faabric::util::SchedulingTopologyHint::NORMAL); executedAt = decision.hosts; } assert(executedAt.size() == size - 1); diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 8f7ed22f2..45e78ba86 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -310,6 +310,9 @@ faabric::util::SchedulingDecision Scheduler::makeSchedulingDecision( // Work out how many we can handle locally int slots = thisHostResources.slots(); + if (topologyHint == faabric::util::SchedulingTopologyHint::UNDERFULL) { + slots = slots / 2; + } // Work out available cores, flooring at zero int available = diff --git a/tests/test/scheduler/test_scheduling_decisions.cpp b/tests/test/scheduler/test_scheduling_decisions.cpp index fb18e679f..0a6f7258c 100644 --- a/tests/test/scheduler/test_scheduling_decisions.cpp +++ b/tests/test/scheduler/test_scheduling_decisions.cpp @@ -390,4 +390,40 @@ TEST_CASE_METHOD(SchedulingDecisionTestFixture, testActualSchedulingDecision(req, config); } + +TEST_CASE_METHOD(SchedulingDecisionTestFixture, + "Test underfull scheduling topology hint", + "[scheduler]") +{ + SchedulingConfig config = { + .hosts = { masterHost, "hostA" }, + .slots = { 2, 2 }, + .numReqs = 2, + .topologyHint = faabric::util::SchedulingTopologyHint::UNDERFULL, + .expectedHosts = { masterHost, "hostA" }, + }; + + std::shared_ptr req; + + SECTION("Test hint's basic functionality") + { + req = faabric::util::batchExecFactory("foo", "bar", config.numReqs); + } + + SECTION("Test hint does not affect other hosts") + { + config.numReqs = 3; + config.expectedHosts = { masterHost, "hostA", "hostA" }; + req = faabric::util::batchExecFactory("foo", "bar", config.numReqs); + } + + SECTION("Test with hint we still overload to master") + { + config.numReqs = 4; + config.expectedHosts = { masterHost, "hostA", "hostA", masterHost }; + req = faabric::util::batchExecFactory("foo", "bar", config.numReqs); + } + + 
testActualSchedulingDecision(req, config); +} } From c177e6abcf21cb91a71a1f922f5b0eea3a0b52b9 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Wed, 12 Jan 2022 07:44:51 +0000 Subject: [PATCH 15/26] Add migration exception, catch in executor --- include/faabric/util/func.h | 8 ++++++++ src/scheduler/Executor.cpp | 18 ++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/include/faabric/util/func.h b/include/faabric/util/func.h index b4fa1ba09..b8cbd3812 100644 --- a/include/faabric/util/func.h +++ b/include/faabric/util/func.h @@ -7,6 +7,14 @@ namespace faabric::util { +class FunctionMigratedException : public faabric::util::FaabricException +{ + public: + explicit FunctionMigratedException(std::string message) + : FaabricException(std::move(message)) + {} +}; + std::string funcToString(const faabric::Message& msg, bool includeId); std::string funcToString( diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp index ba272afa2..2c538d844 100644 --- a/src/scheduler/Executor.cpp +++ b/src/scheduler/Executor.cpp @@ -528,9 +528,21 @@ void Executor::threadPoolThread(int threadPoolIdx) // Execute the task int32_t returnValue; + bool migrated = false; try { returnValue = executeTask(threadPoolIdx, task.messageIndex, task.req); + } catch (const faabric::util::FunctionMigratedException& ex) { + SPDLOG_DEBUG( + "Task {} migrated, shutting down executor {}", msg.id(), id); + + // Note that when a task has been migrated, we need to perform all + // the normal executor shutdown, but we must NOT set the result for + // the call. + migrated = true; + selfShutdown = true; + returnValue = -99; + } catch (const std::exception& ex) { returnValue = 1; @@ -667,6 +679,12 @@ void Executor::threadPoolThread(int threadPoolIdx) // executor. 
sch.vacateSlot(); + // If the function has been migrated, we drop out here and shut down the + // executor + if (migrated) { + break; + } + // Finally set the result of the task, this will allow anything // waiting on its result to continue execution, therefore must be // done once the executor has been reset, otherwise the executor may From f5fa773e6a7b41e1391708dfd030390d3d628be5 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Wed, 12 Jan 2022 07:48:40 +0000 Subject: [PATCH 16/26] Remove manual message copying --- include/faabric/util/message.h | 7 ---- src/scheduler/Scheduler.cpp | 4 +- src/util/CMakeLists.txt | 1 - src/util/message.cpp | 63 -------------------------------- tests/test/util/test_message.cpp | 51 -------------------------- 5 files changed, 2 insertions(+), 124 deletions(-) delete mode 100644 include/faabric/util/message.h delete mode 100644 src/util/message.cpp delete mode 100644 tests/test/util/test_message.cpp diff --git a/include/faabric/util/message.h b/include/faabric/util/message.h deleted file mode 100644 index dfa802228..000000000 --- a/include/faabric/util/message.h +++ /dev/null @@ -1,7 +0,0 @@ -#pragma once - -#include - -namespace faabric::util { -void copyMessage(const faabric::Message* src, faabric::Message* dst); -} diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 24c51957b..d405f42c1 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -483,8 +483,8 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( for (int i = 0; i < firstMsg.mpiworldsize() - 1; i++) { // Append message to original request auto* newMsgPtr = originalReq->add_messages(); - faabric::util::copyMessage(&req->messages().at(i), - newMsgPtr); + *newMsgPtr = req->messages().at(i); + // Append message to original decision originalDecision->addMessage(decision.hosts.at(i), req->messages().at(i)); diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index 43569cbc5..e891fd668 100644 --- 
a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -17,7 +17,6 @@ faabric_lib(util locks.cpp logging.cpp memory.cpp - message.cpp network.cpp queue.cpp random.cpp diff --git a/src/util/message.cpp b/src/util/message.cpp deleted file mode 100644 index 4cd19dd35..000000000 --- a/src/util/message.cpp +++ /dev/null @@ -1,63 +0,0 @@ -#include - -namespace faabric::util { -void copyMessage(const faabric::Message* src, faabric::Message* dst) -{ - dst->set_id(src->id()); - dst->set_appid(src->appid()); - dst->set_appidx(src->appidx()); - dst->set_masterhost(src->masterhost()); - - dst->set_type(src->type()); - dst->set_user(src->user()); - dst->set_function(src->function()); - - dst->set_inputdata(src->inputdata()); - dst->set_outputdata(src->outputdata()); - - dst->set_funcptr(src->funcptr()); - dst->set_returnvalue(src->returnvalue()); - - dst->set_snapshotkey(src->snapshotkey()); - - dst->set_timestamp(src->timestamp()); - dst->set_resultkey(src->resultkey()); - dst->set_executeslocally(src->executeslocally()); - dst->set_statuskey(src->statuskey()); - - dst->set_executedhost(src->executedhost()); - dst->set_finishtimestamp(src->finishtimestamp()); - - dst->set_isasync(src->isasync()); - dst->set_ispython(src->ispython()); - dst->set_isstatusrequest(src->isstatusrequest()); - dst->set_isexecgraphrequest(src->isexecgraphrequest()); - - dst->set_pythonuser(src->pythonuser()); - dst->set_pythonfunction(src->pythonfunction()); - dst->set_pythonentry(src->pythonentry()); - - dst->set_groupid(src->groupid()); - dst->set_groupidx(src->groupidx()); - dst->set_groupsize(src->groupsize()); - - dst->set_ismpi(src->ismpi()); - dst->set_mpiworldid(src->mpiworldid()); - dst->set_mpirank(src->mpirank()); - dst->set_mpiworldsize(src->mpiworldsize()); - - dst->set_cmdline(src->cmdline()); - - dst->set_recordexecgraph(src->recordexecgraph()); - - dst->set_issgx(src->issgx()); - - dst->set_sgxsid(src->sgxsid()); - dst->set_sgxnonce(src->sgxnonce()); - 
dst->set_sgxtag(src->sgxtag()); - dst->set_sgxpolicy(src->sgxpolicy()); - dst->set_sgxresult(src->sgxresult()); - - dst->set_migrationcheckperiod(src->migrationcheckperiod()); -} -} diff --git a/tests/test/util/test_message.cpp b/tests/test/util/test_message.cpp deleted file mode 100644 index 3627582eb..000000000 --- a/tests/test/util/test_message.cpp +++ /dev/null @@ -1,51 +0,0 @@ -#include - -#include - -#include - -using namespace faabric::util; - -namespace tests { -TEST_CASE("Test copying faabric message", "[util]") -{ - faabric::Message msg; - msg.set_type(faabric::Message_MessageType_FLUSH); - msg.set_user("user 1"); - msg.set_function("great function"); - msg.set_executedhost("blah.host.blah"); - msg.set_finishtimestamp(123456543); - - msg.set_pythonuser("py user"); - msg.set_pythonfunction("py func"); - msg.set_pythonentry("py entry"); - - msg.set_isasync(true); - msg.set_ispython(true); - msg.set_isstatusrequest(true); - msg.set_isexecgraphrequest(true); - - msg.set_ismpi(true); - msg.set_mpiworldid(1234); - msg.set_mpirank(5678); - msg.set_mpiworldsize(33); - - msg.set_cmdline("some cmdline"); - - msg.set_issgx(true); - msg.set_sgxsid("test sid string"); - msg.set_sgxnonce("test nonce string"); - msg.set_sgxtag("test tag string"); - msg.set_sgxpolicy("test policy string"); - msg.set_sgxresult("test result string"); - - msg.set_recordexecgraph(true); - - msg.set_migrationcheckperiod(33); - - faabric::Message msgCopy; - copyMessage(&msg, &msgCopy); - - checkMessageEquality(msg, msgCopy); -} -} From ec38aad93db32f796ddabe7555189afe015c1afe Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Wed, 12 Jan 2022 08:00:35 +0000 Subject: [PATCH 17/26] Remove unnecessary migration APIs --- include/faabric/scheduler/MpiWorld.h | 3 --- include/faabric/scheduler/Scheduler.h | 3 --- src/scheduler/Executor.cpp | 18 +++++-------- src/scheduler/MpiWorld.cpp | 39 --------------------------- 4 files changed, 6 insertions(+), 57 deletions(-) diff --git 
a/include/faabric/scheduler/MpiWorld.h b/include/faabric/scheduler/MpiWorld.h index 56a1c01f3..3ddf51065 100644 --- a/include/faabric/scheduler/MpiWorld.h +++ b/include/faabric/scheduler/MpiWorld.h @@ -275,9 +275,6 @@ class MpiWorld int batchSize = 0); /* Function Migration */ - - void tryMigrate(int thisRank); - void prepareMigration( int thisRank, std::shared_ptr pendingMigrations); diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index a3961389f..75658e95c 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -82,9 +82,6 @@ class Executor faabric::Message& msg, bool createIfNotExists = false); - void doMigration( - std::shared_ptr pendingMigrations); - protected: virtual void restore(const std::string& snapshotKey); diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp index 2c538d844..86936b2b7 100644 --- a/src/scheduler/Executor.cpp +++ b/src/scheduler/Executor.cpp @@ -1,3 +1,4 @@ +#include "faabric/scheduler/MpiWorldRegistry.h" #include #include #include @@ -543,6 +544,11 @@ void Executor::threadPoolThread(int threadPoolIdx) selfShutdown = true; returnValue = -99; + // MPI migration + if(msg.ismpi()) { + // TODO - call MPI migration stuff here + } + } catch (const std::exception& ex) { returnValue = 1; @@ -753,18 +759,6 @@ void Executor::releaseClaim() claimed.store(false); } -void Executor::doMigration( - std::shared_ptr pendingMigrations) -{ - for (int i = 0; i < pendingMigrations->migrations_size(); i++) { - auto m = pendingMigrations->mutable_migrations()->at(i); - if (m.msg().id() == boundMessage.id()) { - migrateFunction(m.msg(), m.dsthost()); - // TODO: terminate current executing thread - } - } -} - void Executor::migrateFunction(const faabric::Message& msg, const std::string& host) { diff --git a/src/scheduler/MpiWorld.cpp b/src/scheduler/MpiWorld.cpp index e34da32b1..51508ccd4 100644 --- a/src/scheduler/MpiWorld.cpp +++ b/src/scheduler/MpiWorld.cpp 
@@ -1307,11 +1307,6 @@ void MpiWorld::allReduce(int rank, // Second, 0 broadcasts the result to all ranks broadcast( 0, rank, recvBuffer, datatype, count, faabric::MPIMessage::ALLREDUCE); - - // All reduce triggers a migration point in MPI - if (thisRankMsg != nullptr && thisRankMsg->migrationcheckperiod() > 0) { - tryMigrate(rank); - } } void MpiWorld::op_reduce(faabric_op_t* operation, @@ -1559,11 +1554,6 @@ void MpiWorld::barrier(int thisRank) broadcast( 0, thisRank, nullptr, MPI_INT, 0, faabric::MPIMessage::BARRIER_DONE); SPDLOG_TRACE("MPI - barrier done {}", thisRank); - - // Barrier triggers a migration point - if (thisRankMsg != nullptr && thisRankMsg->migrationcheckperiod() > 0) { - tryMigrate(thisRank); - } } std::shared_ptr MpiWorld::getLocalQueue(int sendRank, @@ -1760,33 +1750,6 @@ void MpiWorld::checkRanksRange(int sendRank, int recvRank) } } -// This method will either (i) do nothing or (ii) run the whole migration. -// Note that every rank will call this method. -void MpiWorld::tryMigrate(int thisRank) -{ - auto pendingMigrations = - getScheduler().getPendingAppMigrations(thisRankMsg->appid()); - - if (pendingMigrations == nullptr) { - return; - } - - prepareMigration(thisRank, pendingMigrations); - if (faabric::util::isMockMode()) { - // When mocking in the tests, the call to get the current executing - // executor may fail - faabric::util::UniqueLock lock(mockMutex); - mpiMockedPendingMigrations.push_back(pendingMigrations); - } else { - // An executing thread may never return from the next call. However, - // the restored executing thread will know it is an MPI function, its - // rank, and its world id, and will therefore call the method to finish - // the migration from the snapshot server. - getExecutingExecutor()->doMigration(pendingMigrations); - } - finishMigration(thisRank); -} - // Once per host we modify the per-world accounting of rank-to-hosts mappings // and the corresponding ports for cross-host messaging. 
void MpiWorld::prepareMigration( @@ -1837,7 +1800,5 @@ void MpiWorld::finishMigration(int thisRank) if (thisRank == localLeader) { getScheduler().removePendingMigration(thisRankMsg->appid()); } - - barrier(thisRank); } } From fefbe12df5bfd205f45352225bd4c56b852bc132 Mon Sep 17 00:00:00 2001 From: Simon Shillaker Date: Wed, 12 Jan 2022 09:48:52 +0000 Subject: [PATCH 18/26] Make getMemoryView public --- include/faabric/scheduler/Scheduler.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index 75658e95c..487d602d4 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -82,13 +82,13 @@ class Executor faabric::Message& msg, bool createIfNotExists = false); + virtual std::span getMemoryView(); + protected: virtual void restore(const std::string& snapshotKey); virtual void postFinish(); - virtual std::span getMemoryView(); - virtual void setMemorySize(size_t newSize); faabric::Message boundMessage; From 07709f04c3ba47b1b1ecf7cadc5d24570288ebf2 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Wed, 12 Jan 2022 10:08:29 +0000 Subject: [PATCH 19/26] scheduler: get functions to migrate properly --- src/scheduler/Executor.cpp | 14 +++++++------- src/scheduler/FunctionMigrationThread.cpp | 2 +- src/scheduler/MpiWorld.cpp | 5 ++++- src/scheduler/MpiWorldRegistry.cpp | 4 +--- src/scheduler/Scheduler.cpp | 4 ++-- 5 files changed, 15 insertions(+), 14 deletions(-) diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp index f778134f4..f74138473 100644 --- a/src/scheduler/Executor.cpp +++ b/src/scheduler/Executor.cpp @@ -532,6 +532,12 @@ void Executor::threadPoolThread(int threadPoolIdx) try { returnValue = executeTask(threadPoolIdx, task.messageIndex, task.req); + } catch (const faabric::scheduler::ExecutorMigratedException& ex) { + SPDLOG_TRACE("Task {} has been migrated.", msg.id()); + + returnValue = 0; + 
msg.set_outputdata( + "The execution of this message has been migrated"); } catch (const std::exception& ex) { returnValue = 1; @@ -539,12 +545,6 @@ void Executor::threadPoolThread(int threadPoolIdx) "Task {} threw exception. What: {}", msg.id(), ex.what()); SPDLOG_ERROR(errorMessage); msg.set_outputdata(errorMessage); - } catch (const faabric::scheduler::ExecutorMigratedException& ex) { - SPDLOG_TRACE("Task {} has been migrated.", msg.id()); - - returnValue = 0; - msg.set_outputdata( - "The execution of this message has been migrated"); } // Handle thread-local diffing for every thread @@ -768,7 +768,7 @@ void Executor::migrateFunction(const faabric::Message& msg, // Take snapshot and push to recepient host // TODO - could we just push the snapshot diffs here? auto snap = std::make_shared(getMemoryView()); - std::string snapKey = fmt::format("{}:{}", + std::string snapKey = fmt::format("{}_migration_{}", faabric::util::funcToString(msg, false), faabric::util::generateGid()); sch.getSnapshotClient(host).pushSnapshot(snapKey, snap); diff --git a/src/scheduler/FunctionMigrationThread.cpp b/src/scheduler/FunctionMigrationThread.cpp index a13274464..d72afe8ff 100644 --- a/src/scheduler/FunctionMigrationThread.cpp +++ b/src/scheduler/FunctionMigrationThread.cpp @@ -28,7 +28,7 @@ void FunctionMigrationThread::start(int wakeUpPeriodSecondsIn) // If we hit the timeout it means we have not been notified to // stop. Thus we check for migration opportunities. 
if (returnVal == std::cv_status::timeout) { - SPDLOG_DEBUG( + SPDLOG_TRACE( "Migration thread checking for migration opportunities"); faabric::scheduler::getScheduler() .checkForMigrationOpportunities(); diff --git a/src/scheduler/MpiWorld.cpp b/src/scheduler/MpiWorld.cpp index f23051736..c3130684e 100644 --- a/src/scheduler/MpiWorld.cpp +++ b/src/scheduler/MpiWorld.cpp @@ -1792,10 +1792,13 @@ void MpiWorld::tryMigrate(int thisRank) } if (mustShutdown) { + SPDLOG_INFO("MPI rank {} is being migrated", thisRank); destroy(); throw faabric::scheduler::ExecutorMigratedException( "Executor has been migrated"); } else { + SPDLOG_INFO("MPI rank {} is part of a migration, but not migrated", + thisRank); finishMigration(thisRank); } } @@ -1808,7 +1811,7 @@ void MpiWorld::prepareMigration( { // Check that there are no pending asynchronous messages to send and receive for (auto umb : unackedMessageBuffers) { - if (umb->size() > 0) { + if (umb != nullptr && umb->size() > 0) { SPDLOG_ERROR("Trying to migrate MPI application (id: {}) but rank" " {} has {} pending async messages to receive", thisRankMsg->appid(), diff --git a/src/scheduler/MpiWorldRegistry.cpp b/src/scheduler/MpiWorldRegistry.cpp index 5a2295188..e6b512b6a 100644 --- a/src/scheduler/MpiWorldRegistry.cpp +++ b/src/scheduler/MpiWorldRegistry.cpp @@ -52,9 +52,7 @@ MpiWorld& MpiWorldRegistry::getOrInitialiseWorld(faabric::Message& msg) { faabric::util::SharedLock lock(registryMutex); MpiWorld& world = worldMap[worldId]; - if (msg.recordexecgraph()) { - world.setMsgForRank(msg); - } + world.setMsgForRank(msg); return world; } } diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 45e78ba86..5a8586a28 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -452,6 +452,7 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( faabric::Message& firstMsg = req->mutable_messages()->at(0); std::string funcStr = faabric::util::funcToString(firstMsg, false); int nMessages 
= req->messages_size(); + bool isMigration = req->type() == faabric::BatchExecuteRequest::MIGRATION; if (decision.hosts.size() != nMessages) { SPDLOG_ERROR( @@ -467,7 +468,7 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( // period, we would have a default one (overwritable through an env. // variable), and apps would just opt in/out of being migrated. We set // the actual check period instead to ease with experiments. - if (firstMsg.migrationcheckperiod() > 0) { + if (!isMigration && firstMsg.migrationcheckperiod() > 0) { bool startMigrationThread = inFlightRequests.size() == 0; if (inFlightRequests.find(decision.appId) != inFlightRequests.end()) { @@ -553,7 +554,6 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( // ------------------------------------------- // SNAPSHOTS // ------------------------------------------- - bool isMigration = req->type() == faabric::BatchExecuteRequest::MIGRATION; // Push out snapshot diffs to registered hosts. We have to do this to // *all* hosts, regardless of whether they will be executing functions. 
From 446e90a6eddcd41f23657c956fbdd1259111483a Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Wed, 2 Feb 2022 17:19:37 +0000 Subject: [PATCH 20/26] cleanup --- include/faabric/scheduler/MpiWorld.h | 2 -- include/faabric/scheduler/Scheduler.h | 4 --- src/scheduler/Executor.cpp | 7 +++--- src/scheduler/MpiWorld.cpp | 9 ++----- src/scheduler/Scheduler.cpp | 1 - .../scheduler/test_function_migration.cpp | 25 +------------------ 6 files changed, 7 insertions(+), 41 deletions(-) diff --git a/include/faabric/scheduler/MpiWorld.h b/include/faabric/scheduler/MpiWorld.h index 9ba43f7da..94136b0dd 100644 --- a/include/faabric/scheduler/MpiWorld.h +++ b/include/faabric/scheduler/MpiWorld.h @@ -280,8 +280,6 @@ class MpiWorld int recvRank, int batchSize = 0); - void finishMigration(int thisRank); - /* Helper methods */ void checkRanksRange(int sendRank, int recvRank); diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index 3ab10685d..97b69d538 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -90,10 +90,6 @@ class Executor faabric::Message& msg, bool createIfNotExists = false); - // TODO - maybe remove me - bool doMigration( - std::shared_ptr pendingMigrations); - virtual std::span getMemoryView(); protected: diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp index c3a9ab1a4..d8bb9c191 100644 --- a/src/scheduler/Executor.cpp +++ b/src/scheduler/Executor.cpp @@ -545,10 +545,11 @@ void Executor::threadPoolThread(int threadPoolIdx) returnValue = -99; // MPI migration - if(msg.ismpi()) { - // TODO - when should we delete the pending migration? 
+ if (msg.ismpi()) { + // TODO - delete the pending migration auto& mpiWorld = - faabric::scheduler::getMpiWorldRegistry().getWorld(msg.mpiworldid()); + faabric::scheduler::getMpiWorldRegistry().getWorld( + msg.mpiworldid()); mpiWorld.destroy(); } } catch (const std::exception& ex) { diff --git a/src/scheduler/MpiWorld.cpp b/src/scheduler/MpiWorld.cpp index a3627ec43..0cbfa2284 100644 --- a/src/scheduler/MpiWorld.cpp +++ b/src/scheduler/MpiWorld.cpp @@ -1758,8 +1758,6 @@ void MpiWorld::checkRanksRange(int sendRank, int recvRank) } } -// Once per host we modify the per-world accounting of rank-to-hosts mappings -// and the corresponding ports for cross-host messaging. void MpiWorld::prepareMigration( int thisRank, std::shared_ptr pendingMigrations) @@ -1802,16 +1800,13 @@ void MpiWorld::prepareMigration( // TODO - merge with initLocalQueues for (const int sendRank : ranksForHost[thisHost]) { for (const int recvRank : ranksForHost[thisHost]) { - if (localQueues[getIndexForRanks(sendRank, recvRank)] == nullptr) { + if (localQueues[getIndexForRanks(sendRank, recvRank)] == + nullptr) { localQueues[getIndexForRanks(sendRank, recvRank)] = std::make_shared(); } } } - - // Lastly, remove the migrations from the pending migrations map - // TODO - when should we remove the pending migration from the map? 
- // getScheduler().removePendingMigration(thisRankMsg->appid()); } } } diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 7ecd76817..a9174b57d 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -1371,7 +1371,6 @@ Scheduler::doCheckForMigrationOpportunities( &(*(req->mutable_messages()->begin() + std::distance(originalDecision.hosts.begin(), right))); auto* migrationMsgPtr = migration->mutable_msg(); - // faabric::util::copyMessage(msgPtr, migrationMsgPtr); *migrationMsgPtr = *msgPtr; // Decrement by one the availability, and check for more // possible sources of migration diff --git a/tests/test/scheduler/test_function_migration.cpp b/tests/test/scheduler/test_function_migration.cpp index c3535a923..4ee305fde 100644 --- a/tests/test/scheduler/test_function_migration.cpp +++ b/tests/test/scheduler/test_function_migration.cpp @@ -6,7 +6,6 @@ #include #include #include -#include #include using namespace faabric::scheduler; @@ -86,8 +85,7 @@ class FunctionMigrationTestFixture : public SchedulerTestFixture for (auto pair : migrations) { auto* migration = expected.add_migrations(); auto* migrationMsg = migration->mutable_msg(); - faabric::util::copyMessage(&req->mutable_messages()->at(pair.first), - migrationMsg); + *migrationMsg = req->mutable_messages()->at(pair.first); migration->set_srchost(hosts.at(pair.first)); migration->set_dsthost(hosts.at(pair.second)); } @@ -449,26 +447,6 @@ TEST_CASE_METHOD(FunctionMigrationTestFixture, checkPendingMigrationsExpectation( expectedMigrations, actualMigrations, hosts, true); - // Check that certain MPI calls actually do the migration - SECTION("MPI barrier triggers a migration point") { world.barrier(0); } - - SECTION("MPI all reduce triggers a migration point") - { - std::vector messageData = { 0, 1, 2 }; - world.allReduce(0, - BYTES(messageData.data()), - BYTES(messageData.data()), - MPI_INT, - messageData.size(), - MPI_SUM); - } - - // When performing the migration, MPI 
will remove it from the pending - // migrations map - REQUIRE(sch.getPendingAppMigrations(appId) == nullptr); - checkPendingMigrationsExpectation( - expectedMigrations, getMpiMockedPendingMigrations().front(), hosts, true); - faabric::Message res = sch.getFunctionResult(firstMsg->id(), 2 * timeToSleep); REQUIRE(res.returnvalue() == 0); @@ -476,5 +454,4 @@ TEST_CASE_METHOD(FunctionMigrationTestFixture, // Clean up world.destroy(); } - } From c090a16bf5d34595f7c38497881e35723c3386e9 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Thu, 3 Feb 2022 10:41:24 +0000 Subject: [PATCH 21/26] mpi: use initLocalQueues() instead of repeating the logic, remove assertion from the method --- src/scheduler/MpiWorld.cpp | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/src/scheduler/MpiWorld.cpp b/src/scheduler/MpiWorld.cpp index 0cbfa2284..43869bbae 100644 --- a/src/scheduler/MpiWorld.cpp +++ b/src/scheduler/MpiWorld.cpp @@ -1579,8 +1579,6 @@ std::shared_ptr MpiWorld::getLocalQueue(int sendRank, // Note - the queues themselves perform concurrency control void MpiWorld::initLocalQueues() { - // Assert we only allocate queues once - assert(localQueues.size() == 0); localQueues.resize(size * size); for (const int sendRank : ranksForHost[thisHost]) { for (const int recvRank : ranksForHost[thisHost]) { @@ -1797,16 +1795,7 @@ void MpiWorld::prepareMigration( initLocalRemoteLeaders(); // Add the necessary new local messaging queues - // TODO - merge with initLocalQueues - for (const int sendRank : ranksForHost[thisHost]) { - for (const int recvRank : ranksForHost[thisHost]) { - if (localQueues[getIndexForRanks(sendRank, recvRank)] == - nullptr) { - localQueues[getIndexForRanks(sendRank, recvRank)] = - std::make_shared(); - } - } - } + initLocalQueues(); } } } From 51dcc9592322cf2f892471a0e62bc0b159560ccc Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Thu, 3 Feb 2022 10:50:27 +0000 Subject: [PATCH 22/26] mpi: remove unused getMpiMockedPendingMigrations 
--- include/faabric/scheduler/MpiWorld.h | 3 --- src/scheduler/MpiWorld.cpp | 11 ----------- 2 files changed, 14 deletions(-) diff --git a/include/faabric/scheduler/MpiWorld.h b/include/faabric/scheduler/MpiWorld.h index 94136b0dd..25c62e414 100644 --- a/include/faabric/scheduler/MpiWorld.h +++ b/include/faabric/scheduler/MpiWorld.h @@ -31,9 +31,6 @@ std::vector getMpiHostsToRanksMessages(); std::vector> getMpiMockedMessages( int sendRank); -std::vector> -getMpiMockedPendingMigrations(); - typedef faabric::util::FixedCapacityQueue> InMemoryMpiQueue; diff --git a/src/scheduler/MpiWorld.cpp b/src/scheduler/MpiWorld.cpp index 43869bbae..b9b811846 100644 --- a/src/scheduler/MpiWorld.cpp +++ b/src/scheduler/MpiWorld.cpp @@ -53,9 +53,6 @@ static std::vector rankMessages; static std::map>> mpiMockedMessages; -static std::vector> - mpiMockedPendingMigrations; - std::vector getMpiHostsToRanksMessages() { faabric::util::UniqueLock lock(mockMutex); @@ -69,13 +66,6 @@ std::vector> getMpiMockedMessages( return mpiMockedMessages[sendRank]; } -std::vector> -getMpiMockedPendingMigrations() -{ - faabric::util::UniqueLock lock(mockMutex); - return mpiMockedPendingMigrations; -} - MpiWorld::MpiWorld() : thisHost(faabric::util::getSystemConfig().endpointHost) , basePort(faabric::util::getSystemConfig().mpiBasePort) @@ -350,7 +340,6 @@ void MpiWorld::destroy() // Clear structures used for mocking rankMessages.clear(); mpiMockedMessages.clear(); - mpiMockedPendingMigrations.clear(); } void MpiWorld::initialiseFromMsg(faabric::Message& msg) From ba44bdd03bdd409172bc55492b6136e1b6e19457 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Thu, 3 Feb 2022 11:23:00 +0000 Subject: [PATCH 23/26] scheduler: remove unused function declarations --- include/faabric/scheduler/Scheduler.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index 97b69d538..0c843047e 100644 --- 
a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -23,14 +23,6 @@ namespace faabric::scheduler { -class ExecutorMigratedException : public faabric::util::FaabricException -{ - public: - explicit ExecutorMigratedException(std::string message) - : FaabricException(std::move(message)) - {} -}; - typedef std::pair, std::shared_ptr> InFlightPair; @@ -109,8 +101,6 @@ class Executor uint32_t threadPoolSize = 0; - void migrateFunction(const faabric::Message& msg, const std::string& host); - private: std::atomic claimed = false; From 28d143e81d6a7c1ca113db8ec20b0546af326e29 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Thu, 3 Feb 2022 16:12:44 +0000 Subject: [PATCH 24/26] scheduler: factor out method to start the function migration thread if necessary --- include/faabric/scheduler/Scheduler.h | 4 + src/scheduler/Scheduler.cpp | 111 ++++++++++++++------------ 2 files changed, 64 insertions(+), 51 deletions(-) diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index 0c843047e..ba24cb0d4 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -323,6 +323,10 @@ class Scheduler void broadcastPendingMigrations( std::shared_ptr pendingMigrations); + + void doStartFunctionMigrationThread( + std::shared_ptr req, + faabric::util::SchedulingDecision& decision); }; } diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index a9174b57d..b7b1f5a54 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -463,58 +463,8 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( } // Record in-flight request if function desires to be migrated - // NOTE: ideally, instead of allowing the applications to specify a check - // period, we would have a default one (overwritable through an env. - // variable), and apps would just opt in/out of being migrated. We set - // the actual check period instead to ease with experiments. 
if (!isMigration && firstMsg.migrationcheckperiod() > 0) { - bool startMigrationThread = inFlightRequests.size() == 0; - - if (inFlightRequests.find(decision.appId) != inFlightRequests.end()) { - // MPI applications are made up of two different requests: the - // original one (with one message) and the second one (with - // world size - 1 messages) created during world creation time. - // Thus, to correctly track migration opportunities we must merge - // both. We append the batch request to the original one (instead - // of the other way around) not to affect the rest of this methods - // functionality. - if (firstMsg.ismpi()) { - startMigrationThread = false; - auto originalReq = inFlightRequests[decision.appId].first; - auto originalDecision = inFlightRequests[decision.appId].second; - assert(req->messages_size() == firstMsg.mpiworldsize() - 1); - for (int i = 0; i < firstMsg.mpiworldsize() - 1; i++) { - // Append message to original request - auto* newMsgPtr = originalReq->add_messages(); - *newMsgPtr = req->messages().at(i); - - // Append message to original decision - originalDecision->addMessage(decision.hosts.at(i), - req->messages().at(i)); - } - } else { - SPDLOG_ERROR("There is already an in-flight request for app {}", - firstMsg.appid()); - throw std::runtime_error("App already in-flight"); - } - } else { - auto decisionPtr = - std::make_shared(decision); - inFlightRequests[decision.appId] = std::make_pair(req, decisionPtr); - } - - // Decide wether we have to start the migration thread or not - if (startMigrationThread) { - functionMigrationThread.start(firstMsg.migrationcheckperiod()); - } else if (firstMsg.migrationcheckperiod() != - functionMigrationThread.wakeUpPeriodSeconds) { - SPDLOG_WARN("Ignoring migration check period for app {} as the" - "migration thread is already running with a different" - " check period (provided: {}, current: {})", - firstMsg.appid(), - firstMsg.migrationcheckperiod(), - functionMigrationThread.wakeUpPeriodSeconds); - 
} + doStartFunctionMigrationThread(req, decision); } // NOTE: we want to schedule things on this host _last_, otherwise functions @@ -1396,4 +1346,63 @@ Scheduler::doCheckForMigrationOpportunities( return pendingMigrationsVec; } + +// Start the function migration thread if necessary +// NOTE: ideally, instead of allowing the applications to specify a check +// period, we would have a default one (overwritable through an env. +// variable), and apps would just opt in/out of being migrated. We set +// the actual check period instead to ease with experiments. +void Scheduler::doStartFunctionMigrationThread( + std::shared_ptr req, + faabric::util::SchedulingDecision& decision) +{ + bool startMigrationThread = inFlightRequests.size() == 0; + faabric::Message& firstMsg = req->mutable_messages()->at(0); + + if (inFlightRequests.find(decision.appId) != inFlightRequests.end()) { + // MPI applications are made up of two different requests: the + // original one (with one message) and the second one (with + // world size - 1 messages) created during world creation time. + // Thus, to correctly track migration opportunities we must merge + // both. We append the batch request to the original one (instead + // of the other way around) not to affect the rest of this methods + // functionality. 
+ if (firstMsg.ismpi()) { + startMigrationThread = false; + auto originalReq = inFlightRequests[decision.appId].first; + auto originalDecision = inFlightRequests[decision.appId].second; + assert(req->messages_size() == firstMsg.mpiworldsize() - 1); + for (int i = 0; i < firstMsg.mpiworldsize() - 1; i++) { + // Append message to original request + auto* newMsgPtr = originalReq->add_messages(); + *newMsgPtr = req->messages().at(i); + + // Append message to original decision + originalDecision->addMessage(decision.hosts.at(i), + req->messages().at(i)); + } + } else { + SPDLOG_ERROR("There is already an in-flight request for app {}", + firstMsg.appid()); + throw std::runtime_error("App already in-flight"); + } + } else { + auto decisionPtr = + std::make_shared(decision); + inFlightRequests[decision.appId] = std::make_pair(req, decisionPtr); + } + + // Decide wether we have to start the migration thread or not + if (startMigrationThread) { + functionMigrationThread.start(firstMsg.migrationcheckperiod()); + } else if (firstMsg.migrationcheckperiod() != + functionMigrationThread.wakeUpPeriodSeconds) { + SPDLOG_WARN("Ignoring migration check period for app {} as the" + "migration thread is already running with a different" + " check period (provided: {}, current: {})", + firstMsg.appid(), + firstMsg.migrationcheckperiod(), + functionMigrationThread.wakeUpPeriodSeconds); + } +} } From 54ab7b4bcace3e7432c062aeb2ac3973fb9bb659 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Thu, 3 Feb 2022 16:14:08 +0000 Subject: [PATCH 25/26] mpi: use a boolean flag to indicate that app has been migrated, and check for flag in barriers to remove used pendingMigrations in the scheduler --- include/faabric/scheduler/MpiWorld.h | 3 +++ src/scheduler/Executor.cpp | 3 +-- src/scheduler/MpiWorld.cpp | 15 +++++++++++++++ 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/include/faabric/scheduler/MpiWorld.h b/include/faabric/scheduler/MpiWorld.h index 25c62e414..1b85655ef 100644 --- 
a/include/faabric/scheduler/MpiWorld.h +++ b/include/faabric/scheduler/MpiWorld.h @@ -289,5 +289,8 @@ class MpiWorld MPI_Status* status, faabric::MPIMessage::MPIMessageType messageType = faabric::MPIMessage::NORMAL); + + /* Function migration */ + bool hasBeenMigrated = false; }; } diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp index d8bb9c191..305ca58b3 100644 --- a/src/scheduler/Executor.cpp +++ b/src/scheduler/Executor.cpp @@ -1,5 +1,5 @@ -#include "faabric/scheduler/MpiWorldRegistry.h" #include +#include #include #include #include @@ -546,7 +546,6 @@ void Executor::threadPoolThread(int threadPoolIdx) // MPI migration if (msg.ismpi()) { - // TODO - delete the pending migration auto& mpiWorld = faabric::scheduler::getMpiWorldRegistry().getWorld( msg.mpiworldid()); diff --git a/src/scheduler/MpiWorld.cpp b/src/scheduler/MpiWorld.cpp index b9b811846..bc3e88856 100644 --- a/src/scheduler/MpiWorld.cpp +++ b/src/scheduler/MpiWorld.cpp @@ -1547,6 +1547,18 @@ void MpiWorld::barrier(int thisRank) thisRank, 0, nullptr, MPI_INT, 0, faabric::MPIMessage::BARRIER_JOIN); } + if (thisRank == localLeader && hasBeenMigrated) { + hasBeenMigrated = false; + if (thisRankMsg != nullptr) { + faabric::scheduler::getScheduler().removePendingMigration( + thisRankMsg->appid()); + } else { + SPDLOG_ERROR("App has been migrated but rank ({}) message not set", + thisRank); + throw std::runtime_error("App migrated but rank message not set"); + } + } + // Rank 0 broadcasts that the barrier is done (the others block here) broadcast( 0, thisRank, nullptr, MPI_INT, 0, faabric::MPIMessage::BARRIER_DONE); @@ -1779,6 +1791,9 @@ void MpiWorld::prepareMigration( hostForRank.at(m.msg().mpirank()) = m.dsthost(); } + // Set the migration flag + hasBeenMigrated = true; + // Reset the internal mappings. 
initLocalBasePorts(hostForRank); initLocalRemoteLeaders(); From 3bc208927b52dfee3cc71407b86dc6afda76a803 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Thu, 3 Feb 2022 17:09:52 +0000 Subject: [PATCH 26/26] proto: make topologyHint a message field, add test for json serialisation, remove unnecessary field to callFunctions, and re-factor necessary calls to callFunctions --- include/faabric/scheduler/Scheduler.h | 4 +--- include/faabric/util/scheduling.h | 19 +++++++++++++++++++ src/proto/faabric.proto | 3 +++ src/scheduler/FunctionCallServer.cpp | 5 +++-- src/scheduler/MpiWorld.cpp | 10 +--------- src/scheduler/Scheduler.cpp | 17 +++++++++++------ src/util/json.cpp | 9 +++++++++ tests/test/scheduler/test_executor.cpp | 9 ++++----- tests/test/scheduler/test_scheduler.cpp | 9 ++------- .../scheduler/test_scheduling_decisions.cpp | 11 +++++++++-- tests/test/util/test_json.cpp | 2 ++ 11 files changed, 64 insertions(+), 34 deletions(-) diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index ba24cb0d4..147797382 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -136,9 +136,7 @@ class Scheduler void callFunction(faabric::Message& msg, bool forceLocal = false); faabric::util::SchedulingDecision callFunctions( - std::shared_ptr req, - faabric::util::SchedulingTopologyHint = - faabric::util::SchedulingTopologyHint::NORMAL); + std::shared_ptr req); faabric::util::SchedulingDecision callFunctions( std::shared_ptr req, diff --git a/include/faabric/util/scheduling.h b/include/faabric/util/scheduling.h index 71bab4f8c..0e5c94325 100644 --- a/include/faabric/util/scheduling.h +++ b/include/faabric/util/scheduling.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -60,6 +61,24 @@ enum SchedulingTopologyHint UNDERFULL, }; +// Map to convert input strings to scheduling topology hints and the other way +// around +const std::unordered_map + strToTopologyHint = { + { "NORMAL", 
SchedulingTopologyHint::NORMAL }, + { "FORCE_LOCAL", SchedulingTopologyHint::FORCE_LOCAL }, + { "NEVER_ALONE", SchedulingTopologyHint::NEVER_ALONE }, + { "UNDERFULL", SchedulingTopologyHint::UNDERFULL }, + }; + +const std::unordered_map + topologyHintToStr = { + { SchedulingTopologyHint::NORMAL, "NORMAL" }, + { SchedulingTopologyHint::FORCE_LOCAL, "FORCE_LOCAL" }, + { SchedulingTopologyHint::NEVER_ALONE, "NEVER_ALONE" }, + { SchedulingTopologyHint::UNDERFULL, "UNDERFULL" }, + }; + // Migration strategies help the scheduler decide wether the scheduling decision // for a batch request could be changed with the new set of available resources. // - BIN_PACK: sort hosts by the number of functions from the batch they are diff --git a/src/proto/faabric.proto b/src/proto/faabric.proto index e295e038c..e5b705a31 100644 --- a/src/proto/faabric.proto +++ b/src/proto/faabric.proto @@ -168,6 +168,9 @@ message Message { // Function migration int32 migrationCheckPeriod = 44; + + // Scheduling + string topologyHint = 45; } // --------------------------------------------- diff --git a/src/scheduler/FunctionCallServer.cpp b/src/scheduler/FunctionCallServer.cpp index 550dc5ded..e9beb13c1 100644 --- a/src/scheduler/FunctionCallServer.cpp +++ b/src/scheduler/FunctionCallServer.cpp @@ -79,8 +79,9 @@ void FunctionCallServer::recvExecuteFunctions(const uint8_t* buffer, // This host has now been told to execute these functions no matter what // TODO - avoid this copy - scheduler.callFunctions(std::make_shared(msg), - faabric::util::SchedulingTopologyHint::FORCE_LOCAL); + msg.mutable_messages()->at(0).set_topologyhint("FORCE_LOCAL"); + scheduler.callFunctions( + std::make_shared(msg)); } void FunctionCallServer::recvUnregister(const uint8_t* buffer, diff --git a/src/scheduler/MpiWorld.cpp b/src/scheduler/MpiWorld.cpp index bc3e88856..e3f2fb14b 100644 --- a/src/scheduler/MpiWorld.cpp +++ b/src/scheduler/MpiWorld.cpp @@ -238,15 +238,7 @@ void MpiWorld::create(faabric::Message& call, int 
newId, int newSize) std::vector executedAt; if (size > 1) { - // 10/01/22 - We add this check to force different scheduling behaviour - // if running migration experiments. The check can be deleted - // afterwards. - bool mustUnderfull = - thisRankMsg != nullptr && thisRankMsg->migrationcheckperiod() > 0; - faabric::util::SchedulingDecision decision = sch.callFunctions( - req, - mustUnderfull ? faabric::util::SchedulingTopologyHint::UNDERFULL - : faabric::util::SchedulingTopologyHint::NORMAL); + faabric::util::SchedulingDecision decision = sch.callFunctions(req); executedAt = decision.hosts; } assert(executedAt.size() == size - 1); diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index b7b1f5a54..c9ad06ec7 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -239,8 +239,7 @@ void Scheduler::notifyExecutorShutdown(Executor* exec, } faabric::util::SchedulingDecision Scheduler::callFunctions( - std::shared_ptr req, - faabric::util::SchedulingTopologyHint topologyHint) + std::shared_ptr req) { // Note, we assume all the messages are for the same function and have the // same master host @@ -252,6 +251,12 @@ faabric::util::SchedulingDecision Scheduler::callFunctions( throw std::runtime_error("Message with no master host"); } + // Get topology hint from message + faabric::util::SchedulingTopologyHint topologyHint = + firstMsg.topologyhint().empty() + ? faabric::util::SchedulingTopologyHint::NORMAL + : faabric::util::strToTopologyHint.at(firstMsg.topologyhint()); + // If we're not the master host, we need to forward the request back to the // master host. This will only happen if a nested batch execution happens. 
if (topologyHint != faabric::util::SchedulingTopologyHint::FORCE_LOCAL && @@ -744,12 +749,12 @@ void Scheduler::callFunction(faabric::Message& msg, bool forceLocal) // Specify that this is a normal function, not a thread req->set_type(req->FUNCTIONS); - // Make the call if (forceLocal) { - callFunctions(req, faabric::util::SchedulingTopologyHint::FORCE_LOCAL); - } else { - callFunctions(req); + req->mutable_messages()->at(0).set_topologyhint("FORCE_LOCAL"); } + + // Make the call + callFunctions(req); } void Scheduler::clearRecordedMessages() diff --git a/src/util/json.cpp b/src/util/json.cpp index a4e2f2904..38d6061e7 100644 --- a/src/util/json.cpp +++ b/src/util/json.cpp @@ -232,6 +232,13 @@ std::string messageToJson(const faabric::Message& msg) d.AddMember("migration_check_period", msg.migrationcheckperiod(), a); } + if (!msg.topologyhint().empty()) { + d.AddMember( + "topology_hint", + Value(msg.topologyhint().c_str(), msg.topologyhint().size()).Move(), + a); + } + StringBuffer sb; Writer writer(sb); d.Accept(writer); @@ -446,6 +453,8 @@ faabric::Message jsonToMessage(const std::string& jsonIn) msg.set_migrationcheckperiod( getIntFromJson(d, "migration_check_period", 0)); + msg.set_topologyhint(getStringFromJson(d, "topology_hint", "NORMAL")); + PROF_END(jsonDecode) return msg; diff --git a/tests/test/scheduler/test_executor.cpp b/tests/test/scheduler/test_executor.cpp index 1c0e812c9..fe62c31e3 100644 --- a/tests/test/scheduler/test_executor.cpp +++ b/tests/test/scheduler/test_executor.cpp @@ -274,14 +274,12 @@ class TestExecutorFixture conf.overrideCpuCount = 10; conf.boundTimeout = SHORT_TEST_TIMEOUT_MS; - faabric::util::SchedulingTopologyHint topologyHint = - faabric::util::SchedulingTopologyHint::NORMAL; if (forceLocal) { - topologyHint = faabric::util::SchedulingTopologyHint::FORCE_LOCAL; + req->mutable_messages()->at(0).set_topologyhint("FORCE_LOCAL"); } - return sch.callFunctions(req, topologyHint).hosts; + return sch.callFunctions(req).hosts; } }; @@ 
-867,7 +865,8 @@ TEST_CASE_METHOD(TestExecutorFixture, } // Call functions and force to execute locally - sch.callFunctions(req, faabric::util::SchedulingTopologyHint::FORCE_LOCAL); + req->mutable_messages()->at(0).set_topologyhint("FORCE_LOCAL"); + sch.callFunctions(req); // Await execution for (auto& m : *req->mutable_messages()) { diff --git a/tests/test/scheduler/test_scheduler.cpp b/tests/test/scheduler/test_scheduler.cpp index be67f9afe..142d446bf 100644 --- a/tests/test/scheduler/test_scheduler.cpp +++ b/tests/test/scheduler/test_scheduler.cpp @@ -1002,17 +1002,12 @@ TEST_CASE_METHOD(DummyExecutorFixture, expectedDecision.addMessage(expectedHosts.at(i), req->messages().at(i)); } - // Set topology hint - faabric::util::SchedulingTopologyHint topologyHint = - faabric::util::SchedulingTopologyHint::NORMAL; - if (forceLocal) { - topologyHint = faabric::util::SchedulingTopologyHint::FORCE_LOCAL; + req->mutable_messages()->at(0).set_topologyhint("FORCE_LOCAL"); } // Schedule and check decision - faabric::util::SchedulingDecision actualDecision = - sch.callFunctions(req, topologyHint); + faabric::util::SchedulingDecision actualDecision = sch.callFunctions(req); checkSchedulingDecisionEquality(expectedDecision, actualDecision); // Check mappings set up locally or not diff --git a/tests/test/scheduler/test_scheduling_decisions.cpp b/tests/test/scheduler/test_scheduling_decisions.cpp index 0a6f7258c..4f985a6cc 100644 --- a/tests/test/scheduler/test_scheduling_decisions.cpp +++ b/tests/test/scheduler/test_scheduling_decisions.cpp @@ -97,9 +97,13 @@ class SchedulingDecisionTestFixture : public SchedulerTestFixture // Set resources for all hosts setHostResources(config.hosts, config.slots); + // Set topology hint in request + req->mutable_messages()->at(0).set_topologyhint( + faabric::util::topologyHintToStr.at(config.topologyHint)); + // The first time we run the batch request, we will follow the // unregistered hosts path - actualDecision = sch.callFunctions(req, 
config.topologyHint); + actualDecision = sch.callFunctions(req); REQUIRE(actualDecision.hosts == config.expectedHosts); checkRecordedBatchMessages(actualDecision, config); @@ -114,9 +118,12 @@ class SchedulingDecisionTestFixture : public SchedulerTestFixture faabric::util::batchExecFactory("foo", "baz", req->messages_size()); setHostResources(config.hosts, config.slots); + reqCopy->mutable_messages()->at(0).set_topologyhint( + faabric::util::topologyHintToStr.at(config.topologyHint)); + // The second time we run the batch request, we will follow // the registered hosts path - actualDecision = sch.callFunctions(reqCopy, config.topologyHint); + actualDecision = sch.callFunctions(reqCopy); REQUIRE(actualDecision.hosts == config.expectedHosts); checkRecordedBatchMessages(actualDecision, config); } diff --git a/tests/test/util/test_json.cpp b/tests/test/util/test_json.cpp index b38b82948..2fefc6ed3 100644 --- a/tests/test/util/test_json.cpp +++ b/tests/test/util/test_json.cpp @@ -49,6 +49,8 @@ TEST_CASE("Test message to JSON round trip", "[util]") msg.set_migrationcheckperiod(33); + msg.set_topologyhint("TEST_TOPOLOGY_HINT"); + SECTION("Dodgy characters") { msg.set_inputdata("[0], %$ 2233 9"); } SECTION("Bytes")