Skip to content

Commit

Permalink
Merge pull request #671 from flyingcircusio/PL-129777-agent-maintenan…
Browse files Browse the repository at this point in the history
…ce-scheduling

fc-agent: improve maintenance scheduling
  • Loading branch information
osnyx committed Jul 19, 2023
2 parents 552f529 + c3abf47 commit 9556b59
Show file tree
Hide file tree
Showing 45 changed files with 2,612 additions and 1,320 deletions.
38 changes: 34 additions & 4 deletions nixos/platform/agent.nix
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,32 @@ in
'';
};

maintenancePreparationSeconds = mkOption {
default = 300;
description = ''
Expected time in seconds needed to prepare for the execution of
maintenance activities. The value should cover typical cases where
maintenance-enter commands have to do extra work or wait for some
condition to be true. These commands should typically not take
longer than 5 minutes in total which is the default here.
If commands are expected to take longer and it's not feasible to
pause them after 5 minutes and continue later (using TEMPFAIL), the
preparation time can be increased as needed.
Currently, the directory doesn't know about "preparation time" as a
separate concept, so this value is just added to the estimated run
time for each activity. This overestimates the actual preparation
time if multiple activities are scheduled continuously because
maintenance-enter commands are just run once for all runnable
activities.
We don't enforce it at the moment but will probably add a timeout
for maintenance-enter commands later based on this value.
'';
type = types.ints.positive;
};

};
};

Expand Down Expand Up @@ -162,6 +188,9 @@ in
];

environment.etc."fc-agent.conf".text = ''
[maintenance]
preparation_seconds = ${toString cfg.agent.maintenancePreparationSeconds}
[maintenance-enter]
${concatStringsSep "\n" (
mapAttrsToList (k: v: "${k} = ${v.enter}") cfg.agent.maintenance)}
Expand Down Expand Up @@ -195,7 +224,7 @@ in

script =
let
verbose = lib.optionalString cfg.agent.verbose "--verbose";
verbose = lib.optionalString cfg.agent.verbose "--show-caller-info";
options = "--enc-path=${cfg.encPath} ${verbose}";
wrappedExtraCommands = lib.optionalString (cfg.agent.extraCommands != "") ''
(
Expand Down Expand Up @@ -240,11 +269,12 @@ in
IOWeight = 10; # 1-10000
ExecStart =
let
verbose = lib.optionalString cfg.agent.verbose "--verbose";
verbose = lib.optionalString cfg.agent.verbose "--show-caller-info";
options = "--enc-path=${cfg.encPath} ${verbose}";
runNow = lib.optionalString (!cfg.agent.updateInMaintenance) "--run-now";
in
"${pkgs.fc.agent}/bin/fc-maintenance ${options} request update ${runNow}";
if cfg.agent.updateInMaintenance
then "${pkgs.fc.agent}/bin/fc-maintenance ${options} request update"
else "${pkgs.fc.agent}/bin/fc-manage ${options} switch --update-channel --lazy";
};

path = commonEnvPath;
Expand Down
4 changes: 2 additions & 2 deletions pkgs/default.nix
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# Collection of own packages
{ pkgs }:
{ pkgs, pythonPackages }:

let
self = {
callPackage = pkgs.newScope self;

fc = import ./fc {
inherit (self) callPackage;
inherit pkgs;
inherit pkgs pythonPackages;
};

};
Expand Down
16 changes: 11 additions & 5 deletions pkgs/fc/agent/default.nix
Original file line number Diff line number Diff line change
@@ -1,19 +1,24 @@
{ lib
, stdenv
, fetchPypi
, fetchFromGitHub
, dmidecode
, gitMinimal
, gptfdisk
, libyaml
, multipath-tools
, nix
, python310
, buildPythonPackage
, pythonPackages
, python
, util-linux
, xfsprogs
, pytest
, structlog
}:

let
py = python310.pkgs;
py = pythonPackages;

pytest-structlog = py.buildPythonPackage rec {
pname = "pytest-structlog";
Expand All @@ -26,11 +31,11 @@ let
sha256 = "00g2ivgj4y398d0y60lk710zz62pj80r9ya3b4iqijkp4j8nh4gp";
};

buildInputs = [ py.pytest py.structlog ];
buildInputs = [ pytest structlog ];
};

in
py.buildPythonPackage rec {
buildPythonPackage rec {
name = "fc-agent-${version}";
version = "1.0";
namePrefix = "";
Expand All @@ -40,6 +45,7 @@ py.buildPythonPackage rec {
py.freezegun
py.pytest-cov
py.responses
py.pytest
py.pytest-mock
py.pytest-subprocess
pytest-structlog
Expand Down Expand Up @@ -71,6 +77,6 @@ py.buildPythonPackage rec {
xfsprogs
];
dontStrip = true;
passthru.pythonDevEnv = python310.withPackages (_: checkInputs ++ propagatedBuildInputs);
passthru.pythonDevEnv = python.withPackages (_: checkInputs ++ propagatedBuildInputs);

}
38 changes: 26 additions & 12 deletions pkgs/fc/agent/fc/conftest.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import contextlib
import shutil
import textwrap
import unittest
import uuid
from pathlib import Path

import responses
import shortuuid
import structlog
from fc.maintenance.activity import Activity
Expand All @@ -12,8 +15,8 @@


@fixture
def agent_maintenance_config(tmpdir):
config_file = str(tmpdir / "fc-agent.conf")
def agent_maintenance_config(tmp_path):
config_file = str(tmp_path / "fc-agent.conf")
with open(config_file, "w") as f:
f.write(
textwrap.dedent(
Expand All @@ -31,27 +34,32 @@ def agent_maintenance_config(tmpdir):


@fixture
def reqmanager(tmpdir, agent_maintenance_config):
with ReqManager(
Path(tmpdir), config_file=Path(agent_maintenance_config)
) as rm:
yield rm
def reqmanager(tmp_path, agent_maintenance_config):
spooldir = tmp_path / "maintenance"
spooldir.mkdir()
enc_path = tmp_path / "enc.json"
enc_path.write_text("{}")
with unittest.mock.patch("fc.util.directory.connect"):
with ReqManager(
spooldir=spooldir,
enc_path=enc_path,
config_file=agent_maintenance_config,
) as rm:
yield rm


@fixture
def request_population(tmpdir, agent_maintenance_config):
def request_population(tmp_path, agent_maintenance_config, reqmanager):
@contextlib.contextmanager
def _request_population(n):
"""Creates a ReqManager with a pregenerated population of N requests.
The ReqManager and a list of Requests are passed to the calling code.
"""
with ReqManager(
Path(tmpdir), config_file=Path(agent_maintenance_config)
) as reqmanager:
with reqmanager:
requests = []
for i in range(n):
req = Request(Activity(), 60, comment=str(i))
req = Request(Activity(), comment=str(i))
req._reqid = shortuuid.encode(uuid.UUID(int=i))
reqmanager.add(req)
requests.append(req)
Expand All @@ -65,3 +73,9 @@ def logger():
_logger = structlog.get_logger()
_logger.trace = lambda *a, **k: None
return _logger


@fixture
def mocked_responses():
with responses.RequestsMock() as rsps:
yield rsps
63 changes: 49 additions & 14 deletions pkgs/fc/agent/fc/maintenance/activity/__init__.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,48 @@
"""Base class for maintenance activities."""
from enum import Enum
from typing import NamedTuple, Optional

import structlog
from fc.maintenance.estimate import Estimate

class RebootType(Enum):
WARM = 1
COLD = 2

class RebootType(str, Enum):
WARM = "reboot"
COLD = "poweroff"


class ActivityMergeResult(NamedTuple):
merged: Optional["Activity"] = None
is_effective: bool = False
is_significant: bool = False
changes: dict = {}


class Activity:
"""Maintenance activity which is executed as request payload.
Activities are executed possibly several times until they succeed or
exceeed their retry limit. Individual maintenance activities should
exceed their retry limit. Individual maintenance activities should
subclass this class and add custom behaviour to its methods.
Attributes: `stdout`, `stderr` capture the outcomes of shellouts.
Attributes: `stdout`, `stderr` capture the outcomes of shell-outs.
`returncode` controls the resulting request state. If `duration` is
set, it overrules execution timing done by the calling scope. Use
this if a logical transaction spans several attempts, e.g. for
reboots.
"""

stdout = None
stderr = None
returncode = None
duration = None
request = None # backpointer, will be set in Request
reboot_needed = None
stdout: None | str = None
stderr: None | str = None
returncode: None | int = None
duration: None | float = None
request = None # back-pointer, will be set in Request
reboot_needed: None | RebootType = None
# Do we predict that this activity will actually change anything?
is_effective = True
comment = ""
estimate = Estimate("10m")
log = None

def __init__(self):
"""Creates activity object (add args if you like).
Expand All @@ -41,10 +57,12 @@ def __getstate__(self):
# Deserializing loggers breaks, remove them before serializing (to YAML).
if "log" in state:
del state["log"]
if "request" in state:
del state["request"]
return state

def set_up_logging(self, log):
self.log = log
self.log = log.bind(activity_type=self.__class__.__name__)

def run(self):
"""Executes maintenance activity.
Expand All @@ -53,8 +71,8 @@ def run(self):
whatever you want here, but do not destruct `request.yaml`.
Directory contents is preserved between several attempts.
This method is expected to update self.stdout, self.stderr, and
self.returncode after each run. Request state is determined
This method is expected to update `self.stdout`, `self.stderr`, and
`self.returncode` after each run. Request state is determined
according to the EXIT_* constants in `state.py`. Any returncode
not listed there means hard failure and causes the request to be
archived. Uncaught exceptions are handled the same way.
Expand All @@ -74,3 +92,20 @@ def load(self):
def dump(self):
"""Saves additional state during serialization."""
pass

def merge(self, other) -> ActivityMergeResult:
"""Merges in other activity. Settings from other have precedence.
Returns merge result.
"""
return ActivityMergeResult()

def __rich__(self):
cls = self.__class__
out = f"{cls.__module__}.{cls.__qualname__}"
match self.reboot_needed:
case RebootType.WARM:
out += " (warm reboot needed)"
case RebootType.COLD:
out += " (cold reboot needed)"

return out
75 changes: 75 additions & 0 deletions pkgs/fc/agent/fc/maintenance/activity/reboot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""Scheduled machine reboot.
This activity does nothing if the machine has been booted for another reason in
the time between creation and execution.
"""

from typing import Union

import structlog

from ..estimate import Estimate
from . import Activity, ActivityMergeResult, RebootType

_log = structlog.get_logger()


class RebootActivity(Activity):
estimate = Estimate("5m")

def __init__(
self, action: Union[str, RebootType] = RebootType.WARM, log=_log
):
super().__init__()
self.set_up_logging(log)
self.reboot_needed = RebootType(action)

@property
def comment(self):
return "Scheduled {}".format(
"cold boot" if self.reboot_needed == RebootType.COLD else "reboot"
)

def merge(self, other):
if not isinstance(other, RebootActivity):
self.log.debug(
"merge-incompatible-skip",
self_type=type(self),
other_type=type(other),
)
return ActivityMergeResult()

if self.reboot_needed == other.reboot_needed:
self.log.debug("merge-reboot-identical")
return ActivityMergeResult(self, is_effective=True)

if (
self.reboot_needed == RebootType.COLD
and other.reboot_needed == RebootType.WARM
):
self.log.debug(
"merge-reboot-cold-warm",
help=(
"merging a warm reboot into a cold reboot results in a "
"cold reboot."
),
)
return ActivityMergeResult(self, is_effective=True)

if (
self.reboot_needed == RebootType.WARM
and other.reboot_needed == RebootType.COLD
):
self.log.debug(
"merge-reboot-warm-to-cold",
help=(
"merging a cold reboot into a warm reboot results in a "
"cold reboot. This is a significant change."
),
)
return ActivityMergeResult(
self,
is_effective=True,
is_significant=True,
changes={"before": RebootType.WARM, "after": RebootType.COLD},
)
Loading

0 comments on commit 9556b59

Please sign in to comment.