Skip to content

Commit

Permalink
wip nix error handling
Browse files Browse the repository at this point in the history
  • Loading branch information
dpausp committed Apr 18, 2023
1 parent 57fc9c5 commit dd95e15
Show file tree
Hide file tree
Showing 10 changed files with 198 additions and 84 deletions.
19 changes: 16 additions & 3 deletions pkgs/fc/agent/fc/maintenance/activity/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,9 +137,22 @@ def prepare(self, dry_run=False):
else:
out_link = NEXT_SYSTEM

self.next_system = nixos.build_system(
self.next_channel_url, out_link=out_link, log=self.log
)
try:
self.next_system = nixos.build_system(
self.next_channel_url, out_link=out_link, log=self.log
)
except nixos.ChannelException as e:
self.log.error(
"update-prepare-build-failed",
current_version=self.current_version,
current_channel_url=self.current_channel_url,
current_environment=self.current_environment,
next_channel=self.next_channel_url,
next_version=self.next_version,
next_environment=self.next_environment,
)
raise

self.unit_changes = nixos.dry_activate_system(
self.next_system, self.log
)
Expand Down
3 changes: 1 addition & 2 deletions pkgs/fc/agent/fc/maintenance/activity/vm_change.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,7 @@ def merge(self, other) -> ActivityMergeResult:
if not isinstance(other, VMChangeActivity):
self.log.debug(
"merge-incompatible-skip",
self_type=type(self),
other_type=type(other),
other_type=type(other).__name__,
)
return ActivityMergeResult()

Expand Down
8 changes: 6 additions & 2 deletions pkgs/fc/agent/fc/maintenance/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,15 @@
ReqManager,
)
from fc.maintenance.request import Request
from fc.util import nixos
from fc.util.enc import load_enc
from fc.util.lock import locked
from fc.util.logging import (
drop_cmd_output_logfile,
init_command_logging,
init_logging,
)
from typer import Argument, Option, Typer
from typer import Argument, Exit, Option, Typer

app = Typer(pretty_exceptions_show_locals=False)
log = structlog.get_logger()
Expand Down Expand Up @@ -268,7 +269,10 @@ def update():
current_requests = rm.requests.values()

with locked(log, context.lock_dir):
request = request_update(log, enc, current_requests)
try:
request = request_update(log, enc, current_requests)
except nixos.ChannelException:
raise Exit(2)

with rm:
request = rm.add(request)
Expand Down
5 changes: 5 additions & 0 deletions pkgs/fc/agent/fc/maintenance/maintenance.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
def request_reboot_for_memory(log, enc) -> Optional[Request]:
"""Schedules reboot if the memory size has changed."""
wanted_memory = int(enc["parameters"].get("memory", 0))
log.debug("request-reboot-for-memory", enc_memory=wanted_memory)

if not wanted_memory:
return

Expand All @@ -29,6 +31,8 @@ def request_reboot_for_memory(log, enc) -> Optional[Request]:
def request_reboot_for_cpu(log, enc) -> Optional[Request]:
"""Schedules reboot if the number of cores has changed."""
wanted_cores = int(enc["parameters"].get("cores", 0))
log.debug("request-reboot-for-cpu", enc_cores=wanted_cores)

if not wanted_cores:
return

Expand All @@ -47,6 +51,7 @@ def request_reboot_for_qemu(log) -> Optional[Request]:
# of the virtual machine. However, if we got rebooted with a fresh
# Qemu instance, we need to update it from the marker on the tmp
# partition.
log.debug("request-reboot-for-qemu-start")
if not os.path.isdir("/var/lib/qemu"):
os.makedirs("/var/lib/qemu")
if os.path.exists("/tmp/fc-data/qemu-binary-generation-booted"):
Expand Down
1 change: 1 addition & 0 deletions pkgs/fc/agent/fc/maintenance/reqmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,7 @@ def postpone(self):
self.log.debug(
"postpone-maintenance-directory", args=postpone_maintenance
)
# This directory call just returns an empty string.
self.directory.postpone_maintenance(postpone_maintenance)
for req in postponed:
req.state = State.pending
Expand Down
5 changes: 4 additions & 1 deletion pkgs/fc/agent/fc/manage/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def check():
log = structlog.get_logger()
try:
enc = fc.util.enc.load_enc(log, context.enc_path)
result = fc.manage.manage.check(log, enc)
result = fc.manage.manage.check(log, enc, context.logdir)
except Exception:
print("UNKNOWN: Exception occurred while running checks")
traceback.print_exc()
Expand Down Expand Up @@ -142,11 +142,13 @@ def switch_cmd(
show_trace=context.show_trace or show_trace,
)
except nixos.ChannelException:
fc.util.logging.mark_cmd_output_logfile(log, "last_failed")
raise Exit(2)

if not keep_cmd_output:
fc.util.logging.drop_cmd_output_logfile(log)

fc.util.logging.remove_mark_cmd_output_logfile(log, "last_failed")
log.info("fc-manage-succeeded")


Expand Down Expand Up @@ -294,6 +296,7 @@ def fc_manage(
show_trace=show_trace,
)
except nixos.ChannelException:
fc.util.logging.mark_cmd_output_logfile(log, "last_failed")
raise Exit(2)

if not keep_cmd_output:
Expand Down
9 changes: 8 additions & 1 deletion pkgs/fc/agent/fc/manage/manage.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ class SwitchFailed(Exception):
pass


def check(log, enc) -> CheckResult:
def check(log, enc, logdir) -> CheckResult:
errors = []
warnings = []
if INITIAL_RUN_MARKER.exists():
Expand Down Expand Up @@ -121,6 +121,13 @@ def check(log, enc) -> CheckResult:
warnings.append(f"NixOS warnings found ({len(nixos_warnings)})")
warnings.extend(nixos_warnings)

failed_agent_cmd_output = Path(logdir) / "fc-agent" / "last_failed.log"
if failed_agent_cmd_output.exists():
build_log_content = failed_agent_cmd_output.read_text()
error = nixos.find_nix_build_error(build_log_content, log)
error = error if error.endswith(".") else error + "."
errors.append("NixOS build error: " + error)

return CheckResult(errors, warnings)


Expand Down
45 changes: 39 additions & 6 deletions pkgs/fc/agent/fc/util/logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import io
import json
import os
import shutil
import string
import sys
import syslog
Expand Down Expand Up @@ -558,10 +559,7 @@ def init_command_logging(log, logdir):
)


def drop_cmd_output_logfile(log):
"""Deletes the log file used by the cmd_output_file logger.
Used to throw away the command log file if nothing interesting has
happened."""
def get_cmd_log_file(log):
logger_factory = structlog.get_config()["logger_factory"]

if not isinstance(logger_factory, MultiOptimisticLoggerFactory):
Expand All @@ -580,11 +578,46 @@ def drop_cmd_output_logfile(log):
)
raise

cmd_log_file = cmd_output_file_factory._file
return cmd_output_file_factory._file


def mark_cmd_output_logfile(log, label):
cmd_log_file = get_cmd_log_file(log)

cmd_logdir = Path(cmd_log_file.name).parent
marker_link = cmd_logdir / f"{label}.log"
log.debug(
"logging-cmd-output-mark",
cmd_log_file=cmd_log_file.name,
marker=str(marker_link),
)
marker_link.unlink(missing_ok=True)
marker_link.symlink_to(cmd_log_file.name)


def remove_mark_cmd_output_logfile(log, label):
cmd_log_file = get_cmd_log_file(log)

cmd_logdir = Path(cmd_log_file.name).parent
marker_link = cmd_logdir / f"{label}.log"
log.debug(
"logging-cmd-output-remove-mark",
cmd_log_file=cmd_log_file.name,
marker=str(marker_link),
)
marker_link.unlink(missing_ok=True)


def drop_cmd_output_logfile(log):
"""Deletes the log file used by the cmd_output_file logger.
Used to throw away the command log file if nothing interesting has
happened."""

cmd_log_file = get_cmd_log_file(log)

log.debug(
"logging-cmd-output-drop",
_replace_msg="Remove command logfile because nothing changed.",
msg="Remove command logfile because nothing changed.",
)

cmd_log_file.close()
Expand Down
51 changes: 28 additions & 23 deletions pkgs/fc/agent/fc/util/nixos.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,36 +320,41 @@ def switch_to_channel(channel_url, lazy=False, log=_log):
return switch_to_system(built_system, lazy)


def find_nix_build_error(stderr, log=_log):
"""Returns a one-line error message from Nix build output or
a generic message if nothing is found.
def find_nix_build_error(stderr: str, log=_log):
"""Returns the (hopefully) interesting part of the error message from Nix
build output or a generic message if nothing is found.
"""

try:
lines = stderr.splitlines()[::-1]
# Detect builder failures and return which builder failed.
if lines and lines[0].startswith("error: build of "):
for line in lines:
if line.startswith("builder for "):
lines = stderr.splitlines()
error_lines = []

for pos, line in enumerate(lines):
if line.startswith("error:"):
error = line.removeprefix("error:").strip()

if error.startswith("builder for "):
if ";" in line:
return line.split(";")[0]
error_lines.append(error.split(";")[0])
else:
return line
error_lines.append(error)
elif lines[pos + 1].strip() == "Failed assertions:":
error_lines.append("Failed assertions:")
error_lines.extend(l.strip() for l in lines[pos + 2 : -1])
break
else:
error_lines.append(error)

# Check for evaluation errors.
# We assume that the line right after the traceback
# (and possibly build output before it) is the most interesting one.
# Start at the last line and find the start of the traceback.
for pos, line in enumerate(lines):
if line.strip().startswith("while evaluating") and pos > 0:
errormsg = lines[pos - 1].strip()
if errormsg.endswith(" Definition values:"):
return errormsg[: -len(" Definition values:")]
elif errormsg.endswith(" conflicting definition values:"):
breakpoint()
return "\n".join(reversed(lines[:pos]))
elif error_lines:
line = line.strip()
if line.startswith(("- In", "at ")):
error_lines.append(line)
else:
return errormsg
break

if error_lines:
return "\n".join(error_lines)

except Exception:
log.error("find-nix-build-error-failed", exc_info=True)

Expand Down
Loading

0 comments on commit dd95e15

Please sign in to comment.