Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Kind can be one of:
# - breaking-change: a change to previously-documented behavior
# - deprecation: functionality that is being removed in a later release
# - bug-fix: fixes a problem in a previous version
# - enhancement: extends functionality but does not break or fix existing behavior
# - feature: new functionality
# - known-issue: problems that we are aware of in a given version
# - security: impacts on the security of a product or a user’s deployment.
# - upgrade: important information for someone upgrading from a prior version
# - other: does not fit into any of the other categories
kind: bug-fix

# Change summary; an 80ish-character description of the change.
summary: Improve logging to catch early errors on startup

# Long description; in case the summary is not enough to describe the change,
# this field accommodates a description without length limits.
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
#description:

# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
component: elastic-agent

# PR URL; optional; the PR number that added the changeset.
# If not present, it is automatically filled in by the tooling, which finds the PR where this changelog fragment was added.
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
# Please provide it if you are adding a fragment for a different PR.
pr: https://github.com/elastic/elastic-agent/pull/10158

# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
# If not present, it is automatically filled in by the tooling with the issue linked to the PR number.
issue: https://github.com/elastic/elastic-agent/issues/9099
131 changes: 93 additions & 38 deletions internal/pkg/agent/cmd/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import (
"context"
goerrors "errors"
"fmt"
"net/url"
"os"
Expand Down Expand Up @@ -88,7 +89,6 @@
testingMode, _ := cmd.Flags().GetBool("testing-mode")
if err := run(nil, testingMode, fleetInitTimeout); err != nil && !errors.Is(err, context.Canceled) {
fmt.Fprintf(streams.Err, "Error: %v\n%s\n", err, troubleshootMessage())
logExternal(fmt.Sprintf("%s run failed: %s", paths.BinaryName, err))
return err
}
return nil
Expand Down Expand Up @@ -141,56 +141,82 @@
defer cancel()
go service.ProcessWindowsControlEvents(stopBeat)

upgradeDetailsFromMarker, err := handleUpgrade()
if err != nil {
return fmt.Errorf("error checking for and handling upgrade: %w", err)
}

locker := filelock.NewAppLocker(paths.Data(), paths.AgentLockFileName)
if err := locker.TryLock(); err != nil {
return err
}
defer func() {
_ = locker.Unlock()
}()

return runElasticAgent(ctx, cancel, override, stop, testingMode, fleetInitTimeout, upgradeDetailsFromMarker, modifiers...)
return runElasticAgentCritical(ctx, cancel, override, stop, testingMode, fleetInitTimeout, modifiers...)
}

// logReturn reports err and then hands it back unchanged so it can be used
// directly in a return statement.
//
// A nil error, or one that matches context.Canceled, is passed through without
// being reported. Any other error is written to l at error level and is also
// forwarded to logExternal, prefixed with the binary name.
func logReturn(l *logger.Logger, err error) error {
	// Nothing to report: either there was no failure or the run was cancelled.
	if err == nil || errors.Is(err, context.Canceled) {
		return err
	}

	l.Errorf("%s", err)
	logExternal(fmt.Sprintf("%s run failed: %s", paths.BinaryName, err))
	return err
}

func runElasticAgent(
// runElasticAgentCritical provides a critical path to running runElasticAgent. It exhausts all efforts to log any
// errors, ensuring that any issues are captured in the logs.
func runElasticAgentCritical(
ctx context.Context,
cancel context.CancelFunc,
override application.CfgOverrider,
stop chan bool,
testingMode bool,
fleetInitTimeout time.Duration,
upgradeDetailsFromMarker *details.Details,
modifiers ...component.PlatformModifier,
) error {
err := coordinator.RestoreConfig()
var errs []error

// early handleUpgrade, but don't error yet
upgradeDetailsFromMarker, err := handleUpgrade()
if err != nil {
return err
errs = append(errs, fmt.Errorf("failed to handle upgrade: %w", err))
}

// single run, but don't error yet; the failure is collected in errs and
// reported once the logger is available
locker := filelock.NewAppLocker(paths.Data(), paths.AgentLockFileName)
lockErr := locker.TryLock()
if lockErr != nil {
	// Wrap lockErr, not err: at this point err still holds the result of
	// handleUpgrade (possibly nil), which would hide the real lock failure.
	errs = append(errs, fmt.Errorf("failed to get app lock: %w", lockErr))
}
defer func() {
	_ = locker.Unlock()
}()

// try restore (if app locker didn't fail), but don't error yet
if lockErr == nil {
err = coordinator.RestoreConfig()
if err != nil {
errs = append(errs, fmt.Errorf("failed to restore configuration: %w", err))
}
}

// try load config, but don't error yet
cfg, err := loadConfig(ctx, override)
if err != nil {
return err
// failed to load configuration, just load the default to create the logger
errs = append(errs, fmt.Errorf("failed to load configuration: %w", err))
cfg = configuration.DefaultConfiguration()
}

logLvl := logger.DefaultLogLevel
if cfg.Settings.LoggingConfig != nil {
logLvl = cfg.Settings.LoggingConfig.Level
}
baseLogger, err := logger.NewFromConfig("", cfg.Settings.LoggingConfig, cfg.Settings.EventLoggingConfig, true)
if err != nil {
return err
errs = append(errs, fmt.Errorf("failed to create logger: %w", err))

// failed to create the baseLogger, this comes from the configuration being possibly invalid
// switch to a default config and try again
cfg = configuration.DefaultConfiguration()
baseLogger, err = logger.NewFromConfig("", cfg.Settings.LoggingConfig, cfg.Settings.EventLoggingConfig, true)
if err != nil {
errs = append(errs, fmt.Errorf("failed to create logger with default configuration: %w", err))

// this really should not happen, but this whole critical function is very defensive
baseLogger, err = logger.New("", true)
if err != nil {
errs = append(errs, fmt.Errorf("failed to create logger with no configuration: %w", err))

// again? no way, but you never know
baseLogger = logger.NewWithoutConfig("")
}
}
}

// Make sure to flush any buffered logs before we're done.
Expand All @@ -200,10 +226,39 @@
"source": agentName,
})

// at this point the logger is working, so any errors that we hit can now be logged and returned
if len(errs) > 0 {
return logReturn(l, goerrors.Join(errs...))
}

// actually run the agent now
err = runElasticAgent(ctx, cancel, baseLogger, l, cfg, override, stop, testingMode, fleetInitTimeout, upgradeDetailsFromMarker, modifiers...)
return logReturn(l, err)
}

// runElasticAgent runs the actual Elastic Agent.
func runElasticAgent(
ctx context.Context,
cancel context.CancelFunc,
baseLogger *logger.Logger,
l *logger.Logger,
cfg *configuration.Configuration,
override application.CfgOverrider,
stop chan bool,
testingMode bool,
fleetInitTimeout time.Duration,
upgradeDetailsFromMarker *details.Details,
modifiers ...component.PlatformModifier,
) error {
logLvl := logger.DefaultLogLevel
if cfg.Settings.LoggingConfig != nil {
logLvl = cfg.Settings.LoggingConfig.Level
}

// try early to check if running as root
isRoot, err := utils.HasRoot()
if err != nil {
return logReturn(l, fmt.Errorf("failed to check for root/Administrator privileges: %w", err))
return fmt.Errorf("failed to check for root/Administrator privileges: %w", err)
}

l.Infow("Elastic Agent started",
Expand All @@ -213,7 +268,7 @@

cfg, err = tryDelayEnroll(ctx, l, cfg, override)
if err != nil {
return logReturn(l, errors.New(err, "failed to perform delayed enrollment"))
return errors.New(err, "failed to perform delayed enrollment")
}

// agent ID needs to stay empty in bootstrap mode
Expand All @@ -225,31 +280,31 @@
// that writes the agentID into fleet.enc (encrypted fleet.yml) before even loading the configuration.
err = secret.CreateAgentSecret(ctx, vault.WithUnprivileged(!isRoot))
if err != nil {
return logReturn(l, fmt.Errorf("failed to read/write secrets: %w", err))
return fmt.Errorf("failed to read/write secrets: %w", err)
}

// Migrate .yml files if the corresponding .enc does not exist

// the encrypted config does not exist but the unencrypted file does
err = migration.MigrateToEncryptedConfig(ctx, l, paths.AgentConfigYmlFile(), paths.AgentConfigFile())
if err != nil {
return logReturn(l, errors.New(err, "error migrating fleet config"))
return errors.New(err, "error migrating fleet config")
}

// the encrypted state does not exist but the unencrypted file does
err = migration.MigrateToEncryptedConfig(ctx, l,
paths.AgentStateStoreYmlFile(),
paths.AgentStateStoreFile())
if err != nil {
return logReturn(l, errors.New(err, "error migrating agent state"))
return errors.New(err, "error migrating agent state")
}

agentInfo, err := info.NewAgentInfoWithLog(ctx, defaultLogLevel(cfg, logLvl.String()), createAgentID)
if err != nil {
return logReturn(l, errors.New(err,
return errors.New(err,
"could not load agent info",
errors.TypeFilesystem,
errors.M(errors.MetaKeyPath, paths.AgentConfigFile())))
errors.M(errors.MetaKeyPath, paths.AgentConfigFile()))
}

// Ensure that the log level now matches what is configured in the agentInfo.
Expand All @@ -275,14 +330,14 @@

execPath, err := reexecPath()
if err != nil {
return logReturn(l, fmt.Errorf("failed to get reexec path: %w", err))
return fmt.Errorf("failed to get reexec path: %w", err)
}
rexLogger := l.Named("reexec")
rex := reexec.NewManager(rexLogger, execPath)

tracer, err := initTracer(agentName, release.Version(), cfg.Settings.MonitoringConfig)
if err != nil {
return logReturn(l, fmt.Errorf("could not initiate APM tracer: %w", err))
return fmt.Errorf("could not initiate APM tracer: %w", err)
}
if tracer != nil {
l.Info("APM instrumentation enabled")
Expand All @@ -298,12 +353,12 @@
coord, configMgr, _, err := application.New(ctx, l, baseLogger, logLvl, agentInfo, rex, tracer, testingMode,
fleetInitTimeout, isBootstrap, override, upgradeDetailsFromMarker, modifiers...)
if err != nil {
return logReturn(l, err)
return err
}

monitoringServer, err := setupMetrics(l, cfg.Settings.DownloadConfig.OS(), cfg.Settings.MonitoringConfig, tracer, coord)
if err != nil {
return logReturn(l, err)
return err
}
coord.RegisterMonitoringServer(monitoringServer)
defer func() {
Expand All @@ -327,7 +382,7 @@

// start the control listener
if err := control.Start(); err != nil {
return logReturn(l, err)
return err
}
defer control.Stop()

Expand Down Expand Up @@ -410,7 +465,7 @@
if isRex {
rex.ShutdownComplete()
}
return logReturn(l, err)
return err
}

func loadConfig(ctx context.Context, override application.CfgOverrider) (*configuration.Configuration, error) {
Expand Down Expand Up @@ -665,7 +720,7 @@
tracer *apm.Tracer,
coord *coordinator.Coordinator,
) (*reload.ServerReloader, error) {
if err := report.SetupMetrics(logger, agentName, version.GetDefaultVersion()); err != nil {

Check failure on line 723 in internal/pkg/agent/cmd/run.go

View workflow job for this annotation

GitHub Actions / lint (ubuntu-latest)

SA1019: report.SetupMetrics is deprecated: use SetupMetricsOptions (staticcheck)
return nil, err
}

Expand Down
Loading