From 38026220e1a45a871d49ab1e0aa9d778410a8afb Mon Sep 17 00:00:00 2001 From: Blake Rouse Date: Wed, 1 Oct 2025 14:05:21 -0400 Subject: [PATCH 1/2] Improve logging to catch early errors on startup (#10158) * Update run command to have a critical section to ensure errors are always logged. * Add changelog. * Cleanup from code review. (cherry picked from commit 18beeba11ea6812bcb799b8f0ae76e55676f67e9) # Conflicts: # internal/pkg/agent/cmd/run.go --- ...ging-to-catch-early-errors-on-startup.yaml | 32 ++++ internal/pkg/agent/cmd/run.go | 137 +++++++++++++----- 2 files changed, 134 insertions(+), 35 deletions(-) create mode 100644 changelog/fragments/1758827841-Improve-logging-to-catch-early-errors-on-startup.yaml diff --git a/changelog/fragments/1758827841-Improve-logging-to-catch-early-errors-on-startup.yaml b/changelog/fragments/1758827841-Improve-logging-to-catch-early-errors-on-startup.yaml new file mode 100644 index 00000000000..b3889decf36 --- /dev/null +++ b/changelog/fragments/1758827841-Improve-logging-to-catch-early-errors-on-startup.yaml @@ -0,0 +1,32 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: bug-fix + +# Change summary; a 80ish characters long description of the change. +summary: Improve logging to catch early errors on startup + +# Long description; in case the summary is not enough to describe the change +# this field accommodate a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +#description: + +# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. +component: elastic-agent + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +pr: https://github.com/elastic/elastic-agent/pull/10158 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +issue: https://github.com/elastic/elastic-agent/issues/9099 diff --git a/internal/pkg/agent/cmd/run.go b/internal/pkg/agent/cmd/run.go index 1c02f713235..e87247d2c7b 100644 --- a/internal/pkg/agent/cmd/run.go +++ b/internal/pkg/agent/cmd/run.go @@ -6,6 +6,7 @@ package cmd import ( "context" + goerrors "errors" "fmt" "net/url" "os" @@ -87,7 +88,6 @@ func newRunCommandWithArgs(_ []string, streams *cli.IOStreams) *cobra.Command { testingMode, _ := cmd.Flags().GetBool("testing-mode") if err := run(nil, testingMode, fleetInitTimeout); err != nil && !errors.Is(err, context.Canceled) { fmt.Fprintf(streams.Err, "Error: %v\n%s\n", err, troubleshootMessage()) - logExternal(fmt.Sprintf("%s run failed: %s", paths.BinaryName, err)) return err } return nil @@ -140,51 +140,85 @@ func run(override application.CfgOverrider, testingMode bool, fleetInitTimeout t defer cancel() go service.ProcessWindowsControlEvents(stopBeat) - upgradeDetailsFromMarker, err := handleUpgrade() - if err != nil { - return fmt.Errorf("error checking for and handling upgrade: %w", err) - } - - locker := filelock.NewAppLocker(paths.Data(), paths.AgentLockFileName) - if err := locker.TryLock(); err != nil { - return err - } - defer func() { - _ = locker.Unlock() - }() - - return runElasticAgent(ctx, cancel, override, stop, testingMode, fleetInitTimeout, upgradeDetailsFromMarker, modifiers...) + return runElasticAgentCritical(ctx, cancel, override, stop, testingMode, fleetInitTimeout, modifiers...) } func logReturn(l *logger.Logger, err error) error { if err != nil && !errors.Is(err, context.Canceled) { l.Errorf("%s", err) + logExternal(fmt.Sprintf("%s run failed: %s", paths.BinaryName, err)) } return err } -func runElasticAgent( +// runElasticAgentCritical provides a critical path to running runElasticAgent, it exhausts all efforts to log any +// errors to ensure that any issues are captured in the logs. +func runElasticAgentCritical( ctx context.Context, cancel context.CancelFunc, override application.CfgOverrider, stop chan bool, testingMode bool, fleetInitTimeout time.Duration, - upgradeDetailsFromMarker *details.Details, modifiers ...component.PlatformModifier, ) error { - cfg, err := loadConfig(ctx, override) +<<<<<<< HEAD +======= + var errs []error + + // early handleUpgrade, but don't error yet + upgradeDetailsFromMarker, err := handleUpgrade() if err != nil { - return err + errs = append(errs, fmt.Errorf("failed to handle upgrade: %w", err)) } - logLvl := logger.DefaultLogLevel - if cfg.Settings.LoggingConfig != nil { - logLvl = cfg.Settings.LoggingConfig.Level + // single run, but don't error yet + locker := filelock.NewAppLocker(paths.Data(), paths.AgentLockFileName) + lockErr := locker.TryLock() + if lockErr != nil { + errs = append(errs, fmt.Errorf("failed to get app lock: %w", err)) } + defer func() { + _ = locker.Unlock() + }() + + // try restore (if app locker didn't fail), but don't error yet + if lockErr == nil { + err = coordinator.RestoreConfig() + if err != nil { + errs = append(errs, fmt.Errorf("failed to restore configuration: %w", err)) + } + } + + // try load config, but don't error yet +>>>>>>> 18beeba11 (Improve logging to catch early errors on startup (#10158)) + cfg, err := loadConfig(ctx, override) + if err != nil { + // failed to load configuration, just load the default to create the logger + errs = append(errs, fmt.Errorf("failed to load configuration: %w", err)) + cfg = configuration.DefaultConfiguration() + } + baseLogger, err := logger.NewFromConfig("", cfg.Settings.LoggingConfig, cfg.Settings.EventLoggingConfig, true) if err != nil { - return err + errs = append(errs, fmt.Errorf("failed to create logger: %w", err)) + + // failed to create the baseLogger, this comes from the configuration being possibly invalid + // switch to a default config and try again + cfg = configuration.DefaultConfiguration() + baseLogger, err = logger.NewFromConfig("", cfg.Settings.LoggingConfig, cfg.Settings.EventLoggingConfig, true) + if err != nil { + errs = append(errs, fmt.Errorf("failed to create logger with default configuration: %w", err)) + + // this really should not happen, but this whole critical function is very defensive + baseLogger, err = logger.New("", true) + if err != nil { + errs = append(errs, fmt.Errorf("failed to create logger with no configuration: %w", err)) + + // again? no way, but you never know + baseLogger = logger.NewWithoutConfig("") + } + } } // Make sure to flush any buffered logs before we're done. @@ -194,10 +228,39 @@ func runElasticAgent( "source": agentName, }) + // at this point the logger is working, so any errors that we hit can now be logged and returned + if len(errs) > 0 { + return logReturn(l, goerrors.Join(errs...)) + } + + // actually run the agent now + err = runElasticAgent(ctx, cancel, baseLogger, l, cfg, override, stop, testingMode, fleetInitTimeout, upgradeDetailsFromMarker, modifiers...) + return logReturn(l, err) +} + +// runElasticAgent runs the actual Elastic Agent. +func runElasticAgent( + ctx context.Context, + cancel context.CancelFunc, + baseLogger *logger.Logger, + l *logger.Logger, + cfg *configuration.Configuration, + override application.CfgOverrider, + stop chan bool, + testingMode bool, + fleetInitTimeout time.Duration, + upgradeDetailsFromMarker *details.Details, + modifiers ...component.PlatformModifier, +) error { + logLvl := logger.DefaultLogLevel + if cfg.Settings.LoggingConfig != nil { + logLvl = cfg.Settings.LoggingConfig.Level + } + // try early to check if running as root isRoot, err := utils.HasRoot() if err != nil { - return logReturn(l, fmt.Errorf("failed to check for root/Administrator privileges: %w", err)) + return fmt.Errorf("failed to check for root/Administrator privileges: %w", err) } l.Infow("Elastic Agent started", @@ -207,7 +270,7 @@ func runElasticAgent( cfg, err = tryDelayEnroll(ctx, l, cfg, override) if err != nil { - return logReturn(l, errors.New(err, "failed to perform delayed enrollment")) + return errors.New(err, "failed to perform delayed enrollment") } pathConfigFile := paths.AgentConfigFile() @@ -223,7 +286,7 @@ func runElasticAgent( // that writes the agentID into fleet.enc (encrypted fleet.yml) before even loading the configuration. err = secret.CreateAgentSecret(ctx, vault.WithUnprivileged(!isRoot)) if err != nil { - return logReturn(l, fmt.Errorf("failed to read/write secrets: %w", err)) + return fmt.Errorf("failed to read/write secrets: %w", err) } // Migrate .yml files if the corresponding .enc does not exist @@ -231,7 +294,7 @@ func runElasticAgent( // the encrypted config does not exist but the unencrypted file does err = migration.MigrateToEncryptedConfig(ctx, l, paths.AgentConfigYmlFile(), paths.AgentConfigFile()) if err != nil { - return logReturn(l, errors.New(err, "error migrating fleet config")) + return errors.New(err, "error migrating fleet config") } // the encrypted state does not exist but the unencrypted file does @@ -239,15 +302,19 @@ func runElasticAgent( paths.AgentStateStoreYmlFile(), paths.AgentStateStoreFile()) if err != nil { - return logReturn(l, errors.New(err, "error migrating agent state")) + return errors.New(err, "error migrating agent state") } agentInfo, err := info.NewAgentInfoWithLog(ctx, defaultLogLevel(cfg, logLvl.String()), createAgentID) if err != nil { - return logReturn(l, errors.New(err, + return errors.New(err, "could not load agent info", errors.TypeFilesystem, +<<<<<<< HEAD errors.M(errors.MetaKeyPath, pathConfigFile))) +======= + errors.M(errors.MetaKeyPath, paths.AgentConfigFile())) +>>>>>>> 18beeba11 (Improve logging to catch early errors on startup (#10158)) } // Ensure that the log level now matches what is configured in the agentInfo. @@ -273,14 +340,14 @@ func runElasticAgent( execPath, err := reexecPath() if err != nil { - return logReturn(l, fmt.Errorf("failed to get reexec path: %w", err)) + return fmt.Errorf("failed to get reexec path: %w", err) } rexLogger := l.Named("reexec") rex := reexec.NewManager(rexLogger, execPath) tracer, err := initTracer(agentName, release.Version(), cfg.Settings.MonitoringConfig) if err != nil { - return logReturn(l, fmt.Errorf("could not initiate APM tracer: %w", err)) + return fmt.Errorf("could not initiate APM tracer: %w", err) } if tracer != nil { l.Info("APM instrumentation enabled") @@ -296,12 +363,12 @@ func runElasticAgent( coord, configMgr, _, err := application.New(ctx, l, baseLogger, logLvl, agentInfo, rex, tracer, testingMode, fleetInitTimeout, isBootstrap, override, upgradeDetailsFromMarker, modifiers...) if err != nil { - return logReturn(l, err) + return err } monitoringServer, err := setupMetrics(l, cfg.Settings.DownloadConfig.OS(), cfg.Settings.MonitoringConfig, tracer, coord) if err != nil { - return logReturn(l, err) + return err } coord.RegisterMonitoringServer(monitoringServer) defer func() { @@ -325,7 +392,7 @@ func runElasticAgent( // start the control listener if err := control.Start(); err != nil { - return logReturn(l, err) + return err } defer control.Stop() @@ -408,7 +475,7 @@ LOOP: if isRex { rex.ShutdownComplete() } - return logReturn(l, err) + return err } func loadConfig(ctx context.Context, override application.CfgOverrider) (*configuration.Configuration, error) { From 77542f1c1b924172a67598f1b3668a73065ba936 Mon Sep 17 00:00:00 2001 From: Blake Rouse Date: Mon, 6 Oct 2025 11:30:04 -0400 Subject: [PATCH 2/2] Fix backport. --- internal/pkg/agent/cmd/run.go | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/internal/pkg/agent/cmd/run.go b/internal/pkg/agent/cmd/run.go index e87247d2c7b..ea4186c6465 100644 --- a/internal/pkg/agent/cmd/run.go +++ b/internal/pkg/agent/cmd/run.go @@ -162,8 +162,6 @@ func runElasticAgentCritical( fleetInitTimeout time.Duration, modifiers ...component.PlatformModifier, ) error { -<<<<<<< HEAD -======= var errs []error // early handleUpgrade, but don't error yet @@ -182,16 +180,7 @@ func runElasticAgentCritical( _ = locker.Unlock() }() - // try restore (if app locker didn't fail), but don't error yet - if lockErr == nil { - err = coordinator.RestoreConfig() - if err != nil { - errs = append(errs, fmt.Errorf("failed to restore configuration: %w", err)) - } - } - // try load config, but don't error yet ->>>>>>> 18beeba11 (Improve logging to catch early errors on startup (#10158)) cfg, err := loadConfig(ctx, override) if err != nil { // failed to load configuration, just load the default to create the logger @@ -310,11 +299,7 @@ func runElasticAgent( return errors.New(err, "could not load agent info", errors.TypeFilesystem, -<<<<<<< HEAD - errors.M(errors.MetaKeyPath, pathConfigFile))) -======= - errors.M(errors.MetaKeyPath, paths.AgentConfigFile())) ->>>>>>> 18beeba11 (Improve logging to catch early errors on startup (#10158)) + errors.M(errors.MetaKeyPath, pathConfigFile)) } // Ensure that the log level now matches what is configured in the agentInfo.