Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Kind can be one of:
# - breaking-change: a change to previously-documented behavior
# - deprecation: functionality that is being removed in a later release
# - bug-fix: fixes a problem in a previous version
# - enhancement: extends functionality but does not break or fix existing behavior
# - feature: new functionality
# - known-issue: problems that we are aware of in a given version
# - security: impacts on the security of a product or a user’s deployment.
# - upgrade: important information for someone upgrading from a prior version
# - other: does not fit into any of the other categories
kind: bug-fix

# Change summary; a description of the change, roughly 80 characters long.
summary: Improve logging to catch early errors on startup

# Long description; in case the summary is not enough to describe the change,
# this field accommodates a description without length limits.
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
#description:

# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
component: elastic-agent

# PR URL; optional; the PR number that added the changeset.
# If not present, it is automatically filled in by the tooling, which finds the PR where this changelog fragment was added.
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
# Please provide it if you are adding a fragment for a different PR.
pr: https://github.com/elastic/elastic-agent/pull/10158

# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
# If not present, it is automatically filled in by the tooling with the issue linked to the PR number.
issue: https://github.com/elastic/elastic-agent/issues/9099
137 changes: 102 additions & 35 deletions internal/pkg/agent/cmd/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import (
"context"
goerrors "errors"
"fmt"
"net/url"
"os"
Expand Down Expand Up @@ -87,7 +88,6 @@
testingMode, _ := cmd.Flags().GetBool("testing-mode")
if err := run(nil, testingMode, fleetInitTimeout); err != nil && !errors.Is(err, context.Canceled) {
fmt.Fprintf(streams.Err, "Error: %v\n%s\n", err, troubleshootMessage())
logExternal(fmt.Sprintf("%s run failed: %s", paths.BinaryName, err))
return err
}
return nil
Expand Down Expand Up @@ -140,51 +140,85 @@
defer cancel()
go service.ProcessWindowsControlEvents(stopBeat)

upgradeDetailsFromMarker, err := handleUpgrade()
if err != nil {
return fmt.Errorf("error checking for and handling upgrade: %w", err)
}

locker := filelock.NewAppLocker(paths.Data(), paths.AgentLockFileName)
if err := locker.TryLock(); err != nil {
return err
}
defer func() {
_ = locker.Unlock()
}()

return runElasticAgent(ctx, cancel, override, stop, testingMode, fleetInitTimeout, upgradeDetailsFromMarker, modifiers...)
return runElasticAgentCritical(ctx, cancel, override, stop, testingMode, fleetInitTimeout, modifiers...)
}

// logReturn records err and hands it back unchanged so it can be used directly
// in a return statement.
//
// A nil error, or one that matches context.Canceled, is returned without any
// logging. Any other error is written to l at error level and a
// "<binary name> run failed: <err>" message is passed to logExternal.
func logReturn(l *logger.Logger, err error) error {
	// Nothing to report for success or for a cancelled context.
	if err == nil || errors.Is(err, context.Canceled) {
		return err
	}

	l.Errorf("%s", err)
	logExternal(fmt.Sprintf("%s run failed: %s", paths.BinaryName, err))
	return err
}

func runElasticAgent(
// runElasticAgentCritical provides a critical path to running runElasticAgent; it exhausts all efforts to log any
// errors, to ensure that any issues are captured in the logs.
func runElasticAgentCritical(
ctx context.Context,
cancel context.CancelFunc,
override application.CfgOverrider,
stop chan bool,
testingMode bool,
fleetInitTimeout time.Duration,
upgradeDetailsFromMarker *details.Details,
modifiers ...component.PlatformModifier,
) error {
cfg, err := loadConfig(ctx, override)
<<<<<<< HEAD

Check failure on line 165 in internal/pkg/agent/cmd/run.go

View workflow job for this annotation

GitHub Actions / lint (ubuntu-latest)

syntax error: unexpected <<, expected }
=======
var errs []error

// early handleUpgrade, but don't error yet
upgradeDetailsFromMarker, err := handleUpgrade()

Check failure on line 170 in internal/pkg/agent/cmd/run.go

View workflow job for this annotation

GitHub Actions / lint (ubuntu-latest)

syntax error: non-declaration statement outside function body
if err != nil {
return err
errs = append(errs, fmt.Errorf("failed to handle upgrade: %w", err))
}

logLvl := logger.DefaultLogLevel
if cfg.Settings.LoggingConfig != nil {
logLvl = cfg.Settings.LoggingConfig.Level
// single run, but don't error yet
locker := filelock.NewAppLocker(paths.Data(), paths.AgentLockFileName)
lockErr := locker.TryLock()
if lockErr != nil {
	// Wrap lockErr itself: err still holds the handleUpgrade result here (possibly nil),
	// so wrapping err would drop the actual lock failure from the collected errors.
	errs = append(errs, fmt.Errorf("failed to get app lock: %w", lockErr))
}
defer func() {

Check failure on line 181 in internal/pkg/agent/cmd/run.go

View workflow job for this annotation

GitHub Actions / lint (ubuntu-latest)

syntax error: unexpected {, expected name

Check failure on line 181 in internal/pkg/agent/cmd/run.go

View workflow job for this annotation

GitHub Actions / lint (ubuntu-latest)

method has no receiver
_ = locker.Unlock()
}()

Check failure on line 183 in internal/pkg/agent/cmd/run.go

View workflow job for this annotation

GitHub Actions / lint (ubuntu-latest)

syntax error: unexpected ( after top level declaration

// try restore (if app locker didn't fail), but don't error yet
if lockErr == nil {
err = coordinator.RestoreConfig()
if err != nil {
errs = append(errs, fmt.Errorf("failed to restore configuration: %w", err))
}
}

// try load config, but don't error yet
>>>>>>> 18beeba11 (Improve logging to catch early errors on startup (#10158))

Check failure on line 194 in internal/pkg/agent/cmd/run.go

View workflow job for this annotation

GitHub Actions / lint (ubuntu-latest)

invalid character U+0023 '#'
cfg, err := loadConfig(ctx, override)
if err != nil {
// failed to load configuration, just load the default to create the logger
errs = append(errs, fmt.Errorf("failed to load configuration: %w", err))
cfg = configuration.DefaultConfiguration()
}

baseLogger, err := logger.NewFromConfig("", cfg.Settings.LoggingConfig, cfg.Settings.EventLoggingConfig, true)
if err != nil {
return err
errs = append(errs, fmt.Errorf("failed to create logger: %w", err))

// failed to create the baseLogger, this comes from the configuration being possibly invalid
// switch to a default config and try again
cfg = configuration.DefaultConfiguration()
baseLogger, err = logger.NewFromConfig("", cfg.Settings.LoggingConfig, cfg.Settings.EventLoggingConfig, true)
if err != nil {
errs = append(errs, fmt.Errorf("failed to create logger with default configuration: %w", err))

// this really should not happen, but this whole critical function is very defensive
baseLogger, err = logger.New("", true)
if err != nil {
errs = append(errs, fmt.Errorf("failed to create logger with no configuration: %w", err))

// again? no way, but you never know
baseLogger = logger.NewWithoutConfig("")
}
}
}

// Make sure to flush any buffered logs before we're done.
Expand All @@ -194,10 +228,39 @@
"source": agentName,
})

// at this point the logger is working, so any errors that we hit can now be logged and returned
if len(errs) > 0 {
return logReturn(l, goerrors.Join(errs...))
}

// actually run the agent now
err = runElasticAgent(ctx, cancel, baseLogger, l, cfg, override, stop, testingMode, fleetInitTimeout, upgradeDetailsFromMarker, modifiers...)
return logReturn(l, err)
}

// runElasticAgent runs the actual Elastic Agent.
func runElasticAgent(
ctx context.Context,
cancel context.CancelFunc,
baseLogger *logger.Logger,
l *logger.Logger,
cfg *configuration.Configuration,
override application.CfgOverrider,
stop chan bool,
testingMode bool,
fleetInitTimeout time.Duration,
upgradeDetailsFromMarker *details.Details,
modifiers ...component.PlatformModifier,
) error {
logLvl := logger.DefaultLogLevel
if cfg.Settings.LoggingConfig != nil {
logLvl = cfg.Settings.LoggingConfig.Level
}

// try early to check if running as root
isRoot, err := utils.HasRoot()
if err != nil {
return logReturn(l, fmt.Errorf("failed to check for root/Administrator privileges: %w", err))
return fmt.Errorf("failed to check for root/Administrator privileges: %w", err)
}

l.Infow("Elastic Agent started",
Expand All @@ -207,7 +270,7 @@

cfg, err = tryDelayEnroll(ctx, l, cfg, override)
if err != nil {
return logReturn(l, errors.New(err, "failed to perform delayed enrollment"))
return errors.New(err, "failed to perform delayed enrollment")
}
pathConfigFile := paths.AgentConfigFile()

Expand All @@ -223,31 +286,35 @@
// that writes the agentID into fleet.enc (encrypted fleet.yml) before even loading the configuration.
err = secret.CreateAgentSecret(ctx, vault.WithUnprivileged(!isRoot))
if err != nil {
return logReturn(l, fmt.Errorf("failed to read/write secrets: %w", err))
return fmt.Errorf("failed to read/write secrets: %w", err)
}

// Migrate .yml files if the corresponding .enc does not exist

// the encrypted config does not exist but the unencrypted file does
err = migration.MigrateToEncryptedConfig(ctx, l, paths.AgentConfigYmlFile(), paths.AgentConfigFile())
if err != nil {
return logReturn(l, errors.New(err, "error migrating fleet config"))
return errors.New(err, "error migrating fleet config")
}

// the encrypted state does not exist but the unencrypted file does
err = migration.MigrateToEncryptedConfig(ctx, l,
paths.AgentStateStoreYmlFile(),
paths.AgentStateStoreFile())
if err != nil {
return logReturn(l, errors.New(err, "error migrating agent state"))
return errors.New(err, "error migrating agent state")
}

agentInfo, err := info.NewAgentInfoWithLog(ctx, defaultLogLevel(cfg, logLvl.String()), createAgentID)
if err != nil {
return logReturn(l, errors.New(err,
return errors.New(err,
"could not load agent info",
errors.TypeFilesystem,
<<<<<<< HEAD

Check failure on line 313 in internal/pkg/agent/cmd/run.go

View workflow job for this annotation

GitHub Actions / lint (ubuntu-latest)

syntax error: unexpected <<, expected expression
errors.M(errors.MetaKeyPath, pathConfigFile)))

Check failure on line 314 in internal/pkg/agent/cmd/run.go

View workflow job for this annotation

GitHub Actions / lint (ubuntu-latest)

syntax error: unexpected ) at end of statement
=======

Check failure on line 315 in internal/pkg/agent/cmd/run.go

View workflow job for this annotation

GitHub Actions / lint (ubuntu-latest)

syntax error: unexpected ==, expected }
errors.M(errors.MetaKeyPath, paths.AgentConfigFile()))
>>>>>>> 18beeba11 (Improve logging to catch early errors on startup (#10158))

Check failure on line 317 in internal/pkg/agent/cmd/run.go

View workflow job for this annotation

GitHub Actions / lint (ubuntu-latest)

syntax error: unexpected >>, expected }
}

// Ensure that the log level now matches what is configured in the agentInfo.
Expand All @@ -273,14 +340,14 @@

execPath, err := reexecPath()
if err != nil {
return logReturn(l, fmt.Errorf("failed to get reexec path: %w", err))
return fmt.Errorf("failed to get reexec path: %w", err)
}
rexLogger := l.Named("reexec")
rex := reexec.NewManager(rexLogger, execPath)

tracer, err := initTracer(agentName, release.Version(), cfg.Settings.MonitoringConfig)
if err != nil {
return logReturn(l, fmt.Errorf("could not initiate APM tracer: %w", err))
return fmt.Errorf("could not initiate APM tracer: %w", err)
}
if tracer != nil {
l.Info("APM instrumentation enabled")
Expand All @@ -296,12 +363,12 @@
coord, configMgr, _, err := application.New(ctx, l, baseLogger, logLvl, agentInfo, rex, tracer, testingMode,
fleetInitTimeout, isBootstrap, override, upgradeDetailsFromMarker, modifiers...)
if err != nil {
return logReturn(l, err)
return err
}

monitoringServer, err := setupMetrics(l, cfg.Settings.DownloadConfig.OS(), cfg.Settings.MonitoringConfig, tracer, coord)
if err != nil {
return logReturn(l, err)
return err
}
coord.RegisterMonitoringServer(monitoringServer)
defer func() {
Expand All @@ -325,7 +392,7 @@

// start the control listener
if err := control.Start(); err != nil {
return logReturn(l, err)
return err
}
defer control.Stop()

Expand Down Expand Up @@ -408,7 +475,7 @@
if isRex {
rex.ShutdownComplete()
}
return logReturn(l, err)
return err
}

func loadConfig(ctx context.Context, override application.CfgOverrider) (*configuration.Configuration, error) {
Expand Down
Loading