From 44e1c133835bc0064d5f751ebd0440bc0fee253f Mon Sep 17 00:00:00 2001 From: Victor Lyuboslavsky <2685025+getvictor@users.noreply.github.com> Date: Tue, 10 Mar 2026 13:16:14 -0500 Subject: [PATCH 1/2] Fixed DB lock contention during vulnerability cron's software cleanup that caused failures under load --- changes/41374-unused-software | 1 + server/datastore/mysql/software.go | 57 +++++++++++++++++++++--------- 2 files changed, 42 insertions(+), 16 deletions(-) create mode 100644 changes/41374-unused-software diff --git a/changes/41374-unused-software b/changes/41374-unused-software new file mode 100644 index 00000000000..ff8ecdff396 --- /dev/null +++ b/changes/41374-unused-software @@ -0,0 +1 @@ +* Fixed DB lock contention during vulnerability cron's software cleanup that caused failures under load. diff --git a/server/datastore/mysql/software.go b/server/datastore/mysql/software.go index 2c27e02b1f4..dd992a95dc6 100644 --- a/server/datastore/mysql/software.go +++ b/server/datastore/mysql/software.go @@ -63,6 +63,10 @@ var softwareInsertBatchSize = 1000 // outside the main software ingestion transaction. Smaller batches reduce lock contention. var softwareInventoryInsertBatchSize = 100 +// cleanupBatchSize controls how many orphaned software rows are deleted per batch during SyncHostsSoftware cleanup. +// Smaller batches hold locks for shorter durations, reducing contention with concurrent software ingestion. +var cleanupBatchSize = 1000 + func softwareSliceToMap(softwareItems []fleet.Software) map[string]fleet.Software { result := make(map[string]fleet.Software, len(softwareItems)) for _, s := range softwareItems { @@ -2640,19 +2644,6 @@ func (ds *Datastore) SyncHostsSoftware(ctx context.Context, updatedAt time.Time) updated_at = VALUES(updated_at)` valuesPart = `(?, ?, ?, ?, ?),` - - // We must ensure that software is not in host_software table before deleting it. - // This prevents a race condition where a host just added the software, but it is not part of software_host_counts yet. - // When a host adds software, software table and host_software table are updated in the same transaction. - cleanupSoftwareStmt = ` - DELETE s - FROM software s - LEFT JOIN software_host_counts shc - ON s.id = shc.software_id - WHERE - shc.software_id IS NULL AND - NOT EXISTS (SELECT 1 FROM host_software hsw WHERE hsw.software_id = s.id) - ` ) // Create a fresh swap table to populate with new counts. If a previous run left a partial swap table, drop it first. @@ -2763,13 +2754,47 @@ func (ds *Datastore) SyncHostsSoftware(ctx context.Context, updatedAt time.Time) return err } - // Remove any unused software (those not in host_software). - if _, err := ds.writer(ctx).ExecContext(ctx, cleanupSoftwareStmt); err != nil { - return ctxerr.Wrap(ctx, err, "delete unused software") + // Remove any unused software (those not in host_software) in batches to reduce lock contention. + if err := ds.cleanupUnusedSoftware(ctx); err != nil { + return err } return nil } +// cleanupUnusedSoftware deletes orphaned software rows (not referenced by any host) in batches. +func (ds *Datastore) cleanupUnusedSoftware(ctx context.Context) error { + // findUnusedSoftwareStmt finds software rows that are not referenced by any host and have no entry in software_host_counts. + // We must ensure that software is not in the host_software table before deleting it. + // This prevents a race condition where a host just added the software, but it is not part of software_host_counts yet. + // When a host adds software, the software table and host_software table are updated in the same transaction. + const findUnusedSoftwareStmt = ` + SELECT s.id + FROM software s + LEFT JOIN software_host_counts shc ON s.id = shc.software_id + WHERE shc.software_id IS NULL + AND NOT EXISTS (SELECT 1 FROM host_software hsw WHERE hsw.software_id = s.id) + LIMIT ? + ` + + for { + var ids []uint + if err := sqlx.SelectContext(ctx, ds.writer(ctx), &ids, findUnusedSoftwareStmt, cleanupBatchSize); err != nil { + return ctxerr.Wrap(ctx, err, "find unused software for cleanup") + } + if len(ids) == 0 { + return nil + } + + stmt, args, err := sqlx.In(`DELETE FROM software WHERE id IN (?)`, ids) + if err != nil { + return ctxerr.Wrap(ctx, err, "build delete unused software query") + } + if _, err := ds.writer(ctx).ExecContext(ctx, stmt, args...); err != nil { + return ctxerr.Wrap(ctx, err, "delete unused software batch") + } + } +} + func (ds *Datastore) CleanupSoftwareTitles(ctx context.Context) error { var n int64 defer func(start time.Time) { From b44a64b41e41fb66635e471ff0d76262dc7eefdc Mon Sep 17 00:00:00 2001 From: Victor Lyuboslavsky <2685025+getvictor@users.noreply.github.com> Date: Tue, 10 Mar 2026 13:38:54 -0500 Subject: [PATCH 2/2] Updated comment to be more accurate. --- server/datastore/mysql/software.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/server/datastore/mysql/software.go b/server/datastore/mysql/software.go index dd992a95dc6..f73a69048d1 100644 --- a/server/datastore/mysql/software.go +++ b/server/datastore/mysql/software.go @@ -2763,10 +2763,10 @@ func (ds *Datastore) SyncHostsSoftware(ctx context.Context, updatedAt time.Time) // cleanupUnusedSoftware deletes orphaned software rows (not referenced by any host) in batches. func (ds *Datastore) cleanupUnusedSoftware(ctx context.Context) error { - // findUnusedSoftwareStmt finds software rows that are not referenced by any host and have no entry in software_host_counts. - // We must ensure that software is not in the host_software table before deleting it. - // This prevents a race condition where a host just added the software, but it is not part of software_host_counts yet. - // When a host adds software, the software table and host_software table are updated in the same transaction. + // findUnusedSoftwareStmt finds software rows not referenced by any host and absent from software_host_counts. + // The NOT EXISTS check on host_software reduces (but does not fully prevent) the chance of deleting software that + // is mid-ingestion (inserted into software but not yet linked in host_software). In the unlikely event this happens, + // the next hourly ingestion cycle will re-create and re-link the software entry. const findUnusedSoftwareStmt = ` SELECT s.id FROM software s