Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/41374-unused-software
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
* Fixed DB lock contention during vulnerability cron's software cleanup that caused failures under load.
57 changes: 41 additions & 16 deletions server/datastore/mysql/software.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@ var softwareInsertBatchSize = 1000
// outside the main software ingestion transaction. Smaller batches reduce lock contention.
var softwareInventoryInsertBatchSize = 100

// cleanupBatchSize controls how many orphaned software rows are deleted per batch during SyncHostsSoftware cleanup.
// Smaller batches hold locks for shorter durations, reducing contention with concurrent software ingestion.
var cleanupBatchSize = 1000

func softwareSliceToMap(softwareItems []fleet.Software) map[string]fleet.Software {
result := make(map[string]fleet.Software, len(softwareItems))
for _, s := range softwareItems {
Expand Down Expand Up @@ -2640,19 +2644,6 @@ func (ds *Datastore) SyncHostsSoftware(ctx context.Context, updatedAt time.Time)
updated_at = VALUES(updated_at)`

valuesPart = `(?, ?, ?, ?, ?),`

// We must ensure that software is not in host_software table before deleting it.
// This prevents a race condition where a host just added the software, but it is not part of software_host_counts yet.
// When a host adds software, software table and host_software table are updated in the same transaction.
cleanupSoftwareStmt = `
DELETE s
FROM software s
LEFT JOIN software_host_counts shc
ON s.id = shc.software_id
WHERE
shc.software_id IS NULL AND
NOT EXISTS (SELECT 1 FROM host_software hsw WHERE hsw.software_id = s.id)
`
)

// Create a fresh swap table to populate with new counts. If a previous run left a partial swap table, drop it first.
Expand Down Expand Up @@ -2763,13 +2754,47 @@ func (ds *Datastore) SyncHostsSoftware(ctx context.Context, updatedAt time.Time)
return err
}

// Remove any unused software (those not in host_software).
if _, err := ds.writer(ctx).ExecContext(ctx, cleanupSoftwareStmt); err != nil {
return ctxerr.Wrap(ctx, err, "delete unused software")
// Remove any unused software (those not in host_software) in batches to reduce lock contention.
if err := ds.cleanupUnusedSoftware(ctx); err != nil {
return err
}
return nil
}

// cleanupUnusedSoftware deletes orphaned software rows (not referenced by any host) in batches.
func (ds *Datastore) cleanupUnusedSoftware(ctx context.Context) error {
// findUnusedSoftwareStmt finds software rows not referenced by any host and absent from software_host_counts.
// The NOT EXISTS check on host_software reduces (but does not fully prevent) the chance of deleting software that
// is mid-ingestion (inserted into software but not yet linked in host_software). In the unlikely event this happens,
// the next hourly ingestion cycle will re-create and re-link the software entry.
const findUnusedSoftwareStmt = `
SELECT s.id
FROM software s
LEFT JOIN software_host_counts shc ON s.id = shc.software_id
WHERE shc.software_id IS NULL
AND NOT EXISTS (SELECT 1 FROM host_software hsw WHERE hsw.software_id = s.id)
LIMIT ?
`

for {
var ids []uint
if err := sqlx.SelectContext(ctx, ds.writer(ctx), &ids, findUnusedSoftwareStmt, cleanupBatchSize); err != nil {
return ctxerr.Wrap(ctx, err, "find unused software for cleanup")
}
if len(ids) == 0 {
return nil
}

stmt, args, err := sqlx.In(`DELETE FROM software WHERE id IN (?)`, ids)
if err != nil {
return ctxerr.Wrap(ctx, err, "build delete unused software query")
}
if _, err := ds.writer(ctx).ExecContext(ctx, stmt, args...); err != nil {
return ctxerr.Wrap(ctx, err, "delete unused software batch")
}
Comment on lines +2788 to +2794
Copy link

Copilot AI Mar 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cleanupUnusedSoftware is now a two-phase SELECT-then-DELETE. That introduces a race window where a concurrent ingestion transaction can insert a host_software row referencing one of the selected software IDs after the SELECT but before the DELETE executes; the DELETE would then remove that software row and cascade-delete the newly inserted host_software row, losing data. To keep the safety property the previous single-statement DELETE had, re-check the orphan conditions at DELETE time (e.g., include the NOT EXISTS/LEFT JOIN predicates in the DELETE statement, or use a batched DELETE with LIMIT in a single SQL statement/derived table) so rows that became referenced are not deleted.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not a huge concern. The chance of a host suddenly reporting that exact orphaned software in the millisecond window between SELECT and DELETE is negligible. If it did happen, the next hourly ingestion would simply re-add it. It's self-healing.

Comment thread
coderabbitai[bot] marked this conversation as resolved.
}
}

func (ds *Datastore) CleanupSoftwareTitles(ctx context.Context) error {
var n int64
defer func(start time.Time) {
Expand Down
Loading