Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/41664-vulnerability-cron-db-contention
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
* Reduced database contention during the vulnerability cron.
* Added a secondary index on `host_software(software_id)` to improve query performance.
Comment thread
ksykulev marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package tables

import (
"database/sql"
"fmt"
)

func init() {
MigrationClient.AddMigration(Up_20260316120000, Down_20260316120000)
}

func Up_20260316120000(tx *sql.Tx) error {
// The FK is unnecessary because kernel_host_counts is fully rebuilt on each vulnerability cron run via a swap table,
// and CREATE TABLE ... LIKE does not copy foreign keys. Keeping the FK would require restoring it after every swap,
// which can fail if a referenced software_title is deleted between the SELECT and the ALTER TABLE.
// Orphaned rows are harmless because API queries (ListKernelsByOS) JOIN back to software_titles, excluding any
// rows that reference deleted titles.
if _, err := tx.Exec(`ALTER TABLE kernel_host_counts DROP FOREIGN KEY kernel_host_counts_ibfk_1`); err != nil {
Comment thread
ksykulev marked this conversation as resolved.
return fmt.Errorf("dropping kernel_host_counts foreign key: %w", err)
}
return nil
}

func Down_20260316120000(_ *sql.Tx) error {
return nil
}
130 changes: 93 additions & 37 deletions server/datastore/mysql/operating_system_vulnerabilities.go
Original file line number Diff line number Diff line change
Expand Up @@ -469,50 +469,106 @@ GROUP BY id, cve, version
// It should be called as part of vulnerabilities job, which should only run on 1 server at a time.
// If concurrent calls are expected, add proper locking.
func (ds *Datastore) InsertKernelSoftwareMapping(ctx context.Context) error {
_, err := ds.writer(ctx).ExecContext(ctx, `UPDATE kernel_host_counts SET hosts_count = 0`)
if err != nil {
return ctxerr.Wrap(ctx, err, "zero out existing kernel hosts counts")
}
const (
swapTable = "kernel_host_counts_swap"
swapTableCreate = "CREATE TABLE IF NOT EXISTS " + swapTable + " LIKE kernel_host_counts"
Comment thread
getvictor marked this conversation as resolved.

statsStmt := `
INSERT INTO kernel_host_counts (software_title_id, software_id, os_version_id, hosts_count, team_id)
SELECT
software_titles.id AS software_title_id,
software.id AS software_id,
operating_systems.os_version_id AS os_version_id,
COUNT(host_operating_system.host_id) AS hosts_count,
COALESCE(hosts.team_id, 0) AS team_id
FROM
software_titles
JOIN software ON software.title_id = software_titles.id
JOIN host_software ON host_software.software_id = software.id
JOIN host_operating_system ON host_operating_system.host_id = host_software.host_id
JOIN operating_systems ON operating_systems.id = host_operating_system.os_id
JOIN hosts ON hosts.id = host_software.host_id
WHERE
software_titles.is_kernel = TRUE
GROUP BY
software_title_id,
software_id,
os_version_id,
team_id
ON DUPLICATE KEY UPDATE
hosts_count=VALUES(hosts_count)
`
selectStmt = `
SELECT
software_titles.id AS software_title_id,
software.id AS software_id,
operating_systems.os_version_id AS os_version_id,
COUNT(host_operating_system.host_id) AS hosts_count,
COALESCE(hosts.team_id, 0) AS team_id
FROM
software_titles
JOIN software ON software.title_id = software_titles.id
JOIN host_software ON host_software.software_id = software.id
JOIN host_operating_system ON host_operating_system.host_id = host_software.host_id
JOIN operating_systems ON operating_systems.id = host_operating_system.os_id
JOIN hosts ON hosts.id = host_software.host_id
WHERE
software_titles.is_kernel = TRUE
GROUP BY
software_titles.id,
software.id,
operating_systems.os_version_id,
hosts.team_id`

_, err = ds.writer(ctx).ExecContext(ctx, statsStmt)
if err != nil {
return ctxerr.Wrap(ctx, err, "insert kernel software mapping")
insertStmt = `INSERT INTO ` + swapTable + ` (software_title_id, software_id, os_version_id, hosts_count, team_id) VALUES %s`

valuesPart = `(?, ?, ?, ?, ?),`
)

// Create a fresh swap table. Drop any leftover from a previous failed run.
if _, err := ds.writer(ctx).ExecContext(ctx, "DROP TABLE IF EXISTS "+swapTable); err != nil {
return ctxerr.Wrap(ctx, err, "drop existing kernel swap table")
}
if _, err := ds.writer(ctx).ExecContext(ctx, swapTableCreate); err != nil {
return ctxerr.Wrap(ctx, err, "create kernel swap table")
}

_, err = ds.writer(ctx).ExecContext(ctx, `DELETE k FROM kernel_host_counts k LEFT JOIN software ON k.software_id = software.id WHERE software.id IS NULL`)
// Read kernel host counts from the reader to avoid contention with per-host writes on the writer.
rows, err := ds.reader(ctx).QueryContext(ctx, selectStmt)
if err != nil {
return ctxerr.Wrap(ctx, err, "clean up orphan kernels by software id")
return ctxerr.Wrap(ctx, err, "read kernel host counts")
}
defer rows.Close()

// Batch insert into the swap table on the writer.
const batchSize = 100
var batchCount int
args := make([]any, 0, batchSize*5)
for rows.Next() {
var (
softwareTitleID uint
softwareID uint
osVersionID uint
hostsCount uint
teamID uint
)
if err := rows.Scan(&softwareTitleID, &softwareID, &osVersionID, &hostsCount, &teamID); err != nil {
return ctxerr.Wrap(ctx, err, "scan kernel host count row")
}
args = append(args, softwareTitleID, softwareID, osVersionID, hostsCount, teamID)
batchCount++

_, err = ds.writer(ctx).ExecContext(ctx, `DELETE k FROM kernel_host_counts k LEFT JOIN operating_systems ON k.os_version_id = operating_systems.os_version_id WHERE operating_systems.id IS NULL`)
if err != nil {
return ctxerr.Wrap(ctx, err, "clean up orphan kernels by os version id")
if batchCount == batchSize {
values := strings.TrimSuffix(strings.Repeat(valuesPart, batchCount), ",")
if _, err := ds.writer(ctx).ExecContext(ctx, fmt.Sprintf(insertStmt, values), args...); err != nil {
return ctxerr.Wrap(ctx, err, "insert kernel host counts batch into swap table")
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this fails mid batch what happens? We aren't in a transaction. Do we care?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This query is inserting into a new kernel_host_counts_swap table that no one else is using. There is no contention. Any way it could fail?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Parent context is cancelled.
Mysql server restart or crash.
Connection timeout / network failure.

But I don't think we actually care if the swap table is left partial.

}
args = args[:0]
batchCount = 0
}
}
if batchCount > 0 {
values := strings.TrimSuffix(strings.Repeat(valuesPart, batchCount), ",")
if _, err := ds.writer(ctx).ExecContext(ctx, fmt.Sprintf(insertStmt, values), args...); err != nil {
return ctxerr.Wrap(ctx, err, "insert last kernel host counts batch into swap table")
}
}
if err := rows.Err(); err != nil {
return ctxerr.Wrap(ctx, err, "iterate kernel host count rows")
}

// Atomic table swap.
if err := ds.withRetryTxx(ctx, func(tx sqlx.ExtContext) error {
if _, err := tx.ExecContext(ctx, "DROP TABLE IF EXISTS kernel_host_counts_old"); err != nil {
return ctxerr.Wrap(ctx, err, "drop leftover old kernel table")
}
if _, err := tx.ExecContext(ctx, `
RENAME TABLE
kernel_host_counts TO kernel_host_counts_old,
`+swapTable+` TO kernel_host_counts`); err != nil {
return ctxerr.Wrap(ctx, err, "atomic kernel table swap")
}
if _, err := tx.ExecContext(ctx, "DROP TABLE IF EXISTS kernel_host_counts_old"); err != nil {
return ctxerr.Wrap(ctx, err, "drop old kernel table after swap")
}
return nil
}); err != nil {
return err
}

// Refresh the pre-aggregated OS version vulnerabilities table
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -739,9 +739,9 @@ func testKernelVulnsHostCount(t *testing.T, ds *Datastore) {

ExecAdhocSQL(t, ds, func(q sqlx.ExtContext) error {
var count uint
err := sqlx.GetContext(ctx, q, &count, "SELECT hosts_count FROM kernel_host_counts WHERE os_version_id = ?", os1.OSVersionID)
err := sqlx.GetContext(ctx, q, &count, "SELECT COUNT(*) FROM kernel_host_counts WHERE os_version_id = ?", os1.OSVersionID)
require.NoError(t, err)
assert.Zero(t, count)
assert.Zero(t, count, "expected no rows in kernel_host_counts after all hosts with this kernel were deleted")
return nil
})

Expand Down
7 changes: 3 additions & 4 deletions server/datastore/mysql/schema.sql

Large diffs are not rendered by default.

58 changes: 44 additions & 14 deletions server/datastore/mysql/software.go
Original file line number Diff line number Diff line change
Expand Up @@ -2827,21 +2827,51 @@ func (ds *Datastore) CleanupSoftwareTitles(ctx context.Context) error {
)
}(time.Now())

const deleteOrphanedSoftwareTitlesStmt = `
DELETE st FROM software_titles st
LEFT JOIN software s ON st.id = s.title_id
LEFT JOIN software_installers si ON st.id = si.title_id
LEFT JOIN in_house_apps iha ON st.id = iha.title_id
LEFT JOIN vpp_apps vap ON st.id = vap.title_id
WHERE s.title_id IS NULL AND si.title_id IS NULL AND iha.title_id IS NULL AND vap.title_id IS NULL`

res, err := ds.writer(ctx).ExecContext(ctx, deleteOrphanedSoftwareTitlesStmt)
if err != nil {
return ctxerr.Wrap(ctx, err, "executing delete of software titles")
}
ra, _ := res.RowsAffected()
n += ra
const (
findOrphanedSoftwareTitlesStmt = `
SELECT st.id
FROM software_titles st
LEFT JOIN software s ON st.id = s.title_id
LEFT JOIN software_installers si ON st.id = si.title_id
LEFT JOIN in_house_apps iha ON st.id = iha.title_id
LEFT JOIN vpp_apps vap ON st.id = vap.title_id
WHERE st.id > ? AND s.title_id IS NULL AND si.title_id IS NULL AND iha.title_id IS NULL AND vap.title_id IS NULL
ORDER BY st.id
LIMIT ?`

// Re-check orphan status on the writer to avoid deleting a title that an IT admin just linked
// (e.g., added a software installer) between the reader SELECT and this DELETE.
deleteOrphanedSoftwareTitlesStmt = `
DELETE st FROM software_titles st
LEFT JOIN software s ON st.id = s.title_id
LEFT JOIN software_installers si ON st.id = si.title_id
LEFT JOIN in_house_apps iha ON st.id = iha.title_id
LEFT JOIN vpp_apps vap ON st.id = vap.title_id
WHERE st.id IN (?) AND s.title_id IS NULL AND si.title_id IS NULL AND iha.title_id IS NULL AND vap.title_id IS NULL`
)

var lastID uint
for range cleanupMaxIterations {
var ids []uint
if err := sqlx.SelectContext(ctx, ds.reader(ctx), &ids, findOrphanedSoftwareTitlesStmt, lastID, cleanupBatchSize); err != nil {
return ctxerr.Wrap(ctx, err, "find orphaned software titles for cleanup")
}
if len(ids) == 0 {
return nil
}
lastID = ids[len(ids)-1]

stmt, args, err := sqlx.In(deleteOrphanedSoftwareTitlesStmt, ids)
if err != nil {
return ctxerr.Wrap(ctx, err, "build delete orphaned software titles query")
}
res, err := ds.writer(ctx).ExecContext(ctx, stmt, args...)
if err != nil {
return ctxerr.Wrap(ctx, err, "delete orphaned software titles batch")
}
ra, _ := res.RowsAffected()
n += ra
}
return nil
}

Expand Down
Loading