From 0dadf80a57a721c0cc39534fb5c536327f58295d Mon Sep 17 00:00:00 2001 From: Tim Lee Date: Thu, 30 Apr 2026 13:50:28 -0600 Subject: [PATCH 1/2] Revert "Fix SCEP autorenew failing for offline hosts (#44250)" This reverts commit 98aa8f8c3c418dabe5b880b2cbe7a5e00a547f2c. --- changes/44111-scep-autorenew-fail | 2 - server/datastore/mysql/apple_mdm.go | 28 +- server/datastore/mysql/apple_mdm_test.go | 308 -------------------- server/datastore/mysql/challenges_test.go | 131 --------- server/datastore/mysql/host_certificates.go | 19 -- server/datastore/mysql/mdm.go | 68 +---- server/fleet/mdm.go | 8 +- server/service/integration_mdm_test.go | 157 +--------- 8 files changed, 21 insertions(+), 700 deletions(-) delete mode 100644 changes/44111-scep-autorenew-fail delete mode 100644 server/datastore/mysql/challenges_test.go diff --git a/changes/44111-scep-autorenew-fail b/changes/44111-scep-autorenew-fail deleted file mode 100644 index 36e5353f30f..00000000000 --- a/changes/44111-scep-autorenew-fail +++ /dev/null @@ -1,2 +0,0 @@ -- Fixed SCEP renewals not retrying after initial failure -- Extend SCEP enrollment/renewal challenge credential TTL diff --git a/server/datastore/mysql/apple_mdm.go b/server/datastore/mysql/apple_mdm.go index 5d5124614e0..c01e7de14ae 100644 --- a/server/datastore/mysql/apple_mdm.go +++ b/server/datastore/mysql/apple_mdm.go @@ -3539,34 +3539,20 @@ func (ds *Datastore) UpdateOrDeleteHostMDMAppleProfile(ctx context.Context, prof } // Check whether we want to set a install operation as 'verifying' for an iOS/iPadOS device. - // For non-managed-cert profiles, iOS/iPadOS short-circuits to 'verified' because there is - // no osquery available to drive the standard verifying -> verified transition. Managed-cert - // profiles instead use CertificateList ingestion (updateHostMDMManagedCertDetailsDB) as - // their verification trigger; leaving them at 'verifying' here closes the renewal-cron race - // where 'verified' would arrive before fresh cert metadata had been ingested. See #44111. - var iOSAckCheck struct { - IsIOS bool `db:"is_ios"` - IsManagedCert bool `db:"is_managed_cert"` - } + var isIOSIPadOSInstallVerifiying bool if profile.OperationType == fleet.MDMOperationTypeInstall && profile.Status != nil && *profile.Status == fleet.MDMDeliveryVerifying { - if err := ds.writer(ctx).GetContext(ctx, &iOSAckCheck, ` - SELECT - (h.platform = 'ios' OR h.platform = 'ipados') AS is_ios, - EXISTS(SELECT 1 FROM host_mdm_managed_certificates WHERE host_uuid = h.uuid AND profile_uuid = ?) AS is_managed_cert - FROM hosts h - WHERE h.uuid = ?`, - profile.ProfileUUID, profile.HostUUID, + if err := ds.writer(ctx).GetContext(ctx, &isIOSIPadOSInstallVerifiying, ` + SELECT platform = 'ios' OR platform = 'ipados' FROM hosts WHERE uuid = ?`, + profile.HostUUID, ); err != nil { return err } } status := profile.Status - if iOSAckCheck.IsIOS && !iOSAckCheck.IsManagedCert { - // iOS/iPadOS devices do not have osquery, thus they go from 'pending' - // straight to 'verified'. Managed-cert profiles are the exception and - // transition via updateHostMDMManagedCertDetailsDB once fresh metadata - // arrives. + if isIOSIPadOSInstallVerifiying { + // iOS/iPadOS devices do not have osquery, + // thus they go from 'pending' straight to 'verified' status = &fleet.MDMDeliveryVerified } diff --git a/server/datastore/mysql/apple_mdm_test.go b/server/datastore/mysql/apple_mdm_test.go index 7c3c25fde46..c4e993b4d92 100644 --- a/server/datastore/mysql/apple_mdm_test.go +++ b/server/datastore/mysql/apple_mdm_test.go @@ -54,7 +54,6 @@ func TestMDMApple(t *testing.T) { {"TestListMDMAppleConfigProfiles", testListMDMAppleConfigProfiles}, {"TestHostDetailsMDMProfiles", testHostDetailsMDMProfiles}, {"TestHostDetailsMDMProfilesIOSIPadOS", testHostDetailsMDMProfilesIOSIPadOS}, - {"TestIOSManagedCertProfileStaysVerifying", testIOSManagedCertProfileStaysVerifying}, {"TestBatchSetMDMAppleProfiles", testBatchSetMDMAppleProfiles}, {"TestMDMAppleProfileManagement", testMDMAppleProfileManagement}, {"TestMDMAppleProfileManagementBatch2", testMDMAppleProfileManagementBatch2}, @@ -7636,120 +7635,6 @@ func testHostDetailsMDMProfilesIOSIPadOS(t *testing.T, ds *Datastore) { } } -// testIOSManagedCertProfileStaysVerifying covers the iOS/iPadOS race window in issue #44111. -// -// Non-cert iOS/iPadOS profiles short-circuit pending -> verified on MDM ack because there's -// no osquery to drive the standard verifying -> verified transition. That short-circuit was -// firing for managed-cert profiles too, which created a window where the renewal cron would -// see status='verified' but the cert metadata in host_mdm_managed_certificates still -// reflected the OLD cert — the renewal cron's renewal-window HAVING clause kept matching and -// fired renewal redundantly each tick until CertificateList ingestion caught up. -// -// Fix: managed-cert profiles park at 'verifying' on ack and only flip to 'verified' when -// updateHostMDMManagedCertDetailsDB ingests fresh cert metadata. -func testIOSManagedCertProfileStaysVerifying(t *testing.T, ds *Datastore) { - ctx := t.Context() - - cp, err := ds.NewMDMAppleConfigProfile(ctx, fleet.MDMAppleConfigProfile{ - Name: "scep-cert", - Identifier: "com.example.scep", - Mobileconfig: []byte("scep-profile-bytes"), - }, nil) - require.NoError(t, err) - - iOSHost, err := ds.NewHost(ctx, &fleet.Host{ - DetailUpdatedAt: time.Now(), - LabelUpdatedAt: time.Now(), - PolicyUpdatedAt: time.Now(), - SeenTime: time.Now(), - OsqueryHostID: ptr.String("ios-managed-cert-osquery-id"), - NodeKey: ptr.String("ios-managed-cert-node-key"), - UUID: "ios-managed-cert-uuid", - Hostname: "ios-managed-cert", - Platform: "ios", - }) - require.NoError(t, err) - - const cmdUUID = "ios-managed-cert-cmd" - ExecAdhocSQL(t, ds, func(q sqlx.ExtContext) error { - _, err := q.ExecContext(ctx, ` - INSERT INTO host_mdm_apple_profiles - (host_uuid, profile_uuid, command_uuid, status, operation_type, detail, profile_name, profile_identifier, checksum) - VALUES (?, ?, ?, ?, ?, '', ?, ?, ?)`, - iOSHost.UUID, cp.ProfileUUID, cmdUUID, fleet.MDMDeliveryPending, fleet.MDMOperationTypeInstall, - cp.Name, "com.test.profile."+cp.ProfileUUID, test.MakeTestChecksum(0), - ) - return err - }) - - // Mark this profile as a managed-cert profile by inserting the matching row. - // At render time this only carries type/ca_name/challenge_retrieved_at; cert metadata - // arrives later via updateHostMDMManagedCertDetailsDB. - const caName = "test-ca" - require.NoError(t, ds.BulkUpsertMDMManagedCertificates(ctx, []*fleet.MDMManagedCertificate{ - { - HostUUID: iOSHost.UUID, - ProfileUUID: cp.ProfileUUID, - Type: fleet.CAConfigCustomSCEPProxy, - CAName: caName, - }, - })) - - // MDM ack lands as 'verifying'. Without the fix, this would be intercepted and short- - // circuited to 'verified'. With the fix, it must remain 'verifying' for managed-cert - // profiles so the renewal cron's IN ('verified', 'failed') filter excludes the row - // while cert metadata is being refreshed. - require.NoError(t, ds.UpdateOrDeleteHostMDMAppleProfile(ctx, &fleet.HostMDMAppleProfile{ - HostUUID: iOSHost.UUID, - CommandUUID: cmdUUID, - ProfileUUID: cp.ProfileUUID, - Status: &fleet.MDMDeliveryVerifying, - OperationType: fleet.MDMOperationTypeInstall, - })) - - gotProfs, err := ds.GetHostMDMAppleProfiles(ctx, iOSHost.UUID) - require.NoError(t, err) - require.Len(t, gotProfs, 1) - require.NotNil(t, gotProfs[0].Status) - assert.Equal(t, fleet.MDMDeliveryVerifying, *gotProfs[0].Status, - "managed-cert profile must stay at 'verifying' on iOS until cert metadata refreshes") - - // Simulate the device's CertificateList response landing in UpdateHostCertificates, - // which calls updateHostMDMManagedCertDetailsDB with fresh cert metadata. That path - // must flip the corresponding install row from 'verifying' to 'verified'. - notValidBefore := time.Now().Add(-1 * time.Hour).UTC().Round(time.Microsecond) - notValidAfter := time.Now().Add(30 * 24 * time.Hour).UTC().Round(time.Microsecond) - serial := "abcdef0123456789" - require.NoError(t, ds.UpdateHostCertificates(ctx, iOSHost.ID, iOSHost.UUID, []*fleet.HostCertificateRecord{ - { - HostID: iOSHost.ID, - SHA1Sum: []byte("0123456789abcdef0123"), // 20 bytes - NotValidBefore: notValidBefore, - NotValidAfter: notValidAfter, - CommonName: "fleet-managed-cert", - Serial: serial, - SubjectCommonName: "fleet-" + cp.ProfileUUID, - SubjectOrganizationalUnit: "fleet", - Source: fleet.SystemHostCertificate, - }, - })) - - gotProfs, err = ds.GetHostMDMAppleProfiles(ctx, iOSHost.UUID) - require.NoError(t, err) - require.Len(t, gotProfs, 1) - require.NotNil(t, gotProfs[0].Status) - assert.Equal(t, fleet.MDMDeliveryVerified, *gotProfs[0].Status, - "updateHostMDMManagedCertDetailsDB must flip 'verifying' -> 'verified' once metadata refreshes") - - // And cert metadata must reflect the new cert. - managedProf, err := ds.GetAppleHostMDMCertificateProfile(ctx, iOSHost.UUID, cp.ProfileUUID, caName) - require.NoError(t, err) - require.NotNil(t, managedProf) - require.NotNil(t, managedProf.Serial) - // Serials are stored zero-padded to 40 chars in the cert detail update path. - assert.Equal(t, fmt.Sprintf("%040s", serial), *managedProf.Serial) -} - func testMDMAppleBootstrapPackageWithS3(t *testing.T, ds *Datastore) { ctx := t.Context() var nfe fleet.NotFoundError @@ -8924,199 +8809,6 @@ func testMDMManagedSCEPCertificates(t *testing.T, ds *Datastore) { require.NoError(t, err) require.NotNil(t, profile) }) - - // Regression test for issue #44111: renewal cron must not flip the status of a - // profile that's still being delivered ('pending' or 'verifying'). Otherwise an - // offline host that hasn't picked up the renewal yet would have its profile - // re-rendered with a fresh challenge every cron tick, generating orphan nano - // commands and challenge rows hourly. - t.Run("Renewal cron skips in-flight statuses (issue #44111)", func(t *testing.T) { - notValidBefore := time.Now().Add(-16 * 24 * time.Hour).UTC().Round(time.Microsecond) - notValidAfter := time.Now().Add(14 * 24 * time.Hour).UTC().Round(time.Microsecond) - err = ds.BulkUpsertMDMManagedCertificates(ctx, []*fleet.MDMManagedCertificate{ - { - HostUUID: host.UUID, - ProfileUUID: initialCP.ProfileUUID, - ChallengeRetrievedAt: challengeRetrievedAt, - NotValidBefore: ¬ValidBefore, - NotValidAfter: ¬ValidAfter, - Type: caType, - CAName: caName, - Serial: &serial, - }, - }) - require.NoError(t, err) - - for _, inFlightStatus := range []fleet.MDMDeliveryStatus{fleet.MDMDeliveryPending, fleet.MDMDeliveryVerifying} { - ExecAdhocSQL(t, ds, func(q sqlx.ExtContext) error { - _, err := q.ExecContext(ctx, ` - UPDATE host_mdm_apple_profiles SET status = ? WHERE host_uuid = ? AND profile_uuid = ? - `, inFlightStatus, host.UUID, initialCP.ProfileUUID) - return err - }) - - err = ds.RenewMDMManagedCertificates(ctx) - require.NoError(t, err) - profile, err = ds.GetAppleHostMDMCertificateProfile(ctx, host.UUID, initialCP.ProfileUUID, caName) - require.NoError(t, err) - require.NotNil(t, profile.Status) - assert.Equalf(t, inFlightStatus, *profile.Status, - "renewal cron must not flip status away from %q (in-flight delivery)", inFlightStatus) - } - - // Sanity check: 'failed' must still be picked up so the cron can recover - // from a transient SCEP server outage on the next tick — but only after - // renewalFailedRetryBackoff has elapsed since the failure (gate against - // permanent-failure looping). Backdate updated_at to simulate the row - // having been in 'failed' long enough to be eligible for retry. - ExecAdhocSQL(t, ds, func(q sqlx.ExtContext) error { - _, err := q.ExecContext(ctx, ` - UPDATE host_mdm_apple_profiles SET status = ?, updated_at = ? - WHERE host_uuid = ? AND profile_uuid = ? - `, fleet.MDMDeliveryFailed, time.Now().Add(-25*time.Hour), host.UUID, initialCP.ProfileUUID) - return err - }) - err = ds.RenewMDMManagedCertificates(ctx) - require.NoError(t, err) - profile, err = ds.GetAppleHostMDMCertificateProfile(ctx, host.UUID, initialCP.ProfileUUID, caName) - require.NoError(t, err) - require.Nil(t, profile.Status, "renewal cron must re-trigger 'failed' to recover from transient outages") - }) - - // Regression test for the permanent-failure loop. Pre-PR, BulkUpsert wiped cert - // metadata on every reconcile, which inadvertently kept permanent failures from - // looping. After the COALESCE fix that's gone, so we need an explicit backoff: - // 'failed' rows that were just marked failed (e.g., this cron tick's reconcile - // just rendered them as failed because the CA was deleted) must NOT be picked up - // on the very next tick. They become eligible only after renewalFailedRetryBackoff. - t.Run("Permanent-failure backoff (issue #44111)", func(t *testing.T) { - notValidBefore := time.Now().Add(-15 * 24 * time.Hour).UTC().Round(time.Microsecond) - notValidAfter := time.Now().Add(14 * 24 * time.Hour).UTC().Round(time.Microsecond) - err = ds.BulkUpsertMDMManagedCertificates(ctx, []*fleet.MDMManagedCertificate{ - { - HostUUID: host.UUID, - ProfileUUID: initialCP.ProfileUUID, - ChallengeRetrievedAt: challengeRetrievedAt, - NotValidBefore: ¬ValidBefore, - NotValidAfter: ¬ValidAfter, - Type: caType, - CAName: caName, - Serial: &serial, - }, - }) - require.NoError(t, err) - - // Mark failed with updated_at set to NOW — the typical path when reconcile - // just rendered the profile as failed (e.g. CA deleted, IDP missing). - ExecAdhocSQL(t, ds, func(q sqlx.ExtContext) error { - _, err := q.ExecContext(ctx, ` - UPDATE host_mdm_apple_profiles SET status = ?, updated_at = NOW(6) - WHERE host_uuid = ? AND profile_uuid = ? - `, fleet.MDMDeliveryFailed, host.UUID, initialCP.ProfileUUID) - return err - }) - - // Cron must NOT pick this up — would loop hourly otherwise. - err = ds.RenewMDMManagedCertificates(ctx) - require.NoError(t, err) - profile, err = ds.GetAppleHostMDMCertificateProfile(ctx, host.UUID, initialCP.ProfileUUID, caName) - require.NoError(t, err) - require.NotNil(t, profile.Status, - "renewal cron must NOT re-fire on a freshly-failed row (would loop on permanent failures)") - assert.Equal(t, fleet.MDMDeliveryFailed, *profile.Status) - - // Once the backoff window has passed, the cron is allowed to retry once. - ExecAdhocSQL(t, ds, func(q sqlx.ExtContext) error { - _, err := q.ExecContext(ctx, ` - UPDATE host_mdm_apple_profiles SET updated_at = ? - WHERE host_uuid = ? AND profile_uuid = ? - `, time.Now().Add(-25*time.Hour), host.UUID, initialCP.ProfileUUID) - return err - }) - err = ds.RenewMDMManagedCertificates(ctx) - require.NoError(t, err) - profile, err = ds.GetAppleHostMDMCertificateProfile(ctx, host.UUID, initialCP.ProfileUUID, caName) - require.NoError(t, err) - require.Nil(t, profile.Status, - "renewal cron must re-fire 'failed' once renewalFailedRetryBackoff has elapsed") - }) - - // Regression test for issue #44111: the reconcile re-render that fires after a - // renewal trigger calls BulkUpsertMDMManagedCertificates with a payload that has - // nil NotValidBefore/NotValidAfter/Serial (those fields aren't known at profile - // render time). That upsert must not clobber the populated cert metadata, otherwise - // RenewMDMManagedCertificates' `HAVING validity_period IS NOT NULL` clause excludes - // the row and the renewal cron silently stops re-trying. - t.Run("Reconcile re-render preserves cert metadata (issue #44111)", func(t *testing.T) { - notValidBefore := time.Now().Add(-16 * 24 * time.Hour).UTC().Round(time.Microsecond) - notValidAfter := time.Now().Add(14 * 24 * time.Hour).UTC().Round(time.Microsecond) - err = ds.BulkUpsertMDMManagedCertificates(ctx, []*fleet.MDMManagedCertificate{ - { - HostUUID: host.UUID, - ProfileUUID: initialCP.ProfileUUID, - ChallengeRetrievedAt: challengeRetrievedAt, - NotValidBefore: ¬ValidBefore, - NotValidAfter: ¬ValidAfter, - Type: caType, - CAName: caName, - Serial: &serial, - }, - }) - require.NoError(t, err) - ExecAdhocSQL(t, ds, func(q sqlx.ExtContext) error { - _, err := q.ExecContext(ctx, ` - UPDATE host_mdm_apple_profiles SET status = ? WHERE host_uuid = ? AND profile_uuid = ? - `, fleet.MDMDeliveryVerified, host.UUID, initialCP.ProfileUUID) - return err - }) - - // Renewal cron flips status to NULL since the cert is in the renewal window. - err = ds.RenewMDMManagedCertificates(ctx) - require.NoError(t, err) - profile, err = ds.GetAppleHostMDMCertificateProfile(ctx, host.UUID, initialCP.ProfileUUID, caName) - require.NoError(t, err) - require.Nil(t, profile.Status) - - // Reconcile re-renders the profile and upserts with the render-time payload — - // type/ca_name/challenge_retrieved_at set, all cert fields nil. Mirrors the - // payload built by ReplaceCustomSCEPProxyURLVariable and the NDES handler. - err = ds.BulkUpsertMDMManagedCertificates(ctx, []*fleet.MDMManagedCertificate{ - { - HostUUID: host.UUID, - ProfileUUID: initialCP.ProfileUUID, - ChallengeRetrievedAt: challengeRetrievedAt, - Type: caType, - CAName: caName, - }, - }) - require.NoError(t, err) - - // Cert metadata must survive the reconcile re-render. - profile, err = ds.GetAppleHostMDMCertificateProfile(ctx, host.UUID, initialCP.ProfileUUID, caName) - require.NoError(t, err) - require.NotNil(t, profile) - require.NotNil(t, profile.Serial, "serial must be preserved across reconcile re-render") - assert.Equal(t, serial, *profile.Serial) - require.NotNil(t, profile.NotValidBefore, "not_valid_before must be preserved across reconcile re-render") - assert.Equal(t, ¬ValidBefore, profile.NotValidBefore) - require.NotNil(t, profile.NotValidAfter, "not_valid_after must be preserved across reconcile re-render") - assert.Equal(t, ¬ValidAfter, profile.NotValidAfter) - - // If the device's SCEP handshake never completes, the renewal cron must still - // pick up the row on a subsequent run. Re-mark verified, re-run renewal — must - // flip status to NULL again, proving validity_period is still queryable. - ExecAdhocSQL(t, ds, func(q sqlx.ExtContext) error { - _, err := q.ExecContext(ctx, ` - UPDATE host_mdm_apple_profiles SET status = ? WHERE host_uuid = ? AND profile_uuid = ? - `, fleet.MDMDeliveryVerified, host.UUID, initialCP.ProfileUUID) - return err - }) - err = ds.RenewMDMManagedCertificates(ctx) - require.NoError(t, err) - profile, err = ds.GetAppleHostMDMCertificateProfile(ctx, host.UUID, initialCP.ProfileUUID, caName) - require.NoError(t, err) - require.Nil(t, profile.Status, "renewal cron must still detect cert with preserved metadata") - }) }) } } diff --git a/server/datastore/mysql/challenges_test.go b/server/datastore/mysql/challenges_test.go deleted file mode 100644 index 0fb1b0e0b0b..00000000000 --- a/server/datastore/mysql/challenges_test.go +++ /dev/null @@ -1,131 +0,0 @@ -package mysql - -import ( - "database/sql" - "testing" - "time" - - "github.com/fleetdm/fleet/v4/server/fleet" - "github.com/jmoiron/sqlx" - "github.com/stretchr/testify/require" -) - -func TestChallenges(t *testing.T) { - ds := CreateMySQLDS(t) - - cases := []struct { - name string - fn func(t *testing.T, ds *Datastore) - }{ - {"NewAndConsume", testChallengeNewAndConsume}, - {"ConsumeMissing", testChallengeConsumeMissing}, - {"ConsumeWithinTTL", testChallengeConsumeWithinTTL}, - {"ConsumeExpired", testChallengeConsumeExpired}, - {"CleanupRespectsTTL", testChallengeCleanupRespectsTTL}, - } - - for _, c := range cases { - t.Run(c.name, func(t *testing.T) { - defer TruncateTables(t, ds) - c.fn(t, ds) - }) - } -} - -func testChallengeNewAndConsume(t *testing.T, ds *Datastore) { - ctx := t.Context() - - challenge, err := ds.NewChallenge(ctx) - require.NoError(t, err) - require.NotEmpty(t, challenge) - - err = ds.ConsumeChallenge(ctx, challenge) - require.NoError(t, err) - - // Second consume must fail — challenge is one-time. - err = ds.ConsumeChallenge(ctx, challenge) - require.Error(t, err) - require.ErrorIs(t, err, sql.ErrNoRows) -} - -func testChallengeConsumeMissing(t *testing.T, ds *Datastore) { - ctx := t.Context() - - err := ds.ConsumeChallenge(ctx, "") - require.Error(t, err) - require.ErrorIs(t, err, sql.ErrNoRows) - - err = ds.ConsumeChallenge(ctx, "never-issued") - require.Error(t, err) - require.ErrorIs(t, err, sql.ErrNoRows) -} - -// testChallengeConsumeWithinTTL backdates a challenge to just within OneTimeChallengeTTL and -// confirms it's still consumable. Regression coverage for issue #44111: devices may take -// hours/days to process the InstallProfile push before sending the SCEP request. -func testChallengeConsumeWithinTTL(t *testing.T, ds *Datastore) { - ctx := t.Context() - - challenge, err := ds.NewChallenge(ctx) - require.NoError(t, err) - - // Backdate to 1 minute inside the TTL. - backdated := time.Now().Add(-fleet.OneTimeChallengeTTL).Add(1 * time.Minute) - ExecAdhocSQL(t, ds, func(q sqlx.ExtContext) error { - _, err := q.ExecContext(ctx, `UPDATE challenges SET created_at = ? WHERE challenge = ?`, backdated, challenge) - return err - }) - - err = ds.ConsumeChallenge(ctx, challenge) - require.NoError(t, err) -} - -// testChallengeConsumeExpired backdates a challenge past OneTimeChallengeTTL and confirms it's -// rejected as expired (returns sql.ErrNoRows wrapped with "challenge expired"). -func testChallengeConsumeExpired(t *testing.T, ds *Datastore) { - ctx := t.Context() - - challenge, err := ds.NewChallenge(ctx) - require.NoError(t, err) - - // Backdate past the TTL. - expired := time.Now().Add(-fleet.OneTimeChallengeTTL).Add(-1 * time.Minute) - ExecAdhocSQL(t, ds, func(q sqlx.ExtContext) error { - _, err := q.ExecContext(ctx, `UPDATE challenges SET created_at = ? WHERE challenge = ?`, expired, challenge) - return err - }) - - err = ds.ConsumeChallenge(ctx, challenge) - require.Error(t, err) - require.ErrorIs(t, err, sql.ErrNoRows) -} - -// testChallengeCleanupRespectsTTL verifies CleanupExpiredChallenges deletes only challenges -// older than OneTimeChallengeTTL and leaves still-valid challenges in place. -func testChallengeCleanupRespectsTTL(t *testing.T, ds *Datastore) { - ctx := t.Context() - - freshChallenge, err := ds.NewChallenge(ctx) - require.NoError(t, err) - expiredChallenge, err := ds.NewChallenge(ctx) - require.NoError(t, err) - - expired := time.Now().Add(-fleet.OneTimeChallengeTTL).Add(-1 * time.Minute) - ExecAdhocSQL(t, ds, func(q sqlx.ExtContext) error { - _, err := q.ExecContext(ctx, `UPDATE challenges SET created_at = ? WHERE challenge = ?`, expired, expiredChallenge) - return err - }) - - deleted, err := ds.CleanupExpiredChallenges(ctx) - require.NoError(t, err) - require.EqualValues(t, 1, deleted) - - // Fresh challenge survives and is still consumable. - err = ds.ConsumeChallenge(ctx, freshChallenge) - require.NoError(t, err) - - // Expired challenge is gone. - err = ds.ConsumeChallenge(ctx, expiredChallenge) - require.Error(t, err) - require.ErrorIs(t, err, sql.ErrNoRows) -} diff --git a/server/datastore/mysql/host_certificates.go b/server/datastore/mysql/host_certificates.go index ccda58fe58e..8077156d8cb 100644 --- a/server/datastore/mysql/host_certificates.go +++ b/server/datastore/mysql/host_certificates.go @@ -520,25 +520,6 @@ func updateHostMDMManagedCertDetailsDB(ctx context.Context, tx sqlx.ExtContext, if _, err := tx.ExecContext(ctx, stmt, args...); err != nil { return ctxerr.Wrap(ctx, err, "updating host mdm managed certificates") } - - // Fresh cert metadata is the verification signal for iOS/iPadOS managed-cert - // profiles (which UpdateOrDeleteHostMDMAppleProfile parks at 'verifying' on - // MDM ack instead of short-circuiting to 'verified'). Flip the install row - // here so the renewal cron's IN ('verified', 'failed') filter only matches - // once cert metadata is in sync. For macOS this is redundant with - // VerifyHostMDMProfiles but idempotent. See issue #44111. - flipStmt := ` - UPDATE host_mdm_apple_profiles - SET status = ? - WHERE host_uuid = ? AND profile_uuid = ? - AND status = ? AND operation_type = ?` - if _, err := tx.ExecContext(ctx, flipStmt, - fleet.MDMDeliveryVerified, - certToUpdate.HostUUID, certToUpdate.ProfileUUID, - fleet.MDMDeliveryVerifying, fleet.MDMOperationTypeInstall, - ); err != nil { - return ctxerr.Wrap(ctx, err, "flipping iOS managed cert profile to verified") - } } return nil } diff --git a/server/datastore/mysql/mdm.go b/server/datastore/mysql/mdm.go index 9a07b88c29c..505dc88ac0e 100644 --- a/server/datastore/mysql/mdm.go +++ b/server/datastore/mysql/mdm.go @@ -19,26 +19,6 @@ import ( "github.com/jmoiron/sqlx" ) -// renewalFailedRetryBackoff bounds how often RenewMDMManagedCertificates re-triggers -// renewal for a managed-cert profile sitting in the 'failed' state. Without this gate, -// a permanent failure (CA deleted, IDP variables missing, license downgraded — anything -// that fails at profile-render time via fleet.MarkProfilesFailed) would loop every cron -// tick: cron flips status to NULL, reconcile re-renders and immediately fails again, -// status returns to 'failed', repeat. The backoff is set well above the cron interval -// (1h) so transient SCEP-server outages still recover within a day, but permanent -// failures don't churn nano commands and profile renders hourly. See issue #44111. -const renewalFailedRetryBackoff = 24 * time.Hour - -var mdmCommandsAllowedOrderKeys = common_mysql.OrderKeyAllowlist{ - "command_uuid": "command_uuid", - "request_type": "request_type", - "status": "status", - "updated_at": "updated_at", - "hostname": "hostname", - "host_uuid": "host_uuid", - "name": "name", -} - func (ds *Datastore) GetMDMCommandPlatform(ctx context.Context, commandUUID string) (string, error) { stmt := ` SELECT CASE @@ -2788,14 +2768,6 @@ func (ds *Datastore) BulkUpsertMDMManagedCertificates(ctx context.Context, paylo } executeUpsertBatch := func(valuePart string, args []any) error { - // Cert metadata columns (not_valid_before, not_valid_after, serial) use COALESCE so a - // nil incoming value preserves the previously stored value. The reconcile re-render - // path (e.g. ReplaceCustomSCEPProxyURLVariable, NDES/Smallstep handlers) upserts with - // those fields nil — they aren't known until the device completes the SCEP handshake - // and osquery reports the issued cert via updateHostMDMManagedCertDetailsDB. Without - // COALESCE, a renewal trigger silently wipes cert metadata, which then disables the - // renewal cron itself (its HAVING clause requires validity_period IS NOT NULL). See - // issue #44111. stmt := fmt.Sprintf(` INSERT INTO host_mdm_managed_certificates ( host_uuid, @@ -2810,11 +2782,11 @@ func (ds *Datastore) BulkUpsertMDMManagedCertificates(ctx context.Context, paylo VALUES %s ON DUPLICATE KEY UPDATE challenge_retrieved_at = VALUES(challenge_retrieved_at), - not_valid_before = COALESCE(VALUES(not_valid_before), not_valid_before), - not_valid_after = COALESCE(VALUES(not_valid_after), not_valid_after), + not_valid_before = VALUES(not_valid_before), + not_valid_after = VALUES(not_valid_after), type = VALUES(type), ca_name = VALUES(ca_name), - serial = COALESCE(VALUES(serial), serial)`, + serial = VALUES(serial)`, strings.TrimSuffix(valuePart, ","), ) @@ -2873,33 +2845,18 @@ func (ds *Datastore) RenewMDMManagedCertificates(ctx context.Context) error { ) continue } - // This will trigger a resend next time profiles are checked. Restrict to settled - // statuses ('verified', 'failed') to avoid re-firing renewal while a previous - // delivery is still in flight ('pending', 'verifying') — that would generate a - // fresh challenge and InstallProfile command every cron tick for any host that - // hasn't yet picked up the renewal (e.g. offline laptop), creating orphan nano - // commands and challenge rows hourly. See issue #44111. - // - // 'failed' rows are additionally gated on hp.updated_at being older than - // renewalFailedRetryBackoff. Without that gate, a permanent failure (CA deleted, - // IDP variables missing, license downgraded — anything that fails at profile-render - // time via fleet.MarkProfilesFailed) would loop indefinitely: cron flips status to - // NULL, reconcile re-renders and immediately fails again, status returns to 'failed', - // next cron tick repeats. Pre-fix this was masked by bug #1's metadata wipe acting - // as an accidental circuit breaker; once metadata is preserved (COALESCE), the loop - // becomes visible. The backoff gives transient SCEP failures a chance to recover - // without spamming a fresh challenge + nano command per cron tick. - updateQuery := `UPDATE ` + table + ` SET status = NULL WHERE status IN (?, ?) AND operation_type = ? AND (` + // This will trigger a resend next time profiles are checked + updateQuery := `UPDATE ` + table + ` SET status = NULL WHERE status IS NOT NULL AND operation_type = ? AND (` hostProfileClause := `` - values := []any{fleet.MDMDeliveryVerified, fleet.MDMDeliveryFailed, fleet.MDMOperationTypeInstall} + values := []any{fleet.MDMOperationTypeInstall} hostCertsToRenew := []struct { HostUUID string `db:"host_uuid"` ProfileUUID string `db:"profile_uuid"` NotValidAfter time.Time `db:"not_valid_after"` ValidityPeriod int `db:"validity_period"` }{} - // Fetch all MDM Managed certificates of the given type that are in a settled - // status ('verified' or 'failed') and which + // Fetch all MDM Managed certificates of the given type that aren't already queued for + // resend(hmap.status=null) and which // * Have a validity period > 30 days and are expiring in the next 30 days // * Have a validity period <= 30 days and are within half the validity period of expiration // nb: we SELECT not_valid_after and validity_period here so we can use them in the HAVING clause, but @@ -2916,17 +2873,12 @@ func (ds *Datastore) RenewMDMManagedCertificates(ctx context.Context) error { `+table+` hp ON hmmc.host_uuid = hp.host_uuid AND hmmc.profile_uuid = hp.profile_uuid WHERE - hmmc.type = ? - AND hp.operation_type = ? - AND ( - hp.status = ? - OR (hp.status = ? AND hp.updated_at < DATE_SUB(NOW(), INTERVAL ? SECOND)) - ) + hmmc.type = ? AND hp.status IS NOT NULL AND hp.operation_type = ? HAVING validity_period IS NOT NULL AND ((validity_period > 30 AND not_valid_after < DATE_ADD(NOW(), INTERVAL 30 DAY)) OR (validity_period <= 30 AND not_valid_after < DATE_ADD(NOW(), INTERVAL validity_period/2 DAY))) - LIMIT ?`, hostCertType, fleet.MDMOperationTypeInstall, fleet.MDMDeliveryVerified, fleet.MDMDeliveryFailed, int(renewalFailedRetryBackoff.Seconds()), limit) + LIMIT ?`, hostCertType, fleet.MDMOperationTypeInstall, limit) if err != nil { return ctxerr.Wrap(ctx, err, "retrieving mdm managed certificates to renew") } diff --git a/server/fleet/mdm.go b/server/fleet/mdm.go index 8268eb1a10a..c6dcf0994a0 100644 --- a/server/fleet/mdm.go +++ b/server/fleet/mdm.go @@ -85,12 +85,8 @@ const ( FleetVarSmallstepSCEPProxyURLPrefix FleetVarName = "SMALLSTEP_SCEP_PROXY_URL_" FleetVarSCEPWindowsCertificateID FleetVarName = "SCEP_WINDOWS_CERTIFICATE_ID" // nolint:gosec // G101: Potential hardcoded credentials - // OneTimeChallengeTTL is the time to live for one-time challenges. The challenge is - // generated at profile-render time but consumed when the device makes its SCEP request, - // which can be hours or days later if the device is offline (asleep, on a plane, etc.). - // 7 days covers a typical absence without being unbounded; once consumed, the challenge - // is deleted immediately regardless of TTL. See issue #44111. - OneTimeChallengeTTL = 7 * 24 * time.Hour + // OneTimeChallengeTTL is the time to live for one-time challenges. + OneTimeChallengeTTL = 1 * time.Hour ) // HasCAVariables returns true if any of the given Fleet variable names diff --git a/server/service/integration_mdm_test.go b/server/service/integration_mdm_test.go index e043a6d22b3..9650eec5d85 100644 --- a/server/service/integration_mdm_test.go +++ b/server/service/integration_mdm_test.go @@ -18021,11 +18021,10 @@ func (s *integrationMDMTestSuite) TestCustomSCEPIntegration() { profileUUID2 := checkProfileInstallStatus(t, getHostResp, "N0", fleet.MDMDeliveryPending, "") require.Equal(t, profileUUID, profileUUID2, "Expected the same profile UUID after re-sending the SCEP profile") - // Expire the challenge so that the next PKIOperation fails. Backdate beyond - // fleet.OneTimeChallengeTTL so this stays correct as the constant evolves. + // Expire the challenge so that the next PKIOperation fails mysql.ExecAdhocSQL(t, s.ds, func(q sqlx.ExtContext) error { stmt := "UPDATE challenges SET created_at = ? WHERE challenge = ?" - res, err := q.ExecContext(context.Background(), stmt, time.Now().Add(-fleet.OneTimeChallengeTTL-time.Hour), gotChallenge2) + res, err := q.ExecContext(context.Background(), stmt, time.Now().Add(-2*time.Hour), gotChallenge2) require.NoError(t, err, "Failed to expire the challenge in the database") rowsAffected, _ := res.RowsAffected() require.Equal(t, int64(1), rowsAffected, "Expected to update 1 row for the challenge") @@ -18130,158 +18129,6 @@ func (s *integrationMDMTestSuite) TestCustomSCEPIntegration() { "scepName certificate authority doesn't exist") } -// TestCustomSCEPRenewalPreservesCertMetadata is the end-to-end regression for issue #44111. -// -// The unit tests in apple_mdm_test.go prove that BulkUpsertMDMManagedCertificates preserves -// cert metadata when called with a hand-crafted nil-metadata payload. This test drives the -// real reconcile path (preprocessProfileContents via awaitTriggerProfileSchedule) so that if -// the render-time payload structure ever changes (e.g. gains a non-nil sentinel for an -// existing nil field), the unit test would still pass while this one would catch the -// regression. -// -// It also exercises the recovery loop: renewal cron flips status -> reconcile re-renders -// (the path that previously clobbered cert metadata) -> simulated SCEP failure -> renewal -// cron must re-fire on the next tick. Without the COALESCE fix in -// BulkUpsertMDMManagedCertificates, the second renewal call silently no-ops because -// validity_period IS NULL excludes the row from the renewal cron's HAVING clause. -func (s *integrationMDMTestSuite) TestCustomSCEPRenewalPreservesCertMetadata() { - t := s.T() - ctx := context.Background() - s.setSkipWorkerJobs(t) - scepServer := scep_server.StartTestSCEPServer(t) - - s.awaitTriggerProfileSchedule(t) - - // Enroll a host and drain the enrollment commands so the queue is clean. - host, mdmDevice := createHostThenEnrollMDM(s.ds, s.server.URL, t) - setupPusher(s, t, mdmDevice) - s.awaitRunAppleMDMWorkerSchedule() - for { - cmd, err := mdmDevice.Idle() - require.NoError(t, err) - if cmd == nil { - break - } - _, err = mdmDevice.Acknowledge(cmd.CommandUUID) - require.NoError(t, err) - } - require.NoError(t, s.keyValueStore.Delete(ctx, fleet.MDMProfileProcessingKeyPrefix+":"+host.UUID)) - - // Configure a custom SCEP CA and push a profile that uses it. - const caName = "renewCA" - ca := newMockCustomSCEPProxyCA(scepServer.URL+"/scep", caName) - var caResp createCertificateAuthorityResponse - s.DoJSON("POST", "/api/latest/fleet/certificate_authorities", - &createCertificateAuthorityRequest{ - CertificateAuthorityPayload: fleet.CertificateAuthorityPayload{CustomSCEPProxy: &ca}, - }, http.StatusOK, &caResp) - s.Do("POST", "/api/v1/fleet/mdm/profiles/batch", - batchSetMDMProfilesRequest{Profiles: []fleet.MDMProfileBatchPayload{ - {Name: "N0", Contents: customSCEPForTest("N0", "I0", caName)}, - }}, http.StatusNoContent) - s.awaitTriggerProfileSchedule(t) - - // Look up the profile UUID assigned to this host's profile install. - var hostResp getDeviceHostResponse - s.DoJSON("GET", fmt.Sprintf("/api/latest/fleet/hosts/%d", host.ID), nil, http.StatusOK, &hostResp) - require.NotNil(t, hostResp.Host.MDM.Profiles) - var profileUUID string - for _, p := range *hostResp.Host.MDM.Profiles { - if p.Name == "N0" { - profileUUID = p.ProfileUUID - break - } - } - require.NotEmpty(t, profileUUID) - - // Drive the SCEP install to completion: device picks up the profile, acks it, then - // osquery-style verification flips status to 'verified'. We don't need to drive a - // real PKIOperation because the renewal cron only cares about the profile delivery - // status and the cert metadata in host_mdm_managed_certificates (seeded below). - cmd, err := mdmDevice.Idle() - require.NoError(t, err) - require.NotNil(t, cmd, "expected SCEP profile to be delivered") - _, err = mdmDevice.Acknowledge(cmd.CommandUUID) - require.NoError(t, err) - - require.NoError(t, apple_mdm.VerifyHostMDMProfiles(ctx, s.ds, host, - map[string]*fleet.HostMacOSProfile{ - "I0": {Identifier: "I0", DisplayName: "N0", InstallDate: time.Now()}, - })) - - fetchManaged := func(t *testing.T) *fleet.HostMDMCertificateProfile { - t.Helper() - prof, err := s.ds.GetAppleHostMDMCertificateProfile(ctx, host.UUID, profileUUID, caName) - require.NoError(t, err) - require.NotNil(t, prof) - return prof - } - require.NotNil(t, fetchManaged(t).Status) - require.Equal(t, fleet.MDMDeliveryVerified, *fetchManaged(t).Status) - - // Mimic osquery cert reporting (updateHostMDMManagedCertDetailsDB) by directly - // populating cert metadata. Place the cert deep in the renewal window so - // RenewMDMManagedCertificates picks it up: validity_period=30 days, 1 day remaining. - const serial = "deadbeefdeadbeefdeadbeefdeadbeefdeadbeef" - notValidBefore := time.Now().Add(-29 * 24 * time.Hour).UTC().Round(time.Microsecond) - notValidAfter := time.Now().Add(1 * 24 * time.Hour).UTC().Round(time.Microsecond) - mysql.ExecAdhocSQL(t, s.ds, func(q sqlx.ExtContext) error { - _, err := q.ExecContext(ctx, ` - UPDATE host_mdm_managed_certificates - SET serial = ?, not_valid_before = ?, not_valid_after = ? - WHERE host_uuid = ? AND profile_uuid = ?`, - serial, notValidBefore, notValidAfter, host.UUID, profileUUID) - return err - }) - - // 1) Renewal cron flips verified -> NULL since the cert is in the renewal window. - require.NoError(t, s.ds.RenewMDMManagedCertificates(ctx)) - - // 2) Reconcile re-renders the profile. preprocessProfileContents calls - // BulkUpsertMDMManagedCertificates with a payload from - // ReplaceCustomSCEPProxyURLVariable that has nil NotValidBefore/After/Serial. - // Without the COALESCE fix, this clobbers cert metadata to NULL. - s.awaitTriggerProfileSchedule(t) - - // 3) Cert metadata must survive the real reconcile path. - after := fetchManaged(t) - require.NotNil(t, after.Serial, "serial must be preserved by reconcile (issue #44111)") - require.Equal(t, serial, *after.Serial) - require.NotNil(t, after.NotValidBefore) - require.True(t, notValidBefore.Equal(*after.NotValidBefore), - "not_valid_before must be preserved (got %s, want %s)", *after.NotValidBefore, notValidBefore) - require.NotNil(t, after.NotValidAfter) - require.True(t, notValidAfter.Equal(*after.NotValidAfter), - "not_valid_after must be preserved (got %s, want %s)", *after.NotValidAfter, notValidAfter) - - // 4) Simulate the offline-recovery loop: device eventually picks up the renewal - // but the SCEP handshake fails (e.g. transient backend issue), so the profile - // transitions to 'failed'. Backdate updated_at past renewalFailedRetryBackoff - // to skip past the permanent-failure circuit breaker; this models a transient - // failure that has since cleared. - mysql.ExecAdhocSQL(t, s.ds, func(q sqlx.ExtContext) error { - _, err := q.ExecContext(ctx, ` - UPDATE host_mdm_apple_profiles SET status = ?, updated_at = ? - WHERE host_uuid = ? AND profile_uuid = ?`, - fleet.MDMDeliveryFailed, time.Now().Add(-25*time.Hour), host.UUID, profileUUID) - return err - }) - - // 5) Renewal cron must re-fire to recover. Before the COALESCE fix, this would - // silently no-op because validity_period IS NULL excluded the row from the - // renewal cron's HAVING clause. - require.NoError(t, s.ds.RenewMDMManagedCertificates(ctx)) - - var rawStatus sql.NullString - mysql.ExecAdhocSQL(t, s.ds, func(q sqlx.ExtContext) error { - return sqlx.GetContext(ctx, q, &rawStatus, ` - SELECT status FROM host_mdm_apple_profiles - WHERE host_uuid = ? AND profile_uuid = ?`, host.UUID, profileUUID) - }) - require.False(t, rawStatus.Valid, - "renewal cron must re-fire after a 'failed' delivery; got status %q (issue #44111)", rawStatus.String) -} - func (s *integrationMDMTestSuite) TestVPPAppsMDMFiltering() { t := s.T() From 506ca39eae0d544b70144de44b1add6d99dbc430 Mon Sep 17 00:00:00 2001 From: Tim Lee Date: Thu, 30 Apr 2026 14:00:35 -0600 Subject: [PATCH 2/2] lint fix --- server/datastore/mysql/mdm.go | 1 - 1 file changed, 1 deletion(-) diff --git a/server/datastore/mysql/mdm.go b/server/datastore/mysql/mdm.go index 505dc88ac0e..a6363261d1c 100644 --- a/server/datastore/mysql/mdm.go +++ b/server/datastore/mysql/mdm.go @@ -14,7 +14,6 @@ import ( "github.com/fleetdm/fleet/v4/server/mdm" "github.com/fleetdm/fleet/v4/server/mdm/apple/mobileconfig" microsoft_mdm "github.com/fleetdm/fleet/v4/server/mdm/microsoft" - common_mysql "github.com/fleetdm/fleet/v4/server/platform/mysql" "github.com/google/go-cmp/cmp" "github.com/jmoiron/sqlx" )