From 5228e238f382647a10bd4e08a28715f8785ac813 Mon Sep 17 00:00:00 2001 From: Nic Klaassen Date: Fri, 21 Apr 2023 21:30:59 -0700 Subject: [PATCH] fix: avoid inadvertent deletion of active HSM keys This is a partial fix for #25017 The latest version of the YubiHSM2 SDK has changed the behavior for keys longer than 2 bytes, which used to be silently truncated for all operations. This causes an unfortunate interaction with `DeleteUnusedKeys` when the SDK is upgraded in an active Teleport cluster. Because none of the active keys can be queried from the HSM individually by their ID, but they can be listed by their label, all of the active keys end up being deleted. Yeah that's bad. `DeleteUnusedKeys` is written this way in an attempt to be "stateless". Trying to synchronously delete keys at the instant they are rotated out during a CA rotation would be error-prone. If the auth server were to restart or crash at the wrong moment, you could be left with an orphaned key on your HSM forever, with no reference to it stored by Teleport or anywhere else. Instead, the Auth server labels all keys it creates with its own host UUID. Then periodically (during startup) it lists all keys in the HSM that are labeled with its own UUID, and if they are not currently active, deletes them. This goes catastrophically wrong when individual lookup operations fail, but list operations succeed. The fix here is to avoid deleting any keys if any single lookup fails. The YubiHSM2 SDK version 2023.1 is still not supported, but with this fix at least we won't delete any active keys. --- lib/auth/auth.go | 16 ++++++--- lib/auth/keystore/gcp_kms.go | 30 +++++++++++++---- lib/auth/keystore/keystore_test.go | 29 +++++++++++++--- lib/auth/keystore/manager.go | 4 +-- lib/auth/keystore/pkcs11.go | 53 ++++++++++++++++++++---------- lib/auth/keystore/software.go | 4 +-- 6 files changed, 99 insertions(+), 37 deletions(-) diff --git a/lib/auth/auth.go b/lib/auth/auth.go index 3a1d178276510..ba41ce39326cc 100644 --- a/lib/auth/auth.go +++ b/lib/auth/auth.go @@ -4823,7 +4823,7 @@ func (a *Server) deleteUnusedKeys(ctx context.Context) error { return trace.Wrap(err) } - var usedKeys [][]byte + var activeKeys [][]byte for _, caType := range types.CertAuthTypes { caID := types.CertAuthID{Type: caType, DomainName: clusterName.GetClusterName()} ca, err := a.Services.GetCertAuthority(ctx, caID, true) @@ -4832,17 +4832,23 @@ func (a *Server) deleteUnusedKeys(ctx context.Context) error { } for _, keySet := range []types.CAKeySet{ca.GetActiveKeys(), ca.GetAdditionalTrustedKeys()} { for _, sshKeyPair := range keySet.SSH { - usedKeys = append(usedKeys, sshKeyPair.PrivateKey) + activeKeys = append(activeKeys, sshKeyPair.PrivateKey) } for _, tlsKeyPair := range keySet.TLS { - usedKeys = append(usedKeys, tlsKeyPair.Key) + activeKeys = append(activeKeys, tlsKeyPair.Key) } for _, jwtKeyPair := range keySet.JWT { - usedKeys = append(usedKeys, jwtKeyPair.PrivateKey) + activeKeys = append(activeKeys, jwtKeyPair.PrivateKey) } } } - return trace.Wrap(a.keyStore.DeleteUnusedKeys(ctx, usedKeys)) + if err := a.keyStore.DeleteUnusedKeys(ctx, activeKeys); err != nil { + // Key deletion is best-effort, log a warning if it fails and carry on. + // We don't want to prevent a CA rotation, which may be necessary in + // some cases where this would fail. + log.WithError(err).Warning("Failed attempt to delete unused HSM keys") + } + return nil } // GetLicense return the license used the start the teleport enterprise auth server diff --git a/lib/auth/keystore/gcp_kms.go b/lib/auth/keystore/gcp_kms.go index 4a0cd2a45875c..08485472086ee 100644 --- a/lib/auth/keystore/gcp_kms.go +++ b/lib/auth/keystore/gcp_kms.go @@ -230,16 +230,20 @@ func (g *gcpKMSKeyStore) canSignWithKey(ctx context.Context, raw []byte, keyType // DeleteUnusedKeys deletes all keys from KMS if they are: // 1. Labeled by this server (matching HostUUID) when they were created -// 2. Not included in the argument usedKeys -func (g *gcpKMSKeyStore) DeleteUnusedKeys(ctx context.Context, usedKeys [][]byte) error { - usedKmsKeyVersions := make(map[string]struct{}) - for _, usedKey := range usedKeys { - keyID, err := parseGCPKMSKeyID(usedKey) +// 2. Not included in the argument activeKeys +func (g *gcpKMSKeyStore) DeleteUnusedKeys(ctx context.Context, activeKeys [][]byte) error { + // Make a map of currently active key versions, this is used for lookups to + // check which keys in KMS are unused, and holds a count of how many times + // they are found in KMS. If any active keys are not found in KMS, we are in + // a bad state, so key deletion will be aborted. + activeKmsKeyVersions := make(map[string]int) + for _, activeKey := range activeKeys { + keyID, err := parseGCPKMSKeyID(activeKey) if err != nil { // could be a different type of key continue } - usedKmsKeyVersions[keyID.keyVersionName] = struct{}{} + activeKmsKeyVersions[keyID.keyVersionName] = 0 } var unusedKeyIDs []gcpKMSKeyID @@ -252,7 +256,9 @@ func (g *gcpKMSKeyStore) DeleteUnusedKeys(ctx context.Context, usedKeys [][]byte key, err := iter.Next() for err == nil { keyVersionName := key.Name + keyVersionSuffix - if _, used := usedKmsKeyVersions[keyVersionName]; !used { + if _, active := activeKmsKeyVersions[keyVersionName]; active { + activeKmsKeyVersions[keyVersionName]++ + } else { unusedKeyIDs = append(unusedKeyIDs, gcpKMSKeyID{ keyVersionName: keyVersionName, }) @@ -263,6 +269,16 @@ func (g *gcpKMSKeyStore) DeleteUnusedKeys(ctx context.Context, usedKeys [][]byte return trace.Wrap(err) } + for keyVersion, found := range activeKmsKeyVersions { + if found == 0 { + // Failed to find a currently active key owned by this host. + // The cluster is in a bad state, refuse to delete any keys. + return trace.NotFound( + "cannot find currently active CA key in %q GCP KMS, aborting attempt to delete unused keys", + keyVersion) + } + } + for _, unusedKey := range unusedKeyIDs { g.log.WithFields(logrus.Fields{"key_version": unusedKey.keyVersionName}).Info("deleting unused GCP KMS key created by this server") err := g.deleteKey(ctx, unusedKey.marshal()) diff --git a/lib/auth/keystore/keystore_test.go b/lib/auth/keystore/keystore_test.go index 4d5167fde88a4..6256511abc954 100644 --- a/lib/auth/keystore/keystore_test.go +++ b/lib/auth/keystore/keystore_test.go @@ -163,10 +163,11 @@ func TestKeyStore(t *testing.T) { yubiSlotNumber := 0 backends := []struct { - desc string - config Config - isSoftware bool - shouldSkip func() bool + desc string + config Config + isSoftware bool + shouldSkip func() bool + fakeKeyHack func([]byte) []byte }{ { desc: "software", @@ -233,6 +234,15 @@ func TestKeyStore(t *testing.T) { shouldSkip: func() bool { return false }, + fakeKeyHack: func(key []byte) []byte { + // GCP KMS keys are never really deleted, their state is just + // set to destroyed, so this hack modifies a key to make it + // unrecognizable + kmsKey, err := parseGCPKMSKeyID(key) + require.NoError(t, err) + kmsKey.keyVersionName += "fake" + return kmsKey.marshal() + }, }, } @@ -420,6 +430,17 @@ func TestKeyStore(t *testing.T) { require.Error(t, err) } + // Make sure key deletion is aborted when one of the active keys + // cannot be found. + // Use rawKeys[1] as a fake active key, it was just deleted in the + // previous step. + fakeActiveKey := rawKeys[1] + if tc.fakeKeyHack != nil { + fakeActiveKey = tc.fakeKeyHack(fakeActiveKey) + } + err = keyStore.DeleteUnusedKeys(ctx, [][]byte{fakeActiveKey}) + require.True(t, trace.IsNotFound(err), "expected NotFound error, got %v", err) + // delete the final key so we don't leak it err = keyStore.deleteKey(ctx, rawKeys[0]) require.NoError(t, err) diff --git a/lib/auth/keystore/manager.go b/lib/auth/keystore/manager.go index c1ad8d2e8b1f1..2f717e114b9ce 100644 --- a/lib/auth/keystore/manager.go +++ b/lib/auth/keystore/manager.go @@ -57,8 +57,8 @@ func WithDigestAlgorithm(alg crypto.Hash) RSAKeyOption { type backend interface { // DeleteUnusedKeys deletes all keys from the KeyStore if they are: // 1. Labeled by this KeyStore when they were created - // 2. Not included in the argument usedKeys - DeleteUnusedKeys(ctx context.Context, usedKeys [][]byte) error + // 2. Not included in the argument activeKeys + DeleteUnusedKeys(ctx context.Context, activeKeys [][]byte) error // generateRSA creates a new RSA private key and returns its identifier and // a crypto.Signer. The returned identifier can be passed to getSigner diff --git a/lib/auth/keystore/pkcs11.go b/lib/auth/keystore/pkcs11.go index 062c04fa27480..8e3bc276c1689 100644 --- a/lib/auth/keystore/pkcs11.go +++ b/lib/auth/keystore/pkcs11.go @@ -212,21 +212,39 @@ func (p *pkcs11KeyStore) deleteKey(_ context.Context, rawKey []byte) error { } // DeleteUnusedKeys deletes all keys from the KeyStore if they are: -// 1. Labeled by this KeyStore when they were created -// 2. Not included in the argument usedKeys -func (p *pkcs11KeyStore) DeleteUnusedKeys(ctx context.Context, usedKeys [][]byte) error { +// 1. Labeled with the local HostUUID when they were created +// 2. Not included in the argument activeKeys +// This is meant to delete unused keys after they have been rotated out by a CA +// rotation. +func (p *pkcs11KeyStore) DeleteUnusedKeys(ctx context.Context, activeKeys [][]byte) error { p.log.Debug("Deleting unused keys from HSM") - var usedPublicKeys []*rsa.PublicKey - for _, usedKey := range usedKeys { - if keyType(usedKey) != types.PrivateKeyType_PKCS11 { + + // It's necessary to fetch all PublicKeys for the known activeKeys in order to + // compare with the signers returned by FindKeyPairs below. We have no way + // to find the CKA_ID of an unused key if it is not known. + var activePublicKeys []*rsa.PublicKey + for _, activeKey := range activeKeys { + if keyType(activeKey) != types.PrivateKeyType_PKCS11 { continue } - signer, err := p.getSigner(ctx, usedKey) - if trace.IsNotFound(err) { - // key is for different host, or truly not found in HSM. Either - // way, it won't be deleted below. + keyID, err := parseKeyID(activeKey) + if err != nil { + return trace.Wrap(err) + } + if keyID.HostID != p.hostUUID { + // This key was labeled with a foreign host UUID, it is likely not + // present on the attached HSM and definitely will not be returned + // by FindKeyPairs below which queries by host UUID. continue } + signer, err := p.getSigner(ctx, activeKey) + if trace.IsNotFound(err) { + // Failed to find a currently active key owned by this host. + // The cluster is in a bad state, refuse to delete any keys. + return trace.NotFound( + "cannot find currently active CA key %q in HSM, aborting attempt to delete unused keys", + keyID.KeyID) + } if err != nil { return trace.Wrap(err) } @@ -234,15 +252,15 @@ func (p *pkcs11KeyStore) DeleteUnusedKeys(ctx context.Context, usedKeys [][]byte if !ok { return trace.BadParameter("unknown public key type: %T", signer.Public()) } - usedPublicKeys = append(usedPublicKeys, rsaPublicKey) + activePublicKeys = append(activePublicKeys, rsaPublicKey) } - keyIsUsed := func(signer crypto.Signer) bool { + keyIsActive := func(signer crypto.Signer) bool { rsaPublicKey, ok := signer.Public().(*rsa.PublicKey) if !ok { // unknown key type... we don't know what this is, so don't delete it return true } - for _, k := range usedPublicKeys { + for _, k := range activePublicKeys { if rsaPublicKey.Equal(k) { return true } @@ -254,13 +272,14 @@ func (p *pkcs11KeyStore) DeleteUnusedKeys(ctx context.Context, usedKeys [][]byte return trace.Wrap(err) } for _, signer := range signers { - if keyIsUsed(signer) { + if keyIsActive(signer) { continue } if err := signer.Delete(); err != nil { - // Key deletion is best-effort, log a warning on errors. Errors have - // been observed when FindKeyPairs returns duplicate keys. - p.log.Warnf("failed deleting unused key from HSM: %v", err) + // Key deletion is best-effort, log a warning on errors, and + // continue trying to delete other keys. Errors have been observed + // when FindKeyPairs returns duplicate keys. + p.log.Warnf("Failed deleting unused key from HSM: %v", err) } } return nil diff --git a/lib/auth/keystore/software.go b/lib/auth/keystore/software.go index ea395fb8bf70e..49fb0f4509675 100644 --- a/lib/auth/keystore/software.go +++ b/lib/auth/keystore/software.go @@ -84,8 +84,8 @@ func (s *softwareKeyStore) deleteKey(_ context.Context, _ []byte) error { // DeleteUnusedKeys deletes all keys from the KeyStore if they are: // 1. Labeled by this KeyStore when they were created -// 2. Not included in the argument usedKeys +// 2. Not included in the argument activeKeys // This is a no-op for rawKeyStore. -func (s *softwareKeyStore) DeleteUnusedKeys(ctx context.Context, usedKeys [][]byte) error { +func (s *softwareKeyStore) DeleteUnusedKeys(ctx context.Context, activeKeys [][]byte) error { return nil }