Skip to content

Commit

Permalink
fix: avoid inadvertent deletion of active HSM keys (#25208)
Browse files Browse the repository at this point in the history
This is a partial fix for #25017

The latest version of the YubiHSM2 SDK has changed the behavior for keys
longer than 2 bytes, which used to be silently truncated for all
operations.
This causes an unfortunate interaction with `DeleteUnusedKeys` when the
SDK is upgraded in an active Teleport cluster.
Because none of the active keys can be queried from the HSM
individually by their ID, but they can be listed by their label, all of
the active keys end up being deleted.
Yeah that's bad.

`DeleteUnusedKeys` is written this way in an attempt to be "stateless".
Trying to synchronously delete keys at the instant they are
rotated out during a CA rotation would be error-prone.
If the auth server were to restart or crash at the wrong moment, you
could be left with an orphaned key on your HSM forever, with no
reference to it stored by Teleport or anywhere else.

Instead, the Auth server labels all keys it creates with its own host
UUID.
Then periodically (during startup) it lists all keys in the HSM that are
labeled with its own UUID, and if they are not currently active, deletes
them.
This goes catastrophically wrong when individual lookup operations fail,
but list operations succeed.

The fix here is to avoid deleting any keys if any single lookup fails.

The YubiHSM2 SDK version 2023.1 is still not supported, but with this
fix at least we won't delete any active keys.
  • Loading branch information
nklaassen committed Apr 26, 2023
1 parent 4cb61eb commit 3b08563
Show file tree
Hide file tree
Showing 6 changed files with 99 additions and 37 deletions.
16 changes: 11 additions & 5 deletions lib/auth/auth.go
Expand Up @@ -4823,7 +4823,7 @@ func (a *Server) deleteUnusedKeys(ctx context.Context) error {
return trace.Wrap(err)
}

var usedKeys [][]byte
var activeKeys [][]byte
for _, caType := range types.CertAuthTypes {
caID := types.CertAuthID{Type: caType, DomainName: clusterName.GetClusterName()}
ca, err := a.Services.GetCertAuthority(ctx, caID, true)
Expand All @@ -4832,17 +4832,23 @@ func (a *Server) deleteUnusedKeys(ctx context.Context) error {
}
for _, keySet := range []types.CAKeySet{ca.GetActiveKeys(), ca.GetAdditionalTrustedKeys()} {
for _, sshKeyPair := range keySet.SSH {
usedKeys = append(usedKeys, sshKeyPair.PrivateKey)
activeKeys = append(activeKeys, sshKeyPair.PrivateKey)
}
for _, tlsKeyPair := range keySet.TLS {
usedKeys = append(usedKeys, tlsKeyPair.Key)
activeKeys = append(activeKeys, tlsKeyPair.Key)
}
for _, jwtKeyPair := range keySet.JWT {
usedKeys = append(usedKeys, jwtKeyPair.PrivateKey)
activeKeys = append(activeKeys, jwtKeyPair.PrivateKey)
}
}
}
return trace.Wrap(a.keyStore.DeleteUnusedKeys(ctx, usedKeys))
if err := a.keyStore.DeleteUnusedKeys(ctx, activeKeys); err != nil {
// Key deletion is best-effort, log a warning if it fails and carry on.
// We don't want to prevent a CA rotation, which may be necessary in
// some cases where this would fail.
log.WithError(err).Warning("Failed attempt to delete unused HSM keys")
}
return nil
}

// GetLicense return the license used the start the teleport enterprise auth server
Expand Down
30 changes: 23 additions & 7 deletions lib/auth/keystore/gcp_kms.go
Expand Up @@ -230,16 +230,20 @@ func (g *gcpKMSKeyStore) canSignWithKey(ctx context.Context, raw []byte, keyType

// DeleteUnusedKeys deletes all keys from KMS if they are:
// 1. Labeled by this server (matching HostUUID) when they were created
// 2. Not included in the argument usedKeys
func (g *gcpKMSKeyStore) DeleteUnusedKeys(ctx context.Context, usedKeys [][]byte) error {
usedKmsKeyVersions := make(map[string]struct{})
for _, usedKey := range usedKeys {
keyID, err := parseGCPKMSKeyID(usedKey)
// 2. Not included in the argument activeKeys
func (g *gcpKMSKeyStore) DeleteUnusedKeys(ctx context.Context, activeKeys [][]byte) error {
// Make a map of currently active key versions, this is used for lookups to
// check which keys in KMS are unused, and holds a count of how many times
// they are found in KMS. If any active keys are not found in KMS, we are in
// a bad state, so key deletion will be aborted.
activeKmsKeyVersions := make(map[string]int)
for _, activeKey := range activeKeys {
keyID, err := parseGCPKMSKeyID(activeKey)
if err != nil {
// could be a different type of key
continue
}
usedKmsKeyVersions[keyID.keyVersionName] = struct{}{}
activeKmsKeyVersions[keyID.keyVersionName] = 0
}

var unusedKeyIDs []gcpKMSKeyID
Expand All @@ -252,7 +256,9 @@ func (g *gcpKMSKeyStore) DeleteUnusedKeys(ctx context.Context, usedKeys [][]byte
key, err := iter.Next()
for err == nil {
keyVersionName := key.Name + keyVersionSuffix
if _, used := usedKmsKeyVersions[keyVersionName]; !used {
if _, active := activeKmsKeyVersions[keyVersionName]; active {
activeKmsKeyVersions[keyVersionName]++
} else {
unusedKeyIDs = append(unusedKeyIDs, gcpKMSKeyID{
keyVersionName: keyVersionName,
})
Expand All @@ -263,6 +269,16 @@ func (g *gcpKMSKeyStore) DeleteUnusedKeys(ctx context.Context, usedKeys [][]byte
return trace.Wrap(err)
}

for keyVersion, found := range activeKmsKeyVersions {
if found == 0 {
// Failed to find a currently active key owned by this host.
// The cluster is in a bad state, refuse to delete any keys.
return trace.NotFound(
"cannot find currently active CA key in %q GCP KMS, aborting attempt to delete unused keys",
keyVersion)
}
}

for _, unusedKey := range unusedKeyIDs {
g.log.WithFields(logrus.Fields{"key_version": unusedKey.keyVersionName}).Info("deleting unused GCP KMS key created by this server")
err := g.deleteKey(ctx, unusedKey.marshal())
Expand Down
29 changes: 25 additions & 4 deletions lib/auth/keystore/keystore_test.go
Expand Up @@ -163,10 +163,11 @@ func TestKeyStore(t *testing.T) {

yubiSlotNumber := 0
backends := []struct {
desc string
config Config
isSoftware bool
shouldSkip func() bool
desc string
config Config
isSoftware bool
shouldSkip func() bool
fakeKeyHack func([]byte) []byte
}{
{
desc: "software",
Expand Down Expand Up @@ -233,6 +234,15 @@ func TestKeyStore(t *testing.T) {
shouldSkip: func() bool {
return false
},
fakeKeyHack: func(key []byte) []byte {
// GCP KMS keys are never really deleted, their state is just
// set to destroyed, so this hack modifies a key to make it
// unrecognizable
kmsKey, err := parseGCPKMSKeyID(key)
require.NoError(t, err)
kmsKey.keyVersionName += "fake"
return kmsKey.marshal()
},
},
}

Expand Down Expand Up @@ -420,6 +430,17 @@ func TestKeyStore(t *testing.T) {
require.Error(t, err)
}

// Make sure key deletion is aborted when one of the active keys
// cannot be found.
// Use rawKeys[1] as a fake active key, it was just deleted in the
// previous step.
fakeActiveKey := rawKeys[1]
if tc.fakeKeyHack != nil {
fakeActiveKey = tc.fakeKeyHack(fakeActiveKey)
}
err = keyStore.DeleteUnusedKeys(ctx, [][]byte{fakeActiveKey})
require.True(t, trace.IsNotFound(err), "expected NotFound error, got %v", err)

// delete the final key so we don't leak it
err = keyStore.deleteKey(ctx, rawKeys[0])
require.NoError(t, err)
Expand Down
4 changes: 2 additions & 2 deletions lib/auth/keystore/manager.go
Expand Up @@ -57,8 +57,8 @@ func WithDigestAlgorithm(alg crypto.Hash) RSAKeyOption {
type backend interface {
// DeleteUnusedKeys deletes all keys from the KeyStore if they are:
// 1. Labeled by this KeyStore when they were created
// 2. Not included in the argument usedKeys
DeleteUnusedKeys(ctx context.Context, usedKeys [][]byte) error
// 2. Not included in the argument activeKeys
DeleteUnusedKeys(ctx context.Context, activeKeys [][]byte) error

// generateRSA creates a new RSA private key and returns its identifier and
// a crypto.Signer. The returned identifier can be passed to getSigner
Expand Down
53 changes: 36 additions & 17 deletions lib/auth/keystore/pkcs11.go
Expand Up @@ -212,37 +212,55 @@ func (p *pkcs11KeyStore) deleteKey(_ context.Context, rawKey []byte) error {
}

// DeleteUnusedKeys deletes all keys from the KeyStore if they are:
// 1. Labeled by this KeyStore when they were created
// 2. Not included in the argument usedKeys
func (p *pkcs11KeyStore) DeleteUnusedKeys(ctx context.Context, usedKeys [][]byte) error {
// 1. Labeled with the local HostUUID when they were created
// 2. Not included in the argument activeKeys
// This is meant to delete unused keys after they have been rotated out by a CA
// rotation.
func (p *pkcs11KeyStore) DeleteUnusedKeys(ctx context.Context, activeKeys [][]byte) error {
p.log.Debug("Deleting unused keys from HSM")
var usedPublicKeys []*rsa.PublicKey
for _, usedKey := range usedKeys {
if keyType(usedKey) != types.PrivateKeyType_PKCS11 {

// It's necessary to fetch all PublicKeys for the known activeKeys in order to
// compare with the signers returned by FindKeyPairs below. We have no way
// to find the CKA_ID of an unused key if it is not known.
var activePublicKeys []*rsa.PublicKey
for _, activeKey := range activeKeys {
if keyType(activeKey) != types.PrivateKeyType_PKCS11 {
continue
}
signer, err := p.getSigner(ctx, usedKey)
if trace.IsNotFound(err) {
// key is for different host, or truly not found in HSM. Either
// way, it won't be deleted below.
keyID, err := parseKeyID(activeKey)
if err != nil {
return trace.Wrap(err)
}
if keyID.HostID != p.hostUUID {
// This key was labeled with a foreign host UUID, it is likely not
// present on the attached HSM and definitely will not be returned
// by FindKeyPairs below which queries by host UUID.
continue
}
signer, err := p.getSigner(ctx, activeKey)
if trace.IsNotFound(err) {
// Failed to find a currently active key owned by this host.
// The cluster is in a bad state, refuse to delete any keys.
return trace.NotFound(
"cannot find currently active CA key %q in HSM, aborting attempt to delete unused keys",
keyID.KeyID)
}
if err != nil {
return trace.Wrap(err)
}
rsaPublicKey, ok := signer.Public().(*rsa.PublicKey)
if !ok {
return trace.BadParameter("unknown public key type: %T", signer.Public())
}
usedPublicKeys = append(usedPublicKeys, rsaPublicKey)
activePublicKeys = append(activePublicKeys, rsaPublicKey)
}
keyIsUsed := func(signer crypto.Signer) bool {
keyIsActive := func(signer crypto.Signer) bool {
rsaPublicKey, ok := signer.Public().(*rsa.PublicKey)
if !ok {
// unknown key type... we don't know what this is, so don't delete it
return true
}
for _, k := range usedPublicKeys {
for _, k := range activePublicKeys {
if rsaPublicKey.Equal(k) {
return true
}
Expand All @@ -254,13 +272,14 @@ func (p *pkcs11KeyStore) DeleteUnusedKeys(ctx context.Context, usedKeys [][]byte
return trace.Wrap(err)
}
for _, signer := range signers {
if keyIsUsed(signer) {
if keyIsActive(signer) {
continue
}
if err := signer.Delete(); err != nil {
// Key deletion is best-effort, log a warning on errors. Errors have
// been observed when FindKeyPairs returns duplicate keys.
p.log.Warnf("failed deleting unused key from HSM: %v", err)
// Key deletion is best-effort, log a warning on errors, and
// continue trying to delete other keys. Errors have been observed
// when FindKeyPairs returns duplicate keys.
p.log.Warnf("Failed deleting unused key from HSM: %v", err)
}
}
return nil
Expand Down
4 changes: 2 additions & 2 deletions lib/auth/keystore/software.go
Expand Up @@ -84,8 +84,8 @@ func (s *softwareKeyStore) deleteKey(_ context.Context, _ []byte) error {

// DeleteUnusedKeys deletes all keys from the KeyStore if they are:
// 1. Labeled by this KeyStore when they were created
// 2. Not included in the argument usedKeys
// 2. Not included in the argument activeKeys
// This is a no-op for rawKeyStore.
func (s *softwareKeyStore) DeleteUnusedKeys(ctx context.Context, usedKeys [][]byte) error {
func (s *softwareKeyStore) DeleteUnusedKeys(ctx context.Context, activeKeys [][]byte) error {
return nil
}

0 comments on commit 3b08563

Please sign in to comment.