Skip to content

Commit

Permalink
Automatically replace revoked certs managed on-demand
Browse files Browse the repository at this point in the history
When I initially wrote the auto-replace feature, it was for the standard mode of operation,
which I presumed the vast majority of CertMagic deployments use. At the time, On-Demand
mode of operation was fairly niche. And at the time, it looked tricky to properly enable this feature for on-demand certificates, so I shelved it considering it would be low-impact anyway.
So on-demand certificates didn't benefit from auto-replace in the case of revocation (oh well,
no other servers / ACME clients do that at all anyway).

I guess since that time, the use of CertMagic's exclusive on-demand feature has risen in
popularity. But there is no way to tell, and I had no real way of knowing whether any
significant use of the feature is being had since Caddy has no telemetry. (We used to
have telemetry -- benign, anonymous technical stats to help us understand usage -- but
unfortunately public backlash forced us to end the program.) Based on public feedback
forced by external events, it seems that on-demand TLS deployments are probably rare,
but each of those few deployments actually serve thousands of sites/domains. (The
true importance of this feature would have been clear months ago if Caddy had telemetry,
as Caddy is the primary importer of CertMagic.)

This commit should enable auto-replace for on-demand certificates. It required some
refactoring and some decisions that aren't *entirely* clear are right, but that's how it
goes.

I haven't tested this. (Last time I worked on this feature it took me about 2 days to test properly.)
  • Loading branch information
mholt committed Jan 31, 2022
1 parent b6b3db3 commit 9245be5
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 92 deletions.
73 changes: 39 additions & 34 deletions handshake.go
Expand Up @@ -228,6 +228,9 @@ func DefaultCertificateSelector(hello *tls.ClientHelloInfo, choices []Certificat
func (cfg *Config) getCertDuringHandshake(hello *tls.ClientHelloInfo, loadIfNecessary, obtainIfNecessary bool) (Certificate, error) {
log := loggerNamed(cfg.Logger, "handshake")

// TODO: get a proper context... somehow...?
ctx := context.Background()

// First check our in-memory cache to see if we've already loaded it
cert, matched, defaulted := cfg.getCertificate(hello)
if matched {
Expand All @@ -239,12 +242,10 @@ func (cfg *Config) getCertDuringHandshake(hello *tls.ClientHelloInfo, loadIfNece
zap.String("hash", cert.hash))
}
if cert.managed && cfg.OnDemand != nil && obtainIfNecessary {
// It's been reported before that if the machine goes to sleep (or
// suspends the process) that certs which are already loaded into
// memory won't get renewed in the background, so we need to check
// expiry on each handshake too, sigh:
// https://caddy.community/t/local-certificates-not-renewing-on-demand/9482
return cfg.optionalMaintenance(loggerNamed(cfg.Logger, "on_demand"), cert, hello)
// On-demand certificates are maintained in the background, but
// maintenance is triggered by handshakes instead of by a timer
// as in maintain.go.
return cfg.optionalMaintenance(ctx, loggerNamed(cfg.Logger, "on_demand"), cert, hello)
}
return cert, nil
}
Expand Down Expand Up @@ -290,7 +291,7 @@ func (cfg *Config) getCertDuringHandshake(hello *tls.ClientHelloInfo, loadIfNece
zap.Time("expiration", loadedCert.Leaf.NotAfter),
zap.String("hash", loadedCert.hash))
}
loadedCert, err = cfg.handshakeMaintenance(hello, loadedCert)
loadedCert, err = cfg.handshakeMaintenance(ctx, hello, loadedCert)
if err != nil {
if log != nil {
log.Error("maintining newly-loaded certificate",
Expand All @@ -302,7 +303,7 @@ func (cfg *Config) getCertDuringHandshake(hello *tls.ClientHelloInfo, loadIfNece
}
if cfg.OnDemand != nil && obtainIfNecessary {
// By this point, we need to ask the CA for a certificate
return cfg.obtainOnDemandCertificate(hello)
return cfg.obtainOnDemandCertificate(ctx, hello)
}
}

Expand Down Expand Up @@ -336,8 +337,8 @@ func (cfg *Config) getCertDuringHandshake(hello *tls.ClientHelloInfo, loadIfNece
// optionalMaintenance will perform maintenance on the certificate (if necessary) and
// will return the resulting certificate. This should only be done if the certificate
// is managed, OnDemand is enabled, and the scope is allowed to obtain certificates.
func (cfg *Config) optionalMaintenance(log *zap.Logger, cert Certificate, hello *tls.ClientHelloInfo) (Certificate, error) {
newCert, err := cfg.handshakeMaintenance(hello, cert)
func (cfg *Config) optionalMaintenance(ctx context.Context, log *zap.Logger, cert Certificate, hello *tls.ClientHelloInfo) (Certificate, error) {
newCert, err := cfg.handshakeMaintenance(ctx, hello, cert)
if err == nil {
return newCert, nil
}
Expand Down Expand Up @@ -382,7 +383,7 @@ func (cfg *Config) checkIfCertShouldBeObtained(name string) error {
// hello, it will wait and use what the other goroutine obtained.
//
// This function is safe for use by multiple concurrent goroutines.
func (cfg *Config) obtainOnDemandCertificate(hello *tls.ClientHelloInfo) (Certificate, error) {
func (cfg *Config) obtainOnDemandCertificate(ctx context.Context, hello *tls.ClientHelloInfo) (Certificate, error) {
log := loggerNamed(cfg.Logger, "on_demand")

name := cfg.getNameFromClientHello(hello)
Expand Down Expand Up @@ -436,9 +437,10 @@ func (cfg *Config) obtainOnDemandCertificate(hello *tls.ClientHelloInfo) (Certif
log.Info("obtaining new certificate", zap.String("server_name", name))
}

// TODO: use a proper context; we use one with timeout because retries are enabled because interactive is false
// TODO: we are only adding a timeout because we don't know if the context passed in is actually cancelable...
// (timeout duration is based on https://caddy.community/t/zerossl-dns-challenge-failing-often-route53-plugin/13822/24?u=matt)
ctx, cancel := context.WithTimeout(context.TODO(), 180*time.Second)
var cancel context.CancelFunc
ctx, cancel = context.WithTimeout(ctx, 180*time.Second)
defer cancel()

// Obtain the certificate
Expand All @@ -464,32 +466,35 @@ func (cfg *Config) obtainOnDemandCertificate(hello *tls.ClientHelloInfo) (Certif
// OCSP stapling errors are not returned, only logged.
//
// This function is safe for use by multiple concurrent goroutines.
func (cfg *Config) handshakeMaintenance(hello *tls.ClientHelloInfo, cert Certificate) (Certificate, error) {
func (cfg *Config) handshakeMaintenance(ctx context.Context, hello *tls.ClientHelloInfo, cert Certificate) (Certificate, error) {
log := loggerNamed(cfg.Logger, "on_demand")

// Check cert expiration
if currentlyInRenewalWindow(cert.Leaf.NotBefore, cert.Leaf.NotAfter, cfg.RenewalWindowRatio) {
return cfg.renewDynamicCertificate(hello, cert)
return cfg.renewDynamicCertificate(ctx, hello, cert)
}

// Check OCSP staple validity
if cert.ocsp != nil {
refreshTime := cert.ocsp.ThisUpdate.Add(cert.ocsp.NextUpdate.Sub(cert.ocsp.ThisUpdate) / 2)
if time.Now().After(refreshTime) {
_, err := stapleOCSP(cfg.OCSP, cfg.Storage, &cert, nil)
if err != nil {
// An error with OCSP stapling is not the end of the world, and in fact, is
// quite common considering not all certs have issuer URLs that support it.
if log != nil {
log.Warn("stapling OCSP",
zap.String("server_name", hello.ServerName),
zap.Error(err))
}
if cert.ocsp != nil && !freshOCSP(cert.ocsp) {
ocspResp, err := stapleOCSP(cfg.OCSP, cfg.Storage, &cert, nil)
if err != nil {
// An error with OCSP stapling is not the end of the world, and in fact, is
// quite common considering not all certs have issuer URLs that support it.
if log != nil {
log.Warn("stapling OCSP",
zap.String("server_name", hello.ServerName),
zap.Error(err))
}
cfg.certCache.mu.Lock()
cfg.certCache.cache[cert.hash] = cert
cfg.certCache.mu.Unlock()
}

// our copy of cert has the new OCSP staple, so replace it in the cache
cfg.certCache.mu.Lock()
cfg.certCache.cache[cert.hash] = cert
cfg.certCache.mu.Unlock()

// We attempt to replace any certificates that were revoked.
// Crucially, this happens OUTSIDE a lock on the certCache.
cfg.forceRenewIfCertRevoked(ctx, log, cert, ocspResp)
}

return cert, nil
Expand All @@ -503,7 +508,7 @@ func (cfg *Config) handshakeMaintenance(hello *tls.ClientHelloInfo, cert Certifi
// certificate has been renewed, and returns the renewed certificate.
//
// This function is safe for use by multiple concurrent goroutines.
func (cfg *Config) renewDynamicCertificate(hello *tls.ClientHelloInfo, currentCert Certificate) (Certificate, error) {
func (cfg *Config) renewDynamicCertificate(ctx context.Context, hello *tls.ClientHelloInfo, currentCert Certificate) (Certificate, error) {
log := loggerNamed(cfg.Logger, "on_demand")

name := cfg.getNameFromClientHello(hello)
Expand Down Expand Up @@ -587,6 +592,7 @@ func (cfg *Config) renewDynamicCertificate(hello *tls.ClientHelloInfo, currentCe
// Renew and reload the certificate
renewAndReload := func(ctx context.Context, cancel context.CancelFunc) (Certificate, error) {
defer cancel()

err = cfg.RenewCertAsync(ctx, name, false)
if err == nil {
// even though the recursive nature of the dynamic cert loading
Expand Down Expand Up @@ -617,14 +623,13 @@ func (cfg *Config) renewDynamicCertificate(hello *tls.ClientHelloInfo, currentCe

// if the certificate hasn't expired, we can serve what we have and renew in the background
if timeLeft > 0 {
// TODO: get a proper context; we use one with timeout because retries are enabled because interactive is false
ctx, cancel := context.WithTimeout(context.TODO(), 5*time.Minute)
ctx, cancel := context.WithTimeout(ctx, 5*time.Minute)
go renewAndReload(ctx, cancel)
return currentCert, nil
}

// otherwise, we have to block while we renew an expired certificate
ctx, cancel := context.WithTimeout(context.TODO(), 90*time.Second)
ctx, cancel := context.WithTimeout(ctx, 90*time.Second)
return renewAndReload(ctx, cancel)
}

Expand Down
127 changes: 69 additions & 58 deletions maintain.go
Expand Up @@ -394,64 +394,8 @@ func (certCache *Cache) updateOCSPStaples(ctx context.Context) {
// We attempt to replace any certificates that were revoked.
// Crucially, this happens OUTSIDE a lock on the certCache.
for _, renew := range renewQueue {
if logger != nil {
logger.Warn("OCSP status for managed certificate is REVOKED; attempting to replace with new certificate",
zap.Strings("identifiers", renew.oldCert.Names),
zap.Time("expiration", renew.oldCert.Leaf.NotAfter))
}

renewName := renew.oldCert.Names[0]
cfg := configs[renewName]

// if revoked for key compromise, we can't be sure whether the storage of
// the key is still safe; however, we KNOW the old key is not safe, and we
// can only hope by the time of revocation that storage has been secured;
// key management is not something we want to get into, but in this case
// it seems prudent to replace the key - and since renewal requires reuse
// of a prior key, we can't do a "renew" to replace the cert if we need a
// new key, so we'll have to do an obtain instead
var obtainInsteadOfRenew bool
if renew.ocspResp.RevocationReason == acme.ReasonKeyCompromise {
err := cfg.moveCompromisedPrivateKey(renew.oldCert, logger)
if err != nil && logger != nil {
logger.Error("could not remove compromised private key from use",
zap.Strings("identifiers", renew.oldCert.Names),
zap.String("issuer", renew.oldCert.issuerKey),
zap.Error(err))
}
obtainInsteadOfRenew = true
}

var err error
if obtainInsteadOfRenew {
err = cfg.ObtainCertAsync(ctx, renewName)
} else {
// notice that we force renewal; otherwise, it might see that the
// certificate isn't close to expiring and return, but we really
// need a replacement certificate! see issue #4191
err = cfg.RenewCertAsync(ctx, renewName, true)
}
if err != nil {
// probably better to not serve a revoked certificate at all
if logger != nil {
logger.Error("unable to obtain new to certificate after OCSP status of REVOKED; removing from cache",
zap.Strings("identifiers", renew.oldCert.Names),
zap.Error(err))
}
certCache.mu.Lock()
certCache.removeCertificate(renew.oldCert)
certCache.mu.Unlock()
continue
}
err = cfg.reloadManagedCertificate(renew.oldCert)
if err != nil {
if logger != nil {
logger.Error("after obtaining new certificate due to OCSP status of REVOKED",
zap.Strings("identifiers", renew.oldCert.Names),
zap.Error(err))
}
continue
}
cfg := configs[renew.oldCert.Names[0]]
cfg.forceRenewIfCertRevoked(ctx, logger, renew.oldCert, renew.ocspResp)
}
}

Expand Down Expand Up @@ -599,6 +543,73 @@ func deleteExpiredCerts(ctx context.Context, storage Storage, gracePeriod time.D
return nil
}

// forceRenewIfCertRevoked forcefully renews cert if ocspResp has a status of Revoked. (It is a no-op otherwise.)
// Due to nuances with error handling, any errors are logged with the provided logger rather than returned.
// This MUST NOT be called within a lock on cfg.certCacheMu.
func (cfg *Config) forceRenewIfCertRevoked(ctx context.Context, logger *zap.Logger, cert Certificate, ocspResp *ocsp.Response) {
if !cert.managed || ocspResp.Status != ocsp.Revoked || len(cert.Names) == 0 {
return
}

if logger != nil {
logger.Warn("OCSP status for managed certificate is REVOKED; attempting to replace with new certificate",
zap.Strings("identifiers", cert.Names),
zap.Time("expiration", cert.Leaf.NotAfter))
}

renewName := cert.Names[0]

// if revoked for key compromise, we can't be sure whether the storage of
// the key is still safe; however, we KNOW the old key is not safe, and we
// can only hope by the time of revocation that storage has been secured;
// key management is not something we want to get into, but in this case
// it seems prudent to replace the key - and since renewal requires reuse
// of a prior key, we can't do a "renew" to replace the cert if we need a
// new key, so we'll have to do an obtain instead
var obtainInsteadOfRenew bool
if ocspResp.RevocationReason == acme.ReasonKeyCompromise {
err := cfg.moveCompromisedPrivateKey(cert, logger)
if err != nil && logger != nil {
logger.Error("could not remove compromised private key from use",
zap.Strings("identifiers", cert.Names),
zap.String("issuer", cert.issuerKey),
zap.Error(err))
}
obtainInsteadOfRenew = true
}

var err error
if obtainInsteadOfRenew {
err = cfg.ObtainCertAsync(ctx, renewName)
} else {
// notice that we force renewal; otherwise, it might see that the
// certificate isn't close to expiring and return, but we really
// need a replacement certificate! see issue #4191
err = cfg.RenewCertAsync(ctx, renewName, true)
}
if err != nil {
// probably better to not serve a revoked certificate at all
if logger != nil {
logger.Error("unable to obtain new to certificate after OCSP status of REVOKED; removing from cache",
zap.Strings("identifiers", cert.Names),
zap.Error(err))
}
cfg.certCache.mu.Lock()
cfg.certCache.removeCertificate(cert)
cfg.certCache.mu.Unlock()
return
}
err = cfg.reloadManagedCertificate(cert)
if err != nil {
if logger != nil {
logger.Error("after obtaining new certificate due to OCSP status of REVOKED",
zap.Strings("identifiers", cert.Names),
zap.Error(err))
}
return
}
}

// moveCompromisedPrivateKey moves the private key for cert to a ".compromised" file
// by copying the data to the new file, then deleting the old one.
func (cfg *Config) moveCompromisedPrivateKey(cert Certificate, logger *zap.Logger) error {
Expand Down

0 comments on commit 9245be5

Please sign in to comment.