Automatically replace revoked certs managed on-demand

When I initially wrote the auto-replace feature, it was for the standard mode of operation, which I presumed the vast majority of CertMagic deployments use. At the time, On-Demand mode of operation was fairly niche. And at the time, it looked tricky to properly enable this feature for on-demand certificates, so I shelved it considering it would be low-impact anyway. So on-demand certificates didn't benefit from auto-replace in the case of revocation (oh well, no other servers / ACME clients do that at all anyway). I guess since that time, the use of CertMagic's exclusive on-demand feature has risen in popularity. But there is no way to tell, and I had no real way of knowing whether any significant use of the feature is being had since Caddy has no telemetry. (We used to have telemetry -- benign, anonymous technical stats to help us understand usage -- but unfortunately public backlash forced us to end the program.) Based on public feedback forced by external events, it seems that on-demand TLS deployments are probably rare, but each of those few deployments actually serve thousands of sites/domains. (The true importance of this feature would have been clear months ago if Caddy had telemetry, as Caddy is the primary importer of CertMagic.) This commit should enable auto-replace for on-demand certificates. It required some refactoring and some decisions that aren't *entirely* clear are right, but that's how it goes. I haven't tested this. (Last time I worked on this feature it took me about 2 days to test properly.)
crccw · Jan 31, 2022 · 9245be5 · 9245be5
1 parent b6b3db3
commit 9245be5
Show file tree

Hide file tree

Showing 2 changed files with 108 additions and 92 deletions.
diff --git a/handshake.go b/handshake.go
@@ -228,6 +228,9 @@ func DefaultCertificateSelector(hello *tls.ClientHelloInfo, choices []Certificat
 func (cfg *Config) getCertDuringHandshake(hello *tls.ClientHelloInfo, loadIfNecessary, obtainIfNecessary bool) (Certificate, error) {
 	log := loggerNamed(cfg.Logger, "handshake")
 
+	// TODO: get a proper context... somehow...?
+	ctx := context.Background()
+
 	// First check our in-memory cache to see if we've already loaded it
 	cert, matched, defaulted := cfg.getCertificate(hello)
 	if matched {
@@ -239,12 +242,10 @@ func (cfg *Config) getCertDuringHandshake(hello *tls.ClientHelloInfo, loadIfNece
 				zap.String("hash", cert.hash))
 		}
 		if cert.managed && cfg.OnDemand != nil && obtainIfNecessary {
-			// It's been reported before that if the machine goes to sleep (or
-			// suspends the process) that certs which are already loaded into
-			// memory won't get renewed in the background, so we need to check
-			// expiry on each handshake too, sigh:
-			// https://caddy.community/t/local-certificates-not-renewing-on-demand/9482
-			return cfg.optionalMaintenance(loggerNamed(cfg.Logger, "on_demand"), cert, hello)
+			// On-demand certificates are maintained in the background, but
+			// maintenance is triggered by handshakes instead of by a timer
+			// as in maintain.go.
+			return cfg.optionalMaintenance(ctx, loggerNamed(cfg.Logger, "on_demand"), cert, hello)
 		}
 		return cert, nil
 	}
@@ -290,7 +291,7 @@ func (cfg *Config) getCertDuringHandshake(hello *tls.ClientHelloInfo, loadIfNece
 					zap.Time("expiration", loadedCert.Leaf.NotAfter),
 					zap.String("hash", loadedCert.hash))
 			}
-			loadedCert, err = cfg.handshakeMaintenance(hello, loadedCert)
+			loadedCert, err = cfg.handshakeMaintenance(ctx, hello, loadedCert)
 			if err != nil {
 				if log != nil {
 					log.Error("maintining newly-loaded certificate",
@@ -302,7 +303,7 @@ func (cfg *Config) getCertDuringHandshake(hello *tls.ClientHelloInfo, loadIfNece
 		}
 		if cfg.OnDemand != nil && obtainIfNecessary {
 			// By this point, we need to ask the CA for a certificate
-			return cfg.obtainOnDemandCertificate(hello)
+			return cfg.obtainOnDemandCertificate(ctx, hello)
 		}
 	}
 
@@ -336,8 +337,8 @@ func (cfg *Config) getCertDuringHandshake(hello *tls.ClientHelloInfo, loadIfNece
 // optionalMaintenance will perform maintenance on the certificate (if necessary) and
 // will return the resulting certificate. This should only be done if the certificate
 // is managed, OnDemand is enabled, and the scope is allowed to obtain certificates.
-func (cfg *Config) optionalMaintenance(log *zap.Logger, cert Certificate, hello *tls.ClientHelloInfo) (Certificate, error) {
-	newCert, err := cfg.handshakeMaintenance(hello, cert)
+func (cfg *Config) optionalMaintenance(ctx context.Context, log *zap.Logger, cert Certificate, hello *tls.ClientHelloInfo) (Certificate, error) {
+	newCert, err := cfg.handshakeMaintenance(ctx, hello, cert)
 	if err == nil {
 		return newCert, nil
 	}
@@ -382,7 +383,7 @@ func (cfg *Config) checkIfCertShouldBeObtained(name string) error {
 // hello, it will wait and use what the other goroutine obtained.
 //
 // This function is safe for use by multiple concurrent goroutines.
-func (cfg *Config) obtainOnDemandCertificate(hello *tls.ClientHelloInfo) (Certificate, error) {
+func (cfg *Config) obtainOnDemandCertificate(ctx context.Context, hello *tls.ClientHelloInfo) (Certificate, error) {
 	log := loggerNamed(cfg.Logger, "on_demand")
 
 	name := cfg.getNameFromClientHello(hello)
@@ -436,9 +437,10 @@ func (cfg *Config) obtainOnDemandCertificate(hello *tls.ClientHelloInfo) (Certif
 		log.Info("obtaining new certificate", zap.String("server_name", name))
 	}
 
-	// TODO: use a proper context; we use one with timeout because retries are enabled because interactive is false
+	// TODO: we are only adding a timeout because we don't know if the context passed in is actually cancelable...
 	// (timeout duration is based on https://caddy.community/t/zerossl-dns-challenge-failing-often-route53-plugin/13822/24?u=matt)
-	ctx, cancel := context.WithTimeout(context.TODO(), 180*time.Second)
+	var cancel context.CancelFunc
+	ctx, cancel = context.WithTimeout(ctx, 180*time.Second)
 	defer cancel()
 
 	// Obtain the certificate
@@ -464,32 +466,35 @@ func (cfg *Config) obtainOnDemandCertificate(hello *tls.ClientHelloInfo) (Certif
 // OCSP stapling errors are not returned, only logged.
 //
 // This function is safe for use by multiple concurrent goroutines.
-func (cfg *Config) handshakeMaintenance(hello *tls.ClientHelloInfo, cert Certificate) (Certificate, error) {
+func (cfg *Config) handshakeMaintenance(ctx context.Context, hello *tls.ClientHelloInfo, cert Certificate) (Certificate, error) {
 	log := loggerNamed(cfg.Logger, "on_demand")
 
 	// Check cert expiration
 	if currentlyInRenewalWindow(cert.Leaf.NotBefore, cert.Leaf.NotAfter, cfg.RenewalWindowRatio) {
-		return cfg.renewDynamicCertificate(hello, cert)
+		return cfg.renewDynamicCertificate(ctx, hello, cert)
 	}
 
 	// Check OCSP staple validity
-	if cert.ocsp != nil {
-		refreshTime := cert.ocsp.ThisUpdate.Add(cert.ocsp.NextUpdate.Sub(cert.ocsp.ThisUpdate) / 2)
-		if time.Now().After(refreshTime) {
-			_, err := stapleOCSP(cfg.OCSP, cfg.Storage, &cert, nil)
-			if err != nil {
-				// An error with OCSP stapling is not the end of the world, and in fact, is
-				// quite common considering not all certs have issuer URLs that support it.
-				if log != nil {
-					log.Warn("stapling OCSP",
-						zap.String("server_name", hello.ServerName),
-						zap.Error(err))
-				}
+	if cert.ocsp != nil && !freshOCSP(cert.ocsp) {
+		ocspResp, err := stapleOCSP(cfg.OCSP, cfg.Storage, &cert, nil)
+		if err != nil {
+			// An error with OCSP stapling is not the end of the world, and in fact, is
+			// quite common considering not all certs have issuer URLs that support it.
+			if log != nil {
+				log.Warn("stapling OCSP",
+					zap.String("server_name", hello.ServerName),
+					zap.Error(err))
 			}
-			cfg.certCache.mu.Lock()
-			cfg.certCache.cache[cert.hash] = cert
-			cfg.certCache.mu.Unlock()
 		}
+
+		// our copy of cert has the new OCSP staple, so replace it in the cache
+		cfg.certCache.mu.Lock()
+		cfg.certCache.cache[cert.hash] = cert
+		cfg.certCache.mu.Unlock()
+
+		// We attempt to replace any certificates that were revoked.
+		// Crucially, this happens OUTSIDE a lock on the certCache.
+		cfg.forceRenewIfCertRevoked(ctx, log, cert, ocspResp)
 	}
 
 	return cert, nil
@@ -503,7 +508,7 @@ func (cfg *Config) handshakeMaintenance(hello *tls.ClientHelloInfo, cert Certifi
 // certificate has been renewed, and returns the renewed certificate.
 //
 // This function is safe for use by multiple concurrent goroutines.
-func (cfg *Config) renewDynamicCertificate(hello *tls.ClientHelloInfo, currentCert Certificate) (Certificate, error) {
+func (cfg *Config) renewDynamicCertificate(ctx context.Context, hello *tls.ClientHelloInfo, currentCert Certificate) (Certificate, error) {
 	log := loggerNamed(cfg.Logger, "on_demand")
 
 	name := cfg.getNameFromClientHello(hello)
@@ -587,6 +592,7 @@ func (cfg *Config) renewDynamicCertificate(hello *tls.ClientHelloInfo, currentCe
 	// Renew and reload the certificate
 	renewAndReload := func(ctx context.Context, cancel context.CancelFunc) (Certificate, error) {
 		defer cancel()
+
 		err = cfg.RenewCertAsync(ctx, name, false)
 		if err == nil {
 			// even though the recursive nature of the dynamic cert loading
@@ -617,14 +623,13 @@ func (cfg *Config) renewDynamicCertificate(hello *tls.ClientHelloInfo, currentCe
 
 	// if the certificate hasn't expired, we can serve what we have and renew in the background
 	if timeLeft > 0 {
-		// TODO: get a proper context; we use one with timeout because retries are enabled because interactive is false
-		ctx, cancel := context.WithTimeout(context.TODO(), 5*time.Minute)
+		ctx, cancel := context.WithTimeout(ctx, 5*time.Minute)
 		go renewAndReload(ctx, cancel)
 		return currentCert, nil
 	}
 
 	// otherwise, we have to block while we renew an expired certificate
-	ctx, cancel := context.WithTimeout(context.TODO(), 90*time.Second)
+	ctx, cancel := context.WithTimeout(ctx, 90*time.Second)
 	return renewAndReload(ctx, cancel)
 }
 

diff --git a/maintain.go b/maintain.go
@@ -394,64 +394,8 @@ func (certCache *Cache) updateOCSPStaples(ctx context.Context) {
 	// We attempt to replace any certificates that were revoked.
 	// Crucially, this happens OUTSIDE a lock on the certCache.
 	for _, renew := range renewQueue {
-		if logger != nil {
-			logger.Warn("OCSP status for managed certificate is REVOKED; attempting to replace with new certificate",
-				zap.Strings("identifiers", renew.oldCert.Names),
-				zap.Time("expiration", renew.oldCert.Leaf.NotAfter))
-		}
-
-		renewName := renew.oldCert.Names[0]
-		cfg := configs[renewName]
-
-		// if revoked for key compromise, we can't be sure whether the storage of
-		// the key is still safe; however, we KNOW the old key is not safe, and we
-		// can only hope by the time of revocation that storage has been secured;
-		// key management is not something we want to get into, but in this case
-		// it seems prudent to replace the key - and since renewal requires reuse
-		// of a prior key, we can't do a "renew" to replace the cert if we need a
-		// new key, so we'll have to do an obtain instead
-		var obtainInsteadOfRenew bool
-		if renew.ocspResp.RevocationReason == acme.ReasonKeyCompromise {
-			err := cfg.moveCompromisedPrivateKey(renew.oldCert, logger)
-			if err != nil && logger != nil {
-				logger.Error("could not remove compromised private key from use",
-					zap.Strings("identifiers", renew.oldCert.Names),
-					zap.String("issuer", renew.oldCert.issuerKey),
-					zap.Error(err))
-			}
-			obtainInsteadOfRenew = true
-		}
-
-		var err error
-		if obtainInsteadOfRenew {
-			err = cfg.ObtainCertAsync(ctx, renewName)
-		} else {
-			// notice that we force renewal; otherwise, it might see that the
-			// certificate isn't close to expiring and return, but we really
-			// need a replacement certificate! see issue #4191
-			err = cfg.RenewCertAsync(ctx, renewName, true)
-		}
-		if err != nil {
-			// probably better to not serve a revoked certificate at all
-			if logger != nil {
-				logger.Error("unable to obtain new to certificate after OCSP status of REVOKED; removing from cache",
-					zap.Strings("identifiers", renew.oldCert.Names),
-					zap.Error(err))
-			}
-			certCache.mu.Lock()
-			certCache.removeCertificate(renew.oldCert)
-			certCache.mu.Unlock()
-			continue
-		}
-		err = cfg.reloadManagedCertificate(renew.oldCert)
-		if err != nil {
-			if logger != nil {
-				logger.Error("after obtaining new certificate due to OCSP status of REVOKED",
-					zap.Strings("identifiers", renew.oldCert.Names),
-					zap.Error(err))
-			}
-			continue
-		}
+		cfg := configs[renew.oldCert.Names[0]]
+		cfg.forceRenewIfCertRevoked(ctx, logger, renew.oldCert, renew.ocspResp)
 	}
 }
 
@@ -599,6 +543,73 @@ func deleteExpiredCerts(ctx context.Context, storage Storage, gracePeriod time.D
 	return nil
 }
 
+// forceRenewIfCertRevoked forcefully renews cert if ocspResp has a status of Revoked. (It is a no-op otherwise.)
+// Due to nuances with error handling, any errors are logged with the provided logger rather than returned.
+// This MUST NOT be called within a lock on cfg.certCacheMu.
+func (cfg *Config) forceRenewIfCertRevoked(ctx context.Context, logger *zap.Logger, cert Certificate, ocspResp *ocsp.Response) {
+	if !cert.managed || ocspResp.Status != ocsp.Revoked || len(cert.Names) == 0 {
+		return
+	}
+
+	if logger != nil {
+		logger.Warn("OCSP status for managed certificate is REVOKED; attempting to replace with new certificate",
+			zap.Strings("identifiers", cert.Names),
+			zap.Time("expiration", cert.Leaf.NotAfter))
+	}
+
+	renewName := cert.Names[0]
+
+	// if revoked for key compromise, we can't be sure whether the storage of
+	// the key is still safe; however, we KNOW the old key is not safe, and we
+	// can only hope by the time of revocation that storage has been secured;
+	// key management is not something we want to get into, but in this case
+	// it seems prudent to replace the key - and since renewal requires reuse
+	// of a prior key, we can't do a "renew" to replace the cert if we need a
+	// new key, so we'll have to do an obtain instead
+	var obtainInsteadOfRenew bool
+	if ocspResp.RevocationReason == acme.ReasonKeyCompromise {
+		err := cfg.moveCompromisedPrivateKey(cert, logger)
+		if err != nil && logger != nil {
+			logger.Error("could not remove compromised private key from use",
+				zap.Strings("identifiers", cert.Names),
+				zap.String("issuer", cert.issuerKey),
+				zap.Error(err))
+		}
+		obtainInsteadOfRenew = true
+	}
+
+	var err error
+	if obtainInsteadOfRenew {
+		err = cfg.ObtainCertAsync(ctx, renewName)
+	} else {
+		// notice that we force renewal; otherwise, it might see that the
+		// certificate isn't close to expiring and return, but we really
+		// need a replacement certificate! see issue #4191
+		err = cfg.RenewCertAsync(ctx, renewName, true)
+	}
+	if err != nil {
+		// probably better to not serve a revoked certificate at all
+		if logger != nil {
+			logger.Error("unable to obtain new to certificate after OCSP status of REVOKED; removing from cache",
+				zap.Strings("identifiers", cert.Names),
+				zap.Error(err))
+		}
+		cfg.certCache.mu.Lock()
+		cfg.certCache.removeCertificate(cert)
+		cfg.certCache.mu.Unlock()
+		return
+	}
+	err = cfg.reloadManagedCertificate(cert)
+	if err != nil {
+		if logger != nil {
+			logger.Error("after obtaining new certificate due to OCSP status of REVOKED",
+				zap.Strings("identifiers", cert.Names),
+				zap.Error(err))
+		}
+		return
+	}
+}
+
 // moveCompromisedPrivateKey moves the private key for cert to a ".compromised" file
 // by copying the data to the new file, then deleting the old one.
 func (cfg *Config) moveCompromisedPrivateKey(cert Certificate, logger *zap.Logger) error {