From d7fcd635f2830b430efde867fd81da7c4adedbbf Mon Sep 17 00:00:00 2001 From: Pires Date: Wed, 12 Jun 2019 20:28:34 +0100 Subject: [PATCH 1/4] consul: fix nits Signed-off-by: Pires --- registry/consul/backend.go | 3 +-- registry/consul/register.go | 2 +- registry/consul/service.go | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/registry/consul/backend.go b/registry/consul/backend.go index 26dc8852b..cfff930ef 100644 --- a/registry/consul/backend.go +++ b/registry/consul/backend.go @@ -99,11 +99,10 @@ func (b *be) Deregister(service string) error { func (b *be) DeregisterAll() error { log.Printf("[DEBUG]: consul: Deregistering all registered aliases.") - for name, dereg := range b.dereg { + for _, dereg := range b.dereg { if dereg == nil { continue } - log.Printf("[INFO] consul: Deregistering %q", name) dereg <- true // trigger deregistration <-dereg // wait for completion } diff --git a/registry/consul/register.go b/registry/consul/register.go index e8aca4834..243b9dfb9 100644 --- a/registry/consul/register.go +++ b/registry/consul/register.go @@ -52,7 +52,7 @@ func register(c *api.Client, service *api.AgentServiceRegistration) chan bool { } deregister := func(serviceID string) { - log.Printf("[INFO] consul: Deregistering %s", service.Name) + log.Printf("[INFO] consul: Deregistering %q", service.Name) c.Agent().ServiceDeregister(serviceID) } diff --git a/registry/consul/service.go b/registry/consul/service.go index f9ad1067e..083d6d0ee 100644 --- a/registry/consul/service.go +++ b/registry/consul/service.go @@ -29,7 +29,7 @@ func NewServiceMonitor(client *api.Client, config *config.Consul, dc string) *Se } // Watch monitors the consul health checks and sends a new -// configuration to the updates channnel on every change. +// configuration to the updates channel on every change. func (w *ServiceMonitor) Watch(updates chan string) { var lastIndex uint64 for { From b2727f9ead5e340e116c6a5ee1d9cd781bc92a35 Mon Sep 17 00:00:00 2001 From: Pires Date: Thu, 13 Jun 2019 12:41:27 +0100 Subject: [PATCH 2/4] docs: add registry.consul.register.deregistercriticalserviceafter option Signed-off-by: Pires --- ...ul.register.deregisterCriticalServiceAfter.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 docs/content/ref/registry.consul.register.deregisterCriticalServiceAfter.md diff --git a/docs/content/ref/registry.consul.register.deregisterCriticalServiceAfter.md b/docs/content/ref/registry.consul.register.deregisterCriticalServiceAfter.md new file mode 100644 index 000000000..f8e7c1017 --- /dev/null +++ b/docs/content/ref/registry.consul.register.deregisterCriticalServiceAfter.md @@ -0,0 +1,16 @@ +--- +title: "registry.consul.register.deregisterCriticalServiceAfter" +--- + +`registry.consul.register.deregisterCriticalServiceAfter` configures the time the +service health check is allowed to be in state `critical` until Consul automatically +deregisters it. +If fabio is still running, the service will be re-registered almost immediately after +being deleted by Consul. + +At the time of this writing, Consul enforces a minimum value of one minute and runs +its reaper process every thirty seconds. + +The default is + + registry.consul.register.deregisterCriticalServiceAfter = 90m \ No newline at end of file From a35efcea6f7fc1307324ed6b13027638647553d5 Mon Sep 17 00:00:00 2001 From: Pires Date: Thu, 13 Jun 2019 13:37:48 +0100 Subject: [PATCH 3/4] register: clean-up fabio service entries in Consul on dirty exit This is achieved by: - Adding a TTL check to any services fabio may register into Consul, including its own; - Repurpose the goroutine (that exists per service that fabio registers) that periodically guarantees the service is registered in Consul to also reset the respective TTL check clock; - Set a short deadline in Consul to remove the service in case the TTL check is `critical` after said deadline is expired (time is non-deterministic as fabio doesn't control when the Consul reaper runs); Signed-off-by: Pires --- registry/consul/register.go | 61 +++++++++++++++++++++++++++++++------ 1 file changed, 52 insertions(+), 9 deletions(-) diff --git a/registry/consul/register.go b/registry/consul/register.go index 243b9dfb9..91251eee8 100644 --- a/registry/consul/register.go +++ b/registry/consul/register.go @@ -14,6 +14,12 @@ import ( "github.com/hashicorp/consul/api" ) +const ( + TTLInterval = time.Second * 15 + TTLRefreshInterval = time.Second * 10 + TTLDeregisterCriticalServiceAfter = time.Minute +) + // register keeps a service registered in consul. // // When a value is sent in the dereg channel the service is deregistered from @@ -46,7 +52,9 @@ func register(c *api.Client, service *api.AgentServiceRegistration) chan bool { log.Printf("[INFO] consul: Registered fabio with id %q", service.ID) log.Printf("[INFO] consul: Registered fabio with address %q", service.Address) log.Printf("[INFO] consul: Registered fabio with tags %q", strings.Join(service.Tags, ",")) - log.Printf("[INFO] consul: Registered fabio with health check to %q", service.Check.HTTP) + for _, check := range service.Checks { + log.Printf("[INFO] consul: Registered fabio with check %+v", check) + } return service.ID } @@ -56,12 +64,21 @@ func register(c *api.Client, service *api.AgentServiceRegistration) chan bool { c.Agent().ServiceDeregister(serviceID) } + passTTL := func(serviceTTLID string) { + c.Agent().UpdateTTL(serviceTTLID, "", api.HealthPassing) + } + dereg := make(chan bool) go func() { var serviceID string + var serviceTTLCheckId string + for { if !registered(serviceID) { serviceID = register() + serviceTTLCheckId = computeServiceTTLCheckId(serviceID) + // Pass the TTL check right now so traffic can be served immediately. + passTTL(serviceTTLCheckId) } select { @@ -69,8 +86,9 @@ func register(c *api.Client, service *api.AgentServiceRegistration) chan bool { deregister(serviceID) dereg <- true return - case <-time.After(time.Second): - // continue + case <-time.After(TTLRefreshInterval): + // Reset the TTL check clock. + passTTL(serviceTTLCheckId) } } }() @@ -115,14 +133,39 @@ func serviceRegistration(cfg *config.Consul, serviceName string) (*api.AgentServ Address: ip.String(), Port: port, Tags: cfg.ServiceTags, - Check: &api.AgentServiceCheck{ - HTTP: checkURL, - Interval: cfg.CheckInterval.String(), - Timeout: cfg.CheckTimeout.String(), - TLSSkipVerify: cfg.CheckTLSSkipVerify, - DeregisterCriticalServiceAfter: cfg.CheckDeregisterCriticalServiceAfter, + // Set the checks for the service. + // + // Both checks must pass for Consul to consider the service healthy and therefore serve the fabio instance to clients. + Checks: []*api.AgentServiceCheck{ + // If fabio doesn't exit cleanly, it doesn't auto-deregister itself from Consul. + // In order to address this, we introduce a TTL check to prove the fabio instance is alive and able to route this service. + // The TTL check must be refreshed before its timeout is crossed. + // If the timeout is crossed, the check fails. + // If the check fails, Consul considers this service to have become unhealthy. + // If the check is failing (critical) after DeregisterCriticalServiceAfter is elapsed, the Consul reaper will remove it from Consul. + // For more info, read https://www.consul.io/api/agent/check.html#deregistercriticalserviceafter. + { + CheckID: computeServiceTTLCheckId(serviceID), + TTL: TTLInterval.String(), + DeregisterCriticalServiceAfter: TTLDeregisterCriticalServiceAfter.String(), + }, + // HTTP check is meant to confirm fabio health endpoint is reachable from the Consul agent. + // If the check fails, Consul considers this service to have become unhealthy. + // If the check fails and registry.consul.register.deregisterCriticalServiceAfter is set, the service will be deregistered from Consul. + // For more info, read https://www.consul.io/api/agent/check.html#deregistercriticalserviceafter. + { + HTTP: checkURL, + Interval: cfg.CheckInterval.String(), + Timeout: cfg.CheckTimeout.String(), + TLSSkipVerify: cfg.CheckTLSSkipVerify, + DeregisterCriticalServiceAfter: cfg.CheckDeregisterCriticalServiceAfter, + }, }, } return service, nil } + +func computeServiceTTLCheckId(serviceID string) string { + return strings.Join([]string{serviceID, "ttl"}, "-") +} From 2f907fa3109affcb8ac9b9731f105e768a7b45f0 Mon Sep 17 00:00:00 2001 From: Pires Date: Fri, 14 Jun 2019 09:30:47 +0100 Subject: [PATCH 4/4] changelog: mention #663 and #664 Signed-off-by: Pires --- CHANGELOG.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 68bef50b8..704fef605 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,15 @@ Thanks to [@murphymj25](https://github.com/murphymj25) for the patch. +* [PR #664](https://github.com/fabiolb/fabio/issues/664): Clean-up fabio service entries in Consul on dirty exit + + In the case fabio dies abruptly, the steps to deregister any fabio-related services in Consul will not take place. + In certain cases this could result in duplicate service and check entries after fabio restarts, especially if fabio runs in a Docker container. + + This PR addresses this issue by registering an additional TTL check that acts as a deadman switch and removes abandoned service registrations eventually. + + Thanks to [@pires](https://github.com/pires) for the [report](https://github.com/fabiolb/fabio/issues/663) and the fix. + ### [v1.5.11](https://github.com/fabiolb/fabio/releases/tag/v1.5.11) - 25 Feb 2019 #### Breaking Changes