From 77307998ea7acfb472c11587d8cfaa87a843ee96 Mon Sep 17 00:00:00 2001 From: Gabriel Adrian Samfira Date: Mon, 6 Feb 2023 13:34:50 +0200 Subject: [PATCH 1/2] Bail if we fail to cleanup failed instance if we fail to cleanup failed instance, we return before retrying to recreate it. Signed-off-by: Gabriel Adrian Samfira --- runner/pool/pool.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/runner/pool/pool.go b/runner/pool/pool.go index 26553ffa..e396e739 100644 --- a/runner/pool/pool.go +++ b/runner/pool/pool.go @@ -888,6 +888,14 @@ func (r *basePoolManager) retryFailedInstancesForOnePool(pool params.Pool) { // TODO(gabriel-samfira): implement request throttling. if err := r.deleteInstanceFromProvider(inst); err != nil { log.Printf("failed to delete instance %s from provider: %s", inst.Name, err) + // Bail here, otherwise we risk creating multiple failing instances, and losing track + // of them. If Create instance failed to return a proper provider ID, we rely on the + // name to delete the instance. If we don't bail here, and end up with multiple + // instances with the same name, using the name to clean up failed instances will fail + // on any subsequent call, unless the external or native provider takes into account + // non unique names and loops over all of them. Something which is extremely hacky and + // which we would rather avoid. + return } // TODO(gabriel-samfira): Incrementing CreateAttempt should be done within a transaction. @@ -1044,6 +1052,9 @@ func (r *basePoolManager) addPendingInstances() { // won't attempt to create the runner a second time. if err := r.setInstanceStatus(instance.Name, providerCommon.InstanceCreating, nil); err != nil { log.Printf("failed to update runner %s status", instance.Name) + // We failed to transition the instance to Creating. This means that garm will retry to create this instance + // when the loop runs again and we end up with multiple instances. + continue } go func(instance params.Instance) { log.Printf("creating instance %s in pool %s", instance.Name, instance.PoolID) From 439eeee479e4750dfc87cd6b9fc1b5c3f402bac4 Mon Sep 17 00:00:00 2001 From: Gabriel Date: Tue, 7 Feb 2023 13:02:47 +0200 Subject: [PATCH 2/2] Update runner/pool/pool.go Co-authored-by: Michael Kuhnt --- runner/pool/pool.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runner/pool/pool.go b/runner/pool/pool.go index e396e739..5fe8ec0d 100644 --- a/runner/pool/pool.go +++ b/runner/pool/pool.go @@ -1051,7 +1051,7 @@ func (r *basePoolManager) addPendingInstances() { // Set the instance to "creating" before launching the goroutine. This will ensure that addPendingInstances() // won't attempt to create the runner a second time. if err := r.setInstanceStatus(instance.Name, providerCommon.InstanceCreating, nil); err != nil { - log.Printf("failed to update runner %s status", instance.Name) + log.Printf("failed to update runner %s status: %s", instance.Name, err) // We failed to transition the instance to Creating. This means that garm will retry to create this instance // when the loop runs again and we end up with multiple instances. continue