From cfdf2984e8bf5124709389c36952d6b685f8688b Mon Sep 17 00:00:00 2001 From: Prashant Tak Date: Fri, 10 Oct 2025 10:46:11 +0530 Subject: [PATCH 1/3] pendingCreateMachine safety map to prevent early termination --- .../provider/machinecontroller/controller.go | 4 ++ .../provider/machinecontroller/machine.go | 46 ++++++++++++++----- 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/pkg/util/provider/machinecontroller/controller.go b/pkg/util/provider/machinecontroller/controller.go index 2727ecb4c..4dc7aeb5c 100644 --- a/pkg/util/provider/machinecontroller/controller.go +++ b/pkg/util/provider/machinecontroller/controller.go @@ -240,6 +240,10 @@ type controller struct { // - lastAcquire time // it is used to limit removal of `health timed out` machines permitGiver permits.PermitGiver + // pendingMachineCreationMap keeps track of machines that are currently in + // creation flow, this is used to determine whether or not a machine should + //be processed by the termination queue + pendingMachineCreationMap sync.Map // control listers secretLister corelisters.SecretLister diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index 4b60170c3..530313f5a 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -46,7 +46,7 @@ func (c *controller) addMachine(obj interface{}) { // On restart of the controller process, a machine that was marked for // deletion would be processed as part of an `add` event. This check // ensures that its enqueued in the correct queue. - if machine.DeletionTimestamp != nil { + if c.shouldMachineBeMovedToTerminatingQueue(machine) { c.enqueueMachineTermination(machine, "handling terminating machine object ADD event") } else { c.enqueueMachine(obj, "handling machine obj ADD event") @@ -67,7 +67,7 @@ func (c *controller) updateMachine(oldObj, newObj interface{}) { return } - if newMachine.DeletionTimestamp != nil { + if c.shouldMachineBeMovedToTerminatingQueue(newMachine) { c.enqueueMachineTermination(newMachine, "handling terminating machine object UPDATE event") } else { c.enqueueMachine(newObj, "handling machine object UPDATE event") @@ -83,7 +83,7 @@ func (c *controller) deleteMachine(obj interface{}) { } machine, ok = tombstone.Obj.(*v1alpha1.Machine) if !ok { - utilruntime.HandleError(fmt.Errorf("Tombstone contained object that is not a Machine Deployment %#v", obj)) + utilruntime.HandleError(fmt.Errorf("Tombstone contained object that is not a Machine %#v", obj)) return } } @@ -153,8 +153,15 @@ func (c *controller) reconcileClusterMachineKey(key string) error { return err } - if machine.DeletionTimestamp != nil { + // Add finalizers if not present on machine object + _, err = c.addMachineFinalizers(ctx, machine) + if err != nil { + return err + } + + if c.shouldMachineBeMovedToTerminatingQueue(machine) { klog.Errorf("Machine %q should be in machine termination queue", machine.Name) + c.enqueueMachineTermination(machine, "handling terminating machine object") return nil } @@ -202,12 +209,6 @@ func (c *controller) reconcileClusterMachine(ctx context.Context, machine *v1alp return retry, err } - // Add finalizers if not present on machine object - retry, err = c.addMachineFinalizers(ctx, machine) - if err != nil { - return retry, err - } - if machine.Labels[v1alpha1.NodeLabelKey] != "" && machine.Status.CurrentStatus.Phase != "" { // If reference to node object exists execute the below retry, err := c.reconcileMachineHealth(ctx, machine) @@ -237,6 +238,8 @@ func (c *controller) reconcileClusterMachine(ctx context.Context, machine *v1alp } if machine.Spec.ProviderID == "" || machine.Status.CurrentStatus.Phase == "" || machine.Status.CurrentStatus.Phase == v1alpha1.MachineCrashLoopBackOff { + c.pendingMachineCreationMap.Store(machine.Name, "") + return c.triggerCreationFlow( ctx, &driver.CreateMachineRequest{ @@ -484,6 +487,10 @@ func addedOrRemovedEssentialTaints(oldNode, node *corev1.Node, taintKeys []strin */ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineRequest *driver.CreateMachineRequest) (machineutils.RetryPeriod, error) { + defer func() { + c.pendingMachineCreationMap.Delete(createMachineRequest.Machine.Name) + }() + var ( // Declarations nodeName, providerID string @@ -780,11 +787,16 @@ func (c *controller) initializeMachine(ctx context.Context, machine *v1alpha1.Ma func (c *controller) triggerDeletionFlow(ctx context.Context, deleteMachineRequest *driver.DeleteMachineRequest) (machineutils.RetryPeriod, error) { var ( - machine = deleteMachineRequest.Machine - finalizers = sets.NewString(machine.Finalizers...) + machine = deleteMachineRequest.Machine + finalizers = sets.NewString(machine.Finalizers...) + _, isMachineInCreationFlow = c.pendingMachineCreationMap.Load(machine.Name) ) switch { + case isMachineInCreationFlow: + err := fmt.Errorf("Machine %q is in creation flow. Deletion cannot proceed", machine.Name) + return machineutils.MediumRetry, err + case !finalizers.Has(MCMFinalizerName): // If Finalizers are not present on machine err := fmt.Errorf("Machine %q is missing finalizers. Deletion cannot proceed", machine.Name) @@ -857,3 +869,13 @@ func buildAddressStatus(addresses sets.Set[corev1.NodeAddress], nodeName string) }) return res } + +func (c *controller) shouldMachineBeMovedToTerminatingQueue(machine *v1alpha1.Machine) bool { + _, isMachineInCreationFlow := c.pendingMachineCreationMap.Load(machine.Name) + + if machine.DeletionTimestamp != nil && isMachineInCreationFlow { + klog.Infof("Cannot delete machine %q, its deletionTimestamp is set but it is currently being processed by the creation flow\n", machine.Name) + } + + return !isMachineInCreationFlow && machine.DeletionTimestamp != nil +} From 021bc22c77928df283da535ebcb2ba12b006efe5 Mon Sep 17 00:00:00 2001 From: Prashant Tak Date: Mon, 13 Oct 2025 12:23:53 +0530 Subject: [PATCH 2/3] feat(pendingCreateMachineMap): add unit tests --- .../provider/machinecontroller/controller.go | 2 +- .../machinecontroller/machine_test.go | 356 ++++++++++++++++++ 2 files changed, 357 insertions(+), 1 deletion(-) diff --git a/pkg/util/provider/machinecontroller/controller.go b/pkg/util/provider/machinecontroller/controller.go index 4dc7aeb5c..2cbc44019 100644 --- a/pkg/util/provider/machinecontroller/controller.go +++ b/pkg/util/provider/machinecontroller/controller.go @@ -242,7 +242,7 @@ type controller struct { permitGiver permits.PermitGiver // pendingMachineCreationMap keeps track of machines that are currently in // creation flow, this is used to determine whether or not a machine should - //be processed by the termination queue + // be processed by the termination queue. pendingMachineCreationMap sync.Map // control listers diff --git a/pkg/util/provider/machinecontroller/machine_test.go b/pkg/util/provider/machinecontroller/machine_test.go index d293988df..f7a74949a 100644 --- a/pkg/util/provider/machinecontroller/machine_test.go +++ b/pkg/util/provider/machinecontroller/machine_test.go @@ -3614,6 +3614,362 @@ var _ = Describe("machine", func() { ) }) + Describe("#pendingMachineCreationMap tests", func() { + type setup struct { + secrets []*corev1.Secret + machineClasses []*v1alpha1.MachineClass + machines []*v1alpha1.Machine + nodes []*corev1.Node + fakeResourceActions *customfake.ResourceActions + controller *controller + noTargetCluster bool + } + type machineActionRequest struct { + machine *v1alpha1.Machine + machineClass *v1alpha1.MachineClass + secret *corev1.Secret + } + type action struct { + machine string + forceDeleteLabelPresent bool + fakeMachineStatus *v1alpha1.MachineStatus + fakeDriver *driver.FakeDriver + // These fields are used to change the test scenario (add to pending map, call creation/deletion flow) + isCreation bool + testFunc func(setup, machineActionRequest) (machineutils.RetryPeriod, error) + } + type expect struct { + machine *v1alpha1.Machine + err error + nodeTerminationConditionIsSet bool + nodeDeleted bool + retry machineutils.RetryPeriod + } + type data struct { + setup setup + action action + expect expect + } + objMeta := &metav1.ObjectMeta{ + GenerateName: "machine", + Namespace: "test", + CreationTimestamp: metav1.Now(), + } + commonSetup := setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + }, + }, + machineClasses: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + }, + machines: newMachines( + 1, + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineRunning, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: "Machine machine-0 successfully joined the cluster", + State: v1alpha1.MachineStateSuccessful, + Type: v1alpha1.MachineOperationCreate, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + v1alpha1.NodeLabelKey: "fakeID-0", + }, + true, + metav1.Now(), + ), + } + DescribeTable("##table", + func(data *data) { + stop := make(chan struct{}) + defer close(stop) + + machineObjects := []runtime.Object{} + for _, o := range data.setup.machineClasses { + machineObjects = append(machineObjects, o) + } + for _, o := range data.setup.machines { + machineObjects = append(machineObjects, o) + } + + controlCoreObjects := []runtime.Object{} + targetCoreObjects := []runtime.Object{} + + for _, o := range data.setup.secrets { + controlCoreObjects = append(controlCoreObjects, o) + } + for _, o := range data.setup.nodes { + targetCoreObjects = append(targetCoreObjects, o) + } + + fakeDriver := driver.NewFakeDriver( + data.action.fakeDriver.VMExists, + data.action.fakeDriver.ProviderID, + data.action.fakeDriver.NodeName, + data.action.fakeDriver.LastKnownState, + data.action.fakeDriver.Addresses, + data.action.fakeDriver.Err, + nil, + ) + + var trackers *customfake.FakeObjectTrackers + data.setup.controller, trackers = createController(stop, objMeta.Namespace, machineObjects, controlCoreObjects, targetCoreObjects, fakeDriver, data.setup.noTargetCluster) + + defer trackers.Stop() + waitForCacheSync(stop, data.setup.controller) + + action := data.action + machine, err := data.setup.controller.controlMachineClient.Machines(objMeta.Namespace).Get(context.TODO(), action.machine, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + + machineClass, err := data.setup.controller.controlMachineClient.MachineClasses(objMeta.Namespace).Get(context.TODO(), machine.Spec.Class.Name, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + + secret, err := data.setup.controller.controlCoreClient.CoreV1().Secrets(objMeta.Namespace).Get(context.TODO(), machineClass.SecretRef.Name, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + + if data.setup.fakeResourceActions != nil { + _ = trackers.TargetCore.SetFakeResourceActions(data.setup.fakeResourceActions, math.MaxInt32) + } + + // ****************************************************** + // This is changing the setup in accordance with the test + retry, err := data.action.testFunc(data.setup, machineActionRequest{ + machine: machine, + machineClass: machineClass, + secret: secret, + }) + // ****************************************************** + + if err != nil || data.expect.err != nil { + Expect(err).To(Equal(data.expect.err)) + } + Expect(retry).To(Equal(data.expect.retry)) + + if data.action.isCreation { + _, found := data.setup.controller.pendingMachineCreationMap.Load(data.expect.machine.Name) + Expect(found).To(Equal(false)) + } + + machine, err = data.setup.controller.controlMachineClient.Machines(objMeta.Namespace).Get(context.TODO(), action.machine, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + Expect(machine.Spec).To(Equal(data.expect.machine.Spec)) + Expect(machine.Status.CurrentStatus.Phase).To(Equal(data.expect.machine.Status.CurrentStatus.Phase)) + Expect(machine.Status.LastOperation.State).To(Equal(data.expect.machine.Status.LastOperation.State)) + Expect(machine.Status.LastOperation.Type).To(Equal(data.expect.machine.Status.LastOperation.Type)) + Expect(machine.Status.LastOperation.Description).To(Equal(data.expect.machine.Status.LastOperation.Description)) + Expect(machine.Finalizers).To(Equal(data.expect.machine.Finalizers)) + + if data.expect.nodeDeleted { + _, nodeErr := data.setup.controller.targetCoreClient.CoreV1().Nodes().Get(context.TODO(), machine.Labels[v1alpha1.NodeLabelKey], metav1.GetOptions{}) + Expect(nodeErr).To(HaveOccurred()) + } + if data.expect.nodeTerminationConditionIsSet { + node, nodeErr := data.setup.controller.targetCoreClient.CoreV1().Nodes().Get(context.TODO(), machine.Labels[v1alpha1.NodeLabelKey], metav1.GetOptions{}) + Expect(nodeErr).To(Not(HaveOccurred())) + Expect(len(node.Status.Conditions)).To(Equal(1)) + Expect(node.Status.Conditions[0].Type).To(Equal(machineutils.NodeTerminationCondition)) + Expect(node.Status.Conditions[0].Status).To(Equal(corev1.ConditionTrue)) + } + }, + Entry("Remove machine from pendingMachineCreationMap after it exits creation flow", &data{ + setup: setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + Data: map[string][]byte{"userData": []byte("test")}, + }, + }, + machineClasses: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + }, + machines: newMachines(1, &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + }, + }, nil, nil, nil, nil, true, metav1.Now()), + }, + action: action{ + machine: "machine-0", + fakeDriver: &driver.FakeDriver{ + VMExists: true, + ProviderID: "fakeID-0", + NodeName: "fakeNode-0", + Err: nil, + }, + isCreation: true, + testFunc: func(setUp setup, req machineActionRequest) (machineutils.RetryPeriod, error) { + setUp.controller.pendingMachineCreationMap.Store("machine-0", "") + return setUp.controller.triggerCreationFlow(context.TODO(), &driver.CreateMachineRequest{ + Machine: req.machine, + MachineClass: req.machineClass, + Secret: req.secret, + }) + }, + }, + expect: expect{ + machine: newMachine(&v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, nil, nil, nil, map[string]string{v1alpha1.NodeLabelKey: "fakeNode-0"}, true, metav1.Now()), + err: fmt.Errorf("machine creation in process. Machine labels/annotations update is successful"), + retry: machineutils.ShortRetry, + }, + }), + Entry("Do not process machine deletion for machine present in pendingMachineCreationMap", &data{ + setup: commonSetup, + action: action{ + machine: "machine-0", + fakeDriver: &driver.FakeDriver{ + VMExists: true, + ProviderID: "fakeID-0", + NodeName: "fakeNode-0", + Err: nil, + }, + isCreation: false, + testFunc: func(setUp setup, req machineActionRequest) (machineutils.RetryPeriod, error) { + setUp.controller.pendingMachineCreationMap.Store("machine-0", "") + return setUp.controller.triggerDeletionFlow(context.TODO(), &driver.DeleteMachineRequest{ + Machine: req.machine, + MachineClass: req.machineClass, + Secret: req.secret, + }) + }, + }, + expect: expect{ + err: fmt.Errorf("Machine \"machine-0\" is in creation flow. Deletion cannot proceed"), + retry: machineutils.MediumRetry, + machine: newMachine( + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineRunning, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: "Machine machine-0 successfully joined the cluster", + State: v1alpha1.MachineStateSuccessful, + Type: v1alpha1.MachineOperationCreate, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + v1alpha1.NodeLabelKey: "fakeID-0", + }, + true, + metav1.Now(), + ), + }, + }), + Entry("Proceed with machine deletion for machine not present in pendingMachineCreationMap", &data{ + setup: commonSetup, + action: action{ + machine: "machine-0", + fakeDriver: &driver.FakeDriver{ + VMExists: true, + ProviderID: "fakeID-0", + NodeName: "fakeNode-0", + Err: nil, + }, + isCreation: false, + testFunc: func(setUp setup, req machineActionRequest) (machineutils.RetryPeriod, error) { + setUp.controller.pendingMachineCreationMap.Store("machine-xyz", "") + return setUp.controller.triggerDeletionFlow(context.TODO(), &driver.DeleteMachineRequest{ + Machine: req.machine, + MachineClass: req.machineClass, + Secret: req.secret, + }) + }, + }, + expect: expect{ + err: fmt.Errorf("Machine deletion in process. Phase set to termination"), + retry: machineutils.ShortRetry, + machine: newMachine( + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: machineutils.GetVMStatus, + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + v1alpha1.NodeLabelKey: "fakeID-0", + }, + true, + metav1.Now(), + ), + }, + }), + ) + }) /* Describe("#checkMachineTimeout", func() { type setup struct { From e5d644a9bc7c4a6eedd683172181584a8da41c98 Mon Sep 17 00:00:00 2001 From: Prashant Tak Date: Mon, 13 Oct 2025 15:36:18 +0530 Subject: [PATCH 3/3] Address review comments --- pkg/util/provider/machinecontroller/machine.go | 6 +++--- pkg/util/provider/machinecontroller/machine_test.go | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index 530313f5a..5f1e36c7a 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -83,7 +83,7 @@ func (c *controller) deleteMachine(obj interface{}) { } machine, ok = tombstone.Obj.(*v1alpha1.Machine) if !ok { - utilruntime.HandleError(fmt.Errorf("Tombstone contained object that is not a Machine %#v", obj)) + utilruntime.HandleError(fmt.Errorf("tombstone contained object that is not a Machine %#v", obj)) return } } @@ -794,7 +794,7 @@ func (c *controller) triggerDeletionFlow(ctx context.Context, deleteMachineReque switch { case isMachineInCreationFlow: - err := fmt.Errorf("Machine %q is in creation flow. Deletion cannot proceed", machine.Name) + err := fmt.Errorf("machine %q is in creation flow. Deletion cannot proceed", machine.Name) return machineutils.MediumRetry, err case !finalizers.Has(MCMFinalizerName): @@ -874,7 +874,7 @@ func (c *controller) shouldMachineBeMovedToTerminatingQueue(machine *v1alpha1.Ma _, isMachineInCreationFlow := c.pendingMachineCreationMap.Load(machine.Name) if machine.DeletionTimestamp != nil && isMachineInCreationFlow { - klog.Infof("Cannot delete machine %q, its deletionTimestamp is set but it is currently being processed by the creation flow\n", machine.Name) + klog.Warningf("Cannot delete machine %q, its deletionTimestamp is set but it is currently being processed by the creation flow\n", machine.Name) } return !isMachineInCreationFlow && machine.DeletionTimestamp != nil diff --git a/pkg/util/provider/machinecontroller/machine_test.go b/pkg/util/provider/machinecontroller/machine_test.go index f7a74949a..0defc0a99 100644 --- a/pkg/util/provider/machinecontroller/machine_test.go +++ b/pkg/util/provider/machinecontroller/machine_test.go @@ -3873,7 +3873,7 @@ var _ = Describe("machine", func() { }, }, expect: expect{ - err: fmt.Errorf("Machine \"machine-0\" is in creation flow. Deletion cannot proceed"), + err: fmt.Errorf("machine \"machine-0\" is in creation flow. Deletion cannot proceed"), retry: machineutils.MediumRetry, machine: newMachine( &v1alpha1.MachineTemplateSpec{