Skip to content

Commit

Permalink
Bugfix: Fixes race between machine & machineSet creation/deletion ope…
Browse files Browse the repository at this point in the history
…rations (#391)

* Bugfix: Race between machine/machineSet creation/deletion

* Added tests

* Fixed go vet error on cli
  • Loading branch information
prashanth26 committed Jan 29, 2020
1 parent 535de4f commit 0c52454
Show file tree
Hide file tree
Showing 13 changed files with 122 additions and 48 deletions.
2 changes: 1 addition & 1 deletion cmd/machine-controller-manager-cli/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ func main() {
fmt.Printf("Machine id: %s\n", id)
fmt.Printf("Name: %s\n", name)
} else {
err = driver.Delete()
err = driver.Delete(machineID)
if err != nil {
log.Fatalf("Could not delete %s : %s", machineID, err)
}
Expand Down
19 changes: 18 additions & 1 deletion pkg/controller/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,23 @@ func (c *controller) machineCreate(machine *v1alpha1.Machine, driver driver.Driv
// Get the latest version of the machine so that we can avoid conflicts
machine, err := c.controlMachineClient.Machines(machine.Namespace).Get(machine.Name, metav1.GetOptions{})
if err != nil {

if apierrors.IsNotFound(err) {
glog.Infof("Machine %q not found on APIServer anymore. Deleting created (orphan) VM", machineName)

if err = driver.Delete(actualProviderID); err != nil {
glog.Errorf(
"Deletion failed for orphan machine %q with provider-ID %q: %s",
machine.Name,
actualProviderID,
err,
)
}

// Return with error
return fmt.Errorf("Couldn't find machine object, hence deleted orphan VM")
}

glog.Warningf("Machine GET failed for %q. Retrying, error: %s", machineName, err)
continue
}
Expand Down Expand Up @@ -591,7 +608,7 @@ func (c *controller) machineDelete(machine *v1alpha1.Machine, driver driver.Driv
return err
}

err = driver.Delete()
err = driver.Delete(machineID)
}

if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller/machine_safety.go
Original file line number Diff line number Diff line change
Expand Up @@ -656,7 +656,7 @@ func (c *controller) deleteOrphanVM(vm driver.VMs, secretRef *corev1.Secret, kin
machineName,
)

err := dvr.Delete()
err := dvr.Delete(machineID)
if err != nil {
glog.Errorf("SafetyController: Error while trying to DELETE VM on CP - %s. Shall retry in next safety controller sync.", err)
} else {
Expand Down
71 changes: 64 additions & 7 deletions pkg/controller/machine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"errors"
"fmt"
"math"
"net/http"
"time"

machineapi "github.com/gardener/machine-controller-manager/pkg/apis/machine"
Expand All @@ -33,8 +34,10 @@ import (
. "github.com/onsi/gomega"
corev1 "k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/util/validation/field"
k8stesting "k8s.io/client-go/testing"
)
Expand Down Expand Up @@ -514,14 +517,12 @@ var _ = Describe("machine", func() {
func() (string, string, error) {
return action.fakeProviderID, action.fakeNodeName, action.fakeError
},
nil, nil))
func(string) error {
return action.fakeError
}, nil))

if data.expect.err {
Expect(err).To(HaveOccurred())
actual, err := controller.controlMachineClient.Machines(machine.Namespace).Get(machine.Name, metav1.GetOptions{})
Expect(err).To(BeNil())
Expect(actual.Status.LastOperation.Description).To(Equal(data.expect.machine.Status.LastOperation.Description))
Expect(actual.Status.CurrentStatus.Phase).To(Equal(data.expect.machine.Status.CurrentStatus.Phase))
return
}

Expand Down Expand Up @@ -657,7 +658,15 @@ var _ = Describe("machine", func() {
}, nil, nil, nil, nil),
fakeResourceActions: &customfake.ResourceActions{
Machine: customfake.Actions{
Get: "Failed to GET machine",
Get: apierrors.NewGenericServerResponse(
http.StatusBadRequest,
"dummy method",
schema.GroupResource{},
"dummy name",
"Failed to GET machine",
30,
true,
),
},
},
},
Expand All @@ -684,6 +693,54 @@ var _ = Describe("machine", func() {
err: false,
},
}),
Entry("Orphan VM deletion on faling to find referred machine object", &data{
setup: setup{
secrets: []*corev1.Secret{
{
ObjectMeta: *newObjectMeta(objMeta, 0),
},
},
aws: []*machinev1.AWSMachineClass{
{
ObjectMeta: *newObjectMeta(objMeta, 0),
Spec: machinev1.AWSMachineClassSpec{
SecretRef: newSecretReference(objMeta, 0),
},
},
},
machines: newMachines(1, &machinev1.MachineTemplateSpec{
ObjectMeta: *newObjectMeta(objMeta, 0),
Spec: machinev1.MachineSpec{
Class: machinev1.ClassSpec{
Kind: "AWSMachineClass",
Name: "machine-0",
},
},
}, nil, nil, nil, nil),
fakeResourceActions: &customfake.ResourceActions{
Machine: customfake.Actions{
Get: apierrors.NewGenericServerResponse(
http.StatusNotFound,
"dummy method",
schema.GroupResource{},
"dummy name",
"Machine not found",
30,
true,
),
},
},
},
action: action{
machine: "machine-0",
fakeProviderID: "fakeID-0",
fakeNodeName: "fakeNode-0",
fakeError: nil,
},
expect: expect{
err: true,
},
}),
)
})

Expand Down Expand Up @@ -754,7 +811,7 @@ var _ = Describe("machine", func() {
}
return action.fakeProviderID, action.fakeNodeName, action.fakeError
},
func() error {
func(string) error {
return nil
},
func() (string, error) {
Expand Down
4 changes: 2 additions & 2 deletions pkg/driver/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ import (
// Driver is the common interface for creation/deletion of the VMs over different cloud-providers.
type Driver interface {
Create() (string, string, error)
Delete() error
Delete(string) error
GetExisting() (string, error)
GetVMs(string) (VMs, error)
GetVolNames([]corev1.PersistentVolumeSpec) ([]string, error)
Expand Down Expand Up @@ -100,7 +100,7 @@ func NewDriver(machineID string, secretRef *corev1.Secret, classKind string, mac
func() (string, string, error) {
return "fake", "fake_ip", nil
},
func() error {
func(string) error {
return nil
},
func() (string, error) {
Expand Down
10 changes: 5 additions & 5 deletions pkg/driver/driver_alicloud.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,28 +107,28 @@ func (c *AlicloudDriver) Create() (string, string, error) {
}

// Delete method is used to delete an alicloud machine
func (c *AlicloudDriver) Delete() error {
result, err := c.getVMDetails(c.MachineID)
func (c *AlicloudDriver) Delete(machineID string) error {
result, err := c.getVMDetails(machineID)
if err != nil {
return err
} else if len(result) == 0 {
// No running instance exists with the given machineID
glog.V(2).Infof("No VM matching the machineID found on the provider %q", c.MachineID)
glog.V(2).Infof("No VM matching the machineID found on the provider %q", machineID)
return nil
}

if result[0].Status != "Running" && result[0].Status != "Stopped" {
return errors.New("ec2 instance not in running/stopped state")
}

machineID := c.decodeMachineID(c.MachineID)
instanceID := c.decodeMachineID(machineID)

client, err := c.getEcsClient()
if err != nil {
return err
}

err = c.deleteInstance(client, machineID)
err = c.deleteInstance(client, instanceID)
return err
}

Expand Down
10 changes: 5 additions & 5 deletions pkg/driver/driver_aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -188,23 +188,23 @@ func (d *AWSDriver) generateBlockDevices(blockDevices []v1alpha1.AWSBlockDeviceM
}

// Delete method is used to delete a AWS machine
func (d *AWSDriver) Delete() error {
func (d *AWSDriver) Delete(machineID string) error {

result, err := d.GetVMs(d.MachineID)
result, err := d.GetVMs(machineID)
if err != nil {
return err
} else if len(result) == 0 {
// No running instance exists with the given machine-ID
glog.V(2).Infof("No VM matching the machine-ID found on the provider %q", d.MachineID)
glog.V(2).Infof("No VM matching the machine-ID found on the provider %q", machineID)
return nil
}

machineID := d.decodeMachineID(d.MachineID)
instanceID := d.decodeMachineID(machineID)

svc := d.createSVC()
input := &ec2.TerminateInstancesInput{
InstanceIds: []*string{
aws.String(machineID),
aws.String(instanceID),
},
DryRun: aws.Bool(true),
}
Expand Down
4 changes: 2 additions & 2 deletions pkg/driver/driver_azure.go
Original file line number Diff line number Diff line change
Expand Up @@ -191,15 +191,15 @@ func (d *AzureDriver) Create() (string, string, error) {
}

// Delete method is used to delete an azure machine
func (d *AzureDriver) Delete() error {
func (d *AzureDriver) Delete(machineID string) error {
clients, err := d.setup()
if err != nil {
return err
}

var (
ctx = context.Background()
vmName = decodeMachineID(d.MachineID)
vmName = decodeMachineID(machineID)
nicName = dependencyNameFromVMName(vmName, nicSuffix)
diskName = dependencyNameFromVMName(vmName, diskSuffix)
resourceGroupName = d.AzureMachineClass.Spec.ResourceGroup
Expand Down
8 changes: 4 additions & 4 deletions pkg/driver/driver_fake.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@ import corev1 "k8s.io/api/core/v1"
// FakeDriver is a fake driver returned when none of the actual drivers match
type FakeDriver struct {
create func() (string, string, error)
delete func() error
delete func(string) error
//existing func() (string, v1alpha1.MachinePhase, error)
existing func() (string, error)
getVolNames func([]corev1.PersistentVolumeSpec) ([]string, error)
}

// NewFakeDriver returns a new fakedriver object
func NewFakeDriver(create func() (string, string, error), delete func() error, existing func() (string, error)) Driver {
func NewFakeDriver(create func() (string, string, error), delete func(string) error, existing func() (string, error)) Driver {
return &FakeDriver{
create: create,
delete: delete,
Expand All @@ -43,8 +43,8 @@ func (d *FakeDriver) Create() (string, string, error) {
}

// Delete deletes a fake driver
func (d *FakeDriver) Delete() error {
return d.delete()
func (d *FakeDriver) Delete(machineID string) error {
return d.delete(machineID)
}

// GetExisting returns the existing fake driver
Expand Down
8 changes: 4 additions & 4 deletions pkg/driver/driver_gcp.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,14 +152,14 @@ func (d *GCPDriver) Create() (string, string, error) {
}

// Delete method is used to delete a GCP machine
func (d *GCPDriver) Delete() error {
func (d *GCPDriver) Delete(machineID string) error {

result, err := d.GetVMs(d.MachineID)
result, err := d.GetVMs(machineID)
if err != nil {
return err
} else if len(result) == 0 {
// No running instance exists with the given machine-ID
glog.V(2).Infof("No VM matching the machine-ID found on the provider %q", d.MachineID)
glog.V(2).Infof("No VM matching the machine-ID found on the provider %q", machineID)
return nil
}

Expand All @@ -169,7 +169,7 @@ func (d *GCPDriver) Delete() error {
return err
}

project, zone, name, err := d.decodeMachineID(d.MachineID)
project, zone, name, err := d.decodeMachineID(machineID)
if err != nil {
glog.Error(err)
return err
Expand Down
16 changes: 8 additions & 8 deletions pkg/driver/driver_openstack.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ type OpenStackDriver struct {
// deleteOnFail method is used to delete the VM, which was created with an error
func (d *OpenStackDriver) deleteOnFail(err error) error {
// this method is called after the d.MachineID has been set
if e := d.Delete(); e != nil {
if e := d.Delete(d.MachineID); e != nil {
return fmt.Errorf("Error deleting machine %s (%s) after unsuccessful create attempt: %s", d.MachineID, e.Error(), err.Error())
}
return err
Expand Down Expand Up @@ -218,34 +218,34 @@ func (d *OpenStackDriver) Create() (string, string, error) {
}

// Delete method is used to delete an OS machine
func (d *OpenStackDriver) Delete() error {
res, err := d.GetVMs(d.MachineID)
func (d *OpenStackDriver) Delete(machineID string) error {
res, err := d.GetVMs(machineID)
if err != nil {
return err
} else if len(res) == 0 {
// No running instance exists with the given machine-ID
glog.V(2).Infof("No VM matching the machine-ID found on the provider %q", d.MachineID)
glog.V(2).Infof("No VM matching the machine-ID found on the provider %q", machineID)
return nil
}

machineID := d.decodeMachineID(d.MachineID)
instanceID := d.decodeMachineID(machineID)
client, err := d.createNovaClient()
if err != nil {
return err
}

result := servers.Delete(client, machineID)
result := servers.Delete(client, instanceID)
if result.Err == nil {
// waiting for the machine to be deleted to release consumed quota resources, 5 minutes should be enough
err = waitForStatus(client, machineID, nil, []string{"DELETED", "SOFT_DELETED"}, 300)
if err != nil {
return fmt.Errorf("error waiting for the %q server to be deleted: %s", machineID, err)
}
metrics.APIRequestCount.With(prometheus.Labels{"provider": "openstack", "service": "nova"}).Inc()
glog.V(3).Infof("Deleted machine with ID: %s", d.MachineID)
glog.V(3).Infof("Deleted machine with ID: %s", machineID)
} else {
metrics.APIFailedRequestCount.With(prometheus.Labels{"provider": "openstack", "service": "nova"}).Inc()
glog.Errorf("Failed to delete machine with ID: %s", d.MachineID)
glog.Errorf("Failed to delete machine with ID: %s", machineID)
}

return result.Err
Expand Down
10 changes: 5 additions & 5 deletions pkg/driver/driver_packet.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,20 +71,20 @@ func (d *PacketDriver) Create() (string, string, error) {
}

// Delete method is used to delete a Packet machine
func (d *PacketDriver) Delete() error {
func (d *PacketDriver) Delete(machineID string) error {

svc := d.createSVC()
if svc == nil {
return fmt.Errorf("nil Packet service returned")
}
machineID := d.decodeMachineID(d.MachineID)
resp, err := svc.Devices.Delete(machineID)
instanceID := d.decodeMachineID(machineID)
resp, err := svc.Devices.Delete(instanceID)
if err != nil {
if resp.StatusCode == 404 {
glog.V(2).Infof("No machine matching the machine-ID found on the provider %q", d.MachineID)
glog.V(2).Infof("No machine matching the machineID found on the provider %q", machineID)
return nil
}
glog.Errorf("Could not terminate machine %s: %v", d.MachineID, err)
glog.Errorf("Could not terminate machine %s: %v", machineID, err)
return err
}
return nil
Expand Down

0 comments on commit 0c52454

Please sign in to comment.