Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bugfix: Fixes race between machine & machineSet creation/deletion operations #391

Merged
merged 3 commits into from
Jan 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmd/machine-controller-manager-cli/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ func main() {
fmt.Printf("Machine id: %s\n", id)
fmt.Printf("Name: %s\n", name)
} else {
err = driver.Delete()
err = driver.Delete(machineID)
if err != nil {
log.Fatalf("Could not delete %s : %s", machineID, err)
}
Expand Down
19 changes: 18 additions & 1 deletion pkg/controller/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,23 @@ func (c *controller) machineCreate(machine *v1alpha1.Machine, driver driver.Driv
// Get the latest version of the machine so that we can avoid conflicts
machine, err := c.controlMachineClient.Machines(machine.Namespace).Get(machine.Name, metav1.GetOptions{})
if err != nil {

if apierrors.IsNotFound(err) {
glog.Infof("Machine %q not found on APIServer anymore. Deleting created (orphan) VM", machineName)

if err = driver.Delete(actualProviderID); err != nil {
glog.Errorf(
"Deletion failed for orphan machine %q with provider-ID %q: %s",
machine.Name,
actualProviderID,
err,
)
}

// Return with error
return fmt.Errorf("Couldn't find machine object, hence deleted orphan VM")
}

glog.Warningf("Machine GET failed for %q. Retrying, error: %s", machineName, err)
continue
}
Expand Down Expand Up @@ -591,7 +608,7 @@ func (c *controller) machineDelete(machine *v1alpha1.Machine, driver driver.Driv
return err
}

err = driver.Delete()
err = driver.Delete(machineID)
}

if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller/machine_safety.go
Original file line number Diff line number Diff line change
Expand Up @@ -656,7 +656,7 @@ func (c *controller) deleteOrphanVM(vm driver.VMs, secretRef *corev1.Secret, kin
machineName,
)

err := dvr.Delete()
err := dvr.Delete(machineID)
if err != nil {
glog.Errorf("SafetyController: Error while trying to DELETE VM on CP - %s. Shall retry in next safety controller sync.", err)
} else {
Expand Down
71 changes: 64 additions & 7 deletions pkg/controller/machine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"errors"
"fmt"
"math"
"net/http"
"time"

machineapi "github.com/gardener/machine-controller-manager/pkg/apis/machine"
Expand All @@ -33,8 +34,10 @@ import (
. "github.com/onsi/gomega"
corev1 "k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/util/validation/field"
k8stesting "k8s.io/client-go/testing"
)
Expand Down Expand Up @@ -514,14 +517,12 @@ var _ = Describe("machine", func() {
func() (string, string, error) {
return action.fakeProviderID, action.fakeNodeName, action.fakeError
},
nil, nil))
func(string) error {
return action.fakeError
}, nil))

if data.expect.err {
Expect(err).To(HaveOccurred())
actual, err := controller.controlMachineClient.Machines(machine.Namespace).Get(machine.Name, metav1.GetOptions{})
Expect(err).To(BeNil())
Expect(actual.Status.LastOperation.Description).To(Equal(data.expect.machine.Status.LastOperation.Description))
Expect(actual.Status.CurrentStatus.Phase).To(Equal(data.expect.machine.Status.CurrentStatus.Phase))
return
}

Expand Down Expand Up @@ -657,7 +658,15 @@ var _ = Describe("machine", func() {
}, nil, nil, nil, nil),
fakeResourceActions: &customfake.ResourceActions{
Machine: customfake.Actions{
Get: "Failed to GET machine",
Get: apierrors.NewGenericServerResponse(
http.StatusBadRequest,
"dummy method",
schema.GroupResource{},
"dummy name",
"Failed to GET machine",
30,
true,
),
},
},
},
Expand All @@ -684,6 +693,54 @@ var _ = Describe("machine", func() {
err: false,
},
}),
Entry("Orphan VM deletion on faling to find referred machine object", &data{
setup: setup{
secrets: []*corev1.Secret{
{
ObjectMeta: *newObjectMeta(objMeta, 0),
},
},
aws: []*machinev1.AWSMachineClass{
{
ObjectMeta: *newObjectMeta(objMeta, 0),
Spec: machinev1.AWSMachineClassSpec{
SecretRef: newSecretReference(objMeta, 0),
},
},
},
machines: newMachines(1, &machinev1.MachineTemplateSpec{
ObjectMeta: *newObjectMeta(objMeta, 0),
Spec: machinev1.MachineSpec{
Class: machinev1.ClassSpec{
Kind: "AWSMachineClass",
Name: "machine-0",
},
},
}, nil, nil, nil, nil),
fakeResourceActions: &customfake.ResourceActions{
Machine: customfake.Actions{
Get: apierrors.NewGenericServerResponse(
http.StatusNotFound,
"dummy method",
schema.GroupResource{},
"dummy name",
"Machine not found",
30,
true,
),
},
},
},
action: action{
machine: "machine-0",
fakeProviderID: "fakeID-0",
fakeNodeName: "fakeNode-0",
fakeError: nil,
},
expect: expect{
err: true,
},
}),
)
})

Expand Down Expand Up @@ -754,7 +811,7 @@ var _ = Describe("machine", func() {
}
return action.fakeProviderID, action.fakeNodeName, action.fakeError
},
func() error {
func(string) error {
return nil
},
func() (string, error) {
Expand Down
4 changes: 2 additions & 2 deletions pkg/driver/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ import (
// Driver is the common interface for creation/deletion of the VMs over different cloud-providers.
type Driver interface {
Create() (string, string, error)
Delete() error
Delete(string) error
GetExisting() (string, error)
GetVMs(string) (VMs, error)
GetVolNames([]corev1.PersistentVolumeSpec) ([]string, error)
Expand Down Expand Up @@ -100,7 +100,7 @@ func NewDriver(machineID string, secretRef *corev1.Secret, classKind string, mac
func() (string, string, error) {
return "fake", "fake_ip", nil
},
func() error {
func(string) error {
return nil
},
func() (string, error) {
Expand Down
10 changes: 5 additions & 5 deletions pkg/driver/driver_alicloud.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,28 +107,28 @@ func (c *AlicloudDriver) Create() (string, string, error) {
}

// Delete method is used to delete an alicloud machine
func (c *AlicloudDriver) Delete() error {
result, err := c.getVMDetails(c.MachineID)
func (c *AlicloudDriver) Delete(machineID string) error {
result, err := c.getVMDetails(machineID)
if err != nil {
return err
} else if len(result) == 0 {
// No running instance exists with the given machineID
glog.V(2).Infof("No VM matching the machineID found on the provider %q", c.MachineID)
glog.V(2).Infof("No VM matching the machineID found on the provider %q", machineID)
return nil
}

if result[0].Status != "Running" && result[0].Status != "Stopped" {
return errors.New("ec2 instance not in running/stopped state")
}

machineID := c.decodeMachineID(c.MachineID)
instanceID := c.decodeMachineID(machineID)

client, err := c.getEcsClient()
if err != nil {
return err
}

err = c.deleteInstance(client, machineID)
err = c.deleteInstance(client, instanceID)
return err
}

Expand Down
10 changes: 5 additions & 5 deletions pkg/driver/driver_aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -188,23 +188,23 @@ func (d *AWSDriver) generateBlockDevices(blockDevices []v1alpha1.AWSBlockDeviceM
}

// Delete method is used to delete a AWS machine
func (d *AWSDriver) Delete() error {
func (d *AWSDriver) Delete(machineID string) error {

result, err := d.GetVMs(d.MachineID)
result, err := d.GetVMs(machineID)
if err != nil {
return err
} else if len(result) == 0 {
// No running instance exists with the given machine-ID
glog.V(2).Infof("No VM matching the machine-ID found on the provider %q", d.MachineID)
glog.V(2).Infof("No VM matching the machine-ID found on the provider %q", machineID)
return nil
}

machineID := d.decodeMachineID(d.MachineID)
instanceID := d.decodeMachineID(machineID)

svc := d.createSVC()
input := &ec2.TerminateInstancesInput{
InstanceIds: []*string{
aws.String(machineID),
aws.String(instanceID),
},
DryRun: aws.Bool(true),
}
Expand Down
4 changes: 2 additions & 2 deletions pkg/driver/driver_azure.go
Original file line number Diff line number Diff line change
Expand Up @@ -191,15 +191,15 @@ func (d *AzureDriver) Create() (string, string, error) {
}

// Delete method is used to delete an azure machine
func (d *AzureDriver) Delete() error {
func (d *AzureDriver) Delete(machineID string) error {
clients, err := d.setup()
if err != nil {
return err
}

var (
ctx = context.Background()
vmName = decodeMachineID(d.MachineID)
vmName = decodeMachineID(machineID)
nicName = dependencyNameFromVMName(vmName, nicSuffix)
diskName = dependencyNameFromVMName(vmName, diskSuffix)
resourceGroupName = d.AzureMachineClass.Spec.ResourceGroup
Expand Down
8 changes: 4 additions & 4 deletions pkg/driver/driver_fake.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@ import corev1 "k8s.io/api/core/v1"
// FakeDriver is a fake driver returned when none of the actual drivers match
type FakeDriver struct {
create func() (string, string, error)
delete func() error
delete func(string) error
//existing func() (string, v1alpha1.MachinePhase, error)
existing func() (string, error)
getVolNames func([]corev1.PersistentVolumeSpec) ([]string, error)
}

// NewFakeDriver returns a new fakedriver object
func NewFakeDriver(create func() (string, string, error), delete func() error, existing func() (string, error)) Driver {
func NewFakeDriver(create func() (string, string, error), delete func(string) error, existing func() (string, error)) Driver {
return &FakeDriver{
create: create,
delete: delete,
Expand All @@ -43,8 +43,8 @@ func (d *FakeDriver) Create() (string, string, error) {
}

// Delete deletes a fake driver
func (d *FakeDriver) Delete() error {
return d.delete()
func (d *FakeDriver) Delete(machineID string) error {
return d.delete(machineID)
}

// GetExisting returns the existing fake driver
Expand Down
8 changes: 4 additions & 4 deletions pkg/driver/driver_gcp.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,14 +152,14 @@ func (d *GCPDriver) Create() (string, string, error) {
}

// Delete method is used to delete a GCP machine
func (d *GCPDriver) Delete() error {
func (d *GCPDriver) Delete(machineID string) error {

result, err := d.GetVMs(d.MachineID)
result, err := d.GetVMs(machineID)
if err != nil {
return err
} else if len(result) == 0 {
// No running instance exists with the given machine-ID
glog.V(2).Infof("No VM matching the machine-ID found on the provider %q", d.MachineID)
glog.V(2).Infof("No VM matching the machine-ID found on the provider %q", machineID)
return nil
}

Expand All @@ -169,7 +169,7 @@ func (d *GCPDriver) Delete() error {
return err
}

project, zone, name, err := d.decodeMachineID(d.MachineID)
project, zone, name, err := d.decodeMachineID(machineID)
if err != nil {
glog.Error(err)
return err
Expand Down
16 changes: 8 additions & 8 deletions pkg/driver/driver_openstack.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ type OpenStackDriver struct {
// deleteOnFail method is used to delete the VM, which was created with an error
func (d *OpenStackDriver) deleteOnFail(err error) error {
// this method is called after the d.MachineID has been set
if e := d.Delete(); e != nil {
if e := d.Delete(d.MachineID); e != nil {
return fmt.Errorf("Error deleting machine %s (%s) after unsuccessful create attempt: %s", d.MachineID, e.Error(), err.Error())
}
return err
Expand Down Expand Up @@ -196,34 +196,34 @@ func (d *OpenStackDriver) Create() (string, string, error) {
}

// Delete method is used to delete an OS machine
func (d *OpenStackDriver) Delete() error {
res, err := d.GetVMs(d.MachineID)
func (d *OpenStackDriver) Delete(machineID string) error {
res, err := d.GetVMs(machineID)
if err != nil {
return err
} else if len(res) == 0 {
// No running instance exists with the given machine-ID
glog.V(2).Infof("No VM matching the machine-ID found on the provider %q", d.MachineID)
glog.V(2).Infof("No VM matching the machine-ID found on the provider %q", machineID)
return nil
}

machineID := d.decodeMachineID(d.MachineID)
instanceID := d.decodeMachineID(machineID)
client, err := d.createNovaClient()
if err != nil {
return err
}

result := servers.Delete(client, machineID)
result := servers.Delete(client, instanceID)
if result.Err == nil {
// waiting for the machine to be deleted to release consumed quota resources, 5 minutes should be enough
err = waitForStatus(client, machineID, nil, []string{"DELETED", "SOFT_DELETED"}, 300)
if err != nil {
return fmt.Errorf("error waiting for the %q server to be deleted: %s", machineID, err)
}
metrics.APIRequestCount.With(prometheus.Labels{"provider": "openstack", "service": "nova"}).Inc()
glog.V(3).Infof("Deleted machine with ID: %s", d.MachineID)
glog.V(3).Infof("Deleted machine with ID: %s", machineID)
} else {
metrics.APIFailedRequestCount.With(prometheus.Labels{"provider": "openstack", "service": "nova"}).Inc()
glog.Errorf("Failed to delete machine with ID: %s", d.MachineID)
glog.Errorf("Failed to delete machine with ID: %s", machineID)
}

return result.Err
Expand Down
10 changes: 5 additions & 5 deletions pkg/driver/driver_packet.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,20 +71,20 @@ func (d *PacketDriver) Create() (string, string, error) {
}

// Delete method is used to delete a Packet machine
func (d *PacketDriver) Delete() error {
func (d *PacketDriver) Delete(machineID string) error {

svc := d.createSVC()
if svc == nil {
return fmt.Errorf("nil Packet service returned")
}
machineID := d.decodeMachineID(d.MachineID)
resp, err := svc.Devices.Delete(machineID)
instanceID := d.decodeMachineID(machineID)
resp, err := svc.Devices.Delete(instanceID)
if err != nil {
if resp.StatusCode == 404 {
glog.V(2).Infof("No machine matching the machine-ID found on the provider %q", d.MachineID)
glog.V(2).Infof("No machine matching the machineID found on the provider %q", machineID)
return nil
}
glog.Errorf("Could not terminate machine %s: %v", d.MachineID, err)
glog.Errorf("Could not terminate machine %s: %v", machineID, err)
return err
}
return nil
Expand Down