Skip to content
This repository has been archived by the owner on Sep 26, 2021. It is now read-only.

Commit

Permalink
Merge pull request #4317 from nolith/aws-cleanup-on-failure
Browse files Browse the repository at this point in the history
Ensure proper cleanup of dangling resources in case of failure
  • Loading branch information
dgageot committed Jan 28, 2018
2 parents ae6078d + 88010c2 commit 83d186e
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 4 deletions.
44 changes: 40 additions & 4 deletions drivers/amazonec2/amazonec2.go
Expand Up @@ -109,6 +109,8 @@ type Driver struct {
Endpoint string
DisableSSL bool
UserDataFile string

spotInstanceRequestId string
}

type clientFactory interface {
Expand Down Expand Up @@ -573,6 +575,16 @@ func (d *Driver) Create() error {
return err
}

if err := d.innerCreate(); err != nil {
// cleanup partially created resources
d.Remove()
return err
}

return nil
}

func (d *Driver) innerCreate() error {
log.Infof("Launching instance...")

if err := d.createKeyPair(); err != nil {
Expand Down Expand Up @@ -639,12 +651,13 @@ func (d *Driver) Create() error {
if err != nil {
return fmt.Errorf("Error request spot instance: %s", err)
}
d.spotInstanceRequestId = *spotInstanceRequest.SpotInstanceRequests[0].SpotInstanceRequestId

log.Info("Waiting for spot instance...")
for i := 0; i < 3; i++ {
// AWS eventual consistency means we could not have SpotInstanceRequest ready yet
err = d.getClient().WaitUntilSpotInstanceRequestFulfilled(&ec2.DescribeSpotInstanceRequestsInput{
SpotInstanceRequestIds: []*string{spotInstanceRequest.SpotInstanceRequests[0].SpotInstanceRequestId},
SpotInstanceRequestIds: []*string{&d.spotInstanceRequestId},
})
if err != nil {
if awsErr, ok := err.(awserr.Error); ok {
Expand All @@ -657,15 +670,15 @@ func (d *Driver) Create() error {
}
break
}
log.Info("Created spot instance request %v", *spotInstanceRequest.SpotInstanceRequests[0].SpotInstanceRequestId)
log.Infof("Created spot instance request %v", d.spotInstanceRequestId)
// resolve instance id
for i := 0; i < 3; i++ {
// Even though the waiter succeeded, eventual consistency means we could
// get a describe output that does not include this information. Try a
// few times just in case
var resolvedSpotInstance *ec2.DescribeSpotInstanceRequestsOutput
resolvedSpotInstance, err = d.getClient().DescribeSpotInstanceRequests(&ec2.DescribeSpotInstanceRequestsInput{
SpotInstanceRequestIds: []*string{spotInstanceRequest.SpotInstanceRequests[0].SpotInstanceRequestId},
SpotInstanceRequestIds: []*string{&d.spotInstanceRequestId},
})
if err != nil {
// Unexpected; no need to retry
Expand Down Expand Up @@ -868,6 +881,14 @@ func (d *Driver) Remove() error {
multierr.Errs = append(multierr.Errs, err)
}

// In case of failure waiting for a SpotInstance, we must cancel the unfulfilled request, otherwise an instance may be created later.
// If the instance was created, terminating it will be enough for canceling the SpotInstanceRequest
if d.RequestSpotInstance && d.spotInstanceRequestId != "" {
if err := d.cancelSpotInstanceRequest(); err != nil {
multierr.Errs = append(multierr.Errs, err)
}
}

if !d.ExistingKey {
if err := d.deleteKeyPair(); err != nil {
multierr.Errs = append(multierr.Errs, err)
Expand All @@ -881,6 +902,15 @@ func (d *Driver) Remove() error {
return multierr
}

func (d *Driver) cancelSpotInstanceRequest() error {
// NB: Canceling a Spot instance request does not terminate running Spot instances associated with the request
_, err := d.getClient().CancelSpotInstanceRequests(&ec2.CancelSpotInstanceRequestsInput{
SpotInstanceRequestIds: []*string{&d.spotInstanceRequestId},
})

return err
}

func (d *Driver) getInstance() (*ec2.Instance, error) {
instances, err := d.getClient().DescribeInstances(&ec2.DescribeInstancesInput{
InstanceIds: []*string{&d.InstanceId},
Expand Down Expand Up @@ -955,7 +985,8 @@ func (d *Driver) createKeyPair() error {

func (d *Driver) terminate() error {
if d.InstanceId == "" {
return fmt.Errorf("unknown instance")
log.Warn("Missing instance ID, this is likely due to a failure during machine creation")
return nil
}

log.Debugf("terminating instance: %s", d.InstanceId)
Expand Down Expand Up @@ -1168,6 +1199,11 @@ func (d *Driver) configureSecurityGroupPermissions(group *ec2.SecurityGroup) ([]
}

func (d *Driver) deleteKeyPair() error {
if d.KeyName == "" {
log.Warn("Missing key pair name, this is likely due to a failure during machine creation")
return nil
}

log.Debugf("deleting key pair: %s", d.KeyName)

_, err := d.getClient().DeleteKeyPair(&ec2.DeleteKeyPairInput{
Expand Down
1 change: 1 addition & 0 deletions drivers/amazonec2/ec2client.go
Expand Up @@ -48,4 +48,5 @@ type Ec2Client interface {
DescribeSpotInstanceRequests(input *ec2.DescribeSpotInstanceRequestsInput) (*ec2.DescribeSpotInstanceRequestsOutput, error)

WaitUntilSpotInstanceRequestFulfilled(input *ec2.DescribeSpotInstanceRequestsInput) error
CancelSpotInstanceRequests(input *ec2.CancelSpotInstanceRequestsInput) (*ec2.CancelSpotInstanceRequestsOutput, error)
}

0 comments on commit 83d186e

Please sign in to comment.