Skip to content

Commit

Permalink
Fix bug where AWS CPI fails to delete volume in use, and doesn't retry
Browse files Browse the repository at this point in the history
Also DRY sleep_callback (bounded exponential backoff) for resource waits
[fixes #46679995]
  • Loading branch information
Amit Kumar Gupta committed Mar 22, 2013
1 parent 960fc44 commit e2fce50
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 51 deletions.
26 changes: 4 additions & 22 deletions bosh_aws_cpi/lib/cloud/aws/cloud.rb
Original file line number Diff line number Diff line change
Expand Up @@ -202,35 +202,17 @@ def create_disk(size, instance_id = nil)
def delete_disk(disk_id)
with_thread_name("delete_disk(#{disk_id})") do
volume = @ec2.volumes[disk_id]
state = volume.state

if state != :available
cloud_error("Cannot delete volume `#{volume.id}', state is #{state}")
end

@logger.info("Deleting volume `#{volume.id}'")

# even though the contract is that we don't get here until AWS
# reports that the volume is detached, the "eventual consistency"
# might throw a spanner in the machinery and report that it still
# is in use

tries = 10
sleep_cb = lambda do |num_tries, error|
sleep_time = 2**[num_tries,8].min # Exp backoff: 1, 2, 4, 8 ... up to max 256
@logger.debug("Waiting for volume `#{volume.id}' to be deleted")
@logger.debug("#{error.class}: `#{error.message}'") if error
@logger.debug("Retrying in #{sleep_time} seconds - #{num_tries}/#{tries}")
sleep_time
end

sleep_cb = ResourceWait.sleep_callback("Waiting for volume `#{volume.id}' to be deleted", tries)
ensure_cb = Proc.new do |retries|
cloud_error("Timed out waiting to delete volume `#{volume.id}'") if retries == tries
cloud_error("Timed out waiting to delete volume `#{volume.id}'") if retries+1 == tries
end
error = AWS::EC2::Errors::Client::VolumeInUse

errors = [AWS::EC2::Errors::Client::VolumeInUse]

Bosh::Common.retryable(tries: tries, sleep: sleep_cb, on: errors, ensure: ensure_cb) do
Bosh::Common.retryable(tries: tries, sleep: sleep_cb, on: error, ensure: ensure_cb) do
volume.delete
true # return true to only retry on Exceptions
end
Expand Down
20 changes: 12 additions & 8 deletions bosh_aws_cpi/lib/cloud/aws/resource_wait.rb
Original file line number Diff line number Diff line change
Expand Up @@ -130,15 +130,8 @@ def for_resource(args)
tries = args.fetch(:tries, DEFAULT_TRIES).to_i
target_state = args.fetch(:target_state)

sleep_cb = self.class.sleep_callback("Waiting for #{desc} to be #{target_state}", tries)
errors << AWS::EC2::Errors::RequestLimitExceeded

sleep_cb = lambda do |num_tries, error|
sleep_time = 2**[num_tries,MAX_SLEEP_EXPONENT].min # Exp backoff: 1, 2, 4, 8 ... up to max 256
Bosh::AwsCloud::ResourceWait.logger.debug("#{error.class}: `#{error.message}'") if error
Bosh::AwsCloud::ResourceWait.logger.debug("Waiting for #{desc} to be #{target_state}, retrying in #{sleep_time} seconds (#{num_tries}/#{tries})")
sleep_time
end

ensure_cb = Proc.new do |retries|
cloud_error("Timed out waiting for #{desc} to be #{target_state}, took #{time_passed}s") if retries == tries
end
Expand Down Expand Up @@ -166,6 +159,17 @@ def for_resource(args)
def time_passed
Time.now - @started_at
end

private

def self.sleep_callback(description, tries)
lambda do |num_tries, error|
sleep_time = 2**[num_tries, MAX_SLEEP_EXPONENT].min # Exp backoff: 1, 2, 4, 8 ... up to max 256
Bosh::AwsCloud::ResourceWait.logger.debug("#{error.class}: `#{error.message}'") if error
Bosh::AwsCloud::ResourceWait.logger.debug("#{description}, retrying in #{sleep_time} seconds (#{num_tries}/#{tries})")
sleep_time
end
end
end
end

45 changes: 24 additions & 21 deletions bosh_aws_cpi/spec/unit/delete_disk_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,53 +3,56 @@
require "spec_helper"

describe Bosh::AwsCloud::Cloud do

it "deletes an EC2 volume" do
volume = double("volume", :id => "v-foo")

cloud = mock_cloud do |ec2|
let(:volume) { double(AWS::EC2::Volume, :id => "v-foo") }
let(:cloud) do
mock_cloud do |ec2|
ec2.volumes.stub(:[]).with("v-foo").and_return(volume)
end
cloud.stub(:task_checkpoint)
end

before do
Bosh::AwsCloud::ResourceWait.stub(sleep_callback: 0)
end

it "deletes an EC2 volume" do
Bosh::AwsCloud::ResourceWait.stub(:for_volume).with(volume: volume, state: :deleted)

volume.should_receive(:state).and_return(:available)
volume.should_receive(:delete)

cloud.delete_disk("v-foo")
end

it "retries deleting the volume if it's in use" do
Bosh::AwsCloud::ResourceWait.stub(:for_volume).with(volume: volume, state: :deleted)
Bosh::Clouds::Config.stub(:task_checkpoint)

volume.should_receive(:delete).once.ordered.and_raise(AWS::EC2::Errors::Client::VolumeInUse)
volume.should_receive(:delete).ordered

cloud.delete_disk("v-foo")
end

it "doesn't delete volume unless it's state is `available'" do
volume = double("volume", :id => "v-foo", :state => :busy)
it "raises an error if the volume remains in use after every deletion retry" do
Bosh::Clouds::Config.stub(:task_checkpoint)

cloud = mock_cloud do |ec2|
ec2.volumes.stub(:[]).with("v-foo").and_return(volume)
end
volume.should_receive(:delete).exactly(10).times.and_raise(AWS::EC2::Errors::Client::VolumeInUse)

expect {
cloud.delete_disk("v-foo")
}.to raise_error(Bosh::Clouds::CloudError,
"Cannot delete volume `v-foo', state is busy")
}.to raise_error("Timed out waiting to delete volume `v-foo'")
end

it "does a fast path delete when asked to" do
volume = double("volume", :id => "v-foo")

options = mock_cloud_options
options["aws"]["fast_path_delete"] = "yes"

cloud = mock_cloud(options) do |ec2|
ec2.volumes.stub(:[]).with("v-foo").and_return(volume)
end

volume.stub(:state => :available)
volume.should_receive(:delete)

volume.should_receive(:add_tag).with("Name", {:value => "to be deleted"})
Bosh::AwsCloud::ResourceWait.stub(:for_volume).with(volume: volume, state: :deleted)
Bosh::AwsCloud::ResourceWait.should_not_receive(:for_volume)

cloud.delete_disk("v-foo")
end

end

0 comments on commit e2fce50

Please sign in to comment.