Permalink
Browse files

Fix bug where AWS CPI fails to delete volume in use, and doesn't retry

Also DRY sleep_callback (bounded exponential backoff) for resource waits
[fixes #46679995]
  • Loading branch information...
1 parent 960fc44 commit e2fce50adf91975e06b64312d12f4dba22e29ca8 Amit Kumar Gupta committed Mar 22, 2013
@@ -202,35 +202,17 @@ def create_disk(size, instance_id = nil)
def delete_disk(disk_id)
with_thread_name("delete_disk(#{disk_id})") do
volume = @ec2.volumes[disk_id]
- state = volume.state
-
- if state != :available
- cloud_error("Cannot delete volume `#{volume.id}', state is #{state}")
- end
@logger.info("Deleting volume `#{volume.id}'")
- # even though the contract is that we don't get here until AWS
- # reports that the volume is detached, the "eventual consistency"
- # might throw a spanner in the machinery and report that it still
- # is in use
-
tries = 10
- sleep_cb = lambda do |num_tries, error|
- sleep_time = 2**[num_tries,8].min # Exp backoff: 1, 2, 4, 8 ... up to max 256
- @logger.debug("Waiting for volume `#{volume.id}' to be deleted")
- @logger.debug("#{error.class}: `#{error.message}'") if error
- @logger.debug("Retrying in #{sleep_time} seconds - #{num_tries}/#{tries}")
- sleep_time
- end
-
+ sleep_cb = ResourceWait.sleep_callback("Waiting for volume `#{volume.id}' to be deleted", tries)
ensure_cb = Proc.new do |retries|
- cloud_error("Timed out waiting to delete volume `#{volume.id}'") if retries == tries
+ cloud_error("Timed out waiting to delete volume `#{volume.id}'") if retries+1 == tries
end
+ error = AWS::EC2::Errors::Client::VolumeInUse
- errors = [AWS::EC2::Errors::Client::VolumeInUse]
-
- Bosh::Common.retryable(tries: tries, sleep: sleep_cb, on: errors, ensure: ensure_cb) do
+ Bosh::Common.retryable(tries: tries, sleep: sleep_cb, on: error, ensure: ensure_cb) do
volume.delete
true # return true to only retry on Exceptions
end
@@ -130,15 +130,8 @@ def for_resource(args)
tries = args.fetch(:tries, DEFAULT_TRIES).to_i
target_state = args.fetch(:target_state)
+ sleep_cb = self.class.sleep_callback("Waiting for #{desc} to be #{target_state}", tries)
errors << AWS::EC2::Errors::RequestLimitExceeded
-
- sleep_cb = lambda do |num_tries, error|
- sleep_time = 2**[num_tries,MAX_SLEEP_EXPONENT].min # Exp backoff: 1, 2, 4, 8 ... up to max 256
- Bosh::AwsCloud::ResourceWait.logger.debug("#{error.class}: `#{error.message}'") if error
- Bosh::AwsCloud::ResourceWait.logger.debug("Waiting for #{desc} to be #{target_state}, retrying in #{sleep_time} seconds (#{num_tries}/#{tries})")
- sleep_time
- end
-
ensure_cb = Proc.new do |retries|
cloud_error("Timed out waiting for #{desc} to be #{target_state}, took #{time_passed}s") if retries == tries
end
@@ -166,6 +159,17 @@ def for_resource(args)
def time_passed
Time.now - @started_at
end
+
+ private
+
+ def self.sleep_callback(description, tries)
+ lambda do |num_tries, error|
+ sleep_time = 2**[num_tries, MAX_SLEEP_EXPONENT].min # Exp backoff: 1, 2, 4, 8 ... up to max 256
+ Bosh::AwsCloud::ResourceWait.logger.debug("#{error.class}: `#{error.message}'") if error
+ Bosh::AwsCloud::ResourceWait.logger.debug("#{description}, retrying in #{sleep_time} seconds (#{num_tries}/#{tries})")
+ sleep_time
+ end
+ end
end
end
@@ -3,53 +3,56 @@
require "spec_helper"
describe Bosh::AwsCloud::Cloud do
-
- it "deletes an EC2 volume" do
- volume = double("volume", :id => "v-foo")
-
- cloud = mock_cloud do |ec2|
+ let(:volume) { double(AWS::EC2::Volume, :id => "v-foo") }
+ let(:cloud) do
+ mock_cloud do |ec2|
ec2.volumes.stub(:[]).with("v-foo").and_return(volume)
end
- cloud.stub(:task_checkpoint)
+ end
+
+ before do
+ Bosh::AwsCloud::ResourceWait.stub(sleep_callback: 0)
+ end
+
+ it "deletes an EC2 volume" do
+ Bosh::AwsCloud::ResourceWait.stub(:for_volume).with(volume: volume, state: :deleted)
- volume.should_receive(:state).and_return(:available)
volume.should_receive(:delete)
+ cloud.delete_disk("v-foo")
+ end
+
+ it "retries deleting the volume if it's in use" do
Bosh::AwsCloud::ResourceWait.stub(:for_volume).with(volume: volume, state: :deleted)
+ Bosh::Clouds::Config.stub(:task_checkpoint)
+
+ volume.should_receive(:delete).once.ordered.and_raise(AWS::EC2::Errors::Client::VolumeInUse)
+ volume.should_receive(:delete).ordered
cloud.delete_disk("v-foo")
end
- it "doesn't delete volume unless it's state is `available'" do
- volume = double("volume", :id => "v-foo", :state => :busy)
+ it "raises an error if the volume remains in use after every deletion retry" do
+ Bosh::Clouds::Config.stub(:task_checkpoint)
- cloud = mock_cloud do |ec2|
- ec2.volumes.stub(:[]).with("v-foo").and_return(volume)
- end
+ volume.should_receive(:delete).exactly(10).times.and_raise(AWS::EC2::Errors::Client::VolumeInUse)
expect {
cloud.delete_disk("v-foo")
- }.to raise_error(Bosh::Clouds::CloudError,
- "Cannot delete volume `v-foo', state is busy")
+ }.to raise_error("Timed out waiting to delete volume `v-foo'")
end
it "does a fast path delete when asked to" do
- volume = double("volume", :id => "v-foo")
-
options = mock_cloud_options
options["aws"]["fast_path_delete"] = "yes"
-
cloud = mock_cloud(options) do |ec2|
ec2.volumes.stub(:[]).with("v-foo").and_return(volume)
end
- volume.stub(:state => :available)
volume.should_receive(:delete)
-
volume.should_receive(:add_tag).with("Name", {:value => "to be deleted"})
- Bosh::AwsCloud::ResourceWait.stub(:for_volume).with(volume: volume, state: :deleted)
+ Bosh::AwsCloud::ResourceWait.should_not_receive(:for_volume)
cloud.delete_disk("v-foo")
end
-
end

0 comments on commit e2fce50

Please sign in to comment.