Skip to content

Commit

Permalink
Prevent race conditions when cancelling service binding creations
Browse files Browse the repository at this point in the history
The CreateBindingAsyncJob processes service credential bindings, service
keys and service route bindings. There can be race conditions when a
delete has been successful just before the polling times out or the
entire job expires.

Add tests to the 'create binding job' shared examples to simulate the
scenarios outlined above. The resource being gone results in a
Sequel::NoExistingObject exception that is now caught and logged.

Furthermore the database resource is only loaded once per job execution
(resulting in less database queries) and reloaded when needed.
  • Loading branch information
philippthun committed Feb 25, 2022
1 parent 5598462 commit e5be915
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 14 deletions.
41 changes: 27 additions & 14 deletions app/jobs/v3/create_binding_async_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
module VCAP::CloudController
module V3
class CreateBindingAsyncJob < Jobs::ReoccurringJob
class OperationCancelled < StandardError; end
class OperationCancelled < CloudController::Errors::ApiError; end

class BindingNotFound < CloudController::Errors::ApiError; end

Expand Down Expand Up @@ -54,9 +54,9 @@ def resource_type
end

def perform
not_found! unless resource
not_found! unless get_resource

cancelled! if delete_in_progress?
cancelled! if other_operation_in_progress?

compute_maximum_duration

Expand All @@ -70,15 +70,13 @@ def perform
polling_status = action.poll(resource)

if polling_status[:finished]
finish
return finish
end

if polling_status[:retry_after].present?
self.polling_interval_seconds = polling_status[:retry_after]
end
rescue OperationCancelled => e
raise CloudController::Errors::ApiError.new_from_details('UnableToPerform', operation_type, e.message)
rescue BindingNotFound => e
rescue BindingNotFound, OperationCancelled => e
raise e
rescue ServiceBindingCreate::BindingNotRetrievable
raise CloudController::Errors::ApiError.new_from_details('ServiceBindingInvalid', 'The broker responded asynchronously but does not support fetching binding data')
Expand All @@ -88,20 +86,27 @@ def perform
end

def handle_timeout
resource.save_with_attributes_and_new_operation(
error_message = "Service Broker failed to #{operation} within the required time."
resource.reload.save_with_attributes_and_new_operation(
{},
{
type: operation_type,
state: 'failed',
description: "Service Broker failed to #{operation} within the required time.",
description: error_message,
}
)
rescue Sequel::NoExistingObject
log_failed_operation_for_non_existing_resource(error_message)
end

private

def get_resource # rubocop:disable Naming/AccessorMethodName
@resource = actor.get_resource(resource_guid)
end

def resource
actor.get_resource(resource_guid)
@resource ||= get_resource
end

def compute_maximum_duration
Expand All @@ -110,15 +115,15 @@ def compute_maximum_duration
end

def not_found!
raise BindingNotFound.new_from_details('ResourceNotFound', "The binding could not be found: #{@resource_guid}")
raise BindingNotFound.new_from_details('ResourceNotFound', "The binding could not be found: #{resource_guid}")
end

def delete_in_progress?
resource.operation_in_progress? && resource.last_operation&.type == 'delete'
def other_operation_in_progress?
resource.operation_in_progress? && resource.last_operation.type != operation_type
end

def cancelled!
raise OperationCancelled.new("#{resource.last_operation.type} in progress")
raise OperationCancelled.new_from_details('UnableToPerform', operation_type, "#{resource.last_operation.type} in progress")
end

def save_failure(error_message)
Expand All @@ -132,6 +137,14 @@ def save_failure(error_message)
}
)
end
rescue Sequel::NoExistingObject
log_failed_operation_for_non_existing_resource(error_message)
end

def log_failed_operation_for_non_existing_resource(error_message)
@logger ||= Steno.logger('cc.background')

@logger.info("Saving failed operation with error message '#{error_message}' for #{resource_type} '#{resource_guid}' did not succeed. The resource does not exist anymore.")
end
end
end
Expand Down
39 changes: 39 additions & 0 deletions spec/support/shared_examples/jobs/create_binding_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -244,4 +244,43 @@ def test_retry_after(value, expected)
expect(binding.last_operation.description).to eq('Service Broker failed to bind within the required time.')
end
end

describe 'cancelling a binding creation' do
let(:action) { double('BindingAction', { bind: nil }) }

before do
allow(VCAP::CloudController::V3::CreateServiceBindingFactory).to receive(:action).and_return(action)
end

context 'when binding gets deleted while service broker polling times out' do
before do
allow(action).to receive(:poll) do
binding.destroy # Simulate a successful deletion happening in parallel...
raise 'Service Broker Timeout' # ... and raise an error (would be: ServiceBrokers::V2::Errors::HttpClientTimeout).
end
end

it 'reports the error and does not try to update the (non-existing) binding / last_operation' do
expect { subject.perform }.to raise_error(
CloudController::Errors::ApiError, 'bind could not be completed: Service Broker Timeout'
)
expect { binding.reload }.to raise_error(Sequel::NoExistingObject)
end
end

context 'when binding gets deleted immediately before the job expires' do
before do
allow(action).to receive(:poll) do
binding.destroy # Simulate a successful deletion happening in parallel.
{ finished: false }
end
end

it 'does not try to update the (non-existing) binding / last_operation' do
subject.perform
subject.handle_timeout
expect { binding.reload }.to raise_error(Sequel::NoExistingObject)
end
end
end
end

0 comments on commit e5be915

Please sign in to comment.