Skip to content

Commit

Permalink
Improve retrying of failed sharded background migrations
Browse files Browse the repository at this point in the history
  • Loading branch information
fatkodima committed May 7, 2024
1 parent edf5de1 commit bc46c60
Show file tree
Hide file tree
Showing 9 changed files with 116 additions and 26 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
## master (unreleased)

- Improve retrying of failed sharded background migrations
- Fix a bug when retried background data migration can not start
- Do not run multiple background schema migrations on the same table at the same time

Expand Down
11 changes: 11 additions & 0 deletions docs/background_data_migrations.md
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,17 @@ migration.progress # value from 0 to 100.0

**Note**: It will be easier to work with background migrations through some kind of Web UI, but until it is implemented, we can work with them only manually.

## Retrying a failed migration

To retry a failed migration, run:

```ruby
migration = OnlineMigrations::BackgroundMigrations::Migration.find(id)
migration.retry # => `true` if scheduled to be retried, `false` - if not
```

The migration will be retried on the next Scheduler run.

## Configuring

There are a few configurable options for the Background Migrations. Custom configurations should be placed in a `online_migrations.rb` initializer.
Expand Down
11 changes: 11 additions & 0 deletions docs/background_schema_migrations.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,17 @@ end

You shouldn't depend on the schema until the background schema migration is finished. If having the schema migrated is a requirement, then the `ensure_background_schema_migration_succeeded` helper can be used to guarantee that the migration succeeded and the schema change applied.

## Retrying a failed migration

To retry a failed migration, run:

```ruby
migration = OnlineMigrations::BackgroundSchemaMigrations::Migration.find(id)
migration.retry # => `true` if scheduled to be retried, `false` - if not
```

The migration will be retried on the next Scheduler run.

## Instrumentation

Background schema migrations use the [ActiveSupport::Notifications](http://api.rubyonrails.org/classes/ActiveSupport/Notifications.html) API.
Expand Down
26 changes: 17 additions & 9 deletions lib/online_migrations/background_migrations/migration.rb
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@ class Migration < ApplicationRecord
enum status: STATUSES.index_with(&:to_s)
end

belongs_to :parent, class_name: name, optional: true
has_many :children, class_name: name, foreign_key: :parent_id, dependent: :delete_all
has_many :migration_jobs, dependent: :delete_all
belongs_to :parent, class_name: name, optional: true, inverse_of: :children
has_many :children, class_name: name, foreign_key: :parent_id, dependent: :delete_all, inverse_of: :parent
has_many :migration_jobs, dependent: :delete_all, inverse_of: :migration

validates :migration_name, :batch_column_name, presence: true

Expand Down Expand Up @@ -156,20 +156,28 @@ def interval_elapsed?
last_job.enqueued? || (last_job.updated_at + batch_pause <= Time.current)
end

# Manually retry failed jobs.
# Mark this migration as ready to be processed again.
#
# This method marks failed jobs as ready to be processed again, and
# they will be picked up on the next Scheduler run.
#
def retry_failed_jobs
iterator = BatchIterator.new(migration_jobs.failed)
iterator.each_batch(of: 100) do |batch|
transaction do
def retry
if composite? && failed?
children.failed.each(&:retry)
enqueued!
true
elsif failed?
iterator = BatchIterator.new(migration_jobs.failed)
iterator.each_batch(of: 100) do |batch|
batch.each(&:retry)
enqueued!
end
enqueued!
true
else
false
end
end
alias retry_failed_jobs retry

# Returns the time this migration started running.
def started_at
Expand Down
28 changes: 18 additions & 10 deletions lib/online_migrations/background_migrations/migration_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class MigrationJob < ApplicationRecord
delegate :migration_name, :migration_class, :migration_object, :migration_relation, :batch_column_name,
:arguments, :batch_pause, to: :migration

belongs_to :migration
belongs_to :migration, inverse_of: :migration_jobs

validates :min_value, :max_value, presence: true, numericality: { greater_than: 0 }
validate :values_in_migration_range, if: :min_value?
Expand All @@ -69,15 +69,23 @@ def stuck?
# This is used when retrying failed jobs.
#
def retry
update!(
status: self.class.statuses[:enqueued],
attempts: 0,
started_at: nil,
finished_at: nil,
error_class: nil,
error_message: nil,
backtrace: nil
)
if failed?
transaction do
update!(
status: self.class.statuses[:enqueued],
attempts: 0,
started_at: nil,
finished_at: nil,
error_class: nil,
error_message: nil,
backtrace: nil
)
migration.enqueued! if migration.failed?
end
true
else
false
end
end

private
Expand Down
11 changes: 8 additions & 3 deletions lib/online_migrations/background_schema_migrations/migration.rb
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ class Migration < ApplicationRecord
enum status: STATUSES.index_with(&:to_s)
end

belongs_to :parent, class_name: name, optional: true
has_many :children, class_name: name, foreign_key: :parent_id
belongs_to :parent, class_name: name, optional: true, inverse_of: :children
has_many :children, class_name: name, foreign_key: :parent_id, inverse_of: :parent

validates :table_name, presence: true, length: { maximum: MAX_IDENTIFIER_LENGTH }
validates :definition, presence: true
Expand Down Expand Up @@ -109,8 +109,10 @@ def progress
# This is used to manually retrying failed migrations.
#
def retry
if composite?
if composite? && failed?
children.failed.each(&:retry)
enqueued!
true
elsif failed?
update!(
status: self.class.statuses[:enqueued],
Expand All @@ -121,6 +123,9 @@ def retry
error_message: nil,
backtrace: nil
)
true
else
false
end
end

Expand Down
4 changes: 3 additions & 1 deletion test/background_migrations/migration_job_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,9 @@ def test_retry
assert_equal "Boom!", error.message
j = m.last_job

j.retry
assert j.failed?
assert j.retry
assert m.running? # not updated to `enqueued`, because it was not `failed`
assert j.enqueued?
assert_equal 0, j.attempts
assert_nil j.started_at
Expand Down
27 changes: 25 additions & 2 deletions test/background_migrations/migration_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -221,20 +221,43 @@ def test_interval_elapsed_p
end
end

def test_retry_failed_jobs
def test_retry
2.times { User.create! }
m = create_migration(batch_size: 1, sub_batch_size: 1)
3.times { run_migration_job(m) }
assert m.succeeded?
assert_equal false, m.retry

m.update_column(:status, "failed")
m.migration_jobs.update_all(status: "failed")

m.retry_failed_jobs
assert m.retry
assert m.migration_jobs.all?(&:enqueued?)
assert m.enqueued?
end

def test_retry_composite
m = create_migration(migration_name: "MakeAllDogsNice")
child1, child2, child3 = m.children.to_a
run_all_migration_jobs(m)

migrations = [m, child1, child2, child3]
assert migrations.all?(&:succeeded?)
assert_equal false, m.retry

m.update_column(:status, "failed")
child1.update_column(:status, "failed")
child1.migration_jobs.update_all(status: "failed")

assert m.retry

migrations.each(&:reload)
assert m.enqueued?
assert child1.enqueued?
assert child3.succeeded?
assert child1.migration_jobs.all?(&:enqueued?)
end

def test_started_at
m = create_migration
assert_equal m.created_at, m.started_at
Expand Down
23 changes: 22 additions & 1 deletion test/background_schema_migrations/migration_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def test_retry
end
assert m.failed?

m.retry
assert m.retry

m.reload
assert m.enqueued?
Expand All @@ -171,6 +171,27 @@ def test_retry
assert_nil m.backtrace
end

def test_retry_composite
m = create_sharded_migration
run_migration(m)
assert m.succeeded?

child1, child2, child3 = m.children.to_a
migrations = [m, child1, child2, child3]
assert migrations.all?(&:succeeded?)
assert_equal false, m.retry

m.update_column(:status, "failed")
child1.update_column(:status, "failed")

assert m.retry

migrations.each(&:reload)
assert m.enqueued?
assert child1.enqueued?
assert child3.succeeded?
end

private
def create_migration(
name: "index_users_on_email",
Expand Down

0 comments on commit bc46c60

Please sign in to comment.