Skip to content
This repository has been archived by the owner on Dec 15, 2018. It is now read-only.

Commit

Permalink
Allow any jobstep that isn't finished to timeout
Browse files Browse the repository at this point in the history
Summary:
We have a number of stuck jobsteps (and we will continue to get them) from Mesos
where the state is not `in_progress` (`allocated' is a primary culprit) and
the jobstep hasn't started.
We want all unfinished jobsteps to be able to timeout; this change makes that
happen.
Since timing out a jobstep that isn't `in_progress` is almost certainly an
infrastructural issue, a warning is sent to Sentry in those cases.

Test Plan: Light unit; suggestions welcome.

Reviewers: josiah

Reviewed By: josiah

Subscribers: changesbot, wwu

Differential Revision: https://tails.corp.dropbox.com/D93802
  • Loading branch information
kylec1 committed Mar 9, 2015
1 parent 7fbf2ed commit 46df8af
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 4 deletions.
20 changes: 16 additions & 4 deletions changes/jobs/sync_job_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,16 @@ def has_timed_out(step, jobplan, default_timeout):
nothing is expected to run forever.
"""
if step.status != Status.in_progress:
return False
# HACK: We don't really want to timeout jobsteps that are
# stuck for infrastructural reasons here, but we don't yet
# have code to handle other cases well.
# To be sure that jobsteps can die, we cast a wide net here.
if step.status == Status.finished:
return False

if not step.date_started:
return False
# date_started is preferred if available, but we fall back to
# date_created (always set) so jobsteps that never start don't get to run forever.
start_time = step.date_started or step.date_created

# TODO(dcramer): we make an assumption that there is a single step
options = jobplan.get_steps()[0].options
Expand All @@ -95,7 +101,7 @@ def has_timed_out(step, jobplan, default_timeout):
# timeout is in minutes
timeout = timeout * 60

delta = datetime.utcnow() - step.date_started
delta = datetime.utcnow() - start_time
if delta.total_seconds() > timeout:
return True

Expand Down Expand Up @@ -155,6 +161,7 @@ def sync_job_step(step_id):
if not is_finished:
default_timeout = current_app.config['DEFAULT_JOB_TIMEOUT_MIN']
if has_timed_out(step, jobplan, default_timeout=default_timeout):
old_status = step.status
implementation.cancel_step(step=step)

# Not all implementations can actually cancel, but it's dead to us as of now
Expand All @@ -179,6 +186,11 @@ def sync_job_step(step_id):

db.session.flush()
statsreporter.stats().incr('job_step_timed_out')
# If we timeout something that isn't in progress, that's our fault, and we should know.
if old_status != Status.in_progress:
current_app.logger.warning(
"Timed out jobstep that wasn't in progress: %s (was %s)", step.id, old_status)

if step.status != Status.in_progress:
retry_after = QUEUED_RETRY_DELAY
else:
Expand Down
17 changes: 17 additions & 0 deletions tests/changes/jobs/test_sync_job_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,21 @@ def test_simple(self):

assert not has_timed_out(jobstep, jobplan, default_always)

jobstep.status = Status.allocated
jobstep.date_created = datetime.utcnow() - timedelta(minutes=6)
db.session.add(jobstep)
db.session.commit()

# No date_started, but based on config value of 5 and date_created from
# 6 minutes ago, should time out.
assert has_timed_out(jobstep, jobplan, default_never)

jobstep.status = Status.in_progress
jobstep.date_started = datetime.utcnow()
db.session.add(jobstep)
db.session.commit()

# Now we have a recent date_started, shouldn't time out.
assert not has_timed_out(jobstep, jobplan, default_always)

# make it so the job started 6 minutes ago.
Expand All @@ -60,6 +70,13 @@ def test_simple(self):
# Based on config value of 5, should time out.
assert has_timed_out(jobstep, jobplan, default_never)

jobstep.status = Status.allocated
db.session.add(jobstep)
db.session.commit()

# Doesn't require 'in_progress' to time out.
assert has_timed_out(jobstep, jobplan, default_never)

jobplan.data['snapshot']['steps'][0]['options'][option.name] = '0'
db.session.add(jobplan)
db.session.commit()
Expand Down

0 comments on commit 46df8af

Please sign in to comment.