Skip to content
This repository has been archived by the owner on Dec 15, 2018. It is now read-only.

Commit

Permalink
Use a default timeout for job steps without one specified
Browse files Browse the repository at this point in the history
Summary:
Without a timeout, jobs that don't finish successfully can continue running indefinitely.
This isn't useful, and can't be easily solved by adding timeouts later, as builds run based on
the config when they were created.
This change provides a default timeout of 1 hour when one wasn't specified.
Jobs that run for longer than that legitimately should specify a longer timeout, as otherwise
we have difficulty distinguishing them from stuck jobs.

Test Plan: Unit

Reviewers: vishal

Reviewed By: vishal

Subscribers: changesbot, wwu

Differential Revision: https://tails.corp.dropbox.com/D92037
  • Loading branch information
kylec1 committed Feb 27, 2015
1 parent 417aec3 commit e4da9dd
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 14 deletions.
5 changes: 5 additions & 0 deletions changes/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,11 @@ def create_app(_read_config=True, **config):
app.config['MAIL_DEFAULT_SENDER'] = 'changes@localhost'
app.config['BASE_URI'] = 'http://localhost:5000'

# In minutes, the timeout applied to jobs without a timeout specified at build time.
# A timeout should nearly always be specified; this is just a safeguard so that
# unspecified timeout doesn't mean "is allowed to run indefinitely".
app.config['DEFAULT_JOB_TIMEOUT_MIN'] = 60

app.config.update(config)
if _read_config:
if os.environ.get('CHANGES_CONF'):
Expand Down
21 changes: 16 additions & 5 deletions changes/jobs/sync_job_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,13 @@ def has_test_failures(step):
).exists()).scalar()


def has_timed_out(step, jobplan):
def has_timed_out(step, jobplan, default_timeout):
"""
Args:
default_timeout (int): Timeout in minutes to be used when
no timeout is specified for this build. Required because
nothing is expected to run forever.
"""
if step.status != Status.in_progress:
return False

Expand All @@ -84,9 +90,7 @@ def has_timed_out(step, jobplan):
# TODO(dcramer): we make an assumption that there is a single step
options = jobplan.get_steps()[0].options

timeout = int(options.get('build.timeout') or 0)
if not timeout:
return False
timeout = int(options.get('build.timeout', '0')) or default_timeout

# timeout is in minutes
timeout = timeout * 60
Expand Down Expand Up @@ -123,6 +127,12 @@ def record_coverage_stats(step):
})


# In minutes, the timeout applied to jobs without a timeout specified at build time.
# If the job legitimately takes more than an hour, the build
# should specify an appropriate timeout.
DEFAULT_TIMEOUT_MIN = 60


@tracked_task(on_abort=abort_step, max_retries=100)
def sync_job_step(step_id):
step = JobStep.query.get(step_id)
Expand All @@ -143,7 +153,8 @@ def sync_job_step(step_id):
is_finished = sync_job_step.verify_all_children() == Status.finished

if not is_finished:
if has_timed_out(step, jobplan):
default_timeout = current_app.config['DEFAULT_JOB_TIMEOUT_MIN']
if has_timed_out(step, jobplan, default_timeout=default_timeout):
implementation.cancel_step(step=step)

# Not all implementations can actually cancel, but it's dead to us as of now
Expand Down
2 changes: 1 addition & 1 deletion partials/admin/project-plan-details.html
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ <h4>{{step.order}}. {{step.name}}</h4>
<div class="form-group">
<label>Timeout</label>
<input type="text" class="form-control" ng-model="step.options['build.timeout']">
<div class="help-block">Abort step if it runs longer than given duration (in minutes)</div>
<div class="help-block">Abort step if it runs longer than given duration (in minutes). If '0', an arbitrary default will be used.</div>
</div>

<div style="text-align:right">
Expand Down
29 changes: 21 additions & 8 deletions tests/changes/jobs/test_sync_job_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import mock

from datetime import datetime, timedelta
from flask import current_app

from changes.config import db
from changes.constants import Result, Status
Expand Down Expand Up @@ -36,35 +37,45 @@ def test_simple(self):

db.session.commit()

# for use as defaults, an instant timeout and one 1k+ years in the future.
default_always, default_never = 0, 1e9

jobphase = self.create_jobphase(job)
jobstep = self.create_jobstep(jobphase)

assert not has_timed_out(jobstep, jobplan)
assert not has_timed_out(jobstep, jobplan, default_always)

jobstep.status = Status.in_progress
jobstep.date_started = datetime.utcnow()
db.session.add(jobstep)
db.session.commit()

assert not has_timed_out(jobstep, jobplan)
assert not has_timed_out(jobstep, jobplan, default_always)

jobstep.date_started = datetime.utcnow() - timedelta(seconds=400)
# make it so the job started 6 minutes ago.
jobstep.date_started = datetime.utcnow() - timedelta(minutes=6)
db.session.add(jobstep)
db.session.commit()

assert has_timed_out(jobstep, jobplan)
# Based on config value of 5, should time out.
assert has_timed_out(jobstep, jobplan, default_never)

jobplan.data['snapshot']['steps'][0]['options'][option.name] = '0'
db.session.add(jobplan)
db.session.commit()

assert not has_timed_out(jobstep, jobplan)
# The timeout option is unset, so default is used.
assert has_timed_out(jobstep, jobplan, 4)
assert not has_timed_out(jobstep, jobplan, 7)

# Make sure we don't ignore 0 as default like we do with the option.
assert has_timed_out(jobstep, jobplan, 0)

jobplan.data['snapshot']['steps'][0]['options'][option.name] = '500'
jobplan.data['snapshot']['steps'][0]['options'][option.name] = '7'
db.session.add(jobplan)
db.session.commit()

assert not has_timed_out(jobstep, jobplan)
assert not has_timed_out(jobstep, jobplan, default_always)


class IsMissingTestsTest(BaseTestCase):
Expand Down Expand Up @@ -424,13 +435,15 @@ def test_timed_out(self, get_implementation, mock_has_timed_out):

mock_has_timed_out.return_value = True

current_app.config['DEFAULT_JOB_TIMEOUT_MIN'] = 99

sync_job_step(
step_id=step.id.hex,
task_id=step.id.hex,
parent_task_id=job.id.hex
)

mock_has_timed_out.assert_called_once_with(step, jobplan)
mock_has_timed_out.assert_called_once_with(step, jobplan, default_timeout=99)

get_implementation.assert_called_once_with()
implementation.cancel_step.assert_called_once_with(
Expand Down

0 comments on commit e4da9dd

Please sign in to comment.