Skip to content

Commit

Permalink
[blinkpy] 3/3: Detect infrastructure failures
Browse files Browse the repository at this point in the history
Prior to the test-results turndown, rebaselining with interrupted builds
prompted the user with a warning that continuing may yield incorrect
results. This change reimplements this detection on the ResultDB-enabled
path by fetching and interpreting each shard's exit code. The tool
previously read the `interrupted` field of the merged results JSON,
which is no longer fetched.

Also, builds with `INFRA_FAILURE` now also trigger the warning. This
addresses silent incorrect rebaselining when a bot fails on an unrelated
step like `bot_update`, such as:
  https://ci.chromium.org/ui/p/chromium/builders/try/win11-blink-rel/1995/overview

Bug: 1213998, 1123077
Test: git cl issue 4143102
Test: ./blink_tool.py rebaseline-cl --patchset=1 --dry-run
Test: git cl issue 3894814
Test: ./blink_tool.py rebaseline-cl --patchset=7 --dry-run
Test: Both rebaseline runs should show the warning
Change-Id: I31bde4706efae82f7d6a564949fc3b70321afd4e
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/4006206
Reviewed-by: Weizhong Xia <weizhong@google.com>
Commit-Queue: Jonathan Lee <jonathanjlee@google.com>
Cr-Commit-Position: refs/heads/main@{#1096359}
  • Loading branch information
jonathan-j-lee authored and Chromium LUCI CQ committed Jan 24, 2023
1 parent 03e7da2 commit dad1b2d
Show file tree
Hide file tree
Showing 10 changed files with 237 additions and 80 deletions.
9 changes: 3 additions & 6 deletions third_party/blink/tools/blinkpy/common/net/git_cl.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ class TryJobStatus(NamedTuple):
"""
status: Literal['MISSING', 'TRIGGERED', 'SCHEDULED', 'STARTED',
'COMPLETED']
result: Literal[None, 'FAILURE', 'SUCCESS', 'CANCELED'] = None
result: Literal[None, 'FAILURE', 'INFRA_FAILURE', 'SUCCESS',
'CANCELED'] = None

@staticmethod
def from_bb_status(bb_status: str) -> 'TryJobStatus':
Expand All @@ -42,11 +43,7 @@ def from_bb_status(bb_status: str) -> 'TryJobStatus':
if bb_status in ('SCHEDULED', 'STARTED'):
return TryJobStatus(bb_status, None)
else:
# Map result INFRA_FAILURE to FAILURE to avoid introducing a new
# result, and it amounts to the same thing anyway.
return TryJobStatus(
'COMPLETED',
'FAILURE' if bb_status == 'INFRA_FAILURE' else bb_status)
return TryJobStatus('COMPLETED', bb_status)


BuildStatuses = Mapping[Build, TryJobStatus]
Expand Down
12 changes: 6 additions & 6 deletions third_party/blink/tools/blinkpy/common/net/git_cl_unittest.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,8 +473,10 @@ def test_latest_try_jobs_failures(self):
git_cl = GitCL(MockHost(web=web))
self.assertEqual(
git_cl.latest_try_jobs(builder_names=['builder-a', 'builder-b']), {
Build('builder-a', 100): TryJobStatus('COMPLETED', 'FAILURE'),
Build('builder-b', 200): TryJobStatus('COMPLETED', 'FAILURE'),
Build('builder-a', 100):
TryJobStatus('COMPLETED', 'FAILURE'),
Build('builder-b', 200):
TryJobStatus('COMPLETED', 'INFRA_FAILURE'),
})

def test_filter_latest(self):
Expand Down Expand Up @@ -555,15 +557,13 @@ def test_try_job_results(self):
}])
git_cl = GitCL(MockHost(web=web))
self.assertEqual(
git_cl.try_job_results(issue_number=None),
{
git_cl.try_job_results(issue_number=None), {
Build('builder-a', 111):
TryJobStatus('COMPLETED', 'SUCCESS'),
Build('builder-b', 222):
TryJobStatus('SCHEDULED', None),
# INFRA_FAILURE is mapped to FAILURE for this build.
Build('builder-c', 333):
TryJobStatus('COMPLETED', 'FAILURE'),
TryJobStatus('COMPLETED', 'INFRA_FAILURE'),
})

def test_try_job_results_skip_experimental_cq(self):
Expand Down
6 changes: 4 additions & 2 deletions third_party/blink/tools/blinkpy/common/net/rpc.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,14 +198,16 @@ def _make_get_build_body(self, build=None, build_fields=None):
request = {}
if build.build_id:
request['id'] = str(build.build_id)
if build.builder_name:
elif build.builder_name and build.build_number:
request['builder'] = {
'project': 'chromium',
'bucket': build.bucket,
'builder': build.builder_name
}
if build.build_number:
request['buildNumber'] = build.build_number
else:
raise ValueError('bad GetBuild request: must provide either '
'build ID or (builder and build number)')
if build_fields:
# The `builds.*` prefix is not needed for retrieving an individual
# build.
Expand Down
4 changes: 3 additions & 1 deletion third_party/blink/tools/blinkpy/common/net/rpc_unittest.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,8 @@ def test_execute_batch(self):
}
}],
})
self.client.add_get_build_req(Build('linux-rel'), build_fields=['id'])
self.client.add_get_build_req(Build('linux-rel', 123),
build_fields=['id'])
self.client.add_search_builds_req({}, ['id'])
build1, build2, build3 = self.client.execute_batch()
(url, request), = self.host.web.requests
Expand All @@ -153,6 +154,7 @@ def test_execute_batch(self):
'bucket': 'try',
'builder': 'linux-rel',
},
'buildNumber': 123,
'fields': 'id',
},
},
Expand Down
76 changes: 65 additions & 11 deletions third_party/blink/tools/blinkpy/tool/commands/build_resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,16 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import contextlib
import logging
import re
from typing import Collection, Dict, Optional, Tuple

from requests.exceptions import RequestException

from blinkpy.common import exit_codes
from blinkpy.common.net.rpc import Build
from blinkpy.common.net.web import Web
from blinkpy.common.net.git_cl import BuildStatuses, GitCL, TryJobStatus

_log = logging.getLogger(__name__)
Expand Down Expand Up @@ -34,12 +40,19 @@ class BuildResolver:

# Build fields required by `_build_statuses_from_responses`.
_build_fields = [
'id', 'number', 'builder.builder', 'builder.bucket', 'status'
'id',
'number',
'builder.builder',
'builder.bucket',
'status',
'steps.*.name',
'steps.*.logs.*.name',
'steps.*.logs.*.view_url',
]

def __init__(self,
git_cl: GitCL,
def __init__(self, web: Web, git_cl: GitCL,
can_trigger_jobs: bool = False):
self._web = web
self._git_cl = git_cl
self._can_trigger_jobs = can_trigger_jobs

Expand All @@ -54,7 +67,7 @@ def _build_statuses_from_responses(self, raw_builds) -> BuildStatuses:
return {
Build(build['builder']['builder'], build['number'], build['id'],
build['builder']['bucket']):
TryJobStatus.from_bb_status(build['status'])
self._status_if_interrupted(build)
for build in raw_builds
}

Expand Down Expand Up @@ -92,9 +105,15 @@ def resolve_builds(self,
build_statuses = {}
# Handle implied tryjobs first, since there are more failure modes.
if try_builders_to_infer:
build_statuses.update(
self.fetch_or_trigger_try_jobs(try_builders_to_infer,
patchset))
try_build_statuses = self.fetch_or_trigger_try_jobs(
try_builders_to_infer, patchset)
build_statuses.update(try_build_statuses)
# Re-request completed try builds so that the resolver can check
# for interrupted steps.
for build, (status, _) in try_build_statuses.items():
if build.build_number and status == 'COMPLETED':
self._git_cl.bb_client.add_get_build_req(
build, build_fields=self._build_fields)
# Find explicit or CI builds.
build_statuses.update(
self._build_statuses_from_responses(
Expand All @@ -107,10 +126,41 @@ def resolve_builds(self,
'please re-run the tool to fetch new results.')
return build_statuses

def fetch_or_trigger_try_jobs(self,
builders: Collection[str],
patchset: Optional[int] = None
) -> BuildStatuses:
def _status_if_interrupted(self, raw_build) -> TryJobStatus:
"""Map non-browser-related failures to an infrastructue failure status.
Such failures include shard-level timeouts and early exits caused by
exceeding the failure threshold. These failures are opaque to LUCI, but
can be discovered from `run_web_tests.py` exit code conventions.
"""
# TODO(crbug.com/1123077): After the switch to wptrunner, stop checking
# the `blink_wpt_tests` step.
run_web_tests_pattern = re.compile(
r'[\w_-]*blink_(web|wpt)_tests.*\(with patch\)[^|]*')
for step in raw_build.get('steps', []):
if run_web_tests_pattern.fullmatch(step['name']):
summary = self._fetch_swarming_summary(step)
shards = (summary or {}).get('shards', [])
if any(map(_shard_interrupted, shards)):
return TryJobStatus.from_bb_status('INFRA_FAILURE')
return TryJobStatus.from_bb_status(raw_build['status'])

def _fetch_swarming_summary(self,
step,
log_name: str = 'chromium_swarming.summary'):
for log in step.get('logs', []):
if log['name'] == log_name:
with contextlib.suppress(RequestException):
params = {'format': 'raw'}
return self._web.session.get(log['viewUrl'],
params=params).json()
return None

def fetch_or_trigger_try_jobs(
self,
builders: Collection[str],
patchset: Optional[int] = None,
) -> BuildStatuses:
"""Fetch or trigger try jobs for the current CL.
Arguments:
Expand Down Expand Up @@ -178,3 +228,7 @@ def _log_build_statuses(self, build_statuses: BuildStatuses):
_log.info(template, build.builder_name,
str(build.build_number or '--'), build_statuses[build],
build.bucket)


def _shard_interrupted(shard) -> bool:
return int(shard.get('exit_code', 0)) in exit_codes.ERROR_CODES
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
# found in the LICENSE file.

import json
from unittest.mock import Mock, call

from blinkpy.common import exit_codes
from blinkpy.common.host_mock import MockHost
from blinkpy.common.net.git_cl import TryJobStatus
from blinkpy.common.net.git_cl_mock import MockGitCL
Expand All @@ -21,10 +23,11 @@ class BuildResolverTest(LoggingTestCase):
def setUp(self):
super().setUp()
self.host = MockHost()
self.host.web.session = Mock()
# A CL should only be required for try builders without explicit build
# numbers.
self.git_cl = MockGitCL(self.host, issue_number='None')
self.resolver = BuildResolver(self.git_cl)
self.resolver = BuildResolver(self.host.web, self.git_cl)

def test_resolve_last_failing_ci_build(self):
self.host.web.append_prpc_response({
Expand Down Expand Up @@ -62,9 +65,14 @@ def test_resolve_last_failing_ci_build(self):
},
'status': 'FAILURE',
},
'fields': ('builds.*.id,builds.*.number,'
'fields': ('builds.*.id,'
'builds.*.number,'
'builds.*.builder.builder,'
'builds.*.builder.bucket,builds.*.status'),
'builds.*.builder.bucket,'
'builds.*.status,'
'builds.*.steps.*.name,'
'builds.*.steps.*.logs.*.name,'
'builds.*.steps.*.logs.*.view_url'),
},
}],
})
Expand Down Expand Up @@ -116,8 +124,14 @@ def test_resolve_builds_with_explicit_numbers(self):
},
'buildNumber':
123,
'fields': ('id,number,builder.builder,builder.bucket,'
'status'),
'fields': ('id,'
'number,'
'builder.builder,'
'builder.bucket,'
'status,'
'steps.*.name,'
'steps.*.logs.*.name,'
'steps.*.logs.*.view_url'),
},
}, {
'getBuild': {
Expand All @@ -128,8 +142,81 @@ def test_resolve_builds_with_explicit_numbers(self):
},
'buildNumber':
456,
'fields': ('id,number,builder.builder,builder.bucket,'
'status'),
'fields': ('id,'
'number,'
'builder.builder,'
'builder.bucket,'
'status,'
'steps.*.name,'
'steps.*.logs.*.name,'
'steps.*.logs.*.view_url'),
},
}],
})

def test_detect_interruption_from_shard_exit_codes(self):
self.host.web.append_prpc_response({
'responses': [{
'getBuild': {
'id':
str(build_num),
'builder': {
'builder': 'linux-rel',
'bucket': 'try',
},
'number':
build_num,
'status':
'FAILURE',
'steps': [{
'name': ('highdpi_blink_web_tests '
'(with patch) on Ubuntu-18.04 (2)'),
'logs': [{
'name':
'chromium_swarming.summary',
'viewUrl':
'https://logs.chromium.org/swarming',
}],
}],
},
} for build_num in [1, 2, 3]],
})

self.host.web.session.get.return_value.json.side_effect = [{
'shards': [{
'exit_code': '0',
}, {
'exit_code': str(exit_codes.INTERRUPTED_EXIT_STATUS),
}],
}, {
'shards': [{
'exit_code': '0',
}, {
'exit_code': str(exit_codes.EARLY_EXIT_STATUS),
}],
}, {
'shards': [{
'exit_code': '0',
}, {
'exit_code': '5',
}],
}]

build_statuses = self.resolver.resolve_builds([
Build('linux-rel', 1),
Build('linux-rel', 2),
Build('linux-rel', 3),
])
self.assertEqual([
call('https://logs.chromium.org/swarming',
params={'format': 'raw'}),
] * 3, self.host.web.session.get.call_args_list)
self.assertEqual(
build_statuses, {
Build('linux-rel', 1, '1'):
TryJobStatus('COMPLETED', 'INFRA_FAILURE'),
Build('linux-rel', 2, '2'):
TryJobStatus('COMPLETED', 'INFRA_FAILURE'),
Build('linux-rel', 3, '3'):
TryJobStatus('COMPLETED', 'FAILURE'),
})

0 comments on commit dad1b2d

Please sign in to comment.