[blinkpy] Report failure reasons for blink web tests.

Report failure reasons for two categories of failures: - Testharness.js failures. Here the new failure(s) (if any) will be reported as the reason. - Other text diff-based failures. A snippet of diff will be supplied as the reason. Failure reasons will be used in Weetbix to perform failure clustering and to assist users link bugs to known failures using failure association rules. They will also be surfaced in MILO. BUG=1243174 TEST=third_party/blink/tools/run_blinkpy_tests.py Change-Id: Id4a1f61527cfea52c9a78e0179ac3070e40beb85 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/3539721 Reviewed-by: Rakib Hasan <rmhasan@google.com> Commit-Queue: Patrick Meiring <meiring@google.com> Cr-Commit-Position: refs/heads/main@{#984756}
chromium · Mar 24, 2022 · 989707b · 989707b
1 parent dd5ef8a
commit 989707b
Show file tree

Hide file tree

Showing 8 changed files with 264 additions and 21 deletions.
diff --git a/testing/scripts/run_android_wpt.pydeps b/testing/scripts/run_android_wpt.pydeps
@@ -67,6 +67,7 @@
 //third_party/blink/tools/blinkpy/web_tests/layout_package/bot_test_expectations.py
 //third_party/blink/tools/blinkpy/web_tests/layout_package/json_results_generator.py
 //third_party/blink/tools/blinkpy/web_tests/models/__init__.py
+//third_party/blink/tools/blinkpy/web_tests/models/failure_reason.py
 //third_party/blink/tools/blinkpy/web_tests/models/test_configuration.py
 //third_party/blink/tools/blinkpy/web_tests/models/test_expectations.py
 //third_party/blink/tools/blinkpy/web_tests/models/test_failures.py

diff --git a/third_party/blink/tools/blinkpy/web_tests/controllers/test_result_sink.py b/third_party/blink/tools/blinkpy/web_tests/controllers/test_result_sink.py
@@ -247,6 +247,13 @@ def sink(self, expected, result, expectations):
         if summaries:
             r['summaryHtml'] = '\n'.join(summaries)
 
+        if result.failure_reason:
+            primary_error_message = _truncate_to_utf8_bytes(
+                result.failure_reason.primary_error_message, 1024)
+            r['failureReason'] = {
+                'primaryErrorMessage': primary_error_message,
+            }
+
         self._send({'testResults': [r]})
 
     def close(self):
@@ -257,3 +264,31 @@ def close(self):
         if not self.is_closed:
             self.is_closed = True
             self._session.close()
+
+
+def _truncate_to_utf8_bytes(s, length):
+    """ Truncates a string to a given number of bytes when encoded as UTF-8.
+
+    Ensures the given string does not take more than length bytes when encoded
+    as UTF-8. Adds trailing ellipsis (...) if truncation occurred. A truncated
+    string may end up encoding to a length slightly shorter than length
+    because only whole Unicode codepoints are dropped.
+
+    Args:
+        s: The string to truncate.
+        length: the length (in bytes) to truncate to.
+    """
+    try:
+        encoded = s.encode('utf-8')
+    # When encode throws UnicodeDecodeError in py2, it usually means the str is
+    # already encoded and has non-ascii chars. So skip re-encoding it.
+    except UnicodeDecodeError:
+        encoded = s
+    if len(encoded) > length:
+        # Truncate, leaving space for trailing ellipsis (...).
+        encoded = encoded[:length - 3]
+        # Truncating the string encoded as UTF-8 may have left the final
+        # codepoint only partially present. Pass 'ignore' to acknowledge
+        # and ensure this is dropped.
+        return encoded.decode('utf-8', 'ignore') + "..."
+    return s
diff --git a/third_party/blink/tools/blinkpy/web_tests/controllers/test_result_sink_unittest.py b/third_party/blink/tools/blinkpy/web_tests/controllers/test_result_sink_unittest.py
@@ -15,7 +15,7 @@
 from blinkpy.common.path_finder import RELATIVE_WEB_TESTS
 from blinkpy.web_tests.controllers.test_result_sink import CreateTestResultSink
 from blinkpy.web_tests.controllers.test_result_sink import TestResultSink
-from blinkpy.web_tests.models import test_results
+from blinkpy.web_tests.models import test_results, failure_reason
 from blinkpy.web_tests.models.typ_types import ResultType
 from blinkpy.web_tests.port.test import add_manifest_to_mock_filesystem
 from blinkpy.web_tests.port.test import TestPort
@@ -347,3 +347,32 @@ def test_wpt_location_filename(self):
             'virtual/virtual_wpt/external/wpt/dom/ranges/Range-attributes.html',
             'external/wpt/dom/ranges/Range-attributes.html',
         )
+
+    def test_failure_reason(self):
+        tr = test_results.TestResult(test_name='test-name')
+        tr.failure_reason = failure_reason.FailureReason(
+            'primary error message')
+        sent_data = self.sink(True, tr)
+        self.assertDictEqual(sent_data['failureReason'], {
+            'primaryErrorMessage': 'primary error message',
+        })
+
+    def test_failure_reason_truncated(self):
+        # Swedish "Place of interest symbol", which encodes as 3 bytes in
+        # UTF-8. This is one Unicode code point.
+        poi = b'\xE2\x8C\x98'.decode('utf-8')
+        primary_error_message = poi * 350
+
+        # Test that the primary error message is truncated to 1K bytes in
+        # UTF-8 encoding.
+        tr = test_results.TestResult(test_name='test-name')
+        tr.failure_reason = failure_reason.FailureReason(primary_error_message)
+        sent_data = self.sink(True, tr)
+
+        # Ensure truncation has left only whole unicode code points.
+        # In this case, the output ends up being 1023 bytes, which is one
+        # byte less than the allowed size of 1024 bytes, as we do not want
+        # part of a unicode code point to be included in the output.
+        self.assertDictEqual(sent_data['failureReason'], {
+            'primaryErrorMessage': (poi * 340) + '...',
+        })
diff --git a/third_party/blink/tools/blinkpy/web_tests/models/failure_reason.py b/third_party/blink/tools/blinkpy/web_tests/models/failure_reason.py
@@ -0,0 +1,20 @@
+# Copyright 2022 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+
+# Information about why a test failed.
+# Modelled after https://source.chromium.org/chromium/infra/infra/+/master:go/src/go.chromium.org/luci/resultdb/proto/v1/failure_reason.proto
+class FailureReason(object):
+    def __init__(self, primary_error_message):
+        """Initialises a new failure reason.
+
+        Args:
+            primary_error_message: The error message that ultimately caused
+                    the test to fail. This should/ only be the error message
+                    and should not include any stack traces. In the case that
+                    a test failed due to multiple expectation failures, any
+                    immediately fatal failure should be chosen, or otherwise
+                    the first expectation failure.
+        """
+        self.primary_error_message = primary_error_message
diff --git a/third_party/blink/tools/blinkpy/web_tests/models/test_failures.py b/third_party/blink/tools/blinkpy/web_tests/models/test_failures.py
@@ -25,11 +25,13 @@
 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import re
 import six
 from six.moves import cPickle
 
 from blinkpy.web_tests.controllers import repaint_overlay
 from blinkpy.web_tests.models.typ_types import ResultType
+from blinkpy.web_tests.models.failure_reason import FailureReason
 from blinkpy.common.html_diff import html_diff
 from blinkpy.common.unified_diff import unified_diff
 
@@ -101,6 +103,9 @@
 
 _ext_to_file_type = {'.txt': 'text', '.png': 'image', '.wav': 'audio'}
 
+# Matches new failures in TestHarness.js tests.
+FAILURE_RE = re.compile(r'\+(?:FAIL|Harness Error\.) (.*)$')
+
 
 def has_failure_type(failure_type, failure_list):
     return any(isinstance(failure, failure_type) for failure in failure_list)
@@ -174,6 +179,13 @@ def message(self):
         """Returns a string describing the failure in more detail."""
         raise NotImplementedError
 
+    def failure_reason(self):
+        """Returns a FailureReason object describing why the test failed, if
+        the test failed and suitable reasons are available.
+        The information is used in LUCI result viewing UIs and will be
+        used to cluster similar failures together."""
+        return None
+
     def __eq__(self, other):
         return self.__class__.__name__ == other.__class__.__name__
 
@@ -309,38 +321,48 @@ def __init__(self, actual_driver_output, expected_driver_output):
                                           expected_driver_output)
         self.has_repaint_overlay = (
             repaint_overlay.result_contains_repaint_rects(
-                actual_driver_output.text)
+                self._actual_text())
             or repaint_overlay.result_contains_repaint_rects(
-                expected_driver_output.text))
+                self._expected_text()))
         self.file_ext = '.txt'
 
-    def create_artifacts(self, typ_artifacts, force_overwrite=False):
-        # TODO (rmhasan): See if you can can only output diff files for
-        # non empty text.
-        super(FailureText, self).create_artifacts(typ_artifacts,
-                                                  force_overwrite)
-
-        actual_text = ''
-        expected_text = ''
-        if self.expected_driver_output.text is not None:
+    def _actual_text(self):
+        if self.actual_driver_output and\
+                self.actual_driver_output.text is not None:
             if six.PY3:
                 # TODO(crbug/1197331): We should not decode here looks like.
                 # html_diff expects it to be bytes for comparing to account
                 # various types of encodings.
                 # html_diff.py and unified_diff.py use str types during
                 # diff fixup. Will handle it later.
-                expected_text = self.expected_driver_output.text.decode(
-                    'utf8', 'replace')
+                return self.actual_driver_output.text.decode('utf8', 'replace')
             else:
-                expected_text = self.expected_driver_output.text
+                return self.actual_driver_output.text
+        return ''
 
-        if self.actual_driver_output.text is not None:
+    def _expected_text(self):
+        if self.expected_driver_output and\
+                self.expected_driver_output.text is not None:
             if six.PY3:
-                # TODO(crbug/1197331): ditto as in the case of expected_text above.
-                actual_text = self.actual_driver_output.text.decode(
+                # TODO(crbug/1197331): We should not decode here looks like.
+                # html_diff expects it to be bytes for comparing to account
+                # various types of encodings.
+                # html_diff.py and unified_diff.py use str types during
+                # diff fixup. Will handle it later.
+                return self.expected_driver_output.text.decode(
                     'utf8', 'replace')
             else:
-                actual_text = self.actual_driver_output.text
+                return self.expected_driver_output.text
+        return ''
+
+    def create_artifacts(self, typ_artifacts, force_overwrite=False):
+        # TODO (rmhasan): See if you can can only output diff files for
+        # non empty text.
+        super(FailureText, self).create_artifacts(typ_artifacts,
+                                                  force_overwrite)
+
+        actual_text = self._actual_text()
+        expected_text = self._expected_text()
 
         artifacts_abs_path = self.filesystem.join(
             self.result_directory, typ_artifacts.ArtifactsSubDirectory())
@@ -374,6 +396,72 @@ def message(self):
     def text_mismatch_category(self):
         raise NotImplementedError
 
+    def failure_reason(self):
+        actual_text = self._actual_text()
+        expected_text = self._expected_text()
+        diff_content = unified_diff(expected_text, actual_text, "expected",
+                                    "actual")
+        diff_lines = diff_content.splitlines()
+
+        # Drop the standard diff header with the following lines:
+        # --- expected
+        # +++ actual
+        diff_lines = diff_lines[2:]
+
+        # Find the first block of additions and/or deletions in the diff.
+        # E.g.
+        #  Unchanged line
+        # -Old line 1  <-- Start of block
+        # -Old line 2
+        # +New line 1
+        # +New line 2  <-- End of block
+        #  Unchanged line
+        deleted_lines = []
+        added_lines = []
+        match_state = ''
+        for i, line in enumerate(diff_lines):
+            # A block of diffs starts with removals (-)
+            # and then additions (+). Any variation from this
+            # pattern indicates an end of this block of diffs.
+            if ((line.startswith(' ') and match_state != '')
+                    or (line.startswith('-') and match_state == '+')):
+                # End of block of additions and deletions.
+                break
+            if line.startswith('-'):
+                match_state = '-'
+                deleted_lines.append(line)
+            if line.startswith('+'):
+                match_state = '+'
+                added_lines.append(line)
+
+        primary_error = None
+
+        if 'This is a testharness.js-based test.' in actual_text:
+            # Testharness.js test. Find the first new failure (if any) and
+            # report it as the failure reason.
+            for i, line in enumerate(added_lines):
+                match = FAILURE_RE.match(line)
+                if match:
+                    primary_error = match.group(1)
+                    break
+        else:
+            # Reconstitute the first diff block, but put added lines before
+            # the deleted lines as they are usually more interesting.
+            # (New actual output more interesting than missing expected
+            # output, as it is likely to contain errors.)
+            first_diff_block = '\n'.join(added_lines + deleted_lines)
+
+            if len(first_diff_block) >= 30:
+                # Only use the diff if it is not tiny. If it is only the
+                # addition of an empty line at the end of the file or
+                # similar, it is unlikely to be useful.
+                primary_error = ('Unexpected Diff (+got, -want):\n' +
+                                 first_diff_block)
+
+        if primary_error:
+            return FailureReason(primary_error)
+        return None
+
 
 class FailureMissingResult(FailureText):
     def message(self):

diff --git a/third_party/blink/tools/blinkpy/web_tests/models/test_failures_unittest.py b/third_party/blink/tools/blinkpy/web_tests/models/test_failures_unittest.py
@@ -36,7 +36,7 @@
                                                     PassWithStderr,
                                                     FailureCrash,
                                                     FailureTimeout,
-                                                    TestFailure)
+                                                    TestFailure, FailureText)
 
 
 class TestFailuresTest(unittest.TestCase):
@@ -131,3 +131,61 @@ def init_test_failure(test_failure):
         pass_with_stderr.create_artifacts(artifacts)
         self.assertEqual('timeout with stderr',
                          host.filesystem.read_text_file('/dir/foo-stderr.txt'))
+
+    def test_failure_text_failure_reason_testharness_js(self):
+        expected_text = ''
+        actual_text = """Content-Type: text/plain
+This is a testharness.js-based test.
+FAIL Tests that the document gets overscroll event with right deltaX/Y attributes. promise_test: Unhandled rejection with value: "Document did not receive scrollend event."
+Harness: the test ran to completion."""
+
+        self._actual_output.text = actual_text.encode('utf8')
+        self._expected_output.text = expected_text.encode('utf8')
+
+        failure_text = FailureText(self._actual_output, self._expected_output)
+        failure_reason = failure_text.failure_reason()
+        self.assertIsNotNone(failure_reason)
+        self.assertEqual(
+            failure_reason.primary_error_message,
+            'Tests that the document gets overscroll event with right'
+            ' deltaX/Y attributes. promise_test: Unhandled rejection with'
+            ' value: "Document did not receive scrollend event."')
+
+    def test_failure_text_failure_reason_other(self):
+        expected_text = """retained line 1
+deleted line 1
+deleted line 2
+retained line 2
+"""
+
+        actual_text = """retained line 1
+new line 1
+retained line 2
+new line 2
+"""
+
+        self._actual_output.text = actual_text.encode('utf8')
+        self._expected_output.text = expected_text.encode('utf8')
+
+        failure_text = FailureText(self._actual_output, self._expected_output)
+        failure_reason = failure_text.failure_reason()
+        self.assertIsNotNone(failure_reason)
+        self.assertEqual(
+            failure_reason.primary_error_message,
+            'Unexpected Diff (+got, -want):\n'
+            '+new line 1\n'
+            '-deleted line 1\n'
+            '-deleted line 2')
+
+    def test_failure_text_failure_reason_empty(self):
+        # Construct a scenario in which the difference between the actual
+        # and expected text does not provide a useful failure reason.
+        expected_text = ''
+        actual_text = '\n'
+
+        self._actual_output.text = actual_text.encode('utf8')
+        self._expected_output.text = expected_text.encode('utf8')
+
+        failure_text = FailureText(self._actual_output, self._expected_output)
+        failure_reason = failure_text.failure_reason()
+        self.assertIsNone(failure_reason)
diff --git a/third_party/blink/tools/blinkpy/web_tests/models/test_results.py b/third_party/blink/tools/blinkpy/web_tests/models/test_results.py
@@ -113,6 +113,17 @@ def __init__(self,
             # FIXME: Setting this in the constructor makes this class hard to mutate.
             self.type = results.pop()
 
+        self.failure_reason = None
+        for failure in self.failures:
+            # Take the failure reason from any failure that has one.
+            # At time of writing, only one type of failure defines failure
+            # reasons, if this changes, we may want to change this to be
+            # more deterministic.
+            failure_reason = failure.failure_reason()
+            if failure_reason:
+                self.failure_reason = failure_reason
+                break
+
         # These are set by the worker, not by the driver, so they are not passed to the constructor.
         self.worker_name = ''
         self.shard_name = ''

diff --git a/third_party/blink/tools/blinkpy/web_tests/models/test_results_unittest.py b/third_party/blink/tools/blinkpy/web_tests/models/test_results_unittest.py
@@ -58,7 +58,8 @@ def test_results_has_stderr(self):
         self.assertTrue(result.has_stderr)
 
     def test_results_has_repaint_overlay(self):
-        driver_output = DriverOutput('"invalidations": [', None, None, None)
+        text = '"invalidations": ['.encode('utf8')
+        driver_output = DriverOutput(text, None, None, None)
         failures = [test_failures.FailureTextMismatch(driver_output, None)]
         result = TestResult('foo', failures=failures)
         self.assertTrue(result.has_repaint_overlay)