fix finding of failed tests in output of PyTorch test step #2859

Open · wants to merge 9 commits into develop
165 changes: 100 additions & 65 deletions easybuild/easyblocks/p/pytorch.py
@@ -32,6 +32,7 @@
import re
import tempfile
import easybuild.tools.environment as env
from collections import namedtuple
from distutils.version import LooseVersion
from easybuild.easyblocks.generic.pythonpackage import PythonPackage
from easybuild.framework.easyconfig import CUSTOM
@@ -42,6 +43,94 @@
from easybuild.tools.systemtools import POWER, get_cpu_architecture


def get_count_for_pattern(regex, text):
    """Match the regexp containing a single group and return the integer value of the matched group.
    Return zero if there is no match or more than one match, and print a warning in the latter case.
    """
    match = re.findall(regex, text)
    if len(match) == 1:
        return int(match[0])
    elif len(match) > 1:
        # Shouldn't happen, but means something went wrong with the regular expressions.
        # Print a warning, as the build might be fine; no need to error on this.
        warn_msg = "Error in counting the number of test failures in the output of the PyTorch test suite.\n"
        warn_msg += "Please check the EasyBuild log to verify the number of failures (if any) was acceptable."
        print_warning(warn_msg)
    return 0


def extract_failed_tests_info(tests_out):
    """
    Extract information about failed tests from output produced by PyTorch tests
    """

    # Create clear summary report
    failure_report = ""
    failure_cnt = 0
    error_cnt = 0
    failed_test_suites = []

    # Look for patterns like:
    # Ran 219 tests in 67.325s
    #
    # FAILED (errors=10, skipped=190, expected failures=6)
    # test_fx failed!
    regex = (r"^Ran (?P<test_cnt>[0-9]+) tests.*$\n\n"
             r"FAILED \((?P<failure_summary>.*)\)$\n"
             r"(?:^(?:(?!failed!).)*$\n)*"
             r"(?P<failed_test_suite_name>.*) failed!(?: Received signal: \w+)?\s*$")

    for match in re.finditer(regex, tests_out, re.M):
        # E.g. 'failures=3, errors=10, skipped=190, expected failures=6'
        failure_summary, test_cnt, test_suite = match.group('failure_summary', 'test_cnt', 'failed_test_suite_name')
        failure_report += test_suite + ' (' + test_cnt + " total tests, " + failure_summary + ')\n'
        failure_cnt += get_count_for_pattern(r"(?<!expected )failures=([0-9]+)", failure_summary)
        error_cnt += get_count_for_pattern(r"errors=([0-9]+)", failure_summary)
        failed_test_suites.append(test_suite)

    # Look for patterns like:
    # ===================== 2 failed, 128 passed, 2 skipped, 2 warnings in 3.43s =====================
    regex = r"^=+ (?P<failure_summary>.*) in [0-9]+\.*[0-9]*[a-zA-Z]* =+$\n(?P<failed_test_suite_name>.*) failed!$"

    for match in re.finditer(regex, tests_out, re.M):
        # E.g. '2 failed, 128 passed, 2 skipped, 2 warnings'
        failure_summary, test_suite = match.group('failure_summary', 'failed_test_suite_name')
        failure_report += test_suite + ' ' + failure_summary + '\n'
        failure_cnt += get_count_for_pattern(r"([0-9]+) failed", failure_summary)
        error_cnt += get_count_for_pattern(r"([0-9]+) error", failure_summary)
        failed_test_suites.append(test_suite)

    # Look for patterns like:
    # AssertionError: 4 unit test(s) failed:
    # ...
    # distributed/test_c10d_gloo failed!
    regex = (r"^AssertionError: (?P<failed_test_cnt>[0-9]+) unit test\(s\) failed:$\n"
             r"(?:^(?:(?!failed!).)*$\n)*"
             r"(?P<failed_test_group_name>.*) failed!$")
    for match in re.finditer(regex, tests_out, re.M):
        test_group, failed_test_cnt = match.group('failed_test_group_name', 'failed_test_cnt')
        failure_report += test_group + ' (' + failed_test_cnt + " failed tests)\n"
        failure_cnt += int(failed_test_cnt)
        failed_test_suites.append(test_group)

    # Look for patterns like:
    # Running test_jit_cuda_fuser ... [2023-01-12 04:04:08.949222]
    # ...
    # AttributeError: 'NoneType' object has no attribute 'split'
    # test_jit_cuda_fuser failed!
    regex = (r"^Running (?P<test_name>test_[^\s]+) .*\n"
             r"(?:^(?:(?!(Ran [0-9]+|failed!)).)*$\n)*"
             r"\1 failed!$")
    for match in re.finditer(regex, tests_out, re.M):
        test_name = match.group('test_name')
        failure_report += test_name + " (unknown failed test count)\n"
        failure_cnt += 1
        failed_test_suites.append(test_name)

    TestsInfo = namedtuple('TestsInfo', ('failure_report', 'failure_cnt', 'error_cnt', 'failed_test_suites'))
    return TestsInfo(failure_report, failure_cnt, error_cnt, failed_test_suites)


class EB_PyTorch(PythonPackage):
"""Support for building/installing PyTorch."""

@@ -268,65 +357,11 @@ def test_step(self):

        tests_out, tests_ec = super(EB_PyTorch, self).test_step(return_output_ec=True)

        def get_count_for_pattern(regex, text):
            """Match the regexp containing a single group and return the integer value of the matched group.
            Return zero if no or more than 1 match was found and warn for the latter case
            """
            match = re.findall(regex, text)
            if len(match) == 1:
                return int(match[0])
            elif len(match) > 1:
                # Shouldn't happen, but means something went wrong with the regular expressions.
                # Throw warning, as the build might be fine, no need to error on this.
                warn_msg = "Error in counting the number of test failures in the output of the PyTorch test suite.\n"
                warn_msg += "Please check the EasyBuild log to verify the number of failures (if any) was acceptable."
                print_warning(warn_msg)
            return 0

        # Create clear summary report
        failure_report = ""
        failure_cnt = 0
        error_cnt = 0
        failed_test_suites = []

        # Grep for patterns like:
        # Ran 219 tests in 67.325s
        #
        # FAILED (errors=10, skipped=190, expected failures=6)
        # test_fx failed!
        regex = (r"^Ran (?P<test_cnt>[0-9]+) tests.*$\n\n"
                 r"FAILED \((?P<failure_summary>.*)\)$\n"
                 r"(?:^(?:(?!failed!).)*$\n)*"
                 r"(?P<failed_test_suite_name>.*) failed!(?: Received signal: \w+)?\s*$")

        for m in re.finditer(regex, tests_out, re.M):
            # E.g. 'failures=3, errors=10, skipped=190, expected failures=6'
            failure_summary = m.group('failure_summary')
            total, test_suite = m.group('test_cnt', 'failed_test_suite_name')
            failure_report += "{test_suite} ({total} total tests, {failure_summary})\n".format(
                test_suite=test_suite, total=total, failure_summary=failure_summary
            )
            failure_cnt += get_count_for_pattern(r"(?<!expected )failures=([0-9]+)", failure_summary)
            error_cnt += get_count_for_pattern(r"errors=([0-9]+)", failure_summary)
            failed_test_suites.append(test_suite)

        # Grep for patterns like:
        # ===================== 2 failed, 128 passed, 2 skipped, 2 warnings in 3.43s =====================
        regex = r"^=+ (?P<failure_summary>.*) in [0-9]+\.*[0-9]*[a-zA-Z]* =+$\n(?P<failed_test_suite_name>.*) failed!$"

        for m in re.finditer(regex, tests_out, re.M):
            # E.g. '2 failed, 128 passed, 2 skipped, 2 warnings'
            failure_summary = m.group('failure_summary')
            test_suite = m.group('failed_test_suite_name')
            failure_report += "{test_suite} ({failure_summary})\n".format(
                test_suite=test_suite, failure_summary=failure_summary
            )
            failure_cnt += get_count_for_pattern(r"([0-9]+) failed", failure_summary)
            error_cnt += get_count_for_pattern(r"([0-9]+) error", failure_summary)
            failed_test_suites.append(test_suite)
        failed_tests_info = extract_failed_tests_info(tests_out)
        failure_report = failed_tests_info.failure_report
Contributor comment:

Nit: To avoid long, distracting lines like sorted(set(failed_tests_info.failed_test_suites)) ("failed" appears twice), maybe either unpack the named tuple here, e.g. failure_report, failed_test_suites = failed_tests_info.failure_report, failed_tests_info.failed_test_suites, or rename the variable to test_info, since test_info.failed_test_suites is easier to read.
And maybe the test_cnt extraction can be done in there too, which would fit the name even better (then drop the "failed" from the function name).
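
A minimal sketch of the rename variant (hypothetical, not code from this diff):

# renaming the result keeps the later attribute accesses short
test_info = extract_failed_tests_info(tests_out)
failure_report = test_info.failure_report
failed_test_suites = sorted(set(test_info.failed_test_suites))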


        # Make the names unique and sorted
        failed_test_suites = sorted(set(failed_test_suites))
        failed_test_suites = sorted(set(failed_tests_info.failed_test_suites))
        # Gather all failed test suites in case we missed any (e.g. when it exited due to syntax errors)
        # Also unique and sorted to be able to compare the lists below
        all_failed_test_suites = sorted(set(
@@ -341,17 +376,17 @@
            failure_report += '\n' + failure_report_save

        # Calculate the number of unsuccessful tests and the total number of tests
        failed_test_cnt = failure_cnt + error_cnt
        failed_test_cnt = failed_tests_info.failure_cnt + failed_tests_info.error_cnt
        test_cnt = sum(int(hit) for hit in re.findall(r"^Ran (?P<test_cnt>[0-9]+) tests in", tests_out, re.M))
        max_failed_tests = self.cfg['max_failed_tests']
        self.log.info("%d unsuccessful tests (out of %d) - max. failed tests set to %d",
                      failed_test_cnt, test_cnt, max_failed_tests)

        if failed_test_cnt > 0:
            max_failed_tests = self.cfg['max_failed_tests']

            failure_or_failures = 'failure' if failure_cnt == 1 else 'failures'
            error_or_errors = 'error' if error_cnt == 1 else 'errors'
            msg = "%d test %s, %d test %s (out of %d):\n" % (
                failure_cnt, failure_or_failures, error_cnt, error_or_errors, test_cnt
            )
            failure_or_failures = 'failure' if failed_tests_info.failure_cnt == 1 else 'failures'
            msg = "%d test %s, " % (failed_tests_info.failure_cnt, failure_or_failures)
            error_or_errors = 'error' if failed_tests_info.error_cnt == 1 else 'errors'
            msg += "%d test %s (out of %d):\n" % (failed_tests_info.error_cnt, error_or_errors, test_cnt)
            msg += failure_report

            # If no tests are supposed to fail, or some tests failed for which we could not count the failures, fail now
125 changes: 125 additions & 0 deletions test/easyblocks/easyblock_specific.py
@@ -40,6 +40,7 @@
from easybuild.base.testing import TestCase
from easybuild.easyblocks.generic.cmakemake import det_cmake_version
from easybuild.easyblocks.generic.toolchain import Toolchain
from easybuild.easyblocks import pytorch
from easybuild.framework.easyblock import EasyBlock, get_easyblock_instance
from easybuild.framework.easyconfig.easyconfig import process_easyconfig
from easybuild.tools import config
@@ -51,6 +52,97 @@
from easybuild.tools.options import set_tmpdir
from easybuild.tools.py2vs3 import StringIO

PYTORCH_TESTS_OUTPUT = """
...
AssertionError: Expected zero exit code but got -6 for pid: 2006681

----------------------------------------------------------------------
Ran 2 tests in 6.576s

FAILED (failures=2)
distributed/fsdp/test_fsdp_input failed!
Running distributed/fsdp/test_fsdp_multiple_forward ... [2023-01-12 05:46:45.746098]

RuntimeError: Process 0 terminated or timed out after 610.0615825653076 seconds

----------------------------------------------------------------------
Ran 1 test in 610.744s

FAILED (errors=1)
Test exited with non-zero exitcode 1. Command to reproduce: /software/Python/3.9.6-GCCcore-11.2.0/bin/python distributed/test_c10d_gloo.py -v DistributedDataParallelTest.test_ddp_comm_hook_register_just_once

RuntimeError: Process 0 terminated or timed out after 610.0726096630096 seconds

----------------------------------------------------------------------
Ran 1 test in 610.729s

FAILED (errors=1)
Test exited with non-zero exitcode 1. Command to reproduce: /software/Python/3.9.6-GCCcore-11.2.0/bin/python distributed/test_c10d_gloo.py -v DistributedDataParallelTest.test_ddp_invalid_comm_hook_init
test_ddp_invalid_comm_hook_return_type (__main__.DistributedDataParallelTest)

AssertionError: 4 unit test(s) failed:
DistributedDataParallelTest.test_ddp_comm_hook_register_just_once
DistributedDataParallelTest.test_ddp_invalid_comm_hook_init
ProcessGroupGlooTest.test_round_robin
ProcessGroupGlooTest.test_round_robin_create_destroy
distributed/test_c10d_gloo failed!
Running distributed/test_c10d_nccl ... [2023-01-12 07:43:41.085197]

ValueError: For each axis slice, the sum of the observed frequencies must agree with the sum of the expected frequencies to a relative tolerance of 1e-08, but the percent differences are:
4.535600093557479e-05

----------------------------------------------------------------------
Ran 216 tests in 22.396s

FAILED (errors=4)
distributions/test_distributions failed!

Running test_autograd ... [2023-01-13 04:19:25.587981]
Executing ['/software/Python/3.9.6-GCCcore-11.2.0/bin/python', 'test_autograd.py', '-v'] ... [2023-01-13 04:19:25.588074]
...
test_autograd_views_codegen (__main__.TestAutograd) ... ok
...
======================================================================
FAIL: test_thread_shutdown (__main__.TestAutograd)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/tmp/vsc40023/easybuild_build/PyTorch/1.11.0/foss-2021b/pytorch-v1.11.0/test/test_autograd.py", line 4220, in test_thread_shutdown
self.assertRegex(s, "PYTORCH_API_USAGE torch.autograd.thread_shutdown")
AssertionError: Regex didn't match: 'PYTORCH_API_USAGE torch.autograd.thread_shutdown' not found in 'PYTORCH_API_USAGE torch.python.import\nPYTORCH_API_USAGE c10d.python.import\nPYTORCH_API_USAGE tensor.create\n'
----------------------------------------------------------------------
Ran 464 tests in 18.443s

FAILED (failures=1, skipped=52, expected failures=1)
test_autograd failed!
Running test_binary_ufuncs ... [2023-01-12 09:02:45.049490]
...

Running test_jit_cuda_fuser ... [2023-01-12 04:04:08.949222]
Executing ['/software/Python/3.9.6-GCCcore-11.2.0/bin/python', 'test_jit_cuda_fuser.py', '-v'] ... [2023-01-12 04:04:08.949319]
CUDA not available, skipping tests
monkeytype is not installed. Skipping tests for Profile-Directed Typing
Traceback (most recent call last):
File "/tmp/easybuild_build/PyTorch/1.11.0/foss-2021b/pytorch-v1.11.0/test/test_jit_cuda_fuser.py", line 25, in <module>
CUDA_MAJOR, CUDA_MINOR = (int(x) for x in torch.version.cuda.split('.'))
AttributeError: 'NoneType' object has no attribute 'split'
test_jit_cuda_fuser failed!
...
Running distributions/test_constraints ... [2023-01-12 09:05:15.013470]
SKIPPED [2] distributions/test_constraints.py:83: `biject_to` not implemented.
FAILED distributions/test_constraints.py::test_constraint[True-constraint_fn5-False-value5]
FAILED distributions/test_constraints.py::test_constraint[True-constraint_fn7-True-value7]
============= 2 failed, 128 passed, 2 skipped, 2 warnings in 8.66s =============
distributions/test_constraints failed!

Running distributions/rpc/test_tensorpipe_agent ... [2023-01-12 09:06:37.093571]
...
Ran 123 tests in 7.549s

FAILED (errors=2, skipped=2)
...
test_fx failed! Received signal: SIGSEGV
""" # noqa


class EasyBlockSpecificTest(TestCase):
""" Baseclass for easyblock testcases """
@@ -265,6 +357,39 @@ def test_det_cmake_version(self):
"""))
self.assertEqual(det_cmake_version(), '1.2.3-rc4')

    def test_pytorch_extract_failed_tests_info(self):
        """
        Test extract_failed_tests_info function from PyTorch easyblock.
        """
        res = pytorch.extract_failed_tests_info(PYTORCH_TESTS_OUTPUT)
        self.assertEqual(len(res), 4)

        expected_failure_report = '\n'.join([
            "distributed/fsdp/test_fsdp_input (2 total tests, failures=2)",
            "distributions/test_distributions (216 total tests, errors=4)",
            "test_autograd (464 total tests, failures=1, skipped=52, expected failures=1)",
            "test_fx (123 total tests, errors=2, skipped=2)",
Contributor comment:

This or the SIGSEGV test needs to be a different one, or failed_test_suites could pass even when the signal was not detected: in the case of a forced termination due to a signal there won't be a test summary, will there?

Contributor comment (@Flamefire, Jan 16, 2023):

Looks like we have a bug here. For the output of

Running distributions/rpc/test_tensorpipe_agent ... [2023-01-12 09:06:37.093571]
...
Ran 123 tests in 7.549s
FAILED (errors=2, skipped=2)
...
test_fx failed! Received signal: SIGSEGV

it must NOT match the RegExp. The original, correct output is:

FAILED (errors=2, skipped=2)
distributed/rpc/test_tensorpipe_agent failed!

So the regexp must be changed to only count the "FAILED" summary if the next line is the failed test; see https://gist.github.com/Flamefire/dc1403ccefdebfc3412c6fbb2d5cbabd#file-pytorch-1-9-0-foss-2020b_partial-log-L480
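
A minimal sketch of that tightening (hypothetical, not the pattern in this diff): drop the group that allows arbitrary lines between the FAILED summary and the "failed!" marker, so the counts are only attributed to the suite named on the immediately following line:

# "<suite> failed!" must directly follow the "FAILED (...)" summary line
regex = (r"^Ran (?P<test_cnt>[0-9]+) tests.*$\n\n"
         r"FAILED \((?P<failure_summary>.*)\)$\n"
         r"(?P<failed_test_suite_name>.*) failed!(?: Received signal: \w+)?\s*$")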

I would actually add this wrong output as a test against regressions. With that snippet it should only find test_fx as failed (with an unknown number of tests).
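
A sketch of such a regression test, asserting the behavior this comment calls for (WRONG_OUTPUT is a hypothetical fixture holding the misleading snippet quoted above):

def test_pytorch_sigsegv_output(self):
    """A FAILED summary of another suite must not be attributed to a suite killed by a signal."""
    res = pytorch.extract_failed_tests_info(WRONG_OUTPUT)
    # the 'FAILED (errors=2, skipped=2)' summary belongs to test_tensorpipe_agent,
    # so no errors may be counted from this snippet
    self.assertEqual(res.error_cnt, 0)
    # only test_fx should be reported as failed, with an unknown number of tests
    self.assertEqual(res.failed_test_suites, ['test_fx'])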

"distributions/test_constraints 2 failed, 128 passed, 2 skipped, 2 warnings",
"distributed/test_c10d_gloo (4 failed tests)",
"test_jit_cuda_fuser (unknown failed test count)",
])
        self.assertEqual(res.failure_report.strip(), expected_failure_report)
        # test failures
        self.assertEqual(res.failure_cnt, 10)
        # test errors
        self.assertEqual(res.error_cnt, 6)

        expected_failed_test_suites = [
            'distributed/fsdp/test_fsdp_input',
            'distributions/test_distributions',
            'test_autograd',
            'test_fx',
            'distributions/test_constraints',
            'distributed/test_c10d_gloo',
            'test_jit_cuda_fuser',
        ]
        self.assertEqual(res.failed_test_suites, expected_failed_test_suites)


def suite():
"""Return all easyblock-specific tests."""