Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make CI slack bot less noisy #48409

Merged
merged 1 commit into from
Apr 5, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
108 changes: 96 additions & 12 deletions utils/ci-slack-bot/ci-slack-bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import os
import json
import base64
import random

if os.environ.get("AWS_LAMBDA_ENV", "0") == "1":
# For AWS labmda (python 3.7)
Expand All @@ -28,25 +29,34 @@
MAX_FAILURES_DEFAULT = 50
SLACK_URL_DEFAULT = DRY_RUN_MARK

# Find tests that failed in master during the last check_period hours,
EXTENDED_CHECK_PERIOD_MUL = 3
FLAKY_ALERT_PROBABILITY = 0.20

# Find tests that failed in master during the last check_period * 12 hours,
# but did not fail during the last 2 weeks. Assuming these tests were broken recently.
# NOTE: It may report flaky tests that fail too rarely.
# Counts number of failures in check_period and check_period * 12 time windows
# to distinguish rare flaky tests from completely broken tests
NEW_BROKEN_TESTS_QUERY = """
WITH
1 AS check_period,
check_period * 12 AS extended_check_period,
now() as now
SELECT test_name, any(report_url)
SELECT
test_name,
any(report_url),
countIf((check_start_time + check_duration_ms / 1000) < now - INTERVAL check_period HOUR) AS count_prev_periods,
countIf((check_start_time + check_duration_ms / 1000) >= now - INTERVAL check_period HOUR) AS count
FROM checks
WHERE 1
AND check_start_time >= now - INTERVAL 1 WEEK
AND (check_start_time + check_duration_ms / 1000) >= now - INTERVAL check_period HOUR
AND check_start_time BETWEEN now - INTERVAL 1 WEEK AND now
AND (check_start_time + check_duration_ms / 1000) >= now - INTERVAL extended_check_period HOUR
AND pull_request_number = 0
AND test_status LIKE 'F%'
AND check_status != 'success'
AND test_name NOT IN (
SELECT test_name FROM checks WHERE 1
AND check_start_time >= now - INTERVAL 1 MONTH
AND (check_start_time + check_duration_ms / 1000) BETWEEN now - INTERVAL 2 WEEK AND now - INTERVAL check_period HOUR
AND (check_start_time + check_duration_ms / 1000) BETWEEN now - INTERVAL 2 WEEK AND now - INTERVAL extended_check_period HOUR
AND pull_request_number = 0
AND check_status != 'success'
AND test_status LIKE 'F%')
Expand Down Expand Up @@ -74,6 +84,27 @@
AND check_name ILIKE check_name_pattern
"""

# It shows all recent failures of the specified test (helps to find when it started)
ALL_RECENT_FAILURES_QUERY = """
WITH
'{}' AS name_substr,
90 AS interval_days,
('Stateless tests (asan)', 'Stateless tests (address)', 'Stateless tests (address, actions)') AS backport_and_release_specific_checks
SELECT
toStartOfDay(check_start_time) AS d,
count(),
groupUniqArray(pull_request_number) AS prs,
any(report_url)
FROM checks
WHERE ((now() - toIntervalDay(interval_days)) <= check_start_time) AND (pull_request_number NOT IN (
SELECT pull_request_number AS prn
FROM checks
WHERE (prn != 0) AND ((now() - toIntervalDay(interval_days)) <= check_start_time) AND (check_name IN (backport_and_release_specific_checks))
)) AND (position(test_name, name_substr) > 0) AND (test_status IN ('FAIL', 'ERROR', 'FLAKY'))
GROUP BY d
ORDER BY d DESC
"""

SLACK_MESSAGE_JSON = {"type": "mrkdwn", "text": None}


Expand All @@ -97,16 +128,62 @@ def run_clickhouse_query(query):
return [x.split("\t") for x in lines]


def get_new_broken_tests_message(broken_tests):
if not broken_tests:
def split_broken_and_flaky_tests(failed_tests):
if not failed_tests:
return None

broken_tests = []
flaky_tests = []
for name, report, count_prev_str, count_str in failed_tests:
count_prev, count = int(count_prev_str), int(count_str)
if (2 <= count and count_prev < 2) or (count_prev == 1 and count == 1):
# It failed 2 times or more within extended time window, it's definitely broken.
# 2 <= count_prev means that it was not reported as broken on previous runs
broken_tests.append([name, report])
elif 0 < count and count_prev == 0:
# It failed only once, can be a rare flaky test
flaky_tests.append([name, report])

return broken_tests, flaky_tests


def format_failed_tests_list(failed_tests, failure_type):
if len(failed_tests) == 1:
res = "There is a new {} test:\n".format(failure_type)
else:
res = "There are {} new {} tests:\n".format(len(failed_tests), failure_type)

for name, report in failed_tests:
cidb_url = get_play_url(ALL_RECENT_FAILURES_QUERY.format(name))
res += " - *{}* - <{}|Report> - <{}|CI DB> \n".format(
name, report, cidb_url
)
return res


def get_new_broken_tests_message(failed_tests):
if not failed_tests:
return None

broken_tests, flaky_tests = split_broken_and_flaky_tests(failed_tests)
if len(broken_tests) == 0 and len(flaky_tests) == 0:
return None

msg = ""
if len(broken_tests) > 0:
msg += format_failed_tests_list(broken_tests, "*BROKEN*")
elif random.random() > FLAKY_ALERT_PROBABILITY:
# Should we report fuzzers unconditionally?
print("Will not report flaky tests to avoid noise: ", flaky_tests)
return None
msg = "There are {} new broken tests in master:\n".format(len(broken_tests))
for name, report in broken_tests:
msg += " - *{}* - <{}|Report>\n".format(name, report)

if len(flaky_tests) > 0:
msg += format_failed_tests_list(flaky_tests, "flaky")

return msg


def get_too_many_failures_message(failures_count):
def get_too_many_failures_message_impl(failures_count):
MAX_FAILURES = int(os.environ.get("MAX_FAILURES", MAX_FAILURES_DEFAULT))
curr_failures = int(failures_count[0][0])
prev_failures = int(failures_count[0][1])
Expand All @@ -129,6 +206,13 @@ def get_too_many_failures_message(failures_count):
)


def get_too_many_failures_message(failures_count):
msg = get_too_many_failures_message_impl(failures_count)
if msg:
msg += "\nSee https://aretestsgreenyet.com/"
return msg


def send_to_slack(message):
SLACK_URL = os.environ.get("SLACK_URL", SLACK_URL_DEFAULT)
if SLACK_URL == DRY_RUN_MARK:
Expand Down