ClickHouse · tavplubix · Apr 12, 2023 · Apr 12, 2023
diff --git a/utils/ci-slack-bot/ci-slack-bot.py b/utils/ci-slack-bot/ci-slack-bot.py
@@ -26,20 +26,22 @@
 
 DRY_RUN_MARK = "<no url, dry run>"
 
-MAX_FAILURES_DEFAULT = 50
+MAX_FAILURES_DEFAULT = 40
 SLACK_URL_DEFAULT = DRY_RUN_MARK
 
-EXTENDED_CHECK_PERIOD_MUL = 3
 FLAKY_ALERT_PROBABILITY = 0.20
 
-# Find tests that failed in master during the last check_period * 12 hours,
+# Slack splits messages that exceed its size limit into several parts, which breaks the formatting
+MESSAGE_LENGTH_LIMIT = 4000
+
+# Find tests that failed in master during the last check_period * 24 hours,
 # but did not fail during the last 2 weeks. Assuming these tests were broken recently.
-# Counts number of failures in check_period and check_period * 12 time windows
+# Counts number of failures in check_period and check_period * 24 time windows
 # to distinguish rare flaky tests from completely broken tests
 NEW_BROKEN_TESTS_QUERY = """
 WITH
     1 AS check_period,
-    check_period * 12 AS extended_check_period,
+    check_period * 24 AS extended_check_period,
     now() as now
 SELECT
     test_name,
@@ -155,7 +157,7 @@ def format_failed_tests_list(failed_tests, failure_type):
 
     for name, report in failed_tests:
         cidb_url = get_play_url(ALL_RECENT_FAILURES_QUERY.format(name))
-        res += " - *{}*  -  <{}|Report>  -  <{}|CI DB> \n".format(
+        res += "-   *{}*  -  <{}|Report>  -  <{}|CI DB> \n".format(
             name, report, cidb_url
         )
     return res
@@ -173,11 +175,14 @@ def get_new_broken_tests_message(failed_tests):
     if len(broken_tests) > 0:
         msg += format_failed_tests_list(broken_tests, "*BROKEN*")
     elif random.random() > FLAKY_ALERT_PROBABILITY:
-        # Should we report fuzzers unconditionally?
-        print("Will not report flaky tests to avoid noise: ", flaky_tests)
-        return None
+        looks_like_fuzzer = [x[0].count(" ") > 2 for x in flaky_tests]
+        if not any(looks_like_fuzzer):
+            print("Will not report flaky tests to avoid noise: ", flaky_tests)
+            return None
 
     if len(flaky_tests) > 0:
+        if len(msg) > 0:
+            msg += "\n"
         msg += format_failed_tests_list(flaky_tests, "flaky")
 
     return msg
@@ -187,7 +192,7 @@ def get_too_many_failures_message_impl(failures_count):
     MAX_FAILURES = int(os.environ.get("MAX_FAILURES", MAX_FAILURES_DEFAULT))
     curr_failures = int(failures_count[0][0])
     prev_failures = int(failures_count[0][1])
-    if curr_failures == 0:
+    if curr_failures == 0 and prev_failures != 0:
         return (
             "Looks like CI is completely broken: there are *no failures* at all... 0_o"
         )
@@ -213,7 +218,22 @@ def get_too_many_failures_message(failures_count):
     return msg
 
 
-def send_to_slack(message):
+def split_slack_message(long_message):
+    """Split text on newlines into chunks of at most MESSAGE_LENGTH_LIMIT chars.
+    A single line longer than the limit is kept whole as its own chunk."""
+    messages = []
+    curr_msg = None  # None = no chunk started; avoids a leading "\n" / empty chunk
+    for line in long_message.split("\n"):
+        joined = line if curr_msg is None else curr_msg + "\n" + line
+        if curr_msg is not None and len(joined) > MESSAGE_LENGTH_LIMIT:
+            messages.append(curr_msg)
+            joined = line
+        curr_msg = joined
+    messages.append(curr_msg)
+    return messages
+
+
+def send_to_slack_impl(message):
     SLACK_URL = os.environ.get("SLACK_URL", SLACK_URL_DEFAULT)
     if SLACK_URL == DRY_RUN_MARK:
         return
@@ -230,6 +250,12 @@ def send_to_slack(message):
         )
 
 
+def send_to_slack(message):
+    # Deliver the text piece by piece, one Slack post per chunk of the split message
+    for part in split_slack_message(message):
+        send_to_slack_impl(part)
+
+
 def query_and_alert_if_needed(query, get_message_func):
     query_res = run_clickhouse_query(query)
     print("Got result {} for query {}", query_res, query)