From 774ab783761916a2e6d8db9491f7ff12fd094045 Mon Sep 17 00:00:00 2001 From: Julia Date: Tue, 30 Apr 2024 13:55:45 +0200 Subject: [PATCH 1/5] Add `service_name` label to metrics collector --- .../metrics_exporter/metrics_collectors.py | 52 ++++++------------- 1 file changed, 15 insertions(+), 37 deletions(-) diff --git a/engine/apps/metrics_exporter/metrics_collectors.py b/engine/apps/metrics_exporter/metrics_collectors.py index 302f5756a4..282e57180d 100644 --- a/engine/apps/metrics_exporter/metrics_collectors.py +++ b/engine/apps/metrics_exporter/metrics_collectors.py @@ -9,6 +9,7 @@ from apps.metrics_exporter.constants import ( ALERT_GROUPS_RESPONSE_TIME, ALERT_GROUPS_TOTAL, + SERVICE_LABEL, USER_WAS_NOTIFIED_OF_ALERT_GROUPS, AlertGroupsResponseTimeMetricsDict, AlertGroupsTotalMetricsDict, @@ -52,7 +53,7 @@ def __init__(self): "team", ] + self._stack_labels - # + [SERVICE_LABEL] # todo:metrics: uncomment when all metric cache is updated (~2 after release) + + [SERVICE_LABEL] ) self._integration_labels_with_state = self._integration_labels + ["state"] self._user_labels = ["username"] + self._stack_labels @@ -100,24 +101,12 @@ def _get_alert_groups_total_metric(self, org_ids): integration_data["id"], # grafana instance id ] labels_values = list(map(str, labels_values)) - # clause below is needed for compatibility with old metric cache during rollout metrics with services - if "services" in integration_data: - count_per_state = {state.value: 0 for state in AlertGroupState} - for service_name in integration_data["services"]: - for state in AlertGroupState: - count_per_state[state.value] += integration_data["services"][service_name][state.value] - # todo:metrics: with enabling service_name label move "add_metric" under - # "for service_name..." iteration - for state_name, counter in count_per_state.items(): + for service_name in integration_data["services"]: + for state in AlertGroupState: alert_groups_total.add_metric( - labels_values + [state_name], - # todo:metrics: replace [state.value] when all metric cache is updated - # + [service_name, state.value], - counter, + labels_values + [service_name, state.value], + integration_data["services"][service_name][state.value], ) - else: - for state in AlertGroupState: - alert_groups_total.add_metric(labels_values + [state.value], integration_data[state.value]) org_id_from_key = RE_ALERT_GROUPS_TOTAL.match(org_key).groups()[0] processed_org_ids.add(int(org_id_from_key)) missing_org_ids = org_ids - processed_org_ids @@ -146,27 +135,16 @@ def _get_response_time_metric(self, org_ids): ] labels_values = list(map(str, labels_values)) - # clause below is needed for compatibility with old metric cache during rollout metrics with services - if "services" in integration_data: - response_time_values = [] - # todo:metrics: for service_name, response_time - for _, response_time in integration_data["services"].items(): - if not response_time: - continue - response_time_values.extend(response_time) - else: - response_time_values = integration_data["response_time"] - if not response_time_values: + for service_name, response_time in integration_data["services"].items(): + if not response_time: continue - # todo:metrics: with enabling service_name label move "add_metric" under - # "for service_name, response_time..." iteration - buckets, sum_value = self.get_buckets_with_sum(response_time_values) - buckets = sorted(list(buckets.items()), key=lambda x: float(x[0])) - alert_groups_response_time_seconds.add_metric( - labels_values, # + [service_name] todo:metrics: uncomment when all metric cache is updated - buckets=buckets, - sum_value=sum_value, - ) + buckets, sum_value = self.get_buckets_with_sum(response_time) + buckets = sorted(list(buckets.items()), key=lambda x: float(x[0])) + alert_groups_response_time_seconds.add_metric( + labels_values + [service_name], + buckets=buckets, + sum_value=sum_value, + ) org_id_from_key = RE_ALERT_GROUPS_RESPONSE_TIME.match(org_key).groups()[0] processed_org_ids.add(int(org_id_from_key)) missing_org_ids = org_ids - processed_org_ids From 1ad2f217e569c413f761f60b914a701ac5c7d68f Mon Sep 17 00:00:00 2001 From: Julia Date: Tue, 30 Apr 2024 13:57:56 +0200 Subject: [PATCH 2/5] Update tests --- .../apps/metrics_exporter/tests/conftest.py | 141 +------------- .../tests/test_metrics_collectors.py | 69 +++---- .../tests/test_update_metrics_cache.py | 183 +----------------- 3 files changed, 48 insertions(+), 345 deletions(-) diff --git a/engine/apps/metrics_exporter/tests/conftest.py b/engine/apps/metrics_exporter/tests/conftest.py index 2290e720f3..99ee837258 100644 --- a/engine/apps/metrics_exporter/tests/conftest.py +++ b/engine/apps/metrics_exporter/tests/conftest.py @@ -18,6 +18,7 @@ METRICS_TEST_INSTANCE_SLUG = "test_instance" METRICS_TEST_INSTANCE_ID = 292 # random number METRICS_TEST_USER_USERNAME = "Alex" +METRICS_TEST_SERVICE_NAME = "test_service" @pytest.fixture() @@ -45,84 +46,11 @@ def _mock_cache_get(key, *args, **kwargs): "acknowledged": 3, "resolved": 5, }, - }, - }, - }, - ALERT_GROUPS_RESPONSE_TIME: { - 1: { - "integration_name": "Test metrics integration", - "team_name": "Test team", - "team_id": 1, - "org_id": 1, - "slug": "Test stack", - "id": 1, - "services": { - NO_SERVICE_VALUE: [2, 10, 200, 650], - }, - } - }, - USER_WAS_NOTIFIED_OF_ALERT_GROUPS: { - 1: { - "org_id": 1, - "slug": "Test stack", - "id": 1, - "user_username": "Alex", - "counter": 4, - } - }, - } - return test_metrics.get(key) - - def _mock_cache_get_many(keys, *args, **kwargs): - return {key: _mock_cache_get(key) for key in keys if _mock_cache_get(key)} - - monkeypatch.setattr(cache, "get", _mock_cache_get) - monkeypatch.setattr(cache, "get_many", _mock_cache_get_many) - - -# todo:metrics: remove later when all cache is updated -@pytest.fixture() # used for test backwards compatibility with old version of metrics -def mock_cache_get_metrics_for_collector_mixed_versions(monkeypatch): - def _mock_cache_get(key, *args, **kwargs): - if ALERT_GROUPS_TOTAL in key: - key = ALERT_GROUPS_TOTAL - elif ALERT_GROUPS_RESPONSE_TIME in key: - key = ALERT_GROUPS_RESPONSE_TIME - elif USER_WAS_NOTIFIED_OF_ALERT_GROUPS in key: - key = USER_WAS_NOTIFIED_OF_ALERT_GROUPS - test_metrics = { - ALERT_GROUPS_TOTAL: { - 1: { - "integration_name": "Test metrics integration", - "team_name": "Test team", - "team_id": 1, - "org_id": 1, - "slug": "Test stack", - "id": 1, - "firing": 2, - "acknowledged": 3, - "silenced": 4, - "resolved": 5, - }, - 2: { - "integration_name": "Test metrics integration 2", - "team_name": "Test team", - "team_id": 1, - "org_id": 1, - "slug": "Test stack", - "id": 1, - "services": { - NO_SERVICE_VALUE: { - "firing": 2, - "silenced": 4, - "acknowledged": 3, - "resolved": 5, - }, - "test_service": { - "firing": 10, - "silenced": 10, - "acknowledged": 10, - "resolved": 10, + METRICS_TEST_SERVICE_NAME: { + "firing": 12, + "silenced": 14, + "acknowledged": 13, + "resolved": 15, }, }, }, @@ -135,17 +63,8 @@ def _mock_cache_get(key, *args, **kwargs): "org_id": 1, "slug": "Test stack", "id": 1, - "response_time": [2, 10, 200, 650], - }, - 2: { - "integration_name": "Test metrics integration 2", - "team_name": "Test team", - "team_id": 1, - "org_id": 1, - "slug": "Test stack", - "id": 1, - "services": {NO_SERVICE_VALUE: [2, 10, 200, 650], "test_service": [4, 8, 12]}, - }, + "services": {NO_SERVICE_VALUE: [2, 10, 200, 650], METRICS_TEST_SERVICE_NAME: [4, 12, 20]}, + } }, USER_WAS_NOTIFIED_OF_ALERT_GROUPS: { 1: { @@ -227,50 +146,6 @@ def cache_get(key, *args, **kwargs): return _make_cache_params -# todo:metrics: remove later when all cache is updated -@pytest.fixture -def make_metrics_cache_params_old_version(monkeypatch): - def _make_cache_params(integration_id, organization_id, team_name=None, team_id=None): - team_name = team_name or "No team" - team_id = team_id or "no_team" - metric_alert_groups_total_key = get_metric_alert_groups_total_key(organization_id) - metric_alert_groups_response_time_key = get_metric_alert_groups_response_time_key(organization_id) - - def cache_get(key, *args, **kwargs): - metrics_data = { - metric_alert_groups_response_time_key: { - integration_id: { - "integration_name": METRICS_TEST_INTEGRATION_NAME, - "team_name": team_name, - "team_id": team_id, - "org_id": METRICS_TEST_ORG_ID, - "slug": METRICS_TEST_INSTANCE_SLUG, - "id": METRICS_TEST_INSTANCE_ID, - "response_time": [], - } - }, - metric_alert_groups_total_key: { - integration_id: { - "integration_name": METRICS_TEST_INTEGRATION_NAME, - "team_name": team_name, - "team_id": team_id, - "org_id": METRICS_TEST_ORG_ID, - "slug": METRICS_TEST_INSTANCE_SLUG, - "id": METRICS_TEST_INSTANCE_ID, - "firing": 0, - "acknowledged": 0, - "silenced": 0, - "resolved": 0, - } - }, - } - return metrics_data.get(key, {}) - - return cache_get - - return _make_cache_params - - @pytest.fixture def make_user_was_notified_metrics_cache_params(monkeypatch): def _make_cache_params(user_id, organization_id): diff --git a/engine/apps/metrics_exporter/tests/test_metrics_collectors.py b/engine/apps/metrics_exporter/tests/test_metrics_collectors.py index 61921157c8..257e6c4efa 100644 --- a/engine/apps/metrics_exporter/tests/test_metrics_collectors.py +++ b/engine/apps/metrics_exporter/tests/test_metrics_collectors.py @@ -8,9 +8,11 @@ from apps.metrics_exporter.constants import ( ALERT_GROUPS_RESPONSE_TIME, ALERT_GROUPS_TOTAL, + NO_SERVICE_VALUE, USER_WAS_NOTIFIED_OF_ALERT_GROUPS, ) from apps.metrics_exporter.metrics_collectors import ApplicationMetricsCollector +from apps.metrics_exporter.tests.conftest import METRICS_TEST_SERVICE_NAME # redis cluster usage modifies the cache keys for some operations, so we need to test both cases @@ -24,17 +26,44 @@ def test_application_metrics_collector( ): """Test that ApplicationMetricsCollector generates expected metrics from cache""" + def get_expected_labels(service_name=NO_SERVICE_VALUE, **kwargs): + labels = { + "integration": "Test metrics integration", + "team": "Test team", + "org_id": "1", + "slug": "Test stack", + "id": "1", + "service_name": service_name, + } + labels.update(kwargs) + return labels + with override_settings(USE_REDIS_CLUSTER=use_redis_cluster): collector = ApplicationMetricsCollector() test_metrics_registry = CollectorRegistry() test_metrics_registry.register(collector) for metric in test_metrics_registry.collect(): if metric.name == ALERT_GROUPS_TOTAL: - # integration with labels for each alert group state - assert len(metric.samples) == len(AlertGroupState) + # integration with labels for each alert group state per service + assert len(metric.samples) == len(AlertGroupState) * 2 + assert {2, 3, 4, 5, 12, 13, 14, 15} == set(sample.value for sample in metric.samples) + # check that labels were set correctly + expected_labels_no_service = get_expected_labels(state="firing") + expected_labels_test_service = get_expected_labels(METRICS_TEST_SERVICE_NAME, state="firing") + metric_labels = [sample.labels for sample in metric.samples] + for expected_labels in [expected_labels_no_service, expected_labels_test_service]: + assert expected_labels in metric_labels elif metric.name == ALERT_GROUPS_RESPONSE_TIME: # integration with labels for each value in collector's bucket + _count and _sum histogram values - assert len(metric.samples) == len(collector._buckets) + 2 + assert len(metric.samples) == (len(collector._buckets) + 2) * 2 + # check that `_sum` values for both services are presented + assert {36, 862}.issubset(set(sample.value for sample in metric.samples)) + # check that labels were set correctly + expected_labels_no_service = get_expected_labels() + expected_labels_test_service = get_expected_labels(METRICS_TEST_SERVICE_NAME) + metric_labels = [sample.labels for sample in metric.samples] + for expected_labels in [expected_labels_no_service, expected_labels_test_service]: + assert expected_labels in metric_labels elif metric.name == USER_WAS_NOTIFIED_OF_ALERT_GROUPS: # metric with labels for each notified user assert len(metric.samples) == 1 @@ -44,37 +73,3 @@ def test_application_metrics_collector( # Since there is no recalculation timer for test org in cache, start_calculate_and_cache_metrics must be called assert mocked_start_calculate_and_cache_metrics.called test_metrics_registry.unregister(collector) - - -# todo:metrics: remove later when all cache is updated -@patch("apps.metrics_exporter.metrics_collectors.get_organization_ids", return_value=[1]) -@patch("apps.metrics_exporter.metrics_collectors.start_calculate_and_cache_metrics.apply_async") -@pytest.mark.django_db -def test_application_metrics_collector_mixed_cache( - mocked_org_ids, mocked_start_calculate_and_cache_metrics, mock_cache_get_metrics_for_collector_mixed_versions -): - """Test that ApplicationMetricsCollector generates expected metrics from previous and new versions of cache""" - - collector = ApplicationMetricsCollector() - test_metrics_registry = CollectorRegistry() - test_metrics_registry.register(collector) - for metric in test_metrics_registry.collect(): - if metric.name == ALERT_GROUPS_TOTAL: - # integration with labels for each alert group state - assert len(metric.samples) == len(AlertGroupState) * 2 - # check that values from different services were combined to one sample - assert {2, 3, 4, 5, 12, 13, 14, 15} == set(sample.value for sample in metric.samples) - elif metric.name == ALERT_GROUPS_RESPONSE_TIME: - # integration with labels for each value in collector's bucket + _count and _sum histogram values - assert len(metric.samples) == (len(collector._buckets) + 2) * 2 - # check that values from different services were combined to one sample - assert 7.0 in set(sample.value for sample in metric.samples) - elif metric.name == USER_WAS_NOTIFIED_OF_ALERT_GROUPS: - # metric with labels for each notified user - assert len(metric.samples) == 1 - result = generate_latest(test_metrics_registry).decode("utf-8") - assert result is not None - assert mocked_org_ids.called - # Since there is no recalculation timer for test org in cache, start_calculate_and_cache_metrics must be called - assert mocked_start_calculate_and_cache_metrics.called - test_metrics_registry.unregister(collector) diff --git a/engine/apps/metrics_exporter/tests/test_update_metrics_cache.py b/engine/apps/metrics_exporter/tests/test_update_metrics_cache.py index 2876c0aebd..7292f953ea 100644 --- a/engine/apps/metrics_exporter/tests/test_update_metrics_cache.py +++ b/engine/apps/metrics_exporter/tests/test_update_metrics_cache.py @@ -21,11 +21,10 @@ METRICS_TEST_INSTANCE_SLUG, METRICS_TEST_INTEGRATION_NAME, METRICS_TEST_ORG_ID, + METRICS_TEST_SERVICE_NAME, METRICS_TEST_USER_USERNAME, ) -TEST_SERVICE_VALUE = "Test_service" - @pytest.fixture def mock_apply_async(monkeypatch): @@ -159,23 +158,23 @@ def get_called_arg_index_and_compare_results(update_expected_result, service_nam alert_group.un_silence_by_user_or_backsync(user) arg_idx = get_called_arg_index_and_compare_results(expected_result_firing) - # create alert group with service label and check metric cache is updated properly + # set state values to default expected_result_metric_alert_groups_total[alert_receive_channel.id]["services"][NO_SERVICE_VALUE].update( default_state ) - + # create alert group with service label and check metric cache is updated properly alert_group_with_service = make_alert_group(alert_receive_channel) make_alert(alert_group=alert_group_with_service, raw_request_data={}) make_alert_group_label_association( - organization, alert_group_with_service, key_name=SERVICE_LABEL, value_name=TEST_SERVICE_VALUE + organization, alert_group_with_service, key_name=SERVICE_LABEL, value_name=METRICS_TEST_SERVICE_NAME ) alert_group_created_signal.send(sender=alert_group_with_service.__class__, alert_group=alert_group_with_service) # check alert_groups_total metric cache, get called args - arg_idx = get_called_arg_index_and_compare_results(expected_result_firing, TEST_SERVICE_VALUE) + arg_idx = get_called_arg_index_and_compare_results(expected_result_firing, METRICS_TEST_SERVICE_NAME) alert_group_with_service.resolve_by_user_or_backsync(user) - get_called_arg_index_and_compare_results(expected_result_resolved, TEST_SERVICE_VALUE) + get_called_arg_index_and_compare_results(expected_result_resolved, METRICS_TEST_SERVICE_NAME) @patch("apps.alerts.models.alert_group_log_record.tasks.send_update_log_report_signal.apply_async") @@ -282,11 +281,11 @@ def assert_cache_was_not_changed_by_response_time_metric(): alert_group_with_service = make_alert_group(alert_receive_channel) make_alert(alert_group=alert_group_with_service, raw_request_data={}) make_alert_group_label_association( - organization, alert_group_with_service, key_name=SERVICE_LABEL, value_name=TEST_SERVICE_VALUE + organization, alert_group_with_service, key_name=SERVICE_LABEL, value_name=METRICS_TEST_SERVICE_NAME ) assert_cache_was_not_changed_by_response_time_metric() alert_group_with_service.acknowledge_by_user_or_backsync(user) - get_called_arg_index_and_compare_results(TEST_SERVICE_VALUE) + get_called_arg_index_and_compare_results(METRICS_TEST_SERVICE_NAME) @pytest.mark.django_db @@ -676,169 +675,3 @@ def _expected_alert_groups_response_time(alert_receive_channel, response_time=No alert_receive_channel1.id: _expected_alert_groups_response_time(alert_receive_channel1), alert_receive_channel2.id: _expected_alert_groups_response_time(alert_receive_channel2, response_time=[12]), } - - -# todo:metrics: remove later when all cache is updated -@patch("apps.alerts.models.alert_group_log_record.tasks.send_update_log_report_signal.apply_async") -@patch("apps.alerts.tasks.send_alert_group_signal.alert_group_action_triggered_signal.send") -@pytest.mark.django_db -@override_settings(CELERY_TASK_ALWAYS_EAGER=True) -def test_update_metric_alert_groups_total_cache_on_action_backward_compatability( - mocked_send_log_signal, - mocked_action_signal_send, - mock_apply_async, - make_organization, - make_user_for_organization, - make_alert_receive_channel, - make_alert_group, - make_alert, - make_metrics_cache_params_old_version, - monkeypatch, -): - """Test update metric cache works properly with previous version of cache""" - organization = make_organization( - org_id=METRICS_TEST_ORG_ID, - stack_slug=METRICS_TEST_INSTANCE_SLUG, - stack_id=METRICS_TEST_INSTANCE_ID, - ) - user = make_user_for_organization(organization) - alert_receive_channel = make_alert_receive_channel(organization, verbal_name=METRICS_TEST_INTEGRATION_NAME) - - metric_alert_groups_total_key = get_metric_alert_groups_total_key(organization.id) - - expected_result_metric_alert_groups_total = { - alert_receive_channel.id: { - "integration_name": alert_receive_channel.verbal_name, - "team_name": "No team", - "team_id": "no_team", - "org_id": organization.org_id, - "slug": organization.stack_slug, - "id": organization.stack_id, - "firing": 0, - "silenced": 0, - "acknowledged": 0, - "resolved": 0, - } - } - - expected_result_firing = { - "firing": 1, - "silenced": 0, - "acknowledged": 0, - "resolved": 0, - } - - expected_result_acked = { - "firing": 0, - "silenced": 0, - "acknowledged": 1, - "resolved": 0, - } - - expected_result_resolved = { - "firing": 0, - "silenced": 0, - "acknowledged": 0, - "resolved": 1, - } - - metrics_cache = make_metrics_cache_params_old_version(alert_receive_channel.id, organization.id) - monkeypatch.setattr(cache, "get", metrics_cache) - - def get_called_arg_index_and_compare_results(update_expected_result): - """find index for the metric argument, that was set in cache""" - for idx, called_arg in enumerate(mock_cache_set_called_args): - if idx >= arg_idx and called_arg.args[0] == metric_alert_groups_total_key: - expected_result_metric_alert_groups_total[alert_receive_channel.id].update(update_expected_result) - assert called_arg.args[1] == expected_result_metric_alert_groups_total - return idx + 1 - raise AssertionError - - with patch("apps.metrics_exporter.tasks.cache.set") as mock_cache_set: - arg_idx = 0 - alert_group = make_alert_group(alert_receive_channel) - make_alert(alert_group=alert_group, raw_request_data={}) - # this signal is normally called in get_or_create_grouping on create alert - alert_group_created_signal.send(sender=alert_group.__class__, alert_group=alert_group) - - # check alert_groups_total metric cache, get called args - mock_cache_set_called_args = mock_cache_set.call_args_list - arg_idx = get_called_arg_index_and_compare_results(expected_result_firing) - - alert_group.acknowledge_by_user_or_backsync(user) - arg_idx = get_called_arg_index_and_compare_results(expected_result_acked) - - alert_group.resolve_by_user_or_backsync(user) - arg_idx = get_called_arg_index_and_compare_results(expected_result_resolved) - - alert_group.un_resolve_by_user_or_backsync(user) - arg_idx = get_called_arg_index_and_compare_results(expected_result_firing) - - -# todo:metrics: remove later when all cache is updated -@patch("apps.alerts.models.alert_group_log_record.tasks.send_update_log_report_signal.apply_async") -@patch("apps.alerts.tasks.send_alert_group_signal.alert_group_action_triggered_signal.send") -@pytest.mark.django_db -@override_settings(CELERY_TASK_ALWAYS_EAGER=True) -def test_update_metric_alert_groups_response_time_cache_on_action_backward_compatability( - mocked_send_log_signal, - mocked_action_signal_send, - mock_apply_async, - make_organization, - make_user_for_organization, - make_alert_receive_channel, - make_alert_group, - make_alert, - monkeypatch, - make_metrics_cache_params_old_version, -): - """Test update metric cache works properly with previous version of cache""" - organization = make_organization( - org_id=METRICS_TEST_ORG_ID, - stack_slug=METRICS_TEST_INSTANCE_SLUG, - stack_id=METRICS_TEST_INSTANCE_ID, - ) - user = make_user_for_organization(organization) - alert_receive_channel = make_alert_receive_channel(organization, verbal_name=METRICS_TEST_INTEGRATION_NAME) - - metric_alert_groups_response_time_key = get_metric_alert_groups_response_time_key(organization.id) - - expected_result_metric_alert_groups_response_time = { - alert_receive_channel.id: { - "integration_name": alert_receive_channel.verbal_name, - "team_name": "No team", - "team_id": "no_team", - "org_id": organization.org_id, - "slug": organization.stack_slug, - "id": organization.stack_id, - "response_time": [], - } - } - - metrics_cache = make_metrics_cache_params_old_version(alert_receive_channel.id, organization.id) - monkeypatch.setattr(cache, "get", metrics_cache) - - def get_called_arg_index_and_compare_results(): - """find index for related to the metric argument, that was set in cache""" - for idx, called_arg in enumerate(mock_cache_set_called_args): - if idx >= arg_idx and called_arg.args[0] == metric_alert_groups_response_time_key: - response_time_values = called_arg.args[1][alert_receive_channel.id]["response_time"] - expected_result_metric_alert_groups_response_time[alert_receive_channel.id].update( - {"response_time": response_time_values} - ) - # response time values len always will be 1 here since cache is mocked and refreshed on every call - assert len(response_time_values) == 1 - assert called_arg.args[1] == expected_result_metric_alert_groups_response_time - return idx + 1 - raise AssertionError - - with patch("apps.metrics_exporter.tasks.cache.set") as mock_cache_set: - arg_idx = 0 - alert_group = make_alert_group(alert_receive_channel) - make_alert(alert_group=alert_group, raw_request_data={}) - - # check alert_groups_response_time metric cache, get called args - mock_cache_set_called_args = mock_cache_set.call_args_list - - alert_group.acknowledge_by_user_or_backsync(user) - arg_idx = get_called_arg_index_and_compare_results() From 69365c32aafdd6e04000002c7415cb813d18404f Mon Sep 17 00:00:00 2001 From: Julia Date: Tue, 30 Apr 2024 14:07:28 +0200 Subject: [PATCH 3/5] Update docs --- docs/sources/manage/insights-and-metrics/index.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/sources/manage/insights-and-metrics/index.md b/docs/sources/manage/insights-and-metrics/index.md index ebb50393a2..33658fa1fb 100644 --- a/docs/sources/manage/insights-and-metrics/index.md +++ b/docs/sources/manage/insights-and-metrics/index.md @@ -65,6 +65,7 @@ This metric has the following labels: | `org_id` | ID of Grafana organization | | `team` | Team name | | `integration` | OnCall Integration name | +| `service_name`| Value of Alert group `service_name` label | | `state` | Alert groups state. May be `firing`, `acknowledged`, `resolved` and `silenced`| **Query example:** @@ -86,6 +87,7 @@ This metric has the following labels: | `org_id` | ID of Grafana organization | | `team` | Team name | | `integration` | OnCall Integration name | +| `service_name`| Value of Alert group `service_name` label | | `le` | Histogram bucket value in seconds. May be `60`, `300`, `600`, `3600` and `+Inf`| **Query example:** From 553d40e34de4143427fdadd11af2d3a3a66db5c3 Mon Sep 17 00:00:00 2001 From: Julia Date: Thu, 2 May 2024 11:08:14 +0200 Subject: [PATCH 4/5] Remove support of metrics cache without service name --- engine/apps/metrics_exporter/helpers.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/engine/apps/metrics_exporter/helpers.py b/engine/apps/metrics_exporter/helpers.py index f1e457905f..7baf85a607 100644 --- a/engine/apps/metrics_exporter/helpers.py +++ b/engine/apps/metrics_exporter/helpers.py @@ -287,14 +287,7 @@ def metrics_update_alert_groups_state_cache(states_diff: dict, organization_id: if not integration_alert_groups: continue for service_name, service_state_diff in service_data.items(): - if "services" in integration_alert_groups: - states_to_update = integration_alert_groups["services"].setdefault( - service_name, get_default_states_dict() - ) - else: - # support version of metrics cache without service name. This clause can be removed when all metrics - # cache is updated on prod (~2 days after release) - states_to_update = integration_alert_groups + states_to_update = integration_alert_groups["services"].setdefault(service_name, get_default_states_dict()) for previous_state, counter in service_state_diff["previous_states"].items(): if states_to_update[previous_state] - counter > 0: states_to_update[previous_state] -= counter @@ -329,13 +322,8 @@ def metrics_update_alert_groups_response_time_cache(integrations_response_time: if not integration_response_time_metrics: continue for service_name, response_time_values in service_data.items(): - if "services" in integration_response_time_metrics: - integration_response_time_metrics["services"].setdefault(service_name, []) - integration_response_time_metrics["services"][service_name].extend(response_time_values) - else: - # support version of metrics cache without service name. This clause can be removed when all metrics - # cache is updated on prod (~2 days after release) - integration_response_time_metrics["response_time"].extend(response_time_values) + integration_response_time_metrics["services"].setdefault(service_name, []) + integration_response_time_metrics["services"][service_name].extend(response_time_values) cache.set(metric_alert_groups_response_time_key, metric_alert_groups_response_time, timeout=metrics_cache_timeout) From 3af798c169a5a1a1c05eeb1b5909c6f7d8b6ec5b Mon Sep 17 00:00:00 2001 From: Julia Date: Wed, 22 May 2024 15:13:17 +0200 Subject: [PATCH 5/5] Update metrics collector test --- .../tests/test_metrics_collectors.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/engine/apps/metrics_exporter/tests/test_metrics_collectors.py b/engine/apps/metrics_exporter/tests/test_metrics_collectors.py index 257e6c4efa..24b54a5c04 100644 --- a/engine/apps/metrics_exporter/tests/test_metrics_collectors.py +++ b/engine/apps/metrics_exporter/tests/test_metrics_collectors.py @@ -44,9 +44,9 @@ def get_expected_labels(service_name=NO_SERVICE_VALUE, **kwargs): test_metrics_registry.register(collector) for metric in test_metrics_registry.collect(): if metric.name == ALERT_GROUPS_TOTAL: - # integration with labels for each alert group state per service - assert len(metric.samples) == len(AlertGroupState) * 2 - assert {2, 3, 4, 5, 12, 13, 14, 15} == set(sample.value for sample in metric.samples) + # 2 integrations with labels for each alert group state per service + assert len(metric.samples) == len(AlertGroupState) * 3 # 2 from 1st integration and 1 from 2nd + assert {0, 2, 3, 4, 5, 12, 13, 14, 15} == set(sample.value for sample in metric.samples) # check that labels were set correctly expected_labels_no_service = get_expected_labels(state="firing") expected_labels_test_service = get_expected_labels(METRICS_TEST_SERVICE_NAME, state="firing") @@ -54,8 +54,10 @@ def get_expected_labels(service_name=NO_SERVICE_VALUE, **kwargs): for expected_labels in [expected_labels_no_service, expected_labels_test_service]: assert expected_labels in metric_labels elif metric.name == ALERT_GROUPS_RESPONSE_TIME: - # integration with labels for each value in collector's bucket + _count and _sum histogram values - assert len(metric.samples) == (len(collector._buckets) + 2) * 2 + # integration with labels for each of 2 service_name values in collector's bucket + _count and _sum + # histogram values + # ignore integration without response_time data + assert len(metric.samples) == (len(collector._buckets) + 2) * 2 # 2 from 1st integration, ignore 2nd # check that `_sum` values for both services are presented assert {36, 862}.issubset(set(sample.value for sample in metric.samples)) # check that labels were set correctly