Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New feature that user assumpt an arbitrary value of sli when metrics hasn't a increase #242

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 63 additions & 16 deletions slo_generator/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@
from slo_generator import utils
from slo_generator.constants import COLORED_OUTPUT, MIN_VALID_EVENTS, NO_DATA, Colors

#patch 0.3 importing random lib
import random,time

LOGGER = logging.getLogger(__name__)


Expand Down Expand Up @@ -88,6 +91,18 @@ class SLOReport:
# Data validation
errors: List[str] = field(default_factory=list)

def get_assumption_empty_sli(self, config):
if "assumption_empty_sli" in config["spec"] and config["spec"]["method"] == "good_bad_ratio":
if 0 <= float(config["spec"]["assumption_empty_sli"]) <= 100:
assumption_empty_sli = float(config["spec"]["assumption_empty_sli"])
LOGGER.debug(f"{self.info} | Found assumption_empty_sli config, It will to be used the value {str(assumption_empty_sli)}% when not exists increase in metrics")
return assumption_empty_sli
else:
LOGGER.error("Value to assumption_empty_sli is not between 0 and 100. Received: " + str(config['spec']['assumption_empty_sli']) + '.')
else:
LOGGER.debug("assumption_empty_sli is not in labels, skipping")
return -1

# pylint: disable=too-many-arguments
def __init__(self, config, backend, step, timestamp, client=None, delete=False):

Expand All @@ -113,20 +128,34 @@ def __init__(self, config, backend, step, timestamp, client=None, delete=False):
self.valid = True
self.errors = []

# Get Assumption of Empty SLI value in slo-config
#assumption_empty_sli = self.get_assumption_empty_sli(config)
assumption_empty_sli = 100

# Get backend results
data = self.run_backend(config, backend, client=client, delete=delete)
if not self._validate(data):
# Patch globo-0.3 for three retries
RETRIES=3
Copy link
Collaborator

@ocervell ocervell Oct 19, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Retries do not belong here, this code should be left as clean as possible.
Retries should go in the providers (backends / exporters) code as they can be different for various backends. Please see https://github.com/google/slo-generator/blob/master/slo_generator/backends/dynatrace.py#L270 for an example.
On top of that, what is globo-0.3 ?

while RETRIES > 0:
data = self.run_backend(config, backend, client=client, delete=delete)
LOGGER.debug(f'patch globo-0.3 | After get data in backend({str(data)}).')
if not self._validate(data, assumption_empty_sli):
RETRIES -= 1
rWAIT = random.randint(2, 5)
LOGGER.debug(f'patch globo-0.3 | Waiting {rWAIT} before next try. ({RETRIES} retries remaining)')
time.sleep(rWAIT)
else:
RETRIES = 0
if not self._validate(data, assumption_empty_sli):
LOGGER.error('patch globo-0.3 | an error occurs in last validation of retrieved data from backend...')
self.valid = False
return

# Build SLO report
self.build(step, data)

self.build(step, data, assumption_empty_sli)
# Post validation
if not self._post_validate():
self.valid = False

def build(self, step, data):
def build(self, step, data, assumption_empty_sli=-1):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here, I don't think we should pass assumption_empty_sli to the build method.
imho the way should be a user configurable global variable to specify the SLI value when no events are found.
Please keep in mind that SLOs with no events cannot be relied on, read https://sre.google/workbook/alerting-on-slos/#low-traffic-services-and-error-budget-alerting for more info on why.

"""Compute all data necessary to build the SLO report.

Args:
Expand All @@ -139,7 +168,12 @@ def build(self, step, data):
LOGGER.debug(f"{self.info} | SLO report starting ...")

# SLI, Good count, Bad count, Gap from backend results
sli, good_count, bad_count = self.get_sli(data)
sli, good_count, bad_count = self.get_sli(data, assumption_empty_sli)
#patch globo-0.3 additional condition to good events empty or NODATA
LOGGER.debug(f"SLI={sli}, GOOD={good_count}, BAD={bad_count}")
if good_count == 'EMPTY':
LOGGER.error("Good events found is 0, terminating...")
exit(-1)
gap = sli - self.goal

# Error Budget calculations
Expand Down Expand Up @@ -229,7 +263,7 @@ def run_backend(self, config, backend, client=None, delete=False):
return None
return data

def get_sli(self, data):
def get_sli(self, data, assumption_empty_sli=-1):
"""Compute SLI value and good / bad counts from the backend result.

Some backends (e.g: Prometheus) are computing and returning the SLI
Expand All @@ -251,11 +285,18 @@ def get_sli(self, data):
if isinstance(data, tuple): # good, bad count
good_count, bad_count = data
if good_count == NO_DATA:
good_count = 0
#patch globo-0.3, terminate if not exists good events
good_count = 'EMPTY'
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good_count should be an int o a float, and NO_DATA already tells us that no events were found.

sli_measurement = None
return sli_measurement, good_count, bad_count
if bad_count == NO_DATA:
bad_count = 0
LOGGER.debug(f"{self.info} | Good: {good_count} | Bad: {bad_count}")
sli_measurement = round(good_count / (good_count + bad_count), 6)
if good_count == 0 and bad_count == 0 and assumption_empty_sli != -1:
sli_measurement = round(assumption_empty_sli / 100, 5)
LOGGER.warning(f"{self.info} | Setting sli_measurement to {float(assumption_empty_sli)}% because sli increase in time is 0.")
else:
sli_measurement = round(good_count / (good_count + bad_count), 6)
else: # sli value
sli_measurement = round(data, 6)
good_count, bad_count = NO_DATA, NO_DATA
Expand All @@ -274,7 +315,7 @@ def to_json(self) -> dict:
return asdict(self)

# pylint: disable=too-many-return-statements
def _validate(self, data) -> bool:
def _validate(self, data, assumption_empty_sli=-1) -> bool:
"""Validate backend results. Invalid data will result in SLO report not
being built.

Expand Down Expand Up @@ -341,11 +382,17 @@ def _validate(self, data) -> bool:
# Tuple should not have elements where the sum is inferior to our
# minimum valid events threshold
if (good + bad) < MIN_VALID_EVENTS:
error = (
f"Not enough valid events ({good + bad}) found. Minimum "
f"valid events: {MIN_VALID_EVENTS}."
)
self.errors.append(error)
if good == 0 and bad == 0 and assumption_empty_sli > -1:
LOGGER.debug(f"good={good}, bad={bad}, assumption_empty_sli={assumption_empty_sli}, setting SLI to {assumption_empty_sli}")
else:
error = (
f"Not enough valid events ({good + bad}) found. Minimum "
f"valid events: {MIN_VALID_EVENTS}.")
self.errors.append(error)
return False
# Patch globo-0.3 for good events less than 0
if good <= 0 and bad > 0:
LOGGER.debug("Good less than 0, skipping SLO calculation by patch globo-0.3")
return False

# Check backend float / int value
Expand Down