From 68962210469f4d77b519aabe9eca322ebe845702 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Thu, 3 Jul 2025 19:46:32 -0500 Subject: [PATCH 01/48] Start of lint for checking whether Prometheus metrics have the `server_name` label --- scripts-dev/mypy_synapse_plugin.py | 83 +++- synapse/util/ratelimitutils.py | 668 ++++++++++++++--------------- 2 files changed, 416 insertions(+), 335 deletions(-) diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py index a15c3c005cf..817272900ae 100644 --- a/scripts-dev/mypy_synapse_plugin.py +++ b/scripts-dev/mypy_synapse_plugin.py @@ -29,7 +29,14 @@ from mypy.erasetype import remove_instance_last_known_values from mypy.errorcodes import ErrorCode from mypy.nodes import ARG_NAMED_OPT, TempNode, Var -from mypy.plugin import FunctionSigContext, MethodSigContext, Plugin +from mypy.plugin import ( + FunctionSigContext, + MethodSigContext, + Plugin, + FunctionContext, + ClassDefContext, + FunctionLike, +) from mypy.typeops import bind_self from mypy.types import ( AnyType, @@ -42,12 +49,22 @@ UninhabitedType, UnionType, ) +from mypy.nodes import StrExpr, TupleExpr, ListExpr class SynapsePlugin(Plugin): + def get_function_signature_hook( + self, fullname: str + ) -> Optional[Callable[[FunctionSigContext], FunctionLike]]: + if fullname in ("prometheus_client.metrics.Counter",): + return check_asdf + + return None + def get_method_signature_hook( self, fullname: str ) -> Optional[Callable[[MethodSigContext], CallableType]]: + # print(f"m fullname={fullname}") if fullname.startswith( ( "synapse.util.caches.descriptors.CachedFunction.__call__", @@ -65,6 +82,70 @@ def get_method_signature_hook( return None +def check_asdf(ctx: FunctionSigContext) -> CallableType: + """ + TODO + """ + # The true signature, this isn't being modified so this is what will be returned. + signature: CallableType = ctx.default_signature + + # Sanity check the arguments are still as expected in this version of + # `prometheus_client`. ex. `Counter(name, documentation, labelnames, ...)` + # + # signature.arg_names should be: ["name", "documentation", "labelnames", ...] + if len(signature.arg_names) < 3 or signature.arg_names[2] != "labelnames": + ctx.api.fail( + f"Expected the 3rd argument of {signature.name} to be 'labelnames', but got " + f"{signature.arg_names[2]}", + ctx.context, + ) + return signature + + # Ensure mypy is passing the correct number of arguments because we are doing some + # dirty indexing into `ctx.args` later on. + assert len(ctx.args) == len(signature.arg_names), ( + f"Expected {len(signature.arg_names)} arguments for {signature.name}, " + f"but got {len(ctx.args)}" + ) + + # `ctx.args` should look like this: + # ``` + # [ + # [StrExpr("name")], + # [StrExpr("documentation")], + # [ListExpr([StrExpr("label1"), StrExpr("label2")])] + # ... 
+ # ] + # ``` + labelnames_arg_expression = ctx.args[2][0] if len(ctx.args[2]) > 0 else None + if isinstance(labelnames_arg_expression, ListExpr): + labels = [] + for labelname_expression in labelnames_arg_expression.items: + if isinstance(labelname_expression, StrExpr): + labels.append(labelname_expression.value) + else: + ctx.api.fail( + f"Expected all items in the 3rd argument of {signature.name} to be strings, but got " + f"{labelname_expression.__class__.__name__}", + ctx.context, + ) + + if "server_name" not in labels: + ctx.api.fail( + f"Expected {signature.name} to include 'server_name' in the list of labels", + ctx.context, + ) + else: + ctx.api.fail( + f"Expected the 3rd argument of {signature.name} to be a list of label names, but got " + f"{labelnames_arg_expression}", + ctx.context, + ) + return signature + + return signature + + def _get_true_return_type(signature: CallableType) -> mypy.types.Type: """ Get the "final" return type of a callable which might return an Awaitable/Deferred. diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py index 3f067b792c4..d8c3504754a 100644 --- a/synapse/util/ratelimitutils.py +++ b/synapse/util/ratelimitutils.py @@ -42,23 +42,23 @@ from prometheus_client.core import Counter -from twisted.internet import defer - -from synapse.api.errors import LimitExceededError -from synapse.config.ratelimiting import FederationRatelimitSettings -from synapse.logging.context import ( - PreserveLoggingContext, - make_deferred_yieldable, - run_in_background, -) -from synapse.logging.opentracing import start_active_span -from synapse.metrics import Histogram, LaterGauge -from synapse.util import Clock +# from twisted.internet import defer + +# from synapse.api.errors import LimitExceededError +# from synapse.config.ratelimiting import FederationRatelimitSettings +# from synapse.logging.context import ( +# PreserveLoggingContext, +# make_deferred_yieldable, +# run_in_background, +# ) +# from synapse.logging.opentracing import start_active_span +# from synapse.metrics import Histogram, LaterGauge +# from synapse.util import Clock -if typing.TYPE_CHECKING: - from contextlib import _GeneratorContextManager +# if typing.TYPE_CHECKING: +# from contextlib import _GeneratorContextManager -logger = logging.getLogger(__name__) +# logger = logging.getLogger(__name__) # Track how much the ratelimiter is affecting requests @@ -72,326 +72,326 @@ "Number of requests rejected by the rate limiter", ["rate_limiter_name"], ) -queue_wait_timer = Histogram( - "synapse_rate_limit_queue_wait_time_seconds", - "Amount of time spent waiting for the rate limiter to let our request through.", - ["rate_limiter_name"], - buckets=( - 0.005, - 0.01, - 0.025, - 0.05, - 0.1, - 0.25, - 0.5, - 0.75, - 1.0, - 2.5, - 5.0, - 10.0, - 20.0, - "+Inf", - ), -) +# queue_wait_timer = Histogram( +# "synapse_rate_limit_queue_wait_time_seconds", +# "Amount of time spent waiting for the rate limiter to let our request through.", +# ["rate_limiter_name"], +# buckets=( +# 0.005, +# 0.01, +# 0.025, +# 0.05, +# 0.1, +# 0.25, +# 0.5, +# 0.75, +# 1.0, +# 2.5, +# 5.0, +# 10.0, +# 20.0, +# "+Inf", +# ), +# ) # This must be a `WeakSet`, otherwise we indirectly hold on to entire `HomeServer`s # during trial test runs and leak a lot of memory. 
-_rate_limiter_instances: MutableSet["FederationRateLimiter"] = WeakSet() -# Protects the _rate_limiter_instances set from concurrent access -_rate_limiter_instances_lock = threading.Lock() - - -def _get_counts_from_rate_limiter_instance( - count_func: Callable[["FederationRateLimiter"], int], -) -> Mapping[Tuple[str, ...], int]: - """Returns a count of something (slept/rejected hosts) by (metrics_name)""" - # Cast to a list to prevent it changing while the Prometheus - # thread is collecting metrics - with _rate_limiter_instances_lock: - rate_limiter_instances = list(_rate_limiter_instances) - - # Map from (metrics_name,) -> int, the number of something like slept hosts - # or rejected hosts. The key type is Tuple[str], but we leave the length - # unspecified for compatability with LaterGauge's annotations. - counts: Dict[Tuple[str, ...], int] = {} - for rate_limiter_instance in rate_limiter_instances: - # Only track metrics if they provided a `metrics_name` to - # differentiate this instance of the rate limiter. - if rate_limiter_instance.metrics_name: - key = (rate_limiter_instance.metrics_name,) - counts[key] = count_func(rate_limiter_instance) - - return counts - - -# We track the number of affected hosts per time-period so we can -# differentiate one really noisy homeserver from a general -# ratelimit tuning problem across the federation. -LaterGauge( - "synapse_rate_limit_sleep_affected_hosts", - "Number of hosts that had requests put to sleep", - ["rate_limiter_name"], - lambda: _get_counts_from_rate_limiter_instance( - lambda rate_limiter_instance: sum( - ratelimiter.should_sleep() - for ratelimiter in rate_limiter_instance.ratelimiters.values() - ) - ), -) -LaterGauge( - "synapse_rate_limit_reject_affected_hosts", - "Number of hosts that had requests rejected", - ["rate_limiter_name"], - lambda: _get_counts_from_rate_limiter_instance( - lambda rate_limiter_instance: sum( - ratelimiter.should_reject() - for ratelimiter in rate_limiter_instance.ratelimiters.values() - ) - ), -) - - -class FederationRateLimiter: - """Used to rate limit request per-host.""" - - def __init__( - self, - clock: Clock, - config: FederationRatelimitSettings, - metrics_name: Optional[str] = None, - ): - """ - Args: - clock - config - metrics_name: The name of the rate limiter so we can differentiate it - from the rest in the metrics. If `None`, we don't track metrics - for this rate limiter. - - """ - self.metrics_name = metrics_name - - def new_limiter() -> "_PerHostRatelimiter": - return _PerHostRatelimiter( - clock=clock, config=config, metrics_name=metrics_name - ) - - self.ratelimiters: DefaultDict[str, "_PerHostRatelimiter"] = ( - collections.defaultdict(new_limiter) - ) - - with _rate_limiter_instances_lock: - _rate_limiter_instances.add(self) - - def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None]]": - """Used to ratelimit an incoming request from a given host - - Example usage: - - with rate_limiter.ratelimit(origin) as wait_deferred: - yield wait_deferred - # Handle request ... - - Args: - host: Origin of incoming request. - - Returns: - context manager which returns a deferred. - """ - return self.ratelimiters[host].ratelimit(host) - - -class _PerHostRatelimiter: - def __init__( - self, - clock: Clock, - config: FederationRatelimitSettings, - metrics_name: Optional[str] = None, - ): - """ - Args: - clock - config - metrics_name: The name of the rate limiter so we can differentiate it - from the rest in the metrics. 
If `None`, we don't track metrics - for this rate limiter. - from the rest in the metrics - """ - self.clock = clock - self.metrics_name = metrics_name - - self.window_size = config.window_size - self.sleep_limit = config.sleep_limit - self.sleep_sec = config.sleep_delay / 1000.0 - self.reject_limit = config.reject_limit - self.concurrent_requests = config.concurrent - - # request_id objects for requests which have been slept - self.sleeping_requests: Set[object] = set() - - # map from request_id object to Deferred for requests which are ready - # for processing but have been queued - self.ready_request_queue: collections.OrderedDict[ - object, defer.Deferred[None] - ] = collections.OrderedDict() - - # request id objects for requests which are in progress - self.current_processing: Set[object] = set() - - # times at which we have recently (within the last window_size ms) - # received requests. - self.request_times: List[int] = [] - - @contextlib.contextmanager - def ratelimit(self, host: str) -> "Iterator[defer.Deferred[None]]": - # `contextlib.contextmanager` takes a generator and turns it into a - # context manager. The generator should only yield once with a value - # to be returned by manager. - # Exceptions will be reraised at the yield. - - self.host = host - - request_id = object() - # Ideally we'd use `Deferred.fromCoroutine()` here, to save on redundant - # type-checking, but we'd need Twisted >= 21.2. - ret = defer.ensureDeferred(self._on_enter_with_tracing(request_id)) - try: - yield ret - finally: - self._on_exit(request_id) - - def should_reject(self) -> bool: - """ - Whether to reject the request if we already have too many queued up - (either sleeping or in the ready queue). - """ - queue_size = len(self.ready_request_queue) + len(self.sleeping_requests) - return queue_size > self.reject_limit - - def should_sleep(self) -> bool: - """ - Whether to sleep the request if we already have too many requests coming - through within the window. - """ - return len(self.request_times) > self.sleep_limit - - async def _on_enter_with_tracing(self, request_id: object) -> None: - maybe_metrics_cm: ContextManager = contextlib.nullcontext() - if self.metrics_name: - maybe_metrics_cm = queue_wait_timer.labels(self.metrics_name).time() - with start_active_span("ratelimit wait"), maybe_metrics_cm: - await self._on_enter(request_id) - - def _on_enter(self, request_id: object) -> "defer.Deferred[None]": - time_now = self.clock.time_msec() - - # remove any entries from request_times which aren't within the window - self.request_times[:] = [ - r for r in self.request_times if time_now - r < self.window_size - ] - - # reject the request if we already have too many queued up (either - # sleeping or in the ready queue). 
- if self.should_reject(): - logger.debug("Ratelimiter(%s): rejecting request", self.host) - if self.metrics_name: - rate_limit_reject_counter.labels(self.metrics_name).inc() - raise LimitExceededError( - limiter_name="rc_federation", - retry_after_ms=int(self.window_size / self.sleep_limit), - ) - - self.request_times.append(time_now) - - def queue_request() -> "defer.Deferred[None]": - if len(self.current_processing) >= self.concurrent_requests: - queue_defer: defer.Deferred[None] = defer.Deferred() - self.ready_request_queue[request_id] = queue_defer - logger.info( - "Ratelimiter(%s): queueing request (queue now %i items)", - self.host, - len(self.ready_request_queue), - ) - - return queue_defer - else: - return defer.succeed(None) - - logger.debug( - "Ratelimit(%s) [%s]: len(self.request_times)=%d", - self.host, - id(request_id), - len(self.request_times), - ) - - if self.should_sleep(): - logger.debug( - "Ratelimiter(%s) [%s]: sleeping request for %f sec", - self.host, - id(request_id), - self.sleep_sec, - ) - if self.metrics_name: - rate_limit_sleep_counter.labels(self.metrics_name).inc() - ret_defer = run_in_background(self.clock.sleep, self.sleep_sec) - - self.sleeping_requests.add(request_id) - - def on_wait_finished(_: Any) -> "defer.Deferred[None]": - logger.debug( - "Ratelimit(%s) [%s]: Finished sleeping", self.host, id(request_id) - ) - self.sleeping_requests.discard(request_id) - queue_defer = queue_request() - return queue_defer - - ret_defer.addBoth(on_wait_finished) - else: - ret_defer = queue_request() - - def on_start(r: object) -> object: - logger.debug( - "Ratelimit(%s) [%s]: Processing req", self.host, id(request_id) - ) - self.current_processing.add(request_id) - return r - - def on_err(r: object) -> object: - # XXX: why is this necessary? this is called before we start - # processing the request so why would the request be in - # current_processing? - self.current_processing.discard(request_id) - return r - - def on_both(r: object) -> object: - # Ensure that we've properly cleaned up. - self.sleeping_requests.discard(request_id) - self.ready_request_queue.pop(request_id, None) - return r - - ret_defer.addCallbacks(on_start, on_err) - ret_defer.addBoth(on_both) - return make_deferred_yieldable(ret_defer) - - def _on_exit(self, request_id: object) -> None: - logger.debug("Ratelimit(%s) [%s]: Processed req", self.host, id(request_id)) - - # When requests complete synchronously, we will recursively start the next - # request in the queue. To avoid stack exhaustion, we defer starting the next - # request until the next reactor tick. - - def start_next_request() -> None: - # We only remove the completed request from the list when we're about to - # start the next one, otherwise we can allow extra requests through. - self.current_processing.discard(request_id) - try: - # start processing the next item on the queue. 
- _, deferred = self.ready_request_queue.popitem(last=False) - - with PreserveLoggingContext(): - deferred.callback(None) - except KeyError: - pass - - self.clock.call_later(0.0, start_next_request) +# _rate_limiter_instances: MutableSet["FederationRateLimiter"] = WeakSet() +# # Protects the _rate_limiter_instances set from concurrent access +# _rate_limiter_instances_lock = threading.Lock() + + +# def _get_counts_from_rate_limiter_instance( +# count_func: Callable[["FederationRateLimiter"], int], +# ) -> Mapping[Tuple[str, ...], int]: +# """Returns a count of something (slept/rejected hosts) by (metrics_name)""" +# # Cast to a list to prevent it changing while the Prometheus +# # thread is collecting metrics +# with _rate_limiter_instances_lock: +# rate_limiter_instances = list(_rate_limiter_instances) + +# # Map from (metrics_name,) -> int, the number of something like slept hosts +# # or rejected hosts. The key type is Tuple[str], but we leave the length +# # unspecified for compatability with LaterGauge's annotations. +# counts: Dict[Tuple[str, ...], int] = {} +# for rate_limiter_instance in rate_limiter_instances: +# # Only track metrics if they provided a `metrics_name` to +# # differentiate this instance of the rate limiter. +# if rate_limiter_instance.metrics_name: +# key = (rate_limiter_instance.metrics_name,) +# counts[key] = count_func(rate_limiter_instance) + +# return counts + + +# # We track the number of affected hosts per time-period so we can +# # differentiate one really noisy homeserver from a general +# # ratelimit tuning problem across the federation. +# LaterGauge( +# "synapse_rate_limit_sleep_affected_hosts", +# "Number of hosts that had requests put to sleep", +# ["rate_limiter_name"], +# lambda: _get_counts_from_rate_limiter_instance( +# lambda rate_limiter_instance: sum( +# ratelimiter.should_sleep() +# for ratelimiter in rate_limiter_instance.ratelimiters.values() +# ) +# ), +# ) +# LaterGauge( +# "synapse_rate_limit_reject_affected_hosts", +# "Number of hosts that had requests rejected", +# ["rate_limiter_name"], +# lambda: _get_counts_from_rate_limiter_instance( +# lambda rate_limiter_instance: sum( +# ratelimiter.should_reject() +# for ratelimiter in rate_limiter_instance.ratelimiters.values() +# ) +# ), +# ) + + +# class FederationRateLimiter: +# """Used to rate limit request per-host.""" + +# def __init__( +# self, +# clock: Clock, +# config: FederationRatelimitSettings, +# metrics_name: Optional[str] = None, +# ): +# """ +# Args: +# clock +# config +# metrics_name: The name of the rate limiter so we can differentiate it +# from the rest in the metrics. If `None`, we don't track metrics +# for this rate limiter. + +# """ +# self.metrics_name = metrics_name + +# def new_limiter() -> "_PerHostRatelimiter": +# return _PerHostRatelimiter( +# clock=clock, config=config, metrics_name=metrics_name +# ) + +# self.ratelimiters: DefaultDict[str, "_PerHostRatelimiter"] = ( +# collections.defaultdict(new_limiter) +# ) + +# with _rate_limiter_instances_lock: +# _rate_limiter_instances.add(self) + +# def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None]]": +# """Used to ratelimit an incoming request from a given host + +# Example usage: + +# with rate_limiter.ratelimit(origin) as wait_deferred: +# yield wait_deferred +# # Handle request ... + +# Args: +# host: Origin of incoming request. + +# Returns: +# context manager which returns a deferred. 
+# """ +# return self.ratelimiters[host].ratelimit(host) + + +# class _PerHostRatelimiter: +# def __init__( +# self, +# clock: Clock, +# config: FederationRatelimitSettings, +# metrics_name: Optional[str] = None, +# ): +# """ +# Args: +# clock +# config +# metrics_name: The name of the rate limiter so we can differentiate it +# from the rest in the metrics. If `None`, we don't track metrics +# for this rate limiter. +# from the rest in the metrics +# """ +# self.clock = clock +# self.metrics_name = metrics_name + +# self.window_size = config.window_size +# self.sleep_limit = config.sleep_limit +# self.sleep_sec = config.sleep_delay / 1000.0 +# self.reject_limit = config.reject_limit +# self.concurrent_requests = config.concurrent + +# # request_id objects for requests which have been slept +# self.sleeping_requests: Set[object] = set() + +# # map from request_id object to Deferred for requests which are ready +# # for processing but have been queued +# self.ready_request_queue: collections.OrderedDict[ +# object, defer.Deferred[None] +# ] = collections.OrderedDict() + +# # request id objects for requests which are in progress +# self.current_processing: Set[object] = set() + +# # times at which we have recently (within the last window_size ms) +# # received requests. +# self.request_times: List[int] = [] + +# @contextlib.contextmanager +# def ratelimit(self, host: str) -> "Iterator[defer.Deferred[None]]": +# # `contextlib.contextmanager` takes a generator and turns it into a +# # context manager. The generator should only yield once with a value +# # to be returned by manager. +# # Exceptions will be reraised at the yield. + +# self.host = host + +# request_id = object() +# # Ideally we'd use `Deferred.fromCoroutine()` here, to save on redundant +# # type-checking, but we'd need Twisted >= 21.2. +# ret = defer.ensureDeferred(self._on_enter_with_tracing(request_id)) +# try: +# yield ret +# finally: +# self._on_exit(request_id) + +# def should_reject(self) -> bool: +# """ +# Whether to reject the request if we already have too many queued up +# (either sleeping or in the ready queue). +# """ +# queue_size = len(self.ready_request_queue) + len(self.sleeping_requests) +# return queue_size > self.reject_limit + +# def should_sleep(self) -> bool: +# """ +# Whether to sleep the request if we already have too many requests coming +# through within the window. +# """ +# return len(self.request_times) > self.sleep_limit + +# async def _on_enter_with_tracing(self, request_id: object) -> None: +# maybe_metrics_cm: ContextManager = contextlib.nullcontext() +# if self.metrics_name: +# maybe_metrics_cm = queue_wait_timer.labels(self.metrics_name).time() +# with start_active_span("ratelimit wait"), maybe_metrics_cm: +# await self._on_enter(request_id) + +# def _on_enter(self, request_id: object) -> "defer.Deferred[None]": +# time_now = self.clock.time_msec() + +# # remove any entries from request_times which aren't within the window +# self.request_times[:] = [ +# r for r in self.request_times if time_now - r < self.window_size +# ] + +# # reject the request if we already have too many queued up (either +# # sleeping or in the ready queue). 
+# if self.should_reject(): +# logger.debug("Ratelimiter(%s): rejecting request", self.host) +# if self.metrics_name: +# rate_limit_reject_counter.labels(self.metrics_name).inc() +# raise LimitExceededError( +# limiter_name="rc_federation", +# retry_after_ms=int(self.window_size / self.sleep_limit), +# ) + +# self.request_times.append(time_now) + +# def queue_request() -> "defer.Deferred[None]": +# if len(self.current_processing) >= self.concurrent_requests: +# queue_defer: defer.Deferred[None] = defer.Deferred() +# self.ready_request_queue[request_id] = queue_defer +# logger.info( +# "Ratelimiter(%s): queueing request (queue now %i items)", +# self.host, +# len(self.ready_request_queue), +# ) + +# return queue_defer +# else: +# return defer.succeed(None) + +# logger.debug( +# "Ratelimit(%s) [%s]: len(self.request_times)=%d", +# self.host, +# id(request_id), +# len(self.request_times), +# ) + +# if self.should_sleep(): +# logger.debug( +# "Ratelimiter(%s) [%s]: sleeping request for %f sec", +# self.host, +# id(request_id), +# self.sleep_sec, +# ) +# if self.metrics_name: +# rate_limit_sleep_counter.labels(self.metrics_name).inc() +# ret_defer = run_in_background(self.clock.sleep, self.sleep_sec) + +# self.sleeping_requests.add(request_id) + +# def on_wait_finished(_: Any) -> "defer.Deferred[None]": +# logger.debug( +# "Ratelimit(%s) [%s]: Finished sleeping", self.host, id(request_id) +# ) +# self.sleeping_requests.discard(request_id) +# queue_defer = queue_request() +# return queue_defer + +# ret_defer.addBoth(on_wait_finished) +# else: +# ret_defer = queue_request() + +# def on_start(r: object) -> object: +# logger.debug( +# "Ratelimit(%s) [%s]: Processing req", self.host, id(request_id) +# ) +# self.current_processing.add(request_id) +# return r + +# def on_err(r: object) -> object: +# # XXX: why is this necessary? this is called before we start +# # processing the request so why would the request be in +# # current_processing? +# self.current_processing.discard(request_id) +# return r + +# def on_both(r: object) -> object: +# # Ensure that we've properly cleaned up. +# self.sleeping_requests.discard(request_id) +# self.ready_request_queue.pop(request_id, None) +# return r + +# ret_defer.addCallbacks(on_start, on_err) +# ret_defer.addBoth(on_both) +# return make_deferred_yieldable(ret_defer) + +# def _on_exit(self, request_id: object) -> None: +# logger.debug("Ratelimit(%s) [%s]: Processed req", self.host, id(request_id)) + +# # When requests complete synchronously, we will recursively start the next +# # request in the queue. To avoid stack exhaustion, we defer starting the next +# # request until the next reactor tick. + +# def start_next_request() -> None: +# # We only remove the completed request from the list when we're about to +# # start the next one, otherwise we can allow extra requests through. +# self.current_processing.discard(request_id) +# try: +# # start processing the next item on the queue. 
+#                 _, deferred = self.ready_request_queue.popitem(last=False)
+
+#                 with PreserveLoggingContext():
+#                     deferred.callback(None)
+#             except KeyError:
+#                 pass
+
+#         self.clock.call_later(0.0, start_next_request)

From ada39e37210af7664e9e66ce56898b13ee69d61e Mon Sep 17 00:00:00 2001
From: Eric Eastwood
Date: Fri, 4 Jul 2025 11:18:52 -0500
Subject: [PATCH 02/48] Add more comments

---
 scripts-dev/mypy_synapse_plugin.py | 33 ++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py
index 817272900ae..590c7d41ae5 100644
--- a/scripts-dev/mypy_synapse_plugin.py
+++ b/scripts-dev/mypy_synapse_plugin.py
@@ -57,7 +57,7 @@ def get_function_signature_hook(
         self, fullname: str
     ) -> Optional[Callable[[FunctionSigContext], FunctionLike]]:
         if fullname in ("prometheus_client.metrics.Counter",):
-            return check_asdf
+            return check_prometheus_metric_instantiation
 
         return None
 
@@ -82,9 +82,19 @@ def get_method_signature_hook(
     return None
 
 
-def check_asdf(ctx: FunctionSigContext) -> CallableType:
+def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableType:
     """
-    TODO
+    Ensure that the `prometheus_client` metrics are instantiated with `server_name` in
+    the list of labels.
+
+    This is important because we support multiple Synapse instances running in the same
+    process, where all metrics share a single global `REGISTRY`. The `server_name` label
+    ensures metrics are correctly separated by homeserver.
+
+    There are also some metrics that apply at the process level, such as
+    CPU usage, Python garbage collection, Twisted reactor tick time which shouldn't
+    have a `server_name` label. In those cases, use a type ignore comment to
+    disable the check, e.g. `# type: ignore[misc]`.
     """
     # The true signature, this isn't being modified so this is what will be returned.
     signature: CallableType = ctx.default_signature
@@ -92,7 +102,7 @@ def check_asdf(ctx: FunctionSigContext) -> CallableType:
     # Sanity check the arguments are still as expected in this version of
     # `prometheus_client`. ex. `Counter(name, documentation, labelnames, ...)`
     #
-    # signature.arg_names should be: ["name", "documentation", "labelnames", ...]
+    # `signature.arg_names` should be: ["name", "documentation", "labelnames", ...]
     if len(signature.arg_names) < 3 or signature.arg_names[2] != "labelnames":
         ctx.api.fail(
             f"Expected the 3rd argument of {signature.name} to be 'labelnames', but got "
@@ -104,10 +114,12 @@ def check_asdf(ctx: FunctionSigContext) -> CallableType:
     # Ensure mypy is passing the correct number of arguments because we are doing some
     # dirty indexing into `ctx.args` later on.
     assert len(ctx.args) == len(signature.arg_names), (
-        f"Expected {len(signature.arg_names)} arguments for {signature.name}, "
-        f"but got {len(ctx.args)}"
+        f"Expected the list of arguments in the {signature.name} signature ({len(signature.arg_names)}) "
+        f"to match the number of arguments from the function signature context ({len(ctx.args)})"
     )
 
+    # Check if the `labelnames` argument includes the `server_name` label.
+    #
     # `ctx.args` should look like this:
     # ```
     # [
@@ -119,17 +131,22 @@ def check_asdf(ctx: FunctionSigContext) -> CallableType:
     labelnames_arg_expression = ctx.args[2][0] if len(ctx.args[2]) > 0 else None
     if isinstance(labelnames_arg_expression, ListExpr):
+        # Collect the label names as a list of strings.
labels = []
         for labelname_expression in labelnames_arg_expression.items:
             if isinstance(labelname_expression, StrExpr):
                 labels.append(labelname_expression.value)
             else:
                 ctx.api.fail(
-                    f"Expected all items in the 3rd argument of {signature.name} to be strings, but got "
+                    f"Expected all items in the `labelnames` argument of {signature.name} to be strings, but got "
                     f"{labelname_expression.__class__.__name__}",
                     ctx.context,
+                    code=CUSTOM_ERROR,
                 )
 
+        # Check if the `labelnames` argument includes the `server_name` label.
+        #
+        # TODO: Use `SERVER_NAME_LABEL` here
         if "server_name" not in labels:
             ctx.api.fail(
                 f"Expected {signature.name} to include 'server_name' in the list of labels",
                 ctx.context,
             )
     else:
         ctx.api.fail(
-            f"Expected the 3rd argument of {signature.name} to be a list of label names, but got "
+            f"Expected the `labelnames` argument of {signature.name} to be a list of label names, but got "
             f"{labelnames_arg_expression}",
             ctx.context,
         )

From 0c45aab8d184c4c803e9d4477390c3a876d13f8e Mon Sep 17 00:00:00 2001
From: Eric Eastwood
Date: Fri, 4 Jul 2025 11:30:34 -0500
Subject: [PATCH 03/48] Add specific error code

---
 scripts-dev/mypy_synapse_plugin.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py
index 590c7d41ae5..b55ec610c73 100644
--- a/scripts-dev/mypy_synapse_plugin.py
+++ b/scripts-dev/mypy_synapse_plugin.py
@@ -51,6 +51,17 @@
 )
 from mypy.nodes import StrExpr, TupleExpr, ListExpr
 
+PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL = ErrorCode(
+    "missing-server-name-label",
+    "'server_name' label required in metric",
+    category="per-homeserver-tenant-metrics",
+)
+PROMETHEUS_METRIC_MANGLED_LABELS = ErrorCode(
+    "mangled-labels",
+    "`labelnames` argument should be a list of strings",
+    category="per-homeserver-tenant-metrics",
+)
+
 
 class SynapsePlugin(Plugin):
@@ -94,7 +105,7 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy
     There are also some metrics that apply at the process level, such as
     CPU usage, Python garbage collection, Twisted reactor tick time which shouldn't
     have a `server_name` label. In those cases, use a type ignore comment to
-    disable the check, e.g. `# type: ignore[misc]`.
+    disable the check, e.g. `# type: ignore[missing-server-name-label]`.
     """
     # The true signature, this isn't being modified so this is what will be returned.
     signature: CallableType = ctx.default_signature
@@ -141,7 +152,7 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy
                     f"Expected all items in the `labelnames` argument of {signature.name} to be strings, but got "
                     f"{labelname_expression.__class__.__name__}",
                     ctx.context,
-                    code=CUSTOM_ERROR,
+                    code=PROMETHEUS_METRIC_MANGLED_LABELS,
                 )
 
         # Check if the `labelnames` argument includes the `server_name` label.
@@ -151,12 +162,14 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy ctx.api.fail( f"Expected {signature.name} to include 'server_name' in the list of labels", ctx.context, + code=PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL, ) else: ctx.api.fail( f"Expected the `labelnames` argument of {signature.name} to be a list of label names, but got " f"{labelnames_arg_expression}", ctx.context, + code=PROMETHEUS_METRIC_MANGLED_LABELS, ) return signature From 6e74eca2688edb1f9af697c22383e6d032eb2c5d Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 11:32:08 -0500 Subject: [PATCH 04/48] Revert temp changes to `synapse/util/ratelimitutils.py` --- synapse/util/ratelimitutils.py | 668 ++++++++++++++++----------------- 1 file changed, 334 insertions(+), 334 deletions(-) diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py index d8c3504754a..3f067b792c4 100644 --- a/synapse/util/ratelimitutils.py +++ b/synapse/util/ratelimitutils.py @@ -42,23 +42,23 @@ from prometheus_client.core import Counter -# from twisted.internet import defer - -# from synapse.api.errors import LimitExceededError -# from synapse.config.ratelimiting import FederationRatelimitSettings -# from synapse.logging.context import ( -# PreserveLoggingContext, -# make_deferred_yieldable, -# run_in_background, -# ) -# from synapse.logging.opentracing import start_active_span -# from synapse.metrics import Histogram, LaterGauge -# from synapse.util import Clock +from twisted.internet import defer + +from synapse.api.errors import LimitExceededError +from synapse.config.ratelimiting import FederationRatelimitSettings +from synapse.logging.context import ( + PreserveLoggingContext, + make_deferred_yieldable, + run_in_background, +) +from synapse.logging.opentracing import start_active_span +from synapse.metrics import Histogram, LaterGauge +from synapse.util import Clock -# if typing.TYPE_CHECKING: -# from contextlib import _GeneratorContextManager +if typing.TYPE_CHECKING: + from contextlib import _GeneratorContextManager -# logger = logging.getLogger(__name__) +logger = logging.getLogger(__name__) # Track how much the ratelimiter is affecting requests @@ -72,326 +72,326 @@ "Number of requests rejected by the rate limiter", ["rate_limiter_name"], ) -# queue_wait_timer = Histogram( -# "synapse_rate_limit_queue_wait_time_seconds", -# "Amount of time spent waiting for the rate limiter to let our request through.", -# ["rate_limiter_name"], -# buckets=( -# 0.005, -# 0.01, -# 0.025, -# 0.05, -# 0.1, -# 0.25, -# 0.5, -# 0.75, -# 1.0, -# 2.5, -# 5.0, -# 10.0, -# 20.0, -# "+Inf", -# ), -# ) +queue_wait_timer = Histogram( + "synapse_rate_limit_queue_wait_time_seconds", + "Amount of time spent waiting for the rate limiter to let our request through.", + ["rate_limiter_name"], + buckets=( + 0.005, + 0.01, + 0.025, + 0.05, + 0.1, + 0.25, + 0.5, + 0.75, + 1.0, + 2.5, + 5.0, + 10.0, + 20.0, + "+Inf", + ), +) # This must be a `WeakSet`, otherwise we indirectly hold on to entire `HomeServer`s # during trial test runs and leak a lot of memory. 
-# _rate_limiter_instances: MutableSet["FederationRateLimiter"] = WeakSet() -# # Protects the _rate_limiter_instances set from concurrent access -# _rate_limiter_instances_lock = threading.Lock() - - -# def _get_counts_from_rate_limiter_instance( -# count_func: Callable[["FederationRateLimiter"], int], -# ) -> Mapping[Tuple[str, ...], int]: -# """Returns a count of something (slept/rejected hosts) by (metrics_name)""" -# # Cast to a list to prevent it changing while the Prometheus -# # thread is collecting metrics -# with _rate_limiter_instances_lock: -# rate_limiter_instances = list(_rate_limiter_instances) - -# # Map from (metrics_name,) -> int, the number of something like slept hosts -# # or rejected hosts. The key type is Tuple[str], but we leave the length -# # unspecified for compatability with LaterGauge's annotations. -# counts: Dict[Tuple[str, ...], int] = {} -# for rate_limiter_instance in rate_limiter_instances: -# # Only track metrics if they provided a `metrics_name` to -# # differentiate this instance of the rate limiter. -# if rate_limiter_instance.metrics_name: -# key = (rate_limiter_instance.metrics_name,) -# counts[key] = count_func(rate_limiter_instance) - -# return counts - - -# # We track the number of affected hosts per time-period so we can -# # differentiate one really noisy homeserver from a general -# # ratelimit tuning problem across the federation. -# LaterGauge( -# "synapse_rate_limit_sleep_affected_hosts", -# "Number of hosts that had requests put to sleep", -# ["rate_limiter_name"], -# lambda: _get_counts_from_rate_limiter_instance( -# lambda rate_limiter_instance: sum( -# ratelimiter.should_sleep() -# for ratelimiter in rate_limiter_instance.ratelimiters.values() -# ) -# ), -# ) -# LaterGauge( -# "synapse_rate_limit_reject_affected_hosts", -# "Number of hosts that had requests rejected", -# ["rate_limiter_name"], -# lambda: _get_counts_from_rate_limiter_instance( -# lambda rate_limiter_instance: sum( -# ratelimiter.should_reject() -# for ratelimiter in rate_limiter_instance.ratelimiters.values() -# ) -# ), -# ) - - -# class FederationRateLimiter: -# """Used to rate limit request per-host.""" - -# def __init__( -# self, -# clock: Clock, -# config: FederationRatelimitSettings, -# metrics_name: Optional[str] = None, -# ): -# """ -# Args: -# clock -# config -# metrics_name: The name of the rate limiter so we can differentiate it -# from the rest in the metrics. If `None`, we don't track metrics -# for this rate limiter. - -# """ -# self.metrics_name = metrics_name - -# def new_limiter() -> "_PerHostRatelimiter": -# return _PerHostRatelimiter( -# clock=clock, config=config, metrics_name=metrics_name -# ) - -# self.ratelimiters: DefaultDict[str, "_PerHostRatelimiter"] = ( -# collections.defaultdict(new_limiter) -# ) - -# with _rate_limiter_instances_lock: -# _rate_limiter_instances.add(self) - -# def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None]]": -# """Used to ratelimit an incoming request from a given host - -# Example usage: - -# with rate_limiter.ratelimit(origin) as wait_deferred: -# yield wait_deferred -# # Handle request ... - -# Args: -# host: Origin of incoming request. - -# Returns: -# context manager which returns a deferred. 
-# """ -# return self.ratelimiters[host].ratelimit(host) - - -# class _PerHostRatelimiter: -# def __init__( -# self, -# clock: Clock, -# config: FederationRatelimitSettings, -# metrics_name: Optional[str] = None, -# ): -# """ -# Args: -# clock -# config -# metrics_name: The name of the rate limiter so we can differentiate it -# from the rest in the metrics. If `None`, we don't track metrics -# for this rate limiter. -# from the rest in the metrics -# """ -# self.clock = clock -# self.metrics_name = metrics_name - -# self.window_size = config.window_size -# self.sleep_limit = config.sleep_limit -# self.sleep_sec = config.sleep_delay / 1000.0 -# self.reject_limit = config.reject_limit -# self.concurrent_requests = config.concurrent - -# # request_id objects for requests which have been slept -# self.sleeping_requests: Set[object] = set() - -# # map from request_id object to Deferred for requests which are ready -# # for processing but have been queued -# self.ready_request_queue: collections.OrderedDict[ -# object, defer.Deferred[None] -# ] = collections.OrderedDict() - -# # request id objects for requests which are in progress -# self.current_processing: Set[object] = set() - -# # times at which we have recently (within the last window_size ms) -# # received requests. -# self.request_times: List[int] = [] - -# @contextlib.contextmanager -# def ratelimit(self, host: str) -> "Iterator[defer.Deferred[None]]": -# # `contextlib.contextmanager` takes a generator and turns it into a -# # context manager. The generator should only yield once with a value -# # to be returned by manager. -# # Exceptions will be reraised at the yield. - -# self.host = host - -# request_id = object() -# # Ideally we'd use `Deferred.fromCoroutine()` here, to save on redundant -# # type-checking, but we'd need Twisted >= 21.2. -# ret = defer.ensureDeferred(self._on_enter_with_tracing(request_id)) -# try: -# yield ret -# finally: -# self._on_exit(request_id) - -# def should_reject(self) -> bool: -# """ -# Whether to reject the request if we already have too many queued up -# (either sleeping or in the ready queue). -# """ -# queue_size = len(self.ready_request_queue) + len(self.sleeping_requests) -# return queue_size > self.reject_limit - -# def should_sleep(self) -> bool: -# """ -# Whether to sleep the request if we already have too many requests coming -# through within the window. -# """ -# return len(self.request_times) > self.sleep_limit - -# async def _on_enter_with_tracing(self, request_id: object) -> None: -# maybe_metrics_cm: ContextManager = contextlib.nullcontext() -# if self.metrics_name: -# maybe_metrics_cm = queue_wait_timer.labels(self.metrics_name).time() -# with start_active_span("ratelimit wait"), maybe_metrics_cm: -# await self._on_enter(request_id) - -# def _on_enter(self, request_id: object) -> "defer.Deferred[None]": -# time_now = self.clock.time_msec() - -# # remove any entries from request_times which aren't within the window -# self.request_times[:] = [ -# r for r in self.request_times if time_now - r < self.window_size -# ] - -# # reject the request if we already have too many queued up (either -# # sleeping or in the ready queue). 
-# if self.should_reject(): -# logger.debug("Ratelimiter(%s): rejecting request", self.host) -# if self.metrics_name: -# rate_limit_reject_counter.labels(self.metrics_name).inc() -# raise LimitExceededError( -# limiter_name="rc_federation", -# retry_after_ms=int(self.window_size / self.sleep_limit), -# ) - -# self.request_times.append(time_now) - -# def queue_request() -> "defer.Deferred[None]": -# if len(self.current_processing) >= self.concurrent_requests: -# queue_defer: defer.Deferred[None] = defer.Deferred() -# self.ready_request_queue[request_id] = queue_defer -# logger.info( -# "Ratelimiter(%s): queueing request (queue now %i items)", -# self.host, -# len(self.ready_request_queue), -# ) - -# return queue_defer -# else: -# return defer.succeed(None) - -# logger.debug( -# "Ratelimit(%s) [%s]: len(self.request_times)=%d", -# self.host, -# id(request_id), -# len(self.request_times), -# ) - -# if self.should_sleep(): -# logger.debug( -# "Ratelimiter(%s) [%s]: sleeping request for %f sec", -# self.host, -# id(request_id), -# self.sleep_sec, -# ) -# if self.metrics_name: -# rate_limit_sleep_counter.labels(self.metrics_name).inc() -# ret_defer = run_in_background(self.clock.sleep, self.sleep_sec) - -# self.sleeping_requests.add(request_id) - -# def on_wait_finished(_: Any) -> "defer.Deferred[None]": -# logger.debug( -# "Ratelimit(%s) [%s]: Finished sleeping", self.host, id(request_id) -# ) -# self.sleeping_requests.discard(request_id) -# queue_defer = queue_request() -# return queue_defer - -# ret_defer.addBoth(on_wait_finished) -# else: -# ret_defer = queue_request() - -# def on_start(r: object) -> object: -# logger.debug( -# "Ratelimit(%s) [%s]: Processing req", self.host, id(request_id) -# ) -# self.current_processing.add(request_id) -# return r - -# def on_err(r: object) -> object: -# # XXX: why is this necessary? this is called before we start -# # processing the request so why would the request be in -# # current_processing? -# self.current_processing.discard(request_id) -# return r - -# def on_both(r: object) -> object: -# # Ensure that we've properly cleaned up. -# self.sleeping_requests.discard(request_id) -# self.ready_request_queue.pop(request_id, None) -# return r - -# ret_defer.addCallbacks(on_start, on_err) -# ret_defer.addBoth(on_both) -# return make_deferred_yieldable(ret_defer) - -# def _on_exit(self, request_id: object) -> None: -# logger.debug("Ratelimit(%s) [%s]: Processed req", self.host, id(request_id)) - -# # When requests complete synchronously, we will recursively start the next -# # request in the queue. To avoid stack exhaustion, we defer starting the next -# # request until the next reactor tick. - -# def start_next_request() -> None: -# # We only remove the completed request from the list when we're about to -# # start the next one, otherwise we can allow extra requests through. -# self.current_processing.discard(request_id) -# try: -# # start processing the next item on the queue. 
-# _, deferred = self.ready_request_queue.popitem(last=False) - -# with PreserveLoggingContext(): -# deferred.callback(None) -# except KeyError: -# pass - -# self.clock.call_later(0.0, start_next_request) +_rate_limiter_instances: MutableSet["FederationRateLimiter"] = WeakSet() +# Protects the _rate_limiter_instances set from concurrent access +_rate_limiter_instances_lock = threading.Lock() + + +def _get_counts_from_rate_limiter_instance( + count_func: Callable[["FederationRateLimiter"], int], +) -> Mapping[Tuple[str, ...], int]: + """Returns a count of something (slept/rejected hosts) by (metrics_name)""" + # Cast to a list to prevent it changing while the Prometheus + # thread is collecting metrics + with _rate_limiter_instances_lock: + rate_limiter_instances = list(_rate_limiter_instances) + + # Map from (metrics_name,) -> int, the number of something like slept hosts + # or rejected hosts. The key type is Tuple[str], but we leave the length + # unspecified for compatability with LaterGauge's annotations. + counts: Dict[Tuple[str, ...], int] = {} + for rate_limiter_instance in rate_limiter_instances: + # Only track metrics if they provided a `metrics_name` to + # differentiate this instance of the rate limiter. + if rate_limiter_instance.metrics_name: + key = (rate_limiter_instance.metrics_name,) + counts[key] = count_func(rate_limiter_instance) + + return counts + + +# We track the number of affected hosts per time-period so we can +# differentiate one really noisy homeserver from a general +# ratelimit tuning problem across the federation. +LaterGauge( + "synapse_rate_limit_sleep_affected_hosts", + "Number of hosts that had requests put to sleep", + ["rate_limiter_name"], + lambda: _get_counts_from_rate_limiter_instance( + lambda rate_limiter_instance: sum( + ratelimiter.should_sleep() + for ratelimiter in rate_limiter_instance.ratelimiters.values() + ) + ), +) +LaterGauge( + "synapse_rate_limit_reject_affected_hosts", + "Number of hosts that had requests rejected", + ["rate_limiter_name"], + lambda: _get_counts_from_rate_limiter_instance( + lambda rate_limiter_instance: sum( + ratelimiter.should_reject() + for ratelimiter in rate_limiter_instance.ratelimiters.values() + ) + ), +) + + +class FederationRateLimiter: + """Used to rate limit request per-host.""" + + def __init__( + self, + clock: Clock, + config: FederationRatelimitSettings, + metrics_name: Optional[str] = None, + ): + """ + Args: + clock + config + metrics_name: The name of the rate limiter so we can differentiate it + from the rest in the metrics. If `None`, we don't track metrics + for this rate limiter. + + """ + self.metrics_name = metrics_name + + def new_limiter() -> "_PerHostRatelimiter": + return _PerHostRatelimiter( + clock=clock, config=config, metrics_name=metrics_name + ) + + self.ratelimiters: DefaultDict[str, "_PerHostRatelimiter"] = ( + collections.defaultdict(new_limiter) + ) + + with _rate_limiter_instances_lock: + _rate_limiter_instances.add(self) + + def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None]]": + """Used to ratelimit an incoming request from a given host + + Example usage: + + with rate_limiter.ratelimit(origin) as wait_deferred: + yield wait_deferred + # Handle request ... + + Args: + host: Origin of incoming request. + + Returns: + context manager which returns a deferred. 
+ """ + return self.ratelimiters[host].ratelimit(host) + + +class _PerHostRatelimiter: + def __init__( + self, + clock: Clock, + config: FederationRatelimitSettings, + metrics_name: Optional[str] = None, + ): + """ + Args: + clock + config + metrics_name: The name of the rate limiter so we can differentiate it + from the rest in the metrics. If `None`, we don't track metrics + for this rate limiter. + from the rest in the metrics + """ + self.clock = clock + self.metrics_name = metrics_name + + self.window_size = config.window_size + self.sleep_limit = config.sleep_limit + self.sleep_sec = config.sleep_delay / 1000.0 + self.reject_limit = config.reject_limit + self.concurrent_requests = config.concurrent + + # request_id objects for requests which have been slept + self.sleeping_requests: Set[object] = set() + + # map from request_id object to Deferred for requests which are ready + # for processing but have been queued + self.ready_request_queue: collections.OrderedDict[ + object, defer.Deferred[None] + ] = collections.OrderedDict() + + # request id objects for requests which are in progress + self.current_processing: Set[object] = set() + + # times at which we have recently (within the last window_size ms) + # received requests. + self.request_times: List[int] = [] + + @contextlib.contextmanager + def ratelimit(self, host: str) -> "Iterator[defer.Deferred[None]]": + # `contextlib.contextmanager` takes a generator and turns it into a + # context manager. The generator should only yield once with a value + # to be returned by manager. + # Exceptions will be reraised at the yield. + + self.host = host + + request_id = object() + # Ideally we'd use `Deferred.fromCoroutine()` here, to save on redundant + # type-checking, but we'd need Twisted >= 21.2. + ret = defer.ensureDeferred(self._on_enter_with_tracing(request_id)) + try: + yield ret + finally: + self._on_exit(request_id) + + def should_reject(self) -> bool: + """ + Whether to reject the request if we already have too many queued up + (either sleeping or in the ready queue). + """ + queue_size = len(self.ready_request_queue) + len(self.sleeping_requests) + return queue_size > self.reject_limit + + def should_sleep(self) -> bool: + """ + Whether to sleep the request if we already have too many requests coming + through within the window. + """ + return len(self.request_times) > self.sleep_limit + + async def _on_enter_with_tracing(self, request_id: object) -> None: + maybe_metrics_cm: ContextManager = contextlib.nullcontext() + if self.metrics_name: + maybe_metrics_cm = queue_wait_timer.labels(self.metrics_name).time() + with start_active_span("ratelimit wait"), maybe_metrics_cm: + await self._on_enter(request_id) + + def _on_enter(self, request_id: object) -> "defer.Deferred[None]": + time_now = self.clock.time_msec() + + # remove any entries from request_times which aren't within the window + self.request_times[:] = [ + r for r in self.request_times if time_now - r < self.window_size + ] + + # reject the request if we already have too many queued up (either + # sleeping or in the ready queue). 
+ if self.should_reject(): + logger.debug("Ratelimiter(%s): rejecting request", self.host) + if self.metrics_name: + rate_limit_reject_counter.labels(self.metrics_name).inc() + raise LimitExceededError( + limiter_name="rc_federation", + retry_after_ms=int(self.window_size / self.sleep_limit), + ) + + self.request_times.append(time_now) + + def queue_request() -> "defer.Deferred[None]": + if len(self.current_processing) >= self.concurrent_requests: + queue_defer: defer.Deferred[None] = defer.Deferred() + self.ready_request_queue[request_id] = queue_defer + logger.info( + "Ratelimiter(%s): queueing request (queue now %i items)", + self.host, + len(self.ready_request_queue), + ) + + return queue_defer + else: + return defer.succeed(None) + + logger.debug( + "Ratelimit(%s) [%s]: len(self.request_times)=%d", + self.host, + id(request_id), + len(self.request_times), + ) + + if self.should_sleep(): + logger.debug( + "Ratelimiter(%s) [%s]: sleeping request for %f sec", + self.host, + id(request_id), + self.sleep_sec, + ) + if self.metrics_name: + rate_limit_sleep_counter.labels(self.metrics_name).inc() + ret_defer = run_in_background(self.clock.sleep, self.sleep_sec) + + self.sleeping_requests.add(request_id) + + def on_wait_finished(_: Any) -> "defer.Deferred[None]": + logger.debug( + "Ratelimit(%s) [%s]: Finished sleeping", self.host, id(request_id) + ) + self.sleeping_requests.discard(request_id) + queue_defer = queue_request() + return queue_defer + + ret_defer.addBoth(on_wait_finished) + else: + ret_defer = queue_request() + + def on_start(r: object) -> object: + logger.debug( + "Ratelimit(%s) [%s]: Processing req", self.host, id(request_id) + ) + self.current_processing.add(request_id) + return r + + def on_err(r: object) -> object: + # XXX: why is this necessary? this is called before we start + # processing the request so why would the request be in + # current_processing? + self.current_processing.discard(request_id) + return r + + def on_both(r: object) -> object: + # Ensure that we've properly cleaned up. + self.sleeping_requests.discard(request_id) + self.ready_request_queue.pop(request_id, None) + return r + + ret_defer.addCallbacks(on_start, on_err) + ret_defer.addBoth(on_both) + return make_deferred_yieldable(ret_defer) + + def _on_exit(self, request_id: object) -> None: + logger.debug("Ratelimit(%s) [%s]: Processed req", self.host, id(request_id)) + + # When requests complete synchronously, we will recursively start the next + # request in the queue. To avoid stack exhaustion, we defer starting the next + # request until the next reactor tick. + + def start_next_request() -> None: + # We only remove the completed request from the list when we're about to + # start the next one, otherwise we can allow extra requests through. + self.current_processing.discard(request_id) + try: + # start processing the next item on the queue. 
+ _, deferred = self.ready_request_queue.popitem(last=False) + + with PreserveLoggingContext(): + deferred.callback(None) + except KeyError: + pass + + self.clock.call_later(0.0, start_next_request) From 7651a44635cc58e22c89cb4f0ad4541298b54410 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 11:33:20 -0500 Subject: [PATCH 05/48] Indicate future plan --- scripts-dev/mypy_synapse_plugin.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py index b55ec610c73..e0574d01738 100644 --- a/scripts-dev/mypy_synapse_plugin.py +++ b/scripts-dev/mypy_synapse_plugin.py @@ -67,7 +67,11 @@ class SynapsePlugin(Plugin): def get_function_signature_hook( self, fullname: str ) -> Optional[Callable[[FunctionSigContext], FunctionLike]]: - if fullname in ("prometheus_client.metrics.Counter",): + if fullname in ( + "prometheus_client.metrics.Counter", + # TODO: Add other prometheus_client metrics that need checking as we + # refactor, see https://github.com/element-hq/synapse/issues/18592 + ): return check_prometheus_metric_instantiation return None From 944abdd09df04bc03281e32dfb6e729fe297a54f Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 11:48:16 -0500 Subject: [PATCH 06/48] Adjust errors --- scripts-dev/mypy_synapse_plugin.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py index e0574d01738..e19e514ef90 100644 --- a/scripts-dev/mypy_synapse_plugin.py +++ b/scripts-dev/mypy_synapse_plugin.py @@ -28,14 +28,12 @@ import mypy.types from mypy.erasetype import remove_instance_last_known_values from mypy.errorcodes import ErrorCode -from mypy.nodes import ARG_NAMED_OPT, TempNode, Var +from mypy.nodes import ARG_NAMED_OPT, ListExpr, StrExpr, TempNode, Var from mypy.plugin import ( + FunctionLike, FunctionSigContext, MethodSigContext, Plugin, - FunctionContext, - ClassDefContext, - FunctionLike, ) from mypy.typeops import bind_self from mypy.types import ( @@ -49,7 +47,6 @@ UninhabitedType, UnionType, ) -from mypy.nodes import StrExpr, TupleExpr, ListExpr PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL = ErrorCode( "missing-server-name-label", @@ -170,10 +167,10 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy ) else: ctx.api.fail( - f"Expected the `labelnames` argument of {signature.name} to be a list of label names, but got " - f"{labelnames_arg_expression}", + f"Expected the `labelnames` argument of {signature.name} to be a list of label names " + f"(including 'server_name'), but got {labelnames_arg_expression}", ctx.context, - code=PROMETHEUS_METRIC_MANGLED_LABELS, + code=PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL, ) return signature From 143fa3f61138020aae205a10a111e16f98afa141 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 11:52:08 -0500 Subject: [PATCH 07/48] Explain when it's okay to ignore lint --- scripts-dev/mypy_synapse_plugin.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py index e19e514ef90..f7dbbc634d6 100644 --- a/scripts-dev/mypy_synapse_plugin.py +++ b/scripts-dev/mypy_synapse_plugin.py @@ -161,14 +161,18 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy # TODO: Use `SERVER_NAME_LABEL` here if "server_name" not in labels: ctx.api.fail( - f"Expected {signature.name} to include 
'server_name' in the list of labels",
+                f"Expected {signature.name} to include 'server_name' in the list of labels."
+                " If this is a process-level metric (vs homeserver-level), use a type ignore "
+                "comment to disable this check.",
                 ctx.context,
                 code=PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL,
             )
     else:
         ctx.api.fail(
             f"Expected the `labelnames` argument of {signature.name} to be a list of label names "
-            f"(including 'server_name'), but got {labelnames_arg_expression}",
+            f"(including 'server_name'), but got {labelnames_arg_expression}.",
+            " If this is a process-level metric (vs homeserver-level), use a type ignore "
+            "comment to disable this check.",
             ctx.context,
             code=PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL,
         )

From f66758fc1cf3a3bd94c149bae89df671729e076a Mon Sep 17 00:00:00 2001
From: Eric Eastwood
Date: Fri, 4 Jul 2025 12:54:43 -0500
Subject: [PATCH 08/48] Use `SERVER_NAME_LABEL`

---
 scripts-dev/mypy_synapse_plugin.py | 30 +++++++++++++++---------------
 synapse/metrics/__init__.py        | 15 +++++++++++++++
 2 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py
index f7dbbc634d6..8821e3a8770 100644
--- a/scripts-dev/mypy_synapse_plugin.py
+++ b/scripts-dev/mypy_synapse_plugin.py
@@ -48,6 +48,8 @@
     UnionType,
 )
 
+from synapse.metrics import SERVER_NAME_LABEL
+
 PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL = ErrorCode(
     "missing-server-name-label",
     "'server_name' label required in metric",
@@ -97,16 +99,16 @@ def get_method_signature_hook(
 def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableType:
     """
     Ensure that the `prometheus_client` metrics are instantiated with `server_name` in
-    the list of labels.
+    the list of labels (use constant `SERVER_NAME_LABEL`).
 
     This is important because we support multiple Synapse instances running in the same
     process, where all metrics share a single global `REGISTRY`. The `server_name` label
     ensures metrics are correctly separated by homeserver.
 
-    There are also some metrics that apply at the process level, such as
-    CPU usage, Python garbage collection, Twisted reactor tick time which shouldn't
-    have a `server_name` label. In those cases, use a type ignore comment to
-    disable the check, e.g. `# type: ignore[missing-server-name-label]`.
+    There are also some metrics that apply at the process level, such as CPU usage,
+    Python garbage collection, Twisted reactor tick time which shouldn't have a
+    `server_name` label. In those cases, use a type ignore
+    comment to disable the check, e.g. `# type: ignore[missing-server-name-label]`.
     """
     # The true signature, this isn't being modified so this is what will be returned.
     signature: CallableType = ctx.default_signature
@@ -130,7 +132,7 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy
         f"to match the number of arguments from the function signature context ({len(ctx.args)})"
     )
 
-    # Check if the `labelnames` argument includes the `server_name` label.
+    # Check if the `labelnames` argument includes the `server_name` label (`SERVER_NAME_LABEL`).
From f66758fc1cf3a3bd94c149bae89df671729e076a Mon Sep 17 00:00:00 2001
From: Eric Eastwood
Date: Fri, 4 Jul 2025 12:54:43 -0500
Subject: [PATCH 08/48] Use `SERVER_NAME_LABEL`

---
 scripts-dev/mypy_synapse_plugin.py | 30 +++++++++++++++---------------
 synapse/metrics/__init__.py        | 15 +++++++++++++++
 2 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py
index f7dbbc634d6..8821e3a8770 100644
--- a/scripts-dev/mypy_synapse_plugin.py
+++ b/scripts-dev/mypy_synapse_plugin.py
@@ -48,6 +48,8 @@
     UnionType,
 )
 
+from synapse.metrics import SERVER_NAME_LABEL
+
 PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL = ErrorCode(
     "missing-server-name-label",
     "'server_name' label required in metric",
@@ -97,16 +99,16 @@ def get_method_signature_hook(
 
 def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableType:
     """
     Ensure that the `prometheus_client` metrics are instantiated with `server_name` in
-    the list of labels.
+    the list of labels (use constant `SERVER_NAME_LABEL`).
 
     This is important because we support multiple Synapse instances running in the
     same process, where all metrics share a single global `REGISTRY`. The
     `server_name` label ensures metrics are correctly separated by homeserver.
 
-    There are also some metrics that apply at the process level, such as
-    CPU usage, Python garbage collection, Twisted reactor tick time which shouldn't
-    have a `server_name` label. In those cases, use a type ignore comment to
-    disable the check, e.g. `# type: ignore[missing-server-name-label]`.
+    There are also some metrics that apply at the process level, such as CPU usage,
+    Python garbage collection, Twisted reactor tick time which shouldn't have a
+    `server_name` label. In those cases, use a type ignore
+    comment to disable the check, e.g. `# type: ignore[missing-server-name-label]`.
     """
     # The true signature, this isn't being modified so this is what will be returned.
     signature: CallableType = ctx.default_signature
@@ -130,7 +132,7 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy
         f"to match the number of arguments from the function signature context ({len(ctx.args)})"
     )
 
-    # Check if the `labelnames` argument includes the `server_name` label.
+    # Check if the `labelnames` argument includes the `server_name` label (`SERVER_NAME_LABEL`).
     #
     # `ctx.args` should look like this:
     # ```
@@ -156,22 +158,20 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy
             code=PROMETHEUS_METRIC_MANGLED_LABELS,
         )
 
-        # Check if the `labelnames` argument includes the `server_name` label.
-        #
-        # TODO: Use `SERVER_NAME_LABEL` here
-        if "server_name" not in labels:
+        # Check if the `labelnames` argument includes the `server_name` label (`SERVER_NAME_LABEL`).
+        if SERVER_NAME_LABEL not in labels:
             ctx.api.fail(
-                f"Expected {signature.name} to include 'server_name' in the list of labels."
-                " If this is a process-level metric (vs homeserver-level), use a type ignore "
-                "comment to disable this check.",
+                f"Expected {signature.name} to include 'server_name' (use constant `SERVER_NAME_LABEL`) "
+                "in the list of labels. If this is a process-level metric (vs homeserver-level), "
+                "use a type ignore comment to disable this check.",
                 ctx.context,
                 code=PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL,
             )
         else:
             ctx.api.fail(
                 f"Expected the `labelnames` argument of {signature.name} to be a list of label names "
-                f"(including 'server_name'), but got {labelnames_arg_expression}."
-                " If this is a process-level metric (vs homeserver-level), use a type ignore "
-                "comment to disable this check.",
+                f"(including 'server_name', use constant `SERVER_NAME_LABEL`), but got {labelnames_arg_expression}. "
+                "If this is a process-level metric (vs homeserver-level), use a type ignore "
+                "comment to disable this check.",
                 ctx.context,
                 code=PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL,
             )
diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py
index 86ac2c23959..da58df6b968 100644
--- a/synapse/metrics/__init__.py
+++ b/synapse/metrics/__init__.py
@@ -66,6 +66,21 @@
 
 HAVE_PROC_SELF_STAT = os.path.exists("/proc/self/stat")
 
+SERVER_NAME_LABEL = "server_name"
+"""
+The `server_name` label is used to identify the homeserver that the metrics correspond
+to. Because we support multiple instances of Synapse running in the same process and all
+metrics are in a single global `REGISTRY`, we need to manually label any metrics.
+
+In the case of a Synapse homeserver, this should be set to the homeserver name
+(`hs.hostname`).
+
+We're purposely not using the `instance` label for this purpose as that should be "The
+<host>:<port> part of the target's URL that was scraped." Also: "In Prometheus
+terms, an endpoint you can scrape is called an *instance*, usually corresponding to a
+single process." (source: https://prometheus.io/docs/concepts/jobs_instances/)
+"""
+
 
 class _RegistryProxy:
 
     @staticmethod
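
For reference, a minimal sketch of the labelling pattern this patch starts to enforce
(illustrative metric and homeserver names only; the next patch teaches the lint to
recognise the `SERVER_NAME_LABEL` constant itself rather than the bare string):

    from prometheus_client import Counter

    from synapse.metrics import SERVER_NAME_LABEL

    # Per-homeserver metric: `SERVER_NAME_LABEL` keeps tenants separated in the
    # shared global `REGISTRY`.
    example_events_counter = Counter(
        "synapse_example_events_handled",
        "Number of events handled, per homeserver",
        labelnames=["type", SERVER_NAME_LABEL],
    )

    # At use time, the label is filled in with the homeserver name (`hs.hostname`).
    example_events_counter.labels(
        type="message", **{SERVER_NAME_LABEL: "hs1.example.com"}
    ).inc()
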
From 3705aecdb3b0216a15211f84bbb0137014b42361 Mon Sep 17 00:00:00 2001
From: Eric Eastwood
Date: Fri, 4 Jul 2025 13:09:52 -0500
Subject: [PATCH 09/48] Support `NameExpr` `SERVER_NAME_LABEL` in lint

---
 scripts-dev/mypy_synapse_plugin.py | 57 ++++++++++++------------------
 1 file changed, 22 insertions(+), 35 deletions(-)

diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py
index 8821e3a8770..224b4562c44 100644
--- a/scripts-dev/mypy_synapse_plugin.py
+++ b/scripts-dev/mypy_synapse_plugin.py
@@ -28,7 +28,7 @@
 import mypy.types
 from mypy.erasetype import remove_instance_last_known_values
 from mypy.errorcodes import ErrorCode
-from mypy.nodes import ARG_NAMED_OPT, ListExpr, StrExpr, TempNode, Var
+from mypy.nodes import ARG_NAMED_OPT, ListExpr, NameExpr, TempNode, Var
 from mypy.plugin import (
     FunctionLike,
     FunctionSigContext,
@@ -48,16 +48,9 @@
     UnionType,
 )
 
-from synapse.metrics import SERVER_NAME_LABEL
-
 PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL = ErrorCode(
     "missing-server-name-label",
-    "'server_name' label required in metric",
-    category="per-homeserver-tenant-metrics",
-)
-PROMETHEUS_METRIC_MANGLED_LABELS = ErrorCode(
-    "mangled-labels",
-    "`labelnames` argument should be a list of strings",
+    "`SERVER_NAME_LABEL` required in metric",
     category="per-homeserver-tenant-metrics",
 )
 
@@ -98,17 +91,17 @@ def get_method_signature_hook(
 
 def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableType:
     """
-    Ensure that the `prometheus_client` metrics are instantiated with `server_name` in
-    the list of labels (use constant `SERVER_NAME_LABEL`).
+    Ensure that the `prometheus_client` metrics include the `SERVER_NAME_LABEL` label
+    when instantiated.
 
     This is important because we support multiple Synapse instances running in the
     same process, where all metrics share a single global `REGISTRY`. The
     `server_name` label ensures metrics are correctly separated by homeserver.
 
     There are also some metrics that apply at the process level, such as CPU usage,
-    Python garbage collection, Twisted reactor tick time which shouldn't have a
-    `server_name` label. In those cases, use a type ignore
-    comment to disable the check, e.g. `# type: ignore[missing-server-name-label]`.
+    Python garbage collection, Twisted reactor tick time which shouldn't have the
+    `SERVER_NAME_LABEL`. In those cases, use a type ignore comment to disable the
+    check, e.g. `# type: ignore[missing-server-name-label]`.
     """
     # The true signature, this isn't being modified so this is what will be returned.
     signature: CallableType = ctx.default_signature
@@ -132,7 +125,7 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy
         f"to match the number of arguments from the function signature context ({len(ctx.args)})"
     )
 
-    # Check if the `labelnames` argument includes the `server_name` label (`SERVER_NAME_LABEL`).
+    # Check if the `labelnames` argument includes `SERVER_NAME_LABEL`
     #
     # `ctx.args` should look like this:
     # ```
@@ -145,34 +138,28 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy
     # ```
     labelnames_arg_expression = ctx.args[2][0] if len(ctx.args[2]) > 0 else None
     if isinstance(labelnames_arg_expression, ListExpr):
-        # Collect the label names as a list of strings.
- labels = [] - for labelname_expression in labelnames_arg_expression.items: - if isinstance(labelname_expression, StrExpr): - labels.append(labelname_expression.value) - else: - ctx.api.fail( - f"Expected all items in the `labelnames` argument of {signature.name} to be strings, but got " - f"{labelname_expression.__class__.__name__}", - ctx.context, - code=PROMETHEUS_METRIC_MANGLED_LABELS, - ) - # Check if the `labelnames` argument includes the `server_name` label (`SERVER_NAME_LABEL`). - if SERVER_NAME_LABEL not in labels: + for labelname_expression in labelnames_arg_expression.items: + if ( + isinstance(labelname_expression, NameExpr) + and labelname_expression.fullname == "synapse.metrics.SERVER_NAME_LABEL" + ): + # Found the `SERVER_NAME_LABEL`, all good! + break + else: ctx.api.fail( - f"Expected {signature.name} to include 'server_name' (use constant `SERVER_NAME_LABEL`) " - "in the list of labels. If this is a process-level metric (vs homeserver-level), " - "use a type ignore comment to disable this check.", + f"Expected {signature.name} to include `SERVER_NAME_LABEL` in the list of labels. " + "If this is a process-level metric (vs homeserver-level), use a type ignore comment " + "to disable this check.", ctx.context, code=PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL, ) else: ctx.api.fail( f"Expected the `labelnames` argument of {signature.name} to be a list of label names " - f"(including 'server_name', use constant `SERVER_NAME_LABEL`), but got {labelnames_arg_expression}. " - "If this is a process-level metric (vs homeserver-level), use a type ignore " - "comment to disable this check.", + f"(including `SERVER_NAME_LABEL`), but got {labelnames_arg_expression}. " + "If this is a process-level metric (vs homeserver-level), use a type ignore comment " + "to disable this check.", ctx.context, code=PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL, ) From 2ba8a31c7bfa00f62bc5d7a50829d9ad7fbbc22e Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 13:25:38 -0500 Subject: [PATCH 10/48] Fill in `synapse/notifier.py` --- synapse/notifier.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/synapse/notifier.py b/synapse/notifier.py index 6190432b877..887957b151b 100644 --- a/synapse/notifier.py +++ b/synapse/notifier.py @@ -50,7 +50,7 @@ from synapse.logging import issue9533_logger from synapse.logging.context import PreserveLoggingContext from synapse.logging.opentracing import log_kv, start_active_span -from synapse.metrics import LaterGauge +from synapse.metrics import SERVER_NAME_LABEL, LaterGauge from synapse.streams.config import PaginationConfig from synapse.types import ( ISynapseReactor, @@ -74,10 +74,10 @@ logger = logging.getLogger(__name__) -notified_events_counter = Counter("synapse_notifier_notified_events", "") - users_woken_by_stream_counter = Counter( - "synapse_notifier_users_woken_by_stream", "", ["stream"] + "synapse_notifier_users_woken_by_stream", + "", + labelnames=["stream", SERVER_NAME_LABEL], ) T = TypeVar("T") @@ -224,6 +224,7 @@ def __init__(self, hs: "HomeServer"): self.room_to_user_streams: Dict[str, Set[_NotifierUserStream]] = {} self.hs = hs + self.server_name = hs.hostname self._storage_controllers = hs.get_storage_controllers() self.event_sources = hs.get_event_sources() self.store = hs.get_datastores().main @@ -350,9 +351,10 @@ async def on_un_partial_stated_room( for listener in listeners: listener.callback(current_token) - users_woken_by_stream_counter.labels(StreamKeyType.UN_PARTIAL_STATED_ROOMS).inc( - 
len(user_streams) - ) + users_woken_by_stream_counter.labels( + stream=StreamKeyType.UN_PARTIAL_STATED_ROOMS, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc(len(user_streams)) # Poke the replication so that other workers also see the write to # the un-partial-stated rooms stream. @@ -575,7 +577,10 @@ def on_new_event( listener.callback(current_token) if user_streams: - users_woken_by_stream_counter.labels(stream_key).inc(len(user_streams)) + users_woken_by_stream_counter.labels( + stream=stream_key, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc(len(user_streams)) self.notify_replication() From a28be1acc650ae8909a02becec3b7d129b4003d9 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 13:35:44 -0500 Subject: [PATCH 11/48] Fill in `synapse/appservice/api.py` --- synapse/appservice/api.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/synapse/appservice/api.py b/synapse/appservice/api.py index 19322471dc5..88a22185df3 100644 --- a/synapse/appservice/api.py +++ b/synapse/appservice/api.py @@ -48,6 +48,7 @@ from synapse.events.utils import SerializeEventConfig, serialize_event from synapse.http.client import SimpleHttpClient, is_unknown_endpoint from synapse.logging import opentracing +from synapse.metrics import SERVER_NAME_LABEL from synapse.types import DeviceListUpdates, JsonDict, JsonMapping, ThirdPartyInstanceID from synapse.util.caches.response_cache import ResponseCache @@ -59,29 +60,31 @@ sent_transactions_counter = Counter( "synapse_appservice_api_sent_transactions", "Number of /transactions/ requests sent", - ["service"], + labelnames=["service", SERVER_NAME_LABEL], ) failed_transactions_counter = Counter( "synapse_appservice_api_failed_transactions", "Number of /transactions/ requests that failed to send", - ["service"], + labelnames=["service", SERVER_NAME_LABEL], ) sent_events_counter = Counter( - "synapse_appservice_api_sent_events", "Number of events sent to the AS", ["service"] + "synapse_appservice_api_sent_events", + "Number of events sent to the AS", + labelnames=["service", SERVER_NAME_LABEL], ) sent_ephemeral_counter = Counter( "synapse_appservice_api_sent_ephemeral", "Number of ephemeral events sent to the AS", - ["service"], + labelnames=["service", SERVER_NAME_LABEL], ) sent_todevice_counter = Counter( "synapse_appservice_api_sent_todevice", "Number of todevice messages sent to the AS", - ["service"], + labelnames=["service", SERVER_NAME_LABEL], ) HOUR_IN_MS = 60 * 60 * 1000 @@ -126,6 +129,7 @@ class ApplicationServiceApi(SimpleHttpClient): def __init__(self, hs: "HomeServer"): super().__init__(hs) + self.server_name = hs.hostname self.clock = hs.get_clock() self.config = hs.config.appservice @@ -378,6 +382,7 @@ async def push_bulk( "left": list(device_list_summary.left), } + labels = {"service": service.id, SERVER_NAME_LABEL: self.server_name} try: args = None if self.config.use_appservice_legacy_authorization: @@ -395,10 +400,10 @@ async def push_bulk( service.url, [event.get("event_id") for event in events], ) - sent_transactions_counter.labels(service.id).inc() - sent_events_counter.labels(service.id).inc(len(serialized_events)) - sent_ephemeral_counter.labels(service.id).inc(len(ephemeral)) - sent_todevice_counter.labels(service.id).inc(len(to_device_messages)) + sent_transactions_counter.labels(**labels).inc() + sent_events_counter.labels(**labels).inc(len(serialized_events)) + sent_ephemeral_counter.labels(**labels).inc(len(ephemeral)) + 
sent_todevice_counter.labels(**labels).inc(len(to_device_messages)) return True except CodeMessageException as e: logger.warning( @@ -417,7 +422,7 @@ async def push_bulk( ex.args, exc_info=logger.isEnabledFor(logging.DEBUG), ) - failed_transactions_counter.labels(service.id).inc() + failed_transactions_counter.labels(**labels).inc() return False async def claim_client_keys( From 48782ac975988952ad4f1531762d4ab8f96862e0 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 13:44:02 -0500 Subject: [PATCH 12/48] Fill in `synapse/federation/federation_client.py` --- synapse/federation/federation_client.py | 29 +++++++++++++++++++------ 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/synapse/federation/federation_client.py b/synapse/federation/federation_client.py index 2d1da70793c..4566ace7091 100644 --- a/synapse/federation/federation_client.py +++ b/synapse/federation/federation_client.py @@ -74,6 +74,7 @@ from synapse.http.client import is_unknown_endpoint from synapse.http.types import QueryParams from synapse.logging.opentracing import SynapseTags, log_kv, set_tag, tag_args, trace +from synapse.metrics import SERVER_NAME_LABEL from synapse.types import JsonDict, StrCollection, UserID, get_domain_from_id from synapse.types.handlers.policy_server import RECOMMENDATION_OK, RECOMMENDATION_SPAM from synapse.util.async_helpers import concurrently_execute @@ -85,7 +86,9 @@ logger = logging.getLogger(__name__) -sent_queries_counter = Counter("synapse_federation_client_sent_queries", "", ["type"]) +sent_queries_counter = Counter( + "synapse_federation_client_sent_queries", "", labelnames=["type", SERVER_NAME_LABEL] +) PDU_RETRY_TIME_MS = 1 * 60 * 1000 @@ -137,7 +140,7 @@ def __init__(self, hs: "HomeServer"): self.state = hs.get_state_handler() self.transport_layer = hs.get_federation_transport_client() - self.hostname = hs.hostname + self.server_name = hs.hostname self.signing_key = hs.signing_key # Cache mapping `event_id` to a tuple of the event itself and the `pull_origin` @@ -207,7 +210,10 @@ async def make_query( Returns: The JSON object from the response """ - sent_queries_counter.labels(query_type).inc() + sent_queries_counter.labels( + type=query_type, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() return await self.transport_layer.make_query( destination, @@ -229,7 +235,10 @@ async def query_client_keys( Returns: The JSON object from the response """ - sent_queries_counter.labels("client_device_keys").inc() + sent_queries_counter.labels( + type="client_device_keys", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() return await self.transport_layer.query_client_keys( destination, content, timeout ) @@ -240,7 +249,10 @@ async def query_user_devices( """Query the device keys for a list of user ids hosted on a remote server. """ - sent_queries_counter.labels("user_devices").inc() + sent_queries_counter.labels( + type="user_devices", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() return await self.transport_layer.query_user_devices( destination, user_id, timeout ) @@ -262,7 +274,10 @@ async def claim_client_keys( Returns: The JSON object from the response """ - sent_queries_counter.labels("client_one_time_keys").inc() + sent_queries_counter.labels( + type="client_one_time_keys", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() # Convert the query with counts into a stable and unstable query and check # if attempting to claim more than 1 OTK. 
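
The same shape repeats throughout these "Fill in" patches: cache `hs.hostname` once in
`__init__`, then pass it to every `.labels(...)` call via dict-expansion, since
`SERVER_NAME_LABEL` is a variable and so cannot appear to the left of `=` as a plain
keyword argument. A condensed sketch with illustrative names:

    from prometheus_client import Counter

    from synapse.metrics import SERVER_NAME_LABEL

    example_queries_counter = Counter(
        "synapse_example_sent_queries",
        "Queries sent, by type and homeserver",
        labelnames=["type", SERVER_NAME_LABEL],
    )

    class ExampleClient:
        def __init__(self, server_name: str) -> None:
            # Cached once so every metric update can be labelled with our homeserver.
            self.server_name = server_name

        def record_query(self, query_type: str) -> None:
            example_queries_counter.labels(
                type=query_type, **{SERVER_NAME_LABEL: self.server_name}
            ).inc()
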
@@ -1068,7 +1083,7 @@ async def send_request(destination: str) -> Tuple[str, EventBase, RoomVersion]: # there's some we never care about ev = builder.create_local_event_from_event_dict( self._clock, - self.hostname, + self.server_name, self.signing_key, room_version=room_version, event_dict=pdu_dict, From cdbeb4fd723f6f5e56d05f1bbad57bae1c2277cf Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 13:46:14 -0500 Subject: [PATCH 13/48] Fill in `synapse/federation/federation_server.py` --- synapse/federation/federation_server.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/synapse/federation/federation_server.py b/synapse/federation/federation_server.py index 8718b704017..90ccc904917 100644 --- a/synapse/federation/federation_server.py +++ b/synapse/federation/federation_server.py @@ -82,6 +82,7 @@ tag_args, trace, ) +from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics.background_process_metrics import wrap_as_background_process from synapse.replication.http.federation import ( ReplicationFederationSendEduRestServlet, @@ -105,12 +106,18 @@ logger = logging.getLogger(__name__) -received_pdus_counter = Counter("synapse_federation_server_received_pdus", "") +received_pdus_counter = Counter( + "synapse_federation_server_received_pdus", "", labelnames=[SERVER_NAME_LABEL] +) -received_edus_counter = Counter("synapse_federation_server_received_edus", "") +received_edus_counter = Counter( + "synapse_federation_server_received_edus", "", labelnames=[SERVER_NAME_LABEL] +) received_queries_counter = Counter( - "synapse_federation_server_received_queries", "", ["type"] + "synapse_federation_server_received_queries", + "", + labelnames=["type", SERVER_NAME_LABEL], ) pdu_process_time = Histogram( @@ -424,7 +431,9 @@ async def _handle_pdus_in_txn( report back to the sending server. 
""" - received_pdus_counter.inc(len(transaction.pdus)) + received_pdus_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc( + len(transaction.pdus) + ) origin_host, _ = parse_server_name(origin) @@ -543,7 +552,7 @@ async def _handle_edus_in_txn(self, origin: str, transaction: Transaction) -> No """Process the EDUs in a received transaction.""" async def _process_edu(edu_dict: JsonDict) -> None: - received_edus_counter.inc() + received_edus_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc() edu = Edu( origin=origin, @@ -658,7 +667,10 @@ async def on_pdu_request( async def on_query_request( self, query_type: str, args: Dict[str, str] ) -> Tuple[int, Dict[str, Any]]: - received_queries_counter.labels(query_type).inc() + received_queries_counter.labels( + type=query_type, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() resp = await self.registry.on_query(query_type, args) return 200, resp From 5cd7f3e9b0b419fc72903c92edd9b459f466ef69 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 13:48:21 -0500 Subject: [PATCH 14/48] Fill in `synapse/federation/sender/__init__.py` --- synapse/federation/sender/__init__.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/synapse/federation/sender/__init__.py b/synapse/federation/sender/__init__.py index 2eef7b707d7..c695f5d494c 100644 --- a/synapse/federation/sender/__init__.py +++ b/synapse/federation/sender/__init__.py @@ -160,6 +160,7 @@ from synapse.federation.units import Edu from synapse.logging.context import make_deferred_yieldable, run_in_background from synapse.metrics import ( + SERVER_NAME_LABEL, LaterGauge, event_processing_loop_counter, event_processing_loop_room_count, @@ -189,11 +190,13 @@ sent_pdus_destination_dist_count = Counter( "synapse_federation_client_sent_pdu_destinations_count", "Number of PDUs queued for sending to one or more destinations", + labelnames=[SERVER_NAME_LABEL], ) sent_pdus_destination_dist_total = Counter( "synapse_federation_client_sent_pdu_destinations", "Total number of PDUs queued for sending across all destinations", + labelnames=[SERVER_NAME_LABEL], ) # Time (in s) to wait before trying to wake up destinations that have @@ -723,8 +726,12 @@ async def _send_pdu(self, pdu: EventBase, destinations: Iterable[str]) -> None: if not destinations: return - sent_pdus_destination_dist_total.inc(len(destinations)) - sent_pdus_destination_dist_count.inc() + sent_pdus_destination_dist_total.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc(len(destinations)) + sent_pdus_destination_dist_count.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc() assert pdu.internal_metadata.stream_ordering From 6847cc3800fc5e9d65fe6eab367830c92fe41880 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 13:50:18 -0500 Subject: [PATCH 15/48] Fill in `synapse/federation/sender/per_destination_queue.py` --- .../federation/sender/per_destination_queue.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/synapse/federation/sender/per_destination_queue.py b/synapse/federation/sender/per_destination_queue.py index b3f65e8237e..9ab95a988ea 100644 --- a/synapse/federation/sender/per_destination_queue.py +++ b/synapse/federation/sender/per_destination_queue.py @@ -40,7 +40,7 @@ from synapse.handlers.presence import format_user_presence_state from synapse.logging import issue9533_logger from synapse.logging.opentracing import SynapseTags, set_tag -from synapse.metrics import sent_transactions_counter +from synapse.metrics 
import SERVER_NAME_LABEL, sent_transactions_counter from synapse.metrics.background_process_metrics import run_as_background_process from synapse.types import JsonDict, ReadReceipt from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter @@ -56,13 +56,15 @@ sent_edus_counter = Counter( - "synapse_federation_client_sent_edus", "Total number of EDUs successfully sent" + "synapse_federation_client_sent_edus", + "Total number of EDUs successfully sent", + labelnames=[SERVER_NAME_LABEL], ) sent_edus_by_type = Counter( "synapse_federation_client_sent_edus_by_type", "Number of sent EDUs successfully sent, by event type", - ["type"], + labelnames=["type", SERVER_NAME_LABEL], ) @@ -361,9 +363,14 @@ async def _transaction_transmission_loop(self) -> None: ) sent_transactions_counter.inc() - sent_edus_counter.inc(len(pending_edus)) + sent_edus_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc(len(pending_edus)) for edu in pending_edus: - sent_edus_by_type.labels(edu.edu_type).inc() + sent_edus_by_type.labels( + type=edu.edu_type, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() except NotRetryingDestination as e: logger.debug( From 70547f39d45acfcd9d55fbeb787d571cd3e55ae6 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 13:52:47 -0500 Subject: [PATCH 16/48] Fill in `synapse/handlers/appservice.py` --- synapse/handlers/appservice.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/synapse/handlers/appservice.py b/synapse/handlers/appservice.py index f3bbdb5a05b..ec624e405d7 100644 --- a/synapse/handlers/appservice.py +++ b/synapse/handlers/appservice.py @@ -42,6 +42,7 @@ from synapse.handlers.presence import format_user_presence_state from synapse.logging.context import make_deferred_yieldable, run_in_background from synapse.metrics import ( + SERVER_NAME_LABEL, event_processing_loop_counter, event_processing_loop_room_count, ) @@ -68,11 +69,14 @@ logger = logging.getLogger(__name__) -events_processed_counter = Counter("synapse_handlers_appservice_events_processed", "") +events_processed_counter = Counter( + "synapse_handlers_appservice_events_processed", "", labelnames=[SERVER_NAME_LABEL] +) class ApplicationServicesHandler: def __init__(self, hs: "HomeServer"): + self.server_name = hs.hostname self.store = hs.get_datastores().main self.is_mine_id = hs.is_mine_id self.appservice_api = hs.get_application_service_api() @@ -200,7 +204,9 @@ async def handle_room_events(events: Iterable[EventBase]) -> None: "appservice_sender" ).set(upper_bound) - events_processed_counter.inc(len(events)) + events_processed_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc(len(events)) event_processing_loop_room_count.labels("appservice_sender").inc( len(events_by_room) From ecbcdeaaa2186c5a015c1a29b6f05943eacf7a4d Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 13:57:41 -0500 Subject: [PATCH 17/48] Fill in `synapse/handlers/auth.py` --- synapse/handlers/auth.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/synapse/handlers/auth.py b/synapse/handlers/auth.py index 9e3e70ec1c0..a802941340b 100644 --- a/synapse/handlers/auth.py +++ b/synapse/handlers/auth.py @@ -70,6 +70,7 @@ from synapse.http.server import finish_request, respond_with_html from synapse.http.site import SynapseRequest from synapse.logging.context import defer_to_thread +from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics.background_process_metrics import run_as_background_process 
from synapse.storage.databases.main.registration import ( LoginTokenExpired, @@ -95,7 +96,7 @@ invalid_login_token_counter = Counter( "synapse_user_login_invalid_login_tokens", "Counts the number of rejected m.login.token on /login", - ["reason"], + labelnames=["reason", SERVER_NAME_LABEL], ) @@ -198,6 +199,7 @@ class AuthHandler: SESSION_EXPIRE_MS = 48 * 60 * 60 * 1000 def __init__(self, hs: "HomeServer"): + self.server_name = hs.hostname self.store = hs.get_datastores().main self.auth = hs.get_auth() self.auth_blocking = hs.get_auth_blocking() @@ -270,8 +272,6 @@ def __init__(self, hs: "HomeServer"): hs.config.sso.sso_account_deactivated_template ) - self._server_name = hs.config.server.server_name - # cast to tuple for use with str.startswith self._whitelisted_sso_clients = tuple(hs.config.sso.sso_client_whitelist) @@ -1477,11 +1477,20 @@ async def consume_login_token(self, login_token: str) -> LoginTokenLookupResult: try: return await self.store.consume_login_token(login_token) except LoginTokenExpired: - invalid_login_token_counter.labels("expired").inc() + invalid_login_token_counter.labels( + reason="expired", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() except LoginTokenReused: - invalid_login_token_counter.labels("reused").inc() + invalid_login_token_counter.labels( + reason="reused", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() except NotFoundError: - invalid_login_token_counter.labels("not found").inc() + invalid_login_token_counter.labels( + reason="not found", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() raise AuthError(403, "Invalid login token", errcode=Codes.FORBIDDEN) @@ -1856,7 +1865,7 @@ async def complete_sso_login( html = self._sso_redirect_confirm_template.render( display_url=display_url, redirect_url=redirect_url, - server_name=self._server_name, + server_name=self.server_name, new_user=new_user, user_id=registered_user_id, user_profile=user_profile_data, From 6e014d327fa698865f0fa1a9575c639035767fbe Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 14:00:31 -0500 Subject: [PATCH 18/48] Fill in `synapse/handlers/federation_event.py` --- synapse/handlers/federation_event.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/synapse/handlers/federation_event.py b/synapse/handlers/federation_event.py index 1e738f484f9..cbdbd10cae4 100644 --- a/synapse/handlers/federation_event.py +++ b/synapse/handlers/federation_event.py @@ -76,6 +76,7 @@ tag_args, trace, ) +from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics.background_process_metrics import run_as_background_process from synapse.replication.http.devices import ( ReplicationMultiUserDevicesResyncRestServlet, @@ -108,6 +109,7 @@ soft_failed_event_counter = Counter( "synapse_federation_soft_failed_events_total", "Events received over federation that we marked as soft_failed", + labelnames=[SERVER_NAME_LABEL], ) # Added to debug performance and track progress on optimizations @@ -149,6 +151,7 @@ class FederationEventHandler: """ def __init__(self, hs: "HomeServer"): + self.server_name = hs.hostname self._clock = hs.get_clock() self._store = hs.get_datastores().main self._state_store = hs.get_datastores().state @@ -173,7 +176,6 @@ def __init__(self, hs: "HomeServer"): self._is_mine_id = hs.is_mine_id self._is_mine_server_name = hs.is_mine_server_name - self._server_name = hs.hostname self._instance_name = hs.get_instance_name() self._config = hs.config @@ -257,7 +259,7 @@ async def on_receive_pdu(self, origin: str, pdu: EventBase) -> None: # 
Note that if we were never in the room then we would have already # dropped the event, since we wouldn't know the room version. is_in_room = await self._event_auth_handler.is_host_in_room( - room_id, self._server_name + room_id, self.server_name ) if not is_in_room: logger.info( @@ -2062,7 +2064,9 @@ async def _check_for_soft_fail( "hs": origin, }, ) - soft_failed_event_counter.inc() + soft_failed_event_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc() event.internal_metadata.soft_failed = True async def _load_or_fetch_auth_events_for_event( From 1fadfedbf2853637a187a04f9b2b87bbc246e133 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 14:11:44 -0500 Subject: [PATCH 19/48] Fill in `synapse/handlers/presence.py` --- synapse/handlers/presence.py | 120 +++++++++++++++++++++++++------- tests/handlers/test_presence.py | 9 +++ 2 files changed, 102 insertions(+), 27 deletions(-) diff --git a/synapse/handlers/presence.py b/synapse/handlers/presence.py index 390cafa8f63..bbe81dbf4f3 100644 --- a/synapse/handlers/presence.py +++ b/synapse/handlers/presence.py @@ -105,7 +105,7 @@ from synapse.appservice import ApplicationService from synapse.events.presence_router import PresenceRouter from synapse.logging.context import run_in_background -from synapse.metrics import LaterGauge +from synapse.metrics import SERVER_NAME_LABEL, LaterGauge from synapse.metrics.background_process_metrics import ( run_as_background_process, wrap_as_background_process, @@ -137,24 +137,40 @@ logger = logging.getLogger(__name__) -notified_presence_counter = Counter("synapse_handler_presence_notified_presence", "") +notified_presence_counter = Counter( + "synapse_handler_presence_notified_presence", "", labelnames=[SERVER_NAME_LABEL] +) federation_presence_out_counter = Counter( - "synapse_handler_presence_federation_presence_out", "" + "synapse_handler_presence_federation_presence_out", + "", + labelnames=[SERVER_NAME_LABEL], +) +presence_updates_counter = Counter( + "synapse_handler_presence_presence_updates", "", labelnames=[SERVER_NAME_LABEL] +) +timers_fired_counter = Counter( + "synapse_handler_presence_timers_fired", "", labelnames=[SERVER_NAME_LABEL] ) -presence_updates_counter = Counter("synapse_handler_presence_presence_updates", "") -timers_fired_counter = Counter("synapse_handler_presence_timers_fired", "") federation_presence_counter = Counter( - "synapse_handler_presence_federation_presence", "" + "synapse_handler_presence_federation_presence", "", labelnames=[SERVER_NAME_LABEL] +) +bump_active_time_counter = Counter( + "synapse_handler_presence_bump_active_time", "", labelnames=[SERVER_NAME_LABEL] ) -bump_active_time_counter = Counter("synapse_handler_presence_bump_active_time", "") -get_updates_counter = Counter("synapse_handler_presence_get_updates", "", ["type"]) +get_updates_counter = Counter( + "synapse_handler_presence_get_updates", "", labelnames=["type", SERVER_NAME_LABEL] +) notify_reason_counter = Counter( - "synapse_handler_presence_notify_reason", "", ["locality", "reason"] + "synapse_handler_presence_notify_reason", + "", + labelnames=["locality", "reason", SERVER_NAME_LABEL], ) state_transition_counter = Counter( - "synapse_handler_presence_state_transition", "", ["locality", "from", "to"] + "synapse_handler_presence_state_transition", + "", + labelnames=["locality", "from", "to", SERVER_NAME_LABEL], ) # If a user was last active in the last LAST_ACTIVE_GRANULARITY, consider them @@ -484,6 +500,7 @@ def __exit__( class WorkerPresenceHandler(BasePresenceHandler): 
def __init__(self, hs: "HomeServer"): super().__init__(hs) + self.server_name = hs.hostname self._presence_writer_instance = hs.config.worker.writers.presence[0] # Route presence EDUs to the right worker @@ -666,7 +683,9 @@ async def process_replication_rows( old_state = self.user_to_current_state.get(new_state.user_id) self.user_to_current_state[new_state.user_id] = new_state is_mine = self.is_mine_id(new_state.user_id) - if not old_state or should_notify(old_state, new_state, is_mine): + if not old_state or should_notify( + old_state, new_state, is_mine, self.server_name + ): state_to_notify.append(new_state) stream_id = token @@ -747,6 +766,7 @@ async def bump_presence_active_time( class PresenceHandler(BasePresenceHandler): def __init__(self, hs: "HomeServer"): super().__init__(hs) + self.server_name = hs.hostname self.wheel_timer: WheelTimer[str] = WheelTimer() self.notifier = hs.get_notifier() @@ -969,6 +989,7 @@ async def _update_states( prev_state, new_state, is_mine=self.is_mine_id(user_id), + our_server_name=self.server_name, wheel_timer=self.wheel_timer, now=now, # When overriding disabled presence, don't kick off all the @@ -988,10 +1009,14 @@ async def _update_states( # TODO: We should probably ensure there are no races hereafter - presence_updates_counter.inc(len(new_states)) + presence_updates_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc(len(new_states)) if to_notify: - notified_presence_counter.inc(len(to_notify)) + notified_presence_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc(len(to_notify)) await self._persist_and_notify(list(to_notify.values())) self.unpersisted_users_changes |= {s.user_id for s in new_states} @@ -1010,7 +1035,9 @@ async def _update_states( if user_id not in to_notify } if to_federation_ping: - federation_presence_out_counter.inc(len(to_federation_ping)) + federation_presence_out_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc(len(to_federation_ping)) hosts_to_states = await get_interested_remotes( self.store, @@ -1060,7 +1087,9 @@ async def _handle_timeouts(self) -> None: for user_id in users_to_check ] - timers_fired_counter.inc(len(states)) + timers_fired_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc( + len(states) + ) # Set of user ID & device IDs which are currently syncing. 
syncing_user_devices = { @@ -1094,7 +1123,7 @@ async def bump_presence_active_time( user_id = user.to_string() - bump_active_time_counter.inc() + bump_active_time_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc() now = self.clock.time_msec() @@ -1346,7 +1375,9 @@ async def incoming_presence(self, origin: str, content: JsonDict) -> None: updates.append(prev_state.copy_and_replace(**new_fields)) if updates: - federation_presence_counter.inc(len(updates)) + federation_presence_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc(len(updates)) await self._update_states(updates) async def set_state( @@ -1655,7 +1686,10 @@ async def _handle_state_delta(self, room_id: str, deltas: List[StateDelta]) -> N def should_notify( - old_state: UserPresenceState, new_state: UserPresenceState, is_mine: bool + old_state: UserPresenceState, + new_state: UserPresenceState, + is_mine: bool, + our_server_name: str, ) -> bool: """Decides if a presence state change should be sent to interested parties.""" user_location = "remote" @@ -1666,19 +1700,38 @@ def should_notify( return False if old_state.status_msg != new_state.status_msg: - notify_reason_counter.labels(user_location, "status_msg_change").inc() + notify_reason_counter.labels( + locality=user_location, + reason="status_msg_change", + **{SERVER_NAME_LABEL: our_server_name}, + ).inc() return True if old_state.state != new_state.state: - notify_reason_counter.labels(user_location, "state_change").inc() + notify_reason_counter.labels( + locality=user_location, + reason="state_change", + **{SERVER_NAME_LABEL: our_server_name}, + ).inc() state_transition_counter.labels( - user_location, old_state.state, new_state.state + **{ + "locality": user_location, + # `from` is a reserved word in Python so we have to label it this way if + # we want to use keyword args. + "from": old_state.state, + "to": new_state.state, + SERVER_NAME_LABEL: our_server_name, + }, ).inc() return True if old_state.state == PresenceState.ONLINE: if new_state.currently_active != old_state.currently_active: - notify_reason_counter.labels(user_location, "current_active_change").inc() + notify_reason_counter.labels( + locality=user_location, + reason="current_active_change", + **{SERVER_NAME_LABEL: our_server_name}, + ).inc() return True if ( @@ -1688,14 +1741,18 @@ def should_notify( # Only notify about last active bumps if we're not currently active if not new_state.currently_active: notify_reason_counter.labels( - user_location, "last_active_change_online" + locality=user_location, + reason="last_active_change_online", + **{SERVER_NAME_LABEL: our_server_name}, ).inc() return True elif new_state.last_active_ts - old_state.last_active_ts > LAST_ACTIVE_GRANULARITY: # Always notify for a transition where last active gets bumped. notify_reason_counter.labels( - user_location, "last_active_change_not_online" + locality=user_location, + reason="last_active_change_not_online", + **{SERVER_NAME_LABEL: our_server_name}, ).inc() return True @@ -1761,6 +1818,7 @@ def __init__(self, hs: "HomeServer"): # AuthHandler -> Notifier -> PresenceEventSource -> ModuleApi -> AuthHandler self.get_presence_handler = hs.get_presence_handler self.get_presence_router = hs.get_presence_router + self.server_name = hs.hostname self.clock = hs.get_clock() self.store = hs.get_datastores().main @@ -1870,7 +1928,10 @@ async def get_new_events( # If we have the full list of changes for presence we can # simply check which ones share a room with the user. 
-            get_updates_counter.labels("stream").inc()
+            get_updates_counter.labels(
+                type="stream",
+                **{SERVER_NAME_LABEL: self.server_name},
+            ).inc()
 
             sharing_users = await self.store.do_users_share_a_room(
                 user_id, updated_users
@@ -1883,7 +1944,10 @@ async def get_new_events(
         else:
             # Too many possible updates. Find all users we can see and check
             # if any of them have changed.
-            get_updates_counter.labels("full").inc()
+            get_updates_counter.labels(
+                type="full",
+                **{SERVER_NAME_LABEL: self.server_name},
+            ).inc()
 
             users_interested_in = (
                 await self.store.get_users_who_share_room_with_user(user_id)
@@ -2133,6 +2197,7 @@ def handle_update(
     prev_state: UserPresenceState,
     new_state: UserPresenceState,
     is_mine: bool,
+    our_server_name: str,
     wheel_timer: WheelTimer,
     now: int,
     persist: bool,
@@ -2145,6 +2210,7 @@ def handle_update(
         prev_state
         new_state
         is_mine: Whether the user is ours
+        our_server_name: The homeserver name of our server (`hs.hostname`)
         wheel_timer
         now: Time now in ms
         persist: True if this state should persist until another update occurs.
@@ -2213,7 +2279,7 @@ def handle_update(
     )
 
     # Check whether the change was something worth notifying about
-    if should_notify(prev_state, new_state, is_mine):
+    if should_notify(prev_state, new_state, is_mine, our_server_name):
         new_state = new_state.copy_and_replace(last_federation_update_ts=now)
         persist_and_notify = True
 
diff --git a/tests/handlers/test_presence.py b/tests/handlers/test_presence.py
index 6b7bf112c21..7ee411cfd94 100644
--- a/tests/handlers/test_presence.py
+++ b/tests/handlers/test_presence.py
@@ -90,6 +90,7 @@ def test_offline_to_online(self) -> None:
             prev_state,
             new_state,
             is_mine=True,
+            our_server_name=self.hs.hostname,
             wheel_timer=wheel_timer,
             now=now,
             persist=False,
@@ -137,6 +138,7 @@ def test_online_to_online(self) -> None:
             prev_state,
             new_state,
             is_mine=True,
+            our_server_name=self.hs.hostname,
             wheel_timer=wheel_timer,
             now=now,
             persist=False,
@@ -187,6 +189,7 @@ def test_online_to_online_last_active_noop(self) -> None:
             prev_state,
             new_state,
             is_mine=True,
+            our_server_name=self.hs.hostname,
             wheel_timer=wheel_timer,
             now=now,
             persist=False,
@@ -235,6 +238,7 @@ def test_online_to_online_last_active(self) -> None:
             prev_state,
             new_state,
             is_mine=True,
+            our_server_name=self.hs.hostname,
             wheel_timer=wheel_timer,
             now=now,
             persist=False,
@@ -275,6 +279,7 @@ def test_remote_ping_timer(self) -> None:
             prev_state,
             new_state,
             is_mine=False,
+            our_server_name=self.hs.hostname,
             wheel_timer=wheel_timer,
             now=now,
             persist=False,
@@ -314,6 +319,7 @@ def test_online_to_offline(self) -> None:
             prev_state,
             new_state,
             is_mine=True,
+            our_server_name=self.hs.hostname,
             wheel_timer=wheel_timer,
             now=now,
             persist=False,
@@ -341,6 +347,7 @@ def test_online_to_idle(self) -> None:
             prev_state,
             new_state,
             is_mine=True,
+            our_server_name=self.hs.hostname,
             wheel_timer=wheel_timer,
             now=now,
             persist=False,
@@ -431,6 +438,7 @@ def test_override(self, initial_state: str, final_state: str) -> None:
             prev_state,
             new_state,
             is_mine=True,
+            our_server_name=self.hs.hostname,
             wheel_timer=wheel_timer,
             now=now,
             persist=True,
@@ -494,6 +502,7 @@ def _test_ratelimit_offline_to_online_to_unavailable(
             prev_state,
             new_state,
             is_mine=True,
+            our_server_name=self.hs.hostname,
             wheel_timer=wheel_timer,
             now=now,
             persist=False,

From a6a686322230b146a20255ced65091cb992810f1 Mon Sep 17 00:00:00 2001
From: Eric Eastwood
Date: Fri, 4 Jul 2025 14:18:57 -0500
Subject: [PATCH 20/48] Fill in `synapse/handlers/register.py`

---
 synapse/handlers/register.py | 40 
++++++++++++++++++++++++------------ synapse/handlers/sso.py | 12 ++++++----- 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/synapse/handlers/register.py b/synapse/handlers/register.py index 970013ef20c..342740d3cf8 100644 --- a/synapse/handlers/register.py +++ b/synapse/handlers/register.py @@ -46,6 +46,7 @@ from synapse.config.server import is_threepid_reserved from synapse.handlers.device import DeviceHandler from synapse.http.servlet import assert_params_in_dict +from synapse.metrics import SERVER_NAME_LABEL from synapse.replication.http.login import RegisterDeviceReplicationServlet from synapse.replication.http.register import ( ReplicationPostRegisterActionsServlet, @@ -64,29 +65,38 @@ registration_counter = Counter( "synapse_user_registrations_total", "Number of new users registered (since restart)", - ["guest", "shadow_banned", "auth_provider"], + labelnames=["guest", "shadow_banned", "auth_provider", SERVER_NAME_LABEL], ) login_counter = Counter( "synapse_user_logins_total", "Number of user logins (since restart)", - ["guest", "auth_provider"], + labelnames=["guest", "auth_provider", SERVER_NAME_LABEL], ) -def init_counters_for_auth_provider(auth_provider_id: str) -> None: +def init_counters_for_auth_provider(auth_provider_id: str, server_name: str) -> None: """Ensure the prometheus counters for the given auth provider are initialised This fixes a problem where the counters are not reported for a given auth provider until the user first logs in/registers. + + Args: + auth_provider_id: The ID of the auth provider to initialise counters for. + server_name: Our server name (used to label metrics) (this should be `hs.hostname`). """ for is_guest in (True, False): - login_counter.labels(guest=is_guest, auth_provider=auth_provider_id) + login_counter.labels( + guest=is_guest, + auth_provider=auth_provider_id, + **{SERVER_NAME_LABEL: server_name}, + ) for shadow_banned in (True, False): registration_counter.labels( guest=is_guest, shadow_banned=shadow_banned, auth_provider=auth_provider_id, + **{SERVER_NAME_LABEL: server_name}, ) @@ -99,6 +109,7 @@ class LoginDict(TypedDict): class RegistrationHandler: def __init__(self, hs: "HomeServer"): + self.server_name = hs.hostname self.store = hs.get_datastores().main self._storage_controllers = hs.get_storage_controllers() self.clock = hs.get_clock() @@ -114,7 +125,6 @@ def __init__(self, hs: "HomeServer"): self._account_validity_handler = hs.get_account_validity_handler() self._user_consent_version = self.hs.config.consent.user_consent_version self._server_notices_mxid = hs.config.servernotices.server_notices_mxid - self._server_name = hs.hostname self._user_types_config = hs.config.user_types self._spam_checker_module_callbacks = hs.get_module_api_callbacks().spam_checker @@ -141,7 +151,9 @@ def __init__(self, hs: "HomeServer"): ) self.refresh_token_lifetime = hs.config.registration.refresh_token_lifetime - init_counters_for_auth_provider("") + init_counters_for_auth_provider( + auth_provider_id="", server_name=self.server_name + ) async def check_username( self, @@ -365,6 +377,7 @@ async def register_user( guest=make_guest, shadow_banned=shadow_banned, auth_provider=(auth_provider_id or ""), + **{SERVER_NAME_LABEL: self.server_name}, ).inc() # If the user does not need to consent at registration, auto-join any @@ -425,7 +438,7 @@ async def _create_and_join_rooms(self, user_id: str) -> None: if self.hs.config.registration.auto_join_user_id: fake_requester = create_requester( self.hs.config.registration.auto_join_user_id, - 
authenticated_entity=self._server_name, + authenticated_entity=self.server_name, ) # If the room requires an invite, add the user to the list of invites. @@ -438,7 +451,7 @@ async def _create_and_join_rooms(self, user_id: str) -> None: requires_join = True else: fake_requester = create_requester( - user_id, authenticated_entity=self._server_name + user_id, authenticated_entity=self.server_name ) # Choose whether to federate the new room. @@ -470,7 +483,7 @@ async def _create_and_join_rooms(self, user_id: str) -> None: await room_member_handler.update_membership( requester=create_requester( - user_id, authenticated_entity=self._server_name + user_id, authenticated_entity=self.server_name ), target=UserID.from_string(user_id), room_id=room_id, @@ -496,7 +509,7 @@ async def _create_and_join_rooms(self, user_id: str) -> None: if requires_join: await room_member_handler.update_membership( requester=create_requester( - user_id, authenticated_entity=self._server_name + user_id, authenticated_entity=self.server_name ), target=UserID.from_string(user_id), room_id=room_id, @@ -542,7 +555,7 @@ async def _join_rooms(self, user_id: str) -> None: # we don't have a local user in the room to craft up an invite with. requires_invite = await self.store.is_host_joined( room_id, - self._server_name, + self.server_name, ) if requires_invite: @@ -570,7 +583,7 @@ async def _join_rooms(self, user_id: str) -> None: await room_member_handler.update_membership( requester=create_requester( self.hs.config.registration.auto_join_user_id, - authenticated_entity=self._server_name, + authenticated_entity=self.server_name, ), target=UserID.from_string(user_id), room_id=room_id, @@ -582,7 +595,7 @@ async def _join_rooms(self, user_id: str) -> None: # Send the join. await room_member_handler.update_membership( requester=create_requester( - user_id, authenticated_entity=self._server_name + user_id, authenticated_entity=self.server_name ), target=UserID.from_string(user_id), room_id=room_id, @@ -810,6 +823,7 @@ async def register_device( login_counter.labels( guest=is_guest, auth_provider=(auth_provider_id or ""), + **{SERVER_NAME_LABEL: self.server_name}, ).inc() return ( diff --git a/synapse/handlers/sso.py b/synapse/handlers/sso.py index 7c5cf91aba5..53225b95f77 100644 --- a/synapse/handlers/sso.py +++ b/synapse/handlers/sso.py @@ -203,7 +203,7 @@ class SsoHandler: def __init__(self, hs: "HomeServer"): self._clock = hs.get_clock() self._store = hs.get_datastores().main - self._server_name = hs.hostname + self.server_name = hs.hostname self._is_mine_server_name = hs.is_mine_server_name self._registration_handler = hs.get_registration_handler() self._auth_handler = hs.get_auth_handler() @@ -239,7 +239,9 @@ def register_identity_provider(self, p: SsoIdentityProvider) -> None: p_id = p.idp_id assert p_id not in self._identity_providers self._identity_providers[p_id] = p - init_counters_for_auth_provider(p_id) + init_counters_for_auth_provider( + auth_provider_id=p_id, server_name=self.server_name + ) def get_identity_providers(self) -> Mapping[str, SsoIdentityProvider]: """Get the configured identity providers""" @@ -570,7 +572,7 @@ async def _call_attribute_mapper( return attributes # Check if this mxid already exists - user_id = UserID(attributes.localpart, self._server_name).to_string() + user_id = UserID(attributes.localpart, self.server_name).to_string() if not await self._store.get_users_by_id_case_insensitive(user_id): # This mxid is free break @@ -908,7 +910,7 @@ async def complete_sso_ui_auth_request( # render an error 
page. html = self._bad_user_template.render( - server_name=self._server_name, + server_name=self.server_name, user_id_to_verify=user_id_to_verify, ) respond_with_html(request, 200, html) @@ -960,7 +962,7 @@ async def check_username_availability( if contains_invalid_mxid_characters(localpart): raise SynapseError(400, "localpart is invalid: %s" % (localpart,)) - user_id = UserID(localpart, self._server_name).to_string() + user_id = UserID(localpart, self.server_name).to_string() user_infos = await self._store.get_users_by_id_case_insensitive(user_id) logger.info("[session %s] users: %s", session_id, user_infos) From e81be25436861b8e6233dbbe8b9453eb3f8a778d Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 14:20:31 -0500 Subject: [PATCH 21/48] Fill in `synapse/handlers/sync.py` --- synapse/handlers/sync.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/synapse/handlers/sync.py b/synapse/handlers/sync.py index a400e63fd56..4d1b32b562c 100644 --- a/synapse/handlers/sync.py +++ b/synapse/handlers/sync.py @@ -63,6 +63,7 @@ start_active_span, trace, ) +from synapse.metrics import SERVER_NAME_LABEL from synapse.storage.databases.main.event_push_actions import RoomNotifCounts from synapse.storage.databases.main.roommember import extract_heroes_from_room_summary from synapse.storage.databases.main.stream import PaginateFunction @@ -104,7 +105,7 @@ "Count of non empty sync responses. type is initial_sync/full_state_sync" "/incremental_sync. lazy_loaded indicates if lazy loaded members were " "enabled for that request.", - ["type", "lazy_loaded"], + labelnames=["type", "lazy_loaded", SERVER_NAME_LABEL], ) # Store the cache that tracks which lazy-loaded members have been sent to a given @@ -329,6 +330,7 @@ class E2eeSyncResult: class SyncHandler: def __init__(self, hs: "HomeServer"): + self.server_name = hs.hostname self.hs_config = hs.config self.store = hs.get_datastores().main self.notifier = hs.get_notifier() @@ -611,7 +613,11 @@ async def current_sync_callback( lazy_loaded = "true" else: lazy_loaded = "false" - non_empty_sync_counter.labels(sync_label, lazy_loaded).inc() + non_empty_sync_counter.labels( + type=sync_label, + lazy_loaded=lazy_loaded, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() return result From d42e742d37b4cf608c14cbdd79c68ce2c5a3e141 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 14:24:51 -0500 Subject: [PATCH 22/48] Fill in `synapse/http/client.py` --- synapse/http/client.py | 43 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/synapse/http/client.py b/synapse/http/client.py index 928bfb228a7..e8078b0a257 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -85,6 +85,7 @@ from synapse.http.types import QueryParams from synapse.logging.context import make_deferred_yieldable, run_in_background from synapse.logging.opentracing import set_tag, start_active_span, tags +from synapse.metrics import SERVER_NAME_LABEL from synapse.types import ISynapseReactor, StrSequence from synapse.util import json_decoder from synapse.util.async_helpers import timeout_deferred @@ -108,9 +109,13 @@ logger = logging.getLogger(__name__) -outgoing_requests_counter = Counter("synapse_http_client_requests", "", ["method"]) +outgoing_requests_counter = Counter( + "synapse_http_client_requests", "", labelnames=["method", SERVER_NAME_LABEL] +) incoming_responses_counter = Counter( - "synapse_http_client_responses", "", ["method", "code"] + 
"synapse_http_client_responses", + "", + labelnames=["method", "code", SERVER_NAME_LABEL], ) # the type of the headers map, to be passed to the t.w.h.Headers. @@ -346,6 +351,7 @@ def __init__( treq_args: Optional[Dict[str, Any]] = None, ): self.hs = hs + self.server_name = hs.hostname self.reactor = hs.get_reactor() self._extra_treq_args = treq_args or {} @@ -384,7 +390,9 @@ async def request( RequestTimedOutError if the request times out before the headers are read """ - outgoing_requests_counter.labels(method).inc() + outgoing_requests_counter.labels( + method=method, **{SERVER_NAME_LABEL: self.server_name} + ).inc() # log request but strip `access_token` (AS requests for example include this) logger.debug("Sending request %s %s", method, redact_uri(uri)) @@ -438,7 +446,11 @@ async def request( response = await make_deferred_yieldable(request_deferred) - incoming_responses_counter.labels(method, response.code).inc() + incoming_responses_counter.labels( + method=method, + code=response.code, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() logger.info( "Received response to %s %s: %s", method, @@ -447,7 +459,11 @@ async def request( ) return response except Exception as e: - incoming_responses_counter.labels(method, "ERR").inc() + incoming_responses_counter.labels( + method=method, + code="ERR", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() logger.info( "Error sending request to %s %s: %s %s", method, @@ -855,6 +871,7 @@ def __init__( hs: The HomeServer instance to pass in """ super().__init__(hs) + self.server_name = hs.hostname # Use a pool, but a very small one. pool = HTTPConnectionPool(self.reactor) @@ -891,7 +908,9 @@ async def request( RequestTimedOutError if the request times out before the headers are read """ - outgoing_requests_counter.labels(method).inc() + outgoing_requests_counter.labels( + method=method, **{SERVER_NAME_LABEL: self.server_name} + ).inc() logger.debug("Sending request %s %s", method, uri) @@ -948,7 +967,11 @@ async def request( response = await make_deferred_yieldable(request_deferred) - incoming_responses_counter.labels(method, response.code).inc() + incoming_responses_counter.labels( + method=method, + code=response.code, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() logger.info( "Received response to %s %s: %s", method, @@ -957,7 +980,11 @@ async def request( ) return response except Exception as e: - incoming_responses_counter.labels(method, "ERR").inc() + incoming_responses_counter.labels( + method=method, + code="ERR", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() logger.info( "Error sending request to %s %s: %s %s", method, From f197c92c90f8add397848ea1b28e4c07c8e411c5 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 14:28:05 -0500 Subject: [PATCH 23/48] Fill in `synapse/http/matrixfederationclient.py` --- synapse/http/matrixfederationclient.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/synapse/http/matrixfederationclient.py b/synapse/http/matrixfederationclient.py index 97a863a1189..2ab1e2ae3ec 100644 --- a/synapse/http/matrixfederationclient.py +++ b/synapse/http/matrixfederationclient.py @@ -87,6 +87,7 @@ from synapse.logging import opentracing from synapse.logging.context import make_deferred_yieldable, run_in_background from synapse.logging.opentracing import set_tag, start_active_span, tags +from synapse.metrics import SERVER_NAME_LABEL from synapse.types import JsonDict from synapse.util import json_decoder from synapse.util.async_helpers import 
AwakenableSleeper, Linearizer, timeout_deferred @@ -99,10 +100,14 @@ logger = logging.getLogger(__name__) outgoing_requests_counter = Counter( - "synapse_http_matrixfederationclient_requests", "", ["method"] + "synapse_http_matrixfederationclient_requests", + "", + labelnames=["method", SERVER_NAME_LABEL], ) incoming_responses_counter = Counter( - "synapse_http_matrixfederationclient_responses", "", ["method", "code"] + "synapse_http_matrixfederationclient_responses", + "", + labelnames=["method", "code", SERVER_NAME_LABEL], ) @@ -694,7 +699,9 @@ async def _send_request( _sec_timeout, ) - outgoing_requests_counter.labels(request.method).inc() + outgoing_requests_counter.labels( + method=request.method, **{SERVER_NAME_LABEL: self.server_name} + ).inc() try: with Measure(self.clock, "outbound_request"): @@ -729,7 +736,9 @@ async def _send_request( raise RequestSendFailed(e, can_retry=True) from e incoming_responses_counter.labels( - request.method, response.code + method=request.method, + code=response.code, + **{SERVER_NAME_LABEL: self.server_name}, ).inc() set_tag(tags.HTTP_STATUS_CODE, response.code) From c28d4bcf96650a221622ffd0cdcb5aad9bd4ec26 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 15:05:42 -0500 Subject: [PATCH 24/48] Fill in `synapse/http/request_metrics.py` --- synapse/http/request_metrics.py | 109 ++++++++++++------ synapse/http/site.py | 12 +- tests/handlers/test_oauth_delegation.py | 4 +- tests/logging/test_terse_json.py | 6 +- tests/rest/key/v2/test_remote_key_resource.py | 4 +- 5 files changed, 93 insertions(+), 42 deletions(-) diff --git a/synapse/http/request_metrics.py b/synapse/http/request_metrics.py index 366f06eb809..799b0cb4c77 100644 --- a/synapse/http/request_metrics.py +++ b/synapse/http/request_metrics.py @@ -27,40 +27,52 @@ from prometheus_client.core import Counter, Histogram from synapse.logging.context import current_context -from synapse.metrics import LaterGauge +from synapse.metrics import SERVER_NAME_LABEL, LaterGauge logger = logging.getLogger(__name__) # total number of responses served, split by method/servlet/tag response_count = Counter( - "synapse_http_server_response_count", "", ["method", "servlet", "tag"] + "synapse_http_server_response_count", + "", + labelnames=["method", "servlet", "tag", SERVER_NAME_LABEL], ) requests_counter = Counter( - "synapse_http_server_requests_received", "", ["method", "servlet"] + "synapse_http_server_requests_received", + "", + labelnames=["method", "servlet", SERVER_NAME_LABEL], ) outgoing_responses_counter = Counter( - "synapse_http_server_responses", "", ["method", "code"] + "synapse_http_server_responses", + "", + labelnames=["method", "code", SERVER_NAME_LABEL], ) response_timer = Histogram( "synapse_http_server_response_time_seconds", "sec", - ["method", "servlet", "tag", "code"], + labelnames=["method", "servlet", "tag", "code", SERVER_NAME_LABEL], ) response_ru_utime = Counter( - "synapse_http_server_response_ru_utime_seconds", "sec", ["method", "servlet", "tag"] + "synapse_http_server_response_ru_utime_seconds", + "sec", + labelnames=["method", "servlet", "tag", SERVER_NAME_LABEL], ) response_ru_stime = Counter( - "synapse_http_server_response_ru_stime_seconds", "sec", ["method", "servlet", "tag"] + "synapse_http_server_response_ru_stime_seconds", + "sec", + labelnames=["method", "servlet", "tag", SERVER_NAME_LABEL], ) response_db_txn_count = Counter( - "synapse_http_server_response_db_txn_count", "", ["method", "servlet", "tag"] + "synapse_http_server_response_db_txn_count", + "", + 
labelnames=["method", "servlet", "tag", SERVER_NAME_LABEL], ) # seconds spent waiting for db txns, excluding scheduling time, when processing @@ -68,34 +80,42 @@ response_db_txn_duration = Counter( "synapse_http_server_response_db_txn_duration_seconds", "", - ["method", "servlet", "tag"], + labelnames=["method", "servlet", "tag", SERVER_NAME_LABEL], ) # seconds spent waiting for a db connection, when processing this request response_db_sched_duration = Counter( "synapse_http_server_response_db_sched_duration_seconds", "", - ["method", "servlet", "tag"], + labelnames=["method", "servlet", "tag", SERVER_NAME_LABEL], ) # size in bytes of the response written response_size = Counter( - "synapse_http_server_response_size", "", ["method", "servlet", "tag"] + "synapse_http_server_response_size", + "", + labelnames=["method", "servlet", "tag", SERVER_NAME_LABEL], ) # In flight metrics are incremented while the requests are in flight, rather # than when the response was written. in_flight_requests_ru_utime = Counter( - "synapse_http_server_in_flight_requests_ru_utime_seconds", "", ["method", "servlet"] + "synapse_http_server_in_flight_requests_ru_utime_seconds", + "", + labelnames=["method", "servlet", SERVER_NAME_LABEL], ) in_flight_requests_ru_stime = Counter( - "synapse_http_server_in_flight_requests_ru_stime_seconds", "", ["method", "servlet"] + "synapse_http_server_in_flight_requests_ru_stime_seconds", + "", + labelnames=["method", "servlet", SERVER_NAME_LABEL], ) in_flight_requests_db_txn_count = Counter( - "synapse_http_server_in_flight_requests_db_txn_count", "", ["method", "servlet"] + "synapse_http_server_in_flight_requests_db_txn_count", + "", + labelnames=["method", "servlet", SERVER_NAME_LABEL], ) # seconds spent waiting for db txns, excluding scheduling time, when processing @@ -103,14 +123,14 @@ in_flight_requests_db_txn_duration = Counter( "synapse_http_server_in_flight_requests_db_txn_duration_seconds", "", - ["method", "servlet"], + labelnames=["method", "servlet", SERVER_NAME_LABEL], ) # seconds spent waiting for a db connection, when processing this request in_flight_requests_db_sched_duration = Counter( "synapse_http_server_in_flight_requests_db_sched_duration_seconds", "", - ["method", "servlet"], + labelnames=["method", "servlet", SERVER_NAME_LABEL], ) _in_flight_requests: Set["RequestMetrics"] = set() @@ -149,6 +169,13 @@ def _get_in_flight_counts() -> Mapping[Tuple[str, ...], int]: class RequestMetrics: + def __init__(self, our_server_name: str) -> None: + """ + Args: + our_server_name: Our homeserver name (used to label metrics) (`hs.hostname`) + """ + self.our_server_name = our_server_name + def start(self, time_sec: float, name: str, method: str) -> None: self.start_ts = time_sec self.start_context = current_context() @@ -194,33 +221,39 @@ def stop(self, time_sec: float, response_code: int, sent_bytes: int) -> None: response_code_str = str(response_code) - outgoing_responses_counter.labels(self.method, response_code_str).inc() + outgoing_responses_counter.labels( + method=self.method, + code=response_code_str, + **{SERVER_NAME_LABEL: self.our_server_name}, + ).inc() + + response_base_labels = { + "method": self.method, + "servlet": self.name, + "tag": tag, + SERVER_NAME_LABEL: self.our_server_name, + } - response_count.labels(self.method, self.name, tag).inc() + response_count.labels(**response_base_labels).inc() - response_timer.labels(self.method, self.name, tag, response_code_str).observe( + response_timer.labels(code=response_code_str, 
**response_base_labels).observe( time_sec - self.start_ts ) resource_usage = context.get_resource_usage() - response_ru_utime.labels(self.method, self.name, tag).inc( - resource_usage.ru_utime - ) - response_ru_stime.labels(self.method, self.name, tag).inc( - resource_usage.ru_stime - ) - response_db_txn_count.labels(self.method, self.name, tag).inc( + response_ru_utime.labels(**response_base_labels).inc(resource_usage.ru_utime) + response_ru_stime.labels(**response_base_labels).inc(resource_usage.ru_stime) + response_db_txn_count.labels(**response_base_labels).inc( resource_usage.db_txn_count ) - response_db_txn_duration.labels(self.method, self.name, tag).inc( + response_db_txn_duration.labels(**response_base_labels).inc( resource_usage.db_txn_duration_sec ) - response_db_sched_duration.labels(self.method, self.name, tag).inc( + response_db_sched_duration.labels(**response_base_labels).inc( resource_usage.db_sched_duration_sec ) - - response_size.labels(self.method, self.name, tag).inc(sent_bytes) + response_size.labels(**response_base_labels).inc(sent_bytes) # We always call this at the end to ensure that we update the metrics # regardless of whether a call to /metrics while the request was in @@ -240,24 +273,30 @@ def update_metrics(self) -> None: diff = new_stats - self._request_stats self._request_stats = new_stats + in_flight_labels = { + "method": self.method, + "servlet": self.name, + SERVER_NAME_LABEL: self.our_server_name, + } + # max() is used since rapid use of ru_stime/ru_utime can end up with the # count going backwards due to NTP, time smearing, fine-grained # correction, or floating points. Who knows, really? - in_flight_requests_ru_utime.labels(self.method, self.name).inc( + in_flight_requests_ru_utime.labels(**in_flight_labels).inc( max(diff.ru_utime, 0) ) - in_flight_requests_ru_stime.labels(self.method, self.name).inc( + in_flight_requests_ru_stime.labels(**in_flight_labels).inc( max(diff.ru_stime, 0) ) - in_flight_requests_db_txn_count.labels(self.method, self.name).inc( + in_flight_requests_db_txn_count.labels(**in_flight_labels).inc( diff.db_txn_count ) - in_flight_requests_db_txn_duration.labels(self.method, self.name).inc( + in_flight_requests_db_txn_duration.labels(**in_flight_labels).inc( diff.db_txn_duration_sec ) - in_flight_requests_db_sched_duration.labels(self.method, self.name).inc( + in_flight_requests_db_sched_duration.labels(**in_flight_labels).inc( diff.db_sched_duration_sec ) diff --git a/synapse/http/site.py b/synapse/http/site.py index e83a4447b2f..1ceccc197e6 100644 --- a/synapse/http/site.py +++ b/synapse/http/site.py @@ -44,6 +44,7 @@ LoggingContext, PreserveLoggingContext, ) +from synapse.metrics import SERVER_NAME_LABEL from synapse.types import ISynapseReactor, Requester if TYPE_CHECKING: @@ -83,12 +84,14 @@ def __init__( self, channel: HTTPChannel, site: "SynapseSite", + hs: "HomeServer", *args: Any, max_request_body_size: int = 1024, request_id_header: Optional[str] = None, **kw: Any, ): super().__init__(channel, *args, **kw) + self.our_server_name = hs.hostname self._max_request_body_size = max_request_body_size self.request_id_header = request_id_header self.synapse_site = site @@ -334,7 +337,11 @@ def render(self, resrc: Resource) -> None: # dispatching to the handler, so that the handler # can update the servlet name in the request # metrics - requests_counter.labels(self.get_method(), self.request_metrics.name).inc() + requests_counter.labels( + method=self.get_method(), + servlet=self.request_metrics.name, + **{SERVER_NAME_LABEL: 
self.our_server_name}, + ).inc() @contextlib.contextmanager def processing(self) -> Generator[None, None, None]: @@ -455,7 +462,7 @@ def _started_processing(self, servlet_name: str) -> None: self.request_metrics.name. """ self.start_time = time.time() - self.request_metrics = RequestMetrics() + self.request_metrics = RequestMetrics(our_server_name=self.our_server_name) self.request_metrics.start( self.start_time, name=servlet_name, method=self.get_method() ) @@ -705,6 +712,7 @@ def request_factory(channel: HTTPChannel, queued: bool) -> Request: return request_class( channel, self, + hs, max_request_body_size=max_request_body_size, queued=queued, request_id_header=request_id_header, diff --git a/tests/handlers/test_oauth_delegation.py b/tests/handlers/test_oauth_delegation.py index fefa2f1135c..2ef25f73a15 100644 --- a/tests/handlers/test_oauth_delegation.py +++ b/tests/handlers/test_oauth_delegation.py @@ -867,7 +867,7 @@ async def mock_http_client_request( # First test a known access token channel = FakeChannel(self.site, self.reactor) # type-ignore: FakeChannel is a mock of an HTTPChannel, not a proper HTTPChannel - req = SynapseRequest(channel, self.site) # type: ignore[arg-type] + req = SynapseRequest(channel, self.site, self.hs) # type: ignore[arg-type] req.client.host = EXAMPLE_IPV4_ADDR req.requestHeaders.addRawHeader("Authorization", f"Bearer {known_token}") req.requestHeaders.addRawHeader("User-Agent", EXAMPLE_USER_AGENT) @@ -899,7 +899,7 @@ async def mock_http_client_request( MAS_USER_AGENT = "masmasmas" channel = FakeChannel(self.site, self.reactor) - req = SynapseRequest(channel, self.site) # type: ignore[arg-type] + req = SynapseRequest(channel, self.site, self.hs) # type: ignore[arg-type] req.client.host = MAS_IPV4_ADDR req.requestHeaders.addRawHeader( "Authorization", f"Bearer {self.auth._admin_token()}" diff --git a/tests/logging/test_terse_json.py b/tests/logging/test_terse_json.py index d9cbbbd51ef..ddc2837ff9a 100644 --- a/tests/logging/test_terse_json.py +++ b/tests/logging/test_terse_json.py @@ -171,8 +171,12 @@ def test_with_request_context(self) -> None: site.site_tag = "test-site" site.server_version_string = "Server v1" site.reactor = Mock() + + hs = Mock(spec=["hostname"]) + hs.hostname = "test_server" + request = SynapseRequest( - cast(HTTPChannel, FakeChannel(site, self.reactor)), site + cast(HTTPChannel, FakeChannel(site, self.reactor)), site, hs ) # Call requestReceived to finish instantiating the object. 
request.content = BytesIO() diff --git a/tests/rest/key/v2/test_remote_key_resource.py b/tests/rest/key/v2/test_remote_key_resource.py index 21e12b2a2f2..62538bbdc37 100644 --- a/tests/rest/key/v2/test_remote_key_resource.py +++ b/tests/rest/key/v2/test_remote_key_resource.py @@ -99,7 +99,7 @@ def make_notary_request(self, server_name: str, key_id: str) -> dict: """ channel = FakeChannel(self.site, self.reactor) # channel is a `FakeChannel` but `HTTPChannel` is expected - req = SynapseRequest(channel, self.site) # type: ignore[arg-type] + req = SynapseRequest(channel, self.site, self.hs) # type: ignore[arg-type] req.content = BytesIO(b"") req.requestReceived( b"GET", @@ -201,7 +201,7 @@ async def post_json( channel = FakeChannel(self.site, self.reactor) # channel is a `FakeChannel` but `HTTPChannel` is expected - req = SynapseRequest(channel, self.site) # type: ignore[arg-type] + req = SynapseRequest(channel, self.site, self.hs) # type: ignore[arg-type] req.content = BytesIO(encode_canonical_json(data)) req.requestReceived( From fb5e1f26d7520ee97483c67c16fd0da6ef97e808 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 15:11:55 -0500 Subject: [PATCH 25/48] Fix `synapse/federation/sender/per_destination_queue.py` --- synapse/federation/sender/per_destination_queue.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/synapse/federation/sender/per_destination_queue.py b/synapse/federation/sender/per_destination_queue.py index 9ab95a988ea..b759644a740 100644 --- a/synapse/federation/sender/per_destination_queue.py +++ b/synapse/federation/sender/per_destination_queue.py @@ -93,7 +93,7 @@ def __init__( transaction_manager: "synapse.federation.sender.TransactionManager", destination: str, ): - self._server_name = hs.hostname + self.server_name = hs.hostname self._clock = hs.get_clock() self._storage_controllers = hs.get_storage_controllers() self._store = hs.get_datastores().main @@ -561,7 +561,7 @@ async def _catch_up_transmission_loop(self) -> None: new_pdus = await filter_events_for_server( self._storage_controllers, self._destination, - self._server_name, + self.server_name, new_pdus, redact=False, filter_out_erased_senders=True, @@ -608,7 +608,7 @@ def _get_receipt_edus(self, limit: int) -> Iterable[Edu]: # Send at most limit EDUs for receipts. 
for content in self._pending_receipt_edus[:limit]: yield Edu( - origin=self._server_name, + origin=self.server_name, destination=self._destination, edu_type=EduTypes.RECEIPT, content=content, @@ -634,7 +634,7 @@ async def _get_device_update_edus(self, limit: int) -> Tuple[List[Edu], int]: ) edus = [ Edu( - origin=self._server_name, + origin=self.server_name, destination=self._destination, edu_type=edu_type, content=content, @@ -661,7 +661,7 @@ async def _get_to_device_message_edus(self, limit: int) -> Tuple[List[Edu], int] edus = [ Edu( - origin=self._server_name, + origin=self.server_name, destination=self._destination, edu_type=EduTypes.DIRECT_TO_DEVICE, content=content, @@ -734,7 +734,7 @@ async def __aenter__(self) -> Tuple[List[EventBase], List[Edu]]: pending_edus.append( Edu( - origin=self.queue._server_name, + origin=self.queue.server_name, destination=self.queue._destination, edu_type=EduTypes.PRESENCE, content={"push": presence_to_add}, From 49f209838c463ffdb4472f0edd4328e335e6a995 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 15:44:42 -0500 Subject: [PATCH 26/48] Fix mypy complaining about `request_metrics` `has-type` lints (part of `synapse/http/request_metrics.py`) ``` synapse/logging/opentracing.py:1084: error: Cannot determine type of "request_metrics" [has-type] synapse/logging/opentracing.py:1100: error: Cannot determine type of "request_metrics" [has-type] synapse/logging/opentracing.py:1106: error: Unused "type: ignore" comment [unused-ignore] synapse/logging/opentracing.py:1106: error: Cannot determine type of "request_metrics" [has-type] synapse/logging/opentracing.py:1106: note: Error code "has-type" not covered by "type: ignore" comment synapse/http/server.py:141: error: Cannot determine type of "request_metrics" [has-type] synapse/http/server.py:151: error: Cannot determine type of "request_metrics" [has-type] synapse/http/server.py:333: error: Cannot determine type of "request_metrics" [has-type] synapse/http/server.py:546: error: Cannot determine type of "request_metrics" [has-type] synapse/http/site.py:132: error: Cannot determine type of "server_name" [has-type] synapse/http/site.py:341: error: Cannot determine type of "request_metrics" [has-type] ``` --- synapse/http/site.py | 7 ++++--- tests/handlers/test_oauth_delegation.py | 4 ++-- tests/logging/test_terse_json.py | 5 +---- tests/rest/key/v2/test_remote_key_resource.py | 4 ++-- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/synapse/http/site.py b/synapse/http/site.py index 1ceccc197e6..55088fc190e 100644 --- a/synapse/http/site.py +++ b/synapse/http/site.py @@ -84,14 +84,14 @@ def __init__( self, channel: HTTPChannel, site: "SynapseSite", - hs: "HomeServer", + our_server_name: str, *args: Any, max_request_body_size: int = 1024, request_id_header: Optional[str] = None, **kw: Any, ): super().__init__(channel, *args, **kw) - self.our_server_name = hs.hostname + self.our_server_name = our_server_name self._max_request_body_size = max_request_body_size self.request_id_header = request_id_header self.synapse_site = site @@ -701,6 +701,7 @@ def __init__( self.site_tag = site_tag self.reactor: ISynapseReactor = reactor + self.server_name = hs.hostname assert config.http_options is not None proxied = config.http_options.x_forwarded @@ -712,7 +713,7 @@ def request_factory(channel: HTTPChannel, queued: bool) -> Request: return request_class( channel, self, - hs, + our_server_name=self.server_name, max_request_body_size=max_request_body_size, queued=queued, 
request_id_header=request_id_header, diff --git a/tests/handlers/test_oauth_delegation.py b/tests/handlers/test_oauth_delegation.py index 2ef25f73a15..539fa909a86 100644 --- a/tests/handlers/test_oauth_delegation.py +++ b/tests/handlers/test_oauth_delegation.py @@ -867,7 +867,7 @@ async def mock_http_client_request( # First test a known access token channel = FakeChannel(self.site, self.reactor) # type-ignore: FakeChannel is a mock of an HTTPChannel, not a proper HTTPChannel - req = SynapseRequest(channel, self.site, self.hs) # type: ignore[arg-type] + req = SynapseRequest(channel, self.site, self.hs.hostname) # type: ignore[arg-type] req.client.host = EXAMPLE_IPV4_ADDR req.requestHeaders.addRawHeader("Authorization", f"Bearer {known_token}") req.requestHeaders.addRawHeader("User-Agent", EXAMPLE_USER_AGENT) @@ -899,7 +899,7 @@ async def mock_http_client_request( MAS_USER_AGENT = "masmasmas" channel = FakeChannel(self.site, self.reactor) - req = SynapseRequest(channel, self.site, self.hs) # type: ignore[arg-type] + req = SynapseRequest(channel, self.site, self.hs.hostname) # type: ignore[arg-type] req.client.host = MAS_IPV4_ADDR req.requestHeaders.addRawHeader( "Authorization", f"Bearer {self.auth._admin_token()}" diff --git a/tests/logging/test_terse_json.py b/tests/logging/test_terse_json.py index ddc2837ff9a..60de8d786f9 100644 --- a/tests/logging/test_terse_json.py +++ b/tests/logging/test_terse_json.py @@ -172,11 +172,8 @@ def test_with_request_context(self) -> None: site.server_version_string = "Server v1" site.reactor = Mock() - hs = Mock(spec=["hostname"]) - hs.hostname = "test_server" - request = SynapseRequest( - cast(HTTPChannel, FakeChannel(site, self.reactor)), site, hs + cast(HTTPChannel, FakeChannel(site, self.reactor)), site, "test_server" ) # Call requestReceived to finish instantiating the object. 
request.content = BytesIO() diff --git a/tests/rest/key/v2/test_remote_key_resource.py b/tests/rest/key/v2/test_remote_key_resource.py index 62538bbdc37..31f3cb0f122 100644 --- a/tests/rest/key/v2/test_remote_key_resource.py +++ b/tests/rest/key/v2/test_remote_key_resource.py @@ -99,7 +99,7 @@ def make_notary_request(self, server_name: str, key_id: str) -> dict: """ channel = FakeChannel(self.site, self.reactor) # channel is a `FakeChannel` but `HTTPChannel` is expected - req = SynapseRequest(channel, self.site, self.hs) # type: ignore[arg-type] + req = SynapseRequest(channel, self.site, self.hs.hostname) # type: ignore[arg-type] req.content = BytesIO(b"") req.requestReceived( b"GET", @@ -201,7 +201,7 @@ async def post_json( channel = FakeChannel(self.site, self.reactor) # channel is a `FakeChannel` but `HTTPChannel` is expected - req = SynapseRequest(channel, self.site, self.hs) # type: ignore[arg-type] + req = SynapseRequest(channel, self.site, self.hs.hostname) # type: ignore[arg-type] req.content = BytesIO(encode_canonical_json(data)) req.requestReceived( From 347e66d03325c29d37e459bf605583244321b006 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 15:57:00 -0500 Subject: [PATCH 27/48] Fill in `synapse/metrics/__init__.py` --- synapse/federation/sender/__init__.py | 20 ++++++++++++------- .../sender/per_destination_queue.py | 8 ++++++-- synapse/handlers/appservice.py | 14 ++++++++----- synapse/metrics/__init__.py | 14 +++++++++---- 4 files changed, 38 insertions(+), 18 deletions(-) diff --git a/synapse/federation/sender/__init__.py b/synapse/federation/sender/__init__.py index c695f5d494c..0121fac175b 100644 --- a/synapse/federation/sender/__init__.py +++ b/synapse/federation/sender/__init__.py @@ -699,13 +699,19 @@ async def handle_room_events(events: List[EventBase]) -> None: "federation_sender" ).set(ts) - events_processed_counter.inc(len(event_entries)) - - event_processing_loop_room_count.labels("federation_sender").inc( - len(events_by_room) - ) - - event_processing_loop_counter.labels("federation_sender").inc() + events_processed_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc(len(event_entries)) + + event_processing_loop_room_count.labels( + name="federation_sender", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc(len(events_by_room)) + + event_processing_loop_counter.labels( + name="federation_sender", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() synapse.metrics.event_processing_positions.labels( "federation_sender" diff --git a/synapse/federation/sender/per_destination_queue.py b/synapse/federation/sender/per_destination_queue.py index b759644a740..34ba44f14f5 100644 --- a/synapse/federation/sender/per_destination_queue.py +++ b/synapse/federation/sender/per_destination_queue.py @@ -362,7 +362,9 @@ async def _transaction_transmission_loop(self) -> None: self._destination, pending_pdus, pending_edus ) - sent_transactions_counter.inc() + sent_transactions_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc() sent_edus_counter.labels( **{SERVER_NAME_LABEL: self.server_name} ).inc(len(pending_edus)) @@ -585,7 +587,9 @@ async def _catch_up_transmission_loop(self) -> None: self._destination, room_catchup_pdus, [] ) - sent_transactions_counter.inc() + sent_transactions_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc() # We pulled this from the DB, so it'll be non-null assert pdu.internal_metadata.stream_ordering diff --git a/synapse/handlers/appservice.py b/synapse/handlers/appservice.py index 
ec624e405d7..acc1ae63d7a 100644 --- a/synapse/handlers/appservice.py +++ b/synapse/handlers/appservice.py @@ -208,11 +208,15 @@ async def handle_room_events(events: Iterable[EventBase]) -> None: **{SERVER_NAME_LABEL: self.server_name} ).inc(len(events)) - event_processing_loop_room_count.labels("appservice_sender").inc( - len(events_by_room) - ) - - event_processing_loop_counter.labels("appservice_sender").inc() + event_processing_loop_room_count.labels( + name="appservice_sender", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc(len(events_by_room)) + + event_processing_loop_counter.labels( + name="appservice_sender", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() if events: now = self.clock.time_msec() diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index da58df6b968..238e0a043cf 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -395,18 +395,24 @@ def collect(self) -> Iterable[Metric]: # Federation Metrics # -sent_transactions_counter = Counter("synapse_federation_client_sent_transactions", "") +sent_transactions_counter = Counter( + "synapse_federation_client_sent_transactions", "", labelnames=[SERVER_NAME_LABEL] +) -events_processed_counter = Counter("synapse_federation_client_events_processed", "") +events_processed_counter = Counter( + "synapse_federation_client_events_processed", "", labelnames=[SERVER_NAME_LABEL] +) event_processing_loop_counter = Counter( - "synapse_event_processing_loop_count", "Event processing loop iterations", ["name"] + "synapse_event_processing_loop_count", + "Event processing loop iterations", + labelnames=["name", SERVER_NAME_LABEL], ) event_processing_loop_room_count = Counter( "synapse_event_processing_loop_room_count", "Rooms seen per event processing loop iteration", - ["name"], + labelnames=["name", SERVER_NAME_LABEL], ) From 88bf526ae950209ee8d7b75f0d10d10265489a8c Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 16:09:44 -0500 Subject: [PATCH 28/48] Fill in `synapse/push/bulk_push_rule_evaluator.py` --- synapse/push/bulk_push_rule_evaluator.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/synapse/push/bulk_push_rule_evaluator.py b/synapse/push/bulk_push_rule_evaluator.py index 1f4f5b90c3a..af9167b294d 100644 --- a/synapse/push/bulk_push_rule_evaluator.py +++ b/synapse/push/bulk_push_rule_evaluator.py @@ -50,6 +50,7 @@ from synapse.events import EventBase, relation_from_event from synapse.events.snapshot import EventContext from synapse.logging.context import make_deferred_yieldable, run_in_background +from synapse.metrics import SERVER_NAME_LABEL from synapse.state import CREATE_KEY, POWER_KEY from synapse.storage.databases.main.roommember import EventIdMembership from synapse.storage.invite_rule import InviteRule @@ -68,11 +69,16 @@ logger = logging.getLogger(__name__) +# FIXME: These are unused push_rules_invalidation_counter = Counter( - "synapse_push_bulk_push_rule_evaluator_push_rules_invalidation_counter", "" + "synapse_push_bulk_push_rule_evaluator_push_rules_invalidation_counter", + "", + labelnames=[SERVER_NAME_LABEL], ) push_rules_state_size_counter = Counter( - "synapse_push_bulk_push_rule_evaluator_push_rules_state_size_counter", "" + "synapse_push_bulk_push_rule_evaluator_push_rules_state_size_counter", + "", + labelnames=[SERVER_NAME_LABEL], ) From 3cb1d28818ec2b81bc508ee31e38d9c52390813e Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 16:19:30 -0500 Subject: [PATCH 29/48] Restore and standardize comment for 
unused metrics --- synapse/notifier.py | 5 +++++ synapse/push/bulk_push_rule_evaluator.py | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/synapse/notifier.py b/synapse/notifier.py index 887957b151b..45204842ce3 100644 --- a/synapse/notifier.py +++ b/synapse/notifier.py @@ -74,6 +74,11 @@ logger = logging.getLogger(__name__) +# FIXME: Unused metric, remove if not needed. +notified_events_counter = Counter( + "synapse_notifier_notified_events", "", labelnames=[SERVER_NAME_LABEL] +) + users_woken_by_stream_counter = Counter( "synapse_notifier_users_woken_by_stream", "", diff --git a/synapse/push/bulk_push_rule_evaluator.py b/synapse/push/bulk_push_rule_evaluator.py index af9167b294d..9a0a8a2f242 100644 --- a/synapse/push/bulk_push_rule_evaluator.py +++ b/synapse/push/bulk_push_rule_evaluator.py @@ -69,12 +69,13 @@ logger = logging.getLogger(__name__) -# FIXME: These are unused +# FIXME: Unused metric, remove if not needed. push_rules_invalidation_counter = Counter( "synapse_push_bulk_push_rule_evaluator_push_rules_invalidation_counter", "", labelnames=[SERVER_NAME_LABEL], ) +# FIXME: Unused metric, remove if not needed. push_rules_state_size_counter = Counter( "synapse_push_bulk_push_rule_evaluator_push_rules_state_size_counter", "", From c2cb0247dc9a4b3b583067f663ca5afbf6b1c341 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 16:21:58 -0500 Subject: [PATCH 30/48] Fill in `synapse/push/httppusher.py` --- synapse/push/httppusher.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/synapse/push/httppusher.py b/synapse/push/httppusher.py index 7df8a128c9d..53dfb54104e 100644 --- a/synapse/push/httppusher.py +++ b/synapse/push/httppusher.py @@ -31,6 +31,7 @@ from synapse.api.constants import EventTypes from synapse.events import EventBase from synapse.logging import opentracing +from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics.background_process_metrics import run_as_background_process from synapse.push import Pusher, PusherConfig, PusherConfigException from synapse.storage.databases.main.event_push_actions import HttpPushAction @@ -46,21 +47,25 @@ http_push_processed_counter = Counter( "synapse_http_httppusher_http_pushes_processed", "Number of push notifications successfully sent", + labelnames=[SERVER_NAME_LABEL], ) http_push_failed_counter = Counter( "synapse_http_httppusher_http_pushes_failed", "Number of push notifications which failed", + labelnames=[SERVER_NAME_LABEL], ) http_badges_processed_counter = Counter( "synapse_http_httppusher_badge_updates_processed", "Number of badge updates successfully sent", + labelnames=[SERVER_NAME_LABEL], ) http_badges_failed_counter = Counter( "synapse_http_httppusher_badge_updates_failed", "Number of badge updates which failed", + labelnames=[SERVER_NAME_LABEL], ) @@ -106,6 +111,7 @@ class HttpPusher(Pusher): def __init__(self, hs: "HomeServer", pusher_config: PusherConfig): super().__init__(hs, pusher_config) + self.server_name = hs.hostname self._storage_controllers = self.hs.get_storage_controllers() self.app_display_name = pusher_config.app_display_name self.device_display_name = pusher_config.device_display_name @@ -265,7 +271,9 @@ async def _unsafe_process(self) -> None: processed = await self._process_one(push_action) if processed: - http_push_processed_counter.inc() + http_push_processed_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc() self.backoff_delay = HttpPusher.INITIAL_BACKOFF_SEC self.last_stream_ordering = 
push_action.stream_ordering pusher_still_exists = ( @@ -289,7 +297,9 @@ async def _unsafe_process(self) -> None: self.app_id, self.pushkey, self.user_id, self.failing_since ) else: - http_push_failed_counter.inc() + http_push_failed_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc() if not self.failing_since: self.failing_since = self.clock.time_msec() await self.store.update_pusher_failing_since( @@ -540,9 +550,13 @@ async def _send_badge(self, badge: int) -> None: } try: await self.http_client.post_json_get_json(self.url, d) - http_badges_processed_counter.inc() + http_badges_processed_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc() except Exception as e: logger.warning( "Failed to send badge count to %s: %s %s", self.name, type(e), e ) - http_badges_failed_counter.inc() + http_badges_failed_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc() From e469abeaa5b35bcca71c9265db0e97bfa929f39a Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 16:27:47 -0500 Subject: [PATCH 31/48] Fill in `synapse/push/mailer.py` --- synapse/push/mailer.py | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/synapse/push/mailer.py b/synapse/push/mailer.py index fadba480dd8..d76cc8237ba 100644 --- a/synapse/push/mailer.py +++ b/synapse/push/mailer.py @@ -32,6 +32,7 @@ from synapse.api.errors import StoreError from synapse.config.emailconfig import EmailSubjectConfig from synapse.events import EventBase +from synapse.metrics import SERVER_NAME_LABEL from synapse.push.presentable_names import ( calculate_room_name, descriptor_from_member_events, @@ -60,7 +61,7 @@ emails_sent_counter = Counter( "synapse_emails_sent_total", "Emails sent by type", - ["type"], + labelnames=["type", SERVER_NAME_LABEL], ) @@ -123,6 +124,7 @@ def __init__( template_text: jinja2.Template, ): self.hs = hs + self.server_name = hs.hostname self.template_html = template_html self.template_text = template_text @@ -137,8 +139,6 @@ def __init__( logger.info("Created Mailer for app_name %s", app_name) - emails_sent_counter.labels("password_reset") - async def send_password_reset_mail( self, email_address: str, token: str, client_secret: str, sid: str ) -> None: @@ -162,7 +162,10 @@ async def send_password_reset_mail( template_vars: TemplateVars = {"link": link} - emails_sent_counter.labels("password_reset").inc() + emails_sent_counter.labels( + type="password_reset", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() await self.send_email( email_address, @@ -171,8 +174,6 @@ async def send_password_reset_mail( template_vars, ) - emails_sent_counter.labels("registration") - async def send_registration_mail( self, email_address: str, token: str, client_secret: str, sid: str ) -> None: @@ -196,7 +197,10 @@ async def send_registration_mail( template_vars: TemplateVars = {"link": link} - emails_sent_counter.labels("registration").inc() + emails_sent_counter.labels( + type="registration", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() await self.send_email( email_address, @@ -205,8 +209,6 @@ async def send_registration_mail( template_vars, ) - emails_sent_counter.labels("already_in_use") - async def send_already_in_use_mail(self, email_address: str) -> None: """Send an email if the address is already bound to an user account @@ -214,6 +216,11 @@ async def send_already_in_use_mail(self, email_address: str) -> None: email_address: Email address we're sending to the "already in use" mail """ + emails_sent_counter.labels( + 
type="already_in_use", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() + await self.send_email( email_address, self.email_subjects.email_already_in_use @@ -221,8 +228,6 @@ async def send_already_in_use_mail(self, email_address: str) -> None: {}, ) - emails_sent_counter.labels("add_threepid") - async def send_add_threepid_mail( self, email_address: str, token: str, client_secret: str, sid: str ) -> None: @@ -247,7 +252,10 @@ async def send_add_threepid_mail( template_vars: TemplateVars = {"link": link} - emails_sent_counter.labels("add_threepid").inc() + emails_sent_counter.labels( + type="add_threepid", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() await self.send_email( email_address, @@ -256,8 +264,6 @@ async def send_add_threepid_mail( template_vars, ) - emails_sent_counter.labels("notification") - async def send_notification_mail( self, app_id: str, @@ -352,7 +358,10 @@ async def _fetch_room_state(room_id: str) -> None: "reason": reason, } - emails_sent_counter.labels("notification").inc() + emails_sent_counter.labels( + type="notification", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() await self.send_email( email_address, summary_text, template_vars, unsubscribe_link From f49c793772e85d21e8cc6b70a900eef1b27c386d Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 16:39:05 -0500 Subject: [PATCH 32/48] Fill in `synapse/replication/http/_base.py` --- synapse/replication/http/_base.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/synapse/replication/http/_base.py b/synapse/replication/http/_base.py index 00025386801..5b86a4aa0c1 100644 --- a/synapse/replication/http/_base.py +++ b/synapse/replication/http/_base.py @@ -38,6 +38,7 @@ from synapse.http.site import SynapseRequest from synapse.logging import opentracing from synapse.logging.opentracing import trace_with_opname +from synapse.metrics import SERVER_NAME_LABEL from synapse.types import JsonDict from synapse.util.caches.response_cache import ResponseCache from synapse.util.cancellation import is_function_cancellable @@ -57,7 +58,7 @@ _outgoing_request_counter = Counter( "synapse_outgoing_replication_requests", "Number of outgoing replication requests, by replication method name and result", - ["name", "code"], + labelnames=["name", "code", SERVER_NAME_LABEL], ) @@ -200,6 +201,7 @@ def make_client(cls, hs: "HomeServer") -> Callable: parameter to specify which instance to hit (the instance must be in the `instance_map` config). """ + server_name = hs.hostname clock = hs.get_clock() client = hs.get_replication_client() local_instance_name = hs.get_instance_name() @@ -328,15 +330,27 @@ async def send_request( # We convert to SynapseError as we know that it was a SynapseError # on the main process that we should send to the client. 
(And # importantly, not stack traces everywhere) - _outgoing_request_counter.labels(cls.NAME, e.code).inc() + _outgoing_request_counter.labels( + name=cls.NAME, + code=e.code, + **{SERVER_NAME_LABEL: server_name}, + ).inc() raise e.to_synapse_error() except Exception as e: - _outgoing_request_counter.labels(cls.NAME, "ERR").inc() + _outgoing_request_counter.labels( + name=cls.NAME, + code="ERR", + **{SERVER_NAME_LABEL: server_name}, + ).inc() raise SynapseError( 502, f"Failed to talk to {instance_name} process" ) from e - _outgoing_request_counter.labels(cls.NAME, 200).inc() + _outgoing_request_counter.labels( + name=cls.NAME, + code=200, + **{SERVER_NAME_LABEL: server_name}, + ).inc() # Wait on any streams that the remote may have written to. for stream_name, position in result.pop( From 73f05ec1fe4c078c59755086facece40f244b5cb Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 16:41:35 -0500 Subject: [PATCH 33/48] Fill in `synapse/replication/tcp/external_cache.py` --- synapse/replication/tcp/external_cache.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/synapse/replication/tcp/external_cache.py b/synapse/replication/tcp/external_cache.py index a95771b5f60..2c531ae40c4 100644 --- a/synapse/replication/tcp/external_cache.py +++ b/synapse/replication/tcp/external_cache.py @@ -26,6 +26,7 @@ from synapse.logging import opentracing from synapse.logging.context import make_deferred_yieldable +from synapse.metrics import SERVER_NAME_LABEL from synapse.util import json_decoder, json_encoder if TYPE_CHECKING: @@ -36,13 +37,13 @@ set_counter = Counter( "synapse_external_cache_set", "Number of times we set a cache", - labelnames=["cache_name"], + labelnames=["cache_name", SERVER_NAME_LABEL], ) get_counter = Counter( "synapse_external_cache_get", "Number of times we get a cache", - labelnames=["cache_name", "hit"], + labelnames=["cache_name", "hit", SERVER_NAME_LABEL], ) response_timer = Histogram( @@ -69,6 +70,8 @@ class ExternalCache: """ def __init__(self, hs: "HomeServer"): + self.server_name = hs.hostname + if hs.config.redis.redis_enabled: self._redis_connection: Optional["ConnectionHandler"] = ( hs.get_outbound_redis_connection() @@ -93,7 +96,9 @@ async def set(self, cache_name: str, key: str, value: Any, expiry_ms: int) -> No if self._redis_connection is None: return - set_counter.labels(cache_name).inc() + set_counter.labels( + cache_name=cache_name, **{SERVER_NAME_LABEL: self.server_name} + ).inc() # txredisapi requires the value to be string, bytes or numbers, so we # encode stuff in JSON. 
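
The `**{SERVER_NAME_LABEL: ...}` splat used at these call sites exists because the label name is a shared constant rather than a literal keyword. A minimal sketch of the pattern, assuming `SERVER_NAME_LABEL = "server_name"` (the metric and value names below are illustrative, not the real Synapse metrics):

```python
from prometheus_client import Counter

SERVER_NAME_LABEL = "server_name"  # assumed value of the synapse.metrics constant

cache_set_counter = Counter(
    "example_external_cache_set",
    "Number of times we set a cache",
    labelnames=["cache_name", SERVER_NAME_LABEL],
)

# A variable cannot be used directly as a keyword-argument name, so the label
# is passed by splatting a one-entry dict; the constant stays defined in one place.
cache_set_counter.labels(
    cache_name="event_cache", **{SERVER_NAME_LABEL: "hs1.example.com"}
).inc()
```
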
@@ -131,7 +136,11 @@ async def get(self, cache_name: str, key: str) -> Optional[Any]: logger.debug("Got cache result %s %s: %r", cache_name, key, result) - get_counter.labels(cache_name, result is not None).inc() + get_counter.labels( + cache_name=cache_name, + hit=result is not None, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() if not result: return None From 7523a5abba249431a405676536e3424474449b4b Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 16:44:57 -0500 Subject: [PATCH 34/48] Fill in `synapse/replication/tcp/handler.py` --- synapse/replication/tcp/handler.py | 36 +++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/synapse/replication/tcp/handler.py b/synapse/replication/tcp/handler.py index 1fafbb48c3e..b015c401cfc 100644 --- a/synapse/replication/tcp/handler.py +++ b/synapse/replication/tcp/handler.py @@ -40,7 +40,7 @@ from twisted.internet.protocol import ReconnectingClientFactory -from synapse.metrics import LaterGauge +from synapse.metrics import SERVER_NAME_LABEL, LaterGauge from synapse.metrics.background_process_metrics import run_as_background_process from synapse.replication.tcp.commands import ( ClearUserSyncsCommand, @@ -81,13 +81,26 @@ # number of updates received for each RDATA stream inbound_rdata_count = Counter( - "synapse_replication_tcp_protocol_inbound_rdata_count", "", ["stream_name"] + "synapse_replication_tcp_protocol_inbound_rdata_count", + "", + labelnames=["stream_name", SERVER_NAME_LABEL], +) +user_sync_counter = Counter( + "synapse_replication_tcp_resource_user_sync", "", labelnames=[SERVER_NAME_LABEL] +) +federation_ack_counter = Counter( + "synapse_replication_tcp_resource_federation_ack", + "", + labelnames=[SERVER_NAME_LABEL], +) +# FIXME: Unused metric, remove if not needed. 
+remove_pusher_counter = Counter( + "synapse_replication_tcp_resource_remove_pusher", "", labelnames=[SERVER_NAME_LABEL] ) -user_sync_counter = Counter("synapse_replication_tcp_resource_user_sync", "") -federation_ack_counter = Counter("synapse_replication_tcp_resource_federation_ack", "") -remove_pusher_counter = Counter("synapse_replication_tcp_resource_remove_pusher", "") -user_ip_cache_counter = Counter("synapse_replication_tcp_resource_user_ip_cache", "") +user_ip_cache_counter = Counter( + "synapse_replication_tcp_resource_user_ip_cache", "", labelnames=[SERVER_NAME_LABEL] +) # the type of the entries in _command_queues_by_stream @@ -102,6 +115,7 @@ class ReplicationCommandHandler: """ def __init__(self, hs: "HomeServer"): + self.server_name = hs.hostname self._replication_data_handler = hs.get_replication_data_handler() self._presence_handler = hs.get_presence_handler() self._store = hs.get_datastores().main @@ -437,7 +451,7 @@ def will_announce_positions(self) -> None: def on_USER_SYNC( self, conn: IReplicationConnection, cmd: UserSyncCommand ) -> Optional[Awaitable[None]]: - user_sync_counter.inc() + user_sync_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc() if self._is_presence_writer: return self._presence_handler.update_external_syncs_row( @@ -461,7 +475,7 @@ def on_CLEAR_USER_SYNC( def on_FEDERATION_ACK( self, conn: IReplicationConnection, cmd: FederationAckCommand ) -> None: - federation_ack_counter.inc() + federation_ack_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc() if self._federation_sender: self._federation_sender.federation_ack(cmd.instance_name, cmd.token) @@ -469,7 +483,7 @@ def on_FEDERATION_ACK( def on_USER_IP( self, conn: IReplicationConnection, cmd: UserIpCommand ) -> Optional[Awaitable[None]]: - user_ip_cache_counter.inc() + user_ip_cache_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc() if self._is_master or self._should_insert_client_ips: # We make a point of only returning an awaitable if there's actually @@ -509,7 +523,9 @@ def on_RDATA(self, conn: IReplicationConnection, cmd: RdataCommand) -> None: return stream_name = cmd.stream_name - inbound_rdata_count.labels(stream_name).inc() + inbound_rdata_count.labels( + stream_name=stream_name, **{SERVER_NAME_LABEL: self.server_name} + ).inc() # We put the received command into a queue here for two reasons: # 1. 
so we don't try and concurrently handle multiple rows for the From 3fd2cf428e1edb0c680f5efbb442b89d9503bf11 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 16:59:13 -0500 Subject: [PATCH 35/48] Fill in `synapse/replication/tcp/protocol.py` --- synapse/replication/tcp/protocol.py | 41 +++++++++++++++++++++-------- synapse/replication/tcp/redis.py | 19 +++++++++++-- 2 files changed, 47 insertions(+), 13 deletions(-) diff --git a/synapse/replication/tcp/protocol.py b/synapse/replication/tcp/protocol.py index fb9c539122b..c53d3f15b01 100644 --- a/synapse/replication/tcp/protocol.py +++ b/synapse/replication/tcp/protocol.py @@ -39,7 +39,7 @@ from twisted.python.failure import Failure from synapse.logging.context import PreserveLoggingContext -from synapse.metrics import LaterGauge +from synapse.metrics import SERVER_NAME_LABEL, LaterGauge from synapse.metrics.background_process_metrics import ( BackgroundProcessLoggingContext, run_as_background_process, @@ -64,19 +64,21 @@ connection_close_counter = Counter( - "synapse_replication_tcp_protocol_close_reason", "", ["reason_type"] + "synapse_replication_tcp_protocol_close_reason", + "", + labelnames=["reason_type", SERVER_NAME_LABEL], ) tcp_inbound_commands_counter = Counter( "synapse_replication_tcp_protocol_inbound_commands", "Number of commands received from replication, by command and name of process connected to", - ["command", "name"], + labelnames=["command", "name", SERVER_NAME_LABEL], ) tcp_outbound_commands_counter = Counter( "synapse_replication_tcp_protocol_outbound_commands", "Number of commands sent to replication, by command and name of process connected to", - ["command", "name"], + labelnames=["command", "name", SERVER_NAME_LABEL], ) # A list of all connected protocols. This allows us to send metrics about the @@ -137,7 +139,10 @@ class BaseReplicationStreamProtocol(LineOnlyReceiver): max_line_buffer = 10000 - def __init__(self, clock: Clock, handler: "ReplicationCommandHandler"): + def __init__( + self, server_name: str, clock: Clock, handler: "ReplicationCommandHandler" + ): + self.server_name = server_name self.clock = clock self.command_handler = handler @@ -244,7 +249,11 @@ def _parse_and_dispatch_line(self, line: bytes) -> None: self.last_received_command = self.clock.time_msec() - tcp_inbound_commands_counter.labels(cmd.NAME, self.name).inc() + tcp_inbound_commands_counter.labels( + command=cmd.NAME, + name=self.name, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() self.handle_command(cmd) @@ -318,7 +327,11 @@ def send_command(self, cmd: Command, do_buffer: bool = True) -> None: self._queue_command(cmd) return - tcp_outbound_commands_counter.labels(cmd.NAME, self.name).inc() + tcp_outbound_commands_counter.labels( + command=cmd.NAME, + name=self.name, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() string = "%s %s" % (cmd.NAME, cmd.to_line()) if "\n" in string: @@ -390,9 +403,15 @@ def connectionLost(self, reason: Failure) -> None: # type: ignore[override] logger.info("[%s] Replication connection closed: %r", self.id(), reason) if isinstance(reason, Failure): assert reason.type is not None - connection_close_counter.labels(reason.type.__name__).inc() + connection_close_counter.labels( + reason_type=reason.type.__name__, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() else: - connection_close_counter.labels(reason.__class__.__name__).inc() # type: ignore[unreachable] + connection_close_counter.labels( # type: ignore[unreachable] + reason_type=reason.__class__.__name__, + 
**{SERVER_NAME_LABEL: self.server_name}, + ).inc() try: # Remove us from list of connections to be monitored @@ -449,7 +468,7 @@ class ServerReplicationStreamProtocol(BaseReplicationStreamProtocol): def __init__( self, server_name: str, clock: Clock, handler: "ReplicationCommandHandler" ): - super().__init__(clock, handler) + super().__init__(server_name, clock, handler) self.server_name = server_name @@ -474,7 +493,7 @@ def __init__( clock: Clock, command_handler: "ReplicationCommandHandler", ): - super().__init__(clock, command_handler) + super().__init__(server_name, clock, command_handler) self.client_name = client_name self.server_name = server_name diff --git a/synapse/replication/tcp/redis.py b/synapse/replication/tcp/redis.py index c4601a61410..9a2f6978c04 100644 --- a/synapse/replication/tcp/redis.py +++ b/synapse/replication/tcp/redis.py @@ -37,6 +37,7 @@ from twisted.python.failure import Failure from synapse.logging.context import PreserveLoggingContext, make_deferred_yieldable +from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics.background_process_metrics import ( BackgroundProcessLoggingContext, run_as_background_process, @@ -97,6 +98,9 @@ class RedisSubscriber(SubscriberProtocol): immediately after initialisation. Attributes: + server_name: The homeserver name of the Synapse instance that this connection + is associated with. This is used to label metrics and should be set to + `hs.hostname`. synapse_handler: The command handler to handle incoming commands. synapse_stream_prefix: The *redis* stream name to subscribe to and publish from (not anything to do with Synapse replication streams). @@ -104,6 +108,7 @@ class RedisSubscriber(SubscriberProtocol): commands. """ + server_name: str synapse_handler: "ReplicationCommandHandler" synapse_stream_prefix: str synapse_channel_names: List[str] @@ -171,7 +176,11 @@ def _parse_and_dispatch_message(self, message: str) -> None: # We use "redis" as the name here as we don't have 1:1 connections to # remote instances. - tcp_inbound_commands_counter.labels(cmd.NAME, "redis").inc() + tcp_inbound_commands_counter.labels( + command=cmd.NAME, + name="redis", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() self.handle_command(cmd) @@ -232,7 +241,11 @@ async def _async_send_command(self, cmd: Command) -> None: # We use "redis" as the name here as we don't have 1:1 connections to # remote instances. - tcp_outbound_commands_counter.labels(cmd.NAME, "redis").inc() + tcp_outbound_commands_counter.labels( + command=cmd.NAME, + name="redis", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() channel_name = cmd.redis_channel_name(self.synapse_stream_prefix) @@ -350,6 +363,7 @@ def __init__( password=hs.config.redis.redis_password, ) + self.server_name = hs.hostname self.synapse_handler = hs.get_replication_command_handler() self.synapse_stream_prefix = hs.hostname self.synapse_channel_names = channel_names @@ -364,6 +378,7 @@ def buildProtocol(self, addr: IAddress) -> RedisSubscriber: # as to do so would involve overriding `buildProtocol` entirely, however # the base method does some other things than just instantiating the # protocol. 
+ p.server_name = self.server_name p.synapse_handler = self.synapse_handler p.synapse_outbound_redis_connection = self.synapse_outbound_redis_connection p.synapse_stream_prefix = self.synapse_stream_prefix From 513249b13444c11e678dfb03ea250cf100919ecf Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 17:00:44 -0500 Subject: [PATCH 36/48] Fill in `synapse/replication/tcp/resource.py` --- synapse/replication/tcp/resource.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/synapse/replication/tcp/resource.py b/synapse/replication/tcp/resource.py index d647a2b3326..d8576d2af36 100644 --- a/synapse/replication/tcp/resource.py +++ b/synapse/replication/tcp/resource.py @@ -29,6 +29,7 @@ from twisted.internet.interfaces import IAddress from twisted.internet.protocol import ServerFactory +from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics.background_process_metrics import run_as_background_process from synapse.replication.tcp.commands import PositionCommand from synapse.replication.tcp.protocol import ServerReplicationStreamProtocol @@ -40,7 +41,9 @@ from synapse.server import HomeServer stream_updates_counter = Counter( - "synapse_replication_tcp_resource_stream_updates", "", ["stream_name"] + "synapse_replication_tcp_resource_stream_updates", + "", + labelnames=["stream_name", SERVER_NAME_LABEL], ) logger = logging.getLogger(__name__) @@ -78,6 +81,7 @@ class ReplicationStreamer: """ def __init__(self, hs: "HomeServer"): + self.server_name = hs.hostname self.store = hs.get_datastores().main self.clock = hs.get_clock() self.notifier = hs.get_notifier() @@ -219,7 +223,10 @@ async def _run_notifier_loop(self) -> None: len(updates), current_token, ) - stream_updates_counter.labels(stream.NAME).inc(len(updates)) + stream_updates_counter.labels( + stream_name=stream.NAME, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc(len(updates)) else: # The token has advanced but there is no data to From e0ffa36431d37def748bfd75a758aefdf0dd4c8f Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 17:03:43 -0500 Subject: [PATCH 37/48] Fill in `synapse/state/__init__.py` --- synapse/state/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/synapse/state/__init__.py b/synapse/state/__init__.py index 1c3e5d00a92..f5e9f75842d 100644 --- a/synapse/state/__init__.py +++ b/synapse/state/__init__.py @@ -51,6 +51,7 @@ ) from synapse.logging.context import ContextResourceUsage from synapse.logging.opentracing import tag_args, trace +from synapse.metrics import SERVER_NAME_LABEL from synapse.replication.http.state import ReplicationUpdateCurrentStateRestServlet from synapse.state import v1, v2 from synapse.storage.databases.main.events_worker import EventRedactBehaviour @@ -606,11 +607,13 @@ class _StateResMetrics: "synapse_state_res_cpu_for_biggest_room_seconds", "CPU time spent performing state resolution for the single most expensive " "room for state resolution", + labelnames=[SERVER_NAME_LABEL], ) _biggest_room_by_db_counter = Counter( "synapse_state_res_db_for_biggest_room_seconds", "Database time spent performing state resolution for the single most " "expensive room for state resolution", + labelnames=[SERVER_NAME_LABEL], ) _cpu_times = Histogram( @@ -631,6 +634,7 @@ class StateResolutionHandler: """ def __init__(self, hs: "HomeServer"): + self.server_name = hs.hostname self.clock = hs.get_clock() self.resolve_linearizer = Linearizer(name="state_resolve_lock") @@ -873,7 +877,9 @@ def _report_biggest( # report 
info on the single biggest to prometheus _, biggest_metrics = biggest[0] - prometheus_counter_metric.inc(extract_key(biggest_metrics)) + prometheus_counter_metric.labels(**{SERVER_NAME_LABEL: self.server_name}).inc( + extract_key(biggest_metrics) + ) def _make_state_cache_entry( From bc17483b31ed837801c3baf4b9817cfb68de3dcb Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 17:05:29 -0500 Subject: [PATCH 38/48] Fill in `synapse/storage/database.py` --- synapse/storage/database.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/synapse/storage/database.py b/synapse/storage/database.py index 61881956149..d999a4c2825 100644 --- a/synapse/storage/database.py +++ b/synapse/storage/database.py @@ -61,7 +61,7 @@ current_context, make_deferred_yieldable, ) -from synapse.metrics import LaterGauge, register_threadpool +from synapse.metrics import SERVER_NAME_LABEL, LaterGauge, register_threadpool from synapse.metrics.background_process_metrics import run_as_background_process from synapse.storage.background_updates import BackgroundUpdater from synapse.storage.engines import BaseDatabaseEngine, PostgresEngine, Sqlite3Engine @@ -85,8 +85,16 @@ sql_scheduling_timer = Histogram("synapse_storage_schedule_time", "sec") sql_query_timer = Histogram("synapse_storage_query_time", "sec", ["verb"]) -sql_txn_count = Counter("synapse_storage_transaction_time_count", "sec", ["desc"]) -sql_txn_duration = Counter("synapse_storage_transaction_time_sum", "sec", ["desc"]) +sql_txn_count = Counter( + "synapse_storage_transaction_time_count", + "sec", + labelnames=["desc", SERVER_NAME_LABEL], +) +sql_txn_duration = Counter( + "synapse_storage_transaction_time_sum", + "sec", + labelnames=["desc", SERVER_NAME_LABEL], +) # Unique indexes which have been added in background updates. 
Maps from table name @@ -561,6 +569,7 @@ def __init__( engine: BaseDatabaseEngine, ): self.hs = hs + self.server_name = hs.hostname self._clock = hs.get_clock() self._txn_limit = database_config.config.get("txn_limit", 0) self._database_config = database_config @@ -866,8 +875,14 @@ def new_transaction( self._current_txn_total_time += duration self._txn_perf_counters.update(desc, duration) - sql_txn_count.labels(desc).inc(1) - sql_txn_duration.labels(desc).inc(duration) + sql_txn_count.labels( + desc=desc, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc(1) + sql_txn_duration.labels( + desc=desc, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc(duration) async def runInteraction( self, From 36a7a89ccac88d76019639b0ebdd3b72392601d1 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 17:08:20 -0500 Subject: [PATCH 39/48] Fill in `synapse/storage/controllers/persist_events.py` --- synapse/storage/controllers/persist_events.py | 35 ++++++++++++++----- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/synapse/storage/controllers/persist_events.py b/synapse/storage/controllers/persist_events.py index f5131fe2915..20e6782670d 100644 --- a/synapse/storage/controllers/persist_events.py +++ b/synapse/storage/controllers/persist_events.py @@ -61,6 +61,7 @@ start_active_span_follows_from, trace, ) +from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics.background_process_metrics import run_as_background_process from synapse.storage.controllers.state import StateStorageController from synapse.storage.databases import Databases @@ -82,19 +83,23 @@ logger = logging.getLogger(__name__) # The number of times we are recalculating the current state -state_delta_counter = Counter("synapse_storage_events_state_delta", "") +state_delta_counter = Counter( + "synapse_storage_events_state_delta", "", labelnames=[SERVER_NAME_LABEL] +) # The number of times we are recalculating state when there is only a # single forward extremity state_delta_single_event_counter = Counter( - "synapse_storage_events_state_delta_single_event", "" + "synapse_storage_events_state_delta_single_event", + "", + labelnames=[SERVER_NAME_LABEL], ) # The number of times we are reculating state when we could have resonably # calculated the delta when we calculated the state for an event we were # persisting. state_delta_reuse_delta_counter = Counter( - "synapse_storage_events_state_delta_reuse_delta", "" + "synapse_storage_events_state_delta_reuse_delta", "", labelnames=[SERVER_NAME_LABEL] ) # The number of forward extremities for each new event. 
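
The mypy plugin at the start of this series enforces `server_name` in `labelnames` statically; the same invariant can be spot-checked at runtime against the default registry. A rough sketch (the helper name and skip logic are illustrative, not part of this series):

```python
from typing import List

from prometheus_client import REGISTRY


def families_missing_server_name() -> List[str]:
    """Return metric family names whose labelled samples lack `server_name`."""
    missing = set()
    for family in REGISTRY.collect():
        for sample in family.samples:
            # Unlabelled samples (e.g. the process_* collectors) are out of
            # scope; only metrics declared with labelnames matter here.
            if sample.labels and "server_name" not in sample.labels:
                missing.add(family.name)
    return sorted(missing)
```
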
@@ -115,16 +120,19 @@ state_resolutions_during_persistence = Counter( "synapse_storage_events_state_resolutions_during_persistence", "Number of times we had to do state res to calculate new current state", + labelnames=[SERVER_NAME_LABEL], ) potential_times_prune_extremities = Counter( "synapse_storage_events_potential_times_prune_extremities", "Number of times we might be able to prune extremities", + labelnames=[SERVER_NAME_LABEL], ) times_pruned_extremities = Counter( "synapse_storage_events_times_pruned_extremities", "Number of times we were actually be able to prune extremities", + labelnames=[SERVER_NAME_LABEL], ) @@ -337,6 +345,7 @@ def __init__( assert stores.persist_events self.persist_events_store = stores.persist_events + self.server_name = hs.hostname self._clock = hs.get_clock() self._instance_name = hs.get_instance_name() self.is_mine_id = hs.is_mine_id @@ -698,9 +707,11 @@ async def _calculate_new_forward_extremities_and_state_delta( if all_single_prev_not_state: return (new_forward_extremities, None) - state_delta_counter.inc() + state_delta_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc() if len(new_latest_event_ids) == 1: - state_delta_single_event_counter.inc() + state_delta_single_event_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc() # This is a fairly handwavey check to see if we could # have guessed what the delta would have been when @@ -715,7 +726,9 @@ async def _calculate_new_forward_extremities_and_state_delta( for ev, _ in ev_ctx_rm: prev_event_ids = set(ev.prev_event_ids()) if latest_event_ids == prev_event_ids: - state_delta_reuse_delta_counter.inc() + state_delta_reuse_delta_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc() break logger.debug("Calculating state delta for room %s", room_id) @@ -977,7 +990,9 @@ async def _get_new_state_after_events( ), ) - state_resolutions_during_persistence.inc() + state_resolutions_during_persistence.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc() # If the returned state matches the state group of one of the new # forward extremities then we check if we are able to prune some state @@ -1005,7 +1020,9 @@ async def _prune_extremities( """See if we can prune any of the extremities after calculating the resolved state. """ - potential_times_prune_extremities.inc() + potential_times_prune_extremities.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc() # We keep all the extremities that have the same state group, and # see if we can drop the others. 
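
Calling `.labels()` on every increment does a lookup in the metric's child map; hot paths could instead bind the labelled child once per instance. A sketch of that alternative (class and metric names are illustrative, and this is not a change the series makes):

```python
from prometheus_client import Counter

state_delta_counter = Counter(
    "example_storage_events_state_delta", "", labelnames=["server_name"]
)


class PersistEventsController:
    def __init__(self, server_name: str) -> None:
        # `.labels()` returns a child collector; binding it here avoids the
        # per-call dict lookup when incrementing in the persistence loop.
        self._state_delta = state_delta_counter.labels(server_name=server_name)

    def on_state_delta(self) -> None:
        self._state_delta.inc()
```
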
@@ -1103,7 +1120,7 @@ async def _prune_extremities(

             return new_latest_event_ids

-        times_pruned_extremities.inc()
+        times_pruned_extremities.labels(**{SERVER_NAME_LABEL: self.server_name}).inc()

         logger.info(
             "Pruning forward extremities in room %s: from %s -> %s",

From 24f3e0ac6ed43580734cd889c79e95997a6c5995 Mon Sep 17 00:00:00 2001
From: Eric Eastwood
Date: Fri, 4 Jul 2025 17:10:31 -0500
Subject: [PATCH 40/48] Fill in `synapse/storage/databases/main/event_federation.py`

---
 synapse/storage/databases/main/event_federation.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/synapse/storage/databases/main/event_federation.py b/synapse/storage/databases/main/event_federation.py
index 46aa5902d8d..afeda220d18 100644
--- a/synapse/storage/databases/main/event_federation.py
+++ b/synapse/storage/databases/main/event_federation.py
@@ -45,6 +45,7 @@
 from synapse.api.room_versions import EventFormatVersions, RoomVersion
 from synapse.events import EventBase, make_event_from_dict
 from synapse.logging.opentracing import tag_args, trace
+from synapse.metrics import SERVER_NAME_LABEL
 from synapse.metrics.background_process_metrics import wrap_as_background_process
 from synapse.storage._base import SQLBaseStore, db_to_json, make_in_list_sql_clause
 from synapse.storage.background_updates import ForeignKeyConstraint
@@ -80,6 +81,7 @@
     "synapse_federation_server_number_inbound_pdu_pruned",
     "The number of events in the inbound federation staging that have been "
     "pruned due to the queue getting too long",
+    labelnames=[SERVER_NAME_LABEL],
 )

 logger = logging.getLogger(__name__)
@@ -137,6 +139,7 @@ def __init__(
         super().__init__(database, db_conn, hs)

         self.hs = hs
+        self.server_name = hs.hostname

         if hs.config.worker.run_background_tasks:
             hs.get_clock().looping_call(
@@ -1997,7 +2000,9 @@ async def prune_staged_events_in_room(
         if not to_delete:
             return False

-        pdus_pruned_from_federation_queue.inc(len(to_delete))
+        pdus_pruned_from_federation_queue.labels(
+            **{SERVER_NAME_LABEL: self.server_name}
+        ).inc(len(to_delete))
         logger.info(
             "Pruning %d events in room %s from federation queue",
             len(to_delete),

From 05ef5e29c21358f459dabd1c25c91ff0e9479a49 Mon Sep 17 00:00:00 2001
From: Eric Eastwood
Date: Fri, 4 Jul 2025 17:12:19 -0500
Subject: [PATCH 41/48] Fix `server_name` attribute problems with database
 classes (`EventFederationWorkerStore`)

```
synapse/storage/databases/main/__init__.py:114: error: Cannot determine type of "server_name" in base class "EventFederationWorkerStore" [misc]
```
---
 synapse/storage/_base.py                           | 1 +
 synapse/storage/databases/main/event_federation.py | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/synapse/storage/_base.py b/synapse/storage/_base.py
index 2309b1648e3..548e7df9305 100644
--- a/synapse/storage/_base.py
+++ b/synapse/storage/_base.py
@@ -55,6 +55,7 @@ def __init__(
         hs: "HomeServer",
     ):
         self.hs = hs
+        self.server_name = hs.hostname
         self._clock = hs.get_clock()
         self.database_engine = database.engine
         self.db_pool = database
diff --git a/synapse/storage/databases/main/event_federation.py b/synapse/storage/databases/main/event_federation.py
index afeda220d18..63910232aa6 100644
--- a/synapse/storage/databases/main/event_federation.py
+++ b/synapse/storage/databases/main/event_federation.py
@@ -139,7 +139,6 @@ def __init__(
         super().__init__(database, db_conn, hs)

         self.hs = hs
-        self.server_name = hs.hostname

         if hs.config.worker.run_background_tasks:
             hs.get_clock().looping_call(

From 05ecf5c9e3b6648db807ee041bd1ed67438ea1d5 Mon Sep 17 00:00:00
2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 17:23:29 -0500 Subject: [PATCH 42/48] Fill in `synapse/storage/databases/main/events.py` --- synapse/storage/databases/main/events.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py index b7cc0433e75..4143809ed7b 100644 --- a/synapse/storage/databases/main/events.py +++ b/synapse/storage/databases/main/events.py @@ -55,6 +55,7 @@ from synapse.events.snapshot import EventContext from synapse.events.utils import parse_stripped_state_event from synapse.logging.opentracing import trace +from synapse.metrics import SERVER_NAME_LABEL from synapse.storage._base import db_to_json, make_in_list_sql_clause from synapse.storage.database import ( DatabasePool, @@ -88,11 +89,13 @@ logger = logging.getLogger(__name__) -persist_event_counter = Counter("synapse_storage_events_persisted_events", "") +persist_event_counter = Counter( + "synapse_storage_events_persisted_events", "", labelnames=[SERVER_NAME_LABEL] +) event_counter = Counter( "synapse_storage_events_persisted_events_sep", "", - ["type", "origin_type", "origin_entity"], + labelnames=["type", "origin_type", "origin_entity", SERVER_NAME_LABEL], ) # State event type/key pairs that we need to gather to fill in the @@ -236,6 +239,7 @@ def __init__( db_conn: LoggingDatabaseConnection, ): self.hs = hs + self.server_name = hs.hostname self.db_pool = db self.store = main_data_store self.database_engine = db.engine @@ -356,7 +360,9 @@ async def _persist_events_and_state_updates( new_event_links=new_event_links, sliding_sync_table_changes=sliding_sync_table_changes, ) - persist_event_counter.inc(len(events_and_contexts)) + persist_event_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc( + len(events_and_contexts) + ) if not use_negative_stream_ordering: # we don't want to set the event_persisted_position to a negative @@ -374,7 +380,12 @@ async def _persist_events_and_state_updates( origin_type = "remote" origin_entity = get_domain_from_id(event.sender) - event_counter.labels(event.type, origin_type, origin_entity).inc() + event_counter.labels( + type=event.type, + origin_type=origin_type, + origin_entity=origin_entity, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() if new_forward_extremities: self.store.get_latest_event_ids_in_room.prefill( @@ -2719,7 +2730,7 @@ def _add_to_cache( txn: LoggingTransaction, events_and_contexts: List[Tuple[EventBase, EventContext]], ) -> None: - to_prefill = [] + to_prefill: List[EventCacheEntry] = [] ev_map = {e.event_id: e for e, _ in events_and_contexts} if not ev_map: From 732f605afb235079c729171f9dc9af432efe32b7 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 17:31:17 -0500 Subject: [PATCH 43/48] Fill in `synapse/util/ratelimitutils.py` --- synapse/rest/client/register.py | 6 ++++-- synapse/server.py | 3 ++- synapse/util/ratelimitutils.py | 24 ++++++++++++++++++------ tests/util/test_ratelimitutils.py | 8 ++++---- 4 files changed, 28 insertions(+), 13 deletions(-) diff --git a/synapse/rest/client/register.py b/synapse/rest/client/register.py index 58231d2b044..13148072d78 100644 --- a/synapse/rest/client/register.py +++ b/synapse/rest/client/register.py @@ -323,10 +323,12 @@ class UsernameAvailabilityRestServlet(RestServlet): def __init__(self, hs: "HomeServer"): super().__init__() self.hs = hs + self.server_name = hs.hostname self.registration_handler = hs.get_registration_handler() self.ratelimiter = 
FederationRateLimiter( - hs.get_clock(), - FederationRatelimitSettings( + our_server_name=self.server_name, + clock=hs.get_clock(), + config=FederationRatelimitSettings( # Time window of 2s window_size=2000, # Artificially delay requests if rate > sleep_limit/window_size diff --git a/synapse/server.py b/synapse/server.py index fd16abb9ead..52b3b0d5dcd 100644 --- a/synapse/server.py +++ b/synapse/server.py @@ -844,7 +844,8 @@ def get_replication_streams(self) -> Dict[str, Stream]: @cache_in_self def get_federation_ratelimiter(self) -> FederationRateLimiter: return FederationRateLimiter( - self.get_clock(), + our_server_name=self.hostname, + clock=self.get_clock(), config=self.config.ratelimiting.rc_federation, metrics_name="federation_servlets", ) diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py index 3f067b792c4..343ee4f4549 100644 --- a/synapse/util/ratelimitutils.py +++ b/synapse/util/ratelimitutils.py @@ -52,7 +52,7 @@ run_in_background, ) from synapse.logging.opentracing import start_active_span -from synapse.metrics import Histogram, LaterGauge +from synapse.metrics import SERVER_NAME_LABEL, Histogram, LaterGauge from synapse.util import Clock if typing.TYPE_CHECKING: @@ -65,12 +65,12 @@ rate_limit_sleep_counter = Counter( "synapse_rate_limit_sleep", "Number of requests slept by the rate limiter", - ["rate_limiter_name"], + labelnames=["rate_limiter_name", SERVER_NAME_LABEL], ) rate_limit_reject_counter = Counter( "synapse_rate_limit_reject", "Number of requests rejected by the rate limiter", - ["rate_limiter_name"], + labelnames=["rate_limiter_name", SERVER_NAME_LABEL], ) queue_wait_timer = Histogram( "synapse_rate_limit_queue_wait_time_seconds", @@ -157,6 +157,7 @@ class FederationRateLimiter: def __init__( self, + our_server_name: str, clock: Clock, config: FederationRatelimitSettings, metrics_name: Optional[str] = None, @@ -174,7 +175,10 @@ def __init__( def new_limiter() -> "_PerHostRatelimiter": return _PerHostRatelimiter( - clock=clock, config=config, metrics_name=metrics_name + our_server_name=our_server_name, + clock=clock, + config=config, + metrics_name=metrics_name, ) self.ratelimiters: DefaultDict[str, "_PerHostRatelimiter"] = ( @@ -205,6 +209,7 @@ def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None] class _PerHostRatelimiter: def __init__( self, + our_server_name: str, clock: Clock, config: FederationRatelimitSettings, metrics_name: Optional[str] = None, @@ -218,6 +223,7 @@ def __init__( for this rate limiter. 
from the rest in the metrics """ + self.our_server_name = our_server_name self.clock = clock self.metrics_name = metrics_name @@ -296,7 +302,10 @@ def _on_enter(self, request_id: object) -> "defer.Deferred[None]": if self.should_reject(): logger.debug("Ratelimiter(%s): rejecting request", self.host) if self.metrics_name: - rate_limit_reject_counter.labels(self.metrics_name).inc() + rate_limit_reject_counter.labels( + rate_limiter_name=self.metrics_name, + **{SERVER_NAME_LABEL: self.our_server_name}, + ).inc() raise LimitExceededError( limiter_name="rc_federation", retry_after_ms=int(self.window_size / self.sleep_limit), @@ -333,7 +342,10 @@ def queue_request() -> "defer.Deferred[None]": self.sleep_sec, ) if self.metrics_name: - rate_limit_sleep_counter.labels(self.metrics_name).inc() + rate_limit_sleep_counter.labels( + rate_limiter_name=self.metrics_name, + **{SERVER_NAME_LABEL: self.our_server_name}, + ).inc() ret_defer = run_in_background(self.clock.sleep, self.sleep_sec) self.sleeping_requests.add(request_id) diff --git a/tests/util/test_ratelimitutils.py b/tests/util/test_ratelimitutils.py index 7bb45f9bf21..20281d04fec 100644 --- a/tests/util/test_ratelimitutils.py +++ b/tests/util/test_ratelimitutils.py @@ -37,7 +37,7 @@ def test_ratelimit(self) -> None: """A simple test with the default values""" reactor, clock = get_clock() rc_config = build_rc_config() - ratelimiter = FederationRateLimiter(clock, rc_config) + ratelimiter = FederationRateLimiter("test_server", clock, rc_config) with ratelimiter.ratelimit("testhost") as d1: # shouldn't block @@ -47,7 +47,7 @@ def test_concurrent_limit(self) -> None: """Test what happens when we hit the concurrent limit""" reactor, clock = get_clock() rc_config = build_rc_config({"rc_federation": {"concurrent": 2}}) - ratelimiter = FederationRateLimiter(clock, rc_config) + ratelimiter = FederationRateLimiter("test_server", clock, rc_config) with ratelimiter.ratelimit("testhost") as d1: # shouldn't block @@ -74,7 +74,7 @@ def test_sleep_limit(self) -> None: rc_config = build_rc_config( {"rc_federation": {"sleep_limit": 2, "sleep_delay": 500}} ) - ratelimiter = FederationRateLimiter(clock, rc_config) + ratelimiter = FederationRateLimiter("test_server", clock, rc_config) with ratelimiter.ratelimit("testhost") as d1: # shouldn't block @@ -105,7 +105,7 @@ def test_lots_of_queued_things(self) -> None: } } ) - ratelimiter = FederationRateLimiter(clock, rc_config) + ratelimiter = FederationRateLimiter("test_server", clock, rc_config) with ratelimiter.ratelimit("testhost") as d: # shouldn't block From 5fb3acb2f258bd82ffcdb6b2f5235bc49a55c613 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Mon, 7 Jul 2025 16:34:33 -0500 Subject: [PATCH 44/48] Fill in `synapse/metrics/background_process_metrics.py` TODO: still need to fill in `run_as_background_process` usage --- synapse/metrics/background_process_metrics.py | 76 +++++++++++++------ synapse/replication/tcp/protocol.py | 4 +- synapse/replication/tcp/redis.py | 2 +- .../test_background_process_metrics.py | 4 +- 4 files changed, 59 insertions(+), 27 deletions(-) diff --git a/synapse/metrics/background_process_metrics.py b/synapse/metrics/background_process_metrics.py index 49d0ff9fc19..5f9883100cc 100644 --- a/synapse/metrics/background_process_metrics.py +++ b/synapse/metrics/background_process_metrics.py @@ -49,6 +49,7 @@ PreserveLoggingContext, ) from synapse.logging.opentracing import SynapseTags, start_active_span +from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics._types import Collector 
if TYPE_CHECKING: @@ -64,13 +65,13 @@ _background_process_start_count = Counter( "synapse_background_process_start_count", "Number of background processes started", - ["name"], + labelnames=["name", SERVER_NAME_LABEL], ) _background_process_in_flight_count = Gauge( "synapse_background_process_in_flight_count", "Number of background processes in flight", - labelnames=["name"], + labelnames=["name", SERVER_NAME_LABEL], ) # we set registry=None in all of these to stop them getting registered with @@ -80,21 +81,21 @@ _background_process_ru_utime = Counter( "synapse_background_process_ru_utime_seconds", "User CPU time used by background processes, in seconds", - ["name"], + labelnames=["name", SERVER_NAME_LABEL], registry=None, ) _background_process_ru_stime = Counter( "synapse_background_process_ru_stime_seconds", "System CPU time used by background processes, in seconds", - ["name"], + labelnames=["name", SERVER_NAME_LABEL], registry=None, ) _background_process_db_txn_count = Counter( "synapse_background_process_db_txn_count", "Number of database transactions done by background processes", - ["name"], + labelnames=["name", SERVER_NAME_LABEL], registry=None, ) @@ -104,14 +105,14 @@ "Seconds spent by background processes waiting for database " "transactions, excluding scheduling time" ), - ["name"], + labelnames=["name", SERVER_NAME_LABEL], registry=None, ) _background_process_db_sched_duration = Counter( "synapse_background_process_db_sched_duration_seconds", "Seconds spent by background processes waiting for database connections", - ["name"], + labelnames=["name", SERVER_NAME_LABEL], registry=None, ) @@ -169,8 +170,9 @@ def collect(self) -> Iterable[Metric]: class _BackgroundProcess: - def __init__(self, desc: str, ctx: LoggingContext): + def __init__(self, *, desc: str, server_name: str, ctx: LoggingContext): self.desc = desc + self.server_name = server_name self._context = ctx self._reported_stats: Optional[ContextResourceUsage] = None @@ -185,15 +187,21 @@ def update_metrics(self) -> None: # For unknown reasons, the difference in times can be negative. See comment in # synapse.http.request_metrics.RequestMetrics.update_metrics. 
-        _background_process_ru_utime.labels(self.desc).inc(max(diff.ru_utime, 0))
-        _background_process_ru_stime.labels(self.desc).inc(max(diff.ru_stime, 0))
-        _background_process_db_txn_count.labels(self.desc).inc(diff.db_txn_count)
-        _background_process_db_txn_duration.labels(self.desc).inc(
-            diff.db_txn_duration_sec
-        )
-        _background_process_db_sched_duration.labels(self.desc).inc(
-            diff.db_sched_duration_sec
-        )
+        _background_process_ru_utime.labels(
+            name=self.desc, **{SERVER_NAME_LABEL: self.server_name}
+        ).inc(max(diff.ru_utime, 0))
+        _background_process_ru_stime.labels(
+            name=self.desc, **{SERVER_NAME_LABEL: self.server_name}
+        ).inc(max(diff.ru_stime, 0))
+        _background_process_db_txn_count.labels(
+            name=self.desc, **{SERVER_NAME_LABEL: self.server_name}
+        ).inc(diff.db_txn_count)
+        _background_process_db_txn_duration.labels(
+            name=self.desc, **{SERVER_NAME_LABEL: self.server_name}
+        ).inc(diff.db_txn_duration_sec)
+        _background_process_db_sched_duration.labels(
+            name=self.desc, **{SERVER_NAME_LABEL: self.server_name}
+        ).inc(diff.db_sched_duration_sec)

 R = TypeVar("R")

@@ -201,6 +209,7 @@ def update_metrics(self) -> None:

 def run_as_background_process(
     desc: "LiteralString",
+    server_name: str,
     func: Callable[..., Awaitable[Optional[R]]],
     *args: Any,
     bg_start_span: bool = True,
@@ -218,6 +227,8 @@ def run_as_background_process(

     Args:
         desc: a description for this background process type
+        server_name: The homeserver name that this background process is being run for
+            (this should be `hs.hostname`).
         func: a function, which may return a Deferred or a coroutine
         bg_start_span: Whether to start an opentracing span. Defaults to True.
             Should only be disabled for processes that will not log to or tag
@@ -236,10 +247,16 @@ async def run() -> Optional[R]:
         count = _background_process_counts.get(desc, 0)
         _background_process_counts[desc] = count + 1

-        _background_process_start_count.labels(desc).inc()
-        _background_process_in_flight_count.labels(desc).inc()
+        _background_process_start_count.labels(
+            name=desc, **{SERVER_NAME_LABEL: server_name}
+        ).inc()
+        _background_process_in_flight_count.labels(
+            name=desc, **{SERVER_NAME_LABEL: server_name}
+        ).inc()

-        with BackgroundProcessLoggingContext(desc, count) as context:
+        with BackgroundProcessLoggingContext(
+            name=desc, server_name=server_name, instance_id=count
+        ) as context:
             try:
                 if bg_start_span:
                     ctx = start_active_span(
@@ -256,7 +273,9 @@ async def run() -> Optional[R]:
                 )
                 return None
             finally:
-                _background_process_in_flight_count.labels(desc).dec()
+                _background_process_in_flight_count.labels(
+                    name=desc, **{SERVER_NAME_LABEL: server_name}
+                ).dec()

     with PreserveLoggingContext():
         # Note that we return a Deferred here so that it can be used in a
@@ -317,13 +336,20 @@ class BackgroundProcessLoggingContext(LoggingContext):

     __slots__ = ["_proc"]

-    def __init__(self, name: str, instance_id: Optional[Union[int, str]] = None):
+    def __init__(
+        self,
+        *,
+        name: str,
+        server_name: str,
+        instance_id: Optional[Union[int, str]] = None,
+    ):
         """
         Args:
             name: The name of the background process. Each distinct `name` gets a
                 separate prometheus time series.
-
+            server_name: The homeserver name that this background process is being run
+                for (this should be `hs.hostname`).
             instance_id: an identifier to add to `name` to distinguish this instance of
                 the named background process in the logs. If this is `None`, one is
                 made up based on id(self).
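One note on the bare `*` that appears in these rewritten signatures (my reading of the change, not something the commit message states): `name`/`desc` and `server_name` are both plain strings, so making the parameters keyword-only turns a transposed argument into an immediate `TypeError` rather than silently mislabelled metrics. A trimmed sketch of the idiom (hypothetical stand-in; the real `_BackgroundProcess` also takes a `ctx` logging context):

```python
class _Process:
    """Trimmed, hypothetical stand-in for _BackgroundProcess."""

    def __init__(self, *, desc: str, server_name: str) -> None:
        # Both parameters are strings; the bare "*" forces callers to name
        # them, so swapping the two cannot go unnoticed.
        self.desc = desc
        self.server_name = server_name


_Process(desc="as-sender", server_name="hs1.example.com")  # ok
# _Process("as-sender", "hs1.example.com")  # -> TypeError at the call site
```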
@@ -331,7 +357,9 @@ def __init__(self, name: str, instance_id: Optional[Union[int, str]] = None): if instance_id is None: instance_id = id(self) super().__init__("%s-%s" % (name, instance_id)) - self._proc: Optional[_BackgroundProcess] = _BackgroundProcess(name, self) + self._proc: Optional[_BackgroundProcess] = _BackgroundProcess( + name=name, server_name=server_name, ctx=self + ) def start(self, rusage: "Optional[resource.struct_rusage]") -> None: """Log context has started running (again).""" diff --git a/synapse/replication/tcp/protocol.py b/synapse/replication/tcp/protocol.py index c53d3f15b01..8fa26d8481c 100644 --- a/synapse/replication/tcp/protocol.py +++ b/synapse/replication/tcp/protocol.py @@ -171,7 +171,9 @@ def __init__( # capture the sentinel context as its containing context and won't prevent # GC of / unintentionally reactivate what would be the current context. self._logging_context = BackgroundProcessLoggingContext( - "replication-conn", self.conn_id + name="replication-conn", + server_name=self.server_name, + instance_id=self.conn_id, ) def connectionMade(self) -> None: diff --git a/synapse/replication/tcp/redis.py b/synapse/replication/tcp/redis.py index 9a2f6978c04..a67f4ce3be1 100644 --- a/synapse/replication/tcp/redis.py +++ b/synapse/replication/tcp/redis.py @@ -124,7 +124,7 @@ def __init__(self, *args: Any, **kwargs: Any): # capture the sentinel context as its containing context and won't prevent # GC of / unintentionally reactivate what would be the current context. self._logging_context = BackgroundProcessLoggingContext( - "replication_command_handler" + name="replication_command_handler", server_name=self.server_name ) def connectionMade(self) -> None: diff --git a/tests/metrics/test_background_process_metrics.py b/tests/metrics/test_background_process_metrics.py index f0f6cb29128..7253e51c38a 100644 --- a/tests/metrics/test_background_process_metrics.py +++ b/tests/metrics/test_background_process_metrics.py @@ -14,6 +14,8 @@ def test_update_metrics_with_negative_time_diff(self) -> None: mock_logging_context = Mock(spec=LoggingContext) mock_logging_context.get_resource_usage.return_value = usage - process = _BackgroundProcess("test process", mock_logging_context) + process = _BackgroundProcess( + name="test process", server_name="test_server", ctx=mock_logging_context + ) # Should not raise process.update_metrics() From b396b7c5384d958f37d57f4b92cdb10d0f12757a Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Mon, 7 Jul 2025 18:21:14 -0500 Subject: [PATCH 45/48] Fill in `synapse/metrics/background_process_metrics.py` (`run_as_background_process`) --- synapse/_scripts/update_synapse_database.py | 7 ++- synapse/appservice/scheduler.py | 60 +++++++++++-------- synapse/crypto/keyring.py | 6 +- synapse/events/auto_accept_invites.py | 2 + synapse/federation/sender/__init__.py | 5 +- .../sender/per_destination_queue.py | 1 + synapse/handlers/appservice.py | 4 +- synapse/handlers/auth.py | 1 + synapse/handlers/deactivate_account.py | 5 +- synapse/handlers/delayed_events.py | 9 ++- synapse/handlers/device.py | 4 ++ synapse/handlers/federation.py | 14 ++++- synapse/handlers/federation_event.py | 2 + synapse/handlers/message.py | 6 +- synapse/handlers/pagination.py | 7 ++- synapse/handlers/presence.py | 6 +- synapse/handlers/room_member.py | 5 +- synapse/handlers/stats.py | 3 +- synapse/handlers/typing.py | 15 ++++- synapse/handlers/user_directory.py | 12 +++- synapse/media/media_repository.py | 8 ++- synapse/media/url_previewer.py | 2 +- 
synapse/metrics/background_process_metrics.py | 2 +- synapse/metrics/common_usage_metrics.py | 6 +- synapse/module_api/__init__.py | 2 + synapse/push/emailpusher.py | 3 +- synapse/push/httppusher.py | 6 +- synapse/push/pusherpool.py | 5 +- synapse/replication/tcp/client.py | 5 +- synapse/replication/tcp/handler.py | 5 +- synapse/replication/tcp/protocol.py | 4 +- synapse/replication/tcp/redis.py | 12 +++- synapse/replication/tcp/resource.py | 4 +- synapse/rest/client/room.py | 2 + synapse/server.py | 2 +- synapse/storage/background_updates.py | 6 +- synapse/storage/controllers/persist_events.py | 6 +- synapse/storage/database.py | 2 + .../storage/databases/main/account_data.py | 1 + synapse/storage/databases/main/cache.py | 5 +- synapse/storage/databases/main/deviceinbox.py | 2 + synapse/storage/databases/main/devices.py | 1 + .../storage/databases/main/end_to_end_keys.py | 1 + .../storage/databases/main/events_worker.py | 7 ++- synapse/storage/databases/main/presence.py | 1 + synapse/storage/databases/main/push_rule.py | 1 + synapse/storage/databases/main/pusher.py | 1 + synapse/storage/databases/main/receipts.py | 1 + synapse/storage/databases/main/room.py | 1 + synapse/storage/util/id_generators.py | 7 +++ synapse/util/batching_queue.py | 9 ++- synapse/util/caches/expiringcache.py | 4 +- synapse/util/distributor.py | 12 +++- synapse/util/retryutils.py | 21 +++++-- synapse/util/task_scheduler.py | 5 +- tests/appservice/test_scheduler.py | 20 ++++++- .../test_background_process_metrics.py | 2 +- tests/test_distributor.py | 2 +- tests/util/test_batching_queue.py | 5 +- 59 files changed, 278 insertions(+), 87 deletions(-) diff --git a/synapse/_scripts/update_synapse_database.py b/synapse/_scripts/update_synapse_database.py index d8b4dbd6c64..70e55984184 100644 --- a/synapse/_scripts/update_synapse_database.py +++ b/synapse/_scripts/update_synapse_database.py @@ -53,6 +53,7 @@ def __init__(self, config: HomeServerConfig): def run_background_updates(hs: HomeServer) -> None: + server_name = hs.hostname main = hs.get_datastores().main state = hs.get_datastores().state @@ -66,7 +67,11 @@ async def run_background_updates() -> None: def run() -> None: # Apply all background updates on the database. defer.ensureDeferred( - run_as_background_process("background_updates", run_background_updates) + run_as_background_process( + "background_updates", + server_name, + run_background_updates, + ) ) reactor.callWhenRunning(run) diff --git a/synapse/appservice/scheduler.py b/synapse/appservice/scheduler.py index ab3f4e15fe9..87b8bdb2d9c 100644 --- a/synapse/appservice/scheduler.py +++ b/synapse/appservice/scheduler.py @@ -103,18 +103,15 @@ class ApplicationServiceScheduler: - """Public facing API for this module. Does the required DI to tie the - components together. This also serves as the "event_pool", which in this + """ + Public facing API for this module. Does the required dependency injection (DI) to + tie the components together. This also serves as the "event_pool", which in this case is a simple array. 
""" def __init__(self, hs: "HomeServer"): - self.clock = hs.get_clock() - self.store = hs.get_datastores().main - self.as_api = hs.get_application_service_api() - - self.txn_ctrl = _TransactionController(self.clock, self.store, self.as_api) - self.queuer = _ServiceQueuer(self.txn_ctrl, self.clock, hs) + self.txn_ctrl = _TransactionController(hs) + self.queuer = _ServiceQueuer(self.txn_ctrl, hs) async def start(self) -> None: logger.info("Starting appservice scheduler") @@ -184,9 +181,7 @@ class _ServiceQueuer: appservice at a given time. """ - def __init__( - self, txn_ctrl: "_TransactionController", clock: Clock, hs: "HomeServer" - ): + def __init__(self, txn_ctrl: "_TransactionController", hs: "HomeServer"): # dict of {service_id: [events]} self.queued_events: Dict[str, List[EventBase]] = {} # dict of {service_id: [events]} @@ -199,10 +194,11 @@ def __init__( # the appservices which currently have a transaction in flight self.requests_in_flight: Set[str] = set() self.txn_ctrl = txn_ctrl - self.clock = clock self._msc3202_transaction_extensions_enabled: bool = ( hs.config.experimental.msc3202_transaction_extensions ) + self.server_name = hs.hostname + self.clock = hs.get_clock() self._store = hs.get_datastores().main def start_background_request(self, service: ApplicationService) -> None: @@ -210,7 +206,9 @@ def start_background_request(self, service: ApplicationService) -> None: if service.id in self.requests_in_flight: return - run_as_background_process("as-sender", self._send_request, service) + run_as_background_process( + "as-sender", self.server_name, self._send_request, service + ) async def _send_request(self, service: ApplicationService) -> None: # sanity-check: we shouldn't get here if this service already has a sender @@ -359,10 +357,11 @@ class _TransactionController: (Note we have only have one of these in the homeserver.) """ - def __init__(self, clock: Clock, store: DataStore, as_api: ApplicationServiceApi): - self.clock = clock - self.store = store - self.as_api = as_api + def __init__(self, hs: "HomeServer"): + self.server_name = hs.hostname + self.clock = hs.get_clock() + self.store = hs.get_datastores().main + self.as_api = hs.get_application_service_api() # map from service id to recoverer instance self.recoverers: Dict[str, "_Recoverer"] = {} @@ -446,7 +445,12 @@ def start_recoverer(self, service: ApplicationService) -> None: logger.info("Starting recoverer for AS ID %s", service.id) assert service.id not in self.recoverers recoverer = self.RECOVERER_CLASS( - self.clock, self.store, self.as_api, service, self.on_recovered + self.server_name, + self.clock, + self.store, + self.as_api, + service, + self.on_recovered, ) self.recoverers[service.id] = recoverer recoverer.recover() @@ -477,21 +481,24 @@ class _Recoverer: We have one of these for each appservice which is currently considered DOWN. Args: - clock (synapse.util.Clock): - store (synapse.storage.DataStore): - as_api (synapse.appservice.api.ApplicationServiceApi): - service (synapse.appservice.ApplicationService): the service we are managing - callback (callable[_Recoverer]): called once the service recovers. + server_name: the homeserver name (used to label metrics) (this should be `hs.hostname`). + clock: + store: + as_api: + service: the service we are managing + callback: called once the service recovers. 
""" def __init__( self, + server_name: str, clock: Clock, store: DataStore, as_api: ApplicationServiceApi, service: ApplicationService, callback: Callable[["_Recoverer"], Awaitable[None]], ): + self.server_name = server_name self.clock = clock self.store = store self.as_api = as_api @@ -504,7 +511,11 @@ def recover(self) -> None: delay = 2**self.backoff_counter logger.info("Scheduling retries on %s in %fs", self.service.id, delay) self.scheduled_recovery = self.clock.call_later( - delay, run_as_background_process, "as-recoverer", self.retry + delay, + run_as_background_process, + "as-recoverer", + self.server_name, + self.retry, ) def _backoff(self) -> None: @@ -525,6 +536,7 @@ def force_retry(self) -> None: # Run a retry, which will resechedule a recovery if it fails. run_as_background_process( "retry", + self.server_name, self.retry, ) diff --git a/synapse/crypto/keyring.py b/synapse/crypto/keyring.py index 643d2d4e66d..3a7b327b99d 100644 --- a/synapse/crypto/keyring.py +++ b/synapse/crypto/keyring.py @@ -473,8 +473,12 @@ async def _inner_fetch_key_request( class KeyFetcher(metaclass=abc.ABCMeta): def __init__(self, hs: "HomeServer"): + self.server_name = hs.hostname self._queue = BatchingQueue( - self.__class__.__name__, hs.get_clock(), self._fetch_keys + name=self.__class__.__name__, + server_name=self.server_name, + clock=hs.get_clock(), + process_batch_callback=self._fetch_keys, ) async def get_keys( diff --git a/synapse/events/auto_accept_invites.py b/synapse/events/auto_accept_invites.py index 9e5f76f33f0..300ecfdf7a3 100644 --- a/synapse/events/auto_accept_invites.py +++ b/synapse/events/auto_accept_invites.py @@ -34,6 +34,7 @@ class InviteAutoAccepter: def __init__(self, config: AutoAcceptInvitesConfig, api: ModuleApi): # Keep a reference to the Module API. 
self._api = api + self.server_name = api.server_name self._config = config if not self._config.enabled: @@ -113,6 +114,7 @@ async def on_new_event(self, event: EventBase, *args: Any) -> None: # that occurs when responding to invites over federation (see https://github.com/matrix-org/synapse-auto-accept-invite/issues/12) run_as_background_process( "retry_make_join", + self.server_name, self._retry_make_join, event.state_key, event.state_key, diff --git a/synapse/federation/sender/__init__.py b/synapse/federation/sender/__init__.py index 0121fac175b..082b61db0a9 100644 --- a/synapse/federation/sender/__init__.py +++ b/synapse/federation/sender/__init__.py @@ -438,6 +438,7 @@ def __init__(self, hs: "HomeServer"): run_as_background_process, WAKEUP_RETRY_PERIOD_SEC * 1000.0, "wake_destinations_needing_catchup", + self.server_name, self._wake_destinations_needing_catchup, ) @@ -480,7 +481,9 @@ def notify_new_events(self, max_token: RoomStreamToken) -> None: # fire off a processing loop in the background run_as_background_process( - "process_event_queue_for_federation", self._process_event_queue_loop + "process_event_queue_for_federation", + self.server_name, + self._process_event_queue_loop, ) async def _process_event_queue_loop(self) -> None: diff --git a/synapse/federation/sender/per_destination_queue.py b/synapse/federation/sender/per_destination_queue.py index 34ba44f14f5..8f29ec34b73 100644 --- a/synapse/federation/sender/per_destination_queue.py +++ b/synapse/federation/sender/per_destination_queue.py @@ -311,6 +311,7 @@ def attempt_new_transaction(self) -> None: run_as_background_process( "federation_transaction_transmission_loop", + self.server_name, self._transaction_transmission_loop, ) diff --git a/synapse/handlers/appservice.py b/synapse/handlers/appservice.py index acc1ae63d7a..3c84b7781c7 100644 --- a/synapse/handlers/appservice.py +++ b/synapse/handlers/appservice.py @@ -167,7 +167,9 @@ async def start_scheduler() -> None: except Exception: logger.error("Application Services Failure") - run_as_background_process("as_scheduler", start_scheduler) + run_as_background_process( + "as_scheduler", self.server_name, start_scheduler + ) self.started_scheduler = True # Fork off pushes to these services diff --git a/synapse/handlers/auth.py b/synapse/handlers/auth.py index a802941340b..be25a01c9d1 100644 --- a/synapse/handlers/auth.py +++ b/synapse/handlers/auth.py @@ -248,6 +248,7 @@ def __init__(self, hs: "HomeServer"): run_as_background_process, 5 * 60 * 1000, "expire_old_sessions", + self.server_name, self._expire_old_sessions, ) diff --git a/synapse/handlers/deactivate_account.py b/synapse/handlers/deactivate_account.py index 42e53d920ab..980544074ea 100644 --- a/synapse/handlers/deactivate_account.py +++ b/synapse/handlers/deactivate_account.py @@ -40,6 +40,7 @@ class DeactivateAccountHandler: def __init__(self, hs: "HomeServer"): self.store = hs.get_datastores().main self.hs = hs + self.server_name = hs.hostname self._auth_handler = hs.get_auth_handler() self._device_handler = hs.get_device_handler() self._room_member_handler = hs.get_room_member_handler() @@ -245,7 +246,9 @@ def _start_user_parting(self) -> None: pending deactivation, if it isn't already running. 
""" if not self._user_parter_running: - run_as_background_process("user_parter_loop", self._user_parter_loop) + run_as_background_process( + "user_parter_loop", self.server_name, self._user_parter_loop + ) async def _user_parter_loop(self) -> None: """Loop that parts deactivated users from rooms""" diff --git a/synapse/handlers/delayed_events.py b/synapse/handlers/delayed_events.py index 80cb1cec9b0..a1466677699 100644 --- a/synapse/handlers/delayed_events.py +++ b/synapse/handlers/delayed_events.py @@ -54,6 +54,7 @@ class DelayedEventsHandler: def __init__(self, hs: "HomeServer"): + self.server_name = hs.hostname self._store = hs.get_datastores().main self._storage_controllers = hs.get_storage_controllers() self._config = hs.config @@ -109,12 +110,13 @@ async def _schedule_db_events() -> None: # Can send the events in background after having awaited on marking them as processed run_as_background_process( "_send_events", + self.server_name, self._send_events, events, ) self._initialized_from_db = run_as_background_process( - "_schedule_db_events", _schedule_db_events + "_schedule_db_events", self.server_name, _schedule_db_events ) else: self._repl_client = ReplicationAddedDelayedEventRestServlet.make_client(hs) @@ -139,7 +141,9 @@ async def process() -> None: finally: self._event_processing = False - run_as_background_process("delayed_events.notify_new_event", process) + run_as_background_process( + "delayed_events.notify_new_event", self.server_name, process + ) async def _unsafe_process_new_event(self) -> None: # If self._event_pos is None then means we haven't fetched it from the DB yet @@ -447,6 +451,7 @@ def _schedule_next_at(self, next_send_ts: Timestamp) -> None: delay_sec, run_as_background_process, "_send_on_timeout", + self.server_name, self._send_on_timeout, ) else: diff --git a/synapse/handlers/device.py b/synapse/handlers/device.py index c6e44dae6a3..3e6a4891bd7 100644 --- a/synapse/handlers/device.py +++ b/synapse/handlers/device.py @@ -564,6 +564,7 @@ def __init__(self, hs: "HomeServer"): run_as_background_process, DELETE_STALE_DEVICES_INTERVAL_MS, "delete_stale_devices", + self.server_name, self._delete_stale_devices, ) @@ -1213,6 +1214,7 @@ class DeviceListUpdater(DeviceListWorkerUpdater): "Handles incoming device list updates from federation and updates the DB" def __init__(self, hs: "HomeServer", device_handler: DeviceHandler): + self.server_name = hs.hostname self.store = hs.get_datastores().main self.federation = hs.get_federation_client() self.clock = hs.get_clock() @@ -1243,6 +1245,7 @@ def __init__(self, hs: "HomeServer", device_handler: DeviceHandler): self.clock.looping_call( run_as_background_process, 30 * 1000, + server_name=self.server_name, func=self._maybe_retry_device_resync, desc="_maybe_retry_device_resync", ) @@ -1364,6 +1367,7 @@ async def _handle_device_updates(self, user_id: str) -> None: await self.store.mark_remote_users_device_caches_as_stale([user_id]) run_as_background_process( "_maybe_retry_device_resync", + self.server_name, self.multi_user_device_resync, [user_id], False, diff --git a/synapse/handlers/federation.py b/synapse/handlers/federation.py index 015fb3edca0..956312ccde7 100644 --- a/synapse/handlers/federation.py +++ b/synapse/handlers/federation.py @@ -204,7 +204,9 @@ def __init__(self, hs: "HomeServer"): # were shut down. 
if not hs.config.worker.worker_app: run_as_background_process( - "resume_sync_partial_state_room", self._resume_partial_state_room_sync + "resume_sync_partial_state_room", + self.server_name, + self._resume_partial_state_room_sync, ) @trace @@ -333,6 +335,7 @@ async def _maybe_backfill_inner( ) run_as_background_process( "_maybe_backfill_inner_anyway_with_max_depth", + self.server_name, self.maybe_backfill, room_id=room_id, # We use `MAX_DEPTH` so that we find all backfill points next @@ -806,7 +809,10 @@ async def do_invite_join( # have. Hence we fire off the background task, but don't wait for it. run_as_background_process( - "handle_queued_pdus", self._handle_queued_pdus, room_queue + "handle_queued_pdus", + self.server_name, + self._handle_queued_pdus, + room_queue, ) async def do_knock( @@ -1890,7 +1896,9 @@ async def _sync_partial_state_room_wrapper() -> None: ) run_as_background_process( - desc="sync_partial_state_room", func=_sync_partial_state_room_wrapper + desc="sync_partial_state_room", + server_name=self.server_name, + func=_sync_partial_state_room_wrapper, ) async def _sync_partial_state_room( diff --git a/synapse/handlers/federation_event.py b/synapse/handlers/federation_event.py index cbdbd10cae4..136ecc808a2 100644 --- a/synapse/handlers/federation_event.py +++ b/synapse/handlers/federation_event.py @@ -940,6 +940,7 @@ async def _process_new_pulled_events(new_events: Collection[EventBase]) -> None: if len(events_with_failed_pull_attempts) > 0: run_as_background_process( "_process_new_pulled_events_with_failed_pull_attempts", + self.server_name, _process_new_pulled_events, events_with_failed_pull_attempts, ) @@ -1533,6 +1534,7 @@ async def _process_received_pdu( if resync: run_as_background_process( "resync_device_due_to_pdu", + self.server_name, self._resync_device, event.sender, ) diff --git a/synapse/handlers/message.py b/synapse/handlers/message.py index 5d6ee6996f7..23700c6ebb7 100644 --- a/synapse/handlers/message.py +++ b/synapse/handlers/message.py @@ -92,6 +92,7 @@ class MessageHandler: """Contains some read only APIs to get state about a room""" def __init__(self, hs: "HomeServer"): + self.server_name = hs.hostname self.auth = hs.get_auth() self.clock = hs.get_clock() self.state = hs.get_state_handler() @@ -107,7 +108,7 @@ def __init__(self, hs: "HomeServer"): if not hs.config.worker.worker_app: run_as_background_process( - "_schedule_next_expiry", self._schedule_next_expiry + "_schedule_next_expiry", self.server_name, self._schedule_next_expiry ) async def get_room_data( @@ -439,6 +440,7 @@ def _schedule_expiry_for_event(self, event_id: str, expiry_ts: int) -> None: delay, run_as_background_process, "_expire_event", + self.server_name, self._expire_event, event_id, ) @@ -541,6 +543,7 @@ def __init__(self, hs: "HomeServer"): self.clock.looping_call( lambda: run_as_background_process( "send_dummy_events_to_fill_extremities", + self.server_name, self._send_dummy_events_to_fill_extremities, ), 5 * 60 * 1000, @@ -1971,6 +1974,7 @@ async def persist_and_notify_client_events( # matters as sometimes presence code can take a while. 
run_as_background_process( "bump_presence_active_time", + self.server_name, self._bump_active_time, requester.user, requester.device_id, diff --git a/synapse/handlers/pagination.py b/synapse/handlers/pagination.py index 4070b74b7af..df1a7e714ce 100644 --- a/synapse/handlers/pagination.py +++ b/synapse/handlers/pagination.py @@ -79,12 +79,12 @@ class PaginationHandler: def __init__(self, hs: "HomeServer"): self.hs = hs + self.server_name = hs.hostname self.auth = hs.get_auth() self.store = hs.get_datastores().main self._storage_controllers = hs.get_storage_controllers() self._state_storage_controller = self._storage_controllers.state self.clock = hs.get_clock() - self._server_name = hs.hostname self._room_shutdown_handler = hs.get_room_shutdown_handler() self._relations_handler = hs.get_relations_handler() self._worker_locks = hs.get_worker_locks_handler() @@ -119,6 +119,7 @@ def __init__(self, hs: "HomeServer"): run_as_background_process, job.interval, "purge_history_for_rooms_in_range", + self.server_name, self.purge_history_for_rooms_in_range, job.shortest_max_lifetime, job.longest_max_lifetime, @@ -245,6 +246,7 @@ async def purge_history_for_rooms_in_range( # other purges in the same room. run_as_background_process( PURGE_HISTORY_ACTION_NAME, + self.server_name, self.purge_history, room_id, token, @@ -395,7 +397,7 @@ async def purge_room( write=True, ): # first check that we have no users in this room - joined = await self.store.is_host_joined(room_id, self._server_name) + joined = await self.store.is_host_joined(room_id, self.server_name) if joined: if force: logger.info( @@ -604,6 +606,7 @@ async def get_messages( # for a costly federation call and processing. run_as_background_process( "maybe_backfill_in_the_background", + self.server_name, self.hs.get_federation_handler().maybe_backfill, room_id, curr_topo, diff --git a/synapse/handlers/presence.py b/synapse/handlers/presence.py index bbe81dbf4f3..e71a50d6f1c 100644 --- a/synapse/handlers/presence.py +++ b/synapse/handlers/presence.py @@ -534,6 +534,7 @@ def __init__(self, hs: "HomeServer"): "shutdown", run_as_background_process, "generic_presence.on_shutdown", + self.server_name, self._on_shutdown, ) @@ -834,6 +835,7 @@ def __init__(self, hs: "HomeServer"): "shutdown", run_as_background_process, "presence.on_shutdown", + self.server_name, self._on_shutdown, ) @@ -1523,7 +1525,9 @@ async def _process_presence() -> None: finally: self._event_processing = False - run_as_background_process("presence.notify_new_event", _process_presence) + run_as_background_process( + "presence.notify_new_event", self.server_name, _process_presence + ) async def _unsafe_process(self) -> None: # Loop round handling deltas until we're up to date diff --git a/synapse/handlers/room_member.py b/synapse/handlers/room_member.py index cf9db7b0189..21ac7e8a1cd 100644 --- a/synapse/handlers/room_member.py +++ b/synapse/handlers/room_member.py @@ -2183,6 +2183,7 @@ def __init__(self, hs: "HomeServer"): super().__init__(hs) self._hs = hs + self.server_name = hs.hostname self._store = hs.get_datastores().main self._storage_controllers = hs.get_storage_controllers() self._clock = hs.get_clock() @@ -2214,7 +2215,9 @@ async def process() -> None: finally: self._is_processing = False - run_as_background_process("room_forgetter.notify_new_event", process) + run_as_background_process( + "room_forgetter.notify_new_event", self.server_name, process + ) async def _unsafe_process(self) -> None: # If self.pos is None then means we haven't fetched it from DB diff --git 
a/synapse/handlers/stats.py b/synapse/handlers/stats.py index 8f90c170602..5fd539aa270 100644 --- a/synapse/handlers/stats.py +++ b/synapse/handlers/stats.py @@ -53,6 +53,7 @@ class StatsHandler: def __init__(self, hs: "HomeServer"): self.hs = hs + self.server_name = hs.hostname self.store = hs.get_datastores().main self._storage_controllers = hs.get_storage_controllers() self.state = hs.get_state_handler() @@ -88,7 +89,7 @@ async def process() -> None: finally: self._is_processing = False - run_as_background_process("stats.notify_new_event", process) + run_as_background_process("stats.notify_new_event", self.server_name, process) async def _unsafe_process(self) -> None: # If self.pos is None then means we haven't fetched it from DB diff --git a/synapse/handlers/typing.py b/synapse/handlers/typing.py index 8d693fee304..f0bbf414e6e 100644 --- a/synapse/handlers/typing.py +++ b/synapse/handlers/typing.py @@ -80,7 +80,7 @@ class FollowerTypingHandler: def __init__(self, hs: "HomeServer"): self.store = hs.get_datastores().main self._storage_controllers = hs.get_storage_controllers() - self.server_name = hs.config.server.server_name + self.server_name = hs.hostname self.clock = hs.get_clock() self.is_mine_id = hs.is_mine_id self.is_mine_server_name = hs.is_mine_server_name @@ -143,7 +143,11 @@ def _handle_timeout_for_member(self, now: int, member: RoomMember) -> None: last_fed_poke = self._member_last_federation_poke.get(member, None) if not last_fed_poke or last_fed_poke + FEDERATION_PING_INTERVAL <= now: run_as_background_process( - "typing._push_remote", self._push_remote, member=member, typing=True + "typing._push_remote", + self.server_name, + self._push_remote, + member=member, + typing=True, ) # Add a paranoia timer to ensure that we always have a timer for @@ -216,6 +220,7 @@ def process_replication_rows( if self.federation: run_as_background_process( "_send_changes_in_typing_to_remotes", + self.server_name, self._send_changes_in_typing_to_remotes, row.room_id, prev_typing, @@ -375,7 +380,11 @@ def _push_update(self, member: RoomMember, typing: bool) -> None: if self.hs.is_mine_id(member.user_id): # Only send updates for changes to our own users. 
run_as_background_process( - "typing._push_remote", self._push_remote, member, typing + "typing._push_remote", + self.server_name, + self._push_remote, + member, + typing, ) self._push_update_local(member=member, typing=typing) diff --git a/synapse/handlers/user_directory.py b/synapse/handlers/user_directory.py index 1f692c79a02..6ac1d6eafd1 100644 --- a/synapse/handlers/user_directory.py +++ b/synapse/handlers/user_directory.py @@ -192,7 +192,9 @@ async def process() -> None: self._is_processing = False self._is_processing = True - run_as_background_process("user_directory.notify_new_event", process) + run_as_background_process( + "user_directory.notify_new_event", self.server_name, process + ) async def handle_local_profile_change( self, user_id: str, profile: ProfileInfo @@ -604,7 +606,9 @@ async def process() -> None: self._is_refreshing_remote_profiles = False self._is_refreshing_remote_profiles = True - run_as_background_process("user_directory.refresh_remote_profiles", process) + run_as_background_process( + "user_directory.refresh_remote_profiles", self.server_name, process + ) async def _unsafe_refresh_remote_profiles(self) -> None: limit = MAX_SERVERS_TO_REFRESH_PROFILES_FOR_IN_ONE_GO - len( @@ -686,7 +690,9 @@ async def process() -> None: self._is_refreshing_remote_profiles_for_servers.add(server_name) run_as_background_process( - "user_directory.refresh_remote_profiles_for_remote_server", process + "user_directory.refresh_remote_profiles_for_remote_server", + self.server_name, + process, ) async def _unsafe_refresh_remote_profiles_for_remote_server( diff --git a/synapse/media/media_repository.py b/synapse/media/media_repository.py index 1d05c536af8..afabdaa0e14 100644 --- a/synapse/media/media_repository.py +++ b/synapse/media/media_repository.py @@ -179,12 +179,16 @@ def __init__(self, hs: "HomeServer"): def _start_update_recently_accessed(self) -> Deferred: return run_as_background_process( - "update_recently_accessed_media", self._update_recently_accessed + "update_recently_accessed_media", + self.server_name, + self._update_recently_accessed, ) def _start_apply_media_retention_rules(self) -> Deferred: return run_as_background_process( - "apply_media_retention_rules", self._apply_media_retention_rules + "apply_media_retention_rules", + self.server_name, + self._apply_media_retention_rules, ) async def _update_recently_accessed(self) -> None: diff --git a/synapse/media/url_previewer.py b/synapse/media/url_previewer.py index 0c665e1942b..8c79d5aa7c2 100644 --- a/synapse/media/url_previewer.py +++ b/synapse/media/url_previewer.py @@ -739,7 +739,7 @@ async def _handle_oembed_response( def _start_expire_url_cache_data(self) -> Deferred: return run_as_background_process( - "expire_url_cache_data", self._expire_url_cache_data + "expire_url_cache_data", self.server_name, self._expire_url_cache_data ) async def _expire_url_cache_data(self) -> None: diff --git a/synapse/metrics/background_process_metrics.py b/synapse/metrics/background_process_metrics.py index 5f9883100cc..4b810a4c013 100644 --- a/synapse/metrics/background_process_metrics.py +++ b/synapse/metrics/background_process_metrics.py @@ -358,7 +358,7 @@ def __init__( instance_id = id(self) super().__init__("%s-%s" % (name, instance_id)) self._proc: Optional[_BackgroundProcess] = _BackgroundProcess( - name=name, server_name=server_name, ctx=self + desc=name, server_name=server_name, ctx=self ) def start(self, rusage: "Optional[resource.struct_rusage]") -> None: diff --git a/synapse/metrics/common_usage_metrics.py 
b/synapse/metrics/common_usage_metrics.py index 970367e9e05..ea1ffd171de 100644 --- a/synapse/metrics/common_usage_metrics.py +++ b/synapse/metrics/common_usage_metrics.py @@ -47,6 +47,7 @@ class CommonUsageMetricsManager: """Collects common usage metrics.""" def __init__(self, hs: "HomeServer") -> None: + self.server_name = hs.hostname self._store = hs.get_datastores().main self._clock = hs.get_clock() @@ -62,12 +63,15 @@ async def get_metrics(self) -> CommonUsageMetrics: async def setup(self) -> None: """Keep the gauges for common usage metrics up to date.""" run_as_background_process( - desc="common_usage_metrics_update_gauges", func=self._update_gauges + desc="common_usage_metrics_update_gauges", + server_name=self.server_name, + func=self._update_gauges, ) self._clock.looping_call( run_as_background_process, 5 * 60 * 1000, desc="common_usage_metrics_update_gauges", + server_name=self.server_name, func=self._update_gauges, ) diff --git a/synapse/module_api/__init__.py b/synapse/module_api/__init__.py index b1a2476da84..0929e3aad12 100644 --- a/synapse/module_api/__init__.py +++ b/synapse/module_api/__init__.py @@ -1333,6 +1333,7 @@ def looping_background_call( run_as_background_process, msec, desc, + self.server_name, lambda: maybe_awaitable(f(*args, **kwargs)), ) else: @@ -1390,6 +1391,7 @@ def delayed_background_call( msec * 0.001, run_as_background_process, desc, + self.server_name, lambda: maybe_awaitable(f(*args, **kwargs)), ) diff --git a/synapse/push/emailpusher.py b/synapse/push/emailpusher.py index 0a14c534f71..09ca14584a2 100644 --- a/synapse/push/emailpusher.py +++ b/synapse/push/emailpusher.py @@ -68,6 +68,7 @@ def __init__(self, hs: "HomeServer", pusher_config: PusherConfig, mailer: Mailer super().__init__(hs, pusher_config) self.mailer = mailer + self.server_name = hs.hostname self.store = self.hs.get_datastores().main self.email = pusher_config.pushkey self.timed_call: Optional[IDelayedCall] = None @@ -117,7 +118,7 @@ def _start_processing(self) -> None: if self._is_processing: return - run_as_background_process("emailpush.process", self._process) + run_as_background_process("emailpush.process", self.server_name, self._process) def _pause_processing(self) -> None: """Used by tests to temporarily pause processing of events. diff --git a/synapse/push/httppusher.py b/synapse/push/httppusher.py index 53dfb54104e..5946a6e9724 100644 --- a/synapse/push/httppusher.py +++ b/synapse/push/httppusher.py @@ -182,7 +182,9 @@ def on_new_receipts(self) -> None: # We could check the receipts are actually m.read receipts here, # but currently that's the only type of receipt anyway... 
- run_as_background_process("http_pusher.on_new_receipts", self._update_badge) + run_as_background_process( + "http_pusher.on_new_receipts", self.server_name, self._update_badge + ) async def _update_badge(self) -> None: # XXX as per https://github.com/matrix-org/matrix-doc/issues/2627, this seems @@ -217,7 +219,7 @@ def _start_processing(self) -> None: if self.failing_since and self.timed_call and self.timed_call.active(): return - run_as_background_process("httppush.process", self._process) + run_as_background_process("httppush.process", self.server_name, self._process) async def _process(self) -> None: # we should never get here if we are already processing diff --git a/synapse/push/pusherpool.py b/synapse/push/pusherpool.py index 0a7541b4c70..47685cccf94 100644 --- a/synapse/push/pusherpool.py +++ b/synapse/push/pusherpool.py @@ -65,6 +65,7 @@ class PusherPool: def __init__(self, hs: "HomeServer"): self.hs = hs + self.server_name = hs.hostname self.pusher_factory = PusherFactory(hs) self.store = self.hs.get_datastores().main self.clock = self.hs.get_clock() @@ -99,7 +100,9 @@ def start(self) -> None: if not self._should_start_pushers: logger.info("Not starting pushers because they are disabled in the config") return - run_as_background_process("start_pushers", self._start_pushers) + run_as_background_process( + "start_pushers", self.server_name, self._start_pushers + ) async def add_or_update_pusher( self, diff --git a/synapse/replication/tcp/client.py b/synapse/replication/tcp/client.py index 0bd5478cd35..34d44d002de 100644 --- a/synapse/replication/tcp/client.py +++ b/synapse/replication/tcp/client.py @@ -404,6 +404,7 @@ class FederationSenderHandler: def __init__(self, hs: "HomeServer"): assert hs.should_send_federation() + self.server_name = hs.hostname self.store = hs.get_datastores().main self._is_mine_id = hs.is_mine_id self._hs = hs @@ -494,7 +495,9 @@ async def update_token(self, token: int) -> None: # no need to queue up another task. return - run_as_background_process("_save_and_send_ack", self._save_and_send_ack) + run_as_background_process( + "_save_and_send_ack", self.server_name, self._save_and_send_ack + ) async def _save_and_send_ack(self) -> None: """Save the current federation position in the database and send an ACK diff --git a/synapse/replication/tcp/handler.py b/synapse/replication/tcp/handler.py index b015c401cfc..13caadb2843 100644 --- a/synapse/replication/tcp/handler.py +++ b/synapse/replication/tcp/handler.py @@ -335,7 +335,10 @@ def _add_command_to_stream_queue( # fire off a background process to start processing the queue. 
run_as_background_process( - "process-replication-data", self._unsafe_process_queue, stream_name + "process-replication-data", + self.server_name, + self._unsafe_process_queue, + stream_name, ) async def _unsafe_process_queue(self, stream_name: str) -> None: diff --git a/synapse/replication/tcp/protocol.py b/synapse/replication/tcp/protocol.py index 8fa26d8481c..c726baf142a 100644 --- a/synapse/replication/tcp/protocol.py +++ b/synapse/replication/tcp/protocol.py @@ -291,7 +291,9 @@ def handle_command(self, cmd: Command) -> None: if isawaitable(res): run_as_background_process( - "replication-" + cmd.get_logcontext_id(), lambda: res + "replication-" + cmd.get_logcontext_id(), + self.server_name, + lambda: res, ) handled = True diff --git a/synapse/replication/tcp/redis.py b/synapse/replication/tcp/redis.py index a67f4ce3be1..ffe9e25a0f2 100644 --- a/synapse/replication/tcp/redis.py +++ b/synapse/replication/tcp/redis.py @@ -130,7 +130,9 @@ def __init__(self, *args: Any, **kwargs: Any): def connectionMade(self) -> None: logger.info("Connected to redis") super().connectionMade() - run_as_background_process("subscribe-replication", self._send_subscribe) + run_as_background_process( + "subscribe-replication", self.server_name, self._send_subscribe + ) async def _send_subscribe(self) -> None: # it's important to make sure that we only send the REPLICATE command once we @@ -206,7 +208,7 @@ def handle_command(self, cmd: Command) -> None: if isawaitable(res): run_as_background_process( - "replication-" + cmd.get_logcontext_id(), lambda: res + "replication-" + cmd.get_logcontext_id(), self.server_name, lambda: res ) def connectionLost(self, reason: Failure) -> None: # type: ignore[override] @@ -228,7 +230,11 @@ def send_command(self, cmd: Command) -> None: cmd: The command to send """ run_as_background_process( - "send-cmd", self._async_send_command, cmd, bg_start_span=False + "send-cmd", + self.server_name, + self._async_send_command, + cmd, + bg_start_span=False, ) async def _async_send_command(self, cmd: Command) -> None: diff --git a/synapse/replication/tcp/resource.py b/synapse/replication/tcp/resource.py index d8576d2af36..e65e0fce1c3 100644 --- a/synapse/replication/tcp/resource.py +++ b/synapse/replication/tcp/resource.py @@ -147,7 +147,9 @@ def on_notifier_poke(self) -> None: logger.debug("Notifier poke loop already running") return - run_as_background_process("replication_notifier", self._run_notifier_loop) + run_as_background_process( + "replication_notifier", self.server_name, self._run_notifier_loop + ) async def _run_notifier_loop(self) -> None: self.is_looping = True diff --git a/synapse/rest/client/room.py b/synapse/rest/client/room.py index 4600a877780..9e22f0d0230 100644 --- a/synapse/rest/client/room.py +++ b/synapse/rest/client/room.py @@ -1201,6 +1201,7 @@ class RoomRedactEventRestServlet(TransactionRestServlet): def __init__(self, hs: "HomeServer"): super().__init__(hs) + self.server_name = hs.hostname self.event_creation_handler = hs.get_event_creation_handler() self.auth = hs.get_auth() self._store = hs.get_datastores().main @@ -1285,6 +1286,7 @@ async def _do( if with_relations: run_as_background_process( "redact_related_events", + self.server_name, self._relation_handler.redact_events_related_to, requester=requester, event_id=event_id, diff --git a/synapse/server.py b/synapse/server.py index 52b3b0d5dcd..0ba3142a6e8 100644 --- a/synapse/server.py +++ b/synapse/server.py @@ -422,7 +422,7 @@ def get_datastores(self) -> Databases: @cache_in_self def get_distributor(self) -> 
Distributor: - return Distributor() + return Distributor(server_name=self.hostname) @cache_in_self def get_registration_ratelimiter(self) -> Ratelimiter: diff --git a/synapse/storage/background_updates.py b/synapse/storage/background_updates.py index d170bbddaa8..acc0abee63e 100644 --- a/synapse/storage/background_updates.py +++ b/synapse/storage/background_updates.py @@ -249,6 +249,7 @@ def __init__(self, hs: "HomeServer", database: "DatabasePool"): self._clock = hs.get_clock() self.db_pool = database self.hs = hs + self.server_name = hs.hostname self._database_name = database.name() @@ -395,7 +396,10 @@ def start_doing_background_updates(self) -> None: self._all_done = False sleep = self.sleep_enabled run_as_background_process( - "background_updates", self.run_background_updates, sleep + "background_updates", + self.server_name, + self.run_background_updates, + sleep, ) async def run_background_updates(self, sleep: bool) -> None: diff --git a/synapse/storage/controllers/persist_events.py b/synapse/storage/controllers/persist_events.py index 20e6782670d..cf9507ebd4e 100644 --- a/synapse/storage/controllers/persist_events.py +++ b/synapse/storage/controllers/persist_events.py @@ -193,6 +193,7 @@ class _EventPeristenceQueue(Generic[_PersistResult]): def __init__( self, + server_name: str, per_item_callback: Callable[ [str, _EventPersistQueueTask], Awaitable[_PersistResult], @@ -203,6 +204,7 @@ def __init__( The per_item_callback will be called for each item added via add_to_queue, and its result will be returned via the Deferreds returned from add_to_queue. """ + self.server_name = server_name self._event_persist_queues: Dict[str, Deque[_EventPersistQueueItem]] = {} self._currently_persisting_rooms: Set[str] = set() self._per_item_callback = per_item_callback @@ -307,7 +309,7 @@ async def handle_queue_loop() -> None: self._currently_persisting_rooms.discard(room_id) # set handle_queue_loop off in the background - run_as_background_process("persist_events", handle_queue_loop) + run_as_background_process("persist_events", self.server_name, handle_queue_loop) def _get_drainining_queue( self, room_id: str @@ -350,7 +352,7 @@ def __init__( self._instance_name = hs.get_instance_name() self.is_mine_id = hs.is_mine_id self._event_persist_queue = _EventPeristenceQueue( - self._process_event_persist_queue_task + self.server_name, self._process_event_persist_queue_task ) self._state_resolution_handler = hs.get_state_resolution_handler() self._state_controller = state_controller diff --git a/synapse/storage/database.py b/synapse/storage/database.py index d999a4c2825..0ecfe421528 100644 --- a/synapse/storage/database.py +++ b/synapse/storage/database.py @@ -611,6 +611,7 @@ def __init__( 0.0, run_as_background_process, "upsert_safety_check", + self.server_name, self._check_safe_to_upsert, ) @@ -653,6 +654,7 @@ async def _check_safe_to_upsert(self) -> None: 15.0, run_as_background_process, "upsert_safety_check", + self.server_name, self._check_safe_to_upsert, ) diff --git a/synapse/storage/databases/main/account_data.py b/synapse/storage/databases/main/account_data.py index 715815cc091..49adb3e7caf 100644 --- a/synapse/storage/databases/main/account_data.py +++ b/synapse/storage/databases/main/account_data.py @@ -77,6 +77,7 @@ def __init__( db=database, notifier=hs.get_replication_notifier(), stream_name="account_data", + server_name=self.server_name, instance_name=self._instance_name, tables=[ ("room_account_data", "instance_name", "stream_id"), diff --git a/synapse/storage/databases/main/cache.py 
b/synapse/storage/databases/main/cache.py index 9418fb6dd75..9ebd48d26ae 100644 --- a/synapse/storage/databases/main/cache.py +++ b/synapse/storage/databases/main/cache.py @@ -105,10 +105,11 @@ def __init__( # caches to invalidate. (This reduces the amount of writes to the DB # that happen). self._cache_id_gen = MultiWriterIdGenerator( - db_conn, - database, + db_conn=db_conn, + db=database, notifier=hs.get_replication_notifier(), stream_name="caches", + server_name=self.server_name, instance_name=hs.get_instance_name(), tables=[ ( diff --git a/synapse/storage/databases/main/deviceinbox.py b/synapse/storage/databases/main/deviceinbox.py index a22eab24742..825e3148ee2 100644 --- a/synapse/storage/databases/main/deviceinbox.py +++ b/synapse/storage/databases/main/deviceinbox.py @@ -108,6 +108,7 @@ def __init__( db=database, notifier=hs.get_replication_notifier(), stream_name="to_device", + server_name=self.server_name, instance_name=self._instance_name, tables=[ ("device_inbox", "instance_name", "stream_id"), @@ -153,6 +154,7 @@ def __init__( run_as_background_process, DEVICE_FEDERATION_INBOX_CLEANUP_INTERVAL_MS, "_delete_old_federation_inbox_rows", + self.server_name, self._delete_old_federation_inbox_rows, ) diff --git a/synapse/storage/databases/main/devices.py b/synapse/storage/databases/main/devices.py index 941d278e6c7..5413d53dfb7 100644 --- a/synapse/storage/databases/main/devices.py +++ b/synapse/storage/databases/main/devices.py @@ -101,6 +101,7 @@ def __init__( db=database, notifier=hs.get_replication_notifier(), stream_name="device_lists_stream", + server_name=self.server_name, instance_name=self._instance_name, tables=[ ("device_lists_stream", "instance_name", "stream_id"), diff --git a/synapse/storage/databases/main/end_to_end_keys.py b/synapse/storage/databases/main/end_to_end_keys.py index 0700b0087b0..45697b23b5c 100644 --- a/synapse/storage/databases/main/end_to_end_keys.py +++ b/synapse/storage/databases/main/end_to_end_keys.py @@ -1555,6 +1555,7 @@ def __init__( db=database, notifier=hs.get_replication_notifier(), stream_name="e2e_cross_signing_keys", + server_name=self.server_name, instance_name=self._instance_name, tables=[ ("e2e_cross_signing_keys", "instance_name", "stream_id"), diff --git a/synapse/storage/databases/main/events_worker.py b/synapse/storage/databases/main/events_worker.py index 9cc0493307c..042e111e579 100644 --- a/synapse/storage/databases/main/events_worker.py +++ b/synapse/storage/databases/main/events_worker.py @@ -235,6 +235,7 @@ def __init__( db=database, notifier=hs.get_replication_notifier(), stream_name="events", + server_name=self.server_name, instance_name=hs.get_instance_name(), tables=[ ("events", "instance_name", "stream_ordering"), @@ -249,6 +250,7 @@ def __init__( db=database, notifier=hs.get_replication_notifier(), stream_name="backfill", + server_name=self.server_name, instance_name=hs.get_instance_name(), tables=[ ("events", "instance_name", "stream_ordering"), @@ -332,6 +334,7 @@ def get_chain_id_txn(txn: Cursor) -> int: db=database, notifier=hs.get_replication_notifier(), stream_name="un_partial_stated_event_stream", + server_name=self.server_name, instance_name=hs.get_instance_name(), tables=[("un_partial_stated_event_stream", "instance_name", "stream_id")], sequence_name="un_partial_stated_event_stream_sequence", @@ -1129,7 +1132,9 @@ def _maybe_start_fetch_thread(self) -> None: should_start = False if should_start: - run_as_background_process("fetch_events", self._fetch_thread) + run_as_background_process( + "fetch_events", 
self.server_name, self._fetch_thread + ) async def _fetch_thread(self) -> None: """Services requests for events from `_event_fetch_list`.""" diff --git a/synapse/storage/databases/main/presence.py b/synapse/storage/databases/main/presence.py index 065c8856036..ad414810aef 100644 --- a/synapse/storage/databases/main/presence.py +++ b/synapse/storage/databases/main/presence.py @@ -91,6 +91,7 @@ def __init__( db=database, notifier=hs.get_replication_notifier(), stream_name="presence_stream", + server_name=self.server_name, instance_name=self._instance_name, tables=[("presence_stream", "instance_name", "stream_id")], sequence_name="presence_stream_sequence", diff --git a/synapse/storage/databases/main/push_rule.py b/synapse/storage/databases/main/push_rule.py index 86c87f78bf1..1269d5e0d5c 100644 --- a/synapse/storage/databases/main/push_rule.py +++ b/synapse/storage/databases/main/push_rule.py @@ -146,6 +146,7 @@ def __init__( db=database, notifier=hs.get_replication_notifier(), stream_name="push_rules_stream", + server_name=self.server_name, instance_name=self._instance_name, tables=[ ("push_rules_stream", "instance_name", "stream_id"), diff --git a/synapse/storage/databases/main/pusher.py b/synapse/storage/databases/main/pusher.py index a8a37b6c85e..9a0a12b5c10 100644 --- a/synapse/storage/databases/main/pusher.py +++ b/synapse/storage/databases/main/pusher.py @@ -88,6 +88,7 @@ def __init__( db=database, notifier=hs.get_replication_notifier(), stream_name="pushers", + server_name=self.server_name, instance_name=self._instance_name, tables=[ ("pushers", "instance_name", "id"), diff --git a/synapse/storage/databases/main/receipts.py b/synapse/storage/databases/main/receipts.py index 99643315107..af1bbaae5f8 100644 --- a/synapse/storage/databases/main/receipts.py +++ b/synapse/storage/databases/main/receipts.py @@ -139,6 +139,7 @@ def __init__( db_conn=db_conn, db=database, notifier=hs.get_replication_notifier(), + server_name=self.server_name, stream_name="receipts", instance_name=self._instance_name, tables=[("receipts_linearized", "instance_name", "stream_id")], diff --git a/synapse/storage/databases/main/room.py b/synapse/storage/databases/main/room.py index 58451d3ff19..9b76210defd 100644 --- a/synapse/storage/databases/main/room.py +++ b/synapse/storage/databases/main/room.py @@ -160,6 +160,7 @@ def __init__( db=database, notifier=hs.get_replication_notifier(), stream_name="un_partial_stated_room_stream", + server_name=self.server_name, instance_name=self._instance_name, tables=[("un_partial_stated_room_stream", "instance_name", "stream_id")], sequence_name="un_partial_stated_room_stream_sequence", diff --git a/synapse/storage/util/id_generators.py b/synapse/storage/util/id_generators.py index e8588f33cf5..308a3db35ff 100644 --- a/synapse/storage/util/id_generators.py +++ b/synapse/storage/util/id_generators.py @@ -189,6 +189,8 @@ class MultiWriterIdGenerator(AbstractStreamIdGenerator): db stream_name: A name for the stream, for use in the `stream_positions` table. (Does not need to be the same as the replication stream name) + server_name: The homeserver name of the server (used to label metrics) + (this should be `hs.hostname`). instance_name: The name of this instance. tables: List of tables associated with the stream. 
Tuple of table name, column name that stores the writer's instance name, and @@ -204,10 +206,12 @@ class MultiWriterIdGenerator(AbstractStreamIdGenerator): def __init__( self, + *, db_conn: LoggingDatabaseConnection, db: DatabasePool, notifier: "ReplicationNotifier", stream_name: str, + server_name: str, instance_name: str, tables: List[Tuple[str, str, str]], sequence_name: str, @@ -217,6 +221,7 @@ def __init__( self._db = db self._notifier = notifier self._stream_name = stream_name + self.server_name = server_name self._instance_name = instance_name self._positive = positive self._writers = writers @@ -552,6 +557,7 @@ def get_next_txn(self, txn: LoggingTransaction) -> int: txn.call_after( run_as_background_process, "MultiWriterIdGenerator._update_table", + self.server_name, self._db.runInteraction, "MultiWriterIdGenerator._update_table", self._update_stream_positions_table_txn, @@ -588,6 +594,7 @@ def get_next_mult_txn(self, txn: LoggingTransaction, n: int) -> List[int]: txn.call_after( run_as_background_process, "MultiWriterIdGenerator._update_table", + self.server_name, self._db.runInteraction, "MultiWriterIdGenerator._update_table", self._update_stream_positions_table_txn, diff --git a/synapse/util/batching_queue.py b/synapse/util/batching_queue.py index 3fb697751fc..3a2d664d28d 100644 --- a/synapse/util/batching_queue.py +++ b/synapse/util/batching_queue.py @@ -85,6 +85,8 @@ class BatchingQueue(Generic[V, R]): Args: name: A name for the queue, used for logging contexts and metrics. This must be unique, otherwise the metrics will be wrong. + server_name: The homeserver name of the server (used to label metrics) + (this should be `hs.hostname`). clock: The clock to use to schedule work. process_batch_callback: The callback to to be run to process a batch of work. @@ -92,11 +94,14 @@ class BatchingQueue(Generic[V, R]): def __init__( self, + *, name: str, + server_name: str, clock: Clock, process_batch_callback: Callable[[List[V]], Awaitable[R]], ): self._name = name + self.server_name = server_name self._clock = clock # The set of keys currently being processed. @@ -135,7 +140,9 @@ async def add_to_queue(self, value: V, key: Hashable = ()) -> R: # If we're not currently processing the key fire off a background # process to start processing. if key not in self._processing_keys: - run_as_background_process(self._name, self._process_queue, key) + run_as_background_process( + self._name, self.server_name, self._process_queue, key + ) with self._number_in_flight_metric.track_inprogress(): return await make_deferred_yieldable(d) diff --git a/synapse/util/caches/expiringcache.py b/synapse/util/caches/expiringcache.py index 3198fdd2eda..6ef9f6a330c 100644 --- a/synapse/util/caches/expiringcache.py +++ b/synapse/util/caches/expiringcache.py @@ -90,7 +90,9 @@ def __init__( return def f() -> "defer.Deferred[None]": - return run_as_background_process("prune_cache", self._prune_cache) + return run_as_background_process( + "prune_cache", server_name, self._prune_cache + ) self._clock.looping_call(f, self._expiry_ms / 2) diff --git a/synapse/util/distributor.py b/synapse/util/distributor.py index 95786bd3dde..f48ae3373ce 100644 --- a/synapse/util/distributor.py +++ b/synapse/util/distributor.py @@ -58,7 +58,13 @@ class Distributor: model will do for today. """ - def __init__(self) -> None: + def __init__(self, server_name: str) -> None: + """ + Args: + server_name: The homeserver name of the server (used to label metrics) + (this should be `hs.hostname`). 
+ """ + self.server_name = server_name self.signals: Dict[str, Signal] = {} self.pre_registration: Dict[str, List[Callable]] = {} @@ -91,7 +97,9 @@ def fire(self, name: str, *args: Any, **kwargs: Any) -> None: if name not in self.signals: raise KeyError("%r does not have a signal named %s" % (self, name)) - run_as_background_process(name, self.signals[name].fire, *args, **kwargs) + run_as_background_process( + name, self.server_name, self.signals[name].fire, *args, **kwargs + ) P = ParamSpec("P") diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py index 42be1c8d28a..d414d2f01f2 100644 --- a/synapse/util/retryutils.py +++ b/synapse/util/retryutils.py @@ -60,6 +60,7 @@ def __init__(self, retry_last_ts: int, retry_interval: int, destination: str): async def get_retry_limiter( destination: str, + our_server_name: str, clock: Clock, store: DataStore, ignore_backoff: bool = False, @@ -74,6 +75,7 @@ async def get_retry_limiter( Args: destination: name of homeserver + our_server_name: Our homeserver name (used to label metrics) (`hs.hostname`) clock: timing source store: datastore ignore_backoff: true to ignore the historical backoff data and @@ -114,11 +116,12 @@ async def get_retry_limiter( backoff_on_failure = not ignore_backoff return RetryDestinationLimiter( - destination, - clock, - store, - failure_ts, - retry_interval, + destination=destination, + our_server_name=our_server_name, + clock=clock, + store=store, + failure_ts=failure_ts, + retry_interval=retry_interval, backoff_on_failure=backoff_on_failure, **kwargs, ) @@ -151,7 +154,9 @@ async def filter_destinations_by_retry_limiter( class RetryDestinationLimiter: def __init__( self, + *, destination: str, + our_server_name: str, clock: Clock, store: DataStore, failure_ts: Optional[int], @@ -169,6 +174,7 @@ def __init__( Args: destination + our_server_name: Our homeserver name (used to label metrics) (`hs.hostname`) clock store failure_ts: when this destination started failing (in ms since @@ -184,6 +190,7 @@ def __init__( backoff_on_all_error_codes: Whether we should back off on any error code. """ + self.our_server_name = our_server_name self.clock = clock self.store = store self.destination = destination @@ -318,4 +325,6 @@ async def store_retry_timings() -> None: logger.exception("Failed to store destination_retry_timings") # we deliberately do this in the background. 
- run_as_background_process("store_retry_timings", store_retry_timings) + run_as_background_process( + "store_retry_timings", self.our_server_name, store_retry_timings + ) diff --git a/synapse/util/task_scheduler.py b/synapse/util/task_scheduler.py index 5169656c73c..6789189a00e 100644 --- a/synapse/util/task_scheduler.py +++ b/synapse/util/task_scheduler.py @@ -101,6 +101,7 @@ class TaskScheduler: def __init__(self, hs: "HomeServer"): self._hs = hs + self.server_name = hs.hostname self._store = hs.get_datastores().main self._clock = hs.get_clock() self._running_tasks: Set[str] = set() @@ -354,7 +355,7 @@ async def inner() -> None: finally: self._launching_new_tasks = False - run_as_background_process("launch_scheduled_tasks", inner) + run_as_background_process("launch_scheduled_tasks", self.server_name, inner) @wrap_as_background_process("clean_scheduled_tasks") async def _clean_scheduled_tasks(self) -> None: @@ -485,4 +486,4 @@ async def wrapper() -> None: self._running_tasks.add(task.id) await self.update_task(task.id, status=TaskStatus.ACTIVE) - run_as_background_process(f"task-{task.action}", wrapper) + run_as_background_process(f"task-{task.action}", self.server_name, wrapper) diff --git a/tests/appservice/test_scheduler.py b/tests/appservice/test_scheduler.py index a5bf7e06357..10edd575bc1 100644 --- a/tests/appservice/test_scheduler.py +++ b/tests/appservice/test_scheduler.py @@ -53,11 +53,24 @@ def setUp(self) -> None: self.clock = MockClock() self.store = Mock() self.as_api = Mock() + + self.hs = Mock( + spec_set=[ + "get_datastores", + "get_clock", + "get_application_service_api", + "hostname", + ] + ) + self.hs.get_clock.return_value = self.clock + self.hs.get_datastores.return_value = Mock( + main=self.store, + ) + self.hs.get_application_service_api.return_value = self.as_api + self.recoverer = Mock() self.recoverer_fn = Mock(return_value=self.recoverer) - self.txnctrl = _TransactionController( - clock=cast(Clock, self.clock), store=self.store, as_api=self.as_api - ) + self.txnctrl = _TransactionController(self.hs) self.txnctrl.RECOVERER_CLASS = self.recoverer_fn def test_single_service_up_txn_sent(self) -> None: @@ -163,6 +176,7 @@ def setUp(self) -> None: self.service = Mock() self.callback = AsyncMock() self.recoverer = _Recoverer( + server_name="test_server", clock=cast(Clock, self.clock), as_api=self.as_api, store=self.store, diff --git a/tests/metrics/test_background_process_metrics.py b/tests/metrics/test_background_process_metrics.py index 7253e51c38a..1f47601b95b 100644 --- a/tests/metrics/test_background_process_metrics.py +++ b/tests/metrics/test_background_process_metrics.py @@ -15,7 +15,7 @@ def test_update_metrics_with_negative_time_diff(self) -> None: mock_logging_context.get_resource_usage.return_value = usage process = _BackgroundProcess( - name="test process", server_name="test_server", ctx=mock_logging_context + desc="test process", server_name="test_server", ctx=mock_logging_context ) # Should not raise process.update_metrics() diff --git a/tests/test_distributor.py b/tests/test_distributor.py index 18792fdee3f..19dafe64ed4 100644 --- a/tests/test_distributor.py +++ b/tests/test_distributor.py @@ -28,7 +28,7 @@ class DistributorTestCase(unittest.TestCase): def setUp(self) -> None: - self.dist = Distributor() + self.dist = Distributor(server_name="test_server") def test_signal_dispatch(self) -> None: self.dist.declare("alert") diff --git a/tests/util/test_batching_queue.py b/tests/util/test_batching_queue.py index 2dcf3a3412f..49ddc117255 100644 --- 
a/tests/util/test_batching_queue.py +++ b/tests/util/test_batching_queue.py @@ -50,7 +50,10 @@ def setUp(self) -> None: self._pending_calls: List[Tuple[List[str], defer.Deferred]] = [] self.queue: BatchingQueue[str, str] = BatchingQueue( - "test_queue", hs_clock, self._process_queue + name="test_queue", + server_name="test_server", + clock=hs_clock, + process_batch_callback=self._process_queue, ) async def _process_queue(self, values: List[str]) -> str: From ecb479e1c6a944c6902171d9749a6a94d57528e8 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 23 Jul 2025 14:15:08 -0500 Subject: [PATCH 46/48] Add changelog --- changelog.d/18656.misc | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/18656.misc diff --git a/changelog.d/18656.misc b/changelog.d/18656.misc new file mode 100644 index 00000000000..44323a3a81a --- /dev/null +++ b/changelog.d/18656.misc @@ -0,0 +1 @@ +Refactor `Counter` metrics to be homeserver-scoped. From 1b03330790d55521e4d742a787cc4d32adde1e2a Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 23 Jul 2025 14:27:02 -0500 Subject: [PATCH 47/48] Fix `make_request(...)` in tests --- tests/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/server.py b/tests/server.py index 0c519bc4c98..0ace329619f 100644 --- a/tests/server.py +++ b/tests/server.py @@ -432,7 +432,7 @@ def make_request( channel = FakeChannel(site, reactor, ip=client_ip) - req = request(channel, site) + req = request(channel, site, our_server_name="test_server") channel.request = req req.content = BytesIO(content) From 822935c17cebf40362c6e0709e7b1a633fb2c1ef Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Thu, 24 Jul 2025 15:36:21 -0500 Subject: [PATCH 48/48] Remove debug log --- scripts-dev/mypy_synapse_plugin.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py index 224b4562c44..2ccac97103b 100644 --- a/scripts-dev/mypy_synapse_plugin.py +++ b/scripts-dev/mypy_synapse_plugin.py @@ -71,7 +71,6 @@ def get_function_signature_hook( def get_method_signature_hook( self, fullname: str ) -> Optional[Callable[[MethodSigContext], CallableType]]: - # print(f"m fullname={fullname}") if fullname.startswith( ( "synapse.util.caches.descriptors.CachedFunction.__call__",
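Note on the convention the series converges on: the homeserver name (`hs.hostname`) is threaded as a `server_name` argument through `run_as_background_process`, `MultiWriterIdGenerator`, `BatchingQueue`, `Distributor`, and the retry limiter, so that metrics can be labelled per homeserver (changelog: "Refactor `Counter` metrics to be homeserver-scoped"), with the mypy `get_function_signature_hook` checking `prometheus_client` `Counter` declarations. A minimal sketch of the resulting pattern follows; the metric name and helper function are hypothetical illustrations, not code from the patches.

from prometheus_client import Counter

# Hypothetical metric: the convention is that every metric lists
# "server_name" in its labelnames, which is what the mypy plugin
# checks for at the declaration site.
example_requests = Counter(
    "synapse_example_requests_total",
    "An example homeserver-scoped counter (hypothetical).",
    ["server_name"],
)

def record_request(server_name: str) -> None:
    # `server_name` should be the homeserver name, i.e. `hs.hostname`,
    # mirroring the `server_name` argument threaded through
    # `run_as_background_process(desc, server_name, func, ...)` in the
    # call sites changed above.
    example_requests.labels(server_name=server_name).inc()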