From 68962210469f4d77b519aabe9eca322ebe845702 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Thu, 3 Jul 2025 19:46:32 -0500 Subject: [PATCH 01/48] Start of lint for checking whether Prometheus metrics have the `server_name` label --- scripts-dev/mypy_synapse_plugin.py | 83 +++- synapse/util/ratelimitutils.py | 668 ++++++++++++++--------------- 2 files changed, 416 insertions(+), 335 deletions(-) diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py index a15c3c005cf..817272900ae 100644 --- a/scripts-dev/mypy_synapse_plugin.py +++ b/scripts-dev/mypy_synapse_plugin.py @@ -29,7 +29,14 @@ from mypy.erasetype import remove_instance_last_known_values from mypy.errorcodes import ErrorCode from mypy.nodes import ARG_NAMED_OPT, TempNode, Var -from mypy.plugin import FunctionSigContext, MethodSigContext, Plugin +from mypy.plugin import ( + FunctionSigContext, + MethodSigContext, + Plugin, + FunctionContext, + ClassDefContext, + FunctionLike, +) from mypy.typeops import bind_self from mypy.types import ( AnyType, @@ -42,12 +49,22 @@ UninhabitedType, UnionType, ) +from mypy.nodes import StrExpr, TupleExpr, ListExpr class SynapsePlugin(Plugin): + def get_function_signature_hook( + self, fullname: str + ) -> Optional[Callable[[FunctionSigContext], FunctionLike]]: + if fullname in ("prometheus_client.metrics.Counter",): + return check_asdf + + return None + def get_method_signature_hook( self, fullname: str ) -> Optional[Callable[[MethodSigContext], CallableType]]: + # print(f"m fullname={fullname}") if fullname.startswith( ( "synapse.util.caches.descriptors.CachedFunction.__call__", @@ -65,6 +82,70 @@ def get_method_signature_hook( return None +def check_asdf(ctx: FunctionSigContext) -> CallableType: + """ + TODO + """ + # The true signature, this isn't being modified so this is what will be returned. + signature: CallableType = ctx.default_signature + + # Sanity check the arguments are still as expected in this version of + # `prometheus_client`. ex. `Counter(name, documentation, labelnames, ...)` + # + # signature.arg_names should be: ["name", "documentation", "labelnames", ...] + if len(signature.arg_names) < 3 or signature.arg_names[2] != "labelnames": + ctx.api.fail( + f"Expected the 3rd argument of {signature.name} to be 'labelnames', but got " + f"{signature.arg_names[2]}", + ctx.context, + ) + return signature + + # Ensure mypy is passing the correct number of arguments because we are doing some + # dirty indexing into `ctx.args` later on. + assert len(ctx.args) == len(signature.arg_names), ( + f"Expected {len(signature.arg_names)} arguments for {signature.name}, " + f"but got {len(ctx.args)}" + ) + + # `ctx.args` should look like this: + # ``` + # [ + # [StrExpr("name")], + # [StrExpr("documentation")], + # [ListExpr([StrExpr("label1"), StrExpr("label2")])] + # ... 
+ # ] + # ``` + labelnames_arg_expression = ctx.args[2][0] if len(ctx.args[2]) > 0 else None + if isinstance(labelnames_arg_expression, ListExpr): + labels = [] + for labelname_expression in labelnames_arg_expression.items: + if isinstance(labelname_expression, StrExpr): + labels.append(labelname_expression.value) + else: + ctx.api.fail( + f"Expected all items in the 3rd argument of {signature.name} to be strings, but got " + f"{labelname_expression.__class__.__name__}", + ctx.context, + ) + + if "server_name" not in labels: + ctx.api.fail( + f"Expected {signature.name} to include 'server_name' in the list of labels", + ctx.context, + ) + else: + ctx.api.fail( + f"Expected the 3rd argument of {signature.name} to be a list of label names, but got " + f"{labelnames_arg_expression}", + ctx.context, + ) + return signature + + return signature + + def _get_true_return_type(signature: CallableType) -> mypy.types.Type: """ Get the "final" return type of a callable which might return an Awaitable/Deferred. diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py index 3f067b792c4..d8c3504754a 100644 --- a/synapse/util/ratelimitutils.py +++ b/synapse/util/ratelimitutils.py @@ -42,23 +42,23 @@ from prometheus_client.core import Counter -from twisted.internet import defer - -from synapse.api.errors import LimitExceededError -from synapse.config.ratelimiting import FederationRatelimitSettings -from synapse.logging.context import ( - PreserveLoggingContext, - make_deferred_yieldable, - run_in_background, -) -from synapse.logging.opentracing import start_active_span -from synapse.metrics import Histogram, LaterGauge -from synapse.util import Clock +# from twisted.internet import defer + +# from synapse.api.errors import LimitExceededError +# from synapse.config.ratelimiting import FederationRatelimitSettings +# from synapse.logging.context import ( +# PreserveLoggingContext, +# make_deferred_yieldable, +# run_in_background, +# ) +# from synapse.logging.opentracing import start_active_span +# from synapse.metrics import Histogram, LaterGauge +# from synapse.util import Clock -if typing.TYPE_CHECKING: - from contextlib import _GeneratorContextManager +# if typing.TYPE_CHECKING: +# from contextlib import _GeneratorContextManager -logger = logging.getLogger(__name__) +# logger = logging.getLogger(__name__) # Track how much the ratelimiter is affecting requests @@ -72,326 +72,326 @@ "Number of requests rejected by the rate limiter", ["rate_limiter_name"], ) -queue_wait_timer = Histogram( - "synapse_rate_limit_queue_wait_time_seconds", - "Amount of time spent waiting for the rate limiter to let our request through.", - ["rate_limiter_name"], - buckets=( - 0.005, - 0.01, - 0.025, - 0.05, - 0.1, - 0.25, - 0.5, - 0.75, - 1.0, - 2.5, - 5.0, - 10.0, - 20.0, - "+Inf", - ), -) +# queue_wait_timer = Histogram( +# "synapse_rate_limit_queue_wait_time_seconds", +# "Amount of time spent waiting for the rate limiter to let our request through.", +# ["rate_limiter_name"], +# buckets=( +# 0.005, +# 0.01, +# 0.025, +# 0.05, +# 0.1, +# 0.25, +# 0.5, +# 0.75, +# 1.0, +# 2.5, +# 5.0, +# 10.0, +# 20.0, +# "+Inf", +# ), +# ) # This must be a `WeakSet`, otherwise we indirectly hold on to entire `HomeServer`s # during trial test runs and leak a lot of memory. 
-_rate_limiter_instances: MutableSet["FederationRateLimiter"] = WeakSet() -# Protects the _rate_limiter_instances set from concurrent access -_rate_limiter_instances_lock = threading.Lock() - - -def _get_counts_from_rate_limiter_instance( - count_func: Callable[["FederationRateLimiter"], int], -) -> Mapping[Tuple[str, ...], int]: - """Returns a count of something (slept/rejected hosts) by (metrics_name)""" - # Cast to a list to prevent it changing while the Prometheus - # thread is collecting metrics - with _rate_limiter_instances_lock: - rate_limiter_instances = list(_rate_limiter_instances) - - # Map from (metrics_name,) -> int, the number of something like slept hosts - # or rejected hosts. The key type is Tuple[str], but we leave the length - # unspecified for compatability with LaterGauge's annotations. - counts: Dict[Tuple[str, ...], int] = {} - for rate_limiter_instance in rate_limiter_instances: - # Only track metrics if they provided a `metrics_name` to - # differentiate this instance of the rate limiter. - if rate_limiter_instance.metrics_name: - key = (rate_limiter_instance.metrics_name,) - counts[key] = count_func(rate_limiter_instance) - - return counts - - -# We track the number of affected hosts per time-period so we can -# differentiate one really noisy homeserver from a general -# ratelimit tuning problem across the federation. -LaterGauge( - "synapse_rate_limit_sleep_affected_hosts", - "Number of hosts that had requests put to sleep", - ["rate_limiter_name"], - lambda: _get_counts_from_rate_limiter_instance( - lambda rate_limiter_instance: sum( - ratelimiter.should_sleep() - for ratelimiter in rate_limiter_instance.ratelimiters.values() - ) - ), -) -LaterGauge( - "synapse_rate_limit_reject_affected_hosts", - "Number of hosts that had requests rejected", - ["rate_limiter_name"], - lambda: _get_counts_from_rate_limiter_instance( - lambda rate_limiter_instance: sum( - ratelimiter.should_reject() - for ratelimiter in rate_limiter_instance.ratelimiters.values() - ) - ), -) - - -class FederationRateLimiter: - """Used to rate limit request per-host.""" - - def __init__( - self, - clock: Clock, - config: FederationRatelimitSettings, - metrics_name: Optional[str] = None, - ): - """ - Args: - clock - config - metrics_name: The name of the rate limiter so we can differentiate it - from the rest in the metrics. If `None`, we don't track metrics - for this rate limiter. - - """ - self.metrics_name = metrics_name - - def new_limiter() -> "_PerHostRatelimiter": - return _PerHostRatelimiter( - clock=clock, config=config, metrics_name=metrics_name - ) - - self.ratelimiters: DefaultDict[str, "_PerHostRatelimiter"] = ( - collections.defaultdict(new_limiter) - ) - - with _rate_limiter_instances_lock: - _rate_limiter_instances.add(self) - - def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None]]": - """Used to ratelimit an incoming request from a given host - - Example usage: - - with rate_limiter.ratelimit(origin) as wait_deferred: - yield wait_deferred - # Handle request ... - - Args: - host: Origin of incoming request. - - Returns: - context manager which returns a deferred. - """ - return self.ratelimiters[host].ratelimit(host) - - -class _PerHostRatelimiter: - def __init__( - self, - clock: Clock, - config: FederationRatelimitSettings, - metrics_name: Optional[str] = None, - ): - """ - Args: - clock - config - metrics_name: The name of the rate limiter so we can differentiate it - from the rest in the metrics. 
If `None`, we don't track metrics - for this rate limiter. - from the rest in the metrics - """ - self.clock = clock - self.metrics_name = metrics_name - - self.window_size = config.window_size - self.sleep_limit = config.sleep_limit - self.sleep_sec = config.sleep_delay / 1000.0 - self.reject_limit = config.reject_limit - self.concurrent_requests = config.concurrent - - # request_id objects for requests which have been slept - self.sleeping_requests: Set[object] = set() - - # map from request_id object to Deferred for requests which are ready - # for processing but have been queued - self.ready_request_queue: collections.OrderedDict[ - object, defer.Deferred[None] - ] = collections.OrderedDict() - - # request id objects for requests which are in progress - self.current_processing: Set[object] = set() - - # times at which we have recently (within the last window_size ms) - # received requests. - self.request_times: List[int] = [] - - @contextlib.contextmanager - def ratelimit(self, host: str) -> "Iterator[defer.Deferred[None]]": - # `contextlib.contextmanager` takes a generator and turns it into a - # context manager. The generator should only yield once with a value - # to be returned by manager. - # Exceptions will be reraised at the yield. - - self.host = host - - request_id = object() - # Ideally we'd use `Deferred.fromCoroutine()` here, to save on redundant - # type-checking, but we'd need Twisted >= 21.2. - ret = defer.ensureDeferred(self._on_enter_with_tracing(request_id)) - try: - yield ret - finally: - self._on_exit(request_id) - - def should_reject(self) -> bool: - """ - Whether to reject the request if we already have too many queued up - (either sleeping or in the ready queue). - """ - queue_size = len(self.ready_request_queue) + len(self.sleeping_requests) - return queue_size > self.reject_limit - - def should_sleep(self) -> bool: - """ - Whether to sleep the request if we already have too many requests coming - through within the window. - """ - return len(self.request_times) > self.sleep_limit - - async def _on_enter_with_tracing(self, request_id: object) -> None: - maybe_metrics_cm: ContextManager = contextlib.nullcontext() - if self.metrics_name: - maybe_metrics_cm = queue_wait_timer.labels(self.metrics_name).time() - with start_active_span("ratelimit wait"), maybe_metrics_cm: - await self._on_enter(request_id) - - def _on_enter(self, request_id: object) -> "defer.Deferred[None]": - time_now = self.clock.time_msec() - - # remove any entries from request_times which aren't within the window - self.request_times[:] = [ - r for r in self.request_times if time_now - r < self.window_size - ] - - # reject the request if we already have too many queued up (either - # sleeping or in the ready queue). 
- if self.should_reject(): - logger.debug("Ratelimiter(%s): rejecting request", self.host) - if self.metrics_name: - rate_limit_reject_counter.labels(self.metrics_name).inc() - raise LimitExceededError( - limiter_name="rc_federation", - retry_after_ms=int(self.window_size / self.sleep_limit), - ) - - self.request_times.append(time_now) - - def queue_request() -> "defer.Deferred[None]": - if len(self.current_processing) >= self.concurrent_requests: - queue_defer: defer.Deferred[None] = defer.Deferred() - self.ready_request_queue[request_id] = queue_defer - logger.info( - "Ratelimiter(%s): queueing request (queue now %i items)", - self.host, - len(self.ready_request_queue), - ) - - return queue_defer - else: - return defer.succeed(None) - - logger.debug( - "Ratelimit(%s) [%s]: len(self.request_times)=%d", - self.host, - id(request_id), - len(self.request_times), - ) - - if self.should_sleep(): - logger.debug( - "Ratelimiter(%s) [%s]: sleeping request for %f sec", - self.host, - id(request_id), - self.sleep_sec, - ) - if self.metrics_name: - rate_limit_sleep_counter.labels(self.metrics_name).inc() - ret_defer = run_in_background(self.clock.sleep, self.sleep_sec) - - self.sleeping_requests.add(request_id) - - def on_wait_finished(_: Any) -> "defer.Deferred[None]": - logger.debug( - "Ratelimit(%s) [%s]: Finished sleeping", self.host, id(request_id) - ) - self.sleeping_requests.discard(request_id) - queue_defer = queue_request() - return queue_defer - - ret_defer.addBoth(on_wait_finished) - else: - ret_defer = queue_request() - - def on_start(r: object) -> object: - logger.debug( - "Ratelimit(%s) [%s]: Processing req", self.host, id(request_id) - ) - self.current_processing.add(request_id) - return r - - def on_err(r: object) -> object: - # XXX: why is this necessary? this is called before we start - # processing the request so why would the request be in - # current_processing? - self.current_processing.discard(request_id) - return r - - def on_both(r: object) -> object: - # Ensure that we've properly cleaned up. - self.sleeping_requests.discard(request_id) - self.ready_request_queue.pop(request_id, None) - return r - - ret_defer.addCallbacks(on_start, on_err) - ret_defer.addBoth(on_both) - return make_deferred_yieldable(ret_defer) - - def _on_exit(self, request_id: object) -> None: - logger.debug("Ratelimit(%s) [%s]: Processed req", self.host, id(request_id)) - - # When requests complete synchronously, we will recursively start the next - # request in the queue. To avoid stack exhaustion, we defer starting the next - # request until the next reactor tick. - - def start_next_request() -> None: - # We only remove the completed request from the list when we're about to - # start the next one, otherwise we can allow extra requests through. - self.current_processing.discard(request_id) - try: - # start processing the next item on the queue. 
- _, deferred = self.ready_request_queue.popitem(last=False) - - with PreserveLoggingContext(): - deferred.callback(None) - except KeyError: - pass - - self.clock.call_later(0.0, start_next_request) +# _rate_limiter_instances: MutableSet["FederationRateLimiter"] = WeakSet() +# # Protects the _rate_limiter_instances set from concurrent access +# _rate_limiter_instances_lock = threading.Lock() + + +# def _get_counts_from_rate_limiter_instance( +# count_func: Callable[["FederationRateLimiter"], int], +# ) -> Mapping[Tuple[str, ...], int]: +# """Returns a count of something (slept/rejected hosts) by (metrics_name)""" +# # Cast to a list to prevent it changing while the Prometheus +# # thread is collecting metrics +# with _rate_limiter_instances_lock: +# rate_limiter_instances = list(_rate_limiter_instances) + +# # Map from (metrics_name,) -> int, the number of something like slept hosts +# # or rejected hosts. The key type is Tuple[str], but we leave the length +# # unspecified for compatability with LaterGauge's annotations. +# counts: Dict[Tuple[str, ...], int] = {} +# for rate_limiter_instance in rate_limiter_instances: +# # Only track metrics if they provided a `metrics_name` to +# # differentiate this instance of the rate limiter. +# if rate_limiter_instance.metrics_name: +# key = (rate_limiter_instance.metrics_name,) +# counts[key] = count_func(rate_limiter_instance) + +# return counts + + +# # We track the number of affected hosts per time-period so we can +# # differentiate one really noisy homeserver from a general +# # ratelimit tuning problem across the federation. +# LaterGauge( +# "synapse_rate_limit_sleep_affected_hosts", +# "Number of hosts that had requests put to sleep", +# ["rate_limiter_name"], +# lambda: _get_counts_from_rate_limiter_instance( +# lambda rate_limiter_instance: sum( +# ratelimiter.should_sleep() +# for ratelimiter in rate_limiter_instance.ratelimiters.values() +# ) +# ), +# ) +# LaterGauge( +# "synapse_rate_limit_reject_affected_hosts", +# "Number of hosts that had requests rejected", +# ["rate_limiter_name"], +# lambda: _get_counts_from_rate_limiter_instance( +# lambda rate_limiter_instance: sum( +# ratelimiter.should_reject() +# for ratelimiter in rate_limiter_instance.ratelimiters.values() +# ) +# ), +# ) + + +# class FederationRateLimiter: +# """Used to rate limit request per-host.""" + +# def __init__( +# self, +# clock: Clock, +# config: FederationRatelimitSettings, +# metrics_name: Optional[str] = None, +# ): +# """ +# Args: +# clock +# config +# metrics_name: The name of the rate limiter so we can differentiate it +# from the rest in the metrics. If `None`, we don't track metrics +# for this rate limiter. + +# """ +# self.metrics_name = metrics_name + +# def new_limiter() -> "_PerHostRatelimiter": +# return _PerHostRatelimiter( +# clock=clock, config=config, metrics_name=metrics_name +# ) + +# self.ratelimiters: DefaultDict[str, "_PerHostRatelimiter"] = ( +# collections.defaultdict(new_limiter) +# ) + +# with _rate_limiter_instances_lock: +# _rate_limiter_instances.add(self) + +# def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None]]": +# """Used to ratelimit an incoming request from a given host + +# Example usage: + +# with rate_limiter.ratelimit(origin) as wait_deferred: +# yield wait_deferred +# # Handle request ... + +# Args: +# host: Origin of incoming request. + +# Returns: +# context manager which returns a deferred. 
+# """ +# return self.ratelimiters[host].ratelimit(host) + + +# class _PerHostRatelimiter: +# def __init__( +# self, +# clock: Clock, +# config: FederationRatelimitSettings, +# metrics_name: Optional[str] = None, +# ): +# """ +# Args: +# clock +# config +# metrics_name: The name of the rate limiter so we can differentiate it +# from the rest in the metrics. If `None`, we don't track metrics +# for this rate limiter. +# from the rest in the metrics +# """ +# self.clock = clock +# self.metrics_name = metrics_name + +# self.window_size = config.window_size +# self.sleep_limit = config.sleep_limit +# self.sleep_sec = config.sleep_delay / 1000.0 +# self.reject_limit = config.reject_limit +# self.concurrent_requests = config.concurrent + +# # request_id objects for requests which have been slept +# self.sleeping_requests: Set[object] = set() + +# # map from request_id object to Deferred for requests which are ready +# # for processing but have been queued +# self.ready_request_queue: collections.OrderedDict[ +# object, defer.Deferred[None] +# ] = collections.OrderedDict() + +# # request id objects for requests which are in progress +# self.current_processing: Set[object] = set() + +# # times at which we have recently (within the last window_size ms) +# # received requests. +# self.request_times: List[int] = [] + +# @contextlib.contextmanager +# def ratelimit(self, host: str) -> "Iterator[defer.Deferred[None]]": +# # `contextlib.contextmanager` takes a generator and turns it into a +# # context manager. The generator should only yield once with a value +# # to be returned by manager. +# # Exceptions will be reraised at the yield. + +# self.host = host + +# request_id = object() +# # Ideally we'd use `Deferred.fromCoroutine()` here, to save on redundant +# # type-checking, but we'd need Twisted >= 21.2. +# ret = defer.ensureDeferred(self._on_enter_with_tracing(request_id)) +# try: +# yield ret +# finally: +# self._on_exit(request_id) + +# def should_reject(self) -> bool: +# """ +# Whether to reject the request if we already have too many queued up +# (either sleeping or in the ready queue). +# """ +# queue_size = len(self.ready_request_queue) + len(self.sleeping_requests) +# return queue_size > self.reject_limit + +# def should_sleep(self) -> bool: +# """ +# Whether to sleep the request if we already have too many requests coming +# through within the window. +# """ +# return len(self.request_times) > self.sleep_limit + +# async def _on_enter_with_tracing(self, request_id: object) -> None: +# maybe_metrics_cm: ContextManager = contextlib.nullcontext() +# if self.metrics_name: +# maybe_metrics_cm = queue_wait_timer.labels(self.metrics_name).time() +# with start_active_span("ratelimit wait"), maybe_metrics_cm: +# await self._on_enter(request_id) + +# def _on_enter(self, request_id: object) -> "defer.Deferred[None]": +# time_now = self.clock.time_msec() + +# # remove any entries from request_times which aren't within the window +# self.request_times[:] = [ +# r for r in self.request_times if time_now - r < self.window_size +# ] + +# # reject the request if we already have too many queued up (either +# # sleeping or in the ready queue). 
+# if self.should_reject(): +# logger.debug("Ratelimiter(%s): rejecting request", self.host) +# if self.metrics_name: +# rate_limit_reject_counter.labels(self.metrics_name).inc() +# raise LimitExceededError( +# limiter_name="rc_federation", +# retry_after_ms=int(self.window_size / self.sleep_limit), +# ) + +# self.request_times.append(time_now) + +# def queue_request() -> "defer.Deferred[None]": +# if len(self.current_processing) >= self.concurrent_requests: +# queue_defer: defer.Deferred[None] = defer.Deferred() +# self.ready_request_queue[request_id] = queue_defer +# logger.info( +# "Ratelimiter(%s): queueing request (queue now %i items)", +# self.host, +# len(self.ready_request_queue), +# ) + +# return queue_defer +# else: +# return defer.succeed(None) + +# logger.debug( +# "Ratelimit(%s) [%s]: len(self.request_times)=%d", +# self.host, +# id(request_id), +# len(self.request_times), +# ) + +# if self.should_sleep(): +# logger.debug( +# "Ratelimiter(%s) [%s]: sleeping request for %f sec", +# self.host, +# id(request_id), +# self.sleep_sec, +# ) +# if self.metrics_name: +# rate_limit_sleep_counter.labels(self.metrics_name).inc() +# ret_defer = run_in_background(self.clock.sleep, self.sleep_sec) + +# self.sleeping_requests.add(request_id) + +# def on_wait_finished(_: Any) -> "defer.Deferred[None]": +# logger.debug( +# "Ratelimit(%s) [%s]: Finished sleeping", self.host, id(request_id) +# ) +# self.sleeping_requests.discard(request_id) +# queue_defer = queue_request() +# return queue_defer + +# ret_defer.addBoth(on_wait_finished) +# else: +# ret_defer = queue_request() + +# def on_start(r: object) -> object: +# logger.debug( +# "Ratelimit(%s) [%s]: Processing req", self.host, id(request_id) +# ) +# self.current_processing.add(request_id) +# return r + +# def on_err(r: object) -> object: +# # XXX: why is this necessary? this is called before we start +# # processing the request so why would the request be in +# # current_processing? +# self.current_processing.discard(request_id) +# return r + +# def on_both(r: object) -> object: +# # Ensure that we've properly cleaned up. +# self.sleeping_requests.discard(request_id) +# self.ready_request_queue.pop(request_id, None) +# return r + +# ret_defer.addCallbacks(on_start, on_err) +# ret_defer.addBoth(on_both) +# return make_deferred_yieldable(ret_defer) + +# def _on_exit(self, request_id: object) -> None: +# logger.debug("Ratelimit(%s) [%s]: Processed req", self.host, id(request_id)) + +# # When requests complete synchronously, we will recursively start the next +# # request in the queue. To avoid stack exhaustion, we defer starting the next +# # request until the next reactor tick. + +# def start_next_request() -> None: +# # We only remove the completed request from the list when we're about to +# # start the next one, otherwise we can allow extra requests through. +# self.current_processing.discard(request_id) +# try: +# # start processing the next item on the queue. 
+#                 _, deferred = self.ready_request_queue.popitem(last=False)
+
+#                 with PreserveLoggingContext():
+#                     deferred.callback(None)
+#             except KeyError:
+#                 pass
+
+#         self.clock.call_later(0.0, start_next_request)

From ada39e37210af7664e9e66ce56898b13ee69d61e Mon Sep 17 00:00:00 2001
From: Eric Eastwood
Date: Fri, 4 Jul 2025 11:18:52 -0500
Subject: [PATCH 02/48] Add more comments

---
 scripts-dev/mypy_synapse_plugin.py | 33 ++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py
index 817272900ae..590c7d41ae5 100644
--- a/scripts-dev/mypy_synapse_plugin.py
+++ b/scripts-dev/mypy_synapse_plugin.py
@@ -57,7 +57,7 @@ def get_function_signature_hook(
         self, fullname: str
     ) -> Optional[Callable[[FunctionSigContext], FunctionLike]]:
         if fullname in ("prometheus_client.metrics.Counter",):
-            return check_asdf
+            return check_prometheus_metric_instantiation
 
         return None
 
@@ -82,9 +82,19 @@ def get_method_signature_hook(
     return None
 
 
-def check_asdf(ctx: FunctionSigContext) -> CallableType:
+def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableType:
     """
-    TODO
+    Ensure that the `prometheus_client` metrics are instantiated with `server_name` in
+    the list of labels.
+
+    This is important because we support multiple Synapse instances running in the same
+    process, where all metrics share a single global `REGISTRY`. The `server_name` label
+    ensures metrics are correctly separated by homeserver.
+
+    There are also some metrics that apply at the process level, such as
+    CPU usage, Python garbage collection, Twisted reactor tick time which shouldn't
+    have a `server_name` label. In those cases, use a type ignore comment to
+    disable the check, e.g. `# type: ignore[misc]`.
     """
     # The true signature, this isn't being modified so this is what will be returned.
     signature: CallableType = ctx.default_signature
@@ -92,7 +102,7 @@ def check_asdf(ctx: FunctionSigContext) -> CallableType:
     # Sanity check the arguments are still as expected in this version of
     # `prometheus_client`. ex. `Counter(name, documentation, labelnames, ...)`
     #
-    # signature.arg_names should be: ["name", "documentation", "labelnames", ...]
+    # `signature.arg_names` should be: ["name", "documentation", "labelnames", ...]
     if len(signature.arg_names) < 3 or signature.arg_names[2] != "labelnames":
         ctx.api.fail(
             f"Expected the 3rd argument of {signature.name} to be 'labelnames', but got "
@@ -104,10 +114,12 @@ def check_asdf(ctx: FunctionSigContext) -> CallableType:
     # Ensure mypy is passing the correct number of arguments because we are doing some
     # dirty indexing into `ctx.args` later on.
     assert len(ctx.args) == len(signature.arg_names), (
-        f"Expected {len(signature.arg_names)} arguments for {signature.name}, "
-        f"but got {len(ctx.args)}"
+        f"Expected the list of arguments in the {signature.name} signature ({len(signature.arg_names)}) "
+        f"to match the number of arguments from the function signature context ({len(ctx.args)})"
     )
 
+    # Check if the `labelnames` argument includes the `server_name` label.
+    #
     # `ctx.args` should look like this:
     # ```
     # [
@@ -119,17 +131,22 @@ def check_asdf(ctx: FunctionSigContext) -> CallableType:
     labelnames_arg_expression = ctx.args[2][0] if len(ctx.args[2]) > 0 else None
     if isinstance(labelnames_arg_expression, ListExpr):
+        # Collect the label names as a list of strings.
labels = []
         for labelname_expression in labelnames_arg_expression.items:
             if isinstance(labelname_expression, StrExpr):
                 labels.append(labelname_expression.value)
             else:
                 ctx.api.fail(
-                    f"Expected all items in the 3rd argument of {signature.name} to be strings, but got "
+                    f"Expected all items in the `labelnames` argument of {signature.name} to be strings, but got "
                     f"{labelname_expression.__class__.__name__}",
                     ctx.context,
+                    code=CUSTOM_ERROR,
                 )
 
+        # Check if the `labelnames` argument includes the `server_name` label.
+        #
+        # TODO: Use `SERVER_NAME_LABEL` here
         if "server_name" not in labels:
             ctx.api.fail(
                 f"Expected {signature.name} to include 'server_name' in the list of labels",
                 ctx.context,
             )
     else:
         ctx.api.fail(
-            f"Expected the 3rd argument of {signature.name} to be a list of label names, but got "
+            f"Expected the `labelnames` argument of {signature.name} to be a list of label names, but got "
             f"{labelnames_arg_expression}",
             ctx.context,
         )

From 0c45aab8d184c4c803e9d4477390c3a876d13f8e Mon Sep 17 00:00:00 2001
From: Eric Eastwood
Date: Fri, 4 Jul 2025 11:30:34 -0500
Subject: [PATCH 03/48] Add specific error code

---
 scripts-dev/mypy_synapse_plugin.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py
index 590c7d41ae5..b55ec610c73 100644
--- a/scripts-dev/mypy_synapse_plugin.py
+++ b/scripts-dev/mypy_synapse_plugin.py
@@ -51,6 +51,17 @@
 )
 from mypy.nodes import StrExpr, TupleExpr, ListExpr
 
+PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL = ErrorCode(
+    "missing-server-name-label",
+    "'server_name' label required in metric",
+    category="per-homeserver-tenant-metrics",
+)
+PROMETHEUS_METRIC_MANGLED_LABELS = ErrorCode(
+    "mangled-labels",
+    "`labelnames` argument should be a list of strings",
+    category="per-homeserver-tenant-metrics",
+)
+
 
 class SynapsePlugin(Plugin):
@@ -94,7 +105,7 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy
     There are also some metrics that apply at the process level, such as
     CPU usage, Python garbage collection, Twisted reactor tick time which shouldn't
     have a `server_name` label. In those cases, use a type ignore comment to
-    disable the check, e.g. `# type: ignore[misc]`.
+    disable the check, e.g. `# type: ignore[missing-server-name-label]`.
     """
     # The true signature, this isn't being modified so this is what will be returned.
     signature: CallableType = ctx.default_signature
@@ -141,7 +152,7 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy
                     f"Expected all items in the `labelnames` argument of {signature.name} to be strings, but got "
                     f"{labelname_expression.__class__.__name__}",
                     ctx.context,
-                    code=CUSTOM_ERROR,
+                    code=PROMETHEUS_METRIC_MANGLED_LABELS,
                 )
 
         # Check if the `labelnames` argument includes the `server_name` label.
@@ -151,12 +162,14 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy ctx.api.fail( f"Expected {signature.name} to include 'server_name' in the list of labels", ctx.context, + code=PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL, ) else: ctx.api.fail( f"Expected the `labelnames` argument of {signature.name} to be a list of label names, but got " f"{labelnames_arg_expression}", ctx.context, + code=PROMETHEUS_METRIC_MANGLED_LABELS, ) return signature From 6e74eca2688edb1f9af697c22383e6d032eb2c5d Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 11:32:08 -0500 Subject: [PATCH 04/48] Revert temp changes to `synapse/util/ratelimitutils.py` --- synapse/util/ratelimitutils.py | 668 ++++++++++++++++----------------- 1 file changed, 334 insertions(+), 334 deletions(-) diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py index d8c3504754a..3f067b792c4 100644 --- a/synapse/util/ratelimitutils.py +++ b/synapse/util/ratelimitutils.py @@ -42,23 +42,23 @@ from prometheus_client.core import Counter -# from twisted.internet import defer - -# from synapse.api.errors import LimitExceededError -# from synapse.config.ratelimiting import FederationRatelimitSettings -# from synapse.logging.context import ( -# PreserveLoggingContext, -# make_deferred_yieldable, -# run_in_background, -# ) -# from synapse.logging.opentracing import start_active_span -# from synapse.metrics import Histogram, LaterGauge -# from synapse.util import Clock +from twisted.internet import defer + +from synapse.api.errors import LimitExceededError +from synapse.config.ratelimiting import FederationRatelimitSettings +from synapse.logging.context import ( + PreserveLoggingContext, + make_deferred_yieldable, + run_in_background, +) +from synapse.logging.opentracing import start_active_span +from synapse.metrics import Histogram, LaterGauge +from synapse.util import Clock -# if typing.TYPE_CHECKING: -# from contextlib import _GeneratorContextManager +if typing.TYPE_CHECKING: + from contextlib import _GeneratorContextManager -# logger = logging.getLogger(__name__) +logger = logging.getLogger(__name__) # Track how much the ratelimiter is affecting requests @@ -72,326 +72,326 @@ "Number of requests rejected by the rate limiter", ["rate_limiter_name"], ) -# queue_wait_timer = Histogram( -# "synapse_rate_limit_queue_wait_time_seconds", -# "Amount of time spent waiting for the rate limiter to let our request through.", -# ["rate_limiter_name"], -# buckets=( -# 0.005, -# 0.01, -# 0.025, -# 0.05, -# 0.1, -# 0.25, -# 0.5, -# 0.75, -# 1.0, -# 2.5, -# 5.0, -# 10.0, -# 20.0, -# "+Inf", -# ), -# ) +queue_wait_timer = Histogram( + "synapse_rate_limit_queue_wait_time_seconds", + "Amount of time spent waiting for the rate limiter to let our request through.", + ["rate_limiter_name"], + buckets=( + 0.005, + 0.01, + 0.025, + 0.05, + 0.1, + 0.25, + 0.5, + 0.75, + 1.0, + 2.5, + 5.0, + 10.0, + 20.0, + "+Inf", + ), +) # This must be a `WeakSet`, otherwise we indirectly hold on to entire `HomeServer`s # during trial test runs and leak a lot of memory. 
-# _rate_limiter_instances: MutableSet["FederationRateLimiter"] = WeakSet() -# # Protects the _rate_limiter_instances set from concurrent access -# _rate_limiter_instances_lock = threading.Lock() - - -# def _get_counts_from_rate_limiter_instance( -# count_func: Callable[["FederationRateLimiter"], int], -# ) -> Mapping[Tuple[str, ...], int]: -# """Returns a count of something (slept/rejected hosts) by (metrics_name)""" -# # Cast to a list to prevent it changing while the Prometheus -# # thread is collecting metrics -# with _rate_limiter_instances_lock: -# rate_limiter_instances = list(_rate_limiter_instances) - -# # Map from (metrics_name,) -> int, the number of something like slept hosts -# # or rejected hosts. The key type is Tuple[str], but we leave the length -# # unspecified for compatability with LaterGauge's annotations. -# counts: Dict[Tuple[str, ...], int] = {} -# for rate_limiter_instance in rate_limiter_instances: -# # Only track metrics if they provided a `metrics_name` to -# # differentiate this instance of the rate limiter. -# if rate_limiter_instance.metrics_name: -# key = (rate_limiter_instance.metrics_name,) -# counts[key] = count_func(rate_limiter_instance) - -# return counts - - -# # We track the number of affected hosts per time-period so we can -# # differentiate one really noisy homeserver from a general -# # ratelimit tuning problem across the federation. -# LaterGauge( -# "synapse_rate_limit_sleep_affected_hosts", -# "Number of hosts that had requests put to sleep", -# ["rate_limiter_name"], -# lambda: _get_counts_from_rate_limiter_instance( -# lambda rate_limiter_instance: sum( -# ratelimiter.should_sleep() -# for ratelimiter in rate_limiter_instance.ratelimiters.values() -# ) -# ), -# ) -# LaterGauge( -# "synapse_rate_limit_reject_affected_hosts", -# "Number of hosts that had requests rejected", -# ["rate_limiter_name"], -# lambda: _get_counts_from_rate_limiter_instance( -# lambda rate_limiter_instance: sum( -# ratelimiter.should_reject() -# for ratelimiter in rate_limiter_instance.ratelimiters.values() -# ) -# ), -# ) - - -# class FederationRateLimiter: -# """Used to rate limit request per-host.""" - -# def __init__( -# self, -# clock: Clock, -# config: FederationRatelimitSettings, -# metrics_name: Optional[str] = None, -# ): -# """ -# Args: -# clock -# config -# metrics_name: The name of the rate limiter so we can differentiate it -# from the rest in the metrics. If `None`, we don't track metrics -# for this rate limiter. - -# """ -# self.metrics_name = metrics_name - -# def new_limiter() -> "_PerHostRatelimiter": -# return _PerHostRatelimiter( -# clock=clock, config=config, metrics_name=metrics_name -# ) - -# self.ratelimiters: DefaultDict[str, "_PerHostRatelimiter"] = ( -# collections.defaultdict(new_limiter) -# ) - -# with _rate_limiter_instances_lock: -# _rate_limiter_instances.add(self) - -# def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None]]": -# """Used to ratelimit an incoming request from a given host - -# Example usage: - -# with rate_limiter.ratelimit(origin) as wait_deferred: -# yield wait_deferred -# # Handle request ... - -# Args: -# host: Origin of incoming request. - -# Returns: -# context manager which returns a deferred. 
-# """ -# return self.ratelimiters[host].ratelimit(host) - - -# class _PerHostRatelimiter: -# def __init__( -# self, -# clock: Clock, -# config: FederationRatelimitSettings, -# metrics_name: Optional[str] = None, -# ): -# """ -# Args: -# clock -# config -# metrics_name: The name of the rate limiter so we can differentiate it -# from the rest in the metrics. If `None`, we don't track metrics -# for this rate limiter. -# from the rest in the metrics -# """ -# self.clock = clock -# self.metrics_name = metrics_name - -# self.window_size = config.window_size -# self.sleep_limit = config.sleep_limit -# self.sleep_sec = config.sleep_delay / 1000.0 -# self.reject_limit = config.reject_limit -# self.concurrent_requests = config.concurrent - -# # request_id objects for requests which have been slept -# self.sleeping_requests: Set[object] = set() - -# # map from request_id object to Deferred for requests which are ready -# # for processing but have been queued -# self.ready_request_queue: collections.OrderedDict[ -# object, defer.Deferred[None] -# ] = collections.OrderedDict() - -# # request id objects for requests which are in progress -# self.current_processing: Set[object] = set() - -# # times at which we have recently (within the last window_size ms) -# # received requests. -# self.request_times: List[int] = [] - -# @contextlib.contextmanager -# def ratelimit(self, host: str) -> "Iterator[defer.Deferred[None]]": -# # `contextlib.contextmanager` takes a generator and turns it into a -# # context manager. The generator should only yield once with a value -# # to be returned by manager. -# # Exceptions will be reraised at the yield. - -# self.host = host - -# request_id = object() -# # Ideally we'd use `Deferred.fromCoroutine()` here, to save on redundant -# # type-checking, but we'd need Twisted >= 21.2. -# ret = defer.ensureDeferred(self._on_enter_with_tracing(request_id)) -# try: -# yield ret -# finally: -# self._on_exit(request_id) - -# def should_reject(self) -> bool: -# """ -# Whether to reject the request if we already have too many queued up -# (either sleeping or in the ready queue). -# """ -# queue_size = len(self.ready_request_queue) + len(self.sleeping_requests) -# return queue_size > self.reject_limit - -# def should_sleep(self) -> bool: -# """ -# Whether to sleep the request if we already have too many requests coming -# through within the window. -# """ -# return len(self.request_times) > self.sleep_limit - -# async def _on_enter_with_tracing(self, request_id: object) -> None: -# maybe_metrics_cm: ContextManager = contextlib.nullcontext() -# if self.metrics_name: -# maybe_metrics_cm = queue_wait_timer.labels(self.metrics_name).time() -# with start_active_span("ratelimit wait"), maybe_metrics_cm: -# await self._on_enter(request_id) - -# def _on_enter(self, request_id: object) -> "defer.Deferred[None]": -# time_now = self.clock.time_msec() - -# # remove any entries from request_times which aren't within the window -# self.request_times[:] = [ -# r for r in self.request_times if time_now - r < self.window_size -# ] - -# # reject the request if we already have too many queued up (either -# # sleeping or in the ready queue). 
-# if self.should_reject(): -# logger.debug("Ratelimiter(%s): rejecting request", self.host) -# if self.metrics_name: -# rate_limit_reject_counter.labels(self.metrics_name).inc() -# raise LimitExceededError( -# limiter_name="rc_federation", -# retry_after_ms=int(self.window_size / self.sleep_limit), -# ) - -# self.request_times.append(time_now) - -# def queue_request() -> "defer.Deferred[None]": -# if len(self.current_processing) >= self.concurrent_requests: -# queue_defer: defer.Deferred[None] = defer.Deferred() -# self.ready_request_queue[request_id] = queue_defer -# logger.info( -# "Ratelimiter(%s): queueing request (queue now %i items)", -# self.host, -# len(self.ready_request_queue), -# ) - -# return queue_defer -# else: -# return defer.succeed(None) - -# logger.debug( -# "Ratelimit(%s) [%s]: len(self.request_times)=%d", -# self.host, -# id(request_id), -# len(self.request_times), -# ) - -# if self.should_sleep(): -# logger.debug( -# "Ratelimiter(%s) [%s]: sleeping request for %f sec", -# self.host, -# id(request_id), -# self.sleep_sec, -# ) -# if self.metrics_name: -# rate_limit_sleep_counter.labels(self.metrics_name).inc() -# ret_defer = run_in_background(self.clock.sleep, self.sleep_sec) - -# self.sleeping_requests.add(request_id) - -# def on_wait_finished(_: Any) -> "defer.Deferred[None]": -# logger.debug( -# "Ratelimit(%s) [%s]: Finished sleeping", self.host, id(request_id) -# ) -# self.sleeping_requests.discard(request_id) -# queue_defer = queue_request() -# return queue_defer - -# ret_defer.addBoth(on_wait_finished) -# else: -# ret_defer = queue_request() - -# def on_start(r: object) -> object: -# logger.debug( -# "Ratelimit(%s) [%s]: Processing req", self.host, id(request_id) -# ) -# self.current_processing.add(request_id) -# return r - -# def on_err(r: object) -> object: -# # XXX: why is this necessary? this is called before we start -# # processing the request so why would the request be in -# # current_processing? -# self.current_processing.discard(request_id) -# return r - -# def on_both(r: object) -> object: -# # Ensure that we've properly cleaned up. -# self.sleeping_requests.discard(request_id) -# self.ready_request_queue.pop(request_id, None) -# return r - -# ret_defer.addCallbacks(on_start, on_err) -# ret_defer.addBoth(on_both) -# return make_deferred_yieldable(ret_defer) - -# def _on_exit(self, request_id: object) -> None: -# logger.debug("Ratelimit(%s) [%s]: Processed req", self.host, id(request_id)) - -# # When requests complete synchronously, we will recursively start the next -# # request in the queue. To avoid stack exhaustion, we defer starting the next -# # request until the next reactor tick. - -# def start_next_request() -> None: -# # We only remove the completed request from the list when we're about to -# # start the next one, otherwise we can allow extra requests through. -# self.current_processing.discard(request_id) -# try: -# # start processing the next item on the queue. 
-# _, deferred = self.ready_request_queue.popitem(last=False) - -# with PreserveLoggingContext(): -# deferred.callback(None) -# except KeyError: -# pass - -# self.clock.call_later(0.0, start_next_request) +_rate_limiter_instances: MutableSet["FederationRateLimiter"] = WeakSet() +# Protects the _rate_limiter_instances set from concurrent access +_rate_limiter_instances_lock = threading.Lock() + + +def _get_counts_from_rate_limiter_instance( + count_func: Callable[["FederationRateLimiter"], int], +) -> Mapping[Tuple[str, ...], int]: + """Returns a count of something (slept/rejected hosts) by (metrics_name)""" + # Cast to a list to prevent it changing while the Prometheus + # thread is collecting metrics + with _rate_limiter_instances_lock: + rate_limiter_instances = list(_rate_limiter_instances) + + # Map from (metrics_name,) -> int, the number of something like slept hosts + # or rejected hosts. The key type is Tuple[str], but we leave the length + # unspecified for compatability with LaterGauge's annotations. + counts: Dict[Tuple[str, ...], int] = {} + for rate_limiter_instance in rate_limiter_instances: + # Only track metrics if they provided a `metrics_name` to + # differentiate this instance of the rate limiter. + if rate_limiter_instance.metrics_name: + key = (rate_limiter_instance.metrics_name,) + counts[key] = count_func(rate_limiter_instance) + + return counts + + +# We track the number of affected hosts per time-period so we can +# differentiate one really noisy homeserver from a general +# ratelimit tuning problem across the federation. +LaterGauge( + "synapse_rate_limit_sleep_affected_hosts", + "Number of hosts that had requests put to sleep", + ["rate_limiter_name"], + lambda: _get_counts_from_rate_limiter_instance( + lambda rate_limiter_instance: sum( + ratelimiter.should_sleep() + for ratelimiter in rate_limiter_instance.ratelimiters.values() + ) + ), +) +LaterGauge( + "synapse_rate_limit_reject_affected_hosts", + "Number of hosts that had requests rejected", + ["rate_limiter_name"], + lambda: _get_counts_from_rate_limiter_instance( + lambda rate_limiter_instance: sum( + ratelimiter.should_reject() + for ratelimiter in rate_limiter_instance.ratelimiters.values() + ) + ), +) + + +class FederationRateLimiter: + """Used to rate limit request per-host.""" + + def __init__( + self, + clock: Clock, + config: FederationRatelimitSettings, + metrics_name: Optional[str] = None, + ): + """ + Args: + clock + config + metrics_name: The name of the rate limiter so we can differentiate it + from the rest in the metrics. If `None`, we don't track metrics + for this rate limiter. + + """ + self.metrics_name = metrics_name + + def new_limiter() -> "_PerHostRatelimiter": + return _PerHostRatelimiter( + clock=clock, config=config, metrics_name=metrics_name + ) + + self.ratelimiters: DefaultDict[str, "_PerHostRatelimiter"] = ( + collections.defaultdict(new_limiter) + ) + + with _rate_limiter_instances_lock: + _rate_limiter_instances.add(self) + + def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None]]": + """Used to ratelimit an incoming request from a given host + + Example usage: + + with rate_limiter.ratelimit(origin) as wait_deferred: + yield wait_deferred + # Handle request ... + + Args: + host: Origin of incoming request. + + Returns: + context manager which returns a deferred. 
+ """ + return self.ratelimiters[host].ratelimit(host) + + +class _PerHostRatelimiter: + def __init__( + self, + clock: Clock, + config: FederationRatelimitSettings, + metrics_name: Optional[str] = None, + ): + """ + Args: + clock + config + metrics_name: The name of the rate limiter so we can differentiate it + from the rest in the metrics. If `None`, we don't track metrics + for this rate limiter. + from the rest in the metrics + """ + self.clock = clock + self.metrics_name = metrics_name + + self.window_size = config.window_size + self.sleep_limit = config.sleep_limit + self.sleep_sec = config.sleep_delay / 1000.0 + self.reject_limit = config.reject_limit + self.concurrent_requests = config.concurrent + + # request_id objects for requests which have been slept + self.sleeping_requests: Set[object] = set() + + # map from request_id object to Deferred for requests which are ready + # for processing but have been queued + self.ready_request_queue: collections.OrderedDict[ + object, defer.Deferred[None] + ] = collections.OrderedDict() + + # request id objects for requests which are in progress + self.current_processing: Set[object] = set() + + # times at which we have recently (within the last window_size ms) + # received requests. + self.request_times: List[int] = [] + + @contextlib.contextmanager + def ratelimit(self, host: str) -> "Iterator[defer.Deferred[None]]": + # `contextlib.contextmanager` takes a generator and turns it into a + # context manager. The generator should only yield once with a value + # to be returned by manager. + # Exceptions will be reraised at the yield. + + self.host = host + + request_id = object() + # Ideally we'd use `Deferred.fromCoroutine()` here, to save on redundant + # type-checking, but we'd need Twisted >= 21.2. + ret = defer.ensureDeferred(self._on_enter_with_tracing(request_id)) + try: + yield ret + finally: + self._on_exit(request_id) + + def should_reject(self) -> bool: + """ + Whether to reject the request if we already have too many queued up + (either sleeping or in the ready queue). + """ + queue_size = len(self.ready_request_queue) + len(self.sleeping_requests) + return queue_size > self.reject_limit + + def should_sleep(self) -> bool: + """ + Whether to sleep the request if we already have too many requests coming + through within the window. + """ + return len(self.request_times) > self.sleep_limit + + async def _on_enter_with_tracing(self, request_id: object) -> None: + maybe_metrics_cm: ContextManager = contextlib.nullcontext() + if self.metrics_name: + maybe_metrics_cm = queue_wait_timer.labels(self.metrics_name).time() + with start_active_span("ratelimit wait"), maybe_metrics_cm: + await self._on_enter(request_id) + + def _on_enter(self, request_id: object) -> "defer.Deferred[None]": + time_now = self.clock.time_msec() + + # remove any entries from request_times which aren't within the window + self.request_times[:] = [ + r for r in self.request_times if time_now - r < self.window_size + ] + + # reject the request if we already have too many queued up (either + # sleeping or in the ready queue). 
+ if self.should_reject(): + logger.debug("Ratelimiter(%s): rejecting request", self.host) + if self.metrics_name: + rate_limit_reject_counter.labels(self.metrics_name).inc() + raise LimitExceededError( + limiter_name="rc_federation", + retry_after_ms=int(self.window_size / self.sleep_limit), + ) + + self.request_times.append(time_now) + + def queue_request() -> "defer.Deferred[None]": + if len(self.current_processing) >= self.concurrent_requests: + queue_defer: defer.Deferred[None] = defer.Deferred() + self.ready_request_queue[request_id] = queue_defer + logger.info( + "Ratelimiter(%s): queueing request (queue now %i items)", + self.host, + len(self.ready_request_queue), + ) + + return queue_defer + else: + return defer.succeed(None) + + logger.debug( + "Ratelimit(%s) [%s]: len(self.request_times)=%d", + self.host, + id(request_id), + len(self.request_times), + ) + + if self.should_sleep(): + logger.debug( + "Ratelimiter(%s) [%s]: sleeping request for %f sec", + self.host, + id(request_id), + self.sleep_sec, + ) + if self.metrics_name: + rate_limit_sleep_counter.labels(self.metrics_name).inc() + ret_defer = run_in_background(self.clock.sleep, self.sleep_sec) + + self.sleeping_requests.add(request_id) + + def on_wait_finished(_: Any) -> "defer.Deferred[None]": + logger.debug( + "Ratelimit(%s) [%s]: Finished sleeping", self.host, id(request_id) + ) + self.sleeping_requests.discard(request_id) + queue_defer = queue_request() + return queue_defer + + ret_defer.addBoth(on_wait_finished) + else: + ret_defer = queue_request() + + def on_start(r: object) -> object: + logger.debug( + "Ratelimit(%s) [%s]: Processing req", self.host, id(request_id) + ) + self.current_processing.add(request_id) + return r + + def on_err(r: object) -> object: + # XXX: why is this necessary? this is called before we start + # processing the request so why would the request be in + # current_processing? + self.current_processing.discard(request_id) + return r + + def on_both(r: object) -> object: + # Ensure that we've properly cleaned up. + self.sleeping_requests.discard(request_id) + self.ready_request_queue.pop(request_id, None) + return r + + ret_defer.addCallbacks(on_start, on_err) + ret_defer.addBoth(on_both) + return make_deferred_yieldable(ret_defer) + + def _on_exit(self, request_id: object) -> None: + logger.debug("Ratelimit(%s) [%s]: Processed req", self.host, id(request_id)) + + # When requests complete synchronously, we will recursively start the next + # request in the queue. To avoid stack exhaustion, we defer starting the next + # request until the next reactor tick. + + def start_next_request() -> None: + # We only remove the completed request from the list when we're about to + # start the next one, otherwise we can allow extra requests through. + self.current_processing.discard(request_id) + try: + # start processing the next item on the queue. 
+ _, deferred = self.ready_request_queue.popitem(last=False) + + with PreserveLoggingContext(): + deferred.callback(None) + except KeyError: + pass + + self.clock.call_later(0.0, start_next_request) From 7651a44635cc58e22c89cb4f0ad4541298b54410 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 11:33:20 -0500 Subject: [PATCH 05/48] Indicate future plan --- scripts-dev/mypy_synapse_plugin.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py index b55ec610c73..e0574d01738 100644 --- a/scripts-dev/mypy_synapse_plugin.py +++ b/scripts-dev/mypy_synapse_plugin.py @@ -67,7 +67,11 @@ class SynapsePlugin(Plugin): def get_function_signature_hook( self, fullname: str ) -> Optional[Callable[[FunctionSigContext], FunctionLike]]: - if fullname in ("prometheus_client.metrics.Counter",): + if fullname in ( + "prometheus_client.metrics.Counter", + # TODO: Add other prometheus_client metrics that need checking as we + # refactor, see https://github.com/element-hq/synapse/issues/18592 + ): return check_prometheus_metric_instantiation return None From 944abdd09df04bc03281e32dfb6e729fe297a54f Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 11:48:16 -0500 Subject: [PATCH 06/48] Adjust errors --- scripts-dev/mypy_synapse_plugin.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py index e0574d01738..e19e514ef90 100644 --- a/scripts-dev/mypy_synapse_plugin.py +++ b/scripts-dev/mypy_synapse_plugin.py @@ -28,14 +28,12 @@ import mypy.types from mypy.erasetype import remove_instance_last_known_values from mypy.errorcodes import ErrorCode -from mypy.nodes import ARG_NAMED_OPT, TempNode, Var +from mypy.nodes import ARG_NAMED_OPT, ListExpr, StrExpr, TempNode, Var from mypy.plugin import ( + FunctionLike, FunctionSigContext, MethodSigContext, Plugin, - FunctionContext, - ClassDefContext, - FunctionLike, ) from mypy.typeops import bind_self from mypy.types import ( @@ -49,7 +47,6 @@ UninhabitedType, UnionType, ) -from mypy.nodes import StrExpr, TupleExpr, ListExpr PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL = ErrorCode( "missing-server-name-label", @@ -170,10 +167,10 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy ) else: ctx.api.fail( - f"Expected the `labelnames` argument of {signature.name} to be a list of label names, but got " - f"{labelnames_arg_expression}", + f"Expected the `labelnames` argument of {signature.name} to be a list of label names " + f"(including 'server_name'), but got {labelnames_arg_expression}", ctx.context, - code=PROMETHEUS_METRIC_MANGLED_LABELS, + code=PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL, ) return signature From 143fa3f61138020aae205a10a111e16f98afa141 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 11:52:08 -0500 Subject: [PATCH 07/48] Explain when it's okay to ignore lint --- scripts-dev/mypy_synapse_plugin.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py index e19e514ef90..f7dbbc634d6 100644 --- a/scripts-dev/mypy_synapse_plugin.py +++ b/scripts-dev/mypy_synapse_plugin.py @@ -161,14 +161,18 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy # TODO: Use `SERVER_NAME_LABEL` here if "server_name" not in labels: ctx.api.fail( - f"Expected {signature.name} to include 
'server_name' in the list of labels",
+                f"Expected {signature.name} to include 'server_name' in the list of labels."
+                " If this is a process-level metric (vs homeserver-level), use a type ignore "
+                "comment to disable this check.",
                 ctx.context,
                 code=PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL,
             )
     else:
         ctx.api.fail(
             f"Expected the `labelnames` argument of {signature.name} to be a list of label names "
-            f"(including 'server_name'), but got {labelnames_arg_expression}",
+            f"(including 'server_name'), but got {labelnames_arg_expression}.",
+            " If this is a process-level metric (vs homeserver-level), use a type ignore "
+            "comment to disable this check.",
             ctx.context,
             code=PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL,
         )

From f66758fc1cf3a3bd94c149bae89df671729e076a Mon Sep 17 00:00:00 2001
From: Eric Eastwood
Date: Fri, 4 Jul 2025 12:54:43 -0500
Subject: [PATCH 08/48] Use `SERVER_NAME_LABEL`

---
 scripts-dev/mypy_synapse_plugin.py | 30 +++++++++++++++---------------
 synapse/metrics/__init__.py        | 15 +++++++++++++++
 2 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py
index f7dbbc634d6..8821e3a8770 100644
--- a/scripts-dev/mypy_synapse_plugin.py
+++ b/scripts-dev/mypy_synapse_plugin.py
@@ -48,6 +48,8 @@
     UnionType,
 )
 
+from synapse.metrics import SERVER_NAME_LABEL
+
 PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL = ErrorCode(
     "missing-server-name-label",
     "'server_name' label required in metric",
@@ -97,16 +99,16 @@ def get_method_signature_hook(
 def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableType:
     """
     Ensure that the `prometheus_client` metrics are instantiated with `server_name` in
-    the list of labels.
+    the list of labels (use constant `SERVER_NAME_LABEL`).
 
     This is important because we support multiple Synapse instances running in the same
     process, where all metrics share a single global `REGISTRY`. The `server_name` label
     ensures metrics are correctly separated by homeserver.
 
-    There are also some metrics that apply at the process level, such as
-    CPU usage, Python garbage collection, Twisted reactor tick time which shouldn't
-    have a `server_name` label. In those cases, use a type ignore comment to
-    disable the check, e.g. `# type: ignore[missing-server-name-label]`.
+    There are also some metrics that apply at the process level, such as CPU usage,
+    Python garbage collection, Twisted reactor tick time which shouldn't have a
+    `server_name` label. In those cases, use a type ignore
+    comment to disable the check, e.g. `# type: ignore[missing-server-name-label]`.
     """
     # The true signature, this isn't being modified so this is what will be returned.
     signature: CallableType = ctx.default_signature
@@ -130,7 +132,7 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy
         f"to match the number of arguments from the function signature context ({len(ctx.args)})"
     )
 
-    # Check if the `labelnames` argument includes the `server_name` label.
+    # Check if the `labelnames` argument includes the `server_name` label (`SERVER_NAME_LABEL`).
From f66758fc1cf3a3bd94c149bae89df671729e076a Mon Sep 17 00:00:00 2001
From: Eric Eastwood
Date: Fri, 4 Jul 2025 12:54:43 -0500
Subject: [PATCH 08/48] Use `SERVER_NAME_LABEL`

---
 scripts-dev/mypy_synapse_plugin.py | 30 +++++++++++++++---------------
 synapse/metrics/__init__.py        | 15 +++++++++++++++
 2 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py
index f7dbbc634d6..8821e3a8770 100644
--- a/scripts-dev/mypy_synapse_plugin.py
+++ b/scripts-dev/mypy_synapse_plugin.py
@@ -48,6 +48,8 @@
     UnionType,
 )
 
+from synapse.metrics import SERVER_NAME_LABEL
+
 PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL = ErrorCode(
     "missing-server-name-label",
     "'server_name' label required in metric",
@@ -97,16 +99,16 @@ def get_method_signature_hook(
 
 def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableType:
     """
     Ensure that the `prometheus_client` metrics are instantiated with `server_name` in
-    the list of labels.
+    the list of labels (use constant `SERVER_NAME_LABEL`).
 
     This is important because we support multiple Synapse instances running in the
     same process, where all metrics share a single global `REGISTRY`. The
     `server_name` label ensures metrics are correctly separated by homeserver.
 
-    There are also some metrics that apply at the process level, such as
-    CPU usage, Python garbage collection, Twisted reactor tick time which shouldn't
-    have a `server_name` label. In those cases, use a type ignore comment to
-    disable the check, e.g. `# type: ignore[missing-server-name-label]`.
+    There are also some metrics that apply at the process level, such as CPU usage,
+    Python garbage collection, Twisted reactor tick time which shouldn't have a
+    `server_name` label. In those cases, use a type ignore
+    comment to disable the check, e.g. `# type: ignore[missing-server-name-label]`.
     """
     # The true signature, this isn't being modified so this is what will be returned.
     signature: CallableType = ctx.default_signature
@@ -130,7 +132,7 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy
         f"to match the number of arguments from the function signature context ({len(ctx.args)})"
     )
 
-    # Check if the `labelnames` argument includes the `server_name` label.
+    # Check if the `labelnames` argument includes the `server_name` label (`SERVER_NAME_LABEL`).
     #
     # `ctx.args` should look like this:
     # ```
@@ -156,22 +158,20 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy
             code=PROMETHEUS_METRIC_MANGLED_LABELS,
         )
 
-        # Check if the `labelnames` argument includes the `server_name` label.
-        #
-        # TODO: Use `SERVER_NAME_LABEL` here
-        if "server_name" not in labels:
+        # Check if the `labelnames` argument includes the `server_name` label (`SERVER_NAME_LABEL`).
+        if SERVER_NAME_LABEL not in labels:
             ctx.api.fail(
-                f"Expected {signature.name} to include 'server_name' in the list of labels."
-                " If this is a process-level metric (vs homeserver-level), use a type ignore "
-                "comment to disable this check.",
+                f"Expected {signature.name} to include 'server_name' (use constant `SERVER_NAME_LABEL`) "
+                "in the list of labels. If this is a process-level metric (vs homeserver-level), "
+                "use a type ignore comment to disable this check.",
                 ctx.context,
                 code=PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL,
             )
         else:
             ctx.api.fail(
                 f"Expected the `labelnames` argument of {signature.name} to be a list of label names "
-                f"(including 'server_name'), but got {labelnames_arg_expression}."
-                " If this is a process-level metric (vs homeserver-level), use a type ignore "
-                "comment to disable this check.",
+                f"(including 'server_name', use constant `SERVER_NAME_LABEL`), but got {labelnames_arg_expression}. "
+                "If this is a process-level metric (vs homeserver-level), use a type ignore "
+                "comment to disable this check.",
                 ctx.context,
                 code=PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL,
             )
diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py
index 86ac2c23959..da58df6b968 100644
--- a/synapse/metrics/__init__.py
+++ b/synapse/metrics/__init__.py
@@ -66,6 +66,21 @@
 
 HAVE_PROC_SELF_STAT = os.path.exists("/proc/self/stat")
 
+SERVER_NAME_LABEL = "server_name"
+"""
+The `server_name` label is used to identify the homeserver that the metrics correspond
+to. Because we support multiple instances of Synapse running in the same process and all
+metrics are in a single global `REGISTRY`, we need to manually label any metrics.
+
+In the case of a Synapse homeserver, this should be set to the homeserver name
+(`hs.hostname`).
+
+We're purposely not using the `instance` label for this purpose as that should be "The
+<host>:<port> part of the target's URL that was scraped." Also: "In Prometheus
+terms, an endpoint you can scrape is called an *instance*, usually corresponding to a
+single process." (source: https://prometheus.io/docs/concepts/jobs_instances/)
+"""
+
 
 class _RegistryProxy:
 
     @staticmethod
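
For reference, a minimal sketch of the labelling pattern this patch starts to enforce
(illustrative metric and homeserver names only; the next patch teaches the lint to
recognise the `SERVER_NAME_LABEL` constant itself rather than the bare string):

    from prometheus_client import Counter

    from synapse.metrics import SERVER_NAME_LABEL

    # Per-homeserver metric: `SERVER_NAME_LABEL` keeps tenants separated in the
    # shared global `REGISTRY`.
    example_events_counter = Counter(
        "synapse_example_events_handled",
        "Number of events handled, per homeserver",
        labelnames=["type", SERVER_NAME_LABEL],
    )

    # At use time, the label is filled in with the homeserver name (`hs.hostname`).
    example_events_counter.labels(
        type="message", **{SERVER_NAME_LABEL: "hs1.example.com"}
    ).inc()
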
From 3705aecdb3b0216a15211f84bbb0137014b42361 Mon Sep 17 00:00:00 2001
From: Eric Eastwood
Date: Fri, 4 Jul 2025 13:09:52 -0500
Subject: [PATCH 09/48] Support `NameExpr` `SERVER_NAME_LABEL` in lint

---
 scripts-dev/mypy_synapse_plugin.py | 57 ++++++++++++------------------
 1 file changed, 22 insertions(+), 35 deletions(-)

diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py
index 8821e3a8770..224b4562c44 100644
--- a/scripts-dev/mypy_synapse_plugin.py
+++ b/scripts-dev/mypy_synapse_plugin.py
@@ -28,7 +28,7 @@
 import mypy.types
 from mypy.erasetype import remove_instance_last_known_values
 from mypy.errorcodes import ErrorCode
-from mypy.nodes import ARG_NAMED_OPT, ListExpr, StrExpr, TempNode, Var
+from mypy.nodes import ARG_NAMED_OPT, ListExpr, NameExpr, TempNode, Var
 from mypy.plugin import (
     FunctionLike,
     FunctionSigContext,
@@ -48,16 +48,9 @@
     UnionType,
 )
 
-from synapse.metrics import SERVER_NAME_LABEL
-
 PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL = ErrorCode(
     "missing-server-name-label",
-    "'server_name' label required in metric",
-    category="per-homeserver-tenant-metrics",
-)
-PROMETHEUS_METRIC_MANGLED_LABELS = ErrorCode(
-    "mangled-labels",
-    "`labelnames` argument should be a list of strings",
+    "`SERVER_NAME_LABEL` required in metric",
     category="per-homeserver-tenant-metrics",
 )
 
@@ -98,17 +91,17 @@ def get_method_signature_hook(
 
 def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableType:
     """
-    Ensure that the `prometheus_client` metrics are instantiated with `server_name` in
-    the list of labels (use constant `SERVER_NAME_LABEL`).
+    Ensure that the `prometheus_client` metrics include the `SERVER_NAME_LABEL` label
+    when instantiated.
 
     This is important because we support multiple Synapse instances running in the
     same process, where all metrics share a single global `REGISTRY`. The
     `server_name` label ensures metrics are correctly separated by homeserver.
 
     There are also some metrics that apply at the process level, such as CPU usage,
-    Python garbage collection, Twisted reactor tick time which shouldn't have a
-    `server_name` label. In those cases, use a type ignore
-    comment to disable the check, e.g. `# type: ignore[missing-server-name-label]`.
+    Python garbage collection, Twisted reactor tick time which shouldn't have the
+    `SERVER_NAME_LABEL`. In those cases, use a type ignore comment to disable the
+    check, e.g. `# type: ignore[missing-server-name-label]`.
     """
     # The true signature, this isn't being modified so this is what will be returned.
     signature: CallableType = ctx.default_signature
@@ -132,7 +125,7 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy
         f"to match the number of arguments from the function signature context ({len(ctx.args)})"
     )
 
-    # Check if the `labelnames` argument includes the `server_name` label (`SERVER_NAME_LABEL`).
+    # Check if the `labelnames` argument includes `SERVER_NAME_LABEL`
     #
     # `ctx.args` should look like this:
     # ```
@@ -145,34 +138,28 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy
     # ```
     labelnames_arg_expression = ctx.args[2][0] if len(ctx.args[2]) > 0 else None
     if isinstance(labelnames_arg_expression, ListExpr):
-        # Collect the label names as a list of strings.
- labels = [] - for labelname_expression in labelnames_arg_expression.items: - if isinstance(labelname_expression, StrExpr): - labels.append(labelname_expression.value) - else: - ctx.api.fail( - f"Expected all items in the `labelnames` argument of {signature.name} to be strings, but got " - f"{labelname_expression.__class__.__name__}", - ctx.context, - code=PROMETHEUS_METRIC_MANGLED_LABELS, - ) - # Check if the `labelnames` argument includes the `server_name` label (`SERVER_NAME_LABEL`). - if SERVER_NAME_LABEL not in labels: + for labelname_expression in labelnames_arg_expression.items: + if ( + isinstance(labelname_expression, NameExpr) + and labelname_expression.fullname == "synapse.metrics.SERVER_NAME_LABEL" + ): + # Found the `SERVER_NAME_LABEL`, all good! + break + else: ctx.api.fail( - f"Expected {signature.name} to include 'server_name' (use constant `SERVER_NAME_LABEL`) " - "in the list of labels. If this is a process-level metric (vs homeserver-level), " - "use a type ignore comment to disable this check.", + f"Expected {signature.name} to include `SERVER_NAME_LABEL` in the list of labels. " + "If this is a process-level metric (vs homeserver-level), use a type ignore comment " + "to disable this check.", ctx.context, code=PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL, ) else: ctx.api.fail( f"Expected the `labelnames` argument of {signature.name} to be a list of label names " - f"(including 'server_name', use constant `SERVER_NAME_LABEL`), but got {labelnames_arg_expression}. " - "If this is a process-level metric (vs homeserver-level), use a type ignore " - "comment to disable this check.", + f"(including `SERVER_NAME_LABEL`), but got {labelnames_arg_expression}. " + "If this is a process-level metric (vs homeserver-level), use a type ignore comment " + "to disable this check.", ctx.context, code=PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL, ) From 2ba8a31c7bfa00f62bc5d7a50829d9ad7fbbc22e Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 13:25:38 -0500 Subject: [PATCH 10/48] Fill in `synapse/notifier.py` --- synapse/notifier.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/synapse/notifier.py b/synapse/notifier.py index 6190432b877..887957b151b 100644 --- a/synapse/notifier.py +++ b/synapse/notifier.py @@ -50,7 +50,7 @@ from synapse.logging import issue9533_logger from synapse.logging.context import PreserveLoggingContext from synapse.logging.opentracing import log_kv, start_active_span -from synapse.metrics import LaterGauge +from synapse.metrics import SERVER_NAME_LABEL, LaterGauge from synapse.streams.config import PaginationConfig from synapse.types import ( ISynapseReactor, @@ -74,10 +74,10 @@ logger = logging.getLogger(__name__) -notified_events_counter = Counter("synapse_notifier_notified_events", "") - users_woken_by_stream_counter = Counter( - "synapse_notifier_users_woken_by_stream", "", ["stream"] + "synapse_notifier_users_woken_by_stream", + "", + labelnames=["stream", SERVER_NAME_LABEL], ) T = TypeVar("T") @@ -224,6 +224,7 @@ def __init__(self, hs: "HomeServer"): self.room_to_user_streams: Dict[str, Set[_NotifierUserStream]] = {} self.hs = hs + self.server_name = hs.hostname self._storage_controllers = hs.get_storage_controllers() self.event_sources = hs.get_event_sources() self.store = hs.get_datastores().main @@ -350,9 +351,10 @@ async def on_un_partial_stated_room( for listener in listeners: listener.callback(current_token) - users_woken_by_stream_counter.labels(StreamKeyType.UN_PARTIAL_STATED_ROOMS).inc( - 
len(user_streams) - ) + users_woken_by_stream_counter.labels( + stream=StreamKeyType.UN_PARTIAL_STATED_ROOMS, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc(len(user_streams)) # Poke the replication so that other workers also see the write to # the un-partial-stated rooms stream. @@ -575,7 +577,10 @@ def on_new_event( listener.callback(current_token) if user_streams: - users_woken_by_stream_counter.labels(stream_key).inc(len(user_streams)) + users_woken_by_stream_counter.labels( + stream=stream_key, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc(len(user_streams)) self.notify_replication() From a28be1acc650ae8909a02becec3b7d129b4003d9 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 13:35:44 -0500 Subject: [PATCH 11/48] Fill in `synapse/appservice/api.py` --- synapse/appservice/api.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/synapse/appservice/api.py b/synapse/appservice/api.py index 19322471dc5..88a22185df3 100644 --- a/synapse/appservice/api.py +++ b/synapse/appservice/api.py @@ -48,6 +48,7 @@ from synapse.events.utils import SerializeEventConfig, serialize_event from synapse.http.client import SimpleHttpClient, is_unknown_endpoint from synapse.logging import opentracing +from synapse.metrics import SERVER_NAME_LABEL from synapse.types import DeviceListUpdates, JsonDict, JsonMapping, ThirdPartyInstanceID from synapse.util.caches.response_cache import ResponseCache @@ -59,29 +60,31 @@ sent_transactions_counter = Counter( "synapse_appservice_api_sent_transactions", "Number of /transactions/ requests sent", - ["service"], + labelnames=["service", SERVER_NAME_LABEL], ) failed_transactions_counter = Counter( "synapse_appservice_api_failed_transactions", "Number of /transactions/ requests that failed to send", - ["service"], + labelnames=["service", SERVER_NAME_LABEL], ) sent_events_counter = Counter( - "synapse_appservice_api_sent_events", "Number of events sent to the AS", ["service"] + "synapse_appservice_api_sent_events", + "Number of events sent to the AS", + labelnames=["service", SERVER_NAME_LABEL], ) sent_ephemeral_counter = Counter( "synapse_appservice_api_sent_ephemeral", "Number of ephemeral events sent to the AS", - ["service"], + labelnames=["service", SERVER_NAME_LABEL], ) sent_todevice_counter = Counter( "synapse_appservice_api_sent_todevice", "Number of todevice messages sent to the AS", - ["service"], + labelnames=["service", SERVER_NAME_LABEL], ) HOUR_IN_MS = 60 * 60 * 1000 @@ -126,6 +129,7 @@ class ApplicationServiceApi(SimpleHttpClient): def __init__(self, hs: "HomeServer"): super().__init__(hs) + self.server_name = hs.hostname self.clock = hs.get_clock() self.config = hs.config.appservice @@ -378,6 +382,7 @@ async def push_bulk( "left": list(device_list_summary.left), } + labels = {"service": service.id, SERVER_NAME_LABEL: self.server_name} try: args = None if self.config.use_appservice_legacy_authorization: @@ -395,10 +400,10 @@ async def push_bulk( service.url, [event.get("event_id") for event in events], ) - sent_transactions_counter.labels(service.id).inc() - sent_events_counter.labels(service.id).inc(len(serialized_events)) - sent_ephemeral_counter.labels(service.id).inc(len(ephemeral)) - sent_todevice_counter.labels(service.id).inc(len(to_device_messages)) + sent_transactions_counter.labels(**labels).inc() + sent_events_counter.labels(**labels).inc(len(serialized_events)) + sent_ephemeral_counter.labels(**labels).inc(len(ephemeral)) + 
sent_todevice_counter.labels(**labels).inc(len(to_device_messages)) return True except CodeMessageException as e: logger.warning( @@ -417,7 +422,7 @@ async def push_bulk( ex.args, exc_info=logger.isEnabledFor(logging.DEBUG), ) - failed_transactions_counter.labels(service.id).inc() + failed_transactions_counter.labels(**labels).inc() return False async def claim_client_keys( From 48782ac975988952ad4f1531762d4ab8f96862e0 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 13:44:02 -0500 Subject: [PATCH 12/48] Fill in `synapse/federation/federation_client.py` --- synapse/federation/federation_client.py | 29 +++++++++++++++++++------ 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/synapse/federation/federation_client.py b/synapse/federation/federation_client.py index 2d1da70793c..4566ace7091 100644 --- a/synapse/federation/federation_client.py +++ b/synapse/federation/federation_client.py @@ -74,6 +74,7 @@ from synapse.http.client import is_unknown_endpoint from synapse.http.types import QueryParams from synapse.logging.opentracing import SynapseTags, log_kv, set_tag, tag_args, trace +from synapse.metrics import SERVER_NAME_LABEL from synapse.types import JsonDict, StrCollection, UserID, get_domain_from_id from synapse.types.handlers.policy_server import RECOMMENDATION_OK, RECOMMENDATION_SPAM from synapse.util.async_helpers import concurrently_execute @@ -85,7 +86,9 @@ logger = logging.getLogger(__name__) -sent_queries_counter = Counter("synapse_federation_client_sent_queries", "", ["type"]) +sent_queries_counter = Counter( + "synapse_federation_client_sent_queries", "", labelnames=["type", SERVER_NAME_LABEL] +) PDU_RETRY_TIME_MS = 1 * 60 * 1000 @@ -137,7 +140,7 @@ def __init__(self, hs: "HomeServer"): self.state = hs.get_state_handler() self.transport_layer = hs.get_federation_transport_client() - self.hostname = hs.hostname + self.server_name = hs.hostname self.signing_key = hs.signing_key # Cache mapping `event_id` to a tuple of the event itself and the `pull_origin` @@ -207,7 +210,10 @@ async def make_query( Returns: The JSON object from the response """ - sent_queries_counter.labels(query_type).inc() + sent_queries_counter.labels( + type=query_type, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() return await self.transport_layer.make_query( destination, @@ -229,7 +235,10 @@ async def query_client_keys( Returns: The JSON object from the response """ - sent_queries_counter.labels("client_device_keys").inc() + sent_queries_counter.labels( + type="client_device_keys", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() return await self.transport_layer.query_client_keys( destination, content, timeout ) @@ -240,7 +249,10 @@ async def query_user_devices( """Query the device keys for a list of user ids hosted on a remote server. """ - sent_queries_counter.labels("user_devices").inc() + sent_queries_counter.labels( + type="user_devices", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() return await self.transport_layer.query_user_devices( destination, user_id, timeout ) @@ -262,7 +274,10 @@ async def claim_client_keys( Returns: The JSON object from the response """ - sent_queries_counter.labels("client_one_time_keys").inc() + sent_queries_counter.labels( + type="client_one_time_keys", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() # Convert the query with counts into a stable and unstable query and check # if attempting to claim more than 1 OTK. 
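
The same shape repeats throughout these "Fill in" patches: cache `hs.hostname` once in
`__init__`, then pass it to every `.labels(...)` call via dict-expansion, since
`SERVER_NAME_LABEL` is a variable and so cannot appear to the left of `=` as a plain
keyword argument. A condensed sketch with illustrative names:

    from prometheus_client import Counter

    from synapse.metrics import SERVER_NAME_LABEL

    example_queries_counter = Counter(
        "synapse_example_sent_queries",
        "Queries sent, by type and homeserver",
        labelnames=["type", SERVER_NAME_LABEL],
    )

    class ExampleClient:
        def __init__(self, server_name: str) -> None:
            # Cached once so every metric update can be labelled with our homeserver.
            self.server_name = server_name

        def record_query(self, query_type: str) -> None:
            example_queries_counter.labels(
                type=query_type, **{SERVER_NAME_LABEL: self.server_name}
            ).inc()
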
@@ -1068,7 +1083,7 @@ async def send_request(destination: str) -> Tuple[str, EventBase, RoomVersion]: # there's some we never care about ev = builder.create_local_event_from_event_dict( self._clock, - self.hostname, + self.server_name, self.signing_key, room_version=room_version, event_dict=pdu_dict, From cdbeb4fd723f6f5e56d05f1bbad57bae1c2277cf Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 13:46:14 -0500 Subject: [PATCH 13/48] Fill in `synapse/federation/federation_server.py` --- synapse/federation/federation_server.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/synapse/federation/federation_server.py b/synapse/federation/federation_server.py index 8718b704017..90ccc904917 100644 --- a/synapse/federation/federation_server.py +++ b/synapse/federation/federation_server.py @@ -82,6 +82,7 @@ tag_args, trace, ) +from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics.background_process_metrics import wrap_as_background_process from synapse.replication.http.federation import ( ReplicationFederationSendEduRestServlet, @@ -105,12 +106,18 @@ logger = logging.getLogger(__name__) -received_pdus_counter = Counter("synapse_federation_server_received_pdus", "") +received_pdus_counter = Counter( + "synapse_federation_server_received_pdus", "", labelnames=[SERVER_NAME_LABEL] +) -received_edus_counter = Counter("synapse_federation_server_received_edus", "") +received_edus_counter = Counter( + "synapse_federation_server_received_edus", "", labelnames=[SERVER_NAME_LABEL] +) received_queries_counter = Counter( - "synapse_federation_server_received_queries", "", ["type"] + "synapse_federation_server_received_queries", + "", + labelnames=["type", SERVER_NAME_LABEL], ) pdu_process_time = Histogram( @@ -424,7 +431,9 @@ async def _handle_pdus_in_txn( report back to the sending server. 
""" - received_pdus_counter.inc(len(transaction.pdus)) + received_pdus_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc( + len(transaction.pdus) + ) origin_host, _ = parse_server_name(origin) @@ -543,7 +552,7 @@ async def _handle_edus_in_txn(self, origin: str, transaction: Transaction) -> No """Process the EDUs in a received transaction.""" async def _process_edu(edu_dict: JsonDict) -> None: - received_edus_counter.inc() + received_edus_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc() edu = Edu( origin=origin, @@ -658,7 +667,10 @@ async def on_pdu_request( async def on_query_request( self, query_type: str, args: Dict[str, str] ) -> Tuple[int, Dict[str, Any]]: - received_queries_counter.labels(query_type).inc() + received_queries_counter.labels( + type=query_type, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() resp = await self.registry.on_query(query_type, args) return 200, resp From 5cd7f3e9b0b419fc72903c92edd9b459f466ef69 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 13:48:21 -0500 Subject: [PATCH 14/48] Fill in `synapse/federation/sender/__init__.py` --- synapse/federation/sender/__init__.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/synapse/federation/sender/__init__.py b/synapse/federation/sender/__init__.py index 2eef7b707d7..c695f5d494c 100644 --- a/synapse/federation/sender/__init__.py +++ b/synapse/federation/sender/__init__.py @@ -160,6 +160,7 @@ from synapse.federation.units import Edu from synapse.logging.context import make_deferred_yieldable, run_in_background from synapse.metrics import ( + SERVER_NAME_LABEL, LaterGauge, event_processing_loop_counter, event_processing_loop_room_count, @@ -189,11 +190,13 @@ sent_pdus_destination_dist_count = Counter( "synapse_federation_client_sent_pdu_destinations_count", "Number of PDUs queued for sending to one or more destinations", + labelnames=[SERVER_NAME_LABEL], ) sent_pdus_destination_dist_total = Counter( "synapse_federation_client_sent_pdu_destinations", "Total number of PDUs queued for sending across all destinations", + labelnames=[SERVER_NAME_LABEL], ) # Time (in s) to wait before trying to wake up destinations that have @@ -723,8 +726,12 @@ async def _send_pdu(self, pdu: EventBase, destinations: Iterable[str]) -> None: if not destinations: return - sent_pdus_destination_dist_total.inc(len(destinations)) - sent_pdus_destination_dist_count.inc() + sent_pdus_destination_dist_total.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc(len(destinations)) + sent_pdus_destination_dist_count.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc() assert pdu.internal_metadata.stream_ordering From 6847cc3800fc5e9d65fe6eab367830c92fe41880 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 13:50:18 -0500 Subject: [PATCH 15/48] Fill in `synapse/federation/sender/per_destination_queue.py` --- .../federation/sender/per_destination_queue.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/synapse/federation/sender/per_destination_queue.py b/synapse/federation/sender/per_destination_queue.py index b3f65e8237e..9ab95a988ea 100644 --- a/synapse/federation/sender/per_destination_queue.py +++ b/synapse/federation/sender/per_destination_queue.py @@ -40,7 +40,7 @@ from synapse.handlers.presence import format_user_presence_state from synapse.logging import issue9533_logger from synapse.logging.opentracing import SynapseTags, set_tag -from synapse.metrics import sent_transactions_counter +from synapse.metrics 
import SERVER_NAME_LABEL, sent_transactions_counter from synapse.metrics.background_process_metrics import run_as_background_process from synapse.types import JsonDict, ReadReceipt from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter @@ -56,13 +56,15 @@ sent_edus_counter = Counter( - "synapse_federation_client_sent_edus", "Total number of EDUs successfully sent" + "synapse_federation_client_sent_edus", + "Total number of EDUs successfully sent", + labelnames=[SERVER_NAME_LABEL], ) sent_edus_by_type = Counter( "synapse_federation_client_sent_edus_by_type", "Number of sent EDUs successfully sent, by event type", - ["type"], + labelnames=["type", SERVER_NAME_LABEL], ) @@ -361,9 +363,14 @@ async def _transaction_transmission_loop(self) -> None: ) sent_transactions_counter.inc() - sent_edus_counter.inc(len(pending_edus)) + sent_edus_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc(len(pending_edus)) for edu in pending_edus: - sent_edus_by_type.labels(edu.edu_type).inc() + sent_edus_by_type.labels( + type=edu.edu_type, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() except NotRetryingDestination as e: logger.debug( From 70547f39d45acfcd9d55fbeb787d571cd3e55ae6 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 13:52:47 -0500 Subject: [PATCH 16/48] Fill in `synapse/handlers/appservice.py` --- synapse/handlers/appservice.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/synapse/handlers/appservice.py b/synapse/handlers/appservice.py index f3bbdb5a05b..ec624e405d7 100644 --- a/synapse/handlers/appservice.py +++ b/synapse/handlers/appservice.py @@ -42,6 +42,7 @@ from synapse.handlers.presence import format_user_presence_state from synapse.logging.context import make_deferred_yieldable, run_in_background from synapse.metrics import ( + SERVER_NAME_LABEL, event_processing_loop_counter, event_processing_loop_room_count, ) @@ -68,11 +69,14 @@ logger = logging.getLogger(__name__) -events_processed_counter = Counter("synapse_handlers_appservice_events_processed", "") +events_processed_counter = Counter( + "synapse_handlers_appservice_events_processed", "", labelnames=[SERVER_NAME_LABEL] +) class ApplicationServicesHandler: def __init__(self, hs: "HomeServer"): + self.server_name = hs.hostname self.store = hs.get_datastores().main self.is_mine_id = hs.is_mine_id self.appservice_api = hs.get_application_service_api() @@ -200,7 +204,9 @@ async def handle_room_events(events: Iterable[EventBase]) -> None: "appservice_sender" ).set(upper_bound) - events_processed_counter.inc(len(events)) + events_processed_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc(len(events)) event_processing_loop_room_count.labels("appservice_sender").inc( len(events_by_room) From ecbcdeaaa2186c5a015c1a29b6f05943eacf7a4d Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 13:57:41 -0500 Subject: [PATCH 17/48] Fill in `synapse/handlers/auth.py` --- synapse/handlers/auth.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/synapse/handlers/auth.py b/synapse/handlers/auth.py index 9e3e70ec1c0..a802941340b 100644 --- a/synapse/handlers/auth.py +++ b/synapse/handlers/auth.py @@ -70,6 +70,7 @@ from synapse.http.server import finish_request, respond_with_html from synapse.http.site import SynapseRequest from synapse.logging.context import defer_to_thread +from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics.background_process_metrics import run_as_background_process 
from synapse.storage.databases.main.registration import ( LoginTokenExpired, @@ -95,7 +96,7 @@ invalid_login_token_counter = Counter( "synapse_user_login_invalid_login_tokens", "Counts the number of rejected m.login.token on /login", - ["reason"], + labelnames=["reason", SERVER_NAME_LABEL], ) @@ -198,6 +199,7 @@ class AuthHandler: SESSION_EXPIRE_MS = 48 * 60 * 60 * 1000 def __init__(self, hs: "HomeServer"): + self.server_name = hs.hostname self.store = hs.get_datastores().main self.auth = hs.get_auth() self.auth_blocking = hs.get_auth_blocking() @@ -270,8 +272,6 @@ def __init__(self, hs: "HomeServer"): hs.config.sso.sso_account_deactivated_template ) - self._server_name = hs.config.server.server_name - # cast to tuple for use with str.startswith self._whitelisted_sso_clients = tuple(hs.config.sso.sso_client_whitelist) @@ -1477,11 +1477,20 @@ async def consume_login_token(self, login_token: str) -> LoginTokenLookupResult: try: return await self.store.consume_login_token(login_token) except LoginTokenExpired: - invalid_login_token_counter.labels("expired").inc() + invalid_login_token_counter.labels( + reason="expired", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() except LoginTokenReused: - invalid_login_token_counter.labels("reused").inc() + invalid_login_token_counter.labels( + reason="reused", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() except NotFoundError: - invalid_login_token_counter.labels("not found").inc() + invalid_login_token_counter.labels( + reason="not found", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() raise AuthError(403, "Invalid login token", errcode=Codes.FORBIDDEN) @@ -1856,7 +1865,7 @@ async def complete_sso_login( html = self._sso_redirect_confirm_template.render( display_url=display_url, redirect_url=redirect_url, - server_name=self._server_name, + server_name=self.server_name, new_user=new_user, user_id=registered_user_id, user_profile=user_profile_data, From 6e014d327fa698865f0fa1a9575c639035767fbe Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 14:00:31 -0500 Subject: [PATCH 18/48] Fill in `synapse/handlers/federation_event.py` --- synapse/handlers/federation_event.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/synapse/handlers/federation_event.py b/synapse/handlers/federation_event.py index 1e738f484f9..cbdbd10cae4 100644 --- a/synapse/handlers/federation_event.py +++ b/synapse/handlers/federation_event.py @@ -76,6 +76,7 @@ tag_args, trace, ) +from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics.background_process_metrics import run_as_background_process from synapse.replication.http.devices import ( ReplicationMultiUserDevicesResyncRestServlet, @@ -108,6 +109,7 @@ soft_failed_event_counter = Counter( "synapse_federation_soft_failed_events_total", "Events received over federation that we marked as soft_failed", + labelnames=[SERVER_NAME_LABEL], ) # Added to debug performance and track progress on optimizations @@ -149,6 +151,7 @@ class FederationEventHandler: """ def __init__(self, hs: "HomeServer"): + self.server_name = hs.hostname self._clock = hs.get_clock() self._store = hs.get_datastores().main self._state_store = hs.get_datastores().state @@ -173,7 +176,6 @@ def __init__(self, hs: "HomeServer"): self._is_mine_id = hs.is_mine_id self._is_mine_server_name = hs.is_mine_server_name - self._server_name = hs.hostname self._instance_name = hs.get_instance_name() self._config = hs.config @@ -257,7 +259,7 @@ async def on_receive_pdu(self, origin: str, pdu: EventBase) -> None: # 
Note that if we were never in the room then we would have already # dropped the event, since we wouldn't know the room version. is_in_room = await self._event_auth_handler.is_host_in_room( - room_id, self._server_name + room_id, self.server_name ) if not is_in_room: logger.info( @@ -2062,7 +2064,9 @@ async def _check_for_soft_fail( "hs": origin, }, ) - soft_failed_event_counter.inc() + soft_failed_event_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc() event.internal_metadata.soft_failed = True async def _load_or_fetch_auth_events_for_event( From 1fadfedbf2853637a187a04f9b2b87bbc246e133 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 14:11:44 -0500 Subject: [PATCH 19/48] Fill in `synapse/handlers/presence.py` --- synapse/handlers/presence.py | 120 +++++++++++++++++++++++++------- tests/handlers/test_presence.py | 9 +++ 2 files changed, 102 insertions(+), 27 deletions(-) diff --git a/synapse/handlers/presence.py b/synapse/handlers/presence.py index 390cafa8f63..bbe81dbf4f3 100644 --- a/synapse/handlers/presence.py +++ b/synapse/handlers/presence.py @@ -105,7 +105,7 @@ from synapse.appservice import ApplicationService from synapse.events.presence_router import PresenceRouter from synapse.logging.context import run_in_background -from synapse.metrics import LaterGauge +from synapse.metrics import SERVER_NAME_LABEL, LaterGauge from synapse.metrics.background_process_metrics import ( run_as_background_process, wrap_as_background_process, @@ -137,24 +137,40 @@ logger = logging.getLogger(__name__) -notified_presence_counter = Counter("synapse_handler_presence_notified_presence", "") +notified_presence_counter = Counter( + "synapse_handler_presence_notified_presence", "", labelnames=[SERVER_NAME_LABEL] +) federation_presence_out_counter = Counter( - "synapse_handler_presence_federation_presence_out", "" + "synapse_handler_presence_federation_presence_out", + "", + labelnames=[SERVER_NAME_LABEL], +) +presence_updates_counter = Counter( + "synapse_handler_presence_presence_updates", "", labelnames=[SERVER_NAME_LABEL] +) +timers_fired_counter = Counter( + "synapse_handler_presence_timers_fired", "", labelnames=[SERVER_NAME_LABEL] ) -presence_updates_counter = Counter("synapse_handler_presence_presence_updates", "") -timers_fired_counter = Counter("synapse_handler_presence_timers_fired", "") federation_presence_counter = Counter( - "synapse_handler_presence_federation_presence", "" + "synapse_handler_presence_federation_presence", "", labelnames=[SERVER_NAME_LABEL] +) +bump_active_time_counter = Counter( + "synapse_handler_presence_bump_active_time", "", labelnames=[SERVER_NAME_LABEL] ) -bump_active_time_counter = Counter("synapse_handler_presence_bump_active_time", "") -get_updates_counter = Counter("synapse_handler_presence_get_updates", "", ["type"]) +get_updates_counter = Counter( + "synapse_handler_presence_get_updates", "", labelnames=["type", SERVER_NAME_LABEL] +) notify_reason_counter = Counter( - "synapse_handler_presence_notify_reason", "", ["locality", "reason"] + "synapse_handler_presence_notify_reason", + "", + labelnames=["locality", "reason", SERVER_NAME_LABEL], ) state_transition_counter = Counter( - "synapse_handler_presence_state_transition", "", ["locality", "from", "to"] + "synapse_handler_presence_state_transition", + "", + labelnames=["locality", "from", "to", SERVER_NAME_LABEL], ) # If a user was last active in the last LAST_ACTIVE_GRANULARITY, consider them @@ -484,6 +500,7 @@ def __exit__( class WorkerPresenceHandler(BasePresenceHandler): 
def __init__(self, hs: "HomeServer"): super().__init__(hs) + self.server_name = hs.hostname self._presence_writer_instance = hs.config.worker.writers.presence[0] # Route presence EDUs to the right worker @@ -666,7 +683,9 @@ async def process_replication_rows( old_state = self.user_to_current_state.get(new_state.user_id) self.user_to_current_state[new_state.user_id] = new_state is_mine = self.is_mine_id(new_state.user_id) - if not old_state or should_notify(old_state, new_state, is_mine): + if not old_state or should_notify( + old_state, new_state, is_mine, self.server_name + ): state_to_notify.append(new_state) stream_id = token @@ -747,6 +766,7 @@ async def bump_presence_active_time( class PresenceHandler(BasePresenceHandler): def __init__(self, hs: "HomeServer"): super().__init__(hs) + self.server_name = hs.hostname self.wheel_timer: WheelTimer[str] = WheelTimer() self.notifier = hs.get_notifier() @@ -969,6 +989,7 @@ async def _update_states( prev_state, new_state, is_mine=self.is_mine_id(user_id), + our_server_name=self.server_name, wheel_timer=self.wheel_timer, now=now, # When overriding disabled presence, don't kick off all the @@ -988,10 +1009,14 @@ async def _update_states( # TODO: We should probably ensure there are no races hereafter - presence_updates_counter.inc(len(new_states)) + presence_updates_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc(len(new_states)) if to_notify: - notified_presence_counter.inc(len(to_notify)) + notified_presence_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc(len(to_notify)) await self._persist_and_notify(list(to_notify.values())) self.unpersisted_users_changes |= {s.user_id for s in new_states} @@ -1010,7 +1035,9 @@ async def _update_states( if user_id not in to_notify } if to_federation_ping: - federation_presence_out_counter.inc(len(to_federation_ping)) + federation_presence_out_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc(len(to_federation_ping)) hosts_to_states = await get_interested_remotes( self.store, @@ -1060,7 +1087,9 @@ async def _handle_timeouts(self) -> None: for user_id in users_to_check ] - timers_fired_counter.inc(len(states)) + timers_fired_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc( + len(states) + ) # Set of user ID & device IDs which are currently syncing. 
syncing_user_devices = { @@ -1094,7 +1123,7 @@ async def bump_presence_active_time( user_id = user.to_string() - bump_active_time_counter.inc() + bump_active_time_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc() now = self.clock.time_msec() @@ -1346,7 +1375,9 @@ async def incoming_presence(self, origin: str, content: JsonDict) -> None: updates.append(prev_state.copy_and_replace(**new_fields)) if updates: - federation_presence_counter.inc(len(updates)) + federation_presence_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc(len(updates)) await self._update_states(updates) async def set_state( @@ -1655,7 +1686,10 @@ async def _handle_state_delta(self, room_id: str, deltas: List[StateDelta]) -> N def should_notify( - old_state: UserPresenceState, new_state: UserPresenceState, is_mine: bool + old_state: UserPresenceState, + new_state: UserPresenceState, + is_mine: bool, + our_server_name: str, ) -> bool: """Decides if a presence state change should be sent to interested parties.""" user_location = "remote" @@ -1666,19 +1700,38 @@ def should_notify( return False if old_state.status_msg != new_state.status_msg: - notify_reason_counter.labels(user_location, "status_msg_change").inc() + notify_reason_counter.labels( + locality=user_location, + reason="status_msg_change", + **{SERVER_NAME_LABEL: our_server_name}, + ).inc() return True if old_state.state != new_state.state: - notify_reason_counter.labels(user_location, "state_change").inc() + notify_reason_counter.labels( + locality=user_location, + reason="state_change", + **{SERVER_NAME_LABEL: our_server_name}, + ).inc() state_transition_counter.labels( - user_location, old_state.state, new_state.state + **{ + "locality": user_location, + # `from` is a reserved word in Python so we have to label it this way if + # we want to use keyword args. + "from": old_state.state, + "to": new_state.state, + SERVER_NAME_LABEL: our_server_name, + }, ).inc() return True if old_state.state == PresenceState.ONLINE: if new_state.currently_active != old_state.currently_active: - notify_reason_counter.labels(user_location, "current_active_change").inc() + notify_reason_counter.labels( + locality=user_location, + reason="current_active_change", + **{SERVER_NAME_LABEL: our_server_name}, + ).inc() return True if ( @@ -1688,14 +1741,18 @@ def should_notify( # Only notify about last active bumps if we're not currently active if not new_state.currently_active: notify_reason_counter.labels( - user_location, "last_active_change_online" + locality=user_location, + reason="last_active_change_online", + **{SERVER_NAME_LABEL: our_server_name}, ).inc() return True elif new_state.last_active_ts - old_state.last_active_ts > LAST_ACTIVE_GRANULARITY: # Always notify for a transition where last active gets bumped. notify_reason_counter.labels( - user_location, "last_active_change_not_online" + locality=user_location, + reason="last_active_change_not_online", + **{SERVER_NAME_LABEL: our_server_name}, ).inc() return True @@ -1761,6 +1818,7 @@ def __init__(self, hs: "HomeServer"): # AuthHandler -> Notifier -> PresenceEventSource -> ModuleApi -> AuthHandler self.get_presence_handler = hs.get_presence_handler self.get_presence_router = hs.get_presence_router + self.server_name = hs.hostname self.clock = hs.get_clock() self.store = hs.get_datastores().main @@ -1870,7 +1928,10 @@ async def get_new_events( # If we have the full list of changes for presence we can # simply check which ones share a room with the user. 
-            get_updates_counter.labels("stream").inc()
+            get_updates_counter.labels(
+                type="stream",
+                **{SERVER_NAME_LABEL: self.server_name},
+            ).inc()
 
             sharing_users = await self.store.do_users_share_a_room(
                 user_id, updated_users
@@ -1883,7 +1944,10 @@ async def get_new_events(
         else:
             # Too many possible updates. Find all users we can see and check
             # if any of them have changed.
-            get_updates_counter.labels("full").inc()
+            get_updates_counter.labels(
+                type="full",
+                **{SERVER_NAME_LABEL: self.server_name},
+            ).inc()
 
             users_interested_in = (
                 await self.store.get_users_who_share_room_with_user(user_id)
@@ -2133,6 +2197,7 @@ def handle_update(
     prev_state: UserPresenceState,
     new_state: UserPresenceState,
     is_mine: bool,
+    our_server_name: str,
     wheel_timer: WheelTimer,
     now: int,
     persist: bool,
@@ -2145,6 +2210,7 @@ def handle_update(
         prev_state
         new_state
         is_mine: Whether the user is ours
+        our_server_name: The homeserver name of our server (`hs.hostname`)
         wheel_timer
         now: Time now in ms
         persist: True if this state should persist until another update occurs.
@@ -2213,7 +2279,7 @@ def handle_update(
     )
 
     # Check whether the change was something worth notifying about
-    if should_notify(prev_state, new_state, is_mine):
+    if should_notify(prev_state, new_state, is_mine, our_server_name):
         new_state = new_state.copy_and_replace(last_federation_update_ts=now)
         persist_and_notify = True
 
diff --git a/tests/handlers/test_presence.py b/tests/handlers/test_presence.py
index 6b7bf112c21..7ee411cfd94 100644
--- a/tests/handlers/test_presence.py
+++ b/tests/handlers/test_presence.py
@@ -90,6 +90,7 @@ def test_offline_to_online(self) -> None:
             prev_state,
             new_state,
             is_mine=True,
+            our_server_name=self.hs.hostname,
             wheel_timer=wheel_timer,
             now=now,
             persist=False,
@@ -137,6 +138,7 @@ def test_online_to_online(self) -> None:
             prev_state,
             new_state,
             is_mine=True,
+            our_server_name=self.hs.hostname,
             wheel_timer=wheel_timer,
             now=now,
             persist=False,
@@ -187,6 +189,7 @@ def test_online_to_online_last_active_noop(self) -> None:
             prev_state,
             new_state,
             is_mine=True,
+            our_server_name=self.hs.hostname,
             wheel_timer=wheel_timer,
             now=now,
             persist=False,
@@ -235,6 +238,7 @@ def test_online_to_online_last_active(self) -> None:
             prev_state,
             new_state,
             is_mine=True,
+            our_server_name=self.hs.hostname,
             wheel_timer=wheel_timer,
             now=now,
             persist=False,
@@ -275,6 +279,7 @@ def test_remote_ping_timer(self) -> None:
             prev_state,
             new_state,
             is_mine=False,
+            our_server_name=self.hs.hostname,
             wheel_timer=wheel_timer,
             now=now,
             persist=False,
@@ -314,6 +319,7 @@ def test_online_to_offline(self) -> None:
             prev_state,
             new_state,
             is_mine=True,
+            our_server_name=self.hs.hostname,
             wheel_timer=wheel_timer,
             now=now,
             persist=False,
@@ -341,6 +347,7 @@ def test_online_to_idle(self) -> None:
             prev_state,
             new_state,
             is_mine=True,
+            our_server_name=self.hs.hostname,
             wheel_timer=wheel_timer,
             now=now,
             persist=False,
@@ -431,6 +438,7 @@ def test_override(self, initial_state: str, final_state: str) -> None:
             prev_state,
             new_state,
             is_mine=True,
+            our_server_name=self.hs.hostname,
             wheel_timer=wheel_timer,
             now=now,
             persist=True,
@@ -494,6 +502,7 @@ def _test_ratelimit_offline_to_online_to_unavailable(
             prev_state,
             new_state,
             is_mine=True,
+            our_server_name=self.hs.hostname,
             wheel_timer=wheel_timer,
             now=now,
             persist=False,

From a6a686322230b146a20255ced65091cb992810f1 Mon Sep 17 00:00:00 2001
From: Eric Eastwood
Date: Fri, 4 Jul 2025 14:18:57 -0500
Subject: [PATCH 20/48] Fill in `synapse/handlers/register.py`

---
 synapse/handlers/register.py | 40 
++++++++++++++++++++++++------------ synapse/handlers/sso.py | 12 ++++++----- 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/synapse/handlers/register.py b/synapse/handlers/register.py index 970013ef20c..342740d3cf8 100644 --- a/synapse/handlers/register.py +++ b/synapse/handlers/register.py @@ -46,6 +46,7 @@ from synapse.config.server import is_threepid_reserved from synapse.handlers.device import DeviceHandler from synapse.http.servlet import assert_params_in_dict +from synapse.metrics import SERVER_NAME_LABEL from synapse.replication.http.login import RegisterDeviceReplicationServlet from synapse.replication.http.register import ( ReplicationPostRegisterActionsServlet, @@ -64,29 +65,38 @@ registration_counter = Counter( "synapse_user_registrations_total", "Number of new users registered (since restart)", - ["guest", "shadow_banned", "auth_provider"], + labelnames=["guest", "shadow_banned", "auth_provider", SERVER_NAME_LABEL], ) login_counter = Counter( "synapse_user_logins_total", "Number of user logins (since restart)", - ["guest", "auth_provider"], + labelnames=["guest", "auth_provider", SERVER_NAME_LABEL], ) -def init_counters_for_auth_provider(auth_provider_id: str) -> None: +def init_counters_for_auth_provider(auth_provider_id: str, server_name: str) -> None: """Ensure the prometheus counters for the given auth provider are initialised This fixes a problem where the counters are not reported for a given auth provider until the user first logs in/registers. + + Args: + auth_provider_id: The ID of the auth provider to initialise counters for. + server_name: Our server name (used to label metrics) (this should be `hs.hostname`). """ for is_guest in (True, False): - login_counter.labels(guest=is_guest, auth_provider=auth_provider_id) + login_counter.labels( + guest=is_guest, + auth_provider=auth_provider_id, + **{SERVER_NAME_LABEL: server_name}, + ) for shadow_banned in (True, False): registration_counter.labels( guest=is_guest, shadow_banned=shadow_banned, auth_provider=auth_provider_id, + **{SERVER_NAME_LABEL: server_name}, ) @@ -99,6 +109,7 @@ class LoginDict(TypedDict): class RegistrationHandler: def __init__(self, hs: "HomeServer"): + self.server_name = hs.hostname self.store = hs.get_datastores().main self._storage_controllers = hs.get_storage_controllers() self.clock = hs.get_clock() @@ -114,7 +125,6 @@ def __init__(self, hs: "HomeServer"): self._account_validity_handler = hs.get_account_validity_handler() self._user_consent_version = self.hs.config.consent.user_consent_version self._server_notices_mxid = hs.config.servernotices.server_notices_mxid - self._server_name = hs.hostname self._user_types_config = hs.config.user_types self._spam_checker_module_callbacks = hs.get_module_api_callbacks().spam_checker @@ -141,7 +151,9 @@ def __init__(self, hs: "HomeServer"): ) self.refresh_token_lifetime = hs.config.registration.refresh_token_lifetime - init_counters_for_auth_provider("") + init_counters_for_auth_provider( + auth_provider_id="", server_name=self.server_name + ) async def check_username( self, @@ -365,6 +377,7 @@ async def register_user( guest=make_guest, shadow_banned=shadow_banned, auth_provider=(auth_provider_id or ""), + **{SERVER_NAME_LABEL: self.server_name}, ).inc() # If the user does not need to consent at registration, auto-join any @@ -425,7 +438,7 @@ async def _create_and_join_rooms(self, user_id: str) -> None: if self.hs.config.registration.auto_join_user_id: fake_requester = create_requester( self.hs.config.registration.auto_join_user_id, - 
authenticated_entity=self._server_name, + authenticated_entity=self.server_name, ) # If the room requires an invite, add the user to the list of invites. @@ -438,7 +451,7 @@ async def _create_and_join_rooms(self, user_id: str) -> None: requires_join = True else: fake_requester = create_requester( - user_id, authenticated_entity=self._server_name + user_id, authenticated_entity=self.server_name ) # Choose whether to federate the new room. @@ -470,7 +483,7 @@ async def _create_and_join_rooms(self, user_id: str) -> None: await room_member_handler.update_membership( requester=create_requester( - user_id, authenticated_entity=self._server_name + user_id, authenticated_entity=self.server_name ), target=UserID.from_string(user_id), room_id=room_id, @@ -496,7 +509,7 @@ async def _create_and_join_rooms(self, user_id: str) -> None: if requires_join: await room_member_handler.update_membership( requester=create_requester( - user_id, authenticated_entity=self._server_name + user_id, authenticated_entity=self.server_name ), target=UserID.from_string(user_id), room_id=room_id, @@ -542,7 +555,7 @@ async def _join_rooms(self, user_id: str) -> None: # we don't have a local user in the room to craft up an invite with. requires_invite = await self.store.is_host_joined( room_id, - self._server_name, + self.server_name, ) if requires_invite: @@ -570,7 +583,7 @@ async def _join_rooms(self, user_id: str) -> None: await room_member_handler.update_membership( requester=create_requester( self.hs.config.registration.auto_join_user_id, - authenticated_entity=self._server_name, + authenticated_entity=self.server_name, ), target=UserID.from_string(user_id), room_id=room_id, @@ -582,7 +595,7 @@ async def _join_rooms(self, user_id: str) -> None: # Send the join. await room_member_handler.update_membership( requester=create_requester( - user_id, authenticated_entity=self._server_name + user_id, authenticated_entity=self.server_name ), target=UserID.from_string(user_id), room_id=room_id, @@ -810,6 +823,7 @@ async def register_device( login_counter.labels( guest=is_guest, auth_provider=(auth_provider_id or ""), + **{SERVER_NAME_LABEL: self.server_name}, ).inc() return ( diff --git a/synapse/handlers/sso.py b/synapse/handlers/sso.py index 7c5cf91aba5..53225b95f77 100644 --- a/synapse/handlers/sso.py +++ b/synapse/handlers/sso.py @@ -203,7 +203,7 @@ class SsoHandler: def __init__(self, hs: "HomeServer"): self._clock = hs.get_clock() self._store = hs.get_datastores().main - self._server_name = hs.hostname + self.server_name = hs.hostname self._is_mine_server_name = hs.is_mine_server_name self._registration_handler = hs.get_registration_handler() self._auth_handler = hs.get_auth_handler() @@ -239,7 +239,9 @@ def register_identity_provider(self, p: SsoIdentityProvider) -> None: p_id = p.idp_id assert p_id not in self._identity_providers self._identity_providers[p_id] = p - init_counters_for_auth_provider(p_id) + init_counters_for_auth_provider( + auth_provider_id=p_id, server_name=self.server_name + ) def get_identity_providers(self) -> Mapping[str, SsoIdentityProvider]: """Get the configured identity providers""" @@ -570,7 +572,7 @@ async def _call_attribute_mapper( return attributes # Check if this mxid already exists - user_id = UserID(attributes.localpart, self._server_name).to_string() + user_id = UserID(attributes.localpart, self.server_name).to_string() if not await self._store.get_users_by_id_case_insensitive(user_id): # This mxid is free break @@ -908,7 +910,7 @@ async def complete_sso_ui_auth_request( # render an error 
page. html = self._bad_user_template.render( - server_name=self._server_name, + server_name=self.server_name, user_id_to_verify=user_id_to_verify, ) respond_with_html(request, 200, html) @@ -960,7 +962,7 @@ async def check_username_availability( if contains_invalid_mxid_characters(localpart): raise SynapseError(400, "localpart is invalid: %s" % (localpart,)) - user_id = UserID(localpart, self._server_name).to_string() + user_id = UserID(localpart, self.server_name).to_string() user_infos = await self._store.get_users_by_id_case_insensitive(user_id) logger.info("[session %s] users: %s", session_id, user_infos) From e81be25436861b8e6233dbbe8b9453eb3f8a778d Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 14:20:31 -0500 Subject: [PATCH 21/48] Fill in `synapse/handlers/sync.py` --- synapse/handlers/sync.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/synapse/handlers/sync.py b/synapse/handlers/sync.py index a400e63fd56..4d1b32b562c 100644 --- a/synapse/handlers/sync.py +++ b/synapse/handlers/sync.py @@ -63,6 +63,7 @@ start_active_span, trace, ) +from synapse.metrics import SERVER_NAME_LABEL from synapse.storage.databases.main.event_push_actions import RoomNotifCounts from synapse.storage.databases.main.roommember import extract_heroes_from_room_summary from synapse.storage.databases.main.stream import PaginateFunction @@ -104,7 +105,7 @@ "Count of non empty sync responses. type is initial_sync/full_state_sync" "/incremental_sync. lazy_loaded indicates if lazy loaded members were " "enabled for that request.", - ["type", "lazy_loaded"], + labelnames=["type", "lazy_loaded", SERVER_NAME_LABEL], ) # Store the cache that tracks which lazy-loaded members have been sent to a given @@ -329,6 +330,7 @@ class E2eeSyncResult: class SyncHandler: def __init__(self, hs: "HomeServer"): + self.server_name = hs.hostname self.hs_config = hs.config self.store = hs.get_datastores().main self.notifier = hs.get_notifier() @@ -611,7 +613,11 @@ async def current_sync_callback( lazy_loaded = "true" else: lazy_loaded = "false" - non_empty_sync_counter.labels(sync_label, lazy_loaded).inc() + non_empty_sync_counter.labels( + type=sync_label, + lazy_loaded=lazy_loaded, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() return result From d42e742d37b4cf608c14cbdd79c68ce2c5a3e141 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 14:24:51 -0500 Subject: [PATCH 22/48] Fill in `synapse/http/client.py` --- synapse/http/client.py | 43 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/synapse/http/client.py b/synapse/http/client.py index 928bfb228a7..e8078b0a257 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -85,6 +85,7 @@ from synapse.http.types import QueryParams from synapse.logging.context import make_deferred_yieldable, run_in_background from synapse.logging.opentracing import set_tag, start_active_span, tags +from synapse.metrics import SERVER_NAME_LABEL from synapse.types import ISynapseReactor, StrSequence from synapse.util import json_decoder from synapse.util.async_helpers import timeout_deferred @@ -108,9 +109,13 @@ logger = logging.getLogger(__name__) -outgoing_requests_counter = Counter("synapse_http_client_requests", "", ["method"]) +outgoing_requests_counter = Counter( + "synapse_http_client_requests", "", labelnames=["method", SERVER_NAME_LABEL] +) incoming_responses_counter = Counter( - "synapse_http_client_responses", "", ["method", "code"] + 
"synapse_http_client_responses", + "", + labelnames=["method", "code", SERVER_NAME_LABEL], ) # the type of the headers map, to be passed to the t.w.h.Headers. @@ -346,6 +351,7 @@ def __init__( treq_args: Optional[Dict[str, Any]] = None, ): self.hs = hs + self.server_name = hs.hostname self.reactor = hs.get_reactor() self._extra_treq_args = treq_args or {} @@ -384,7 +390,9 @@ async def request( RequestTimedOutError if the request times out before the headers are read """ - outgoing_requests_counter.labels(method).inc() + outgoing_requests_counter.labels( + method=method, **{SERVER_NAME_LABEL: self.server_name} + ).inc() # log request but strip `access_token` (AS requests for example include this) logger.debug("Sending request %s %s", method, redact_uri(uri)) @@ -438,7 +446,11 @@ async def request( response = await make_deferred_yieldable(request_deferred) - incoming_responses_counter.labels(method, response.code).inc() + incoming_responses_counter.labels( + method=method, + code=response.code, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() logger.info( "Received response to %s %s: %s", method, @@ -447,7 +459,11 @@ async def request( ) return response except Exception as e: - incoming_responses_counter.labels(method, "ERR").inc() + incoming_responses_counter.labels( + method=method, + code="ERR", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() logger.info( "Error sending request to %s %s: %s %s", method, @@ -855,6 +871,7 @@ def __init__( hs: The HomeServer instance to pass in """ super().__init__(hs) + self.server_name = hs.hostname # Use a pool, but a very small one. pool = HTTPConnectionPool(self.reactor) @@ -891,7 +908,9 @@ async def request( RequestTimedOutError if the request times out before the headers are read """ - outgoing_requests_counter.labels(method).inc() + outgoing_requests_counter.labels( + method=method, **{SERVER_NAME_LABEL: self.server_name} + ).inc() logger.debug("Sending request %s %s", method, uri) @@ -948,7 +967,11 @@ async def request( response = await make_deferred_yieldable(request_deferred) - incoming_responses_counter.labels(method, response.code).inc() + incoming_responses_counter.labels( + method=method, + code=response.code, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() logger.info( "Received response to %s %s: %s", method, @@ -957,7 +980,11 @@ async def request( ) return response except Exception as e: - incoming_responses_counter.labels(method, "ERR").inc() + incoming_responses_counter.labels( + method=method, + code="ERR", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() logger.info( "Error sending request to %s %s: %s %s", method, From f197c92c90f8add397848ea1b28e4c07c8e411c5 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 14:28:05 -0500 Subject: [PATCH 23/48] Fill in `synapse/http/matrixfederationclient.py` --- synapse/http/matrixfederationclient.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/synapse/http/matrixfederationclient.py b/synapse/http/matrixfederationclient.py index 97a863a1189..2ab1e2ae3ec 100644 --- a/synapse/http/matrixfederationclient.py +++ b/synapse/http/matrixfederationclient.py @@ -87,6 +87,7 @@ from synapse.logging import opentracing from synapse.logging.context import make_deferred_yieldable, run_in_background from synapse.logging.opentracing import set_tag, start_active_span, tags +from synapse.metrics import SERVER_NAME_LABEL from synapse.types import JsonDict from synapse.util import json_decoder from synapse.util.async_helpers import 
AwakenableSleeper, Linearizer, timeout_deferred @@ -99,10 +100,14 @@ logger = logging.getLogger(__name__) outgoing_requests_counter = Counter( - "synapse_http_matrixfederationclient_requests", "", ["method"] + "synapse_http_matrixfederationclient_requests", + "", + labelnames=["method", SERVER_NAME_LABEL], ) incoming_responses_counter = Counter( - "synapse_http_matrixfederationclient_responses", "", ["method", "code"] + "synapse_http_matrixfederationclient_responses", + "", + labelnames=["method", "code", SERVER_NAME_LABEL], ) @@ -694,7 +699,9 @@ async def _send_request( _sec_timeout, ) - outgoing_requests_counter.labels(request.method).inc() + outgoing_requests_counter.labels( + method=request.method, **{SERVER_NAME_LABEL: self.server_name} + ).inc() try: with Measure(self.clock, "outbound_request"): @@ -729,7 +736,9 @@ async def _send_request( raise RequestSendFailed(e, can_retry=True) from e incoming_responses_counter.labels( - request.method, response.code + method=request.method, + code=response.code, + **{SERVER_NAME_LABEL: self.server_name}, ).inc() set_tag(tags.HTTP_STATUS_CODE, response.code) From c28d4bcf96650a221622ffd0cdcb5aad9bd4ec26 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 15:05:42 -0500 Subject: [PATCH 24/48] Fill in `synapse/http/request_metrics.py` --- synapse/http/request_metrics.py | 109 ++++++++++++------ synapse/http/site.py | 12 +- tests/handlers/test_oauth_delegation.py | 4 +- tests/logging/test_terse_json.py | 6 +- tests/rest/key/v2/test_remote_key_resource.py | 4 +- 5 files changed, 93 insertions(+), 42 deletions(-) diff --git a/synapse/http/request_metrics.py b/synapse/http/request_metrics.py index 366f06eb809..799b0cb4c77 100644 --- a/synapse/http/request_metrics.py +++ b/synapse/http/request_metrics.py @@ -27,40 +27,52 @@ from prometheus_client.core import Counter, Histogram from synapse.logging.context import current_context -from synapse.metrics import LaterGauge +from synapse.metrics import SERVER_NAME_LABEL, LaterGauge logger = logging.getLogger(__name__) # total number of responses served, split by method/servlet/tag response_count = Counter( - "synapse_http_server_response_count", "", ["method", "servlet", "tag"] + "synapse_http_server_response_count", + "", + labelnames=["method", "servlet", "tag", SERVER_NAME_LABEL], ) requests_counter = Counter( - "synapse_http_server_requests_received", "", ["method", "servlet"] + "synapse_http_server_requests_received", + "", + labelnames=["method", "servlet", SERVER_NAME_LABEL], ) outgoing_responses_counter = Counter( - "synapse_http_server_responses", "", ["method", "code"] + "synapse_http_server_responses", + "", + labelnames=["method", "code", SERVER_NAME_LABEL], ) response_timer = Histogram( "synapse_http_server_response_time_seconds", "sec", - ["method", "servlet", "tag", "code"], + labelnames=["method", "servlet", "tag", "code", SERVER_NAME_LABEL], ) response_ru_utime = Counter( - "synapse_http_server_response_ru_utime_seconds", "sec", ["method", "servlet", "tag"] + "synapse_http_server_response_ru_utime_seconds", + "sec", + labelnames=["method", "servlet", "tag", SERVER_NAME_LABEL], ) response_ru_stime = Counter( - "synapse_http_server_response_ru_stime_seconds", "sec", ["method", "servlet", "tag"] + "synapse_http_server_response_ru_stime_seconds", + "sec", + labelnames=["method", "servlet", "tag", SERVER_NAME_LABEL], ) response_db_txn_count = Counter( - "synapse_http_server_response_db_txn_count", "", ["method", "servlet", "tag"] + "synapse_http_server_response_db_txn_count", + "", + 
labelnames=["method", "servlet", "tag", SERVER_NAME_LABEL], ) # seconds spent waiting for db txns, excluding scheduling time, when processing @@ -68,34 +80,42 @@ response_db_txn_duration = Counter( "synapse_http_server_response_db_txn_duration_seconds", "", - ["method", "servlet", "tag"], + labelnames=["method", "servlet", "tag", SERVER_NAME_LABEL], ) # seconds spent waiting for a db connection, when processing this request response_db_sched_duration = Counter( "synapse_http_server_response_db_sched_duration_seconds", "", - ["method", "servlet", "tag"], + labelnames=["method", "servlet", "tag", SERVER_NAME_LABEL], ) # size in bytes of the response written response_size = Counter( - "synapse_http_server_response_size", "", ["method", "servlet", "tag"] + "synapse_http_server_response_size", + "", + labelnames=["method", "servlet", "tag", SERVER_NAME_LABEL], ) # In flight metrics are incremented while the requests are in flight, rather # than when the response was written. in_flight_requests_ru_utime = Counter( - "synapse_http_server_in_flight_requests_ru_utime_seconds", "", ["method", "servlet"] + "synapse_http_server_in_flight_requests_ru_utime_seconds", + "", + labelnames=["method", "servlet", SERVER_NAME_LABEL], ) in_flight_requests_ru_stime = Counter( - "synapse_http_server_in_flight_requests_ru_stime_seconds", "", ["method", "servlet"] + "synapse_http_server_in_flight_requests_ru_stime_seconds", + "", + labelnames=["method", "servlet", SERVER_NAME_LABEL], ) in_flight_requests_db_txn_count = Counter( - "synapse_http_server_in_flight_requests_db_txn_count", "", ["method", "servlet"] + "synapse_http_server_in_flight_requests_db_txn_count", + "", + labelnames=["method", "servlet", SERVER_NAME_LABEL], ) # seconds spent waiting for db txns, excluding scheduling time, when processing @@ -103,14 +123,14 @@ in_flight_requests_db_txn_duration = Counter( "synapse_http_server_in_flight_requests_db_txn_duration_seconds", "", - ["method", "servlet"], + labelnames=["method", "servlet", SERVER_NAME_LABEL], ) # seconds spent waiting for a db connection, when processing this request in_flight_requests_db_sched_duration = Counter( "synapse_http_server_in_flight_requests_db_sched_duration_seconds", "", - ["method", "servlet"], + labelnames=["method", "servlet", SERVER_NAME_LABEL], ) _in_flight_requests: Set["RequestMetrics"] = set() @@ -149,6 +169,13 @@ def _get_in_flight_counts() -> Mapping[Tuple[str, ...], int]: class RequestMetrics: + def __init__(self, our_server_name: str) -> None: + """ + Args: + our_server_name: Our homeserver name (used to label metrics) (`hs.hostname`) + """ + self.our_server_name = our_server_name + def start(self, time_sec: float, name: str, method: str) -> None: self.start_ts = time_sec self.start_context = current_context() @@ -194,33 +221,39 @@ def stop(self, time_sec: float, response_code: int, sent_bytes: int) -> None: response_code_str = str(response_code) - outgoing_responses_counter.labels(self.method, response_code_str).inc() + outgoing_responses_counter.labels( + method=self.method, + code=response_code_str, + **{SERVER_NAME_LABEL: self.our_server_name}, + ).inc() + + response_base_labels = { + "method": self.method, + "servlet": self.name, + "tag": tag, + SERVER_NAME_LABEL: self.our_server_name, + } - response_count.labels(self.method, self.name, tag).inc() + response_count.labels(**response_base_labels).inc() - response_timer.labels(self.method, self.name, tag, response_code_str).observe( + response_timer.labels(code=response_code_str, 
**response_base_labels).observe( time_sec - self.start_ts ) resource_usage = context.get_resource_usage() - response_ru_utime.labels(self.method, self.name, tag).inc( - resource_usage.ru_utime - ) - response_ru_stime.labels(self.method, self.name, tag).inc( - resource_usage.ru_stime - ) - response_db_txn_count.labels(self.method, self.name, tag).inc( + response_ru_utime.labels(**response_base_labels).inc(resource_usage.ru_utime) + response_ru_stime.labels(**response_base_labels).inc(resource_usage.ru_stime) + response_db_txn_count.labels(**response_base_labels).inc( resource_usage.db_txn_count ) - response_db_txn_duration.labels(self.method, self.name, tag).inc( + response_db_txn_duration.labels(**response_base_labels).inc( resource_usage.db_txn_duration_sec ) - response_db_sched_duration.labels(self.method, self.name, tag).inc( + response_db_sched_duration.labels(**response_base_labels).inc( resource_usage.db_sched_duration_sec ) - - response_size.labels(self.method, self.name, tag).inc(sent_bytes) + response_size.labels(**response_base_labels).inc(sent_bytes) # We always call this at the end to ensure that we update the metrics # regardless of whether a call to /metrics while the request was in @@ -240,24 +273,30 @@ def update_metrics(self) -> None: diff = new_stats - self._request_stats self._request_stats = new_stats + in_flight_labels = { + "method": self.method, + "servlet": self.name, + SERVER_NAME_LABEL: self.our_server_name, + } + # max() is used since rapid use of ru_stime/ru_utime can end up with the # count going backwards due to NTP, time smearing, fine-grained # correction, or floating points. Who knows, really? - in_flight_requests_ru_utime.labels(self.method, self.name).inc( + in_flight_requests_ru_utime.labels(**in_flight_labels).inc( max(diff.ru_utime, 0) ) - in_flight_requests_ru_stime.labels(self.method, self.name).inc( + in_flight_requests_ru_stime.labels(**in_flight_labels).inc( max(diff.ru_stime, 0) ) - in_flight_requests_db_txn_count.labels(self.method, self.name).inc( + in_flight_requests_db_txn_count.labels(**in_flight_labels).inc( diff.db_txn_count ) - in_flight_requests_db_txn_duration.labels(self.method, self.name).inc( + in_flight_requests_db_txn_duration.labels(**in_flight_labels).inc( diff.db_txn_duration_sec ) - in_flight_requests_db_sched_duration.labels(self.method, self.name).inc( + in_flight_requests_db_sched_duration.labels(**in_flight_labels).inc( diff.db_sched_duration_sec ) diff --git a/synapse/http/site.py b/synapse/http/site.py index e83a4447b2f..1ceccc197e6 100644 --- a/synapse/http/site.py +++ b/synapse/http/site.py @@ -44,6 +44,7 @@ LoggingContext, PreserveLoggingContext, ) +from synapse.metrics import SERVER_NAME_LABEL from synapse.types import ISynapseReactor, Requester if TYPE_CHECKING: @@ -83,12 +84,14 @@ def __init__( self, channel: HTTPChannel, site: "SynapseSite", + hs: "HomeServer", *args: Any, max_request_body_size: int = 1024, request_id_header: Optional[str] = None, **kw: Any, ): super().__init__(channel, *args, **kw) + self.our_server_name = hs.hostname self._max_request_body_size = max_request_body_size self.request_id_header = request_id_header self.synapse_site = site @@ -334,7 +337,11 @@ def render(self, resrc: Resource) -> None: # dispatching to the handler, so that the handler # can update the servlet name in the request # metrics - requests_counter.labels(self.get_method(), self.request_metrics.name).inc() + requests_counter.labels( + method=self.get_method(), + servlet=self.request_metrics.name, + **{SERVER_NAME_LABEL: 
self.our_server_name}, + ).inc() @contextlib.contextmanager def processing(self) -> Generator[None, None, None]: @@ -455,7 +462,7 @@ def _started_processing(self, servlet_name: str) -> None: self.request_metrics.name. """ self.start_time = time.time() - self.request_metrics = RequestMetrics() + self.request_metrics = RequestMetrics(our_server_name=self.our_server_name) self.request_metrics.start( self.start_time, name=servlet_name, method=self.get_method() ) @@ -705,6 +712,7 @@ def request_factory(channel: HTTPChannel, queued: bool) -> Request: return request_class( channel, self, + hs, max_request_body_size=max_request_body_size, queued=queued, request_id_header=request_id_header, diff --git a/tests/handlers/test_oauth_delegation.py b/tests/handlers/test_oauth_delegation.py index fefa2f1135c..2ef25f73a15 100644 --- a/tests/handlers/test_oauth_delegation.py +++ b/tests/handlers/test_oauth_delegation.py @@ -867,7 +867,7 @@ async def mock_http_client_request( # First test a known access token channel = FakeChannel(self.site, self.reactor) # type-ignore: FakeChannel is a mock of an HTTPChannel, not a proper HTTPChannel - req = SynapseRequest(channel, self.site) # type: ignore[arg-type] + req = SynapseRequest(channel, self.site, self.hs) # type: ignore[arg-type] req.client.host = EXAMPLE_IPV4_ADDR req.requestHeaders.addRawHeader("Authorization", f"Bearer {known_token}") req.requestHeaders.addRawHeader("User-Agent", EXAMPLE_USER_AGENT) @@ -899,7 +899,7 @@ async def mock_http_client_request( MAS_USER_AGENT = "masmasmas" channel = FakeChannel(self.site, self.reactor) - req = SynapseRequest(channel, self.site) # type: ignore[arg-type] + req = SynapseRequest(channel, self.site, self.hs) # type: ignore[arg-type] req.client.host = MAS_IPV4_ADDR req.requestHeaders.addRawHeader( "Authorization", f"Bearer {self.auth._admin_token()}" diff --git a/tests/logging/test_terse_json.py b/tests/logging/test_terse_json.py index d9cbbbd51ef..ddc2837ff9a 100644 --- a/tests/logging/test_terse_json.py +++ b/tests/logging/test_terse_json.py @@ -171,8 +171,12 @@ def test_with_request_context(self) -> None: site.site_tag = "test-site" site.server_version_string = "Server v1" site.reactor = Mock() + + hs = Mock(spec=["hostname"]) + hs.hostname = "test_server" + request = SynapseRequest( - cast(HTTPChannel, FakeChannel(site, self.reactor)), site + cast(HTTPChannel, FakeChannel(site, self.reactor)), site, hs ) # Call requestReceived to finish instantiating the object. 
request.content = BytesIO() diff --git a/tests/rest/key/v2/test_remote_key_resource.py b/tests/rest/key/v2/test_remote_key_resource.py index 21e12b2a2f2..62538bbdc37 100644 --- a/tests/rest/key/v2/test_remote_key_resource.py +++ b/tests/rest/key/v2/test_remote_key_resource.py @@ -99,7 +99,7 @@ def make_notary_request(self, server_name: str, key_id: str) -> dict: """ channel = FakeChannel(self.site, self.reactor) # channel is a `FakeChannel` but `HTTPChannel` is expected - req = SynapseRequest(channel, self.site) # type: ignore[arg-type] + req = SynapseRequest(channel, self.site, self.hs) # type: ignore[arg-type] req.content = BytesIO(b"") req.requestReceived( b"GET", @@ -201,7 +201,7 @@ async def post_json( channel = FakeChannel(self.site, self.reactor) # channel is a `FakeChannel` but `HTTPChannel` is expected - req = SynapseRequest(channel, self.site) # type: ignore[arg-type] + req = SynapseRequest(channel, self.site, self.hs) # type: ignore[arg-type] req.content = BytesIO(encode_canonical_json(data)) req.requestReceived( From fb5e1f26d7520ee97483c67c16fd0da6ef97e808 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 15:11:55 -0500 Subject: [PATCH 25/48] Fix `synapse/federation/sender/per_destination_queue.py` --- synapse/federation/sender/per_destination_queue.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/synapse/federation/sender/per_destination_queue.py b/synapse/federation/sender/per_destination_queue.py index 9ab95a988ea..b759644a740 100644 --- a/synapse/federation/sender/per_destination_queue.py +++ b/synapse/federation/sender/per_destination_queue.py @@ -93,7 +93,7 @@ def __init__( transaction_manager: "synapse.federation.sender.TransactionManager", destination: str, ): - self._server_name = hs.hostname + self.server_name = hs.hostname self._clock = hs.get_clock() self._storage_controllers = hs.get_storage_controllers() self._store = hs.get_datastores().main @@ -561,7 +561,7 @@ async def _catch_up_transmission_loop(self) -> None: new_pdus = await filter_events_for_server( self._storage_controllers, self._destination, - self._server_name, + self.server_name, new_pdus, redact=False, filter_out_erased_senders=True, @@ -608,7 +608,7 @@ def _get_receipt_edus(self, limit: int) -> Iterable[Edu]: # Send at most limit EDUs for receipts. 
for content in self._pending_receipt_edus[:limit]: yield Edu( - origin=self._server_name, + origin=self.server_name, destination=self._destination, edu_type=EduTypes.RECEIPT, content=content, @@ -634,7 +634,7 @@ async def _get_device_update_edus(self, limit: int) -> Tuple[List[Edu], int]: ) edus = [ Edu( - origin=self._server_name, + origin=self.server_name, destination=self._destination, edu_type=edu_type, content=content, @@ -661,7 +661,7 @@ async def _get_to_device_message_edus(self, limit: int) -> Tuple[List[Edu], int] edus = [ Edu( - origin=self._server_name, + origin=self.server_name, destination=self._destination, edu_type=EduTypes.DIRECT_TO_DEVICE, content=content, @@ -734,7 +734,7 @@ async def __aenter__(self) -> Tuple[List[EventBase], List[Edu]]: pending_edus.append( Edu( - origin=self.queue._server_name, + origin=self.queue.server_name, destination=self.queue._destination, edu_type=EduTypes.PRESENCE, content={"push": presence_to_add}, From 49f209838c463ffdb4472f0edd4328e335e6a995 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 15:44:42 -0500 Subject: [PATCH 26/48] Fix mypy complaining about `request_metrics` `has-type` lints (part of `synapse/http/request_metrics.py`) ``` synapse/logging/opentracing.py:1084: error: Cannot determine type of "request_metrics" [has-type] synapse/logging/opentracing.py:1100: error: Cannot determine type of "request_metrics" [has-type] synapse/logging/opentracing.py:1106: error: Unused "type: ignore" comment [unused-ignore] synapse/logging/opentracing.py:1106: error: Cannot determine type of "request_metrics" [has-type] synapse/logging/opentracing.py:1106: note: Error code "has-type" not covered by "type: ignore" comment synapse/http/server.py:141: error: Cannot determine type of "request_metrics" [has-type] synapse/http/server.py:151: error: Cannot determine type of "request_metrics" [has-type] synapse/http/server.py:333: error: Cannot determine type of "request_metrics" [has-type] synapse/http/server.py:546: error: Cannot determine type of "request_metrics" [has-type] synapse/http/site.py:132: error: Cannot determine type of "server_name" [has-type] synapse/http/site.py:341: error: Cannot determine type of "request_metrics" [has-type] ``` --- synapse/http/site.py | 7 ++++--- tests/handlers/test_oauth_delegation.py | 4 ++-- tests/logging/test_terse_json.py | 5 +---- tests/rest/key/v2/test_remote_key_resource.py | 4 ++-- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/synapse/http/site.py b/synapse/http/site.py index 1ceccc197e6..55088fc190e 100644 --- a/synapse/http/site.py +++ b/synapse/http/site.py @@ -84,14 +84,14 @@ def __init__( self, channel: HTTPChannel, site: "SynapseSite", - hs: "HomeServer", + our_server_name: str, *args: Any, max_request_body_size: int = 1024, request_id_header: Optional[str] = None, **kw: Any, ): super().__init__(channel, *args, **kw) - self.our_server_name = hs.hostname + self.our_server_name = our_server_name self._max_request_body_size = max_request_body_size self.request_id_header = request_id_header self.synapse_site = site @@ -701,6 +701,7 @@ def __init__( self.site_tag = site_tag self.reactor: ISynapseReactor = reactor + self.server_name = hs.hostname assert config.http_options is not None proxied = config.http_options.x_forwarded @@ -712,7 +713,7 @@ def request_factory(channel: HTTPChannel, queued: bool) -> Request: return request_class( channel, self, - hs, + our_server_name=self.server_name, max_request_body_size=max_request_body_size, queued=queued, 
request_id_header=request_id_header, diff --git a/tests/handlers/test_oauth_delegation.py b/tests/handlers/test_oauth_delegation.py index 2ef25f73a15..539fa909a86 100644 --- a/tests/handlers/test_oauth_delegation.py +++ b/tests/handlers/test_oauth_delegation.py @@ -867,7 +867,7 @@ async def mock_http_client_request( # First test a known access token channel = FakeChannel(self.site, self.reactor) # type-ignore: FakeChannel is a mock of an HTTPChannel, not a proper HTTPChannel - req = SynapseRequest(channel, self.site, self.hs) # type: ignore[arg-type] + req = SynapseRequest(channel, self.site, self.hs.hostname) # type: ignore[arg-type] req.client.host = EXAMPLE_IPV4_ADDR req.requestHeaders.addRawHeader("Authorization", f"Bearer {known_token}") req.requestHeaders.addRawHeader("User-Agent", EXAMPLE_USER_AGENT) @@ -899,7 +899,7 @@ async def mock_http_client_request( MAS_USER_AGENT = "masmasmas" channel = FakeChannel(self.site, self.reactor) - req = SynapseRequest(channel, self.site, self.hs) # type: ignore[arg-type] + req = SynapseRequest(channel, self.site, self.hs.hostname) # type: ignore[arg-type] req.client.host = MAS_IPV4_ADDR req.requestHeaders.addRawHeader( "Authorization", f"Bearer {self.auth._admin_token()}" diff --git a/tests/logging/test_terse_json.py b/tests/logging/test_terse_json.py index ddc2837ff9a..60de8d786f9 100644 --- a/tests/logging/test_terse_json.py +++ b/tests/logging/test_terse_json.py @@ -172,11 +172,8 @@ def test_with_request_context(self) -> None: site.server_version_string = "Server v1" site.reactor = Mock() - hs = Mock(spec=["hostname"]) - hs.hostname = "test_server" - request = SynapseRequest( - cast(HTTPChannel, FakeChannel(site, self.reactor)), site, hs + cast(HTTPChannel, FakeChannel(site, self.reactor)), site, "test_server" ) # Call requestReceived to finish instantiating the object. 
request.content = BytesIO() diff --git a/tests/rest/key/v2/test_remote_key_resource.py b/tests/rest/key/v2/test_remote_key_resource.py index 62538bbdc37..31f3cb0f122 100644 --- a/tests/rest/key/v2/test_remote_key_resource.py +++ b/tests/rest/key/v2/test_remote_key_resource.py @@ -99,7 +99,7 @@ def make_notary_request(self, server_name: str, key_id: str) -> dict: """ channel = FakeChannel(self.site, self.reactor) # channel is a `FakeChannel` but `HTTPChannel` is expected - req = SynapseRequest(channel, self.site, self.hs) # type: ignore[arg-type] + req = SynapseRequest(channel, self.site, self.hs.hostname) # type: ignore[arg-type] req.content = BytesIO(b"") req.requestReceived( b"GET", @@ -201,7 +201,7 @@ async def post_json( channel = FakeChannel(self.site, self.reactor) # channel is a `FakeChannel` but `HTTPChannel` is expected - req = SynapseRequest(channel, self.site, self.hs) # type: ignore[arg-type] + req = SynapseRequest(channel, self.site, self.hs.hostname) # type: ignore[arg-type] req.content = BytesIO(encode_canonical_json(data)) req.requestReceived( From 347e66d03325c29d37e459bf605583244321b006 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 15:57:00 -0500 Subject: [PATCH 27/48] Fill in `synapse/metrics/__init__.py` --- synapse/federation/sender/__init__.py | 20 ++++++++++++------- .../sender/per_destination_queue.py | 8 ++++++-- synapse/handlers/appservice.py | 14 ++++++++----- synapse/metrics/__init__.py | 14 +++++++++---- 4 files changed, 38 insertions(+), 18 deletions(-) diff --git a/synapse/federation/sender/__init__.py b/synapse/federation/sender/__init__.py index c695f5d494c..0121fac175b 100644 --- a/synapse/federation/sender/__init__.py +++ b/synapse/federation/sender/__init__.py @@ -699,13 +699,19 @@ async def handle_room_events(events: List[EventBase]) -> None: "federation_sender" ).set(ts) - events_processed_counter.inc(len(event_entries)) - - event_processing_loop_room_count.labels("federation_sender").inc( - len(events_by_room) - ) - - event_processing_loop_counter.labels("federation_sender").inc() + events_processed_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc(len(event_entries)) + + event_processing_loop_room_count.labels( + name="federation_sender", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc(len(events_by_room)) + + event_processing_loop_counter.labels( + name="federation_sender", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() synapse.metrics.event_processing_positions.labels( "federation_sender" diff --git a/synapse/federation/sender/per_destination_queue.py b/synapse/federation/sender/per_destination_queue.py index b759644a740..34ba44f14f5 100644 --- a/synapse/federation/sender/per_destination_queue.py +++ b/synapse/federation/sender/per_destination_queue.py @@ -362,7 +362,9 @@ async def _transaction_transmission_loop(self) -> None: self._destination, pending_pdus, pending_edus ) - sent_transactions_counter.inc() + sent_transactions_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc() sent_edus_counter.labels( **{SERVER_NAME_LABEL: self.server_name} ).inc(len(pending_edus)) @@ -585,7 +587,9 @@ async def _catch_up_transmission_loop(self) -> None: self._destination, room_catchup_pdus, [] ) - sent_transactions_counter.inc() + sent_transactions_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc() # We pulled this from the DB, so it'll be non-null assert pdu.internal_metadata.stream_ordering diff --git a/synapse/handlers/appservice.py b/synapse/handlers/appservice.py index 
ec624e405d7..acc1ae63d7a 100644 --- a/synapse/handlers/appservice.py +++ b/synapse/handlers/appservice.py @@ -208,11 +208,15 @@ async def handle_room_events(events: Iterable[EventBase]) -> None: **{SERVER_NAME_LABEL: self.server_name} ).inc(len(events)) - event_processing_loop_room_count.labels("appservice_sender").inc( - len(events_by_room) - ) - - event_processing_loop_counter.labels("appservice_sender").inc() + event_processing_loop_room_count.labels( + name="appservice_sender", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc(len(events_by_room)) + + event_processing_loop_counter.labels( + name="appservice_sender", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() if events: now = self.clock.time_msec() diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index da58df6b968..238e0a043cf 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -395,18 +395,24 @@ def collect(self) -> Iterable[Metric]: # Federation Metrics # -sent_transactions_counter = Counter("synapse_federation_client_sent_transactions", "") +sent_transactions_counter = Counter( + "synapse_federation_client_sent_transactions", "", labelnames=[SERVER_NAME_LABEL] +) -events_processed_counter = Counter("synapse_federation_client_events_processed", "") +events_processed_counter = Counter( + "synapse_federation_client_events_processed", "", labelnames=[SERVER_NAME_LABEL] +) event_processing_loop_counter = Counter( - "synapse_event_processing_loop_count", "Event processing loop iterations", ["name"] + "synapse_event_processing_loop_count", + "Event processing loop iterations", + labelnames=["name", SERVER_NAME_LABEL], ) event_processing_loop_room_count = Counter( "synapse_event_processing_loop_room_count", "Rooms seen per event processing loop iteration", - ["name"], + labelnames=["name", SERVER_NAME_LABEL], ) From 88bf526ae950209ee8d7b75f0d10d10265489a8c Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 16:09:44 -0500 Subject: [PATCH 28/48] Fill in `synapse/push/bulk_push_rule_evaluator.py` --- synapse/push/bulk_push_rule_evaluator.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/synapse/push/bulk_push_rule_evaluator.py b/synapse/push/bulk_push_rule_evaluator.py index 1f4f5b90c3a..af9167b294d 100644 --- a/synapse/push/bulk_push_rule_evaluator.py +++ b/synapse/push/bulk_push_rule_evaluator.py @@ -50,6 +50,7 @@ from synapse.events import EventBase, relation_from_event from synapse.events.snapshot import EventContext from synapse.logging.context import make_deferred_yieldable, run_in_background +from synapse.metrics import SERVER_NAME_LABEL from synapse.state import CREATE_KEY, POWER_KEY from synapse.storage.databases.main.roommember import EventIdMembership from synapse.storage.invite_rule import InviteRule @@ -68,11 +69,16 @@ logger = logging.getLogger(__name__) +# FIXME: These are unused push_rules_invalidation_counter = Counter( - "synapse_push_bulk_push_rule_evaluator_push_rules_invalidation_counter", "" + "synapse_push_bulk_push_rule_evaluator_push_rules_invalidation_counter", + "", + labelnames=[SERVER_NAME_LABEL], ) push_rules_state_size_counter = Counter( - "synapse_push_bulk_push_rule_evaluator_push_rules_state_size_counter", "" + "synapse_push_bulk_push_rule_evaluator_push_rules_state_size_counter", + "", + labelnames=[SERVER_NAME_LABEL], ) From 3cb1d28818ec2b81bc508ee31e38d9c52390813e Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 16:19:30 -0500 Subject: [PATCH 29/48] Restore and standardize comment for 
unused metrics --- synapse/notifier.py | 5 +++++ synapse/push/bulk_push_rule_evaluator.py | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/synapse/notifier.py b/synapse/notifier.py index 887957b151b..45204842ce3 100644 --- a/synapse/notifier.py +++ b/synapse/notifier.py @@ -74,6 +74,11 @@ logger = logging.getLogger(__name__) +# FIXME: Unused metric, remove if not needed. +notified_events_counter = Counter( + "synapse_notifier_notified_events", "", labelnames=[SERVER_NAME_LABEL] +) + users_woken_by_stream_counter = Counter( "synapse_notifier_users_woken_by_stream", "", diff --git a/synapse/push/bulk_push_rule_evaluator.py b/synapse/push/bulk_push_rule_evaluator.py index af9167b294d..9a0a8a2f242 100644 --- a/synapse/push/bulk_push_rule_evaluator.py +++ b/synapse/push/bulk_push_rule_evaluator.py @@ -69,12 +69,13 @@ logger = logging.getLogger(__name__) -# FIXME: These are unused +# FIXME: Unused metric, remove if not needed. push_rules_invalidation_counter = Counter( "synapse_push_bulk_push_rule_evaluator_push_rules_invalidation_counter", "", labelnames=[SERVER_NAME_LABEL], ) +# FIXME: Unused metric, remove if not needed. push_rules_state_size_counter = Counter( "synapse_push_bulk_push_rule_evaluator_push_rules_state_size_counter", "", From c2cb0247dc9a4b3b583067f663ca5afbf6b1c341 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 16:21:58 -0500 Subject: [PATCH 30/48] Fill in `synapse/push/httppusher.py` --- synapse/push/httppusher.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/synapse/push/httppusher.py b/synapse/push/httppusher.py index 7df8a128c9d..53dfb54104e 100644 --- a/synapse/push/httppusher.py +++ b/synapse/push/httppusher.py @@ -31,6 +31,7 @@ from synapse.api.constants import EventTypes from synapse.events import EventBase from synapse.logging import opentracing +from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics.background_process_metrics import run_as_background_process from synapse.push import Pusher, PusherConfig, PusherConfigException from synapse.storage.databases.main.event_push_actions import HttpPushAction @@ -46,21 +47,25 @@ http_push_processed_counter = Counter( "synapse_http_httppusher_http_pushes_processed", "Number of push notifications successfully sent", + labelnames=[SERVER_NAME_LABEL], ) http_push_failed_counter = Counter( "synapse_http_httppusher_http_pushes_failed", "Number of push notifications which failed", + labelnames=[SERVER_NAME_LABEL], ) http_badges_processed_counter = Counter( "synapse_http_httppusher_badge_updates_processed", "Number of badge updates successfully sent", + labelnames=[SERVER_NAME_LABEL], ) http_badges_failed_counter = Counter( "synapse_http_httppusher_badge_updates_failed", "Number of badge updates which failed", + labelnames=[SERVER_NAME_LABEL], ) @@ -106,6 +111,7 @@ class HttpPusher(Pusher): def __init__(self, hs: "HomeServer", pusher_config: PusherConfig): super().__init__(hs, pusher_config) + self.server_name = hs.hostname self._storage_controllers = self.hs.get_storage_controllers() self.app_display_name = pusher_config.app_display_name self.device_display_name = pusher_config.device_display_name @@ -265,7 +271,9 @@ async def _unsafe_process(self) -> None: processed = await self._process_one(push_action) if processed: - http_push_processed_counter.inc() + http_push_processed_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc() self.backoff_delay = HttpPusher.INITIAL_BACKOFF_SEC self.last_stream_ordering = 
push_action.stream_ordering pusher_still_exists = ( @@ -289,7 +297,9 @@ async def _unsafe_process(self) -> None: self.app_id, self.pushkey, self.user_id, self.failing_since ) else: - http_push_failed_counter.inc() + http_push_failed_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc() if not self.failing_since: self.failing_since = self.clock.time_msec() await self.store.update_pusher_failing_since( @@ -540,9 +550,13 @@ async def _send_badge(self, badge: int) -> None: } try: await self.http_client.post_json_get_json(self.url, d) - http_badges_processed_counter.inc() + http_badges_processed_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc() except Exception as e: logger.warning( "Failed to send badge count to %s: %s %s", self.name, type(e), e ) - http_badges_failed_counter.inc() + http_badges_failed_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc() From e469abeaa5b35bcca71c9265db0e97bfa929f39a Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 16:27:47 -0500 Subject: [PATCH 31/48] Fill in `synapse/push/mailer.py` --- synapse/push/mailer.py | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/synapse/push/mailer.py b/synapse/push/mailer.py index fadba480dd8..d76cc8237ba 100644 --- a/synapse/push/mailer.py +++ b/synapse/push/mailer.py @@ -32,6 +32,7 @@ from synapse.api.errors import StoreError from synapse.config.emailconfig import EmailSubjectConfig from synapse.events import EventBase +from synapse.metrics import SERVER_NAME_LABEL from synapse.push.presentable_names import ( calculate_room_name, descriptor_from_member_events, @@ -60,7 +61,7 @@ emails_sent_counter = Counter( "synapse_emails_sent_total", "Emails sent by type", - ["type"], + labelnames=["type", SERVER_NAME_LABEL], ) @@ -123,6 +124,7 @@ def __init__( template_text: jinja2.Template, ): self.hs = hs + self.server_name = hs.hostname self.template_html = template_html self.template_text = template_text @@ -137,8 +139,6 @@ def __init__( logger.info("Created Mailer for app_name %s", app_name) - emails_sent_counter.labels("password_reset") - async def send_password_reset_mail( self, email_address: str, token: str, client_secret: str, sid: str ) -> None: @@ -162,7 +162,10 @@ async def send_password_reset_mail( template_vars: TemplateVars = {"link": link} - emails_sent_counter.labels("password_reset").inc() + emails_sent_counter.labels( + type="password_reset", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() await self.send_email( email_address, @@ -171,8 +174,6 @@ async def send_password_reset_mail( template_vars, ) - emails_sent_counter.labels("registration") - async def send_registration_mail( self, email_address: str, token: str, client_secret: str, sid: str ) -> None: @@ -196,7 +197,10 @@ async def send_registration_mail( template_vars: TemplateVars = {"link": link} - emails_sent_counter.labels("registration").inc() + emails_sent_counter.labels( + type="registration", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() await self.send_email( email_address, @@ -205,8 +209,6 @@ async def send_registration_mail( template_vars, ) - emails_sent_counter.labels("already_in_use") - async def send_already_in_use_mail(self, email_address: str) -> None: """Send an email if the address is already bound to an user account @@ -214,6 +216,11 @@ async def send_already_in_use_mail(self, email_address: str) -> None: email_address: Email address we're sending to the "already in use" mail """ + emails_sent_counter.labels( + 
type="already_in_use", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() + await self.send_email( email_address, self.email_subjects.email_already_in_use @@ -221,8 +228,6 @@ async def send_already_in_use_mail(self, email_address: str) -> None: {}, ) - emails_sent_counter.labels("add_threepid") - async def send_add_threepid_mail( self, email_address: str, token: str, client_secret: str, sid: str ) -> None: @@ -247,7 +252,10 @@ async def send_add_threepid_mail( template_vars: TemplateVars = {"link": link} - emails_sent_counter.labels("add_threepid").inc() + emails_sent_counter.labels( + type="add_threepid", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() await self.send_email( email_address, @@ -256,8 +264,6 @@ async def send_add_threepid_mail( template_vars, ) - emails_sent_counter.labels("notification") - async def send_notification_mail( self, app_id: str, @@ -352,7 +358,10 @@ async def _fetch_room_state(room_id: str) -> None: "reason": reason, } - emails_sent_counter.labels("notification").inc() + emails_sent_counter.labels( + type="notification", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() await self.send_email( email_address, summary_text, template_vars, unsubscribe_link From f49c793772e85d21e8cc6b70a900eef1b27c386d Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 16:39:05 -0500 Subject: [PATCH 32/48] Fill in `synapse/replication/http/_base.py` --- synapse/replication/http/_base.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/synapse/replication/http/_base.py b/synapse/replication/http/_base.py index 00025386801..5b86a4aa0c1 100644 --- a/synapse/replication/http/_base.py +++ b/synapse/replication/http/_base.py @@ -38,6 +38,7 @@ from synapse.http.site import SynapseRequest from synapse.logging import opentracing from synapse.logging.opentracing import trace_with_opname +from synapse.metrics import SERVER_NAME_LABEL from synapse.types import JsonDict from synapse.util.caches.response_cache import ResponseCache from synapse.util.cancellation import is_function_cancellable @@ -57,7 +58,7 @@ _outgoing_request_counter = Counter( "synapse_outgoing_replication_requests", "Number of outgoing replication requests, by replication method name and result", - ["name", "code"], + labelnames=["name", "code", SERVER_NAME_LABEL], ) @@ -200,6 +201,7 @@ def make_client(cls, hs: "HomeServer") -> Callable: parameter to specify which instance to hit (the instance must be in the `instance_map` config). """ + server_name = hs.hostname clock = hs.get_clock() client = hs.get_replication_client() local_instance_name = hs.get_instance_name() @@ -328,15 +330,27 @@ async def send_request( # We convert to SynapseError as we know that it was a SynapseError # on the main process that we should send to the client. 
(And # importantly, not stack traces everywhere) - _outgoing_request_counter.labels(cls.NAME, e.code).inc() + _outgoing_request_counter.labels( + name=cls.NAME, + code=e.code, + **{SERVER_NAME_LABEL: server_name}, + ).inc() raise e.to_synapse_error() except Exception as e: - _outgoing_request_counter.labels(cls.NAME, "ERR").inc() + _outgoing_request_counter.labels( + name=cls.NAME, + code="ERR", + **{SERVER_NAME_LABEL: server_name}, + ).inc() raise SynapseError( 502, f"Failed to talk to {instance_name} process" ) from e - _outgoing_request_counter.labels(cls.NAME, 200).inc() + _outgoing_request_counter.labels( + name=cls.NAME, + code=200, + **{SERVER_NAME_LABEL: server_name}, + ).inc() # Wait on any streams that the remote may have written to. for stream_name, position in result.pop( From 73f05ec1fe4c078c59755086facece40f244b5cb Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 16:41:35 -0500 Subject: [PATCH 33/48] Fill in `synapse/replication/tcp/external_cache.py` --- synapse/replication/tcp/external_cache.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/synapse/replication/tcp/external_cache.py b/synapse/replication/tcp/external_cache.py index a95771b5f60..2c531ae40c4 100644 --- a/synapse/replication/tcp/external_cache.py +++ b/synapse/replication/tcp/external_cache.py @@ -26,6 +26,7 @@ from synapse.logging import opentracing from synapse.logging.context import make_deferred_yieldable +from synapse.metrics import SERVER_NAME_LABEL from synapse.util import json_decoder, json_encoder if TYPE_CHECKING: @@ -36,13 +37,13 @@ set_counter = Counter( "synapse_external_cache_set", "Number of times we set a cache", - labelnames=["cache_name"], + labelnames=["cache_name", SERVER_NAME_LABEL], ) get_counter = Counter( "synapse_external_cache_get", "Number of times we get a cache", - labelnames=["cache_name", "hit"], + labelnames=["cache_name", "hit", SERVER_NAME_LABEL], ) response_timer = Histogram( @@ -69,6 +70,8 @@ class ExternalCache: """ def __init__(self, hs: "HomeServer"): + self.server_name = hs.hostname + if hs.config.redis.redis_enabled: self._redis_connection: Optional["ConnectionHandler"] = ( hs.get_outbound_redis_connection() @@ -93,7 +96,9 @@ async def set(self, cache_name: str, key: str, value: Any, expiry_ms: int) -> No if self._redis_connection is None: return - set_counter.labels(cache_name).inc() + set_counter.labels( + cache_name=cache_name, **{SERVER_NAME_LABEL: self.server_name} + ).inc() # txredisapi requires the value to be string, bytes or numbers, so we # encode stuff in JSON. 
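
The `**{SERVER_NAME_LABEL: ...}` splat used at these call sites exists because the label name is a shared constant rather than a literal keyword. A minimal sketch of the pattern, assuming `SERVER_NAME_LABEL = "server_name"` (the metric and value names below are illustrative, not the real Synapse metrics):

```python
from prometheus_client import Counter

SERVER_NAME_LABEL = "server_name"  # assumed value of the synapse.metrics constant

cache_set_counter = Counter(
    "example_external_cache_set",
    "Number of times we set a cache",
    labelnames=["cache_name", SERVER_NAME_LABEL],
)

# A variable cannot be used directly as a keyword-argument name, so the label
# is passed by splatting a one-entry dict; the constant stays defined in one place.
cache_set_counter.labels(
    cache_name="event_cache", **{SERVER_NAME_LABEL: "hs1.example.com"}
).inc()
```
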
@@ -131,7 +136,11 @@ async def get(self, cache_name: str, key: str) -> Optional[Any]: logger.debug("Got cache result %s %s: %r", cache_name, key, result) - get_counter.labels(cache_name, result is not None).inc() + get_counter.labels( + cache_name=cache_name, + hit=result is not None, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() if not result: return None From 7523a5abba249431a405676536e3424474449b4b Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 16:44:57 -0500 Subject: [PATCH 34/48] Fill in `synapse/replication/tcp/handler.py` --- synapse/replication/tcp/handler.py | 36 +++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/synapse/replication/tcp/handler.py b/synapse/replication/tcp/handler.py index 1fafbb48c3e..b015c401cfc 100644 --- a/synapse/replication/tcp/handler.py +++ b/synapse/replication/tcp/handler.py @@ -40,7 +40,7 @@ from twisted.internet.protocol import ReconnectingClientFactory -from synapse.metrics import LaterGauge +from synapse.metrics import SERVER_NAME_LABEL, LaterGauge from synapse.metrics.background_process_metrics import run_as_background_process from synapse.replication.tcp.commands import ( ClearUserSyncsCommand, @@ -81,13 +81,26 @@ # number of updates received for each RDATA stream inbound_rdata_count = Counter( - "synapse_replication_tcp_protocol_inbound_rdata_count", "", ["stream_name"] + "synapse_replication_tcp_protocol_inbound_rdata_count", + "", + labelnames=["stream_name", SERVER_NAME_LABEL], +) +user_sync_counter = Counter( + "synapse_replication_tcp_resource_user_sync", "", labelnames=[SERVER_NAME_LABEL] +) +federation_ack_counter = Counter( + "synapse_replication_tcp_resource_federation_ack", + "", + labelnames=[SERVER_NAME_LABEL], +) +# FIXME: Unused metric, remove if not needed. 
+remove_pusher_counter = Counter( + "synapse_replication_tcp_resource_remove_pusher", "", labelnames=[SERVER_NAME_LABEL] ) -user_sync_counter = Counter("synapse_replication_tcp_resource_user_sync", "") -federation_ack_counter = Counter("synapse_replication_tcp_resource_federation_ack", "") -remove_pusher_counter = Counter("synapse_replication_tcp_resource_remove_pusher", "") -user_ip_cache_counter = Counter("synapse_replication_tcp_resource_user_ip_cache", "") +user_ip_cache_counter = Counter( + "synapse_replication_tcp_resource_user_ip_cache", "", labelnames=[SERVER_NAME_LABEL] +) # the type of the entries in _command_queues_by_stream @@ -102,6 +115,7 @@ class ReplicationCommandHandler: """ def __init__(self, hs: "HomeServer"): + self.server_name = hs.hostname self._replication_data_handler = hs.get_replication_data_handler() self._presence_handler = hs.get_presence_handler() self._store = hs.get_datastores().main @@ -437,7 +451,7 @@ def will_announce_positions(self) -> None: def on_USER_SYNC( self, conn: IReplicationConnection, cmd: UserSyncCommand ) -> Optional[Awaitable[None]]: - user_sync_counter.inc() + user_sync_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc() if self._is_presence_writer: return self._presence_handler.update_external_syncs_row( @@ -461,7 +475,7 @@ def on_CLEAR_USER_SYNC( def on_FEDERATION_ACK( self, conn: IReplicationConnection, cmd: FederationAckCommand ) -> None: - federation_ack_counter.inc() + federation_ack_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc() if self._federation_sender: self._federation_sender.federation_ack(cmd.instance_name, cmd.token) @@ -469,7 +483,7 @@ def on_FEDERATION_ACK( def on_USER_IP( self, conn: IReplicationConnection, cmd: UserIpCommand ) -> Optional[Awaitable[None]]: - user_ip_cache_counter.inc() + user_ip_cache_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc() if self._is_master or self._should_insert_client_ips: # We make a point of only returning an awaitable if there's actually @@ -509,7 +523,9 @@ def on_RDATA(self, conn: IReplicationConnection, cmd: RdataCommand) -> None: return stream_name = cmd.stream_name - inbound_rdata_count.labels(stream_name).inc() + inbound_rdata_count.labels( + stream_name=stream_name, **{SERVER_NAME_LABEL: self.server_name} + ).inc() # We put the received command into a queue here for two reasons: # 1. 
so we don't try and concurrently handle multiple rows for the From 3fd2cf428e1edb0c680f5efbb442b89d9503bf11 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 16:59:13 -0500 Subject: [PATCH 35/48] Fill in `synapse/replication/tcp/protocol.py` --- synapse/replication/tcp/protocol.py | 41 +++++++++++++++++++++-------- synapse/replication/tcp/redis.py | 19 +++++++++++-- 2 files changed, 47 insertions(+), 13 deletions(-) diff --git a/synapse/replication/tcp/protocol.py b/synapse/replication/tcp/protocol.py index fb9c539122b..c53d3f15b01 100644 --- a/synapse/replication/tcp/protocol.py +++ b/synapse/replication/tcp/protocol.py @@ -39,7 +39,7 @@ from twisted.python.failure import Failure from synapse.logging.context import PreserveLoggingContext -from synapse.metrics import LaterGauge +from synapse.metrics import SERVER_NAME_LABEL, LaterGauge from synapse.metrics.background_process_metrics import ( BackgroundProcessLoggingContext, run_as_background_process, @@ -64,19 +64,21 @@ connection_close_counter = Counter( - "synapse_replication_tcp_protocol_close_reason", "", ["reason_type"] + "synapse_replication_tcp_protocol_close_reason", + "", + labelnames=["reason_type", SERVER_NAME_LABEL], ) tcp_inbound_commands_counter = Counter( "synapse_replication_tcp_protocol_inbound_commands", "Number of commands received from replication, by command and name of process connected to", - ["command", "name"], + labelnames=["command", "name", SERVER_NAME_LABEL], ) tcp_outbound_commands_counter = Counter( "synapse_replication_tcp_protocol_outbound_commands", "Number of commands sent to replication, by command and name of process connected to", - ["command", "name"], + labelnames=["command", "name", SERVER_NAME_LABEL], ) # A list of all connected protocols. This allows us to send metrics about the @@ -137,7 +139,10 @@ class BaseReplicationStreamProtocol(LineOnlyReceiver): max_line_buffer = 10000 - def __init__(self, clock: Clock, handler: "ReplicationCommandHandler"): + def __init__( + self, server_name: str, clock: Clock, handler: "ReplicationCommandHandler" + ): + self.server_name = server_name self.clock = clock self.command_handler = handler @@ -244,7 +249,11 @@ def _parse_and_dispatch_line(self, line: bytes) -> None: self.last_received_command = self.clock.time_msec() - tcp_inbound_commands_counter.labels(cmd.NAME, self.name).inc() + tcp_inbound_commands_counter.labels( + command=cmd.NAME, + name=self.name, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() self.handle_command(cmd) @@ -318,7 +327,11 @@ def send_command(self, cmd: Command, do_buffer: bool = True) -> None: self._queue_command(cmd) return - tcp_outbound_commands_counter.labels(cmd.NAME, self.name).inc() + tcp_outbound_commands_counter.labels( + command=cmd.NAME, + name=self.name, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() string = "%s %s" % (cmd.NAME, cmd.to_line()) if "\n" in string: @@ -390,9 +403,15 @@ def connectionLost(self, reason: Failure) -> None: # type: ignore[override] logger.info("[%s] Replication connection closed: %r", self.id(), reason) if isinstance(reason, Failure): assert reason.type is not None - connection_close_counter.labels(reason.type.__name__).inc() + connection_close_counter.labels( + reason_type=reason.type.__name__, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() else: - connection_close_counter.labels(reason.__class__.__name__).inc() # type: ignore[unreachable] + connection_close_counter.labels( # type: ignore[unreachable] + reason_type=reason.__class__.__name__, + 
**{SERVER_NAME_LABEL: self.server_name}, + ).inc() try: # Remove us from list of connections to be monitored @@ -449,7 +468,7 @@ class ServerReplicationStreamProtocol(BaseReplicationStreamProtocol): def __init__( self, server_name: str, clock: Clock, handler: "ReplicationCommandHandler" ): - super().__init__(clock, handler) + super().__init__(server_name, clock, handler) self.server_name = server_name @@ -474,7 +493,7 @@ def __init__( clock: Clock, command_handler: "ReplicationCommandHandler", ): - super().__init__(clock, command_handler) + super().__init__(server_name, clock, command_handler) self.client_name = client_name self.server_name = server_name diff --git a/synapse/replication/tcp/redis.py b/synapse/replication/tcp/redis.py index c4601a61410..9a2f6978c04 100644 --- a/synapse/replication/tcp/redis.py +++ b/synapse/replication/tcp/redis.py @@ -37,6 +37,7 @@ from twisted.python.failure import Failure from synapse.logging.context import PreserveLoggingContext, make_deferred_yieldable +from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics.background_process_metrics import ( BackgroundProcessLoggingContext, run_as_background_process, @@ -97,6 +98,9 @@ class RedisSubscriber(SubscriberProtocol): immediately after initialisation. Attributes: + server_name: The homeserver name of the Synapse instance that this connection + is associated with. This is used to label metrics and should be set to + `hs.hostname`. synapse_handler: The command handler to handle incoming commands. synapse_stream_prefix: The *redis* stream name to subscribe to and publish from (not anything to do with Synapse replication streams). @@ -104,6 +108,7 @@ class RedisSubscriber(SubscriberProtocol): commands. """ + server_name: str synapse_handler: "ReplicationCommandHandler" synapse_stream_prefix: str synapse_channel_names: List[str] @@ -171,7 +176,11 @@ def _parse_and_dispatch_message(self, message: str) -> None: # We use "redis" as the name here as we don't have 1:1 connections to # remote instances. - tcp_inbound_commands_counter.labels(cmd.NAME, "redis").inc() + tcp_inbound_commands_counter.labels( + command=cmd.NAME, + name="redis", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() self.handle_command(cmd) @@ -232,7 +241,11 @@ async def _async_send_command(self, cmd: Command) -> None: # We use "redis" as the name here as we don't have 1:1 connections to # remote instances. - tcp_outbound_commands_counter.labels(cmd.NAME, "redis").inc() + tcp_outbound_commands_counter.labels( + command=cmd.NAME, + name="redis", + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() channel_name = cmd.redis_channel_name(self.synapse_stream_prefix) @@ -350,6 +363,7 @@ def __init__( password=hs.config.redis.redis_password, ) + self.server_name = hs.hostname self.synapse_handler = hs.get_replication_command_handler() self.synapse_stream_prefix = hs.hostname self.synapse_channel_names = channel_names @@ -364,6 +378,7 @@ def buildProtocol(self, addr: IAddress) -> RedisSubscriber: # as to do so would involve overriding `buildProtocol` entirely, however # the base method does some other things than just instantiating the # protocol. 
+ p.server_name = self.server_name p.synapse_handler = self.synapse_handler p.synapse_outbound_redis_connection = self.synapse_outbound_redis_connection p.synapse_stream_prefix = self.synapse_stream_prefix From 513249b13444c11e678dfb03ea250cf100919ecf Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 17:00:44 -0500 Subject: [PATCH 36/48] Fill in `synapse/replication/tcp/resource.py` --- synapse/replication/tcp/resource.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/synapse/replication/tcp/resource.py b/synapse/replication/tcp/resource.py index d647a2b3326..d8576d2af36 100644 --- a/synapse/replication/tcp/resource.py +++ b/synapse/replication/tcp/resource.py @@ -29,6 +29,7 @@ from twisted.internet.interfaces import IAddress from twisted.internet.protocol import ServerFactory +from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics.background_process_metrics import run_as_background_process from synapse.replication.tcp.commands import PositionCommand from synapse.replication.tcp.protocol import ServerReplicationStreamProtocol @@ -40,7 +41,9 @@ from synapse.server import HomeServer stream_updates_counter = Counter( - "synapse_replication_tcp_resource_stream_updates", "", ["stream_name"] + "synapse_replication_tcp_resource_stream_updates", + "", + labelnames=["stream_name", SERVER_NAME_LABEL], ) logger = logging.getLogger(__name__) @@ -78,6 +81,7 @@ class ReplicationStreamer: """ def __init__(self, hs: "HomeServer"): + self.server_name = hs.hostname self.store = hs.get_datastores().main self.clock = hs.get_clock() self.notifier = hs.get_notifier() @@ -219,7 +223,10 @@ async def _run_notifier_loop(self) -> None: len(updates), current_token, ) - stream_updates_counter.labels(stream.NAME).inc(len(updates)) + stream_updates_counter.labels( + stream_name=stream.NAME, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc(len(updates)) else: # The token has advanced but there is no data to From e0ffa36431d37def748bfd75a758aefdf0dd4c8f Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 17:03:43 -0500 Subject: [PATCH 37/48] Fill in `synapse/state/__init__.py` --- synapse/state/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/synapse/state/__init__.py b/synapse/state/__init__.py index 1c3e5d00a92..f5e9f75842d 100644 --- a/synapse/state/__init__.py +++ b/synapse/state/__init__.py @@ -51,6 +51,7 @@ ) from synapse.logging.context import ContextResourceUsage from synapse.logging.opentracing import tag_args, trace +from synapse.metrics import SERVER_NAME_LABEL from synapse.replication.http.state import ReplicationUpdateCurrentStateRestServlet from synapse.state import v1, v2 from synapse.storage.databases.main.events_worker import EventRedactBehaviour @@ -606,11 +607,13 @@ class _StateResMetrics: "synapse_state_res_cpu_for_biggest_room_seconds", "CPU time spent performing state resolution for the single most expensive " "room for state resolution", + labelnames=[SERVER_NAME_LABEL], ) _biggest_room_by_db_counter = Counter( "synapse_state_res_db_for_biggest_room_seconds", "Database time spent performing state resolution for the single most " "expensive room for state resolution", + labelnames=[SERVER_NAME_LABEL], ) _cpu_times = Histogram( @@ -631,6 +634,7 @@ class StateResolutionHandler: """ def __init__(self, hs: "HomeServer"): + self.server_name = hs.hostname self.clock = hs.get_clock() self.resolve_linearizer = Linearizer(name="state_resolve_lock") @@ -873,7 +877,9 @@ def _report_biggest( # report 
info on the single biggest to prometheus _, biggest_metrics = biggest[0] - prometheus_counter_metric.inc(extract_key(biggest_metrics)) + prometheus_counter_metric.labels(**{SERVER_NAME_LABEL: self.server_name}).inc( + extract_key(biggest_metrics) + ) def _make_state_cache_entry( From bc17483b31ed837801c3baf4b9817cfb68de3dcb Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 17:05:29 -0500 Subject: [PATCH 38/48] Fill in `synapse/storage/database.py` --- synapse/storage/database.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/synapse/storage/database.py b/synapse/storage/database.py index 61881956149..d999a4c2825 100644 --- a/synapse/storage/database.py +++ b/synapse/storage/database.py @@ -61,7 +61,7 @@ current_context, make_deferred_yieldable, ) -from synapse.metrics import LaterGauge, register_threadpool +from synapse.metrics import SERVER_NAME_LABEL, LaterGauge, register_threadpool from synapse.metrics.background_process_metrics import run_as_background_process from synapse.storage.background_updates import BackgroundUpdater from synapse.storage.engines import BaseDatabaseEngine, PostgresEngine, Sqlite3Engine @@ -85,8 +85,16 @@ sql_scheduling_timer = Histogram("synapse_storage_schedule_time", "sec") sql_query_timer = Histogram("synapse_storage_query_time", "sec", ["verb"]) -sql_txn_count = Counter("synapse_storage_transaction_time_count", "sec", ["desc"]) -sql_txn_duration = Counter("synapse_storage_transaction_time_sum", "sec", ["desc"]) +sql_txn_count = Counter( + "synapse_storage_transaction_time_count", + "sec", + labelnames=["desc", SERVER_NAME_LABEL], +) +sql_txn_duration = Counter( + "synapse_storage_transaction_time_sum", + "sec", + labelnames=["desc", SERVER_NAME_LABEL], +) # Unique indexes which have been added in background updates. 
Maps from table name @@ -561,6 +569,7 @@ def __init__( engine: BaseDatabaseEngine, ): self.hs = hs + self.server_name = hs.hostname self._clock = hs.get_clock() self._txn_limit = database_config.config.get("txn_limit", 0) self._database_config = database_config @@ -866,8 +875,14 @@ def new_transaction( self._current_txn_total_time += duration self._txn_perf_counters.update(desc, duration) - sql_txn_count.labels(desc).inc(1) - sql_txn_duration.labels(desc).inc(duration) + sql_txn_count.labels( + desc=desc, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc(1) + sql_txn_duration.labels( + desc=desc, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc(duration) async def runInteraction( self, From 36a7a89ccac88d76019639b0ebdd3b72392601d1 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 17:08:20 -0500 Subject: [PATCH 39/48] Fill in `synapse/storage/controllers/persist_events.py` --- synapse/storage/controllers/persist_events.py | 35 ++++++++++++++----- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/synapse/storage/controllers/persist_events.py b/synapse/storage/controllers/persist_events.py index f5131fe2915..20e6782670d 100644 --- a/synapse/storage/controllers/persist_events.py +++ b/synapse/storage/controllers/persist_events.py @@ -61,6 +61,7 @@ start_active_span_follows_from, trace, ) +from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics.background_process_metrics import run_as_background_process from synapse.storage.controllers.state import StateStorageController from synapse.storage.databases import Databases @@ -82,19 +83,23 @@ logger = logging.getLogger(__name__) # The number of times we are recalculating the current state -state_delta_counter = Counter("synapse_storage_events_state_delta", "") +state_delta_counter = Counter( + "synapse_storage_events_state_delta", "", labelnames=[SERVER_NAME_LABEL] +) # The number of times we are recalculating state when there is only a # single forward extremity state_delta_single_event_counter = Counter( - "synapse_storage_events_state_delta_single_event", "" + "synapse_storage_events_state_delta_single_event", + "", + labelnames=[SERVER_NAME_LABEL], ) # The number of times we are reculating state when we could have resonably # calculated the delta when we calculated the state for an event we were # persisting. state_delta_reuse_delta_counter = Counter( - "synapse_storage_events_state_delta_reuse_delta", "" + "synapse_storage_events_state_delta_reuse_delta", "", labelnames=[SERVER_NAME_LABEL] ) # The number of forward extremities for each new event. 
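
The mypy plugin at the start of this series enforces `server_name` in `labelnames` statically; the same invariant can be spot-checked at runtime against the default registry. A rough sketch (the helper name and skip logic are illustrative, not part of this series):

```python
from typing import List

from prometheus_client import REGISTRY


def families_missing_server_name() -> List[str]:
    """Return metric family names whose labelled samples lack `server_name`."""
    missing = set()
    for family in REGISTRY.collect():
        for sample in family.samples:
            # Unlabelled samples (e.g. the process_* collectors) are out of
            # scope; only metrics declared with labelnames matter here.
            if sample.labels and "server_name" not in sample.labels:
                missing.add(family.name)
    return sorted(missing)
```
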
@@ -115,16 +120,19 @@ state_resolutions_during_persistence = Counter( "synapse_storage_events_state_resolutions_during_persistence", "Number of times we had to do state res to calculate new current state", + labelnames=[SERVER_NAME_LABEL], ) potential_times_prune_extremities = Counter( "synapse_storage_events_potential_times_prune_extremities", "Number of times we might be able to prune extremities", + labelnames=[SERVER_NAME_LABEL], ) times_pruned_extremities = Counter( "synapse_storage_events_times_pruned_extremities", "Number of times we were actually be able to prune extremities", + labelnames=[SERVER_NAME_LABEL], ) @@ -337,6 +345,7 @@ def __init__( assert stores.persist_events self.persist_events_store = stores.persist_events + self.server_name = hs.hostname self._clock = hs.get_clock() self._instance_name = hs.get_instance_name() self.is_mine_id = hs.is_mine_id @@ -698,9 +707,11 @@ async def _calculate_new_forward_extremities_and_state_delta( if all_single_prev_not_state: return (new_forward_extremities, None) - state_delta_counter.inc() + state_delta_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc() if len(new_latest_event_ids) == 1: - state_delta_single_event_counter.inc() + state_delta_single_event_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc() # This is a fairly handwavey check to see if we could # have guessed what the delta would have been when @@ -715,7 +726,9 @@ async def _calculate_new_forward_extremities_and_state_delta( for ev, _ in ev_ctx_rm: prev_event_ids = set(ev.prev_event_ids()) if latest_event_ids == prev_event_ids: - state_delta_reuse_delta_counter.inc() + state_delta_reuse_delta_counter.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc() break logger.debug("Calculating state delta for room %s", room_id) @@ -977,7 +990,9 @@ async def _get_new_state_after_events( ), ) - state_resolutions_during_persistence.inc() + state_resolutions_during_persistence.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc() # If the returned state matches the state group of one of the new # forward extremities then we check if we are able to prune some state @@ -1005,7 +1020,9 @@ async def _prune_extremities( """See if we can prune any of the extremities after calculating the resolved state. """ - potential_times_prune_extremities.inc() + potential_times_prune_extremities.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).inc() # We keep all the extremities that have the same state group, and # see if we can drop the others. 
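
Calling `.labels()` on every increment does a lookup in the metric's child map; hot paths could instead bind the labelled child once per instance. A sketch of that alternative (class and metric names are illustrative, and this is not a change the series makes):

```python
from prometheus_client import Counter

state_delta_counter = Counter(
    "example_storage_events_state_delta", "", labelnames=["server_name"]
)


class PersistEventsController:
    def __init__(self, server_name: str) -> None:
        # `.labels()` returns a child collector; binding it here avoids the
        # per-call dict lookup when incrementing in the persistence loop.
        self._state_delta = state_delta_counter.labels(server_name=server_name)

    def on_state_delta(self) -> None:
        self._state_delta.inc()
```
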
@@ -1103,7 +1120,7 @@ async def _prune_extremities(

             return new_latest_event_ids

-        times_pruned_extremities.inc()
+        times_pruned_extremities.labels(**{SERVER_NAME_LABEL: self.server_name}).inc()

         logger.info(
             "Pruning forward extremities in room %s: from %s -> %s",

From 24f3e0ac6ed43580734cd889c79e95997a6c5995 Mon Sep 17 00:00:00 2001
From: Eric Eastwood
Date: Fri, 4 Jul 2025 17:10:31 -0500
Subject: [PATCH 40/48] Fill in `synapse/storage/databases/main/event_federation.py`

---
 synapse/storage/databases/main/event_federation.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/synapse/storage/databases/main/event_federation.py b/synapse/storage/databases/main/event_federation.py
index 46aa5902d8d..afeda220d18 100644
--- a/synapse/storage/databases/main/event_federation.py
+++ b/synapse/storage/databases/main/event_federation.py
@@ -45,6 +45,7 @@
 from synapse.api.room_versions import EventFormatVersions, RoomVersion
 from synapse.events import EventBase, make_event_from_dict
 from synapse.logging.opentracing import tag_args, trace
+from synapse.metrics import SERVER_NAME_LABEL
 from synapse.metrics.background_process_metrics import wrap_as_background_process
 from synapse.storage._base import SQLBaseStore, db_to_json, make_in_list_sql_clause
 from synapse.storage.background_updates import ForeignKeyConstraint
@@ -80,6 +81,7 @@
     "synapse_federation_server_number_inbound_pdu_pruned",
     "The number of events in the inbound federation staging that have been "
     "pruned due to the queue getting too long",
+    labelnames=[SERVER_NAME_LABEL],
 )

 logger = logging.getLogger(__name__)
@@ -137,6 +139,7 @@ def __init__(
         super().__init__(database, db_conn, hs)

         self.hs = hs
+        self.server_name = hs.hostname

         if hs.config.worker.run_background_tasks:
             hs.get_clock().looping_call(
@@ -1997,7 +2000,9 @@ async def prune_staged_events_in_room(
         if not to_delete:
             return False

-        pdus_pruned_from_federation_queue.inc(len(to_delete))
+        pdus_pruned_from_federation_queue.labels(
+            **{SERVER_NAME_LABEL: self.server_name}
+        ).inc(len(to_delete))
         logger.info(
             "Pruning %d events in room %s from federation queue",
             len(to_delete),

From 05ef5e29c21358f459dabd1c25c91ff0e9479a49 Mon Sep 17 00:00:00 2001
From: Eric Eastwood
Date: Fri, 4 Jul 2025 17:12:19 -0500
Subject: [PATCH 41/48] Fix `server_name` attribute problems with database
 classes (`EventFederationWorkerStore`)

```
synapse/storage/databases/main/__init__.py:114: error: Cannot determine type of "server_name" in base class "EventFederationWorkerStore" [misc]
```
---
 synapse/storage/_base.py                           | 1 +
 synapse/storage/databases/main/event_federation.py | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/synapse/storage/_base.py b/synapse/storage/_base.py
index 2309b1648e3..548e7df9305 100644
--- a/synapse/storage/_base.py
+++ b/synapse/storage/_base.py
@@ -55,6 +55,7 @@ def __init__(
         hs: "HomeServer",
     ):
         self.hs = hs
+        self.server_name = hs.hostname
         self._clock = hs.get_clock()
         self.database_engine = database.engine
         self.db_pool = database
diff --git a/synapse/storage/databases/main/event_federation.py b/synapse/storage/databases/main/event_federation.py
index afeda220d18..63910232aa6 100644
--- a/synapse/storage/databases/main/event_federation.py
+++ b/synapse/storage/databases/main/event_federation.py
@@ -139,7 +139,6 @@ def __init__(
         super().__init__(database, db_conn, hs)

         self.hs = hs
-        self.server_name = hs.hostname

         if hs.config.worker.run_background_tasks:
             hs.get_clock().looping_call(

From 05ecf5c9e3b6648db807ee041bd1ed67438ea1d5 Mon Sep 17 00:00:00
2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 17:23:29 -0500 Subject: [PATCH 42/48] Fill in `synapse/storage/databases/main/events.py` --- synapse/storage/databases/main/events.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py index b7cc0433e75..4143809ed7b 100644 --- a/synapse/storage/databases/main/events.py +++ b/synapse/storage/databases/main/events.py @@ -55,6 +55,7 @@ from synapse.events.snapshot import EventContext from synapse.events.utils import parse_stripped_state_event from synapse.logging.opentracing import trace +from synapse.metrics import SERVER_NAME_LABEL from synapse.storage._base import db_to_json, make_in_list_sql_clause from synapse.storage.database import ( DatabasePool, @@ -88,11 +89,13 @@ logger = logging.getLogger(__name__) -persist_event_counter = Counter("synapse_storage_events_persisted_events", "") +persist_event_counter = Counter( + "synapse_storage_events_persisted_events", "", labelnames=[SERVER_NAME_LABEL] +) event_counter = Counter( "synapse_storage_events_persisted_events_sep", "", - ["type", "origin_type", "origin_entity"], + labelnames=["type", "origin_type", "origin_entity", SERVER_NAME_LABEL], ) # State event type/key pairs that we need to gather to fill in the @@ -236,6 +239,7 @@ def __init__( db_conn: LoggingDatabaseConnection, ): self.hs = hs + self.server_name = hs.hostname self.db_pool = db self.store = main_data_store self.database_engine = db.engine @@ -356,7 +360,9 @@ async def _persist_events_and_state_updates( new_event_links=new_event_links, sliding_sync_table_changes=sliding_sync_table_changes, ) - persist_event_counter.inc(len(events_and_contexts)) + persist_event_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc( + len(events_and_contexts) + ) if not use_negative_stream_ordering: # we don't want to set the event_persisted_position to a negative @@ -374,7 +380,12 @@ async def _persist_events_and_state_updates( origin_type = "remote" origin_entity = get_domain_from_id(event.sender) - event_counter.labels(event.type, origin_type, origin_entity).inc() + event_counter.labels( + type=event.type, + origin_type=origin_type, + origin_entity=origin_entity, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() if new_forward_extremities: self.store.get_latest_event_ids_in_room.prefill( @@ -2719,7 +2730,7 @@ def _add_to_cache( txn: LoggingTransaction, events_and_contexts: List[Tuple[EventBase, EventContext]], ) -> None: - to_prefill = [] + to_prefill: List[EventCacheEntry] = [] ev_map = {e.event_id: e for e, _ in events_and_contexts} if not ev_map: From 732f605afb235079c729171f9dc9af432efe32b7 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 4 Jul 2025 17:31:17 -0500 Subject: [PATCH 43/48] Fill in `synapse/util/ratelimitutils.py` --- synapse/rest/client/register.py | 6 ++++-- synapse/server.py | 3 ++- synapse/util/ratelimitutils.py | 24 ++++++++++++++++++------ tests/util/test_ratelimitutils.py | 8 ++++---- 4 files changed, 28 insertions(+), 13 deletions(-) diff --git a/synapse/rest/client/register.py b/synapse/rest/client/register.py index 58231d2b044..13148072d78 100644 --- a/synapse/rest/client/register.py +++ b/synapse/rest/client/register.py @@ -323,10 +323,12 @@ class UsernameAvailabilityRestServlet(RestServlet): def __init__(self, hs: "HomeServer"): super().__init__() self.hs = hs + self.server_name = hs.hostname self.registration_handler = hs.get_registration_handler() self.ratelimiter = 
FederationRateLimiter( - hs.get_clock(), - FederationRatelimitSettings( + our_server_name=self.server_name, + clock=hs.get_clock(), + config=FederationRatelimitSettings( # Time window of 2s window_size=2000, # Artificially delay requests if rate > sleep_limit/window_size diff --git a/synapse/server.py b/synapse/server.py index fd16abb9ead..52b3b0d5dcd 100644 --- a/synapse/server.py +++ b/synapse/server.py @@ -844,7 +844,8 @@ def get_replication_streams(self) -> Dict[str, Stream]: @cache_in_self def get_federation_ratelimiter(self) -> FederationRateLimiter: return FederationRateLimiter( - self.get_clock(), + our_server_name=self.hostname, + clock=self.get_clock(), config=self.config.ratelimiting.rc_federation, metrics_name="federation_servlets", ) diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py index 3f067b792c4..343ee4f4549 100644 --- a/synapse/util/ratelimitutils.py +++ b/synapse/util/ratelimitutils.py @@ -52,7 +52,7 @@ run_in_background, ) from synapse.logging.opentracing import start_active_span -from synapse.metrics import Histogram, LaterGauge +from synapse.metrics import SERVER_NAME_LABEL, Histogram, LaterGauge from synapse.util import Clock if typing.TYPE_CHECKING: @@ -65,12 +65,12 @@ rate_limit_sleep_counter = Counter( "synapse_rate_limit_sleep", "Number of requests slept by the rate limiter", - ["rate_limiter_name"], + labelnames=["rate_limiter_name", SERVER_NAME_LABEL], ) rate_limit_reject_counter = Counter( "synapse_rate_limit_reject", "Number of requests rejected by the rate limiter", - ["rate_limiter_name"], + labelnames=["rate_limiter_name", SERVER_NAME_LABEL], ) queue_wait_timer = Histogram( "synapse_rate_limit_queue_wait_time_seconds", @@ -157,6 +157,7 @@ class FederationRateLimiter: def __init__( self, + our_server_name: str, clock: Clock, config: FederationRatelimitSettings, metrics_name: Optional[str] = None, @@ -174,7 +175,10 @@ def __init__( def new_limiter() -> "_PerHostRatelimiter": return _PerHostRatelimiter( - clock=clock, config=config, metrics_name=metrics_name + our_server_name=our_server_name, + clock=clock, + config=config, + metrics_name=metrics_name, ) self.ratelimiters: DefaultDict[str, "_PerHostRatelimiter"] = ( @@ -205,6 +209,7 @@ def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None] class _PerHostRatelimiter: def __init__( self, + our_server_name: str, clock: Clock, config: FederationRatelimitSettings, metrics_name: Optional[str] = None, @@ -218,6 +223,7 @@ def __init__( for this rate limiter. 
from the rest in the metrics """ + self.our_server_name = our_server_name self.clock = clock self.metrics_name = metrics_name @@ -296,7 +302,10 @@ def _on_enter(self, request_id: object) -> "defer.Deferred[None]": if self.should_reject(): logger.debug("Ratelimiter(%s): rejecting request", self.host) if self.metrics_name: - rate_limit_reject_counter.labels(self.metrics_name).inc() + rate_limit_reject_counter.labels( + rate_limiter_name=self.metrics_name, + **{SERVER_NAME_LABEL: self.our_server_name}, + ).inc() raise LimitExceededError( limiter_name="rc_federation", retry_after_ms=int(self.window_size / self.sleep_limit), @@ -333,7 +342,10 @@ def queue_request() -> "defer.Deferred[None]": self.sleep_sec, ) if self.metrics_name: - rate_limit_sleep_counter.labels(self.metrics_name).inc() + rate_limit_sleep_counter.labels( + rate_limiter_name=self.metrics_name, + **{SERVER_NAME_LABEL: self.our_server_name}, + ).inc() ret_defer = run_in_background(self.clock.sleep, self.sleep_sec) self.sleeping_requests.add(request_id) diff --git a/tests/util/test_ratelimitutils.py b/tests/util/test_ratelimitutils.py index 7bb45f9bf21..20281d04fec 100644 --- a/tests/util/test_ratelimitutils.py +++ b/tests/util/test_ratelimitutils.py @@ -37,7 +37,7 @@ def test_ratelimit(self) -> None: """A simple test with the default values""" reactor, clock = get_clock() rc_config = build_rc_config() - ratelimiter = FederationRateLimiter(clock, rc_config) + ratelimiter = FederationRateLimiter("test_server", clock, rc_config) with ratelimiter.ratelimit("testhost") as d1: # shouldn't block @@ -47,7 +47,7 @@ def test_concurrent_limit(self) -> None: """Test what happens when we hit the concurrent limit""" reactor, clock = get_clock() rc_config = build_rc_config({"rc_federation": {"concurrent": 2}}) - ratelimiter = FederationRateLimiter(clock, rc_config) + ratelimiter = FederationRateLimiter("test_server", clock, rc_config) with ratelimiter.ratelimit("testhost") as d1: # shouldn't block @@ -74,7 +74,7 @@ def test_sleep_limit(self) -> None: rc_config = build_rc_config( {"rc_federation": {"sleep_limit": 2, "sleep_delay": 500}} ) - ratelimiter = FederationRateLimiter(clock, rc_config) + ratelimiter = FederationRateLimiter("test_server", clock, rc_config) with ratelimiter.ratelimit("testhost") as d1: # shouldn't block @@ -105,7 +105,7 @@ def test_lots_of_queued_things(self) -> None: } } ) - ratelimiter = FederationRateLimiter(clock, rc_config) + ratelimiter = FederationRateLimiter("test_server", clock, rc_config) with ratelimiter.ratelimit("testhost") as d: # shouldn't block From 5fb3acb2f258bd82ffcdb6b2f5235bc49a55c613 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Mon, 7 Jul 2025 16:34:33 -0500 Subject: [PATCH 44/48] Fill in `synapse/metrics/background_process_metrics.py` TODO: still need to fill in `run_as_background_process` usage --- synapse/metrics/background_process_metrics.py | 76 +++++++++++++------ synapse/replication/tcp/protocol.py | 4 +- synapse/replication/tcp/redis.py | 2 +- .../test_background_process_metrics.py | 4 +- 4 files changed, 59 insertions(+), 27 deletions(-) diff --git a/synapse/metrics/background_process_metrics.py b/synapse/metrics/background_process_metrics.py index 49d0ff9fc19..5f9883100cc 100644 --- a/synapse/metrics/background_process_metrics.py +++ b/synapse/metrics/background_process_metrics.py @@ -49,6 +49,7 @@ PreserveLoggingContext, ) from synapse.logging.opentracing import SynapseTags, start_active_span +from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics._types import Collector 
if TYPE_CHECKING: @@ -64,13 +65,13 @@ _background_process_start_count = Counter( "synapse_background_process_start_count", "Number of background processes started", - ["name"], + labelnames=["name", SERVER_NAME_LABEL], ) _background_process_in_flight_count = Gauge( "synapse_background_process_in_flight_count", "Number of background processes in flight", - labelnames=["name"], + labelnames=["name", SERVER_NAME_LABEL], ) # we set registry=None in all of these to stop them getting registered with @@ -80,21 +81,21 @@ _background_process_ru_utime = Counter( "synapse_background_process_ru_utime_seconds", "User CPU time used by background processes, in seconds", - ["name"], + labelnames=["name", SERVER_NAME_LABEL], registry=None, ) _background_process_ru_stime = Counter( "synapse_background_process_ru_stime_seconds", "System CPU time used by background processes, in seconds", - ["name"], + labelnames=["name", SERVER_NAME_LABEL], registry=None, ) _background_process_db_txn_count = Counter( "synapse_background_process_db_txn_count", "Number of database transactions done by background processes", - ["name"], + labelnames=["name", SERVER_NAME_LABEL], registry=None, ) @@ -104,14 +105,14 @@ "Seconds spent by background processes waiting for database " "transactions, excluding scheduling time" ), - ["name"], + labelnames=["name", SERVER_NAME_LABEL], registry=None, ) _background_process_db_sched_duration = Counter( "synapse_background_process_db_sched_duration_seconds", "Seconds spent by background processes waiting for database connections", - ["name"], + labelnames=["name", SERVER_NAME_LABEL], registry=None, ) @@ -169,8 +170,9 @@ def collect(self) -> Iterable[Metric]: class _BackgroundProcess: - def __init__(self, desc: str, ctx: LoggingContext): + def __init__(self, *, desc: str, server_name: str, ctx: LoggingContext): self.desc = desc + self.server_name = server_name self._context = ctx self._reported_stats: Optional[ContextResourceUsage] = None @@ -185,15 +187,21 @@ def update_metrics(self) -> None: # For unknown reasons, the difference in times can be negative. See comment in # synapse.http.request_metrics.RequestMetrics.update_metrics. 
-        _background_process_ru_utime.labels(self.desc).inc(max(diff.ru_utime, 0))
-        _background_process_ru_stime.labels(self.desc).inc(max(diff.ru_stime, 0))
-        _background_process_db_txn_count.labels(self.desc).inc(diff.db_txn_count)
-        _background_process_db_txn_duration.labels(self.desc).inc(
-            diff.db_txn_duration_sec
-        )
-        _background_process_db_sched_duration.labels(self.desc).inc(
-            diff.db_sched_duration_sec
-        )
+        _background_process_ru_utime.labels(
+            name=self.desc, **{SERVER_NAME_LABEL: self.server_name}
+        ).inc(max(diff.ru_utime, 0))
+        _background_process_ru_stime.labels(
+            name=self.desc, **{SERVER_NAME_LABEL: self.server_name}
+        ).inc(max(diff.ru_stime, 0))
+        _background_process_db_txn_count.labels(
+            name=self.desc, **{SERVER_NAME_LABEL: self.server_name}
+        ).inc(diff.db_txn_count)
+        _background_process_db_txn_duration.labels(
+            name=self.desc, **{SERVER_NAME_LABEL: self.server_name}
+        ).inc(diff.db_txn_duration_sec)
+        _background_process_db_sched_duration.labels(
+            name=self.desc, **{SERVER_NAME_LABEL: self.server_name}
+        ).inc(diff.db_sched_duration_sec)

 R = TypeVar("R")

@@ -201,6 +209,7 @@ def update_metrics(self) -> None:

 def run_as_background_process(
     desc: "LiteralString",
+    server_name: str,
     func: Callable[..., Awaitable[Optional[R]]],
     *args: Any,
     bg_start_span: bool = True,
@@ -218,6 +227,8 @@ def run_as_background_process(

     Args:
         desc: a description for this background process type
+        server_name: The homeserver name that this background process is being run for
+            (this should be `hs.hostname`).
         func: a function, which may return a Deferred or a coroutine
         bg_start_span: Whether to start an opentracing span. Defaults to True.
             Should only be disabled for processes that will not log to or tag
@@ -236,10 +247,16 @@ async def run() -> Optional[R]:
         count = _background_process_counts.get(desc, 0)
         _background_process_counts[desc] = count + 1

-        _background_process_start_count.labels(desc).inc()
-        _background_process_in_flight_count.labels(desc).inc()
+        _background_process_start_count.labels(
+            name=desc, **{SERVER_NAME_LABEL: server_name}
+        ).inc()
+        _background_process_in_flight_count.labels(
+            name=desc, **{SERVER_NAME_LABEL: server_name}
+        ).inc()

-        with BackgroundProcessLoggingContext(desc, count) as context:
+        with BackgroundProcessLoggingContext(
+            name=desc, server_name=server_name, instance_id=count
+        ) as context:
             try:
                 if bg_start_span:
                     ctx = start_active_span(
@@ -256,7 +273,9 @@ async def run() -> Optional[R]:
                 )
                 return None
             finally:
-                _background_process_in_flight_count.labels(desc).dec()
+                _background_process_in_flight_count.labels(
+                    name=desc, **{SERVER_NAME_LABEL: server_name}
+                ).dec()

     with PreserveLoggingContext():
         # Note that we return a Deferred here so that it can be used in a
@@ -317,13 +336,20 @@ class BackgroundProcessLoggingContext(LoggingContext):

     __slots__ = ["_proc"]

-    def __init__(self, name: str, instance_id: Optional[Union[int, str]] = None):
+    def __init__(
+        self,
+        *,
+        name: str,
+        server_name: str,
+        instance_id: Optional[Union[int, str]] = None,
+    ):
         """
         Args:
             name: The name of the background process. Each distinct `name` gets a
                 separate prometheus time series.
-
+            server_name: The homeserver name that this background process is being run
+                for (this should be `hs.hostname`).
             instance_id: an identifier to add to `name` to distinguish this instance of
                 the named background process in the logs. If this is `None`, one is
                 made up based on id(self).
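One note on the bare `*` that appears in these rewritten signatures (my reading of the change, not something the commit message states): `name`/`desc` and `server_name` are both plain strings, so making the parameters keyword-only turns a transposed argument into an immediate `TypeError` rather than silently mislabelled metrics. A trimmed sketch of the idiom (hypothetical stand-in; the real `_BackgroundProcess` also takes a `ctx` logging context):

```python
class _Process:
    """Trimmed, hypothetical stand-in for _BackgroundProcess."""

    def __init__(self, *, desc: str, server_name: str) -> None:
        # Both parameters are strings; the bare "*" forces callers to name
        # them, so swapping the two cannot go unnoticed.
        self.desc = desc
        self.server_name = server_name


_Process(desc="as-sender", server_name="hs1.example.com")  # ok
# _Process("as-sender", "hs1.example.com")  # -> TypeError at the call site
```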
@@ -331,7 +357,9 @@ def __init__(self, name: str, instance_id: Optional[Union[int, str]] = None): if instance_id is None: instance_id = id(self) super().__init__("%s-%s" % (name, instance_id)) - self._proc: Optional[_BackgroundProcess] = _BackgroundProcess(name, self) + self._proc: Optional[_BackgroundProcess] = _BackgroundProcess( + name=name, server_name=server_name, ctx=self + ) def start(self, rusage: "Optional[resource.struct_rusage]") -> None: """Log context has started running (again).""" diff --git a/synapse/replication/tcp/protocol.py b/synapse/replication/tcp/protocol.py index c53d3f15b01..8fa26d8481c 100644 --- a/synapse/replication/tcp/protocol.py +++ b/synapse/replication/tcp/protocol.py @@ -171,7 +171,9 @@ def __init__( # capture the sentinel context as its containing context and won't prevent # GC of / unintentionally reactivate what would be the current context. self._logging_context = BackgroundProcessLoggingContext( - "replication-conn", self.conn_id + name="replication-conn", + server_name=self.server_name, + instance_id=self.conn_id, ) def connectionMade(self) -> None: diff --git a/synapse/replication/tcp/redis.py b/synapse/replication/tcp/redis.py index 9a2f6978c04..a67f4ce3be1 100644 --- a/synapse/replication/tcp/redis.py +++ b/synapse/replication/tcp/redis.py @@ -124,7 +124,7 @@ def __init__(self, *args: Any, **kwargs: Any): # capture the sentinel context as its containing context and won't prevent # GC of / unintentionally reactivate what would be the current context. self._logging_context = BackgroundProcessLoggingContext( - "replication_command_handler" + name="replication_command_handler", server_name=self.server_name ) def connectionMade(self) -> None: diff --git a/tests/metrics/test_background_process_metrics.py b/tests/metrics/test_background_process_metrics.py index f0f6cb29128..7253e51c38a 100644 --- a/tests/metrics/test_background_process_metrics.py +++ b/tests/metrics/test_background_process_metrics.py @@ -14,6 +14,8 @@ def test_update_metrics_with_negative_time_diff(self) -> None: mock_logging_context = Mock(spec=LoggingContext) mock_logging_context.get_resource_usage.return_value = usage - process = _BackgroundProcess("test process", mock_logging_context) + process = _BackgroundProcess( + name="test process", server_name="test_server", ctx=mock_logging_context + ) # Should not raise process.update_metrics() From b396b7c5384d958f37d57f4b92cdb10d0f12757a Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Mon, 7 Jul 2025 18:21:14 -0500 Subject: [PATCH 45/48] Fill in `synapse/metrics/background_process_metrics.py` (`run_as_background_process`) --- synapse/_scripts/update_synapse_database.py | 7 ++- synapse/appservice/scheduler.py | 60 +++++++++++-------- synapse/crypto/keyring.py | 6 +- synapse/events/auto_accept_invites.py | 2 + synapse/federation/sender/__init__.py | 5 +- .../sender/per_destination_queue.py | 1 + synapse/handlers/appservice.py | 4 +- synapse/handlers/auth.py | 1 + synapse/handlers/deactivate_account.py | 5 +- synapse/handlers/delayed_events.py | 9 ++- synapse/handlers/device.py | 4 ++ synapse/handlers/federation.py | 14 ++++- synapse/handlers/federation_event.py | 2 + synapse/handlers/message.py | 6 +- synapse/handlers/pagination.py | 7 ++- synapse/handlers/presence.py | 6 +- synapse/handlers/room_member.py | 5 +- synapse/handlers/stats.py | 3 +- synapse/handlers/typing.py | 15 ++++- synapse/handlers/user_directory.py | 12 +++- synapse/media/media_repository.py | 8 ++- synapse/media/url_previewer.py | 2 +- 
synapse/metrics/background_process_metrics.py | 2 +- synapse/metrics/common_usage_metrics.py | 6 +- synapse/module_api/__init__.py | 2 + synapse/push/emailpusher.py | 3 +- synapse/push/httppusher.py | 6 +- synapse/push/pusherpool.py | 5 +- synapse/replication/tcp/client.py | 5 +- synapse/replication/tcp/handler.py | 5 +- synapse/replication/tcp/protocol.py | 4 +- synapse/replication/tcp/redis.py | 12 +++- synapse/replication/tcp/resource.py | 4 +- synapse/rest/client/room.py | 2 + synapse/server.py | 2 +- synapse/storage/background_updates.py | 6 +- synapse/storage/controllers/persist_events.py | 6 +- synapse/storage/database.py | 2 + .../storage/databases/main/account_data.py | 1 + synapse/storage/databases/main/cache.py | 5 +- synapse/storage/databases/main/deviceinbox.py | 2 + synapse/storage/databases/main/devices.py | 1 + .../storage/databases/main/end_to_end_keys.py | 1 + .../storage/databases/main/events_worker.py | 7 ++- synapse/storage/databases/main/presence.py | 1 + synapse/storage/databases/main/push_rule.py | 1 + synapse/storage/databases/main/pusher.py | 1 + synapse/storage/databases/main/receipts.py | 1 + synapse/storage/databases/main/room.py | 1 + synapse/storage/util/id_generators.py | 7 +++ synapse/util/batching_queue.py | 9 ++- synapse/util/caches/expiringcache.py | 4 +- synapse/util/distributor.py | 12 +++- synapse/util/retryutils.py | 21 +++++-- synapse/util/task_scheduler.py | 5 +- tests/appservice/test_scheduler.py | 20 ++++++- .../test_background_process_metrics.py | 2 +- tests/test_distributor.py | 2 +- tests/util/test_batching_queue.py | 5 +- 59 files changed, 278 insertions(+), 87 deletions(-) diff --git a/synapse/_scripts/update_synapse_database.py b/synapse/_scripts/update_synapse_database.py index d8b4dbd6c64..70e55984184 100644 --- a/synapse/_scripts/update_synapse_database.py +++ b/synapse/_scripts/update_synapse_database.py @@ -53,6 +53,7 @@ def __init__(self, config: HomeServerConfig): def run_background_updates(hs: HomeServer) -> None: + server_name = hs.hostname main = hs.get_datastores().main state = hs.get_datastores().state @@ -66,7 +67,11 @@ async def run_background_updates() -> None: def run() -> None: # Apply all background updates on the database. defer.ensureDeferred( - run_as_background_process("background_updates", run_background_updates) + run_as_background_process( + "background_updates", + server_name, + run_background_updates, + ) ) reactor.callWhenRunning(run) diff --git a/synapse/appservice/scheduler.py b/synapse/appservice/scheduler.py index ab3f4e15fe9..87b8bdb2d9c 100644 --- a/synapse/appservice/scheduler.py +++ b/synapse/appservice/scheduler.py @@ -103,18 +103,15 @@ class ApplicationServiceScheduler: - """Public facing API for this module. Does the required DI to tie the - components together. This also serves as the "event_pool", which in this + """ + Public facing API for this module. Does the required dependency injection (DI) to + tie the components together. This also serves as the "event_pool", which in this case is a simple array. 
""" def __init__(self, hs: "HomeServer"): - self.clock = hs.get_clock() - self.store = hs.get_datastores().main - self.as_api = hs.get_application_service_api() - - self.txn_ctrl = _TransactionController(self.clock, self.store, self.as_api) - self.queuer = _ServiceQueuer(self.txn_ctrl, self.clock, hs) + self.txn_ctrl = _TransactionController(hs) + self.queuer = _ServiceQueuer(self.txn_ctrl, hs) async def start(self) -> None: logger.info("Starting appservice scheduler") @@ -184,9 +181,7 @@ class _ServiceQueuer: appservice at a given time. """ - def __init__( - self, txn_ctrl: "_TransactionController", clock: Clock, hs: "HomeServer" - ): + def __init__(self, txn_ctrl: "_TransactionController", hs: "HomeServer"): # dict of {service_id: [events]} self.queued_events: Dict[str, List[EventBase]] = {} # dict of {service_id: [events]} @@ -199,10 +194,11 @@ def __init__( # the appservices which currently have a transaction in flight self.requests_in_flight: Set[str] = set() self.txn_ctrl = txn_ctrl - self.clock = clock self._msc3202_transaction_extensions_enabled: bool = ( hs.config.experimental.msc3202_transaction_extensions ) + self.server_name = hs.hostname + self.clock = hs.get_clock() self._store = hs.get_datastores().main def start_background_request(self, service: ApplicationService) -> None: @@ -210,7 +206,9 @@ def start_background_request(self, service: ApplicationService) -> None: if service.id in self.requests_in_flight: return - run_as_background_process("as-sender", self._send_request, service) + run_as_background_process( + "as-sender", self.server_name, self._send_request, service + ) async def _send_request(self, service: ApplicationService) -> None: # sanity-check: we shouldn't get here if this service already has a sender @@ -359,10 +357,11 @@ class _TransactionController: (Note we have only have one of these in the homeserver.) """ - def __init__(self, clock: Clock, store: DataStore, as_api: ApplicationServiceApi): - self.clock = clock - self.store = store - self.as_api = as_api + def __init__(self, hs: "HomeServer"): + self.server_name = hs.hostname + self.clock = hs.get_clock() + self.store = hs.get_datastores().main + self.as_api = hs.get_application_service_api() # map from service id to recoverer instance self.recoverers: Dict[str, "_Recoverer"] = {} @@ -446,7 +445,12 @@ def start_recoverer(self, service: ApplicationService) -> None: logger.info("Starting recoverer for AS ID %s", service.id) assert service.id not in self.recoverers recoverer = self.RECOVERER_CLASS( - self.clock, self.store, self.as_api, service, self.on_recovered + self.server_name, + self.clock, + self.store, + self.as_api, + service, + self.on_recovered, ) self.recoverers[service.id] = recoverer recoverer.recover() @@ -477,21 +481,24 @@ class _Recoverer: We have one of these for each appservice which is currently considered DOWN. Args: - clock (synapse.util.Clock): - store (synapse.storage.DataStore): - as_api (synapse.appservice.api.ApplicationServiceApi): - service (synapse.appservice.ApplicationService): the service we are managing - callback (callable[_Recoverer]): called once the service recovers. + server_name: the homeserver name (used to label metrics) (this should be `hs.hostname`). + clock: + store: + as_api: + service: the service we are managing + callback: called once the service recovers. 
""" def __init__( self, + server_name: str, clock: Clock, store: DataStore, as_api: ApplicationServiceApi, service: ApplicationService, callback: Callable[["_Recoverer"], Awaitable[None]], ): + self.server_name = server_name self.clock = clock self.store = store self.as_api = as_api @@ -504,7 +511,11 @@ def recover(self) -> None: delay = 2**self.backoff_counter logger.info("Scheduling retries on %s in %fs", self.service.id, delay) self.scheduled_recovery = self.clock.call_later( - delay, run_as_background_process, "as-recoverer", self.retry + delay, + run_as_background_process, + "as-recoverer", + self.server_name, + self.retry, ) def _backoff(self) -> None: @@ -525,6 +536,7 @@ def force_retry(self) -> None: # Run a retry, which will resechedule a recovery if it fails. run_as_background_process( "retry", + self.server_name, self.retry, ) diff --git a/synapse/crypto/keyring.py b/synapse/crypto/keyring.py index 643d2d4e66d..3a7b327b99d 100644 --- a/synapse/crypto/keyring.py +++ b/synapse/crypto/keyring.py @@ -473,8 +473,12 @@ async def _inner_fetch_key_request( class KeyFetcher(metaclass=abc.ABCMeta): def __init__(self, hs: "HomeServer"): + self.server_name = hs.hostname self._queue = BatchingQueue( - self.__class__.__name__, hs.get_clock(), self._fetch_keys + name=self.__class__.__name__, + server_name=self.server_name, + clock=hs.get_clock(), + process_batch_callback=self._fetch_keys, ) async def get_keys( diff --git a/synapse/events/auto_accept_invites.py b/synapse/events/auto_accept_invites.py index 9e5f76f33f0..300ecfdf7a3 100644 --- a/synapse/events/auto_accept_invites.py +++ b/synapse/events/auto_accept_invites.py @@ -34,6 +34,7 @@ class InviteAutoAccepter: def __init__(self, config: AutoAcceptInvitesConfig, api: ModuleApi): # Keep a reference to the Module API. 
self._api = api + self.server_name = api.server_name self._config = config if not self._config.enabled: @@ -113,6 +114,7 @@ async def on_new_event(self, event: EventBase, *args: Any) -> None: # that occurs when responding to invites over federation (see https://github.com/matrix-org/synapse-auto-accept-invite/issues/12) run_as_background_process( "retry_make_join", + self.server_name, self._retry_make_join, event.state_key, event.state_key, diff --git a/synapse/federation/sender/__init__.py b/synapse/federation/sender/__init__.py index 0121fac175b..082b61db0a9 100644 --- a/synapse/federation/sender/__init__.py +++ b/synapse/federation/sender/__init__.py @@ -438,6 +438,7 @@ def __init__(self, hs: "HomeServer"): run_as_background_process, WAKEUP_RETRY_PERIOD_SEC * 1000.0, "wake_destinations_needing_catchup", + self.server_name, self._wake_destinations_needing_catchup, ) @@ -480,7 +481,9 @@ def notify_new_events(self, max_token: RoomStreamToken) -> None: # fire off a processing loop in the background run_as_background_process( - "process_event_queue_for_federation", self._process_event_queue_loop + "process_event_queue_for_federation", + self.server_name, + self._process_event_queue_loop, ) async def _process_event_queue_loop(self) -> None: diff --git a/synapse/federation/sender/per_destination_queue.py b/synapse/federation/sender/per_destination_queue.py index 34ba44f14f5..8f29ec34b73 100644 --- a/synapse/federation/sender/per_destination_queue.py +++ b/synapse/federation/sender/per_destination_queue.py @@ -311,6 +311,7 @@ def attempt_new_transaction(self) -> None: run_as_background_process( "federation_transaction_transmission_loop", + self.server_name, self._transaction_transmission_loop, ) diff --git a/synapse/handlers/appservice.py b/synapse/handlers/appservice.py index acc1ae63d7a..3c84b7781c7 100644 --- a/synapse/handlers/appservice.py +++ b/synapse/handlers/appservice.py @@ -167,7 +167,9 @@ async def start_scheduler() -> None: except Exception: logger.error("Application Services Failure") - run_as_background_process("as_scheduler", start_scheduler) + run_as_background_process( + "as_scheduler", self.server_name, start_scheduler + ) self.started_scheduler = True # Fork off pushes to these services diff --git a/synapse/handlers/auth.py b/synapse/handlers/auth.py index a802941340b..be25a01c9d1 100644 --- a/synapse/handlers/auth.py +++ b/synapse/handlers/auth.py @@ -248,6 +248,7 @@ def __init__(self, hs: "HomeServer"): run_as_background_process, 5 * 60 * 1000, "expire_old_sessions", + self.server_name, self._expire_old_sessions, ) diff --git a/synapse/handlers/deactivate_account.py b/synapse/handlers/deactivate_account.py index 42e53d920ab..980544074ea 100644 --- a/synapse/handlers/deactivate_account.py +++ b/synapse/handlers/deactivate_account.py @@ -40,6 +40,7 @@ class DeactivateAccountHandler: def __init__(self, hs: "HomeServer"): self.store = hs.get_datastores().main self.hs = hs + self.server_name = hs.hostname self._auth_handler = hs.get_auth_handler() self._device_handler = hs.get_device_handler() self._room_member_handler = hs.get_room_member_handler() @@ -245,7 +246,9 @@ def _start_user_parting(self) -> None: pending deactivation, if it isn't already running. 
""" if not self._user_parter_running: - run_as_background_process("user_parter_loop", self._user_parter_loop) + run_as_background_process( + "user_parter_loop", self.server_name, self._user_parter_loop + ) async def _user_parter_loop(self) -> None: """Loop that parts deactivated users from rooms""" diff --git a/synapse/handlers/delayed_events.py b/synapse/handlers/delayed_events.py index 80cb1cec9b0..a1466677699 100644 --- a/synapse/handlers/delayed_events.py +++ b/synapse/handlers/delayed_events.py @@ -54,6 +54,7 @@ class DelayedEventsHandler: def __init__(self, hs: "HomeServer"): + self.server_name = hs.hostname self._store = hs.get_datastores().main self._storage_controllers = hs.get_storage_controllers() self._config = hs.config @@ -109,12 +110,13 @@ async def _schedule_db_events() -> None: # Can send the events in background after having awaited on marking them as processed run_as_background_process( "_send_events", + self.server_name, self._send_events, events, ) self._initialized_from_db = run_as_background_process( - "_schedule_db_events", _schedule_db_events + "_schedule_db_events", self.server_name, _schedule_db_events ) else: self._repl_client = ReplicationAddedDelayedEventRestServlet.make_client(hs) @@ -139,7 +141,9 @@ async def process() -> None: finally: self._event_processing = False - run_as_background_process("delayed_events.notify_new_event", process) + run_as_background_process( + "delayed_events.notify_new_event", self.server_name, process + ) async def _unsafe_process_new_event(self) -> None: # If self._event_pos is None then means we haven't fetched it from the DB yet @@ -447,6 +451,7 @@ def _schedule_next_at(self, next_send_ts: Timestamp) -> None: delay_sec, run_as_background_process, "_send_on_timeout", + self.server_name, self._send_on_timeout, ) else: diff --git a/synapse/handlers/device.py b/synapse/handlers/device.py index c6e44dae6a3..3e6a4891bd7 100644 --- a/synapse/handlers/device.py +++ b/synapse/handlers/device.py @@ -564,6 +564,7 @@ def __init__(self, hs: "HomeServer"): run_as_background_process, DELETE_STALE_DEVICES_INTERVAL_MS, "delete_stale_devices", + self.server_name, self._delete_stale_devices, ) @@ -1213,6 +1214,7 @@ class DeviceListUpdater(DeviceListWorkerUpdater): "Handles incoming device list updates from federation and updates the DB" def __init__(self, hs: "HomeServer", device_handler: DeviceHandler): + self.server_name = hs.hostname self.store = hs.get_datastores().main self.federation = hs.get_federation_client() self.clock = hs.get_clock() @@ -1243,6 +1245,7 @@ def __init__(self, hs: "HomeServer", device_handler: DeviceHandler): self.clock.looping_call( run_as_background_process, 30 * 1000, + server_name=self.server_name, func=self._maybe_retry_device_resync, desc="_maybe_retry_device_resync", ) @@ -1364,6 +1367,7 @@ async def _handle_device_updates(self, user_id: str) -> None: await self.store.mark_remote_users_device_caches_as_stale([user_id]) run_as_background_process( "_maybe_retry_device_resync", + self.server_name, self.multi_user_device_resync, [user_id], False, diff --git a/synapse/handlers/federation.py b/synapse/handlers/federation.py index 015fb3edca0..956312ccde7 100644 --- a/synapse/handlers/federation.py +++ b/synapse/handlers/federation.py @@ -204,7 +204,9 @@ def __init__(self, hs: "HomeServer"): # were shut down. 
if not hs.config.worker.worker_app: run_as_background_process( - "resume_sync_partial_state_room", self._resume_partial_state_room_sync + "resume_sync_partial_state_room", + self.server_name, + self._resume_partial_state_room_sync, ) @trace @@ -333,6 +335,7 @@ async def _maybe_backfill_inner( ) run_as_background_process( "_maybe_backfill_inner_anyway_with_max_depth", + self.server_name, self.maybe_backfill, room_id=room_id, # We use `MAX_DEPTH` so that we find all backfill points next @@ -806,7 +809,10 @@ async def do_invite_join( # have. Hence we fire off the background task, but don't wait for it. run_as_background_process( - "handle_queued_pdus", self._handle_queued_pdus, room_queue + "handle_queued_pdus", + self.server_name, + self._handle_queued_pdus, + room_queue, ) async def do_knock( @@ -1890,7 +1896,9 @@ async def _sync_partial_state_room_wrapper() -> None: ) run_as_background_process( - desc="sync_partial_state_room", func=_sync_partial_state_room_wrapper + desc="sync_partial_state_room", + server_name=self.server_name, + func=_sync_partial_state_room_wrapper, ) async def _sync_partial_state_room( diff --git a/synapse/handlers/federation_event.py b/synapse/handlers/federation_event.py index cbdbd10cae4..136ecc808a2 100644 --- a/synapse/handlers/federation_event.py +++ b/synapse/handlers/federation_event.py @@ -940,6 +940,7 @@ async def _process_new_pulled_events(new_events: Collection[EventBase]) -> None: if len(events_with_failed_pull_attempts) > 0: run_as_background_process( "_process_new_pulled_events_with_failed_pull_attempts", + self.server_name, _process_new_pulled_events, events_with_failed_pull_attempts, ) @@ -1533,6 +1534,7 @@ async def _process_received_pdu( if resync: run_as_background_process( "resync_device_due_to_pdu", + self.server_name, self._resync_device, event.sender, ) diff --git a/synapse/handlers/message.py b/synapse/handlers/message.py index 5d6ee6996f7..23700c6ebb7 100644 --- a/synapse/handlers/message.py +++ b/synapse/handlers/message.py @@ -92,6 +92,7 @@ class MessageHandler: """Contains some read only APIs to get state about a room""" def __init__(self, hs: "HomeServer"): + self.server_name = hs.hostname self.auth = hs.get_auth() self.clock = hs.get_clock() self.state = hs.get_state_handler() @@ -107,7 +108,7 @@ def __init__(self, hs: "HomeServer"): if not hs.config.worker.worker_app: run_as_background_process( - "_schedule_next_expiry", self._schedule_next_expiry + "_schedule_next_expiry", self.server_name, self._schedule_next_expiry ) async def get_room_data( @@ -439,6 +440,7 @@ def _schedule_expiry_for_event(self, event_id: str, expiry_ts: int) -> None: delay, run_as_background_process, "_expire_event", + self.server_name, self._expire_event, event_id, ) @@ -541,6 +543,7 @@ def __init__(self, hs: "HomeServer"): self.clock.looping_call( lambda: run_as_background_process( "send_dummy_events_to_fill_extremities", + self.server_name, self._send_dummy_events_to_fill_extremities, ), 5 * 60 * 1000, @@ -1971,6 +1974,7 @@ async def persist_and_notify_client_events( # matters as sometimes presence code can take a while. 
run_as_background_process( "bump_presence_active_time", + self.server_name, self._bump_active_time, requester.user, requester.device_id, diff --git a/synapse/handlers/pagination.py b/synapse/handlers/pagination.py index 4070b74b7af..df1a7e714ce 100644 --- a/synapse/handlers/pagination.py +++ b/synapse/handlers/pagination.py @@ -79,12 +79,12 @@ class PaginationHandler: def __init__(self, hs: "HomeServer"): self.hs = hs + self.server_name = hs.hostname self.auth = hs.get_auth() self.store = hs.get_datastores().main self._storage_controllers = hs.get_storage_controllers() self._state_storage_controller = self._storage_controllers.state self.clock = hs.get_clock() - self._server_name = hs.hostname self._room_shutdown_handler = hs.get_room_shutdown_handler() self._relations_handler = hs.get_relations_handler() self._worker_locks = hs.get_worker_locks_handler() @@ -119,6 +119,7 @@ def __init__(self, hs: "HomeServer"): run_as_background_process, job.interval, "purge_history_for_rooms_in_range", + self.server_name, self.purge_history_for_rooms_in_range, job.shortest_max_lifetime, job.longest_max_lifetime, @@ -245,6 +246,7 @@ async def purge_history_for_rooms_in_range( # other purges in the same room. run_as_background_process( PURGE_HISTORY_ACTION_NAME, + self.server_name, self.purge_history, room_id, token, @@ -395,7 +397,7 @@ async def purge_room( write=True, ): # first check that we have no users in this room - joined = await self.store.is_host_joined(room_id, self._server_name) + joined = await self.store.is_host_joined(room_id, self.server_name) if joined: if force: logger.info( @@ -604,6 +606,7 @@ async def get_messages( # for a costly federation call and processing. run_as_background_process( "maybe_backfill_in_the_background", + self.server_name, self.hs.get_federation_handler().maybe_backfill, room_id, curr_topo, diff --git a/synapse/handlers/presence.py b/synapse/handlers/presence.py index bbe81dbf4f3..e71a50d6f1c 100644 --- a/synapse/handlers/presence.py +++ b/synapse/handlers/presence.py @@ -534,6 +534,7 @@ def __init__(self, hs: "HomeServer"): "shutdown", run_as_background_process, "generic_presence.on_shutdown", + self.server_name, self._on_shutdown, ) @@ -834,6 +835,7 @@ def __init__(self, hs: "HomeServer"): "shutdown", run_as_background_process, "presence.on_shutdown", + self.server_name, self._on_shutdown, ) @@ -1523,7 +1525,9 @@ async def _process_presence() -> None: finally: self._event_processing = False - run_as_background_process("presence.notify_new_event", _process_presence) + run_as_background_process( + "presence.notify_new_event", self.server_name, _process_presence + ) async def _unsafe_process(self) -> None: # Loop round handling deltas until we're up to date diff --git a/synapse/handlers/room_member.py b/synapse/handlers/room_member.py index cf9db7b0189..21ac7e8a1cd 100644 --- a/synapse/handlers/room_member.py +++ b/synapse/handlers/room_member.py @@ -2183,6 +2183,7 @@ def __init__(self, hs: "HomeServer"): super().__init__(hs) self._hs = hs + self.server_name = hs.hostname self._store = hs.get_datastores().main self._storage_controllers = hs.get_storage_controllers() self._clock = hs.get_clock() @@ -2214,7 +2215,9 @@ async def process() -> None: finally: self._is_processing = False - run_as_background_process("room_forgetter.notify_new_event", process) + run_as_background_process( + "room_forgetter.notify_new_event", self.server_name, process + ) async def _unsafe_process(self) -> None: # If self.pos is None then means we haven't fetched it from DB diff --git 
a/synapse/handlers/stats.py b/synapse/handlers/stats.py index 8f90c170602..5fd539aa270 100644 --- a/synapse/handlers/stats.py +++ b/synapse/handlers/stats.py @@ -53,6 +53,7 @@ class StatsHandler: def __init__(self, hs: "HomeServer"): self.hs = hs + self.server_name = hs.hostname self.store = hs.get_datastores().main self._storage_controllers = hs.get_storage_controllers() self.state = hs.get_state_handler() @@ -88,7 +89,7 @@ async def process() -> None: finally: self._is_processing = False - run_as_background_process("stats.notify_new_event", process) + run_as_background_process("stats.notify_new_event", self.server_name, process) async def _unsafe_process(self) -> None: # If self.pos is None then means we haven't fetched it from DB diff --git a/synapse/handlers/typing.py b/synapse/handlers/typing.py index 8d693fee304..f0bbf414e6e 100644 --- a/synapse/handlers/typing.py +++ b/synapse/handlers/typing.py @@ -80,7 +80,7 @@ class FollowerTypingHandler: def __init__(self, hs: "HomeServer"): self.store = hs.get_datastores().main self._storage_controllers = hs.get_storage_controllers() - self.server_name = hs.config.server.server_name + self.server_name = hs.hostname self.clock = hs.get_clock() self.is_mine_id = hs.is_mine_id self.is_mine_server_name = hs.is_mine_server_name @@ -143,7 +143,11 @@ def _handle_timeout_for_member(self, now: int, member: RoomMember) -> None: last_fed_poke = self._member_last_federation_poke.get(member, None) if not last_fed_poke or last_fed_poke + FEDERATION_PING_INTERVAL <= now: run_as_background_process( - "typing._push_remote", self._push_remote, member=member, typing=True + "typing._push_remote", + self.server_name, + self._push_remote, + member=member, + typing=True, ) # Add a paranoia timer to ensure that we always have a timer for @@ -216,6 +220,7 @@ def process_replication_rows( if self.federation: run_as_background_process( "_send_changes_in_typing_to_remotes", + self.server_name, self._send_changes_in_typing_to_remotes, row.room_id, prev_typing, @@ -375,7 +380,11 @@ def _push_update(self, member: RoomMember, typing: bool) -> None: if self.hs.is_mine_id(member.user_id): # Only send updates for changes to our own users. 
run_as_background_process( - "typing._push_remote", self._push_remote, member, typing + "typing._push_remote", + self.server_name, + self._push_remote, + member, + typing, ) self._push_update_local(member=member, typing=typing) diff --git a/synapse/handlers/user_directory.py b/synapse/handlers/user_directory.py index 1f692c79a02..6ac1d6eafd1 100644 --- a/synapse/handlers/user_directory.py +++ b/synapse/handlers/user_directory.py @@ -192,7 +192,9 @@ async def process() -> None: self._is_processing = False self._is_processing = True - run_as_background_process("user_directory.notify_new_event", process) + run_as_background_process( + "user_directory.notify_new_event", self.server_name, process + ) async def handle_local_profile_change( self, user_id: str, profile: ProfileInfo @@ -604,7 +606,9 @@ async def process() -> None: self._is_refreshing_remote_profiles = False self._is_refreshing_remote_profiles = True - run_as_background_process("user_directory.refresh_remote_profiles", process) + run_as_background_process( + "user_directory.refresh_remote_profiles", self.server_name, process + ) async def _unsafe_refresh_remote_profiles(self) -> None: limit = MAX_SERVERS_TO_REFRESH_PROFILES_FOR_IN_ONE_GO - len( @@ -686,7 +690,9 @@ async def process() -> None: self._is_refreshing_remote_profiles_for_servers.add(server_name) run_as_background_process( - "user_directory.refresh_remote_profiles_for_remote_server", process + "user_directory.refresh_remote_profiles_for_remote_server", + self.server_name, + process, ) async def _unsafe_refresh_remote_profiles_for_remote_server( diff --git a/synapse/media/media_repository.py b/synapse/media/media_repository.py index 1d05c536af8..afabdaa0e14 100644 --- a/synapse/media/media_repository.py +++ b/synapse/media/media_repository.py @@ -179,12 +179,16 @@ def __init__(self, hs: "HomeServer"): def _start_update_recently_accessed(self) -> Deferred: return run_as_background_process( - "update_recently_accessed_media", self._update_recently_accessed + "update_recently_accessed_media", + self.server_name, + self._update_recently_accessed, ) def _start_apply_media_retention_rules(self) -> Deferred: return run_as_background_process( - "apply_media_retention_rules", self._apply_media_retention_rules + "apply_media_retention_rules", + self.server_name, + self._apply_media_retention_rules, ) async def _update_recently_accessed(self) -> None: diff --git a/synapse/media/url_previewer.py b/synapse/media/url_previewer.py index 0c665e1942b..8c79d5aa7c2 100644 --- a/synapse/media/url_previewer.py +++ b/synapse/media/url_previewer.py @@ -739,7 +739,7 @@ async def _handle_oembed_response( def _start_expire_url_cache_data(self) -> Deferred: return run_as_background_process( - "expire_url_cache_data", self._expire_url_cache_data + "expire_url_cache_data", self.server_name, self._expire_url_cache_data ) async def _expire_url_cache_data(self) -> None: diff --git a/synapse/metrics/background_process_metrics.py b/synapse/metrics/background_process_metrics.py index 5f9883100cc..4b810a4c013 100644 --- a/synapse/metrics/background_process_metrics.py +++ b/synapse/metrics/background_process_metrics.py @@ -358,7 +358,7 @@ def __init__( instance_id = id(self) super().__init__("%s-%s" % (name, instance_id)) self._proc: Optional[_BackgroundProcess] = _BackgroundProcess( - name=name, server_name=server_name, ctx=self + desc=name, server_name=server_name, ctx=self ) def start(self, rusage: "Optional[resource.struct_rusage]") -> None: diff --git a/synapse/metrics/common_usage_metrics.py 
b/synapse/metrics/common_usage_metrics.py index 970367e9e05..ea1ffd171de 100644 --- a/synapse/metrics/common_usage_metrics.py +++ b/synapse/metrics/common_usage_metrics.py @@ -47,6 +47,7 @@ class CommonUsageMetricsManager: """Collects common usage metrics.""" def __init__(self, hs: "HomeServer") -> None: + self.server_name = hs.hostname self._store = hs.get_datastores().main self._clock = hs.get_clock() @@ -62,12 +63,15 @@ async def get_metrics(self) -> CommonUsageMetrics: async def setup(self) -> None: """Keep the gauges for common usage metrics up to date.""" run_as_background_process( - desc="common_usage_metrics_update_gauges", func=self._update_gauges + desc="common_usage_metrics_update_gauges", + server_name=self.server_name, + func=self._update_gauges, ) self._clock.looping_call( run_as_background_process, 5 * 60 * 1000, desc="common_usage_metrics_update_gauges", + server_name=self.server_name, func=self._update_gauges, ) diff --git a/synapse/module_api/__init__.py b/synapse/module_api/__init__.py index b1a2476da84..0929e3aad12 100644 --- a/synapse/module_api/__init__.py +++ b/synapse/module_api/__init__.py @@ -1333,6 +1333,7 @@ def looping_background_call( run_as_background_process, msec, desc, + self.server_name, lambda: maybe_awaitable(f(*args, **kwargs)), ) else: @@ -1390,6 +1391,7 @@ def delayed_background_call( msec * 0.001, run_as_background_process, desc, + self.server_name, lambda: maybe_awaitable(f(*args, **kwargs)), ) diff --git a/synapse/push/emailpusher.py b/synapse/push/emailpusher.py index 0a14c534f71..09ca14584a2 100644 --- a/synapse/push/emailpusher.py +++ b/synapse/push/emailpusher.py @@ -68,6 +68,7 @@ def __init__(self, hs: "HomeServer", pusher_config: PusherConfig, mailer: Mailer super().__init__(hs, pusher_config) self.mailer = mailer + self.server_name = hs.hostname self.store = self.hs.get_datastores().main self.email = pusher_config.pushkey self.timed_call: Optional[IDelayedCall] = None @@ -117,7 +118,7 @@ def _start_processing(self) -> None: if self._is_processing: return - run_as_background_process("emailpush.process", self._process) + run_as_background_process("emailpush.process", self.server_name, self._process) def _pause_processing(self) -> None: """Used by tests to temporarily pause processing of events. diff --git a/synapse/push/httppusher.py b/synapse/push/httppusher.py index 53dfb54104e..5946a6e9724 100644 --- a/synapse/push/httppusher.py +++ b/synapse/push/httppusher.py @@ -182,7 +182,9 @@ def on_new_receipts(self) -> None: # We could check the receipts are actually m.read receipts here, # but currently that's the only type of receipt anyway... 
- run_as_background_process("http_pusher.on_new_receipts", self._update_badge) + run_as_background_process( + "http_pusher.on_new_receipts", self.server_name, self._update_badge + ) async def _update_badge(self) -> None: # XXX as per https://github.com/matrix-org/matrix-doc/issues/2627, this seems @@ -217,7 +219,7 @@ def _start_processing(self) -> None: if self.failing_since and self.timed_call and self.timed_call.active(): return - run_as_background_process("httppush.process", self._process) + run_as_background_process("httppush.process", self.server_name, self._process) async def _process(self) -> None: # we should never get here if we are already processing diff --git a/synapse/push/pusherpool.py b/synapse/push/pusherpool.py index 0a7541b4c70..47685cccf94 100644 --- a/synapse/push/pusherpool.py +++ b/synapse/push/pusherpool.py @@ -65,6 +65,7 @@ class PusherPool: def __init__(self, hs: "HomeServer"): self.hs = hs + self.server_name = hs.hostname self.pusher_factory = PusherFactory(hs) self.store = self.hs.get_datastores().main self.clock = self.hs.get_clock() @@ -99,7 +100,9 @@ def start(self) -> None: if not self._should_start_pushers: logger.info("Not starting pushers because they are disabled in the config") return - run_as_background_process("start_pushers", self._start_pushers) + run_as_background_process( + "start_pushers", self.server_name, self._start_pushers + ) async def add_or_update_pusher( self, diff --git a/synapse/replication/tcp/client.py b/synapse/replication/tcp/client.py index 0bd5478cd35..34d44d002de 100644 --- a/synapse/replication/tcp/client.py +++ b/synapse/replication/tcp/client.py @@ -404,6 +404,7 @@ class FederationSenderHandler: def __init__(self, hs: "HomeServer"): assert hs.should_send_federation() + self.server_name = hs.hostname self.store = hs.get_datastores().main self._is_mine_id = hs.is_mine_id self._hs = hs @@ -494,7 +495,9 @@ async def update_token(self, token: int) -> None: # no need to queue up another task. return - run_as_background_process("_save_and_send_ack", self._save_and_send_ack) + run_as_background_process( + "_save_and_send_ack", self.server_name, self._save_and_send_ack + ) async def _save_and_send_ack(self) -> None: """Save the current federation position in the database and send an ACK diff --git a/synapse/replication/tcp/handler.py b/synapse/replication/tcp/handler.py index b015c401cfc..13caadb2843 100644 --- a/synapse/replication/tcp/handler.py +++ b/synapse/replication/tcp/handler.py @@ -335,7 +335,10 @@ def _add_command_to_stream_queue( # fire off a background process to start processing the queue. 
run_as_background_process( - "process-replication-data", self._unsafe_process_queue, stream_name + "process-replication-data", + self.server_name, + self._unsafe_process_queue, + stream_name, ) async def _unsafe_process_queue(self, stream_name: str) -> None: diff --git a/synapse/replication/tcp/protocol.py b/synapse/replication/tcp/protocol.py index 8fa26d8481c..c726baf142a 100644 --- a/synapse/replication/tcp/protocol.py +++ b/synapse/replication/tcp/protocol.py @@ -291,7 +291,9 @@ def handle_command(self, cmd: Command) -> None: if isawaitable(res): run_as_background_process( - "replication-" + cmd.get_logcontext_id(), lambda: res + "replication-" + cmd.get_logcontext_id(), + self.server_name, + lambda: res, ) handled = True diff --git a/synapse/replication/tcp/redis.py b/synapse/replication/tcp/redis.py index a67f4ce3be1..ffe9e25a0f2 100644 --- a/synapse/replication/tcp/redis.py +++ b/synapse/replication/tcp/redis.py @@ -130,7 +130,9 @@ def __init__(self, *args: Any, **kwargs: Any): def connectionMade(self) -> None: logger.info("Connected to redis") super().connectionMade() - run_as_background_process("subscribe-replication", self._send_subscribe) + run_as_background_process( + "subscribe-replication", self.server_name, self._send_subscribe + ) async def _send_subscribe(self) -> None: # it's important to make sure that we only send the REPLICATE command once we @@ -206,7 +208,7 @@ def handle_command(self, cmd: Command) -> None: if isawaitable(res): run_as_background_process( - "replication-" + cmd.get_logcontext_id(), lambda: res + "replication-" + cmd.get_logcontext_id(), self.server_name, lambda: res ) def connectionLost(self, reason: Failure) -> None: # type: ignore[override] @@ -228,7 +230,11 @@ def send_command(self, cmd: Command) -> None: cmd: The command to send """ run_as_background_process( - "send-cmd", self._async_send_command, cmd, bg_start_span=False + "send-cmd", + self.server_name, + self._async_send_command, + cmd, + bg_start_span=False, ) async def _async_send_command(self, cmd: Command) -> None: diff --git a/synapse/replication/tcp/resource.py b/synapse/replication/tcp/resource.py index d8576d2af36..e65e0fce1c3 100644 --- a/synapse/replication/tcp/resource.py +++ b/synapse/replication/tcp/resource.py @@ -147,7 +147,9 @@ def on_notifier_poke(self) -> None: logger.debug("Notifier poke loop already running") return - run_as_background_process("replication_notifier", self._run_notifier_loop) + run_as_background_process( + "replication_notifier", self.server_name, self._run_notifier_loop + ) async def _run_notifier_loop(self) -> None: self.is_looping = True diff --git a/synapse/rest/client/room.py b/synapse/rest/client/room.py index 4600a877780..9e22f0d0230 100644 --- a/synapse/rest/client/room.py +++ b/synapse/rest/client/room.py @@ -1201,6 +1201,7 @@ class RoomRedactEventRestServlet(TransactionRestServlet): def __init__(self, hs: "HomeServer"): super().__init__(hs) + self.server_name = hs.hostname self.event_creation_handler = hs.get_event_creation_handler() self.auth = hs.get_auth() self._store = hs.get_datastores().main @@ -1285,6 +1286,7 @@ async def _do( if with_relations: run_as_background_process( "redact_related_events", + self.server_name, self._relation_handler.redact_events_related_to, requester=requester, event_id=event_id, diff --git a/synapse/server.py b/synapse/server.py index 52b3b0d5dcd..0ba3142a6e8 100644 --- a/synapse/server.py +++ b/synapse/server.py @@ -422,7 +422,7 @@ def get_datastores(self) -> Databases: @cache_in_self def get_distributor(self) -> 
Distributor: - return Distributor() + return Distributor(server_name=self.hostname) @cache_in_self def get_registration_ratelimiter(self) -> Ratelimiter: diff --git a/synapse/storage/background_updates.py b/synapse/storage/background_updates.py index d170bbddaa8..acc0abee63e 100644 --- a/synapse/storage/background_updates.py +++ b/synapse/storage/background_updates.py @@ -249,6 +249,7 @@ def __init__(self, hs: "HomeServer", database: "DatabasePool"): self._clock = hs.get_clock() self.db_pool = database self.hs = hs + self.server_name = hs.hostname self._database_name = database.name() @@ -395,7 +396,10 @@ def start_doing_background_updates(self) -> None: self._all_done = False sleep = self.sleep_enabled run_as_background_process( - "background_updates", self.run_background_updates, sleep + "background_updates", + self.server_name, + self.run_background_updates, + sleep, ) async def run_background_updates(self, sleep: bool) -> None: diff --git a/synapse/storage/controllers/persist_events.py b/synapse/storage/controllers/persist_events.py index 20e6782670d..cf9507ebd4e 100644 --- a/synapse/storage/controllers/persist_events.py +++ b/synapse/storage/controllers/persist_events.py @@ -193,6 +193,7 @@ class _EventPeristenceQueue(Generic[_PersistResult]): def __init__( self, + server_name: str, per_item_callback: Callable[ [str, _EventPersistQueueTask], Awaitable[_PersistResult], @@ -203,6 +204,7 @@ def __init__( The per_item_callback will be called for each item added via add_to_queue, and its result will be returned via the Deferreds returned from add_to_queue. """ + self.server_name = server_name self._event_persist_queues: Dict[str, Deque[_EventPersistQueueItem]] = {} self._currently_persisting_rooms: Set[str] = set() self._per_item_callback = per_item_callback @@ -307,7 +309,7 @@ async def handle_queue_loop() -> None: self._currently_persisting_rooms.discard(room_id) # set handle_queue_loop off in the background - run_as_background_process("persist_events", handle_queue_loop) + run_as_background_process("persist_events", self.server_name, handle_queue_loop) def _get_drainining_queue( self, room_id: str @@ -350,7 +352,7 @@ def __init__( self._instance_name = hs.get_instance_name() self.is_mine_id = hs.is_mine_id self._event_persist_queue = _EventPeristenceQueue( - self._process_event_persist_queue_task + self.server_name, self._process_event_persist_queue_task ) self._state_resolution_handler = hs.get_state_resolution_handler() self._state_controller = state_controller diff --git a/synapse/storage/database.py b/synapse/storage/database.py index d999a4c2825..0ecfe421528 100644 --- a/synapse/storage/database.py +++ b/synapse/storage/database.py @@ -611,6 +611,7 @@ def __init__( 0.0, run_as_background_process, "upsert_safety_check", + self.server_name, self._check_safe_to_upsert, ) @@ -653,6 +654,7 @@ async def _check_safe_to_upsert(self) -> None: 15.0, run_as_background_process, "upsert_safety_check", + self.server_name, self._check_safe_to_upsert, ) diff --git a/synapse/storage/databases/main/account_data.py b/synapse/storage/databases/main/account_data.py index 715815cc091..49adb3e7caf 100644 --- a/synapse/storage/databases/main/account_data.py +++ b/synapse/storage/databases/main/account_data.py @@ -77,6 +77,7 @@ def __init__( db=database, notifier=hs.get_replication_notifier(), stream_name="account_data", + server_name=self.server_name, instance_name=self._instance_name, tables=[ ("room_account_data", "instance_name", "stream_id"), diff --git a/synapse/storage/databases/main/cache.py 
b/synapse/storage/databases/main/cache.py index 9418fb6dd75..9ebd48d26ae 100644 --- a/synapse/storage/databases/main/cache.py +++ b/synapse/storage/databases/main/cache.py @@ -105,10 +105,11 @@ def __init__( # caches to invalidate. (This reduces the amount of writes to the DB # that happen). self._cache_id_gen = MultiWriterIdGenerator( - db_conn, - database, + db_conn=db_conn, + db=database, notifier=hs.get_replication_notifier(), stream_name="caches", + server_name=self.server_name, instance_name=hs.get_instance_name(), tables=[ ( diff --git a/synapse/storage/databases/main/deviceinbox.py b/synapse/storage/databases/main/deviceinbox.py index a22eab24742..825e3148ee2 100644 --- a/synapse/storage/databases/main/deviceinbox.py +++ b/synapse/storage/databases/main/deviceinbox.py @@ -108,6 +108,7 @@ def __init__( db=database, notifier=hs.get_replication_notifier(), stream_name="to_device", + server_name=self.server_name, instance_name=self._instance_name, tables=[ ("device_inbox", "instance_name", "stream_id"), @@ -153,6 +154,7 @@ def __init__( run_as_background_process, DEVICE_FEDERATION_INBOX_CLEANUP_INTERVAL_MS, "_delete_old_federation_inbox_rows", + self.server_name, self._delete_old_federation_inbox_rows, ) diff --git a/synapse/storage/databases/main/devices.py b/synapse/storage/databases/main/devices.py index 941d278e6c7..5413d53dfb7 100644 --- a/synapse/storage/databases/main/devices.py +++ b/synapse/storage/databases/main/devices.py @@ -101,6 +101,7 @@ def __init__( db=database, notifier=hs.get_replication_notifier(), stream_name="device_lists_stream", + server_name=self.server_name, instance_name=self._instance_name, tables=[ ("device_lists_stream", "instance_name", "stream_id"), diff --git a/synapse/storage/databases/main/end_to_end_keys.py b/synapse/storage/databases/main/end_to_end_keys.py index 0700b0087b0..45697b23b5c 100644 --- a/synapse/storage/databases/main/end_to_end_keys.py +++ b/synapse/storage/databases/main/end_to_end_keys.py @@ -1555,6 +1555,7 @@ def __init__( db=database, notifier=hs.get_replication_notifier(), stream_name="e2e_cross_signing_keys", + server_name=self.server_name, instance_name=self._instance_name, tables=[ ("e2e_cross_signing_keys", "instance_name", "stream_id"), diff --git a/synapse/storage/databases/main/events_worker.py b/synapse/storage/databases/main/events_worker.py index 9cc0493307c..042e111e579 100644 --- a/synapse/storage/databases/main/events_worker.py +++ b/synapse/storage/databases/main/events_worker.py @@ -235,6 +235,7 @@ def __init__( db=database, notifier=hs.get_replication_notifier(), stream_name="events", + server_name=self.server_name, instance_name=hs.get_instance_name(), tables=[ ("events", "instance_name", "stream_ordering"), @@ -249,6 +250,7 @@ def __init__( db=database, notifier=hs.get_replication_notifier(), stream_name="backfill", + server_name=self.server_name, instance_name=hs.get_instance_name(), tables=[ ("events", "instance_name", "stream_ordering"), @@ -332,6 +334,7 @@ def get_chain_id_txn(txn: Cursor) -> int: db=database, notifier=hs.get_replication_notifier(), stream_name="un_partial_stated_event_stream", + server_name=self.server_name, instance_name=hs.get_instance_name(), tables=[("un_partial_stated_event_stream", "instance_name", "stream_id")], sequence_name="un_partial_stated_event_stream_sequence", @@ -1129,7 +1132,9 @@ def _maybe_start_fetch_thread(self) -> None: should_start = False if should_start: - run_as_background_process("fetch_events", self._fetch_thread) + run_as_background_process( + "fetch_events", 
self.server_name, self._fetch_thread + ) async def _fetch_thread(self) -> None: """Services requests for events from `_event_fetch_list`.""" diff --git a/synapse/storage/databases/main/presence.py b/synapse/storage/databases/main/presence.py index 065c8856036..ad414810aef 100644 --- a/synapse/storage/databases/main/presence.py +++ b/synapse/storage/databases/main/presence.py @@ -91,6 +91,7 @@ def __init__( db=database, notifier=hs.get_replication_notifier(), stream_name="presence_stream", + server_name=self.server_name, instance_name=self._instance_name, tables=[("presence_stream", "instance_name", "stream_id")], sequence_name="presence_stream_sequence", diff --git a/synapse/storage/databases/main/push_rule.py b/synapse/storage/databases/main/push_rule.py index 86c87f78bf1..1269d5e0d5c 100644 --- a/synapse/storage/databases/main/push_rule.py +++ b/synapse/storage/databases/main/push_rule.py @@ -146,6 +146,7 @@ def __init__( db=database, notifier=hs.get_replication_notifier(), stream_name="push_rules_stream", + server_name=self.server_name, instance_name=self._instance_name, tables=[ ("push_rules_stream", "instance_name", "stream_id"), diff --git a/synapse/storage/databases/main/pusher.py b/synapse/storage/databases/main/pusher.py index a8a37b6c85e..9a0a12b5c10 100644 --- a/synapse/storage/databases/main/pusher.py +++ b/synapse/storage/databases/main/pusher.py @@ -88,6 +88,7 @@ def __init__( db=database, notifier=hs.get_replication_notifier(), stream_name="pushers", + server_name=self.server_name, instance_name=self._instance_name, tables=[ ("pushers", "instance_name", "id"), diff --git a/synapse/storage/databases/main/receipts.py b/synapse/storage/databases/main/receipts.py index 99643315107..af1bbaae5f8 100644 --- a/synapse/storage/databases/main/receipts.py +++ b/synapse/storage/databases/main/receipts.py @@ -139,6 +139,7 @@ def __init__( db_conn=db_conn, db=database, notifier=hs.get_replication_notifier(), + server_name=self.server_name, stream_name="receipts", instance_name=self._instance_name, tables=[("receipts_linearized", "instance_name", "stream_id")], diff --git a/synapse/storage/databases/main/room.py b/synapse/storage/databases/main/room.py index 58451d3ff19..9b76210defd 100644 --- a/synapse/storage/databases/main/room.py +++ b/synapse/storage/databases/main/room.py @@ -160,6 +160,7 @@ def __init__( db=database, notifier=hs.get_replication_notifier(), stream_name="un_partial_stated_room_stream", + server_name=self.server_name, instance_name=self._instance_name, tables=[("un_partial_stated_room_stream", "instance_name", "stream_id")], sequence_name="un_partial_stated_room_stream_sequence", diff --git a/synapse/storage/util/id_generators.py b/synapse/storage/util/id_generators.py index e8588f33cf5..308a3db35ff 100644 --- a/synapse/storage/util/id_generators.py +++ b/synapse/storage/util/id_generators.py @@ -189,6 +189,8 @@ class MultiWriterIdGenerator(AbstractStreamIdGenerator): db stream_name: A name for the stream, for use in the `stream_positions` table. (Does not need to be the same as the replication stream name) + server_name: The homeserver name of the server (used to label metrics) + (this should be `hs.hostname`). instance_name: The name of this instance. tables: List of tables associated with the stream. 
Tuple of table name, column name that stores the writer's instance name, and @@ -204,10 +206,12 @@ class MultiWriterIdGenerator(AbstractStreamIdGenerator): def __init__( self, + *, db_conn: LoggingDatabaseConnection, db: DatabasePool, notifier: "ReplicationNotifier", stream_name: str, + server_name: str, instance_name: str, tables: List[Tuple[str, str, str]], sequence_name: str, @@ -217,6 +221,7 @@ def __init__( self._db = db self._notifier = notifier self._stream_name = stream_name + self.server_name = server_name self._instance_name = instance_name self._positive = positive self._writers = writers @@ -552,6 +557,7 @@ def get_next_txn(self, txn: LoggingTransaction) -> int: txn.call_after( run_as_background_process, "MultiWriterIdGenerator._update_table", + self.server_name, self._db.runInteraction, "MultiWriterIdGenerator._update_table", self._update_stream_positions_table_txn, @@ -588,6 +594,7 @@ def get_next_mult_txn(self, txn: LoggingTransaction, n: int) -> List[int]: txn.call_after( run_as_background_process, "MultiWriterIdGenerator._update_table", + self.server_name, self._db.runInteraction, "MultiWriterIdGenerator._update_table", self._update_stream_positions_table_txn, diff --git a/synapse/util/batching_queue.py b/synapse/util/batching_queue.py index 3fb697751fc..3a2d664d28d 100644 --- a/synapse/util/batching_queue.py +++ b/synapse/util/batching_queue.py @@ -85,6 +85,8 @@ class BatchingQueue(Generic[V, R]): Args: name: A name for the queue, used for logging contexts and metrics. This must be unique, otherwise the metrics will be wrong. + server_name: The homeserver name of the server (used to label metrics) + (this should be `hs.hostname`). clock: The clock to use to schedule work. process_batch_callback: The callback to to be run to process a batch of work. @@ -92,11 +94,14 @@ class BatchingQueue(Generic[V, R]): def __init__( self, + *, name: str, + server_name: str, clock: Clock, process_batch_callback: Callable[[List[V]], Awaitable[R]], ): self._name = name + self.server_name = server_name self._clock = clock # The set of keys currently being processed. @@ -135,7 +140,9 @@ async def add_to_queue(self, value: V, key: Hashable = ()) -> R: # If we're not currently processing the key fire off a background # process to start processing. if key not in self._processing_keys: - run_as_background_process(self._name, self._process_queue, key) + run_as_background_process( + self._name, self.server_name, self._process_queue, key + ) with self._number_in_flight_metric.track_inprogress(): return await make_deferred_yieldable(d) diff --git a/synapse/util/caches/expiringcache.py b/synapse/util/caches/expiringcache.py index 3198fdd2eda..6ef9f6a330c 100644 --- a/synapse/util/caches/expiringcache.py +++ b/synapse/util/caches/expiringcache.py @@ -90,7 +90,9 @@ def __init__( return def f() -> "defer.Deferred[None]": - return run_as_background_process("prune_cache", self._prune_cache) + return run_as_background_process( + "prune_cache", server_name, self._prune_cache + ) self._clock.looping_call(f, self._expiry_ms / 2) diff --git a/synapse/util/distributor.py b/synapse/util/distributor.py index 95786bd3dde..f48ae3373ce 100644 --- a/synapse/util/distributor.py +++ b/synapse/util/distributor.py @@ -58,7 +58,13 @@ class Distributor: model will do for today. """ - def __init__(self) -> None: + def __init__(self, server_name: str) -> None: + """ + Args: + server_name: The homeserver name of the server (used to label metrics) + (this should be `hs.hostname`). 
+ """ + self.server_name = server_name self.signals: Dict[str, Signal] = {} self.pre_registration: Dict[str, List[Callable]] = {} @@ -91,7 +97,9 @@ def fire(self, name: str, *args: Any, **kwargs: Any) -> None: if name not in self.signals: raise KeyError("%r does not have a signal named %s" % (self, name)) - run_as_background_process(name, self.signals[name].fire, *args, **kwargs) + run_as_background_process( + name, self.server_name, self.signals[name].fire, *args, **kwargs + ) P = ParamSpec("P") diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py index 42be1c8d28a..d414d2f01f2 100644 --- a/synapse/util/retryutils.py +++ b/synapse/util/retryutils.py @@ -60,6 +60,7 @@ def __init__(self, retry_last_ts: int, retry_interval: int, destination: str): async def get_retry_limiter( destination: str, + our_server_name: str, clock: Clock, store: DataStore, ignore_backoff: bool = False, @@ -74,6 +75,7 @@ async def get_retry_limiter( Args: destination: name of homeserver + our_server_name: Our homeserver name (used to label metrics) (`hs.hostname`) clock: timing source store: datastore ignore_backoff: true to ignore the historical backoff data and @@ -114,11 +116,12 @@ async def get_retry_limiter( backoff_on_failure = not ignore_backoff return RetryDestinationLimiter( - destination, - clock, - store, - failure_ts, - retry_interval, + destination=destination, + our_server_name=our_server_name, + clock=clock, + store=store, + failure_ts=failure_ts, + retry_interval=retry_interval, backoff_on_failure=backoff_on_failure, **kwargs, ) @@ -151,7 +154,9 @@ async def filter_destinations_by_retry_limiter( class RetryDestinationLimiter: def __init__( self, + *, destination: str, + our_server_name: str, clock: Clock, store: DataStore, failure_ts: Optional[int], @@ -169,6 +174,7 @@ def __init__( Args: destination + our_server_name: Our homeserver name (used to label metrics) (`hs.hostname`) clock store failure_ts: when this destination started failing (in ms since @@ -184,6 +190,7 @@ def __init__( backoff_on_all_error_codes: Whether we should back off on any error code. """ + self.our_server_name = our_server_name self.clock = clock self.store = store self.destination = destination @@ -318,4 +325,6 @@ async def store_retry_timings() -> None: logger.exception("Failed to store destination_retry_timings") # we deliberately do this in the background. 
- run_as_background_process("store_retry_timings", store_retry_timings) + run_as_background_process( + "store_retry_timings", self.our_server_name, store_retry_timings + ) diff --git a/synapse/util/task_scheduler.py b/synapse/util/task_scheduler.py index 5169656c73c..6789189a00e 100644 --- a/synapse/util/task_scheduler.py +++ b/synapse/util/task_scheduler.py @@ -101,6 +101,7 @@ class TaskScheduler: def __init__(self, hs: "HomeServer"): self._hs = hs + self.server_name = hs.hostname self._store = hs.get_datastores().main self._clock = hs.get_clock() self._running_tasks: Set[str] = set() @@ -354,7 +355,7 @@ async def inner() -> None: finally: self._launching_new_tasks = False - run_as_background_process("launch_scheduled_tasks", inner) + run_as_background_process("launch_scheduled_tasks", self.server_name, inner) @wrap_as_background_process("clean_scheduled_tasks") async def _clean_scheduled_tasks(self) -> None: @@ -485,4 +486,4 @@ async def wrapper() -> None: self._running_tasks.add(task.id) await self.update_task(task.id, status=TaskStatus.ACTIVE) - run_as_background_process(f"task-{task.action}", wrapper) + run_as_background_process(f"task-{task.action}", self.server_name, wrapper) diff --git a/tests/appservice/test_scheduler.py b/tests/appservice/test_scheduler.py index a5bf7e06357..10edd575bc1 100644 --- a/tests/appservice/test_scheduler.py +++ b/tests/appservice/test_scheduler.py @@ -53,11 +53,24 @@ def setUp(self) -> None: self.clock = MockClock() self.store = Mock() self.as_api = Mock() + + self.hs = Mock( + spec_set=[ + "get_datastores", + "get_clock", + "get_application_service_api", + "hostname", + ] + ) + self.hs.get_clock.return_value = self.clock + self.hs.get_datastores.return_value = Mock( + main=self.store, + ) + self.hs.get_application_service_api.return_value = self.as_api + self.recoverer = Mock() self.recoverer_fn = Mock(return_value=self.recoverer) - self.txnctrl = _TransactionController( - clock=cast(Clock, self.clock), store=self.store, as_api=self.as_api - ) + self.txnctrl = _TransactionController(self.hs) self.txnctrl.RECOVERER_CLASS = self.recoverer_fn def test_single_service_up_txn_sent(self) -> None: @@ -163,6 +176,7 @@ def setUp(self) -> None: self.service = Mock() self.callback = AsyncMock() self.recoverer = _Recoverer( + server_name="test_server", clock=cast(Clock, self.clock), as_api=self.as_api, store=self.store, diff --git a/tests/metrics/test_background_process_metrics.py b/tests/metrics/test_background_process_metrics.py index 7253e51c38a..1f47601b95b 100644 --- a/tests/metrics/test_background_process_metrics.py +++ b/tests/metrics/test_background_process_metrics.py @@ -15,7 +15,7 @@ def test_update_metrics_with_negative_time_diff(self) -> None: mock_logging_context.get_resource_usage.return_value = usage process = _BackgroundProcess( - name="test process", server_name="test_server", ctx=mock_logging_context + desc="test process", server_name="test_server", ctx=mock_logging_context ) # Should not raise process.update_metrics() diff --git a/tests/test_distributor.py b/tests/test_distributor.py index 18792fdee3f..19dafe64ed4 100644 --- a/tests/test_distributor.py +++ b/tests/test_distributor.py @@ -28,7 +28,7 @@ class DistributorTestCase(unittest.TestCase): def setUp(self) -> None: - self.dist = Distributor() + self.dist = Distributor(server_name="test_server") def test_signal_dispatch(self) -> None: self.dist.declare("alert") diff --git a/tests/util/test_batching_queue.py b/tests/util/test_batching_queue.py index 2dcf3a3412f..49ddc117255 100644 --- 
a/tests/util/test_batching_queue.py +++ b/tests/util/test_batching_queue.py @@ -50,7 +50,10 @@ def setUp(self) -> None: self._pending_calls: List[Tuple[List[str], defer.Deferred]] = [] self.queue: BatchingQueue[str, str] = BatchingQueue( - "test_queue", hs_clock, self._process_queue + name="test_queue", + server_name="test_server", + clock=hs_clock, + process_batch_callback=self._process_queue, ) async def _process_queue(self, values: List[str]) -> str: From ecb479e1c6a944c6902171d9749a6a94d57528e8 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 23 Jul 2025 14:15:08 -0500 Subject: [PATCH 46/48] Add changelog --- changelog.d/18656.misc | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/18656.misc diff --git a/changelog.d/18656.misc b/changelog.d/18656.misc new file mode 100644 index 00000000000..44323a3a81a --- /dev/null +++ b/changelog.d/18656.misc @@ -0,0 +1 @@ +Refactor `Counter` metrics to be homeserver-scoped. From 1b03330790d55521e4d742a787cc4d32adde1e2a Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 23 Jul 2025 14:27:02 -0500 Subject: [PATCH 47/48] Fix `make_request(...)` in tests --- tests/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/server.py b/tests/server.py index 0c519bc4c98..0ace329619f 100644 --- a/tests/server.py +++ b/tests/server.py @@ -432,7 +432,7 @@ def make_request( channel = FakeChannel(site, reactor, ip=client_ip) - req = request(channel, site) + req = request(channel, site, our_server_name="test_server") channel.request = req req.content = BytesIO(content) From 822935c17cebf40362c6e0709e7b1a633fb2c1ef Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Thu, 24 Jul 2025 15:36:21 -0500 Subject: [PATCH 48/48] Remove debug log --- scripts-dev/mypy_synapse_plugin.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py index 224b4562c44..2ccac97103b 100644 --- a/scripts-dev/mypy_synapse_plugin.py +++ b/scripts-dev/mypy_synapse_plugin.py @@ -71,7 +71,6 @@ def get_function_signature_hook( def get_method_signature_hook( self, fullname: str ) -> Optional[Callable[[MethodSigContext], CallableType]]: - # print(f"m fullname={fullname}") if fullname.startswith( ( "synapse.util.caches.descriptors.CachedFunction.__call__",
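Note on the convention the series converges on: the homeserver name (`hs.hostname`) is threaded as a `server_name` argument through `run_as_background_process`, `MultiWriterIdGenerator`, `BatchingQueue`, `Distributor`, and the retry limiter, so that metrics can be labelled per homeserver (changelog: "Refactor `Counter` metrics to be homeserver-scoped"), with the mypy `get_function_signature_hook` checking `prometheus_client` `Counter` declarations. A minimal sketch of the resulting pattern follows; the metric name and helper function are hypothetical illustrations, not code from the patches.

from prometheus_client import Counter

# Hypothetical metric: the convention is that every metric lists
# "server_name" in its labelnames, which is what the mypy plugin
# checks for at the declaration site.
example_requests = Counter(
    "synapse_example_requests_total",
    "An example homeserver-scoped counter (hypothetical).",
    ["server_name"],
)

def record_request(server_name: str) -> None:
    # `server_name` should be the homeserver name, i.e. `hs.hostname`,
    # mirroring the `server_name` argument threaded through
    # `run_as_background_process(desc, server_name, func, ...)` in the
    # call sites changed above.
    example_requests.labels(server_name=server_name).inc()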