From 906c0899575a83ac69bb095e835fdec748891da4 Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Wed, 29 Oct 2025 16:29:01 -0700
Subject: [PATCH 1/2] Fix small performance regression with fp8 fast and scaled
 fp8. (#10537)

---
 comfy/ops.py       | 6 +++++-
 comfy/quant_ops.py | 5 +++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/comfy/ops.py b/comfy/ops.py
index 71ca7a2bd11e..18f6b804bed6 100644
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -421,14 +421,18 @@ def fp8_linear(self, input):
 
         if scale_input is None:
             scale_input = torch.ones((), device=input.device, dtype=torch.float32)
+            input = torch.clamp(input, min=-448, max=448, out=input)
+            input = input.reshape(-1, input_shape[2]).to(dtype).contiguous()
+            layout_params_weight = {'scale': scale_input, 'orig_dtype': input_dtype}
+            quantized_input = QuantizedTensor(input.reshape(-1, input_shape[2]).to(dtype).contiguous(), TensorCoreFP8Layout, layout_params_weight)
         else:
             scale_input = scale_input.to(input.device)
+            quantized_input = QuantizedTensor.from_float(input.reshape(-1, input_shape[2]), TensorCoreFP8Layout, scale=scale_input, dtype=dtype)
 
         # Wrap weight in QuantizedTensor - this enables unified dispatch
         # Call F.linear - __torch_dispatch__ routes to fp8_linear handler in quant_ops.py!
         layout_params_weight = {'scale': scale_weight, 'orig_dtype': input_dtype}
         quantized_weight = QuantizedTensor(w, TensorCoreFP8Layout, layout_params_weight)
-        quantized_input = QuantizedTensor.from_float(input.reshape(-1, input_shape[2]), TensorCoreFP8Layout, scale=scale_input, dtype=dtype)
         o = torch.nn.functional.linear(quantized_input, quantized_weight, bias)
 
         uncast_bias_weight(self, w, bias, offload_stream)
diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py
index fb35a0d40b43..c822fe53cc37 100644
--- a/comfy/quant_ops.py
+++ b/comfy/quant_ops.py
@@ -357,9 +357,10 @@ def quantize(cls, tensor, scale=None, dtype=torch.float8_e4m3fn):
             scale = torch.tensor(scale)
         scale = scale.to(device=tensor.device, dtype=torch.float32)
 
-        lp_amax = torch.finfo(dtype).max
         tensor_scaled = tensor * (1.0 / scale).to(tensor.dtype)
-        torch.clamp(tensor_scaled, min=-lp_amax, max=lp_amax, out=tensor_scaled)
+        # TODO: re-enable this clamp if it is actually needed; it is commented out because it has a small performance penalty.
+        # lp_amax = torch.finfo(dtype).max
+        # torch.clamp(tensor_scaled, min=-lp_amax, max=lp_amax, out=tensor_scaled)
         qdata = tensor_scaled.to(dtype, memory_format=torch.contiguous_format)
 
         layout_params = {

From 998bf60bebd03e57a55e106434657849342b733f Mon Sep 17 00:00:00 2001
From: Jedrzej Kosinski <kosinkadink1@gmail.com>
Date: Wed, 29 Oct 2025 16:37:06 -0700
Subject: [PATCH 2/2] Add units/info for the numbers displayed on 'load
 completely' and 'load partially' log messages (#10538)

---
 comfy/model_patcher.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py
index 74b9e48bc241..ed3f3f5cbd28 100644
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -655,6 +655,7 @@ def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False
             mem_counter = 0
             patch_counter = 0
             lowvram_counter = 0
+            lowvram_mem_counter = 0
             loading = self._load_list()
 
             load_completely = []
@@ -675,6 +676,7 @@ def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False
                     if mem_counter + module_mem >= lowvram_model_memory:
                         lowvram_weight = True
                         lowvram_counter += 1
+                        lowvram_mem_counter += module_mem
                         if hasattr(m, "prev_comfy_cast_weights"): #Already lowvramed
                             continue
 
@@ -748,10 +750,10 @@ def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False
                     self.pin_weight_to_device("{}.{}".format(n, param))
 
             if lowvram_counter > 0:
-                logging.info("loaded partially {} {} {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), patch_counter))
+                logging.info("loaded partially; {:.2f} MB usable, {:.2f} MB loaded, {:.2f} MB offloaded, lowvram patches: {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), lowvram_mem_counter / (1024 * 1024), patch_counter))
                 self.model.model_lowvram = True
             else:
-                logging.info("loaded completely {} {} {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), full_load))
+                logging.info("loaded completely; {:.2f} MB usable, {:.2f} MB loaded, full load: {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), full_load))
                 self.model.model_lowvram = False
                 if full_load:
                     self.model.to(device_to)