diff --git a/comfy/ldm/lightricks/vocoders/vocoder.py b/comfy/ldm/lightricks/vocoders/vocoder.py
index 6c4028aa89c2..2481d8bdd373 100644
--- a/comfy/ldm/lightricks/vocoders/vocoder.py
+++ b/comfy/ldm/lightricks/vocoders/vocoder.py
@@ -2,6 +2,7 @@
 import torch.nn.functional as F
 import torch.nn as nn
 import comfy.ops
+import comfy.model_management
 import numpy as np
 import math
 
@@ -81,7 +82,7 @@ def forward(self, x):
         _, C, _ = x.shape
         if self.padding:
             x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode)
-        return F.conv1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
+        return F.conv1d(x, comfy.model_management.cast_to(self.filter.expand(C, -1, -1), dtype=x.dtype, device=x.device), stride=self.stride, groups=C)
 
 
 class UpSample1d(nn.Module):
@@ -125,7 +126,7 @@ def forward(self, x):
         _, C, _ = x.shape
         x = F.pad(x, (self.pad, self.pad), mode="replicate")
         x = self.ratio * F.conv_transpose1d(
-            x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C
+            x, comfy.model_management.cast_to(self.filter.expand(C, -1, -1), dtype=x.dtype, device=x.device), stride=self.stride, groups=C
         )
         x = x[..., self.pad_left : -self.pad_right]
         return x
@@ -190,7 +191,7 @@ def __init__(
         self.eps = 1e-9
 
     def forward(self, x):
-        a = self.alpha.unsqueeze(0).unsqueeze(-1)
+        a = comfy.model_management.cast_to(self.alpha.unsqueeze(0).unsqueeze(-1), dtype=x.dtype, device=x.device)
         if self.alpha_logscale:
             a = torch.exp(a)
         return x + (1.0 / (a + self.eps)) * torch.sin(x * a).pow(2)
@@ -217,8 +218,8 @@ def __init__(
         self.eps = 1e-9
 
     def forward(self, x):
-        a = self.alpha.unsqueeze(0).unsqueeze(-1)
-        b = self.beta.unsqueeze(0).unsqueeze(-1)
+        a = comfy.model_management.cast_to(self.alpha.unsqueeze(0).unsqueeze(-1), dtype=x.dtype, device=x.device)
+        b = comfy.model_management.cast_to(self.beta.unsqueeze(0).unsqueeze(-1), dtype=x.dtype, device=x.device)
         if self.alpha_logscale:
             a = torch.exp(a)
             b = torch.exp(b)
@@ -596,7 +597,7 @@ def forward(self, y: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
             y = y.unsqueeze(1)                                # (B, 1, T)
         left_pad = max(0, self.win_length - self.hop_length)  # causal: left-only
         y = F.pad(y, (left_pad, 0))
-        spec = F.conv1d(y, self.forward_basis, stride=self.hop_length, padding=0)
+        spec = F.conv1d(y, comfy.model_management.cast_to(self.forward_basis, dtype=y.dtype, device=y.device), stride=self.hop_length, padding=0)
         n_freqs = spec.shape[1] // 2
         real, imag = spec[:, :n_freqs], spec[:, n_freqs:]
         magnitude = torch.sqrt(real ** 2 + imag ** 2)
@@ -647,7 +648,7 @@ def mel_spectrogram(
         """
         magnitude, phase = self.stft_fn(y)
         energy = torch.norm(magnitude, dim=1)
-        mel = torch.matmul(self.mel_basis.to(magnitude.dtype), magnitude)
+        mel = torch.matmul(comfy.model_management.cast_to(self.mel_basis, dtype=magnitude.dtype, device=y.device), magnitude)
         log_mel = torch.log(torch.clamp(mel, min=1e-5))
         return log_mel, magnitude, phase, energy
 
diff --git a/comfy/ops.py b/comfy/ops.py
index 3e19cd1b6842..87b36b5c5cb8 100644
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -80,6 +80,21 @@ def cast_to_input(weight, input, non_blocking=False, copy=True):
 
 
 def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compute_dtype, want_requant):
+
+    #vbar doesn't support CPU weights, but some custom nodes have weird paths
+    #that might switch the layer to the CPU and expect it to work. We have to take
+    #a clone conservatively as we are mmapped and some SFT files are packed misaligned.
+    #If you are a custom node author reading this, please move your layer to the GPU
+    #or declare your ModelPatcher as CPU in the first place.
+    if comfy.model_management.is_device_cpu(device):
+        weight = s.weight.to(dtype=dtype, copy=True)
+        if isinstance(weight, QuantizedTensor):
+            weight = weight.dequantize()
+        bias = None
+        if s.bias is not None:
+            bias = s.bias.to(dtype=bias_dtype, copy=True)
+        return weight, bias, (None, None, None)
+
     offload_stream = None
     xfer_dest = None
 
diff --git a/comfy_extras/nodes_lt.py b/comfy_extras/nodes_lt.py
index 32fe921ff40b..c055711438f8 100644
--- a/comfy_extras/nodes_lt.py
+++ b/comfy_extras/nodes_lt.py
@@ -253,10 +253,12 @@ def get_latent_index(cls, cond, latent_length, guide_length, frame_idx, scale_fa
         return frame_idx, latent_idx
 
     @classmethod
-    def add_keyframe_index(cls, cond, frame_idx, guiding_latent, scale_factors, latent_downscale_factor=1):
+    def add_keyframe_index(cls, cond, frame_idx, guiding_latent, scale_factors, latent_downscale_factor=1, causal_fix=None):
         keyframe_idxs, _ = get_keyframe_idxs(cond)
         _, latent_coords = cls.PATCHIFIER.patchify(guiding_latent)
-        pixel_coords = latent_to_pixel_coords(latent_coords, scale_factors, causal_fix=frame_idx == 0)  # we need the causal fix only if we're placing the new latents at index 0
+        if causal_fix is None:
+            causal_fix = frame_idx == 0 or guiding_latent.shape[2] == 1
+        pixel_coords = latent_to_pixel_coords(latent_coords, scale_factors, causal_fix=causal_fix)
         pixel_coords[:, 0] += frame_idx
 
         # The following adjusts keyframe end positions for small grid IC-LoRA.
@@ -278,12 +280,12 @@ def add_keyframe_index(cls, cond, frame_idx, guiding_latent, scale_factors, late
         return node_helpers.conditioning_set_values(cond, {"keyframe_idxs": keyframe_idxs})
 
     @classmethod
-    def append_keyframe(cls, positive, negative, frame_idx, latent_image, noise_mask, guiding_latent, strength, scale_factors, guide_mask=None, in_channels=128, latent_downscale_factor=1):
+    def append_keyframe(cls, positive, negative, frame_idx, latent_image, noise_mask, guiding_latent, strength, scale_factors, guide_mask=None, in_channels=128, latent_downscale_factor=1, causal_fix=None):
         if latent_image.shape[1] != in_channels or guiding_latent.shape[1] != in_channels:
             raise ValueError("Adding guide to a combined AV latent is not supported.")
 
-        positive = cls.add_keyframe_index(positive, frame_idx, guiding_latent, scale_factors, latent_downscale_factor)
-        negative = cls.add_keyframe_index(negative, frame_idx, guiding_latent, scale_factors, latent_downscale_factor)
+        positive = cls.add_keyframe_index(positive, frame_idx, guiding_latent, scale_factors, latent_downscale_factor, causal_fix=causal_fix)
+        negative = cls.add_keyframe_index(negative, frame_idx, guiding_latent, scale_factors, latent_downscale_factor, causal_fix=causal_fix)
 
         if guide_mask is not None:
             target_h = max(noise_mask.shape[3], guide_mask.shape[3])
diff --git a/comfyui_version.py b/comfyui_version.py
index e58e0fb633d6..5da21150b702 100644
--- a/comfyui_version.py
+++ b/comfyui_version.py
@@ -1,3 +1,3 @@
 # This file is automatically generated by the build process when version is
 # updated in pyproject.toml.
-__version__ = "0.16.1"
+__version__ = "0.16.3"
diff --git a/pyproject.toml b/pyproject.toml
index 199a90364020..6a83c5c6320b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "ComfyUI"
-version = "0.16.1"
+version = "0.16.3"
 readme = "README.md"
 license = { file = "LICENSE" }
 requires-python = ">=3.10"
diff --git a/requirements.txt b/requirements.txt
index 3fd44e0cf15d..9a674fac5a59 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 comfyui-frontend-package==1.39.19
-comfyui-workflow-templates==0.9.8
+comfyui-workflow-templates==0.9.10
 comfyui-embedded-docs==0.4.3
 torch
 torchsde
@@ -22,7 +22,7 @@ alembic
 SQLAlchemy
 av>=14.2.0
 comfy-kitchen>=0.2.7
-comfy-aimdo>=0.2.6
+comfy-aimdo>=0.2.7
 requests
 
 #non essential dependencies: