diff --git a/comfy_extras/nodes_video_model.py b/comfy_extras/nodes_video_model.py
index 1a0189ed4a8..63ca0946ae5 100644
--- a/comfy_extras/nodes_video_model.py
+++ b/comfy_extras/nodes_video_model.py
@@ -33,8 +33,11 @@ def INPUT_TYPES(s):
                               "video_frames": ("INT", {"default": 14, "min": 1, "max": 4096}),
                               "motion_bucket_id": ("INT", {"default": 127, "min": 1, "max": 1023}),
                               "fps": ("INT", {"default": 6, "min": 1, "max": 1024}),
-                              "augmentation_level": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 10.0, "step": 0.01})
-                              }}
+                              "augmentation_level": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 10.0, "step": 0.01}),
+                              },
+                "optional": {"noise_seed": ("INT", {"default": 0, "min": 0, "max": 2**32 - 1})
+                             }
+                }
     RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
     RETURN_NAMES = ("positive", "negative", "latent")
 
@@ -42,13 +45,20 @@ def INPUT_TYPES(s):
 
     CATEGORY = "conditioning/video_models"
 
-    def encode(self, clip_vision, init_image, vae, width, height, video_frames, motion_bucket_id, fps, augmentation_level):
+    def encode(self, clip_vision, init_image, vae, width, height, video_frames, motion_bucket_id,
+               fps, augmentation_level, noise_seed=None):
         output = clip_vision.encode_image(init_image)
         pooled = output.image_embeds.unsqueeze(0)
         pixels = comfy.utils.common_upscale(init_image.movedim(-1,1), width, height, "bilinear", "center").movedim(1,-1)
         encode_pixels = pixels[:,:,:,:3]
         if augmentation_level > 0:
-            encode_pixels += torch.randn_like(pixels) * augmentation_level
+            # Seed a dedicated generator so reproducibility does not depend on,
+            # or clobber, torch's global RNG state.
+            if noise_seed is not None:
+                generator = torch.Generator(device="cpu").manual_seed(noise_seed)
+            else:
+                generator = None
+            encode_pixels += torch.randn(pixels.shape, generator=generator, dtype=pixels.dtype, device="cpu").to(pixels.device) * augmentation_level
         t = vae.encode(encode_pixels)
         positive = [[pooled, {"motion_bucket_id": motion_bucket_id, "fps": fps, "augmentation_level": augmentation_level, "concat_latent_image": t}]]
         negative = [[torch.zeros_like(pooled), {"motion_bucket_id": motion_bucket_id, "fps": fps, "augmentation_level": augmentation_level, "concat_latent_image": torch.zeros_like(t)}]]
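
Reviewer note: a minimal standalone sketch (illustrative only, not part of the patch; the helper name make_augmentation_noise is hypothetical) of why the second hunk seeds a dedicated torch.Generator rather than calling torch.manual_seed(), which reseeds the process-wide RNG and would perturb every subsequent unseeded sampling step:

import torch

def make_augmentation_noise(shape, noise_seed=None, augmentation_level=0.1):
    # A local generator keeps seeding self-contained: the global RNG state
    # that other nodes draw from is left untouched.
    generator = None
    if noise_seed is not None:
        generator = torch.Generator(device="cpu").manual_seed(noise_seed)
    return torch.randn(shape, generator=generator) * augmentation_level

a = make_augmentation_noise((1, 16, 16, 3), noise_seed=42)
b = make_augmentation_noise((1, 16, 16, 3), noise_seed=42)
assert torch.equal(a, b)  # same seed -> identical augmentation noise, run after run

The patched line also passes dtype=pixels.dtype and moves the CPU-generated noise to pixels.device, preserving the dtype/device matching that the removed torch.randn_like(pixels) call provided implicitly.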