From 43cf3f6ec2ae6817a647340a5f04a22c71ad0555 Mon Sep 17 00:00:00 2001
From: Harel Cain <harel@lightricks.com>
Date: Wed, 17 Apr 2024 12:46:33 +0300
Subject: [PATCH 1/3] Add optional noise_seed to make augmentation
 deterministic

---
 comfy_extras/nodes_video_model.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/comfy_extras/nodes_video_model.py b/comfy_extras/nodes_video_model.py
index 1a0189ed4a8..97792b92220 100644
--- a/comfy_extras/nodes_video_model.py
+++ b/comfy_extras/nodes_video_model.py
@@ -34,7 +34,10 @@ def INPUT_TYPES(s):
                               "motion_bucket_id": ("INT", {"default": 127, "min": 1, "max": 1023}),
                               "fps": ("INT", {"default": 6, "min": 1, "max": 1024}),
                               "augmentation_level": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 10.0, "step": 0.01})
-                             }}
+                             },
+                "optional":  { "noise_seed": ("INT", {"default": 0, "min": 0, "max": 2**32-1}),
+                             }
+                }
     RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
     RETURN_NAMES = ("positive", "negative", "latent")
 
@@ -42,12 +45,15 @@ def INPUT_TYPES(s):
 
     CATEGORY = "conditioning/video_models"
 
-    def encode(self, clip_vision, init_image, vae, width, height, video_frames, motion_bucket_id, fps, augmentation_level):
+    def encode(self, clip_vision, init_image, vae, width, height, video_frames, motion_bucket_id,
+               fps, augmentation_level, noise_seed=None):
         output = clip_vision.encode_image(init_image)
         pooled = output.image_embeds.unsqueeze(0)
         pixels = comfy.utils.common_upscale(init_image.movedim(-1,1), width, height, "bilinear", "center").movedim(1,-1)
         encode_pixels = pixels[:,:,:,:3]
         if augmentation_level > 0:
+            if noise_seed is not None:
+                torch.manual_seed(noise_seed)
             encode_pixels += torch.randn_like(pixels) * augmentation_level
         t = vae.encode(encode_pixels)
         positive = [[pooled, {"motion_bucket_id": motion_bucket_id, "fps": fps, "augmentation_level": augmentation_level, "concat_latent_image": t}]]

From 2d3c1d689f8fc35dbcc712a5a3b16361cefc535a Mon Sep 17 00:00:00 2001
From: Harel Cain <harel@lightricks.com>
Date: Thu, 18 Apr 2024 10:28:20 +0300
Subject: [PATCH 2/3] Move to using generator for the noise, making seed
 required

---
 comfy_extras/nodes_video_model.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/comfy_extras/nodes_video_model.py b/comfy_extras/nodes_video_model.py
index 97792b92220..92c5ddebf91 100644
--- a/comfy_extras/nodes_video_model.py
+++ b/comfy_extras/nodes_video_model.py
@@ -33,10 +33,9 @@ def INPUT_TYPES(s):
                               "video_frames": ("INT", {"default": 14, "min": 1, "max": 4096}),
                               "motion_bucket_id": ("INT", {"default": 127, "min": 1, "max": 1023}),
                               "fps": ("INT", {"default": 6, "min": 1, "max": 1024}),
-                              "augmentation_level": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 10.0, "step": 0.01})
-                             },
-                "optional":  { "noise_seed": ("INT", {"default": 0, "min": 0, "max": 2**32-1}),
-                             }
+                              "augmentation_level": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 10.0, "step": 0.01}),
+                              "noise_seed": ("INT", {"default": 0, "min": 0, "max": 2**32-1}),
+                    }
                 }
     RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
     RETURN_NAMES = ("positive", "negative", "latent")
@@ -46,15 +45,14 @@ def INPUT_TYPES(s):
     CATEGORY = "conditioning/video_models"
 
     def encode(self, clip_vision, init_image, vae, width, height, video_frames, motion_bucket_id,
-               fps, augmentation_level, noise_seed=None):
+               fps, augmentation_level, noise_seed):
         output = clip_vision.encode_image(init_image)
         pooled = output.image_embeds.unsqueeze(0)
         pixels = comfy.utils.common_upscale(init_image.movedim(-1,1), width, height, "bilinear", "center").movedim(1,-1)
         encode_pixels = pixels[:,:,:,:3]
         if augmentation_level > 0:
-            if noise_seed is not None:
-                torch.manual_seed(noise_seed)
-            encode_pixels += torch.randn_like(pixels) * augmentation_level
+            generator = torch.manual_seed(noise_seed)
+            encode_pixels += torch.randn(pixels.shape, generator=generator) * augmentation_level
         t = vae.encode(encode_pixels)
         positive = [[pooled, {"motion_bucket_id": motion_bucket_id, "fps": fps, "augmentation_level": augmentation_level, "concat_latent_image": t}]]
         negative = [[torch.zeros_like(pooled), {"motion_bucket_id": motion_bucket_id, "fps": fps, "augmentation_level": augmentation_level, "concat_latent_image": torch.zeros_like(t)}]]

From 4b56c3278f35ad8cd0b69401d0c1fbfea786ee9d Mon Sep 17 00:00:00 2001
From: Harel Cain <harel@lightricks.com>
Date: Sun, 21 Apr 2024 10:11:35 +0300
Subject: [PATCH 3/3] Make noise_seed optional after all

---
 comfy_extras/nodes_video_model.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/comfy_extras/nodes_video_model.py b/comfy_extras/nodes_video_model.py
index 92c5ddebf91..63ca0946ae5 100644
--- a/comfy_extras/nodes_video_model.py
+++ b/comfy_extras/nodes_video_model.py
@@ -34,8 +34,9 @@ def INPUT_TYPES(s):
                               "motion_bucket_id": ("INT", {"default": 127, "min": 1, "max": 1023}),
                               "fps": ("INT", {"default": 6, "min": 1, "max": 1024}),
                               "augmentation_level": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 10.0, "step": 0.01}),
-                              "noise_seed": ("INT", {"default": 0, "min": 0, "max": 2**32-1}),
-                    }
+                    },
+                "optional": { "noise_seed": ("INT", {"default": 0, "min": 0, "max": 2**32-1})
+                              }
                 }
     RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
     RETURN_NAMES = ("positive", "negative", "latent")
@@ -45,13 +46,16 @@ def INPUT_TYPES(s):
     CATEGORY = "conditioning/video_models"
 
     def encode(self, clip_vision, init_image, vae, width, height, video_frames, motion_bucket_id,
-               fps, augmentation_level, noise_seed):
+               fps, augmentation_level, noise_seed=None):
         output = clip_vision.encode_image(init_image)
         pooled = output.image_embeds.unsqueeze(0)
         pixels = comfy.utils.common_upscale(init_image.movedim(-1,1), width, height, "bilinear", "center").movedim(1,-1)
         encode_pixels = pixels[:,:,:,:3]
         if augmentation_level > 0:
-            generator = torch.manual_seed(noise_seed)
+            if noise_seed is not None:
+                generator = torch.manual_seed(noise_seed)
+            else:
+                generator = None
             encode_pixels += torch.randn(pixels.shape, generator=generator) * augmentation_level
         t = vae.encode(encode_pixels)
         positive = [[pooled, {"motion_bucket_id": motion_bucket_id, "fps": fps, "augmentation_level": augmentation_level, "concat_latent_image": t}]]