In [1]:
%load_ext autoreload
%autoreload 2

import argparse
import os
import math
import yaml
import logging
import random
import numpy as np
import sys
import imageio

import torch

sys.path.append("/scratch/ondemand28/harryscz/diffusion")

In [2]:
def parse_args(arg_list=None):
    parser = argparse.ArgumentParser(
        description="Unconditioned Video Diffusion Inference"
    )
    parser.add_argument(
        "--dataset-path", type=str, required=True,
        help="Directory containing input reference videos."
    )
    parser.add_argument(
        "--pretrained-model-name-or-path", type=str, required=True,
        help="Path or HF ID where transformer/vae/scheduler are stored."
    )
    parser.add_argument(
        "--checkpoint-path", type=str, required=True,
        help="Path to fine‐tuned checkpoint containing transformer state_dict."
    )
    parser.add_argument(
        "--output-dir", type=str, required=True,
        help="Where to write generated videos."
    )
    parser.add_argument(
        "--model-config", type=str, required=True,
        help="YAML file describing model params (height, width, num_reference, num_target, etc.)"
    )
    parser.add_argument(
        "--batch-size", type=int, default=1,
        help="Batch size per device (usually 1 for inference)."
    )
    parser.add_argument(
        "--num-inference-steps", type=int, default=50,
        help="Number of reverse diffusion steps to run."
    )
    parser.add_argument(
        "--mixed-precision", type=str, default="bf16",
        help="Whether to run backbone in 'fp16', 'bf16', or 'fp32'."
    )
    parser.add_argument(
        "--seed", type=int, default=42,
        help="Random seed for reproducibility."
    )
    parser.add_argument(
        "--shuffle", type=int, default=False,
        help="Whether to shuffle dataset. Usually False for inference."
    )
    parser.add_argument(
        "--is-uncond", type=bool, default=False,
        help=""
    )

    # If arg_list is None, argparse picks up sys.argv; 
    # otherwise it treats arg_list as the full argv list.
    return parser.parse_args(arg_list)

args = [
    "--dataset-path", "/scratch/ondemand28/harryscz/head_audio/data/data256/uv",
    "--pretrained-model-name-or-path", "/scratch/ondemand28/harryscz/model/CogVideoX-2b",
    "--checkpoint-path",  "/scratch/ondemand28/harryscz/head_audio/trainOutput/checkpoint-1000.pt",
    "--output-dir",  "/scratch/ondemand28/harryscz/diffusion/videoOut",
    "--model-config",  "/scratch/ondemand28/harryscz/diffusion/model_config.yaml",
    "--batch-size",  "1",
    "--num-inference-steps",  "50",
    "--mixed-precision",  "no",
    "--seed",  "42",
    "--shuffle",  "0",
]

args = parse_args(args)

with open(args.model_config, "r") as f: model_config = yaml.safe_load(f)


In [3]:
from accelerate import Accelerator
from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
from accelerate.logging import get_logger

with open(args.model_config, "r") as f: model_config = yaml.safe_load(f)
if args.mixed_precision.lower() == "fp16":
    dtype = torch.float16
elif args.mixed_precision.lower() == "bf16":
    dtype = torch.bfloat16
else:
    dtype = torch.float32

accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir,
                                                    logging_dir=os.path.join(args.output_dir, "logs"))
ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=False)
accelerator = Accelerator(mixed_precision=args.mixed_precision,
                            project_config=accelerator_project_config,
                            kwargs_handlers=[ddp_kwargs])

# 2.4 Set random seed
if args.seed is not None:
    set_seed(args.seed + accelerator.process_index)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

logger = get_logger(__name__)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger.info("Accelerator state:", accelerator.state)

--- Logging error ---
Traceback (most recent call last):
  File "/scratch/ondemand28/harryscz/anaconda3/envs/pytorch3d/lib/python3.9/logging/__init__.py", line 1083, in emit
    msg = self.format(record)
  File "/scratch/ondemand28/harryscz/anaconda3/envs/pytorch3d/lib/python3.9/logging/__init__.py", line 927, in format
    return fmt.format(record)
  File "/scratch/ondemand28/harryscz/anaconda3/envs/pytorch3d/lib/python3.9/logging/__init__.py", line 663, in format
    record.message = record.getMessage()
  File "/scratch/ondemand28/harryscz/anaconda3/envs/pytorch3d/lib/python3.9/logging/__init__.py", line 367, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/scratch/ondemand28/harryscz/anaconda3/envs/pytorch3d/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/scratch/ondemand28/harryscz/anaconda3/envs/pytorch3d/lib/python3.9/runpy.py", line 87, in

In [4]:
#### Dataset #####
# Video data have shape [B, C, F, H, W]

from data.CustomDataset import VideoDataset 
from torch.utils.data import DataLoader, DistributedSampler

dataset = VideoDataset(
    videos_dir=args.dataset_path,
    num_ref_frames=1,
    num_target_frames=49
)
if args.shuffle:
    sampler = DistributedSampler(
        dataset,
        num_replicas=accelerator.num_processes,
        rank=accelerator.process_index,
        shuffle=True
    )
else:
    sampler = None
data_loader = DataLoader(
    dataset,
    batch_size=args.batch_size,
    # sampler=sampler,
    collate_fn=lambda x: x[0],   # since dataset returns already‐batched items
    num_workers=2,
    pin_memory=True,
)
logger.info(f"Number of test examples: {len(data_loader)}")

06/07/2025 18:55:06 - INFO - __main__ - Number of test examples: 10


In [5]:
#### Load Model ####
device = "cuda"
dtype = torch.float32

from diffusers import AutoencoderKLCogVideoX, CogVideoXDPMScheduler
from cap_transformer import CAPVideoXTransformer3DModel

transformer = CAPVideoXTransformer3DModel.from_pretrained(
    args.pretrained_model_name_or_path,
    low_cpu_mem_usage=False,
    device_map=None,
    ignore_mismatched_sizes=True,
    subfolder="transformer",
    torch_dtype=torch.float32,
    cond_in_channels=1,  # only one channel (the ref_mask)
    sample_width=model_config["width"] // 8,
    sample_height=model_config["height"] // 8,
    max_text_seq_length=1,
    max_n_references=model_config["max_n_references"],
    apply_attention_scaling=model_config["use_growth_scaling"],
    use_rotary_positional_embeddings=False,
)

vae = AutoencoderKLCogVideoX.from_pretrained(
    args.pretrained_model_name_or_path, subfolder="vae"
)
scheduler = CogVideoXDPMScheduler.from_pretrained(
    args.pretrained_model_name_or_path, subfolder="scheduler",
)

vae.eval().to(dtype)
transformer.eval().to(dtype)

vae, transformer, scheduler, data_loader = accelerator.prepare(vae, transformer, scheduler, data_loader)

Some weights of the model checkpoint at /scratch/ondemand28/harryscz/model/CogVideoX-2b were not used when initializing CAPVideoXTransformer3DModel: 
 ['patch_embed.text_proj.weight, patch_embed.text_proj.bias']
Some weights of CAPVideoXTransformer3DModel were not initialized from the model checkpoint at /scratch/ondemand28/harryscz/model/CogVideoX-2b and are newly initialized: ['patch_embed.cond_proj.bias', 'patch_embed.audio_proj.weight', 'patch_embed.audio_proj.bias', 'patch_embed.cond_proj.weight', 'patch_embed.ref_temp_proj.bias', 'patch_embed.ref_temp_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# ckpt = torch.load(args.checkpoint_path, map_location="cpu")
# if "state_dict" in ckpt:
#     raw_state_dict = ckpt["state_dict"]
# elif "model_state_dict" in ckpt:
#     raw_state_dict = ckpt["model_state_dict"]
# else:
#     # If the .pt is literally just a pure state_dict, do this:
#     raw_state_dict = ckpt

# clean_state_dict = {}
# for key, val in raw_state_dict.items():
#     new_key = key
#     # e.g. if your keys start with "module.", remove it:
#     if key.startswith("module."):
#         new_key = key[len("module."):]
#     # or if saved under "model.", do:
#     # if key.startswith("model."):
#     #     new_key = key[len("model."):]
#     clean_state_dict[new_key] = val
# missing, unexpected = transformer.load_state_dict(clean_state_dict, strict=False)

# print("==> Missing keys (these will be randomly initialized because they weren't in the checkpoint):")
# for k in missing:
#     print("   ", k)
# print("==> Unexpected keys (these were in the checkpoint but didn't match any parameter in your model):")
# for k in unexpected:
#     print("   ", k)

In [7]:
for batch_id, batch in enumerate(data_loader):
    print(batch.keys())
    for chunk_id in range(len(batch["video_chunks"])):
        print(batch["video_chunks"][chunk_id].shape)
        print(batch["cond_chunks"].keys())  # Mask for each frames over H and W and channel suggesting which one works as a condition
                                            # list of tensor masks for cond chunks
        print(batch['cond_chunks']['ref_mask'][0].shape)
        print(batch["chunk_is_ref"]) # list of length frame of bool saying which on is a condition 
        print(batch["raw_audio"]) # passed as none
    break

dict_keys(['video_chunks', 'cond_chunks', 'chunk_is_ref', 'raw_audio'])
torch.Size([1, 3, 50, 256, 256])
dict_keys(['ref_mask'])
torch.Size([1, 50, 256, 256, 3])
[tensor([False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False],
       device='cuda:0')]
None


In [93]:
class BatchVideoPipeline(DiffusionPipeline):
    """
    A custom diffusion pipeline that mirrors your manual inference loop,
    but inherits from DiffusionPipeline to leverage no-grad, mixed-precision,
    and buffer reuse for maximum efficiency.
    """
    def __init__(
        self,
        vae,
        transformer,
        scheduler,
    ):
        super().__init__()
        self.register_modules(vae=vae, transformer=transformer, scheduler=scheduler)

        # Scale factors for spatial/temporal axes
        self.vae_scale_factor_spatial = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.vae_scale_factor_temporal = getattr(self.vae.config, "temporal_compression_ratio", 1)

        # Video post-processor
        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)

    @torch.no_grad()
    def __call__(
        self,
        batch: Dict[str, Union[List[torch.FloatTensor], Dict[str, List[torch.FloatTensor]]]],
        num_inference_steps: int = 50,
        guidance_scale: float = 1.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        output_type: str = "pil",
        return_dict: bool = False
    ) -> Union[List, Dict]:
        device = self._execution_device
        dtype = self.transformer.dtype

        # 1) Extract & encode
        latent_chunks: List[torch.Tensor] = []
        ref_mask_chunks: List[torch.Tensor] = []
        sequence_infos: List[tuple] = []

        for i, video in enumerate(batch["video_chunks"]):
            # video: [B, C, F, H, W]
            video = video.to(device=device, dtype=dtype)
            dist = self.vae.encode(video).latent_dist.sample()
            latent = dist * self.vae.config.scaling_factor
            latent = latent.permute(0, 2, 1, 3, 4).contiguous()  # [B, F, C_z, h, w]
            latent_chunks.append(latent)

            # mask: batch["cond_chunks"]["ref_mask"][i] shape [B, F, H, W, C_mask]
            rm = batch["cond_chunks"]["ref_mask"][i]
            rm = rm.to(device=device, dtype=dtype).permute(0, 4, 1, 2, 3)
            # now [B, C_mask, F, H, W]
            ref_mask_chunks.append(rm)

            # sequence info
            is_ref = batch.get("chunk_is_ref", [False] * len(latent_chunks))[i]
            seq = torch.arange(0, latent.shape[1], device=device)
            sequence_infos.append((is_ref, seq))

        # 2) Build 2× for classifier-free guidance
        latents = latent_chunks
        masks   = [torch.cat([m, torch.zeros_like(m)], dim=0) for m in ref_mask_chunks]
        # keep ref_latents for mixing
        ref_latents = [torch.cat([z, torch.zeros_like(z)], dim=0) for z in latent_chunks]

        # 3) dummy audio/text embeddings (adjust if you have real ones)
        B2 = latents[0].shape[0] * 2
        total_F = sum(z.shape[1] for z in latents)
        audio_embeds = torch.zeros((B2, total_F, 768), dtype=dtype, device=device)
        text_embeds  = torch.zeros((B2, 1,
            self.transformer.config.attention_head_dim * self.transformer.config.num_attention_heads
        ), dtype=dtype, device=device)

        # 4) timesteps
        timesteps, _ = retrieve_timesteps(self.scheduler, num_inference_steps, device=device)

        # 5) optional fuse QKV once
        # try:
        #     self.transformer.fuse_qkv_projections()
        # except Exception:
        #     pass

        # 6) denoising loop
        old_pred_original_samples = [None] * len(latents)
        for i, t in enumerate(timesteps):
            latent_model_inputs = [torch.cat([chunks] * 2, dim=0)for chunks in latents]
            B2, F, C, H, W = latent_model_inputs[0].shape
            # one zero condition tensor
            zero_cond = torch.zeros((B2, F, 1, H, W), dtype=dtype, device=device)

            # single forward
            noise_preds = self.transformer(
                hidden_states=latent_model_inputs,
                encoder_hidden_states=text_embeds,
                audio_embeds=audio_embeds,
                condition=[zero_cond] * len(latent_model_inputs),
                sequence_infos=[[False, torch.arange(chunk.shape[1])]for chunk in latents],
                timestep=t.expand(B2),
                image_rotary_emb=None,
                return_dict=False,
            )[0]

            print(latent_model_inputs[0].shape)
            print(noise_preds[0].shape)

            # apply guidance, scheduler.step, then mixing
            new_latents = []
            new_old_pred_original_samples = []

            for noise_pred, old_pred_original_sample, latent in zip(noise_preds, old_pred_original_samples, latent_chunks):
                print(old_pred_original_samples)
                noise_pred, noise_pred_uncond = noise_pred.chunk(2, dim=0)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond)
            
                latent, old_pred_original_sample = scheduler.step(
                    noise_pred,
                    old_pred_original_sample,
                    t,
                    timesteps[i - 1] if i > 0 else None,
                    latent,
                    eta=0.0,
                    generator=generator
                )

                new_latents.append(latent)
                new_old_pred_original_samples.append(old_pred_original_sample)

            latents = new_latents
            old_pred_original_samples = list(new_old_pred_original_samples)

        # 7) decode to videos
        videos = []
        for latent in latents:
            dec = latent.permute(0, 2, 1, 3, 4) / self.vae.config.scaling_factor
            frames = self.vae.decode(dec).sample
            video = self.video_processor.postprocess_video(video=frames, output_type=output_type)
            videos.append(video)

        return {"frames": videos} if return_dict else videos

In [94]:
generator = torch.Generator(device=device)
generator.manual_seed(args.seed)
pipe = BatchVideoPipeline(vae, transformer, scheduler)
pipe = pipe.to(device).to(torch.float32)   
batch = next(iter(data_loader))

In [95]:
videos = pipe(batch, num_inference_steps=50, guidance_scale=1, generator=generator)

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])
[None]


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 1.2492,  1.2844,  1.2492,  ...,  1.2844,  1.2492,  1.2844],
           [ 0.9349,  0.9173,  0.9349,  ...,  0.9173,  0.9349,  0.9173],
           [ 1.2492,  1.2844,  1.2492,  ...,  1.2844,  1.2492,  1.2844],
           ...,
           [ 0.9349,  0.9173,  0.9349,  ...,  0.9173,  0.9349,  0.9173],
           [ 1.2492,  1.2844,  1.2492,  ...,  1.2844,  1.2492,  1.2844],
           [ 0.9349,  0.9173,  0.9349,  ...,  0.9173,  0.9349,  0.9173]],

          [[-3.8164, -3.9071, -3.8164,  ..., -3.9071, -3.8164, -3.9071],
           [-3.4712, -3.6985, -3.4712,  ..., -3.6985, -3.4712, -3.6985],
           [-3.8164, -3.9071, -3.8164,  ..., -3.9071, -3.8164, -3.9071],
           ...,
           [-3.4712, -3.6985, -3.4712,  ..., -3.6985, -3.4712, -3.6985],
           [-3.8164, -3.9071, -3.8164,  ..., -3.9071, -3.8164, -3.9071],
           [-3.4712, -3.6985, -3.4712,  ..., -3.6985, -3.4712, -3.6985]],

          [[ 3.5069,  2.9552,  3.5069,  ...,  2.9552,  3.5069,  2.9552],
           [ 3

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 1.3090,  1.3484,  1.3086,  ...,  1.3486,  1.3080,  1.3486],
           [ 0.9685,  0.9545,  0.9677,  ...,  0.9545,  0.9673,  0.9546],
           [ 1.3088,  1.3483,  1.3083,  ...,  1.3483,  1.3078,  1.3481],
           ...,
           [ 0.9652,  0.9517,  0.9650,  ...,  0.9518,  0.9649,  0.9522],
           [ 1.3053,  1.3453,  1.3053,  ...,  1.3451,  1.3049,  1.3455],
           [ 0.9652,  0.9519,  0.9652,  ...,  0.9517,  0.9649,  0.9522]],

          [[-3.8282, -3.9046, -3.8268,  ..., -3.9044, -3.8263, -3.9083],
           [-3.4778, -3.7185, -3.4766,  ..., -3.7141, -3.4786, -3.7156],
           [-3.8249, -3.9069, -3.8232,  ..., -3.9077, -3.8242, -3.9086],
           ...,
           [-3.4776, -3.7159, -3.4796,  ..., -3.7148, -3.4776, -3.7129],
           [-3.8242, -3.9075, -3.8248,  ..., -3.9079, -3.8228, -3.9058],
           [-3.4745, -3.7159, -3.4785,  ..., -3.7146, -3.4761, -3.7135]],

          [[ 3.5245,  2.9739,  3.5236,  ...,  2.9682,  3.5273,  2.9672],
           [ 3

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 1.3484,  1.4001,  1.3474,  ...,  1.4004,  1.3463,  1.4005],
           [ 0.9831,  0.9653,  0.9813,  ...,  0.9652,  0.9806,  0.9655],
           [ 1.3479,  1.3998,  1.3468,  ...,  1.3997,  1.3459,  1.3994],
           ...,
           [ 0.9762,  0.9594,  0.9759,  ...,  0.9596,  0.9755,  0.9605],
           [ 1.3406,  1.3935,  1.3407,  ...,  1.3930,  1.3397,  1.3940],
           [ 0.9761,  0.9596,  0.9762,  ...,  0.9593,  0.9756,  0.9604]],

          [[-3.8156, -3.8788, -3.8126,  ..., -3.8784, -3.8116, -3.8866],
           [-3.4662, -3.6974, -3.4637,  ..., -3.6882, -3.4678, -3.6913],
           [-3.8087, -3.8836, -3.8052,  ..., -3.8854, -3.8073, -3.8871],
           ...,
           [-3.4658, -3.6919, -3.4698,  ..., -3.6895, -3.4658, -3.6857],
           [-3.8072, -3.8848, -3.8085,  ..., -3.8857, -3.8043, -3.8814],
           [-3.4591, -3.6920, -3.4677,  ..., -3.6892, -3.4625, -3.6868]],

          [[ 3.5160,  2.9881,  3.5141,  ...,  2.9762,  3.5217,  2.9742],
           [ 3

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 1.3804,  1.4386,  1.3789,  ...,  1.4391,  1.3772,  1.4393],
           [ 1.0007,  0.9818,  0.9979,  ...,  0.9817,  0.9968,  0.9821],
           [ 1.3796,  1.4383,  1.3779,  ...,  1.4381,  1.3764,  1.4376],
           ...,
           [ 0.9898,  0.9725,  0.9893,  ...,  0.9730,  0.9887,  0.9743],
           [ 1.3682,  1.4284,  1.3683,  ...,  1.4276,  1.3668,  1.4291],
           [ 0.9897,  0.9730,  0.9899,  ...,  0.9725,  0.9889,  0.9742]],

          [[-3.7973, -3.8511, -3.7926,  ..., -3.8506, -3.7910, -3.8635],
           [-3.4473, -3.6730, -3.4433,  ..., -3.6586, -3.4498, -3.6635],
           [-3.7865, -3.8587, -3.7810,  ..., -3.8615, -3.7842, -3.8642],
           ...,
           [-3.4466, -3.6645, -3.4530,  ..., -3.6607, -3.4466, -3.6548],
           [-3.7841, -3.8606, -3.7862,  ..., -3.8619, -3.7796, -3.8552],
           [-3.4362, -3.6646, -3.4496,  ..., -3.6603, -3.4415, -3.6564]],

          [[ 3.4870,  2.9861,  3.4841,  ...,  2.9674,  3.4960,  2.9643],
           [ 3

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 1.4094,  1.4729,  1.4073,  ...,  1.4736,  1.4049,  1.4738],
           [ 1.0178,  0.9999,  1.0139,  ...,  0.9998,  1.0124,  1.0004],
           [ 1.4083,  1.4724,  1.4059,  ...,  1.4722,  1.4039,  1.4715],
           ...,
           [ 1.0027,  0.9870,  1.0019,  ...,  0.9876,  1.0012,  0.9895],
           [ 1.3925,  1.4586,  1.3926,  ...,  1.4576,  1.3905,  1.4596],
           [ 1.0025,  0.9877,  1.0028,  ...,  0.9870,  1.0014,  0.9894]],

          [[-3.7869, -3.8446, -3.7804,  ..., -3.8439, -3.7782, -3.8618],
           [-3.4389, -3.6531, -3.4334,  ..., -3.6330, -3.4425, -3.6397],
           [-3.7718, -3.8552, -3.7642,  ..., -3.8591, -3.7687, -3.8628],
           ...,
           [-3.4380, -3.6412, -3.4468,  ..., -3.6360, -3.4379, -3.6276],
           [-3.7685, -3.8579, -3.7714,  ..., -3.8597, -3.7623, -3.8504],
           [-3.4234, -3.6413, -3.4421,  ..., -3.6353, -3.4309, -3.6299]],

          [[ 3.4741,  2.9905,  3.4701,  ...,  2.9645,  3.4867,  2.9601],
           [ 3

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 1.4420,  1.5065,  1.4392,  ...,  1.5074,  1.4361,  1.5077],
           [ 1.0379,  1.0255,  1.0329,  ...,  1.0252,  1.0309,  1.0260],
           [ 1.4405,  1.5059,  1.4373,  ...,  1.5056,  1.4347,  1.5047],
           ...,
           [ 1.0182,  1.0086,  1.0172,  ...,  1.0094,  1.0162,  1.0118],
           [ 1.4198,  1.4879,  1.4200,  ...,  1.4865,  1.4172,  1.4892],
           [ 1.0180,  1.0094,  1.0183,  ...,  1.0085,  1.0165,  1.0117]],

          [[-3.7820, -3.8505, -3.7735,  ..., -3.8495, -3.7706, -3.8729],
           [-3.4412, -3.6442, -3.4340,  ..., -3.6180, -3.4458, -3.6268],
           [-3.7623, -3.8643, -3.7524,  ..., -3.8694, -3.7582, -3.8742],
           ...,
           [-3.4400, -3.6287, -3.4515,  ..., -3.6219, -3.4399, -3.6110],
           [-3.7581, -3.8678, -3.7618,  ..., -3.8702, -3.7499, -3.8580],
           [-3.4210, -3.6288, -3.4454,  ..., -3.6210, -3.4306, -3.6140]],

          [[ 3.4702,  3.0002,  3.4650,  ...,  2.9663,  3.4866,  2.9606],
           [ 3

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 1.4756,  1.5395,  1.4720,  ...,  1.5406,  1.4682,  1.5409],
           [ 1.0576,  1.0539,  1.0513,  ...,  1.0536,  1.0488,  1.0546],
           [ 1.4737,  1.5387,  1.4698,  ...,  1.5383,  1.4665,  1.5372],
           ...,
           [ 1.0329,  1.0328,  1.0317,  ...,  1.0338,  1.0304,  1.0368],
           [ 1.4478,  1.5161,  1.4480,  ...,  1.5144,  1.4446,  1.5178],
           [ 1.0327,  1.0338,  1.0331,  ...,  1.0327,  1.0308,  1.0366]],

          [[-3.7726, -3.8511, -3.7619,  ..., -3.8499, -3.7583, -3.8792],
           [-3.4394, -3.6282, -3.4304,  ..., -3.5955, -3.4452, -3.6064],
           [-3.7479, -3.8684, -3.7355,  ..., -3.8748, -3.7428, -3.8808],
           ...,
           [-3.4379, -3.6088, -3.4524,  ..., -3.6002, -3.4378, -3.5866],
           [-3.7426, -3.8728, -3.7473,  ..., -3.8758, -3.7324, -3.8605],
           [-3.4141, -3.6090, -3.4447,  ..., -3.5991, -3.4262, -3.5904]],

          [[ 3.4610,  3.0088,  3.4545,  ...,  2.9663,  3.4816,  2.9592],
           [ 3

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 1.5018,  1.5691,  1.4975,  ...,  1.5705,  1.4929,  1.5709],
           [ 1.0686,  1.0748,  1.0609,  ...,  1.0744,  1.0578,  1.0756],
           [ 1.4995,  1.5681,  1.4948,  ...,  1.5677,  1.4908,  1.5663],
           ...,
           [ 1.0385,  1.0491,  1.0371,  ...,  1.0503,  1.0355,  1.0539],
           [ 1.4681,  1.5407,  1.4683,  ...,  1.5386,  1.4642,  1.5427],
           [ 1.0382,  1.0503,  1.0387,  ...,  1.0490,  1.0360,  1.0538]],

          [[-3.7651, -3.8471, -3.7521,  ..., -3.8457, -3.7477, -3.8813],
           [-3.4361, -3.6126, -3.4251,  ..., -3.5727, -3.4431, -3.5860],
           [-3.7350, -3.8681, -3.7200,  ..., -3.8759, -3.7289, -3.8833],
           ...,
           [-3.4343, -3.5890, -3.4518,  ..., -3.5786, -3.4341, -3.5620],
           [-3.7286, -3.8735, -3.7343,  ..., -3.8772, -3.7162, -3.8586],
           [-3.4053, -3.5892, -3.4424,  ..., -3.5772, -3.4200, -3.5666]],

          [[ 3.4485,  3.0149,  3.4406,  ...,  2.9632,  3.4736,  2.9545],
           [ 3

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 1.5293,  1.5959,  1.5242,  ...,  1.5975,  1.5187,  1.5980],
           [ 1.0788,  1.0952,  1.0697,  ...,  1.0948,  1.0660,  1.0962],
           [ 1.5266,  1.5947,  1.5209,  ...,  1.5942,  1.5162,  1.5926],
           ...,
           [ 1.0430,  1.0647,  1.0413,  ...,  1.0661,  1.0395,  1.0704],
           [ 1.4892,  1.5621,  1.4894,  ...,  1.5596,  1.4845,  1.5644],
           [ 1.0427,  1.0662,  1.0433,  ...,  1.0646,  1.0400,  1.0702]],

          [[-3.7507, -3.8404, -3.7353,  ..., -3.8387, -3.7301, -3.8812],
           [-3.4308, -3.5924, -3.4178,  ..., -3.5450, -3.4392, -3.5608],
           [-3.7150, -3.8654, -3.6971,  ..., -3.8747, -3.7076, -3.8835],
           ...,
           [-3.4287, -3.5644, -3.4495,  ..., -3.5519, -3.4285, -3.5322],
           [-3.7073, -3.8719, -3.7141,  ..., -3.8762, -3.6925, -3.8541],
           [-3.3942, -3.5646, -3.4384,  ..., -3.5503, -3.4117, -3.5377]],

          [[ 3.4355,  3.0194,  3.4260,  ...,  2.9579,  3.4654,  2.9476],
           [ 3

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 1.5596,  1.6224,  1.5536,  ...,  1.6243,  1.5471,  1.6248],
           [ 1.0920,  1.1188,  1.0813,  ...,  1.1184,  1.0771,  1.1200],
           [ 1.5564,  1.6210,  1.5498,  ...,  1.6204,  1.5442,  1.6185],
           ...,
           [ 1.0501,  1.0831,  1.0481,  ...,  1.0848,  1.0459,  1.0898],
           [ 1.5126,  1.5828,  1.5129,  ...,  1.5799,  1.5071,  1.5855],
           [ 1.0497,  1.0848,  1.0504,  ...,  1.0829,  1.0466,  1.0895]],

          [[-3.7414, -3.8349, -3.7234,  ..., -3.8329, -3.7172, -3.8826],
           [-3.4294, -3.5764, -3.4141,  ..., -3.5209, -3.4392, -3.5394],
           [-3.6995, -3.8642, -3.6786,  ..., -3.8751, -3.6909, -3.8853],
           ...,
           [-3.4269, -3.5436, -3.4513,  ..., -3.5290, -3.4266, -3.5059],
           [-3.6906, -3.8717, -3.6985,  ..., -3.8768, -3.6732, -3.8509],
           [-3.3865, -3.5438, -3.4383,  ..., -3.5271, -3.4070, -3.5123]],

          [[ 3.4237,  3.0262,  3.4126,  ...,  2.9541,  3.4587,  2.9421],
           [ 3

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 1.5911,  1.6538,  1.5842,  ...,  1.6560,  1.5767,  1.6566],
           [ 1.1040,  1.1460,  1.0916,  ...,  1.1454,  1.0867,  1.1473],
           [ 1.5874,  1.6522,  1.5797,  ...,  1.6516,  1.5733,  1.6494],
           ...,
           [ 1.0556,  1.1047,  1.0532,  ...,  1.1066,  1.0507,  1.1124],
           [ 1.5368,  1.6081,  1.5371,  ...,  1.6048,  1.5304,  1.6113],
           [ 1.0551,  1.1066,  1.0559,  ...,  1.1045,  1.0514,  1.1121]],

          [[-3.7352, -3.8356, -3.7144,  ..., -3.8332, -3.7073, -3.8906],
           [-3.4314, -3.5646, -3.4138,  ..., -3.5005, -3.4428, -3.5219],
           [-3.6868, -3.8694, -3.6626,  ..., -3.8819, -3.6769, -3.8938],
           ...,
           [-3.4285, -3.5266, -3.4567,  ..., -3.5098, -3.4282, -3.4831],
           [-3.6765, -3.8781, -3.6856,  ..., -3.8839, -3.6565, -3.8540],
           [-3.3819, -3.5269, -3.4417,  ..., -3.5077, -3.4056, -3.4906]],

          [[ 3.4173,  3.0360,  3.4045,  ...,  2.9527,  3.4577,  2.9388],
           [ 3

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 1.6227,  1.6827,  1.6148,  ...,  1.6852,  1.6062,  1.6859],
           [ 1.1174,  1.1710,  1.1032,  ...,  1.1704,  1.0976,  1.1725],
           [ 1.6185,  1.6808,  1.6097,  ...,  1.6801,  1.6024,  1.6776],
           ...,
           [ 1.0620,  1.1238,  1.0594,  ...,  1.1260,  1.0565,  1.1327],
           [ 1.5607,  1.6304,  1.5611,  ...,  1.6266,  1.5534,  1.6340],
           [ 1.0615,  1.1261,  1.0625,  ...,  1.1236,  1.0573,  1.1323]],

          [[-3.7260, -3.8311, -3.7022,  ..., -3.8285, -3.6941, -3.8941],
           [-3.4300, -3.5513, -3.4098,  ..., -3.4781, -3.4430, -3.5026],
           [-3.6707, -3.8697, -3.6431,  ..., -3.8841, -3.6594, -3.8977],
           ...,
           [-3.4267, -3.5080, -3.4590,  ..., -3.4888, -3.4264, -3.4582],
           [-3.6589, -3.8797, -3.6693,  ..., -3.8864, -3.6360, -3.8522],
           [-3.3734, -3.5083, -3.4417,  ..., -3.4863, -3.4005, -3.4668]],

          [[ 3.4050,  3.0420,  3.3904,  ...,  2.9469,  3.4511,  2.9310],
           [ 3

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 1.6526,  1.7101,  1.6437,  ...,  1.7129,  1.6340,  1.7137],
           [ 1.1311,  1.1968,  1.1151,  ...,  1.1962,  1.1087,  1.1985],
           [ 1.6479,  1.7079,  1.6380,  ...,  1.7071,  1.6296,  1.7043],
           ...,
           [ 1.0684,  1.1434,  1.0654,  ...,  1.1459,  1.0622,  1.1535],
           [ 1.5824,  1.6509,  1.5829,  ...,  1.6466,  1.5742,  1.6550],
           [ 1.0679,  1.1460,  1.0689,  ...,  1.1432,  1.0631,  1.1531]],

          [[-3.7171, -3.8271, -3.6902,  ..., -3.8241, -3.6811, -3.8983],
           [-3.4282, -3.5379, -3.4054,  ..., -3.4550, -3.4429, -3.4827],
           [-3.6546, -3.8708, -3.6233,  ..., -3.8870, -3.6418, -3.9024],
           ...,
           [-3.4244, -3.4888, -3.4610,  ..., -3.4671, -3.4241, -3.4325],
           [-3.6413, -3.8821, -3.6530,  ..., -3.8897, -3.6153, -3.8509],
           [-3.3641, -3.4892, -3.4415,  ..., -3.4643, -3.3948, -3.4422]],

          [[ 3.3948,  3.0483,  3.3782,  ...,  2.9406,  3.4470,  2.9226],
           [ 3

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 1.6869,  1.7402,  1.6770,  ...,  1.7434,  1.6660,  1.7442],
           [ 1.1448,  1.2237,  1.1268,  ...,  1.2229,  1.1197,  1.2256],
           [ 1.6816,  1.7378,  1.6705,  ...,  1.7369,  1.6611,  1.7337],
           ...,
           [ 1.0744,  1.1637,  1.0710,  ...,  1.1665,  1.0674,  1.1750],
           [ 1.6081,  1.6737,  1.6086,  ...,  1.6689,  1.5988,  1.6783],
           [ 1.0738,  1.1666,  1.0750,  ...,  1.1634,  1.0685,  1.1745]],

          [[-3.7092, -3.8233, -3.6790,  ..., -3.8200, -3.6686, -3.9034],
           [-3.4293, -3.5279, -3.4036,  ..., -3.4347, -3.4458, -3.4659],
           [-3.6389, -3.8724, -3.6038,  ..., -3.8907, -3.6245, -3.9080],
           ...,
           [-3.4251, -3.4728, -3.4661,  ..., -3.4483, -3.4247, -3.4095],
           [-3.6239, -3.8851, -3.6372,  ..., -3.8936, -3.5948, -3.8501],
           [-3.3573, -3.4732, -3.4442,  ..., -3.4452, -3.3918, -3.4203]],

          [[ 3.3836,  3.0575,  3.3650,  ...,  2.9365,  3.4423,  2.9163],
           [ 3

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 1.7163,  1.7674,  1.7051,  ...,  1.7709,  1.6929,  1.7719],
           [ 1.1544,  1.2503,  1.1343,  ...,  1.2494,  1.1264,  1.2524],
           [ 1.7103,  1.7647,  1.6980,  ...,  1.7637,  1.6875,  1.7601],
           ...,
           [ 1.0759,  1.1833,  1.0721,  ...,  1.1864,  1.0681,  1.1959],
           [ 1.6283,  1.6932,  1.6289,  ...,  1.6878,  1.6180,  1.6983],
           [ 1.0752,  1.1865,  1.0765,  ...,  1.1830,  1.0692,  1.1954]],

          [[-3.6990, -3.8131, -3.6652,  ..., -3.8093, -3.6537, -3.9024],
           [-3.4246, -3.5123, -3.3960,  ..., -3.4084, -3.4430, -3.4431],
           [-3.6206, -3.8679, -3.5814,  ..., -3.8882, -3.6045, -3.9075],
           ...,
           [-3.4199, -3.4508, -3.4657,  ..., -3.4236, -3.4194, -3.3802],
           [-3.6038, -3.8820, -3.6186,  ..., -3.8915, -3.5713, -3.8429],
           [-3.3443, -3.4513, -3.4412,  ..., -3.4201, -3.3828, -3.3923]],

          [[ 3.3682,  3.0613,  3.3475,  ...,  2.9263,  3.4337,  2.9038],
           [ 3

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 1.7520,  1.7988,  1.7396,  ...,  1.8028,  1.7261,  1.8038],
           [ 1.1709,  1.2805,  1.1487,  ...,  1.2796,  1.1398,  1.2829],
           [ 1.7454,  1.7958,  1.7317,  ...,  1.7947,  1.7201,  1.7907],
           ...,
           [ 1.0839,  1.2062,  1.0797,  ...,  1.2097,  1.0752,  1.2202],
           [ 1.6544,  1.7165,  1.6550,  ...,  1.7105,  1.6430,  1.7222],
           [ 1.0831,  1.2098,  1.0845,  ...,  1.2059,  1.0765,  1.2196]],

          [[-3.6917, -3.8107, -3.6543,  ..., -3.8065, -3.6415, -3.9098],
           [-3.4264, -3.5018, -3.3947,  ..., -3.3865, -3.4468, -3.4250],
           [-3.6047, -3.8715, -3.5613,  ..., -3.8940, -3.5869, -3.9154],
           ...,
           [-3.4212, -3.4336, -3.4720,  ..., -3.4033, -3.4207, -3.3553],
           [-3.5862, -3.8872, -3.6026,  ..., -3.8977, -3.5501, -3.8438],
           [-3.3374, -3.4341, -3.4449,  ..., -3.3994, -3.3800, -3.3687]],

          [[ 3.3583,  3.0696,  3.3353,  ...,  2.9198,  3.4309,  2.8949],
           [ 3

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 1.7845,  1.8275,  1.7709,  ...,  1.8319,  1.7560,  1.8330],
           [ 1.1840,  1.3059,  1.1594,  ...,  1.3049,  1.1497,  1.3085],
           [ 1.7773,  1.8242,  1.7621,  ...,  1.8230,  1.7493,  1.8186],
           ...,
           [ 1.0879,  1.2240,  1.0833,  ...,  1.2278,  1.0783,  1.2394],
           [ 1.6769,  1.7367,  1.6776,  ...,  1.7301,  1.6642,  1.7430],
           [ 1.0871,  1.2279,  1.0887,  ...,  1.2237,  1.0797,  1.2388]],

          [[-3.6837, -3.8014, -3.6425,  ..., -3.7968, -3.6284, -3.9107],
           [-3.4259, -3.4907, -3.3909,  ..., -3.3635, -3.4484, -3.4060],
           [-3.5878, -3.8684, -3.5398,  ..., -3.8933, -3.5681, -3.9169],
           ...,
           [-3.4202, -3.4154, -3.4762,  ..., -3.3821, -3.4196, -3.3291],
           [-3.5673, -3.8857, -3.5854,  ..., -3.8973, -3.5276, -3.8379],
           [-3.3277, -3.4160, -3.4463,  ..., -3.3778, -3.3747, -3.3439]],

          [[ 3.3446,  3.0747,  3.3192,  ...,  2.9095,  3.4247,  2.8820],
           [ 3

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 1.8184,  1.8577,  1.8035,  ...,  1.8625,  1.7871,  1.8638],
           [ 1.1977,  1.3348,  1.1707,  ...,  1.3337,  1.1600,  1.3377],
           [ 1.8104,  1.8541,  1.7938,  ...,  1.8527,  1.7798,  1.8479],
           ...,
           [ 1.0922,  1.2448,  1.0871,  ...,  1.2491,  1.0817,  1.2618],
           [ 1.7002,  1.7580,  1.7010,  ...,  1.7508,  1.6864,  1.7649],
           [ 1.0913,  1.2491,  1.0930,  ...,  1.2445,  1.0832,  1.2611]],

          [[-3.6741, -3.7920, -3.6288,  ..., -3.7869, -3.6133, -3.9120],
           [-3.4230, -3.4764, -3.3845,  ..., -3.3368, -3.4477, -3.3834],
           [-3.5688, -3.8656, -3.5161,  ..., -3.8930, -3.5471, -3.9189],
           ...,
           [-3.4167, -3.3938, -3.4782,  ..., -3.3572, -3.4161, -3.2989],
           [-3.5463, -3.8846, -3.5661,  ..., -3.8974, -3.5026, -3.8321],
           [-3.3151, -3.3944, -3.4454,  ..., -3.3525, -3.3668, -3.3152]],

          [[ 3.3297,  3.0810,  3.3019,  ...,  2.8996,  3.4177,  2.8693],
           [ 3

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 1.8540,  1.8893,  1.8376,  ...,  1.8945,  1.8197,  1.8959],
           [ 1.2111,  1.3651,  1.1817,  ...,  1.3639,  1.1699,  1.3682],
           [ 1.8452,  1.8853,  1.8270,  ...,  1.8838,  1.8117,  1.8786],
           ...,
           [ 1.0958,  1.2667,  1.0903,  ...,  1.2714,  1.0843,  1.2852],
           [ 1.7247,  1.7803,  1.7255,  ...,  1.7723,  1.7095,  1.7878],
           [ 1.0948,  1.2714,  1.0967,  ...,  1.2663,  1.0860,  1.2845]],

          [[-3.6663, -3.7837, -3.6168,  ..., -3.7781, -3.5999, -3.9149],
           [-3.4209, -3.4657, -3.3789,  ..., -3.3129, -3.4480, -3.3640],
           [-3.5512, -3.8641, -3.4935,  ..., -3.8940, -3.5275, -3.9224],
           ...,
           [-3.4140, -3.3753, -3.4813,  ..., -3.3353, -3.4134, -3.2716],
           [-3.5266, -3.8849, -3.5483,  ..., -3.8989, -3.4788, -3.8275],
           [-3.3031, -3.3760, -3.4454,  ..., -3.3301, -3.3595, -3.2894]],

          [[ 3.3161,  3.0868,  3.2856,  ...,  2.8885,  3.4123,  2.8554],
           [ 3

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 1.8879,  1.9185,  1.8701,  ...,  1.9242,  1.8506,  1.9258],
           [ 1.2268,  1.3955,  1.1946,  ...,  1.3941,  1.1819,  1.3988],
           [ 1.8784,  1.9142,  1.8586,  ...,  1.9126,  1.8419,  1.9069],
           ...,
           [ 1.1011,  1.2883,  1.0951,  ...,  1.2933,  1.0887,  1.3084],
           [ 1.7472,  1.7998,  1.7481,  ...,  1.7912,  1.7306,  1.8080],
           [ 1.1000,  1.2934,  1.1021,  ...,  1.2878,  1.0905,  1.3076]],

          [[-3.6557, -3.7709, -3.6017,  ..., -3.7648, -3.5833, -3.9138],
           [-3.4171, -3.4516, -3.3713,  ..., -3.2853, -3.4465, -3.3409],
           [-3.5303, -3.8585, -3.4675,  ..., -3.8911, -3.5045, -3.9219],
           ...,
           [-3.4096, -3.3532, -3.4829,  ..., -3.3096, -3.4088, -3.2403],
           [-3.5035, -3.8812, -3.5271,  ..., -3.8963, -3.4515, -3.8187],
           [-3.2887, -3.3540, -3.4437,  ..., -3.3040, -3.3502, -3.2596]],

          [[ 3.2990,  3.0904,  3.2659,  ...,  2.8743,  3.4038,  2.8383],
           [ 3

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 1.9205,  1.9475,  1.9012,  ...,  1.9537,  1.8800,  1.9554],
           [ 1.2400,  1.4239,  1.2052,  ...,  1.4224,  1.1913,  1.4276],
           [ 1.9102,  1.9429,  1.8887,  ...,  1.9411,  1.8705,  1.9349],
           ...,
           [ 1.1037,  1.3076,  1.0971,  ...,  1.3130,  1.0901,  1.3294],
           [ 1.7677,  1.8187,  1.7687,  ...,  1.8093,  1.7498,  1.8276],
           [ 1.1025,  1.3131,  1.1047,  ...,  1.3071,  1.0921,  1.3286]],

          [[-3.6450, -3.7559, -3.5864,  ..., -3.7494, -3.5665, -3.9110],
           [-3.4109, -3.4378, -3.3612,  ..., -3.2572, -3.4428, -3.3176],
           [-3.5089, -3.8511, -3.4408,  ..., -3.8864, -3.4809, -3.9199],
           ...,
           [-3.4027, -3.3309, -3.4823,  ..., -3.2836, -3.4019, -3.2084],
           [-3.4798, -3.8756, -3.5055,  ..., -3.8921, -3.4234, -3.8078],
           [-3.2715, -3.3317, -3.4398,  ..., -3.2775, -3.3382, -3.2294]],

          [[ 3.2796,  3.0928,  3.2436,  ...,  2.8583,  3.3933,  2.8192],
           [ 3

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 1.9537,  1.9758,  1.9327,  ...,  1.9825,  1.9098,  1.9843],
           [ 1.2545,  1.4532,  1.2168,  ...,  1.4516,  1.2018,  1.4572],
           [ 1.9425,  1.9708,  1.9192,  ...,  1.9688,  1.8996,  1.9621],
           ...,
           [ 1.1070,  1.3274,  1.0999,  ...,  1.3333,  1.0924,  1.3511],
           [ 1.7884,  1.8364,  1.7894,  ...,  1.8263,  1.7690,  1.8461],
           [ 1.1057,  1.3334,  1.1082,  ...,  1.3269,  1.0945,  1.3501]],

          [[-3.6325, -3.7383, -3.5692,  ..., -3.7312, -3.5476, -3.9061],
           [-3.4026, -3.4225, -3.3489,  ..., -3.2272, -3.4372, -3.2925],
           [-3.4853, -3.8412, -3.4116,  ..., -3.8794, -3.4550, -3.9157],
           ...,
           [-3.3938, -3.3069, -3.4799,  ..., -3.2558, -3.3930, -3.1744],
           [-3.4538, -3.8678, -3.4816,  ..., -3.8856, -3.3928, -3.7944],
           [-3.2519, -3.3078, -3.4340,  ..., -3.2492, -3.3241, -3.1971]],

          [[ 3.2582,  3.0939,  3.2193,  ...,  2.8402,  3.3813,  2.7979],
           [ 3

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 1.9889,  2.0060,  1.9663,  ...,  2.0132,  1.9416,  2.0152],
           [ 1.2717,  1.4849,  1.2310,  ...,  1.4832,  1.2149,  1.4892],
           [ 1.9768,  2.0006,  1.9517,  ...,  1.9985,  1.9305,  1.9913],
           ...,
           [ 1.1126,  1.3492,  1.1050,  ...,  1.3556,  1.0968,  1.3747],
           [ 1.8106,  1.8557,  1.8117,  ...,  1.8447,  1.7897,  1.8661],
           [ 1.1112,  1.3557,  1.1139,  ...,  1.3487,  1.0991,  1.3737]],

          [[-3.6224, -3.7212, -3.5541,  ..., -3.7136, -3.5308, -3.9022],
           [-3.3966, -3.4091, -3.3387,  ..., -3.1985, -3.4339, -3.2689],
           [-3.4636, -3.8322, -3.3841,  ..., -3.8735, -3.4309, -3.9126],
           ...,
           [-3.3871, -3.2844, -3.4799,  ..., -3.2292, -3.3862, -3.1414],
           [-3.4297, -3.8609, -3.4596,  ..., -3.8801, -3.3638, -3.7817],
           [-3.2340, -3.2854, -3.4304,  ..., -3.2221, -3.3119, -3.1659]],

          [[ 3.2375,  3.0953,  3.1955,  ...,  2.8217,  3.3702,  2.7761],
           [ 3

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 2.0226,  2.0348,  1.9983,  ...,  2.0426,  1.9717,  2.0447],
           [ 1.2870,  1.5150,  1.2432,  ...,  1.5131,  1.2258,  1.5196],
           [ 2.0096,  2.0289,  1.9826,  ...,  2.0267,  1.9598,  2.0189],
           ...,
           [ 1.1158,  1.3690,  1.1076,  ...,  1.3758,  1.0988,  1.3964],
           [ 1.8308,  1.8730,  1.8320,  ...,  1.8613,  1.8083,  1.8843],
           [ 1.1143,  1.3759,  1.1172,  ...,  1.3684,  1.1013,  1.3953]],

          [[-3.6085, -3.6989, -3.5350,  ..., -3.6907, -3.5099, -3.8936],
           [-3.3874, -3.3933, -3.3251,  ..., -3.1667, -3.4276, -3.2425],
           [-3.4376, -3.8184, -3.3522,  ..., -3.8627, -3.4025, -3.9048],
           ...,
           [-3.3772, -3.2592, -3.4770,  ..., -3.1998, -3.3762, -3.1054],
           [-3.4012, -3.8492, -3.4334,  ..., -3.8699, -3.3303, -3.7641],
           [-3.2125, -3.2603, -3.4238,  ..., -3.1922, -3.2963, -3.1318]],

          [[ 3.2125,  3.0939,  3.1673,  ...,  2.7995,  3.3553,  2.7505],
           [ 2

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 2.0549,  2.0621,  2.0289,  ...,  2.0704,  2.0003,  2.0726],
           [ 1.3029,  1.5443,  1.2559,  ...,  1.5423,  1.2373,  1.5492],
           [ 2.0410,  2.0558,  2.0120,  ...,  2.0534,  1.9876,  2.0450],
           ...,
           [ 1.1192,  1.3876,  1.1104,  ...,  1.3950,  1.1010,  1.4171],
           [ 1.8491,  1.8885,  1.8504,  ...,  1.8759,  1.8250,  1.9006],
           [ 1.1176,  1.3951,  1.1207,  ...,  1.3870,  1.1037,  1.4159]],

          [[-3.5936, -3.6738, -3.5147,  ..., -3.6650, -3.4878, -3.8827],
           [-3.3742, -3.3760, -3.3073,  ..., -3.1328, -3.4173, -3.2141],
           [-3.4103, -3.8020, -3.3185,  ..., -3.8496, -3.3726, -3.8947],
           ...,
           [-3.3632, -3.2321, -3.4704,  ..., -3.1684, -3.3622, -3.0670],
           [-3.3711, -3.8351, -3.4057,  ..., -3.8572, -3.2951, -3.7437],
           [-3.1865, -3.2332, -3.4132,  ..., -3.1601, -3.2764, -3.0953]],

          [[ 3.1855,  3.0902,  3.1370,  ...,  2.7744,  3.3387,  2.7218],
           [ 2

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 2.0882,  2.0898,  2.0603,  ...,  2.0988,  2.0297,  2.1012],
           [ 1.3191,  1.5746,  1.2688,  ...,  1.5724,  1.2489,  1.5799],
           [ 2.0733,  2.0831,  2.0423,  ...,  2.0806,  2.0161,  2.0716],
           ...,
           [ 1.1225,  1.4069,  1.1131,  ...,  1.4147,  1.1030,  1.4384],
           [ 1.8679,  1.9041,  1.8693,  ...,  1.8906,  1.8420,  1.9170],
           [ 1.1208,  1.4149,  1.1240,  ...,  1.4062,  1.1058,  1.4371]],

          [[-3.5789, -3.6479, -3.4945,  ..., -3.6384, -3.4657, -3.8715],
           [-3.3615, -3.3588, -3.2899,  ..., -3.0985, -3.4076, -3.1855],
           [-3.3827, -3.7851, -3.2845,  ..., -3.8361, -3.3423, -3.8843],
           ...,
           [-3.3497, -3.2047, -3.4644,  ..., -3.1365, -3.3486, -3.0280],
           [-3.3408, -3.8205, -3.3777,  ..., -3.8442, -3.2594, -3.7227],
           [-3.1605, -3.2059, -3.4032,  ..., -3.1278, -3.2568, -3.0583]],

          [[ 3.1583,  3.0869,  3.1064,  ...,  2.7488,  3.3223,  2.6925],
           [ 2

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 2.1204,  2.1160,  2.0906,  ...,  2.1255,  2.0580,  2.1281],
           [ 1.3365,  1.6040,  1.2828,  ...,  1.6017,  1.2615,  1.6096],
           [ 2.1045,  2.1088,  2.0714,  ...,  2.1061,  2.0434,  2.0965],
           ...,
           [ 1.1266,  1.4249,  1.1165,  ...,  1.4333,  1.1057,  1.4585],
           [ 1.8851,  1.9176,  1.8866,  ...,  1.9032,  1.8575,  1.9313],
           [ 1.1247,  1.4334,  1.1282,  ...,  1.4241,  1.1087,  1.4572]],

          [[-3.5617, -3.6172, -3.4715,  ..., -3.6071, -3.4407, -3.8561],
           [-3.3451, -3.3389, -3.2686,  ..., -3.0609, -3.3943, -3.1538],
           [-3.3520, -3.7638, -3.2471,  ..., -3.8182, -3.3089, -3.8698],
           ...,
           [-3.3325, -3.1744, -3.4550,  ..., -3.1015, -3.3313, -2.9856],
           [-3.3073, -3.8016, -3.3468,  ..., -3.8270, -3.2203, -3.6971],
           [-3.1304, -3.1756, -3.3896,  ..., -3.0921, -3.2332, -3.0180]],

          [[ 3.1263,  3.0799,  3.0709,  ...,  2.7187,  3.3015,  2.6585],
           [ 2

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 2.1522,  2.1421,  2.1204,  ...,  2.1522,  2.0856,  2.1550],
           [ 1.3550,  1.6332,  1.2978,  ...,  1.6307,  1.2751,  1.6392],
           [ 2.1352,  2.1344,  2.0999,  ...,  2.1315,  2.0701,  2.1213],
           ...,
           [ 1.1312,  1.4422,  1.1204,  ...,  1.4512,  1.1089,  1.4781],
           [ 1.9013,  1.9305,  1.9030,  ...,  1.9151,  1.8719,  1.9452],
           [ 1.1292,  1.4513,  1.1330,  ...,  1.4414,  1.1122,  1.4767]],

          [[-3.5443, -3.5847, -3.4482,  ..., -3.5739, -3.4154, -3.8394],
           [-3.3281, -3.3200, -3.2466,  ..., -3.0236, -3.3806, -3.1227],
           [-3.3209, -3.7409, -3.2091,  ..., -3.7990, -3.2749, -3.8539],
           ...,
           [-3.3147, -3.1446, -3.4453,  ..., -3.0669, -3.3135, -2.9434],
           [-3.2732, -3.7813, -3.3153,  ..., -3.8083, -3.1805, -3.6699],
           [-3.0993, -3.1460, -3.3756,  ..., -3.0569, -3.2089, -2.9779]],

          [[ 3.0931,  3.0725,  3.0340,  ...,  2.6876,  3.2798,  2.6234],
           [ 2

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 2.1836,  2.1670,  2.1498,  ...,  2.1778,  2.1128,  2.1807],
           [ 1.3731,  1.6620,  1.3122,  ...,  1.6594,  1.2880,  1.6684],
           [ 2.1656,  2.1588,  2.1280,  ...,  2.1557,  2.0963,  2.1449],
           ...,
           [ 1.1350,  1.4588,  1.1235,  ...,  1.4684,  1.1113,  1.4970],
           [ 1.9167,  1.9419,  1.9184,  ...,  1.9255,  1.8854,  1.9575],
           [ 1.1329,  1.4686,  1.1368,  ...,  1.4580,  1.1147,  1.4955]],

          [[-3.5245, -3.5480, -3.4222,  ..., -3.5365, -3.3873, -3.8190],
           [-3.3077, -3.2990, -3.2209,  ..., -2.9836, -3.3635, -3.0890],
           [-3.2867, -3.7142, -3.1678,  ..., -3.7760, -3.2378, -3.8344],
           ...,
           [-3.2934, -3.1123, -3.4324,  ..., -3.0297, -3.2921, -2.8983],
           [-3.2360, -3.7571, -3.2808,  ..., -3.7859, -3.1374, -3.6386],
           [-3.0643, -3.1138, -3.3583,  ..., -3.0191, -3.1808, -2.9350]],

          [[ 3.0568,  3.0610,  2.9939,  ...,  2.6514,  3.2555,  2.5832],
           [ 2

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 2.2137,  2.1911,  2.1778,  ...,  2.2026,  2.1385,  2.2057],
           [ 1.3911,  1.6910,  1.3264,  ...,  1.6882,  1.3007,  1.6978],
           [ 2.1945,  2.1824,  2.1547,  ...,  2.1792,  2.1209,  2.1677],
           ...,
           [ 1.1382,  1.4752,  1.1260,  ...,  1.4854,  1.1130,  1.5158],
           [ 1.9302,  1.9521,  1.9321,  ...,  1.9347,  1.8970,  1.9687],
           [ 1.1359,  1.4855,  1.1401,  ...,  1.4743,  1.1167,  1.5142]],

          [[-3.5024, -3.5064, -3.3938,  ..., -3.4943, -3.3567, -3.7942],
           [-3.2845, -3.2750, -3.1923,  ..., -2.9401, -3.3438, -3.0520],
           [-3.2499, -3.6829, -3.1236,  ..., -3.7485, -3.1980, -3.8107],
           ...,
           [-3.2693, -3.0768, -3.4169,  ..., -2.9890, -3.2679, -2.8494],
           [-3.1960, -3.7286, -3.2436,  ..., -3.7591, -3.0913, -3.6027],
           [-3.0259, -3.0783, -3.3382,  ..., -2.9777, -3.1497, -2.8884]],

          [[ 3.0165,  3.0480,  2.9496,  ...,  2.6129,  3.2275,  2.5404],
           [ 2

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 2.2429,  2.2133,  2.2048,  ...,  2.2255,  2.1631,  2.2287],
           [ 1.4101,  1.7184,  1.3416,  ...,  1.7154,  1.3144,  1.7256],
           [ 2.2226,  2.2041,  2.1803,  ...,  2.2006,  2.1445,  2.1884],
           ...,
           [ 1.1420,  1.4896,  1.1291,  ...,  1.5003,  1.1153,  1.5326],
           [ 1.9423,  1.9598,  1.9443,  ...,  1.9414,  1.9071,  1.9774],
           [ 1.1396,  1.5005,  1.1441,  ...,  1.4886,  1.1192,  1.5309]],

          [[-3.4792, -3.4623, -3.3640,  ..., -3.4494, -3.3247, -3.7674],
           [-3.2588, -3.2501, -3.1611,  ..., -2.8950, -3.3216, -3.0137],
           [-3.2115, -3.6495, -3.0775,  ..., -3.7190, -3.1564, -3.7849],
           ...,
           [-3.2427, -3.0400, -3.3992,  ..., -2.9469, -3.2412, -2.7989],
           [-3.1543, -3.6978, -3.2047,  ..., -3.7302, -3.0433, -3.5644],
           [-2.9846, -3.0416, -3.3157,  ..., -2.9349, -3.1159, -2.8402]],

          [[ 2.9737,  3.0326,  2.9029,  ...,  2.5714,  3.1974,  2.4945],
           [ 2

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 2.2711,  2.2350,  2.2307,  ...,  2.2479,  2.1866,  2.2514],
           [ 1.4295,  1.7455,  1.3569,  ...,  1.7424,  1.3281,  1.7531],
           [ 2.2495,  2.2253,  2.2048,  ...,  2.2216,  2.1669,  2.2087],
           ...,
           [ 1.1456,  1.5033,  1.1319,  ...,  1.5147,  1.1174,  1.5489],
           [ 1.9529,  1.9667,  1.9549,  ...,  1.9472,  1.9156,  1.9853],
           [ 1.1431,  1.5149,  1.1478,  ...,  1.5023,  1.1215,  1.5470]],

          [[-3.4522, -3.4122, -3.3303,  ..., -3.3985, -3.2887, -3.7352],
           [-3.2288, -3.2222, -3.1253,  ..., -2.8463, -3.2953, -2.9720],
           [-3.1688, -3.6103, -3.0270,  ..., -3.6839, -3.1105, -3.7536],
           ...,
           [-3.2118, -2.9997, -3.3774,  ..., -2.9012, -3.2102, -2.7446],
           [-3.1083, -3.6615, -3.1617,  ..., -3.6958, -2.9907, -3.5202],
           [-2.9386, -3.0015, -3.2890,  ..., -2.8886, -3.0776, -2.7883]],

          [[ 2.9260,  3.0135,  2.8510,  ...,  2.5253,  3.1628,  2.4439],
           [ 2

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 2.2972,  2.2543,  2.2545,  ...,  2.2679,  2.2079,  2.2716],
           [ 1.4488,  1.7717,  1.3721,  ...,  1.7684,  1.3416,  1.7798],
           [ 2.2744,  2.2440,  2.2271,  ...,  2.2401,  2.1871,  2.2265],
           ...,
           [ 1.1487,  1.5158,  1.1343,  ...,  1.5278,  1.1189,  1.5639],
           [ 1.9609,  1.9707,  1.9630,  ...,  1.9502,  1.9214,  1.9904],
           [ 1.1461,  1.5280,  1.1511,  ...,  1.5147,  1.1233,  1.5619]],

          [[-3.4232, -3.3576, -3.2944,  ..., -3.3431, -3.2504, -3.6989],
           [-3.1950, -3.1928, -3.0857,  ..., -2.7955, -3.2653, -2.9283],
           [-3.1237, -3.5670, -2.9738,  ..., -3.6448, -3.0621, -3.7184],
           ...,
           [-3.1770, -2.9576, -3.3521,  ..., -2.8535, -3.1753, -2.6879],
           [-3.0597, -3.6211, -3.1162,  ..., -3.6573, -2.9355, -3.4717],
           [-2.8883, -2.9594, -3.2587,  ..., -2.8401, -3.0351, -2.7341]],

          [[ 2.8751,  2.9906,  2.7959,  ...,  2.4745,  3.1254,  2.3885],
           [ 2

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 2.3224,  2.2723,  2.2775,  ...,  2.2867,  2.2282,  2.2905],
           [ 1.4701,  1.7973,  1.3892,  ...,  1.7939,  1.3570,  1.8059],
           [ 2.2984,  2.2614,  2.2485,  ...,  2.2573,  2.2063,  2.2429],
           ...,
           [ 1.1535,  1.5272,  1.1382,  ...,  1.5399,  1.1220,  1.5780],
           [ 1.9675,  1.9730,  1.9698,  ...,  1.9513,  1.9259,  1.9938],
           [ 1.1507,  1.5401,  1.1559,  ...,  1.5261,  1.1266,  1.5760]],

          [[-3.3925, -3.2991, -3.2565,  ..., -3.2838, -3.2101, -3.6594],
           [-3.1579, -3.1611, -3.0425,  ..., -2.7418, -3.2321, -2.8820],
           [-3.0763, -3.5201, -2.9181,  ..., -3.6022, -3.0113, -3.6800],
           ...,
           [-3.1389, -2.9129, -3.3237,  ..., -2.8031, -3.1371, -2.6283],
           [-3.0088, -3.5772, -3.0684,  ..., -3.6154, -2.8777, -3.4196],
           [-2.8342, -2.9149, -3.2251,  ..., -2.7889, -2.9892, -2.6771]],

          [[ 2.8203,  2.9644,  2.7367,  ...,  2.4198,  3.0845,  2.3290],
           [ 2

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 2.3459,  2.2887,  2.2985,  ...,  2.3039,  2.2466,  2.3079],
           [ 1.4903,  1.8222,  1.4050,  ...,  1.8186,  1.3711,  1.8312],
           [ 2.3206,  2.2773,  2.2679,  ...,  2.2729,  2.2235,  2.2578],
           ...,
           [ 1.1566,  1.5375,  1.1406,  ...,  1.5509,  1.1234,  1.5910],
           [ 1.9718,  1.9733,  1.9742,  ...,  1.9504,  1.9280,  1.9952],
           [ 1.1537,  1.5511,  1.1592,  ...,  1.5363,  1.1283,  1.5889]],

          [[-3.3582, -3.2344, -3.2149,  ..., -3.2184, -3.1660, -3.6141],
           [-3.1164, -3.1270, -2.9949,  ..., -2.6851, -3.1947, -2.8328],
           [-3.0250, -3.4673, -2.8583,  ..., -3.5539, -2.9565, -3.6358],
           ...,
           [-3.0965, -2.8654, -3.2912,  ..., -2.7496, -3.0946, -2.5654],
           [-2.9539, -3.5275, -3.0167,  ..., -3.5678, -2.8157, -3.3614],
           [-2.7753, -2.8675, -3.1873,  ..., -2.7347, -2.9387, -2.6169]],

          [[ 2.7612,  2.9347,  2.6730,  ...,  2.3607,  3.0396,  2.2651],
           [ 2

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 2.3663,  2.3019,  2.3164,  ...,  2.3178,  2.2618,  2.3221],
           [ 1.5106,  1.8442,  1.4208,  ...,  1.8404,  1.3851,  1.8537],
           [ 2.3396,  2.2898,  2.2843,  ...,  2.2853,  2.2374,  2.2693],
           ...,
           [ 1.1594,  1.5446,  1.1425,  ...,  1.5587,  1.1244,  1.6009],
           [ 1.9726,  1.9699,  1.9752,  ...,  1.9458,  1.9265,  1.9930],
           [ 1.1563,  1.5589,  1.1621,  ...,  1.5434,  1.1295,  1.5987]],

          [[-3.3193, -3.1624, -3.1685,  ..., -3.1455, -3.1170, -3.5620],
           [-3.0696, -3.0882, -2.9416,  ..., -2.6231, -3.1519, -2.7786],
           [-2.9687, -3.4075, -2.7932,  ..., -3.4986, -2.8966, -3.5848],
           ...,
           [-3.0486, -2.8130, -3.2535,  ..., -2.6911, -3.0466, -2.4972],
           [-2.8938, -3.4708, -2.9599,  ..., -3.5132, -2.7484, -3.2960],
           [-2.7105, -2.8151, -3.1442,  ..., -2.6754, -2.8825, -2.5514]],

          [[ 2.6946,  2.8993,  2.6018,  ...,  2.2952,  2.9876,  2.1945],
           [ 2

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 2.3857,  2.3140,  2.3333,  ...,  2.3307,  2.2759,  2.3353],
           [ 1.5316,  1.8665,  1.4373,  ...,  1.8624,  1.3998,  1.8764],
           [ 2.3577,  2.3013,  2.2995,  ...,  2.2965,  2.2503,  2.2798],
           ...,
           [ 1.1625,  1.5516,  1.1447,  ...,  1.5664,  1.1258,  1.6108],
           [ 1.9720,  1.9651,  1.9747,  ...,  1.9398,  1.9235,  1.9893],
           [ 1.1592,  1.5666,  1.1654,  ...,  1.5503,  1.1312,  1.6084]],

          [[-3.2772, -3.0857, -3.1187,  ..., -3.0679, -3.0646, -3.5056],
           [-3.0185, -3.0470, -2.8840,  ..., -2.5582, -3.1051, -2.7216],
           [-2.9087, -3.3433, -2.7243,  ..., -3.4390, -2.8329, -3.5296],
           ...,
           [-2.9964, -2.7577, -3.2118,  ..., -2.6296, -2.9943, -2.4259],
           [-2.8300, -3.4098, -2.8994,  ..., -3.4544, -2.6772, -3.2261],
           [-2.6412, -2.7599, -3.0969,  ..., -2.6131, -2.8219, -2.4827]],

          [[ 2.6252,  2.8608,  2.5276,  ...,  2.2259,  2.9331,  2.1201],
           [ 2

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 2.4023,  2.3227,  2.3473,  ...,  2.3402,  2.2870,  2.3450],
           [ 1.5539,  1.8869,  1.4549,  ...,  1.8827,  1.4155,  1.8973],
           [ 2.3729,  2.3094,  2.3118,  ...,  2.3043,  2.2602,  2.2867],
           ...,
           [ 1.1665,  1.5564,  1.1479,  ...,  1.5719,  1.1280,  1.6185],
           [ 1.9680,  1.9565,  1.9708,  ...,  1.9299,  1.9171,  1.9819],
           [ 1.1631,  1.5722,  1.1695,  ...,  1.5550,  1.1336,  1.6160]],

          [[-3.2310, -3.0007, -3.0646,  ..., -2.9821, -3.0078, -3.4415],
           [-2.9615, -3.0014, -2.8204,  ..., -2.4884, -3.0524, -2.6598],
           [-2.8442, -3.2711, -2.6506,  ..., -3.3716, -2.7647, -3.4667],
           ...,
           [-2.9384, -2.6978, -3.1644,  ..., -2.5633, -2.9362, -2.3495],
           [-2.7616, -3.3410, -2.8345,  ..., -3.3877, -2.6012, -3.1481],
           [-2.5655, -2.7001, -3.0438,  ..., -2.5460, -2.7552, -2.4092]],

          [[ 2.5483,  2.8162,  2.4460,  ...,  2.1499,  2.8715,  2.0388],
           [ 2

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 2.4158,  2.3288,  2.3581,  ...,  2.3472,  2.2950,  2.3522],
           [ 1.5768,  1.9054,  1.4730,  ...,  1.9010,  1.4318,  1.9164],
           [ 2.3850,  2.3148,  2.3209,  ...,  2.3096,  2.2668,  2.2911],
           ...,
           [ 1.1707,  1.5590,  1.1512,  ...,  1.5753,  1.1303,  1.6241],
           [ 1.9606,  1.9449,  1.9636,  ...,  1.9171,  1.9073,  1.9716],
           [ 1.1671,  1.5756,  1.1739,  ...,  1.5576,  1.1362,  1.6215]],

          [[-3.1807, -2.9093, -3.0063,  ..., -2.8897, -2.9468, -3.3713],
           [-2.8985, -2.9532, -2.7505,  ..., -2.4154, -2.9937, -2.5952],
           [-2.7753, -3.1927, -2.5724,  ..., -3.2980, -2.6919, -3.3977],
           ...,
           [-2.8742, -2.6349, -3.1112,  ..., -2.4940, -2.8719, -2.2699],
           [-2.6887, -3.2659, -2.7651,  ..., -3.3149, -2.5206, -3.0638],
           [-2.4834, -2.6374, -2.9847,  ..., -2.4759, -2.6822, -2.3324]],

          [[ 2.4661,  2.7668,  2.3589,  ...,  2.0683,  2.8049,  1.9519],
           [ 2

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 2.4247,  2.3303,  2.3643,  ...,  2.3495,  2.2982,  2.3547],
           [ 1.5989,  1.9217,  1.4902,  ...,  1.9171,  1.4470,  1.9332],
           [ 2.3924,  2.3157,  2.3254,  ...,  2.3102,  2.2687,  2.2908],
           ...,
           [ 1.1738,  1.5591,  1.1533,  ...,  1.5761,  1.1315,  1.6273],
           [ 1.9482,  1.9285,  1.9513,  ...,  1.8993,  1.8924,  1.9564],
           [ 1.1700,  1.5764,  1.1771,  ...,  1.5576,  1.1377,  1.6245]],

          [[-3.1252, -2.8092, -2.9427,  ..., -2.7888, -2.8804, -3.2929],
           [-2.8293, -2.8999, -2.6744,  ..., -2.3370, -2.9290, -2.5252],
           [-2.7008, -3.1059, -2.4885,  ..., -3.2161, -2.6136, -3.3205],
           ...,
           [-2.8039, -2.5668, -3.0519,  ..., -2.4193, -2.8015, -2.1847],
           [-2.6102, -3.1825, -2.6902,  ..., -3.2338, -2.4342, -2.9710],
           [-2.3948, -2.5694, -2.9196,  ..., -2.4003, -2.6029, -2.2502]],

          [[ 2.3764,  2.7101,  2.2641,  ...,  1.9790,  2.7310,  1.8571],
           [ 2

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 2.4307,  2.3281,  2.3676,  ...,  2.3483,  2.2985,  2.3537],
           [ 1.6207,  1.9346,  1.5071,  ...,  1.9298,  1.4620,  1.9466],
           [ 2.3970,  2.3129,  2.3269,  ...,  2.3071,  2.2677,  2.2869],
           ...,
           [ 1.1764,  1.5556,  1.1550,  ...,  1.5733,  1.1322,  1.6268],
           [ 1.9327,  1.9082,  1.9359,  ...,  1.8777,  1.8743,  1.9373],
           [ 1.1725,  1.5737,  1.1799,  ...,  1.5540,  1.1387,  1.6240]],

          [[-3.0639, -2.6997, -2.8731,  ..., -2.6784, -2.8079, -3.2053],
           [-2.7525, -2.8408, -2.5906,  ..., -2.2524, -2.8567, -2.4491],
           [-2.6203, -3.0098, -2.3983,  ..., -3.1251, -2.5291, -3.2342],
           ...,
           [-2.7259, -2.4925, -2.9852,  ..., -2.3384, -2.7234, -2.0931],
           [-2.5256, -3.0900, -2.6091,  ..., -3.1436, -2.3416, -2.8688],
           [-2.2983, -2.4953, -2.8469,  ..., -2.3185, -2.5158, -2.1616]],

          [[ 2.2789,  2.6468,  2.1616,  ...,  1.8825,  2.6496,  1.7552],
           [ 2

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 2.4322,  2.3223,  2.3663,  ...,  2.3433,  2.2942,  2.3490],
           [ 1.6440,  1.9462,  1.5254,  ...,  1.9411,  1.4783,  1.9587],
           [ 2.3970,  2.3064,  2.3239,  ...,  2.3003,  2.2620,  2.2793],
           ...,
           [ 1.1802,  1.5505,  1.1579,  ...,  1.5691,  1.1340,  1.6249],
           [ 1.9123,  1.8839,  1.9157,  ...,  1.8521,  1.8513,  1.9143],
           [ 1.1761,  1.5694,  1.1838,  ...,  1.5489,  1.1408,  1.6219]],

          [[-2.9973, -2.5817, -2.7981,  ..., -2.5594, -2.7301, -3.1094],
           [-2.6688, -2.7778, -2.4998,  ..., -2.1636, -2.7775, -2.3689],
           [-2.5342, -2.9054, -2.3025,  ..., -3.0257, -2.4390, -3.1396],
           ...,
           [-2.6410, -2.4143, -2.9116,  ..., -2.2533, -2.6384, -1.9973],
           [-2.4353, -2.9890, -2.5226,  ..., -3.0450, -2.2433, -2.7582],
           [-2.1946, -2.4171, -2.7672,  ..., -2.2326, -2.4217, -2.0688]],

          [[ 2.1744,  2.5767,  2.0519,  ...,  1.7790,  2.5613,  1.6460],
           [ 2

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 2.4281,  2.3106,  2.3595,  ...,  2.3325,  2.2843,  2.3385],
           [ 1.6671,  1.9534,  1.5435,  ...,  1.9481,  1.4944,  1.9665],
           [ 2.3915,  2.2940,  2.3152,  ...,  2.2877,  2.2508,  2.2658],
           ...,
           [ 1.1836,  1.5410,  1.1603,  ...,  1.5603,  1.1355,  1.6185],
           [ 1.8862,  1.8537,  1.8898,  ...,  1.8205,  1.8227,  1.8854],
           [ 1.1793,  1.5607,  1.1874,  ...,  1.5393,  1.1426,  1.6154]],

          [[-2.9242, -2.4526, -2.7166,  ..., -2.4293, -2.6457, -3.0027],
           [-2.5765, -2.7087, -2.4003,  ..., -2.0685, -2.6899, -2.2825],
           [-2.4416, -2.7900, -2.2000,  ..., -2.9154, -2.3423, -3.0341],
           ...,
           [-2.5476, -2.3298, -2.8297,  ..., -2.1621, -2.5448, -1.8953],
           [-2.3385, -2.8772, -2.4295,  ..., -2.9355, -2.1383, -2.6366],
           [-2.0823, -2.3328, -2.6791,  ..., -2.1405, -2.3190, -1.9697]],

          [[ 2.0608,  2.4988,  1.9331,  ...,  1.6673,  2.4641,  1.5287],
           [ 2

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 2.4177,  2.2934,  2.3462,  ...,  2.3162,  2.2680,  2.3224],
           [ 1.6894,  1.9567,  1.5608,  ...,  1.9512,  1.5097,  1.9703],
           [ 2.3795,  2.2761,  2.3002,  ...,  2.2696,  2.2331,  2.2467],
           ...,
           [ 1.1863,  1.5275,  1.1621,  ...,  1.5476,  1.1362,  1.6082],
           [ 1.8537,  1.8178,  1.8574,  ...,  1.7833,  1.7876,  1.8508],
           [ 1.1818,  1.5480,  1.1902,  ...,  1.5257,  1.1436,  1.6049]],

          [[-2.8438, -2.3122, -2.6277,  ..., -2.2880, -2.5539, -2.8847],
           [-2.4749, -2.6330, -2.2916,  ..., -1.9666, -2.5929, -2.1894],
           [-2.3414, -2.6634, -2.0900,  ..., -2.7939, -2.2381, -2.9175],
           ...,
           [-2.4448, -2.2386, -2.7384,  ..., -2.0640, -2.4420, -1.7863],
           [-2.2341, -2.7541, -2.3288,  ..., -2.8148, -2.0258, -2.5037],
           [-1.9606, -2.2417, -2.5818,  ..., -2.0415, -2.2069, -1.8638]],

          [[ 1.9376,  2.4113,  1.8047,  ...,  1.5459,  2.3574,  1.4017],
           [ 2

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 2.4007,  2.2693,  2.3265,  ...,  2.2931,  2.2452,  2.2995],
           [ 1.7121,  1.9557,  1.5784,  ...,  1.9500,  1.5254,  1.9699],
           [ 2.3611,  2.2514,  2.2786,  ...,  2.2446,  2.2089,  2.2208],
           ...,
           [ 1.1892,  1.5097,  1.1641,  ...,  1.5306,  1.1372,  1.5936],
           [ 1.8147,  1.7751,  1.8185,  ...,  1.7393,  1.7460,  1.8095],
           [ 1.1846,  1.5310,  1.1933,  ...,  1.5078,  1.1448,  1.5902]],

          [[-2.7554, -2.1588, -2.5309,  ..., -2.1336, -2.4542, -2.7537],
           [-2.3636, -2.5505, -2.1731,  ..., -1.8581, -2.4862, -2.0895],
           [-2.2334, -2.5237, -1.9722,  ..., -2.6593, -2.1261, -2.7877],
           ...,
           [-2.3323, -2.1407, -2.6374,  ..., -1.9593, -2.3294, -1.6707],
           [-2.1220, -2.6180, -2.2203,  ..., -2.6811, -1.9055, -2.3577],
           [-1.8291, -2.1439, -2.4747,  ..., -1.9359, -2.0851, -1.7512]],

          [[ 1.8042,  2.3138,  1.6661,  ...,  1.4145,  2.2404,  1.2646],
           [ 2

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 2.3754,  2.2375,  2.2984,  ...,  2.2621,  2.2141,  2.2687],
           [ 1.7349,  1.9499,  1.5962,  ...,  1.9440,  1.5411,  1.9645],
           [ 2.3343,  2.2189,  2.2488,  ...,  2.2118,  2.1764,  2.1872],
           ...,
           [ 1.1924,  1.4871,  1.1663,  ...,  1.5088,  1.1385,  1.5741],
           [ 1.7674,  1.7248,  1.7714,  ...,  1.6875,  1.6961,  1.7604],
           [ 1.1876,  1.5092,  1.1967,  ...,  1.4852,  1.1464,  1.5706]],

          [[-2.6584, -1.9914, -2.4254,  ..., -1.9653, -2.3459, -2.6086],
           [-2.2410, -2.4601, -2.0433,  ..., -1.7418, -2.3682, -1.9819],
           [-2.1168, -2.3700, -1.8458,  ..., -2.5107, -2.0055, -2.6439],
           ...,
           [-2.2085, -2.0350, -2.5250,  ..., -1.8468, -2.2054, -1.5474],
           [-2.0012, -2.4678, -2.1032,  ..., -2.5333, -1.7766, -2.1978],
           [-1.6864, -2.0383, -2.3562,  ..., -1.8225, -1.9520, -1.6309]],

          [[ 1.6590,  2.2058,  1.5157,  ...,  1.2728,  2.1115,  1.1173],
           [ 1

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 2.3394,  2.1959,  2.2596,  ...,  2.2214,  2.1722,  2.2283],
           [ 1.7566,  1.9370,  1.6130,  ...,  1.9309,  1.5560,  1.9522],
           [ 2.2968,  2.1766,  2.2082,  ...,  2.1693,  2.1333,  2.1438],
           ...,
           [ 1.1948,  1.4577,  1.1678,  ...,  1.4802,  1.1389,  1.5478],
           [ 1.7097,  1.6649,  1.7137,  ...,  1.6263,  1.6358,  1.7017],
           [ 1.1898,  1.4806,  1.1992,  ...,  1.4557,  1.1471,  1.5442]],

          [[-2.5504, -1.8055, -2.3091,  ..., -1.7785, -2.2267, -2.4448],
           [-2.1044, -2.3600, -1.8997,  ..., -1.6160, -2.2362, -1.8647],
           [-1.9895, -2.1976, -1.7088,  ..., -2.3433, -1.8742, -2.4813],
           ...,
           [-2.0708, -1.9197, -2.3986,  ..., -1.7248, -2.0676, -1.4146],
           [-1.8697, -2.2990, -1.9754,  ..., -2.3668, -1.6371, -2.0193],
           [-1.5301, -1.9231, -2.2237,  ..., -1.6996, -1.8051, -1.5012]],

          [[ 1.4996,  2.0838,  1.3511,  ...,  1.1175,  1.9682,  0.9565],
           [ 1

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])


cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.
cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCog

[tensor([[[[[ 2.2905,  2.1421,  2.2080,  ...,  2.1684,  2.1177,  2.1755],
           [ 1.7784,  1.9158,  1.6299,  ...,  1.9094,  1.5710,  1.9314],
           [ 2.2464,  2.1221,  2.1549,  ...,  2.1146,  2.0774,  2.0882],
           ...,
           [ 1.1976,  1.4203,  1.1696,  ...,  1.4435,  1.1398,  1.5134],
           [ 1.6395,  1.5931,  1.6437,  ...,  1.5532,  1.5631,  1.6312],
           [ 1.1925,  1.4439,  1.2021,  ...,  1.4182,  1.1483,  1.5097]],

          [[-2.4293, -1.5978, -2.1799,  ..., -1.5698, -2.0947, -2.2587],
           [-1.9514, -2.2483, -1.7398,  ..., -1.4791, -2.0876, -1.7362],
           [-1.8494, -2.0031, -1.5593,  ..., -2.1538, -1.7302, -2.2964],
           ...,
           [-1.9166, -1.7930, -2.2556,  ..., -1.5915, -1.9133, -1.2709],
           [-1.7256, -2.1079, -1.8349,  ..., -2.1780, -1.4851, -1.8188],
           [-1.3576, -1.7966, -2.0747,  ..., -1.5655, -1.6420, -1.3604]],

          [[ 1.3227,  1.9453,  1.1693,  ...,  0.9463,  1.8072,  0.7798],
           [ 1

cross_attention_kwargs ['scale_multiplier'] are not expected by FusedCogVideoXAttnProcessor2_0 and will be ignored.


torch.Size([2, 13, 16, 32, 32])
torch.Size([2, 13, 16, 32, 32])
[tensor([[[[[ 2.2229,  2.0711,  2.1377,  ...,  2.0983,  2.0446,  2.1056],
           [ 1.7994,  1.8826,  1.6461,  ...,  1.8760,  1.5853,  1.8987],
           [ 2.1774,  2.0505,  2.0829,  ...,  2.0427,  2.0030,  2.0155],
           ...,
           [ 1.2000,  1.3712,  1.1711,  ...,  1.3952,  1.1404,  1.4674],
           [ 1.5510,  1.5046,  1.5554,  ...,  1.4634,  1.4723,  1.5439],
           [ 1.1947,  1.3957,  1.2047,  ...,  1.3691,  1.1491,  1.4635]],

          [[-2.2903, -1.3585, -2.0329,  ..., -1.3297, -1.9450, -2.0405],
           [-1.7752, -2.1200, -1.5568,  ..., -1.3263, -1.9157, -1.5916],
           [-1.6919, -1.7769, -1.3924,  ..., -1.9323, -1.5688, -2.0795],
           ...,
           [-1.7393, -1.6503, -2.0891,  ..., -1.4423, -1.7359, -1.1115],
           [-1.5641, -1.8850, -1.6768,  ..., -1.9573, -1.3159, -1.5866],
           [-1.1624, -1.6539, -1.9024,  ..., -1.4155, -1.4559, -1.2038]],

          [[ 1.1217,  1

In [70]:
x = iter(data_loader)

In [71]:
for i in range(10):
    print(next(x)['video_chunks'][0].shape)

torch.Size([1, 3, 50, 256, 256])
torch.Size([1, 3, 50, 256, 256])
torch.Size([1, 3, 50, 256, 256])
torch.Size([1, 3, 50, 256, 256])
torch.Size([1, 3, 50, 256, 256])
torch.Size([1, 3, 50, 256, 256])
torch.Size([1, 3, 50, 256, 256])
torch.Size([1, 3, 50, 256, 256])
torch.Size([1, 3, 50, 256, 256])
torch.Size([1, 3, 50, 256, 256])


In [None]:
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import retrieve_timesteps
from tqdm.notebook import tqdm

latent_chunks        = [] # [B, F, C, H, W]
uncond_latent_chunks = []
cond_chunks          = []
uncond_chunks        = []
uncond_latents = []
ref_mask_chunks      = []
sequence_infos       = []
ref_latents = []

def encode_video(video):
    
    video = video.to(device, dtype=vae.dtype)
    # VAE takes shape of [B, C, F, H, W]
    with torch.no_grad():
        latent_dist = vae.encode(video).latent_dist.sample() * vae.config.scaling_factor

    # Permute back to [B, F, C, H, W]
    return latent_dist.permute(0, 2, 1, 3, 4).to(memory_format=torch.contiguous_format) 


for batch_id, batch in enumerate(data_loader):
    for chunk_id in range(len(batch["video_chunks"])):
        B, C, F, H, W = batch['video_chunks'][chunk_id].shape
        latent_chunk = encode_video(batch["video_chunks"][chunk_id]) # [B, C, F, H, W]
        latent_chunks.append(latent_chunk)
        uncond_latent_chunks.append(latent_chunk * 0.)
        cond_chunk = {key : batch['cond_chunks'][key][chunk_id] for key in batch['cond_chunks'].keys()}
        ref_mask_chunks.append(cond_chunk["ref_mask"].permute(0,1,4,2,3))
        uncond_chunks.append(cond_chunk["ref_mask"] * 0.)
        uncond_latents.append(latent_chunk * 0.)
        sequence_infos.append((False, torch.arange(0, latent_chunk.shape[1], device=device)))
        ref_latents.append(torch.cat([latent_chunk, uncond_latents[chunk_id]], dim=0))

num_inference_steps=50
generator = torch.Generator(device=device)
generator.manual_seed(args.seed)

timesteps, num_inference_steps = retrieve_timesteps(scheduler, num_inference_steps, device)

latent_channels = 16 # transformer.config.in_channels

T_total = sum(chunk.shape[1] for chunk in latent_chunks)  # = 4 * (num_chunks)
inner_dim = (
    accelerator.unwrap_model(transformer).config.num_attention_heads
    * accelerator.unwrap_model(transformer).config.attention_head_dim
)

text_embeds        = torch.zeros((B, 1, inner_dim), dtype=dtype, device=device)
text_in = torch.cat([text_embeds] * 2, dim=0)

audio_embeds   = torch.zeros((B, T_total, 768), dtype=dtype, device=device)
uncond_audio_embeds = audio_embeds * 0.0
audio_in = torch.cat([audio_embeds] * 2, dim=0)

initial_latents = []
ref_latents = []

for chunk_id in range(len(latent_chunks)):
    # Add Reference latent if used as refernce or else add random noise
    if sequence_infos[chunk_id][0]:
        noisy_latent = latent_chunks[chunk_id]
    else:
        noisy_latent = torch.randn(
            latent_chunks[chunk_id].shape,
            generator=generator,
            device=device,
            dtype=dtype
        )
    
    initial_latents.append(noisy_latent)

    ref_latents.append(latent_chunks[chunk_id])

    latents = initial_latents

num_warmup_steps = max(len(timesteps) - num_inference_steps * scheduler.order, 0)

guidance_scale =  1
# noise_preds = 0
for i, t in tqdm(enumerate(timesteps), desc="Denoising"):
    print(i)
    latent_model_inputs = [torch.cat([chunks] * 2, dim=0)for chunks in latent_chunks]
    old_pred_original_samples = [None] * len(latent_chunks)
    timestep = t.expand(latent_chunks[0].shape[0])
    current_sampling_percent = i / len(timesteps)
    B, F, C, H, W = latent_chunks[0].shape

    with torch.no_grad():
        noise_preds = transformer(
            hidden_states=latent_model_inputs,
            encoder_hidden_states=text_in,
            audio_embeds=audio_in,
            condition=[torch.zeros((B * 2, F, 1, H, W), device=device, dtype=dtype) for item in latent_model_inputs],
            sequence_infos=[[False, torch.arange(chunk.shape[1])]for chunk in latent_chunks],
            timestep=timestep,
            image_rotary_emb=None,
            return_dict=False,
            growth_factor=None,
        )[0]

    new_latents = []
    new_old_pred_original_samples = []

    for noise_pred, old_pred_original_sample, latent in zip(noise_preds, old_pred_original_samples, latent_chunks):
        noise_pred, noise_pred_uncond = noise_pred.chunk(2)
        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond)
    
        latent, old_pred_original_sample = scheduler.step(
            noise_pred,
            old_pred_original_sample,
            t,
            timesteps[i - 1] if i > 0 else None,
            latent,
            eta=0.0,
            generator=generator
        )

        new_latents.append(latent)
        new_old_pred_original_samples.append(old_pred_original_sample)

    latents = new_latents
    old_pred_original_samples = new_old_pred_original_samples

[2025-06-07 18:53:09,021] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)


06/07/2025 18:53:09 - INFO - root - gcc -pthread -B /scratch/ondemand28/harryscz/anaconda3/envs/pytorch3d/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /scratch/ondemand28/harryscz/anaconda3/envs/pytorch3d/include -I/scratch/ondemand28/harryscz/anaconda3/envs/pytorch3d/include -fPIC -O2 -isystem /scratch/ondemand28/harryscz/anaconda3/envs/pytorch3d/include -fPIC -c /tmp/tmp7fn1wcuz/test.c -o /tmp/tmp7fn1wcuz/test.o




06/07/2025 18:53:09 - INFO - root - gcc -pthread -B /scratch/ondemand28/harryscz/anaconda3/envs/pytorch3d/compiler_compat /tmp/tmp7fn1wcuz/test.o -laio -o /tmp/tmp7fn1wcuz/a.out
/scratch/ondemand28/harryscz/anaconda3/envs/pytorch3d/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
06/07/2025 18:53:10 - INFO - root - gcc -pthread -B /scratch/ondemand28/harryscz/anaconda3/envs/pytorch3d/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /scratch/ondemand28/harryscz/anaconda3/envs/pytorch3d/include -I/scratch/ondemand28/harryscz/anaconda3/envs/pytorch3d/include -fPIC -O2 -isystem /scratch/ondemand28/harryscz/anaconda3/envs/pytorch3d/include -fPIC -c /tmp/tmpm8zzhwth/test.c -o /tmp/tmpm8zzhwth/test.o
06/07/2025 18:53:10 - INFO - root - gcc -pthread -B /scratch/ondemand28/harryscz/anaconda3/envs/pytorch3d/compiler_compat /tmp/tmpm8zzhwth/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o

Denoising: 0it [00:00, ?it/s]

0
