In [1]:

import argparse
import os
import time

import imageio
import numpy as np
import torch
import torchvision
from einops import rearrange
from rich import print, pretty

pretty.install()

# Distributed env defaults (torchrun will override)
os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "29506")

os.environ["CUDA_VISIBLE_DEVICES"] = "1"


In [2]:
from fastvideo.configs.pipelines.wan import Wan2_2_I2V_A14B_Config
from fastvideo.distributed import get_local_torch_device
from fastvideo.distributed.parallel_state import (
    maybe_init_distributed_environment_and_model_parallel,
)
from fastvideo.fastvideo_args import ExecutionMode, FastVideoArgs, TrainingArgs, WorkloadType
from fastvideo.logger import init_logger
from fastvideo.models.schedulers.scheduling_flow_unipc_multistep import (
    FlowUniPCMultistepScheduler,
)
from fastvideo.pipelines import build_pipeline
from fastvideo.pipelines.pipeline_batch_info import ForwardBatch, TrainingBatch
from fastvideo.utils import maybe_download_model
from fastvideo.training.wan_i2v_training_pipeline import WanI2VTrainingPipeline
from fastvideo.models.schedulers.scheduling_flow_match_euler_discrete import (
    FlowMatchEulerDiscreteScheduler,
)
from fastvideo.platforms import current_platform
logger = init_logger(__name__)

INFO 02-06 15:26:48.724 [__init__.py:109] ROCm platform is unavailable: No module named 'amdsmi'
INFO 02-06 15:26:48.731 [__init__.py:47] CUDA is available


In [3]:
MODEL_ID = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"
# LOCAL_DIR = os.path.join("dev", MODEL_ID.replace("/", "-"))
LOCAL_DIR = os.path.join(os.path.expanduser("~"), "dev/data/", MODEL_ID.replace("/", "-"))
OUTPUT_DIR = "video_samples_wan2_2_14B_i2v"

EXAMPLE_PROMPT = (
    "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. "
    "The fluffy-furred feline gazes directly at the camera with a relaxed expression. "
    "Blurred beach scenery forms the background featuring crystal-clear waters, distant "
    "green hills, and a blue sky dotted with white clouds. The cat assumes a naturally "
    "relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot "
    "highlights the feline's intricate details and the refreshing atmosphere of the seaside."
)
EXAMPLE_IMAGE = "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/wan_i2v_input.JPG"
NEGATIVE_PROMPT = (
    "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，"
    "低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，"
    "形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走"
)
def _resolve_model_path() -> str:
    logger.info("Downloading/loading model: %s", MODEL_ID)
    model_path = maybe_download_model(MODEL_ID, local_dir=LOCAL_DIR)
    logger.info("Model path: %s", model_path)
    return model_path


print(LOCAL_DIR)

In [10]:
model_path = _resolve_model_path()

pipeline_config = Wan2_2_I2V_A14B_Config()
fastvideo_args = FastVideoArgs(
    model_path=model_path,
    num_gpus=1,
    tp_size=1,
    sp_size=1,
    hsdp_shard_dim=1,
    hsdp_replicate_dim=1,
    dit_cpu_offload=False,
    text_encoder_cpu_offload=True,
    vae_cpu_offload=False,
    pipeline_config=pipeline_config,
)
print(fastvideo_args)

INFO 02-06 15:31:13.022 [1682206808.py:21] Downloading/loading model: Wan-AI/Wan2.2-I2V-A14B-Diffusers
INFO 02-06 15:31:13.024 [utils.py:517] Downloading model snapshot from HF Hub for Wan-AI/Wan2.2-I2V-A14B-Diffusers...


Fetching 50 files:   0%|          | 0/50 [00:00<?, ?it/s]

INFO 02-06 15:31:13.148 [utils.py:524] Downloaded model to /home/builder/dev/Wan-AI-Wan2.2-I2V-A14B-Diffusers
INFO 02-06 15:31:13.148 [1682206808.py:23] Model path: /home/builder/dev/Wan-AI-Wan2.2-I2V-A14B-Diffusers


In [12]:
pipeline = build_pipeline(fastvideo_args)

INFO 02-06 15:33:16.902 [utils.py:512] Model already exists locally at /home/builder/dev/Wan-AI-Wan2.2-I2V-A14B-Diffusers
INFO 02-06 15:33:16.903 [__init__.py:42] Model path: /home/builder/dev/Wan-AI-Wan2.2-I2V-A14B-Diffusers
INFO 02-06 15:33:16.903 [__init__.py:44] Building pipeline of type: basic
INFO 02-06 15:33:16.903 [profiler.py:191] Torch profiler disabled; returning no-op controller
INFO 02-06 15:33:16.904 [composed_pipeline_base.py:88] Loading pipeline modules...
INFO 02-06 15:33:16.904 [utils.py:512] Model already exists locally at /home/builder/dev/Wan-AI-Wan2.2-I2V-A14B-Diffusers
INFO 02-06 15:33:16.905 [composed_pipeline_base.py:231] Model path: /home/builder/dev/Wan-AI-Wan2.2-I2V-A14B-Diffusers
INFO 02-06 15:33:16.906 [utils.py:600] Diffusers version: 0.35.0.dev0
INFO 02-06 15:33:16.906 [composed_pipeline_base.py:291] Loading pipeline modules from config: {'_class_name': 'WanImageToVideoPipeline', '_diffusers_version': '0.35.0.dev0', 'boundary_ratio': 0.9, 'image_encoder'

Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


INFO 02-06 15:33:22.074 [component_loader.py:386] Loading weights took 5.14 seconds
INFO 02-06 15:33:55.251 [composed_pipeline_base.py:370] Loaded module text_encoder from /home/builder/dev/Wan-AI-Wan2.2-I2V-A14B-Diffusers/text_encoder
INFO 02-06 15:33:55.252 [component_loader.py:995] Loading tokenizer using transformers from /home/builder/dev/Wan-AI-Wan2.2-I2V-A14B-Diffusers/tokenizer
INFO 02-06 15:33:55.252 [component_loader.py:506] Loading tokenizer from /home/builder/dev/Wan-AI-Wan2.2-I2V-A14B-Diffusers/tokenizer
INFO 02-06 15:33:55.703 [component_loader.py:554] Loaded tokenizer: T5TokenizerFast
INFO 02-06 15:33:55.704 [composed_pipeline_base.py:370] Loaded module tokenizer from /home/builder/dev/Wan-AI-Wan2.2-I2V-A14B-Diffusers/tokenizer
INFO 02-06 15:33:55.704 [component_loader.py:995] Loading transformer using diffusers from /home/builder/dev/Wan-AI-Wan2.2-I2V-A14B-Diffusers/transformer
INFO 02-06 15:33:55.705 [component_loader.py:745] transformer cls_name: WanTransformer3DModel

Loading safetensors checkpoint shards:   0% Completed | 0/12 [00:00<?, ?it/s]


INFO 02-06 15:34:05.581 [component_loader.py:837] Loaded model with 14.29B parameters
INFO 02-06 15:34:50.036 [composed_pipeline_base.py:370] Loaded module transformer from /home/builder/dev/Wan-AI-Wan2.2-I2V-A14B-Diffusers/transformer
INFO 02-06 15:34:50.037 [component_loader.py:995] Loading transformer_2 using diffusers from /home/builder/dev/Wan-AI-Wan2.2-I2V-A14B-Diffusers/transformer_2
INFO 02-06 15:34:50.037 [component_loader.py:745] transformer cls_name: WanTransformer3DModel
INFO 02-06 15:34:50.038 [component_loader.py:793] Loading model from 12 safetensors files: ['/home/builder/dev/Wan-AI-Wan2.2-I2V-A14B-Diffusers/transformer_2/diffusion_pytorch_model-00008-of-00012.safetensors', '/home/builder/dev/Wan-AI-Wan2.2-I2V-A14B-Diffusers/transformer_2/diffusion_pytorch_model-00012-of-00012.safetensors', '/home/builder/dev/Wan-AI-Wan2.2-I2V-A14B-Diffusers/transformer_2/diffusion_pytorch_model-00001-of-00012.safetensors', '/home/builder/dev/Wan-AI-Wan2.2-I2V-A14B-Diffusers/transformer

Loading safetensors checkpoint shards:   0% Completed | 0/12 [00:00<?, ?it/s]


INFO 02-06 15:35:20.359 [component_loader.py:837] Loaded model with 14.29B parameters
INFO 02-06 15:36:03.937 [composed_pipeline_base.py:370] Loaded module transformer_2 from /home/builder/dev/Wan-AI-Wan2.2-I2V-A14B-Diffusers/transformer_2
INFO 02-06 15:36:03.938 [component_loader.py:995] Loading vae using diffusers from /home/builder/dev/Wan-AI-Wan2.2-I2V-A14B-Diffusers/vae
INFO 02-06 15:36:05.172 [composed_pipeline_base.py:370] Loaded module vae from /home/builder/dev/Wan-AI-Wan2.2-I2V-A14B-Diffusers/vae
INFO 02-06 15:36:05.173 [lora_pipeline.py:147] trainable_transformer_modules: dict_keys(['transformer', 'transformer_2'])
INFO 02-06 15:36:05.173 [__init__.py:59] Pipelines instantiated


In [19]:
pipeline.post_init()

INFO 02-06 15:39:36.388 [composed_pipeline_base.py:159] Creating pipeline stages...


In [20]:
height = 480
width = 832
num_frames = 21
num_inference_steps = 40
guidance_scale = 5.0
seed = 42
output_path = "output-wan14b.mp4"
height_latents = height // 8
width_latents = width // 8
num_latent_frames = (num_frames - 1) // 4 + 1
n_tokens = num_latent_frames * height_latents * width_latents


batch = ForwardBatch(
    data_type="video",
    prompt="a dragon flying in the sky",
    negative_prompt=NEGATIVE_PROMPT,
    image_path="/home/builder/dev/lingbot-world/examples/00/image.jpg",
    height=height,
    width=width,
    num_frames=num_frames,
    num_inference_steps=num_inference_steps,
    guidance_scale=guidance_scale,
    seed=seed,
    n_tokens=n_tokens,
    output_path=output_path,
    save_video=True,
    fps=16,
)

In [21]:
# with torch.no_grad():
#     output_batch = pipeline.forward(batch, fastvideo_args)
# torch.cuda.empty_cache()

with torch.no_grad():
    for stage in pipeline.stages:
        print(type(stage))
        batch = stage(batch, fastvideo_args)
output_batch = batch
torch.cuda.empty_cache()

  0%|          | 0/40 [00:00<?, ?it/s]

INFO 02-06 15:39:47.133 [wanvideo.py:696] Padding not applied
INFO 02-06 15:40:13.741 [wanvideo.py:696] Padding not applied


In [16]:
samples = output_batch.output
videos = rearrange(samples, "b c t h w -> t b c h w")
frames = []
for x in videos:
    x = torchvision.utils.make_grid(x, nrow=6)
    x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
    frames.append((x * 255).numpy().astype(np.uint8))
imageio.mimsave(output_path, frames, fps=batch.fps, format="mp4")
logger.info("Saved video to: %s", output_path)

RuntimeError: Tensor type unknown to einops <class 'NoneType'>

In [8]:
print(type(pipeline))
print(type(pipeline.modules["vae"]))
print(type(pipeline.stages), pipeline.stages)
del fastvideo_args
torch.cuda.empty_cache()