In [7]:
import argparse
import glob
import os
import time

import imageio
import numpy as np
import torch
import torchvision
from einops import rearrange
from rich import print, pretty

# Enable rich's pretty-printing for values displayed by the interactive session.
pretty.install()

# Rendezvous address/port for the distributed backend. They must be present
# even on a single GPU, because the pipeline still initialises a process group.
os.environ.update(
    MASTER_ADDR="localhost",
    MASTER_PORT="29504",
)

from fastvideo.configs.pipelines.lingbotworld import LingbotWorldT2VBaseConfig
from fastvideo.fastvideo_args import FastVideoArgs, ExecutionMode
from fastvideo.logger import init_logger
from fastvideo.pipelines import build_pipeline
from fastvideo.pipelines.pipeline_batch_info import ForwardBatch
from fastvideo.utils import maybe_download_model

# Notebook-wide logger obtained from fastvideo's logging helper; the cells below log through it.
logger = init_logger(__name__)

In [2]:
# Model identifier handed to `maybe_download_model`. The default is a local
# checkpoint directory; set LINGBOT_MODEL_ID to point the notebook at another
# location without editing this cell.
MODEL_ID = os.environ.get(
    "LINGBOT_MODEL_ID", "/home/builder/dev/data/lingbot-world-base-cam"
)

# Local directory to cache the model (override with LINGBOT_LOCAL_DIR).
LOCAL_DIR = os.environ.get(
    "LINGBOT_LOCAL_DIR", "/home/builder/dev/data/lingbot-world-base-cam"
)

# Output directory for generated videos (override with LINGBOT_OUTPUT_DIR).
OUTPUT_DIR = os.environ.get(
    "LINGBOT_OUTPUT_DIR", "video_samples_fastvideo-lingbot-world-base-cam"
)


In [3]:
# Sample English text prompt, assembled from adjacent string literals.
# NOTE(review): not referenced by the cells visible in this notebook.
EXAMPLE_PROMPT = (
    "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. "
    "The fluffy-furred feline gazes directly at the camera with a relaxed expression. "
    "Blurred beach scenery forms the background featuring crystal-clear waters, distant "
    "green hills, and a blue sky dotted with white clouds. The cat assumes a naturally "
    "relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot "
    "highlights the feline's intricate details and the refreshing atmosphere of the seaside."
)
# Negative prompt, written in Chinese: a comma-separated list of traits to steer
# away from (e.g. over-exposure, blurry details, subtitles, low quality, JPEG
# artifacts, extra fingers, deformed limbs, static frames, cluttered background).
# The literal is kept verbatim because it is passed to the model as-is.
negative_prompt = "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走"
# URL of a sample conditioning image.
# NOTE(review): the ForwardBatch cell uses a local image path instead — confirm this is still needed.
EXAMPLE_IMAGE = "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/wan_i2v_input.JPG"


In [4]:
def print_model_info(
    model,
    model_name: str = "Model",
    checkpoint_params: int | None = None,
):
    """
    Print detailed model architecture and parameter information.
    """
    logger.info("=" * 70)
    logger.info("%s Architecture Information", model_name)
    logger.info("=" * 70)
    
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    
    logger.info("Total Parameters (runtime): %s (%.2fB)", f"{total_params:,}", total_params / 1e9)
    if checkpoint_params is not None:
        logger.info(
            "Total Parameters (checkpoint): %s (%.2fB)",
            f"{checkpoint_params:,}",
            checkpoint_params / 1e9,
        )
    logger.info("Trainable Parameters: %s (%.2fB)", f"{trainable_params:,}", trainable_params / 1e9)
    logger.info("Model dtype: %s", next(model.parameters()).dtype)
    
    # Print model architecture summary
    logger.info("\nModel Architecture:")
    logger.info("-" * 70)
    
    # Get top-level modules
    for name, module in model.named_children():
        num_params = sum(p.numel() for p in module.parameters())
        logger.info("  %s: %s (%s params)", name, module.__class__.__name__, f"{num_params:,}")
    
    # Print detailed layer counts
    logger.info("\nLayer Statistics:")
    logger.info("-" * 70)
    
    layer_types = {}
    for name, module in model.named_modules():
        module_type = module.__class__.__name__
        if module_type not in layer_types:
            layer_types[module_type] = 0
        layer_types[module_type] += 1
    
    # Sort by count and print top layer types
    sorted_layers = sorted(layer_types.items(), key=lambda x: x[1], reverse=True)[:15]
    for layer_type, count in sorted_layers:
        logger.info("  %s: %d", layer_type, count)
    
    logger.info("=" * 70)

In [5]:
# Resolve the model location; the log output under this cell shows the existing local copy being reused.
model_path = maybe_download_model(MODEL_ID, local_dir=LOCAL_DIR)

INFO 02-06 14:04:21.683 [utils.py:512] Model already exists locally at /home/builder/dev/data/lingbot-world-base-cam


In [11]:
# Single-GPU configuration: every parallelism dimension is 1. Only the text
# encoder is offloaded to CPU; the DiT and the VAE are not.
pipeline_config = LingbotWorldT2VBaseConfig()
fastvideo_args = FastVideoArgs(
    model_path=model_path,
    num_gpus=1,
    tp_size=1,
    sp_size=1,
    hsdp_shard_dim=1,
    hsdp_replicate_dim=1,
    dit_cpu_offload=False,
    text_encoder_cpu_offload=True,
    vae_cpu_offload=False,
    pipeline_config=pipeline_config,
)
# `print` does not interpolate logger-style "%s" placeholders, so the label and
# the object are passed as separate arguments.
print("fastvideo_args:", fastvideo_args)
pipeline = build_pipeline(fastvideo_args)

INFO 02-06 14:05:40.256 [utils.py:512] Model already exists locally at /home/builder/dev/data/lingbot-world-base-cam
INFO 02-06 14:05:40.257 [__init__.py:42] Model path: /home/builder/dev/data/lingbot-world-base-cam
INFO 02-06 14:05:40.257 [__init__.py:44] Building pipeline of type: basic
INFO 02-06 14:05:40.258 [utils.py:600] Diffusers version: 0.35.0.dev0
INFO 02-06 14:05:40.258 [pipeline_registry.py:150] Loading pipelines for types: ['basic']
INFO 02-06 14:05:40.287 [pipeline_registry.py:200] Loaded 23 pipeline classes across 1 types
INFO 02-06 14:05:40.288 [utils.py:600] Diffusers version: 0.35.0.dev0
INFO 02-06 14:05:40.289 [parallel_state.py:976] Initializing distributed environment with world_size=1, device=cuda:0
INFO 02-06 14:05:40.289 [parallel_state.py:788] Using nccl backend for CUDA platform
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[G

Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


INFO 02-06 14:05:42.640 [component_loader.py:386] Loading weights took 1.98 seconds
INFO 02-06 14:06:10.150 [composed_pipeline_base.py:370] Loaded module text_encoder from /home/builder/dev/data/lingbot-world-base-cam/text_encoder
INFO 02-06 14:06:10.152 [component_loader.py:995] Loading tokenizer using transformers from /home/builder/dev/data/lingbot-world-base-cam/tokenizer
INFO 02-06 14:06:10.152 [component_loader.py:506] Loading tokenizer from /home/builder/dev/data/lingbot-world-base-cam/tokenizer
INFO 02-06 14:06:10.585 [component_loader.py:554] Loaded tokenizer: T5TokenizerFast
INFO 02-06 14:06:10.586 [composed_pipeline_base.py:370] Loaded module tokenizer from /home/builder/dev/data/lingbot-world-base-cam/tokenizer
INFO 02-06 14:06:10.587 [component_loader.py:995] Loading transformer using diffusers from /home/builder/dev/data/lingbot-world-base-cam/transformer
INFO 02-06 14:06:10.587 [component_loader.py:745] transformer cls_name: LingbotWorldTransformer3DModel
INFO 02-06 14:0

Loading safetensors checkpoint shards:   0% Completed | 0/8 [00:00<?, ?it/s]


INFO 02-06 14:06:23.294 [component_loader.py:837] Loaded model with 18.54B parameters
INFO 02-06 14:07:05.564 [composed_pipeline_base.py:370] Loaded module transformer from /home/builder/dev/data/lingbot-world-base-cam/transformer
INFO 02-06 14:07:05.565 [component_loader.py:995] Loading transformer_2 using diffusers from /home/builder/dev/data/lingbot-world-base-cam/transformer_2
INFO 02-06 14:07:05.566 [component_loader.py:745] transformer cls_name: LingbotWorldTransformer3DModel
INFO 02-06 14:07:05.567 [component_loader.py:793] Loading model from 8 safetensors files: ['/home/builder/dev/data/lingbot-world-base-cam/transformer_2/diffusion_pytorch_model-00008-of-00008.safetensors', '/home/builder/dev/data/lingbot-world-base-cam/transformer_2/diffusion_pytorch_model-00004-of-00008.safetensors', '/home/builder/dev/data/lingbot-world-base-cam/transformer_2/diffusion_pytorch_model-00007-of-00008.safetensors', '/home/builder/dev/data/lingbot-world-base-cam/transformer_2/diffusion_pytorch_m

Loading safetensors checkpoint shards:   0% Completed | 0/8 [00:00<?, ?it/s]


INFO 02-06 14:07:16.679 [component_loader.py:837] Loaded model with 18.54B parameters
INFO 02-06 14:07:58.609 [composed_pipeline_base.py:370] Loaded module transformer_2 from /home/builder/dev/data/lingbot-world-base-cam/transformer_2
INFO 02-06 14:07:58.610 [component_loader.py:995] Loading vae using diffusers from /home/builder/dev/data/lingbot-world-base-cam/vae
INFO 02-06 14:07:59.361 [composed_pipeline_base.py:370] Loaded module vae from /home/builder/dev/data/lingbot-world-base-cam/vae
INFO 02-06 14:07:59.362 [lora_pipeline.py:147] trainable_transformer_modules: dict_keys(['transformer', 'transformer_2'])
INFO 02-06 14:07:59.363 [__init__.py:59] Pipelines instantiated


In [15]:
# List the loaded component names, then dump the two components of interest.
print(pipeline.modules.keys())
for component in ("transformer", "vae"):
    print(pipeline.modules[component])

In [16]:
# --- Generation settings ---
height, width = 480, 832
num_frames = 21
num_inference_steps = 40
guidance_scale = 5.0
seed = 42
output_path = "output.mp4"

# Latent-grid size used to derive the token count.
# NOTE(review): the divisors 8 (spatial) and 4 (temporal) presumably mirror the
# VAE compression factors — confirm against the model config.
num_latent_frames = 1 + (num_frames - 1) // 4
height_latents, width_latents = height // 8, width // 8
n_tokens = num_latent_frames * height_latents * width_latents

# Request object handed to `pipeline.forward`.
batch = ForwardBatch(
    data_type="video",
    prompt="a dragon flying in the sky",
    negative_prompt=negative_prompt,
    image_path="/home/builder/dev/lingbot-world/examples/00/image.jpg",
    height=height,
    width=width,
    num_frames=num_frames,
    num_inference_steps=num_inference_steps,
    guidance_scale=guidance_scale,
    seed=seed,
    n_tokens=n_tokens,
    output_path=output_path,
    save_video=True,
    fps=16,
)

In [17]:
# Run the pipeline on the request; the generated tensor is read from `output_batch.output` in the export cell.
output_batch = pipeline.forward(batch, fastvideo_args)

INFO 02-06 14:12:37.660 [composed_pipeline_base.py:159] Creating pipeline stages...
INFO 02-06 14:12:37.661 [cuda.py:124] Trying FASTVIDEO_ATTENTION_BACKEND=None
INFO 02-06 14:12:37.661 [cuda.py:126] Selected backend: None
INFO 02-06 14:12:37.666 [cuda.py:267] Cannot use FlashAttention-2 backend because the flash_attn package is not found. Make sure that flash_attn was built and installed (on by default).
INFO 02-06 14:12:37.666 [cuda.py:274] Using Torch SDPA backend.
INFO 02-06 14:12:37.667 [composed_pipeline_base.py:424] Running pipeline stages: dict_keys(['input_validation_stage', 'prompt_encoding_stage', 'conditioning_stage', 'timestep_preparation_stage', 'latent_preparation_stage', 'image_latent_preparation_stage', 'denoising_stage', 'decoding_stage'])


  0%|          | 0/40 [00:00<?, ?it/s]

INFO 02-06 14:12:39.916 [lingbotworld.py:404] Padding not applied
INFO 02-06 14:13:18.656 [lingbotworld.py:404] Padding not applied


In [19]:
def _to_uint8_frame(frame_batch):
    """Tile one time step's batch into a grid and return it as a channels-last uint8 array."""
    grid = torchvision.utils.make_grid(frame_batch, nrow=6)
    channels_last = grid.permute(1, 2, 0).squeeze(-1)  # (C, H, W) -> (H, W, C)
    return (channels_last * 255).numpy().astype(np.uint8)


# Put the time axis first so each iteration yields one frame for the whole batch.
samples = output_batch.output
videos = rearrange(samples, "b c t h w -> t b c h w")
frames = [_to_uint8_frame(step) for step in videos]
imageio.mimsave(output_path, frames, fps=batch.fps, format="mp4")
logger.info("Saved video to: %s", output_path)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO 02-06 14:14:24.263 [1569481540.py:9] Saved video to: output.mp4


In [29]:
# Show the stage container with its contents, then the concrete classes of two components.
print(type(pipeline.stages), pipeline.stages[:])
for component in ("transformer", "vae"):
    print(type(pipeline.modules[component]))