In [8]:
import argparse
import glob
import os
import time
from collections import Counter

import imageio
import numpy as np
import torch
import torchvision
from einops import rearrange
from rich import print, pretty

# Enable rich's pretty-printing for values echoed by the interactive session.
pretty.install()

# Set distributed environment variables (required even for single GPU).
# These are plain assignments, so any value already present in the process
# environment is overwritten. They are set before the fastvideo imports below.
# NOTE(review): presumably read when the pipeline initializes its distributed
# environment (the build log reports world_size=1) — confirm.
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "29505"

from fastvideo.configs.pipelines.lingbotworld import LingbotWorldT2VBaseConfig
from fastvideo.fastvideo_args import FastVideoArgs, ExecutionMode
from fastvideo.logger import init_logger
from fastvideo.pipelines import build_pipeline
from fastvideo.pipelines.pipeline_batch_info import ForwardBatch
from fastvideo.utils import maybe_download_model

# Notebook-wide logger; used by print_model_info() and the video export cell.
logger = init_logger(__name__)

In [9]:
# Model identifier handed to maybe_download_model(); here it is a local
# checkpoint directory rather than a hub repository name.
MODEL_ID = "/home/builder/workspace/weights/fastvideo-lingbot-world-base-cam"
# Passed to maybe_download_model() as `local_dir`; currently identical to MODEL_ID.
LOCAL_DIR = "/home/builder/workspace/weights/fastvideo-lingbot-world-base-cam"
# NOTE(review): not referenced by any cell visible in this notebook (the
# export cell writes to `output_path` instead) — confirm whether it is needed.
OUTPUT_DIR = "video_samples_fastvideo-lingbot-world-base-cam"

In [None]:
# NOTE(review): this cell is an exact, unexecuted duplicate of the preceding
# configuration cell (same three names, same values). It is a candidate for
# removal so that the notebook has a single source of truth for these paths.
MODEL_ID = "/home/builder/workspace/weights/fastvideo-lingbot-world-base-cam"
LOCAL_DIR = "/home/builder/workspace/weights/fastvideo-lingbot-world-base-cam"
OUTPUT_DIR = "video_samples_fastvideo-lingbot-world-base-cam"

In [10]:
# Sample positive prompt, assembled from adjacent string literals.
# NOTE(review): not referenced by the cells visible in this notebook — the
# batch cell passes its own literal prompt instead.
EXAMPLE_PROMPT = (
    "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. "
    "The fluffy-furred feline gazes directly at the camera with a relaxed expression. "
    "Blurred beach scenery forms the background featuring crystal-clear waters, distant "
    "green hills, and a blue sky dotted with white clouds. The cat assumes a naturally "
    "relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot "
    "highlights the feline's intricate details and the refreshing atmosphere of the seaside."
)
# Negative prompt (Chinese), passed to ForwardBatch as `negative_prompt`.
# Rough English gloss: garish colors, overexposed, static, blurry details,
# subtitles, style, artwork, painting, still frame, overall gray, worst
# quality, low quality, JPEG artifacts, ugly, incomplete, extra fingers,
# poorly drawn hands/faces, deformed, disfigured, malformed limbs, fused
# fingers, motionless frame, cluttered background, three legs, crowded
# background, walking backwards.
negative_prompt = "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走"
# Sample conditioning image URL.
# NOTE(review): not referenced by the cells visible in this notebook — the
# batch cell uses a local `image_path` instead.
EXAMPLE_IMAGE = "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/wan_i2v_input.JPG"


In [11]:
def print_model_info(
    model,
    model_name: str = "Model",
    checkpoint_params: int | None = None,
):
    """
    Print detailed model architecture and parameter information.
    """
    logger.info("=" * 70)
    logger.info("%s Architecture Information", model_name)
    logger.info("=" * 70)
    
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    
    logger.info("Total Parameters (runtime): %s (%.2fB)", f"{total_params:,}", total_params / 1e9)
    if checkpoint_params is not None:
        logger.info(
            "Total Parameters (checkpoint): %s (%.2fB)",
            f"{checkpoint_params:,}",
            checkpoint_params / 1e9,
        )
    logger.info("Trainable Parameters: %s (%.2fB)", f"{trainable_params:,}", trainable_params / 1e9)
    logger.info("Model dtype: %s", next(model.parameters()).dtype)
    
    # Print model architecture summary
    logger.info("\nModel Architecture:")
    logger.info("-" * 70)
    
    # Get top-level modules
    for name, module in model.named_children():
        num_params = sum(p.numel() for p in module.parameters())
        logger.info("  %s: %s (%s params)", name, module.__class__.__name__, f"{num_params:,}")
    
    # Print detailed layer counts
    logger.info("\nLayer Statistics:")
    logger.info("-" * 70)
    
    layer_types = {}
    for name, module in model.named_modules():
        module_type = module.__class__.__name__
        if module_type not in layer_types:
            layer_types[module_type] = 0
        layer_types[module_type] += 1
    
    # Sort by count and print top layer types
    sorted_layers = sorted(layer_types.items(), key=lambda x: x[1], reverse=True)[:15]
    for layer_type, count in sorted_layers:
        logger.info("  %s: %d", layer_type, count)
    
    logger.info("=" * 70)

In [12]:
# Resolve the model directory used to build the pipeline. In the recorded run
# the helper logged that the model already existed locally at MODEL_ID.
# NOTE(review): presumably fetches the weights into LOCAL_DIR when they are
# missing (per the helper's name) — confirm against fastvideo.utils.
model_path = maybe_download_model(MODEL_ID, local_dir=LOCAL_DIR)

INFO 02-06 17:42:11.823 [utils.py:512] Model already exists locally at /home/builder/workspace/weights/fastvideo-lingbot-world-base-cam


In [13]:
# Build the generation pipeline for a single-GPU run.
pipeline_config = LingbotWorldT2VBaseConfig()
fastvideo_args = FastVideoArgs(
    model_path=model_path,
    # One GPU, with every parallelism / sharding degree set to 1.
    num_gpus=1,
    tp_size=1,
    sp_size=1,
    hsdp_shard_dim=1,
    hsdp_replicate_dim=1,
    # Only the text encoder is CPU-offloaded; the DiT and VAE are not.
    dit_cpu_offload=False,
    text_encoder_cpu_offload=True,
    vae_cpu_offload=False,
    pipeline_config=pipeline_config,
)
# `print` is rich's print here, which does not apply logger-style "%s"
# substitution, so pass the label and the object as separate arguments.
print("fastvideo_args:", fastvideo_args)
pipeline = build_pipeline(fastvideo_args)

INFO 02-06 17:42:13.718 [utils.py:512] Model already exists locally at /home/builder/workspace/weights/fastvideo-lingbot-world-base-cam
INFO 02-06 17:42:13.719 [__init__.py:42] Model path: /home/builder/workspace/weights/fastvideo-lingbot-world-base-cam
INFO 02-06 17:42:13.720 [__init__.py:44] Building pipeline of type: basic
INFO 02-06 17:42:13.720 [parallel_state.py:976] Initializing distributed environment with world_size=1, device=cuda:0
INFO 02-06 17:42:13.720 [parallel_state.py:788] Using nccl backend for CUDA platform


[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
INFO 02-06 17:42:13.998 [profiler.py:191] Torch profiler disabled; returning no-op controller
INFO 02-06 17:42:13.999 [composed_pipeline_base.py:88] Loading pipeline modules...
INFO 02-06 17:42:14.000 [utils.py:512] Model already exists locally at /home/builder/workspace/weights/fastvideo-lingbot-world-base-cam
INFO 02-06 17:42:14.000 [composed_pipeline_base.py:231] Model path: /home/builder/workspace/weights/fastvideo-lingbot-world-base-cam
INFO 02-06 17:42:14.001 [utils.py:600] Diffusers version: 0.35.0.dev0
INFO 02-06 17:42:14.001 [composed_pipeline_base.py:291] Loading pipeline modules from config: {'_class_name': 'WanImageT

Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


INFO 02-06 17:42:15.840 [component_loader.py:386] Loading weights took 1.81 seconds
INFO 02-06 17:42:45.058 [composed_pipeline_base.py:370] Loaded module text_encoder from /home/builder/workspace/weights/fastvideo-lingbot-world-base-cam/text_encoder
INFO 02-06 17:42:45.059 [component_loader.py:995] Loading tokenizer using transformers from /home/builder/workspace/weights/fastvideo-lingbot-world-base-cam/tokenizer
INFO 02-06 17:42:45.059 [component_loader.py:506] Loading tokenizer from /home/builder/workspace/weights/fastvideo-lingbot-world-base-cam/tokenizer
INFO 02-06 17:42:45.505 [component_loader.py:554] Loaded tokenizer: T5TokenizerFast
INFO 02-06 17:42:45.506 [composed_pipeline_base.py:370] Loaded module tokenizer from /home/builder/workspace/weights/fastvideo-lingbot-world-base-cam/tokenizer
INFO 02-06 17:42:45.506 [component_loader.py:995] Loading transformer using diffusers from /home/builder/workspace/weights/fastvideo-lingbot-world-base-cam/transformer
INFO 02-06 17:42:45.507

Loading safetensors checkpoint shards:   0% Completed | 0/8 [00:00<?, ?it/s]


INFO 02-06 17:42:57.825 [component_loader.py:837] Loaded model with 18.54B parameters
INFO 02-06 17:43:46.034 [composed_pipeline_base.py:370] Loaded module transformer from /home/builder/workspace/weights/fastvideo-lingbot-world-base-cam/transformer
INFO 02-06 17:43:46.040 [component_loader.py:995] Loading transformer_2 using diffusers from /home/builder/workspace/weights/fastvideo-lingbot-world-base-cam/transformer_2
INFO 02-06 17:43:46.042 [component_loader.py:745] transformer cls_name: LingbotWorldTransformer3DModel
INFO 02-06 17:43:46.043 [component_loader.py:793] Loading model from 8 safetensors files: ['/home/builder/workspace/weights/fastvideo-lingbot-world-base-cam/transformer_2/diffusion_pytorch_model-00002-of-00008.safetensors', '/home/builder/workspace/weights/fastvideo-lingbot-world-base-cam/transformer_2/diffusion_pytorch_model-00004-of-00008.safetensors', '/home/builder/workspace/weights/fastvideo-lingbot-world-base-cam/transformer_2/diffusion_pytorch_model-00001-of-00008

Loading safetensors checkpoint shards:   0% Completed | 0/8 [00:00<?, ?it/s]


INFO 02-06 17:43:58.594 [component_loader.py:837] Loaded model with 18.54B parameters
INFO 02-06 17:44:43.883 [composed_pipeline_base.py:370] Loaded module transformer_2 from /home/builder/workspace/weights/fastvideo-lingbot-world-base-cam/transformer_2
INFO 02-06 17:44:43.884 [component_loader.py:995] Loading vae using diffusers from /home/builder/workspace/weights/fastvideo-lingbot-world-base-cam/vae
INFO 02-06 17:44:44.713 [composed_pipeline_base.py:370] Loaded module vae from /home/builder/workspace/weights/fastvideo-lingbot-world-base-cam/vae
INFO 02-06 17:44:44.714 [lora_pipeline.py:147] trainable_transformer_modules: dict_keys(['transformer', 'transformer_2'])
INFO 02-06 17:44:44.715 [__init__.py:59] Pipelines instantiated


In [None]:
# Inspect what the pipeline loaded: first the registry keys, then the repr
# of the two modules of interest.
print(pipeline.modules.keys())
for module_key in ("transformer", "vae"):
    print(pipeline.modules[module_key])

In [None]:
# --- Generation settings -------------------------------------------------
height, width = 480, 832
num_frames = 21
num_inference_steps = 40
guidance_scale = 5.0
seed = 42
output_path = "output.mp4"

# --- Latent-grid bookkeeping ---------------------------------------------
# NOTE(review): the divisors 8 (spatial) and 4 (temporal) presumably mirror
# the VAE's compression factors — confirm against the model config.
height_latents, width_latents = height // 8, width // 8
num_latent_frames = 1 + (num_frames - 1) // 4
# Token count = latent frames x latent height x latent width.
n_tokens = num_latent_frames * height_latents * width_latents

# --- Request handed to the pipeline --------------------------------------
batch = ForwardBatch(
    data_type="video",
    # Conditioning inputs.
    prompt="a dragon flying in the sky",
    negative_prompt=negative_prompt,
    image_path="/home/builder/dev/lingbot-world/examples/00/image.jpg",
    # Output geometry and sampling controls.
    height=height,
    width=width,
    num_frames=num_frames,
    n_tokens=n_tokens,
    num_inference_steps=num_inference_steps,
    guidance_scale=guidance_scale,
    seed=seed,
    # Output handling.
    output_path=output_path,
    save_video=True,
    fps=16,
)

In [None]:
# Run the pipeline on the prepared batch. The returned object's `.output`
# attribute is what the export cell below converts into video frames.
output_batch = pipeline.forward(batch, fastvideo_args)

In [None]:
# Convert the generated tensor into uint8 frames and write them as an MP4.
# Detach and move to host memory first: Tensor.numpy() cannot be called on a
# CUDA tensor or on one that still tracks gradients.
samples = output_batch.output.detach().cpu()
# Put the time axis first so each iteration handles one frame of the batch.
videos = rearrange(samples, "b c t h w -> t b c h w")
frames = []
for frame_batch in videos:
    # Tile the batch items of this frame into one image grid.
    grid = torchvision.utils.make_grid(frame_batch, nrow=6)
    # Channels-first -> channels-last, as the video writer expects (H, W, C).
    grid = grid.permute(1, 2, 0).squeeze(-1)
    # Clamp before scaling: casting out-of-range floats to uint8 wraps around
    # (or is undefined for negatives) and shows up as speckle artifacts.
    # NOTE(review): assumes pixel values are nominally in [0, 1], as implied
    # by the * 255 scaling — confirm against the pipeline's output range.
    frames.append((grid.clamp(0, 1) * 255).numpy().astype(np.uint8))
# NOTE(review): the batch was created with save_video=True and the same
# output_path; confirm the pipeline does not already write this file.
imageio.mimsave(output_path, frames, fps=batch.fps, format="mp4")
logger.info("Saved video to: %s", output_path)

In [None]:
# Show the stage container (its type plus a sliced copy of its contents),
# then the concrete classes behind the transformer and VAE modules.
print(type(pipeline.stages), pipeline.stages[:])
for module_key in ("transformer", "vae"):
    print(type(pipeline.modules[module_key]))