In [None]:
!pip install --upgrade transformers accelerate diffusers imageio-ffmpeg

In [None]:
!pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118
!pip install git+https://github.com/pytorch/ao.git
!pip install optimum-quanto

In [None]:
# To get started, torchao (PyTorch AO) must be installed from its GitHub source, along with a PyTorch nightly build.
# The source and nightly installations are only required until the next release.

import torch
from diffusers import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel, CogVideoXPipeline
from diffusers.utils import export_to_video
from transformers import T5EncoderModel, T5Tokenizer
from torchao.quantization import quantize_, int8_weight_only, int8_dynamic_activation_int8_weight

# Run configuration shared by every step below: which checkpoint to pull,
# the dtype the components are loaded in, and the torchao quantization
# factory that is applied to each component.
model_id = "THUDM/CogVideoX-2b"
torch_dtype = torch.bfloat16
quantization = int8_weight_only

# Pick CUDA when it is available and fall back to the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

# The tokenizer is loaded as-is; only the model components are quantized.
tokenizer = T5Tokenizer.from_pretrained(model_id, subfolder="tokenizer")


def _load_quantized(model_cls, subfolder):
    """Load one component of `model_id` and apply the configured quantization.

    Args:
        model_cls: Class exposing `from_pretrained` (e.g. `T5EncoderModel`).
        subfolder: Name of the component's subfolder inside the checkpoint.

    Returns:
        The loaded module, after `quantize_` has been applied to it.
    """
    module = model_cls.from_pretrained(
        model_id,
        subfolder=subfolder,
        torch_dtype=torch_dtype,
    )
    # `quantize_` is called for its effect on `module`; its return value is
    # not used.
    quantize_(module, quantization())
    return module


# Same order as before: text encoder, then transformer, then VAE.
text_encoder = _load_quantized(T5EncoderModel, "text_encoder")
transformer = _load_quantized(CogVideoXTransformer3DModel, "transformer")
vae = _load_quantized(AutoencoderKLCogVideoX, "vae")

# Assemble the pipeline around the pre-quantized components. Reusing
# `model_id` and `torch_dtype` keeps the pipeline's checkpoint and dtype in
# lockstep with the components loaded above, and passing `tokenizer` reuses
# the instance that was already loaded instead of loading a second copy.
pipe = CogVideoXPipeline.from_pretrained(
    model_id,
    tokenizer=tokenizer,
    text_encoder=text_encoder,
    transformer=transformer,
    vae=vae,
    torch_dtype=torch_dtype,
)

# Model CPU offload moves sub-models onto the accelerator on demand, so it is
# only enabled when CUDA was detected; otherwise the pipeline stays on
# `device` (the CPU) and no offload hooks are installed.
if device == "cuda":
    pipe.enable_model_cpu_offload()
else:
    pipe.to(device)

# Tiled VAE decoding is enabled to reduce peak memory during decode.
pipe.vae.enable_tiling()


# Text description of the clip to generate.
prompt = "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical performance."

# Sampling settings, gathered in one place so they are easy to tweak.
generation_kwargs = {
    "height": 256,
    "width": 256,
    "num_frames": 8,
    "num_inference_steps": 25,
    "guidance_scale": 7.5,
}

# Run the pipeline and keep the frames of the first (and only) video.
result = pipe(prompt=prompt, **generation_kwargs)
output = result.frames[0]

# Write the frames out as an MP4 at 8 frames per second.
export_to_video(output, "output.mp4", fps=8)