In [60]:
from typing import Any, Optional
from numpy.typing import NDArray

from pathlib import Path
import torch
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from diffusers import StableDiffusionUpscalePipeline
from diffusers.models.attention_processor import AttnProcessor2_0

import av
import imageio.v3 as iio
from PIL import Image

In [61]:
DATA = Path.cwd() / "data"
SD_VID_PATH = DATA / "inter4k_222_sd.mp4"
HD_VID_PATH = DATA / "inter4k_222_hd.mp4"

sd_vid = iio.imread(SD_VID_PATH, plugin = "pyav")
fps = iio.immeta(SD_VID_PATH, plugin = "pyav")["fps"]
print(sd_vid.shape)
print(fps)

(300, 480, 640, 3)
60.0


In [46]:
def upscale_video_4x(video: NDArray, prompt: str, limit_frames: Optional[int] = None, random_seed: int = 42, ) -> NDArray:
    """
    Parameters
    ----------
    video: NDArray
        numpy array of shape (frames, height, width, channels)
    prompt: str
        text prompt given to the model
    limit_frames: int, optional
        for debugging purposes, only upscale first limit_frames frames of the video
    random_seed: int, optional
        for reproducibility
    """

    model_id = "stabilityai/stable-diffusion-x4-upscaler"
    generator = torch.Generator("cuda").manual_seed(random_seed)
    upscale = StableDiffusionUpscalePipeline.from_pretrained(
        model_id,
        torch_dtype = torch.float16,
        generator = generator,
        output_type = "ndarray",
        num_inference_steps = 20,
        num_images_per_prompt = 1,
    )
    # pipe.unet = torch.compile(pipe.unet)
    # pipe.unet.set_attn_processor(AttnProcessor2_0())
    upscale = upscale.to("cuda")
    upscale.enable_sequential_cpu_offload()
    upscale.enable_attention_slicing()

    if limit_frames is None:
        limit_frames = video.shape[0]
    print(f"limiting frames to: {limit_frames}")

    upscaled_video = list()
    for idx, frame in enumerate(video):
        print(f"Upscaling Frame: {idx}")
        if idx > limit_frames:
            break
        frame = Image.fromarray(frame)
        upscaled_frame = upscale(prompt = prompt, image = frame).images[0]
        upscaled_video.append(np.array(upscaled_frame))
    return np.stack(upscaled_video)

In [26]:
up_vid = upscale_video_4x(sd_vid, "cityscape at night", limit_frames=1)

Keyword arguments {'generator': <torch._C.Generator object at 0x7fe8b42f3610>, 'output_type': 'ndarray', 'num_inference_steps': 20, 'num_images_per_prompt': 1} are not expected by StableDiffusionUpscalePipeline and will be ignored.


Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

limiting_frames_to: 1


  0%|          | 0/75 [00:00<?, ?it/s]

  0%|          | 0/75 [00:00<?, ?it/s]

In [62]:
up_vid.shape

(2, 1920, 2560, 3)

In [59]:
# TODO: Rewrite Video Encoder

output = av.open(str(HD_VID_PATH), mode="w")
stream = output.add_stream("libx264", rate=30)
stream.width = up_vid.shape[2]  # Set width from the frames array
stream.height = up_vid.shape[1]  # Set height from the frames array
stream.pix_fmt = "yuv420p"

# Write frames to the video
for frame in up_vid:
    frame = av.VideoFrame.from_ndarray(frame, format="rgb24")
    packet = stream.encode(frame)
    if packet:
        output.mux(packet)