In [1]:
!pip install gradio torch transformers diffusers av numpy




In [None]:
import gradio as gr
import av
import numpy as np
import torch
from transformers import AutoImageProcessor, AutoTokenizer, VisionEncoderDecoderModel
from diffusers import AnimateDiffPipeline, DDIMScheduler, MotionAdapter
from diffusers.utils import export_to_gif

# Load video captioning model.
# The captioning model runs on the GPU when one is available, otherwise on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Hub repo IDs must match the published checkpoints exactly: the frame
# preprocessor is "videomae-base" and the captioner is
# "timesformer-gpt2-video-captioning".
img_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
caption_model = VisionEncoderDecoderModel.from_pretrained("Neleac/timesformer-gpt2-video-captioning").to(device)

# Load AnimateDiff video generator (motion adapter + base image model, both in fp16).
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)
model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
pipe = AnimateDiffPipeline.from_pretrained(model_id, motion_adapter=adapter, torch_dtype=torch.float16)
# Replace the pipeline's default scheduler with a DDIM scheduler built from the
# base model's scheduler config, overriding the sampling options below.
scheduler = DDIMScheduler.from_pretrained(
    model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", beta_schedule="linear", steps_offset=1
)
pipe.scheduler = scheduler
# Memory savers for the generator pipeline.
# NOTE(review): CPU offload presumably requires an accelerator at runtime - confirm
# behaviour on CPU-only machines.
pipe.enable_vae_slicing()
pipe.enable_model_cpu_offload()

# Helper: Generate captions for the vid
def _count_video_frames(vid_path):
    """Return the number of frames in the first video stream of ``vid_path``."""
    with av.open(vid_path) as container:
        frame_count = container.streams.video[0].frames
        if frame_count > 0:
            return frame_count
        # Some containers report 0 frames in their metadata; fall back to
        # counting by decoding the stream once.
        return sum(1 for _ in container.decode(video=0))


def generate_caption(vid_path):
    """Generate a text caption for a video file.

    Evenly spaced frames (as many as the caption encoder's ``num_frames``)
    are sampled across the clip, preprocessed, and passed to the captioning
    model, which decodes a caption with beam search.

    Args:
        vid_path: Path to a video file that PyAV can open.

    Returns:
        The decoded caption string for the clip.

    Raises:
        ValueError: If no video frames could be decoded from ``vid_path``.
    """
    clip_len = caption_model.config.encoder.num_frames
    seg_len = _count_video_frames(vid_path)
    if seg_len <= 0:
        raise ValueError(f"No video frames could be decoded from {vid_path!r}")

    # Evenly spaced frame indices over the clip. Duplicates collapse in the
    # set, so clips shorter than ``clip_len`` yield fewer sampled frames.
    indices = set(np.linspace(0, seg_len, num=clip_len, endpoint=False).astype(np.int64).tolist())
    last_index = max(indices)

    frames = []
    # The context manager closes the container (and its file handle) even if
    # decoding raises.
    with av.open(vid_path) as container:
        container.seek(0)
        # PyAV selects streams by kind: ``decode(video=0)`` is the first video stream.
        for i, frame in enumerate(container.decode(video=0)):
            if i in indices:
                frames.append(frame.to_ndarray(format="rgb24"))
            if i >= last_index:
                # Nothing past the last sampled index is needed.
                break

    if not frames:
        raise ValueError(f"No video frames could be decoded from {vid_path!r}")

    pixel_values = img_processor(frames, return_tensors="pt").pixel_values.to(device)
    gen_kwargs = {"min_length": 10, "max_length": 20, "num_beams": 8}
    tokens = caption_model.generate(pixel_values, **gen_kwargs)
    # A single clip was passed in, so the first decoded sequence is the caption.
    return tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]

# Helper: Generate AI vid based on caption
def generate_ai_vid(caption):
    """Render an AnimateDiff clip from a text prompt and save it as a GIF.

    Args:
        caption: Text prompt passed to the AnimateDiff pipeline.

    Returns:
        Path of the GIF file that was written (``"ai_generated_vid.gif"``).
    """
    out_file = "ai_generated_vid.gif"
    # Fixed CPU seed so the same caption always yields the same sampling noise.
    seeded_rng = torch.Generator("cpu").manual_seed(42)
    sampling_args = dict(
        prompt=caption,
        negative_prompt="bad quality, worse quality",
        num_frames=16,
        guidance_scale=7.5,
        num_inference_steps=25,
        generator=seeded_rng,
    )
    result = pipe(**sampling_args)
    # ``frames`` is indexed per prompt; one prompt was given, so take entry 0.
    first_clip = result.frames[0]
    export_to_gif(first_clip, out_file)
    return out_file

# Gradio interface
def process_vid(vid):
    """Caption an uploaded video and generate an AI clip from that caption.

    Args:
        vid: File path of the uploaded video as supplied by the Gradio
            ``Video`` input, or ``None`` when nothing was uploaded.

    Returns:
        A ``(input_path, ai_clip_path, caption)`` tuple, in the order of the
        output components wired to the "Generate" button.

    Raises:
        gr.Error: If no video was provided.
    """
    if not vid:
        # Surface a readable message in the UI instead of an opaque failure
        # from trying to open ``None`` as a video file.
        raise gr.Error("Please upload a video before clicking Generate.")
    input_vid_path = vid  # The video is already a file path
    caption = generate_caption(input_vid_path)
    ai_vid_path = generate_ai_vid(caption)
    return input_vid_path, ai_vid_path, caption

# Gradio UI: a two-column layout with the upload control and trigger button on
# the left and the three results on the right.
with gr.Blocks() as demo:
    gr.Markdown("### Video Captioning and AI Video Generation")
    with gr.Row():
        with gr.Column():
            # Input side: the video to caption and the button that starts processing.
            vid_input = gr.Video(label="Upload a Video")
            submit_btn = gr.Button("Generate")
        with gr.Column():
            # Output side: echo of the input, the generated clip, and the caption text.
            input_vid_output = gr.Video(label="Input Video")
            ai_vid_output = gr.Video(label="AI Generated Video")
            caption_output = gr.Textbox(label="Generated Caption", lines=2)

    # The order of `outputs` must match the tuple returned by process_vid:
    # (input video path, generated clip path, caption).
    submit_btn.click(
        process_vid,
        inputs=[vid_input],
        outputs=[input_vid_output, ai_vid_output, caption_output],
    )

# Launch the Gradio app. With debug=True the notebook cell keeps running so
# errors and logs stay visible; set debug=False to turn that off.
demo.launch(debug=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Config of the encoder: <class 'transformers.models.timesformer.modeling_timesformer.TimesformerModel'> is overwritten by shared encoder config: TimesformerConfig {
  "_name_or_path": "facebook/timesformer-base-finetuned-k600",
  "architectures": [
    "TimesformerForVideoClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "attention_type": "divided_space_time",
  "drop_path_rate": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "abseiling",
    "1": "acting in play",
    "2": "adjusting glasses",
    "3": "air drummi

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://7cee67e4086f10b2db.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


  0%|          | 0/25 [00:00<?, ?it/s]

