In [3]:
import torch
from transformers import AutoProcessor, AutoModel
from pathlib import Path
from pytube import YouTube
import numpy as np
from decord import VideoReader
import imageio

In [6]:
# !pip install ffmpeg
# !pip install pytube3

In [4]:
def download_youtube_video(url: str):
    yt = YouTube(url)

    streams = yt.streams.filter(file_extension="mp4")
    file_path = streams[0].download()
    return file_path

def sample_frames_from_video_file(
    file_path: str, num_frames: int = 16, frame_sampling_rate=1
):
    videoreader = VideoReader(file_path)
    videoreader.seek(0)

    # sample frames
    start_idx = 0
    end_idx = num_frames * frame_sampling_rate - 1
    indices = np.linspace(start_idx, end_idx, num=num_frames, dtype=np.int64)
    frames = videoreader.get_batch(indices).asnumpy()

    return frames


def get_num_total_frames(file_path: str):
    videoreader = VideoReader(file_path)
    videoreader.seek(0)
    return len(videoreader)


def convert_frames_to_gif(frames, save_path: str = "frames.gif"):
    converted_frames = frames.astype(np.uint8)
    Path(save_path).parent.mkdir(parents=True, exist_ok=True)
    imageio.mimsave(save_path, converted_frames, fps=8)
    return save_path


def create_gif_from_video_file(
    file_path: str,
    num_frames: int = 16,
    frame_sampling_rate: int = 1,
    save_path: str = "frames.gif",
):
    frames = sample_frames_from_video_file(file_path, num_frames, frame_sampling_rate)
    return convert_frames_to_gif(frames, save_path)

In [5]:
def predict(youtube_url_or_file_path, labels_text, processor, model):

    if youtube_url_or_file_path.startswith("http"):
        video_path = download_youtube_video(youtube_url_or_file_path)
    else:
        video_path = youtube_url_or_file_path
    
    # rearrange sampling rate based on video length and model input length
    num_total_frames = get_num_total_frames(video_path)
    num_model_input_frames = model.config.vision_config.num_frames
    if num_total_frames < FRAME_SAMPLING_RATE * num_model_input_frames:
        frame_sampling_rate = num_total_frames // num_model_input_frames
    else:
        frame_sampling_rate = FRAME_SAMPLING_RATE

    labels = labels_text.split(",")

    frames = sample_frames_from_video_file(
        video_path, num_model_input_frames, frame_sampling_rate
    )
    gif_path = convert_frames_to_gif(frames, save_path="video.gif")

    inputs = processor(
        text=labels, videos=list(frames), return_tensors="pt", padding=True
    )
    # forward pass
    with torch.no_grad():
        outputs = model(**inputs)

    probs = outputs.logits_per_video[0].softmax(dim=-1).cpu().numpy()
    label_to_prob = {}
    for ind, label in enumerate(labels):
        label_to_prob[label] = float(probs[ind])

    return label_to_prob, gif_path

In [13]:
FRAME_SAMPLING_RATE = 4
DEFAULT_MODEL = "microsoft/xclip-base-patch16-zero-shot"

VALID_ZEROSHOT_VIDEOCLASSIFICATION_MODELS = [
    "microsoft/xclip-base-patch32",
    "microsoft/xclip-base-patch16-zero-shot",
    "microsoft/xclip-base-patch16-kinetics-600",
    "microsoft/xclip-large-patch14ft/xclip-base-patch32-16-frames",
    "microsoft/xclip-large-patch14",
    "microsoft/xclip-base-patch16-hmdb-4-shot",
    "microsoft/xclip-base-patch16-16-frames",
    "microsoft/xclip-base-patch16-hmdb-2-shot",
    "microsoft/xclip-base-patch16-ucf-2-shot",
    "microsoft/xclip-base-patch16-ucf-8-shot",
    "microsoft/xclip-base-patch16",
    "microsoft/xclip-base-patch16-hmdb-8-shot",
    "microsoft/xclip-base-patch16-hmdb-16-shot",
    "microsoft/xclip-base-patch16-ucf-16-shot",
]

examples = [
    [
        "https://www.youtu.be/l1dBM8ZECao",
        "sleeping dog,cat fight club,birds of prey",
    ],
    [
        "https://youtu.be/VMj-3S1tku0",
        "programming course,eating spaghetti,playing football",
    ],
    [
        "https://youtu.be/BRw7rvLdGzU",
        "game of thrones,the lord of the rings,vikings",
    ],
    [
        "froset.mp4",
        "a frog is sitting and snoring, a frog is jumping, a car driving fast, the moon is shining"
    ]
    
]

In [8]:
processor = AutoProcessor.from_pretrained(DEFAULT_MODEL)
model = AutoModel.from_pretrained(DEFAULT_MODEL)

In [14]:
predict(*examples[3], processor, model)

({'a frog is sitting and snoring': 0.05727091804146767,
  ' a frog is jumping': 0.11851947754621506,
  ' a car driving fast': 0.057707324624061584,
  ' the moon is shining': 0.7665022015571594},
 'video.gif')

In [19]:
from pytube import YouTube
def download_youtube_video(url: str):
    yt = YouTube(url)

    streams = yt.streams.filter(file_extension="mp4")
    file_path = streams[0].download()
    return file_path
download_youtube_video('https://www.youtube.com/watch?v=dQw4w9WgXcQ')

AttributeError: 'NoneType' object has no attribute 'span'