# Gemini Video Highlights

This idea leverages Google's Gemini LLM, specifically its long-context window and context-caching capabilities, to handle extensive video inputs effectively. Processing full-length videos requires maintaining context over long durations, which is challenging for most language models. Gemini's architecture makes it possible to analyze lengthy video content without losing track of important details, enabling comprehensive understanding and processing that would be difficult or impossible with standard models.

The proposed solution uses Gemini to generate a JSON list of video highlights, each with start and end timestamps, a content description, and an importance score. A post-processing step then trims the original video based on these highlights and overlays a caption that summarizes each section. Because every highlight carries an importance score, only the most significant parts of the video need to be included, allowing users to control the length and focus of the summary. Furthermore, since the video context is cached, users can request more detailed analyses of specific topics within the video, enhancing the flexibility and usefulness of the summary.

In [None]:
!pip install google-generativeai
!pip install ffmpeg-python
# You might need to reset the compute (Restart and run up to selected cell) and execute this cell again

Reading package lists... Done
Building dependency tree       
Reading state information... Done
ffmpeg is already the newest version (7:4.2.7-0ubuntu0.1).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
imagemagick is already the newest version (8:6.9.10.23+dfsg-2.1ubuntu11.10).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
libmagick++-dev is already the newest version (8:6.9.10.23+dfsg-2.1ubuntu11.10).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.


In [None]:
import os
import random
import datetime
import time
import json
import subprocess
import tempfile

import ffmpeg

from google.generativeai.types import GenerationConfig
import google.generativeai as genai
from google.generativeai import caching

In [None]:
from kaggle_secrets import UserSecretsClient
# Name of the Kaggle secret that stores the Gemini API key; change it here only.
secret_label = "API_KEY"
user_secrets = UserSecretsClient()
# Look the key up through the label variable instead of repeating the literal.
api_key = user_secrets.get_secret(secret_label)
genai.configure(api_key=api_key)

In [None]:
def time_to_seconds(t):
    """Convert a clock-style timestamp string into a number of seconds.

    Accepted layouts are ``MM:SS`` and ``HH:MM:SS``. A dot may be used as the
    separator instead of a colon (e.g. ``"1.30"``); when both characters are
    present, the colon is treated as the separator.

    Args:
        t: Timestamp string made of integer fields.

    Returns:
        The total number of seconds as an ``int``.

    Raises:
        ValueError: If the string contains no separator, a field is not an
            integer, or there are not exactly two or three fields.
    """
    if ":" in t:
        sep = ":"
    elif "." in t:
        sep = "."
    else:
        raise ValueError("Timestamp format not correct")
    parts = [int(part) for part in t.split(sep)]
    if len(parts) == 2:
        hours = 0
        minutes, seconds = parts
    elif len(parts) == 3:
        hours, minutes, seconds = parts
    else:
        # Without this branch a 1- or 4-field timestamp would fall through
        # and fail later on an unbound local variable.
        raise ValueError(
            f"Timestamp must have 2 or 3 fields, got {len(parts)}: {t!r}"
        )
    return hours * 60 * 60 + minutes * 60 + seconds


def importance_conditioned_sampling(entries, p):
    """Randomly keep a fraction of the highlights, favouring important ones.

    Entries are drawn one at a time WITHOUT replacement, each draw weighted by
    the entry's ``"importance"`` value (missing values count as 0), so the
    result never contains the same highlight twice.

    Args:
        entries: Sequence of highlight dicts. Each must have ``"start_time"``;
            ``"importance"`` is optional.
        p: Fraction of the entries to keep. The number kept is
            ``int(len(entries) * p)``, capped at ``len(entries)``.

    Returns:
        A new list with the selected entries ordered by ``"start_time"``.
        The input sequence is not modified.
    """
    sample_size = min(int(len(entries) * p), len(entries))
    if sample_size <= 0:
        return []

    pool = list(entries)
    # Negative scores are clamped so they cannot be used as sampling weights.
    weights = [max(float(entry.get('importance', 0.)), 0.) for entry in pool]

    selected = []
    for _ in range(sample_size):
        if sum(weights) > 0:
            pick = random.choices(range(len(pool)), weights=weights, k=1)[0]
        else:
            # Every remaining entry has zero weight: fall back to a uniform
            # draw instead of letting random.choices raise.
            pick = random.randrange(len(pool))
        selected.append(pool.pop(pick))
        weights.pop(pick)

    return sorted(selected, key=lambda x: x['start_time'])


def merge_overlapping_highlights(highlights):
    """Collapse highlights whose time ranges touch or overlap.

    Highlights are processed in ascending ``"start_time"`` order. Whenever one
    starts at or before the end of the previously kept highlight, the two are
    replaced by a single entry that spans both ranges and carries the
    ``"content"`` and ``"importance"`` of the later one.

    Returns a new list; entries that were not merged are the original dicts.
    """
    if not highlights:
        return []

    ordered = list(highlights)
    ordered.sort(key=lambda item: item["start_time"])

    result = []
    for item in ordered:
        # First entry, or a gap before this one: keep it untouched.
        if not result or item["start_time"] > result[-1]["end_time"]:
            result.append(item)
            continue

        previous = result[-1]
        # Overlap: extend the previous range, take the later entry's text/score.
        result[-1] = dict(
            start_time=previous["start_time"],
            end_time=max(previous["end_time"], item["end_time"]),
            content=item["content"],
            importance=item["importance"],
        )
    return result


def get_video_length(video_path):
    """Return the duration of a video in seconds, as reported by ``ffprobe``.

    The duration of the first video stream is queried first. If ffprobe does
    not print a numeric value for it (for example ``N/A``), the
    container-level duration is queried instead.

    Args:
        video_path: Path of the video file handed to ffprobe.

    Returns:
        The duration in seconds as a ``float``.

    Raises:
        ValueError: If ffprobe exits with an error or no numeric duration can
            be read from its output.
    """
    last_output = ""
    for entries in ('stream=duration', 'format=duration'):
        command = [
            'ffprobe',
            '-v', 'error',
            '-select_streams', 'v:0',
            '-show_entries', entries,
            '-of', 'default=noprint_wrappers=1:nokey=1',
            video_path
        ]
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        if result.returncode != 0:
            # Surface ffprobe's own message instead of failing later on an
            # empty string passed to float().
            raise ValueError(f"ffprobe failed for {video_path!r}: {result.stderr.strip()}")
        fields = result.stdout.split()
        last_output = result.stdout.strip()
        if not fields:
            continue
        try:
            return float(fields[0])
        except ValueError:
            # Not a number; try the next source of duration.
            continue
    raise ValueError(f"Could not read a duration for {video_path!r} (ffprobe output: {last_output!r})")


def preprocess_highlights(highlights_dict, video_path, init_end_offset_sec=0.1, ratio_clips_maintain=1.):
    """Turn raw model highlights into padded, clamped, merged clip ranges.

    For each highlight, ``"start"``/``"end"`` timestamps are converted to
    seconds and stored as ``"start_time"``/``"end_time"``, widened by the
    padding on both sides and clamped to the video (start not below 0, end not
    beyond ``duration - 0.5``). Processing stops at the first highlight whose
    start lies past the end of the video; that highlight and everything after
    it in the input are discarded.

    Args:
        highlights_dict: List of highlight dicts with ``"start"`` and ``"end"``
            timestamp strings. The dicts are copied, not modified.
        video_path: Path of the video, used to read its duration.
        init_end_offset_sec: Seconds of padding added before the start and
            after the end of every clip; values below 0.1 are raised to 0.1.
        ratio_clips_maintain: When below 1, only that fraction of the clips is
            kept, selected by importance-weighted sampling.

    Returns:
        A list of highlight dicts ordered by ``"start_time"`` with overlapping
        clips merged.
    """
    init_end_offset_sec = max(init_end_offset_sec, 0.1)
    duration = get_video_length(video_path)
    latest_end = duration - 0.5

    processed = []
    for highlight in highlights_dict:
        start_sec = time_to_seconds(highlight["start"])
        if start_sec > duration:
            break
        end_sec = time_to_seconds(highlight["end"])
        # Work on a copy so the caller's parsed response stays untouched.
        clip = dict(highlight)
        clip["start_time"] = max(start_sec - init_end_offset_sec, 0)
        clip["end_time"] = min(end_sec + init_end_offset_sec, latest_end)
        if clip["end_time"] <= clip["start_time"]:
            # Clamping can leave an empty or inverted range near the end of
            # the video; such a clip cannot be cut, so skip it.
            continue
        processed.append(clip)

    # Every highlight seen before the cut-off is kept, including the last one
    # when the loop finishes without breaking.
    if ratio_clips_maintain < 1.:
        processed = importance_conditioned_sampling(processed, ratio_clips_maintain)
    return merge_overlapping_highlights(processed)


def probe_video(video_file):
    """Ask ffprobe for the first video stream's width/height and return the parsed JSON."""
    ffprobe_args = [
        'ffprobe',
        '-v', 'error',
        '-select_streams', 'v:0',
        '-show_entries', 'stream=width,height',
        '-of', 'json',
        video_file,
    ]
    completed = subprocess.run(ffprobe_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    raw_report = completed.stdout
    return json.loads(raw_report)


def create_highlights_video(highlights_dict, video_path, output_path):
    """Cut the highlight clips out of a video, caption them and join them.

    Each highlight is re-encoded into its own temporary clip (H.264 video,
    AAC audio) with its ``"content"`` drawn as a caption near the bottom of
    the frame. The clips are then concatenated into ``output_path`` without
    re-encoding.

    Args:
        highlights_dict: List of dicts with ``"start_time"`` and ``"end_time"``
            in seconds and an optional ``"content"`` caption.
        video_path: Path of the source video.
        output_path: Path of the video file to write.

    Returns:
        ``output_path``, for convenience.

    Raises:
        ValueError: If ``highlights_dict`` is empty, since there is nothing
            to concatenate.
    """
    if not highlights_dict:
        raise ValueError("No highlights to render")

    video_info = probe_video(video_path)
    height = int(video_info['streams'][0]['height'])
    # Caption size scales with the frame height.
    fontsize = int(height * 0.05)

    # The context manager removes the temporary clips even if ffmpeg fails.
    with tempfile.TemporaryDirectory() as temp_dir_name:
        concat_lines = []
        for i, highlight in enumerate(highlights_dict):
            start_time = highlight['start_time']
            end_time = highlight['end_time']
            content = highlight.get('content', "")

            temp_file_path = os.path.join(temp_dir_name, f'clip_{i}.mp4')

            source = ffmpeg.input(video_path, ss=start_time, to=end_time)
            # expansion='none' makes drawtext render the caption literally, so
            # a '%' in model-written text is not parsed as an expansion.
            video = source.video.filter(
                'drawtext', text=content, expansion='none', fontsize=fontsize,
                fontcolor='white', x='(w-text_w)/2', y='h-(text_h*2)', borderw='2',
                bordercolor='black', shadowcolor='black', shadowx='2', shadowy='2')
            audio = source.audio

            ffmpeg.output(video, audio, temp_file_path, vcodec='libx264', acodec='aac').run()

            concat_lines.append(f"file '{temp_file_path}'\n")

        concat_list_path = os.path.join(temp_dir_name, 'concat_list.txt')
        with open(concat_list_path, 'w') as f:
            f.write("".join(concat_lines))

        ffmpeg.input(concat_list_path, format='concat', safe=0).output(output_path, c='copy').run()

    return output_path

In [None]:
# Upload the video
path_to_video_file = "/kaggle/input/videos-gemini-highlights/nba.mp4"
video_file = genai.upload_file(path=path_to_video_file)

# Poll until the service reports that it is no longer processing the upload
while video_file.state.name == 'PROCESSING':
    print('Waiting for video to be processed.')
    time.sleep(2)
    video_file = genai.get_file(video_file.name)

# Leaving the PROCESSING state does not mean success: stop here on failure
# instead of reporting completion and failing later when the file is used.
if video_file.state.name == 'FAILED':
    raise ValueError(f'Video processing failed for {video_file.name}')
print(f'File processing complete: {video_file.uri}')

Waiting for video to be processed.
Waiting for video to be processed.
Waiting for video to be processed.
Waiting for video to be processed.
Waiting for video to be processed.
Waiting for video to be processed.
Waiting for video to be processed.
Waiting for video to be processed.
Waiting for video to be processed.
Waiting for video to be processed.
Waiting for video to be processed.
Waiting for video to be processed.
Waiting for video to be processed.
Waiting for video to be processed.
Waiting for video to be processed.
Waiting for video to be processed.
Waiting for video to be processed.
Waiting for video to be processed.
Waiting for video to be processed.
Waiting for video to be processed.
Waiting for video to be processed.
Waiting for video to be processed.
File processing complete: https://generativelanguage.googleapis.com/v1beta/files/f6h2s4igu6a7


In [None]:
# Store the uploaded video in a context cache (kept for 10 minutes) so that
# several questions can be asked about it without sending it again
system_instruction = (
    "You are an expert analyzer, and your job is to answer the user's query "
    "based on the file you have access to. "
    "Answer with the proper language for the given context and only with "
    "timestamps information."
)
cache_lifetime = datetime.timedelta(minutes=10)
cache = caching.CachedContent.create(
    model='gemini-1.5-pro-002',
    display_name='video',
    system_instruction=system_instruction,
    contents=[video_file],
    ttl=cache_lifetime,
)

In [None]:
# Build a model bound to the cached video; replies are requested as JSON
config = GenerationConfig(
    max_output_tokens=4098,
    temperature=0.3,
    response_mime_type="application/json",
)
model = genai.GenerativeModel.from_cached_content(cached_content=cache, generation_config=config)
# Open a chat session on that model
chat = model.start_chat()

In [None]:
# Standard prompt for the agent to create highlights
# NOTE: the timestamps in the example are quoted so that the example is valid
# JSON and matches the string timestamps that time_to_seconds() parses.
user_prompt = """
I want to get a top-10 short moments highlights. Equally distributed in time (not just the first X minutes).
Provide a list of JSONs with:
    - Each timestep of the highlight with "start" and "end" in MM:SS format.
    - What happens in a short sentence in "content".
    - The importance in "importance" of this highlight [0-1] in the context of the full context. A number between 0 and 1 that indicates how important is this highlight to understand the whole context.
Provide only those given keys: start, end, content and importance. Nothing else or the JSON will be broken.
I am looking for the most important ones from the whole video, so don't write a lot, just like a top-10 or so (only "importance" > 0.7). These are the top-10 highlights, equally distributed in time. Not a summary.
For example,
[
    {
        "start": "0:55",
        "end": "1:10",
        "content": "This is what happens in a short sentence.",
        "importance": 0.95
    },
    {
        "start": "41:50",
        "end": "42:15",
        "content": "This is what happens in a short sentence.",
        "importance": 0.71
    },
    ...
]
"""

In [None]:
# Send the query through the chat session created from the cached video
response = chat.send_message(user_prompt)
# The generation config requests an "application/json" response, so the reply
# text is decoded with json.loads; this raises if the text is not valid JSON.
highlights_dict = json.loads(response.text)

# If the cell above takes too much time to process, you can use this output from a previous execution (for the nba.mp4)
# highlights_dict = [{"start": "00:00", "end": "00:02", "content": "Festus Ezeli comes off the bench for his first start of the entire postseason.", "importance": 0.8}, {"start": "00:10", "end": "00:24", "content": "The game starts with no score.", "importance": 0.7}, {"start": "01:02", "end": "01:05", "content": "Thompson lays the ball in.", "importance": 0.75}, {"start": "01:56", "end": "02:00", "content": "Green scores.", "importance": 0.7}, {"start": "02:48", "end": "02:50", "content": "Barnes shoots and scores, bringing the Warriors into the lead.", "importance": 0.9}, {"start": "03:20", "end": "03:22", "content": "Green shoots for three.", "importance": 0.8}, {"start": "05:56", "end": "06:04", "content": "Irving makes a basket.", "importance": 0.7}, {"start": "07:19", "end": "07:22", "content": "Smith drives to the basket.", "importance": 0.7}, {"start": "08:27", "end": "08:29", "content": "Steve Kerr huddles with his team.", "importance": 0.7}, {"start": "10:49", "end": "10:51", "content": "James dunks the ball.", "importance": 0.75}]

In [None]:

# Preprocess
path_to_video_file = "/kaggle/input/videos-gemini-highlights/nba.mp4"
# In videos like sports, a higher "init_end_offset_sec" might provide a more useful resulting video. In other videos like conferences, a lower value makes more sense.
# If you want to keep the resulting video shorter (and thus take less time to render), set "ratio_clips_maintain" to 0.6 or 0.4 to only keep a percentage of the most important clips.
# Copy every highlight dict (not only the outer list) so preprocessing cannot modify the parsed model response.
highlights_dict_preproc = preprocess_highlights([dict(h) for h in highlights_dict], path_to_video_file, init_end_offset_sec=1.5, ratio_clips_maintain=1.)

In [None]:
# Finally, create the video
# The result is written to "output.mp4", a path relative to the notebook's working directory.
output_video_path = "output.mp4"
# Renders one captioned clip per preprocessed highlight and joins them into output_video_path.
highlights_video = create_highlights_video(highlights_dict_preproc, path_to_video_file, output_video_path)

ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

In [None]:
from IPython.display import HTML
from base64 import b64encode


def play(filename):
    """Return an IPython ``HTML`` object that plays a video file inline.

    The whole file is read into memory and embedded in the page as a base64
    ``data:`` URI.

    Args:
        filename: Path of the video file to embed (served as ``video/mp4``).

    Returns:
        An ``HTML`` object containing a ``<video>`` element.
    """
    # From: https://www.kaggle.com/code/mistag/play-video-in-notebook
    # Use a context manager so the file handle is closed right after reading.
    with open(filename, 'rb') as video_file:
        encoded = b64encode(video_file.read()).decode()
    src = 'data:video/mp4;base64,' + encoded
    html = '<video width=1000 controls autoplay loop><source src="%s" type="video/mp4"></video>' % src
    return HTML(html)


play(output_video_path)