# Multimodal demo

This is an example of how to simulate a video- and audio-aware model using existing LLM vision models (that take text and images as input, and generate text as output).

In [1]:
import os
from pathlib import Path

import dotenv
from IPython.display import Audio, Video, display
from openai import OpenAI

from media_extractor import split_video
import datauri

In [None]:
# Load variables from the .env file into the process environment.
dotenv.load_dotenv()

# Fail fast when the key is missing *or* blank: an empty value (e.g. a bare
# `OPENAI_API_KEY=` line) would slip past an `is None` check.
if not os.environ.get("OPENAI_API_KEY"):
    raise ValueError("OPENAI_API_KEY not found in .env file")

# No key is passed explicitly; the client is expected to pick up
# OPENAI_API_KEY from the environment validated above.
client = OpenAI()

This is the input video that we'll turn into the user prompt.

In [2]:
# The input clip; later cells derive both the text prompt and the image
# inputs from this file.
video_file = "input.mov"

# Trailing expression renders an inline player for the clip (width=320).
# `Video` comes from the notebook's top import cell.
Video(video_file, width=320)

At the time of this writing, the GPT-4o API doesn't directly support video or audio input. Instead, we'll split the video into frames and feed them to the model as images, and transcribe the audio track and feed it to the model as text.

In [3]:
# Break the clip apart: one audio data URI plus the image URIs that are
# later sent to the model as `image_url` inputs.
(audio_uri, image_uris) = split_video(video_file)

# Show only the first 50 characters -- enough to see the data-URI header
# without dumping the whole base64 payload into the output.
audio_uri[:50]

'data:audio/mpeg;base64,SUQzBAAAAAACXVRYWFgAAAASAAA'

Transcribe the audio into text using OpenAI's `whisper-1` model. The result will serve as the text prompt for the LLM.

In [4]:
# The transcription endpoint is given a file path, so the audio data URI is
# exposed as a file for the duration of the request (presumably a temporary
# file that `as_tempfile` cleans up on exit -- confirm in `datauri`).
with datauri.as_tempfile(audio_uri) as audio_file:
    transcription = client.audio.transcriptions.create(
        file=Path(audio_file),
        model="whisper-1",
    )

# The transcribed speech becomes the text part of the user message.
user_prompt = transcription.text
user_prompt

"Can you describe what I'm wearing right now?"

We're ready to talk to the LLM: use the text and images as input, and get generated text back.

In [6]:
# Instructions for the model, read from disk. Decoded explicitly as UTF-8 so
# the prompt does not depend on the platform's default text encoding.
system_prompt = Path("system_prompt.txt").read_text(encoding="utf-8")

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        # The system message goes first, ahead of the user turn it governs;
        # this is the conventional ordering for chat conversations.
        {
            "role": "system",
            "content": system_prompt,
        },
        {
            "role": "user",
            # One multi-part user message: the transcribed text followed by
            # one image part per extracted image URI.
            "content": [
                {"type": "text", "text": user_prompt},
                *[
                    {
                        "type": "image_url",
                        "image_url": {"url": image_uri, "detail": "auto"},
                    }
                    for image_uri in image_uris
                ],
            ],
        },
    ],
)

# Only the first choice is used.
response_text = response.choices[0].message.content
response_text

'Hey there! From what I can see in the video, you’re wearing a grey long-sleeve top. It looks pretty comfy and casual.'

Use OpenAI's text-to-speech model to turn the generated text into audio.

In [7]:
# Synthesize the model's reply as MP3 speech.
audio = client.audio.speech.create(
    input=response_text,
    model="tts-1",
    voice="nova",
    response_format="mp3",
)

# Read the response body and wrap the bytes in an audio/mpeg data URI, the
# same representation used for the input audio earlier in the notebook.
audio_bytes = audio.read()
response_audio_uri = datauri.from_bytes(audio_bytes, "audio/mpeg")

In [8]:
# Play the spoken reply inline. The player is built and displayed inside the
# `with` block, while the file backing the data URI is still available
# (presumably a temporary file removed on exit -- confirm in `datauri`).
# `Audio` and `display` come from the notebook's top import cell.
with datauri.as_tempfile(response_audio_uri) as response_audio_file:
    display(Audio(response_audio_file))