In [4]:
from IPython.display import display, Image, Audio
import cv2 
import base64
from openai import OpenAI
client = OpenAI(
    api_key="",
)

In [5]:
video = cv2.VideoCapture("wearable.mp4")

base64Frames = []
while video.isOpened():
    success, frame = video.read()
    if not success:
        break
    _, buffer = cv2.imencode(".jpg", frame)
    base64Frames.append(base64.b64encode(buffer).decode("utf-8"))

video.release()
print(len(base64Frames), "frames read.")

1092 frames read.


In [56]:
HUMAN_NAME = "Ethan"
HUMAN_ATTRIBUTES = "A man with brown hair"
HUMAN_GOALS = "Productivity, distraction prevention, connection to nature."

PROMPT_MESSAGES = [
    {
        "role": "user",
        "content": [
f"""You are the world's most advanced AI life coach. You are helping a human improve their life.  You have access to videos from their point of view. Your job is to observe their activities and evaluate if their behavior is aligned with their stated goals. Details about your human:
Name: {HUMAN_NAME}
Attributes: {HUMAN_ATTRIBUTES}
Goals: {HUMAN_GOALS}
            
For key behaviors observed, provide positive feedback when they are aligned with their goals and provide constructive criticism when they are not. Your output should well be used to generate a video montage of your critique of there behavior. The output should be a JSON array of objects. Each object represents a scene in the montage. Each object should have a 'narration' string which is your narration of the scene and an 'image' string which is a DALL-E prompt for an image to go along with the narration. The narration should be in the tone of a helpful coach, providing encouragement about what was done well and helpful critiques and how to improve on what wasn't. The first scene should be an introduction and welcome message. The the final screen should be a conclusion and goodbye message. The narration should be in the past tense. Just output the raw JSON array of objects.

IMPORTANT:
- In each scene your commenting on you must declare what you observed and how it relates to the goals. Refer to your human in the second person (you, your, etc) since the video is mean for them.
- Keep all of the image prompts in consistent style so the images are generated consistently for the video. Make sure to describe your humans appearance if they are included in the image prompt.
- Just pick a single image prompt for each scene. You don't need to generate multiple images for each scene.
- Only include your top three or four observations.
""",
            *map(lambda x: {"image": x, "resize": 768}, base64Frames[0::100]),
        ],
    },
]

result = client.chat.completions.create(
    messages=PROMPT_MESSAGES,
    model="gpt-4-vision-preview",
    max_tokens=800,
)
  
print(result.choices[0].message.content)

```json
[
  {
    "narration": "Welcome, Ethan. As your AI life coach, I'm here to help you align your actions with your goals of productivity, distraction prevention, and connection to nature. Let's see how you've done!",
    "image": "digital painting of a welcoming AI life coach in a virtual office"
  },
  {
    "narration": "You demonstrated great focus while working at your computer. Your dedicated work sessions are a strong step towards high productivity.",
    "image": "digital painting of a man with brown hair working intently on a computer in a well-organized workspace"
  },
  {
    "narration": "However, I noticed your phone becoming a source of distraction. Remember, frequent checks can fragment your attention. Let's work on setting specific times to check your phone, further boosting your productivity.",
    "image": "digital painting of a man with brown hair at a desk reaching for a smartphone with notification icons hovering above it"
  },
  {
    "narration": "I observed

In [62]:
import json
from elevenlabs import generate, play, set_api_key

# Parse the JSON data
parsed_data = result.choices[0].message.content.replace("```json", "").replace("```", "")
parsed_data = json.loads(parsed_data)

# Set your ElevenLabs API key
set_api_key("")

# Initialize an array to hold the audio data
audio_clips = []

# Loop over each item in the parsed JSON array
for item in parsed_data:
    narration = item['narration']  # Replace 'narration' with the actual key you have in your JSON objects

    # Generate audio using the ElevenLabs API
    audio = generate(
        text=narration,
        voice="xxkfiTS2BHbm5fmUmlSp",
        model='eleven_multilingual_v2'
    )

    # Store the audio data in the local array
    audio_clips.append(audio)

In [58]:
images = []

# Loop over each item in the parsed JSON array
for item in parsed_data:
    prompt = item['image']  # Replace 'narration' with the actual key you have in your JSON objects

    response = client.images.generate(
      model="dall-e-3",
      prompt=prompt,
      size="1024x1024",
      quality="hd",
      n=1,
    )


    # Store the audio data in the local array
    images.append(response.data[0].url)


In [64]:
import requests
import os


# Directory where you want to save the files
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)

# Function to save audio clips from memory to files
def save_audio_clips(audio_clips, output_dir):
    audio_filenames = []
    for i, audio_clip in enumerate(audio_clips):
        # Assuming the audio clips are in mp3 format
        filename = os.path.join(output_dir, f"audio{i}.mp3")
        with open(filename, 'wb') as f:
            f.write(audio_clip)
        audio_filenames.append(filename)
    return audio_filenames

# Function to download images from URLs to files
def download_images(images, output_dir):
    image_filenames = []
    for i, image_url in enumerate(images):
        response = requests.get(image_url)
        if response.status_code == 200:
            # Assuming the images are in jpg format, you can modify the extension if necessary
            filename = os.path.join(output_dir, f"image{i}.jpg")
            with open(filename, 'wb') as f:
                f.write(response.content)
            image_filenames.append(filename)
    return image_filenames

# Save the audio clips to files
audio_filenames = save_audio_clips(audio_clips, output_dir)

# Download the images to files
image_filenames = download_images(images, output_dir)

# Now you have lists of filenames that can be used with ffmpeg
print("Audio files saved:", audio_filenames)
print("Image files downloaded:", image_filenames)

Audio files saved: ['output/audio0.mp3', 'output/audio1.mp3', 'output/audio2.mp3', 'output/audio3.mp3', 'output/audio4.mp3']
Image files downloaded: ['output/image0.jpg', 'output/image1.jpg', 'output/image2.jpg', 'output/image3.jpg', 'output/image4.jpg']


In [65]:
import subprocess
import os

# Temporary file list for the ffmpeg concatenation
filelist = 'filelist.txt'
with open(filelist, 'w') as f:
    for i, (image_file, audio_file) in enumerate(zip(image_filenames, audio_filenames)):
        # Get the duration of the audio file in seconds
        audio_duration_command = [
            'ffprobe', '-v', 'error', '-show_entries', 'format=duration',
            '-of', 'default=nw=1:nk=1', audio_file
        ]
        audio_duration = float(subprocess.check_output(audio_duration_command).decode('utf-8').strip())
        # Calculate the number of frames (assuming 25 fps)
        num_frames = int(audio_duration * 25)

        # Define the output video name for this image + audio
        output_video = f'output/temp_video_{i}.mp4'

        # Define the Ken Burns effect command with audio parameters
        kb_command = [
            'ffmpeg',
            '-loop', '1',
            '-i', image_file,
            '-i', audio_file,
            '-vf', f'zoompan=z=\'if(lte(zoom,1.0),1.0,zoom+0.002)\':d=125:x=\'iw/2-(iw/zoom/2)\':y=\'ih/2-(ih/zoom/2)\':fps=25:s=hd720',
            '-c:v', 'libx264',
            '-c:a', 'aac',  # Explicit AAC encoding
            '-b:a', '192k',  # Set audio bitrate
            '-ar', '44100',  # Set audio sample rate
            '-t', str(audio_duration),  # Set the duration of the output video to match the audio duration
            '-map', '0:v:0',  # Map video stream
            '-map', '1:a:0',  # Map audio stream
            output_video
        ]
        
        # Run the Ken Burns effect command
        subprocess.call(kb_command)
        
        # Write the file output to the file list for concatenation
        f.write(f"file '{output_video}'\n")

# Define the transition effect and concatenation command
transition_command = [
    'ffmpeg',
    '-f', 'concat',
    '-safe', '0',
    '-i', filelist,
    '-c:v', 'libx264',
    '-c:a', 'aac',
    '-strict', 'experimental',
    'output/final_output.mp4'
]

# Run the transition effect and concatenation command
subprocess.call(transition_command)

# Clean up temporary video files and the file list
os.remove(filelist)
# Uncomment the following lines if you want to remove temporary video files
# for i in range(len(image_files)):
#     os.remove(f'output/temp_video_{i}.mp4')

print("Video created successfully: output/final_output.mp4")




ffmpeg version 6.0 Copyright (c) 2000-2023 the FFmpeg developers
  built with Apple clang version 14.0.3 (clang-1403.0.22.14.1)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/6.0_1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags= --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --enable-libsoxr --enable-libzmq --enable-libzimg --di

Video created successfully: output/final_output.mp4


frame= 1657 fps=468 q=-1.0 Lsize=    2592kB time=00:01:06.28 bitrate= 320.3kbits/s dup=1 drop=0 speed=18.7x    
video:1962kB audio:574kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 2.196153%
[libx264 @ 0x144806060] frame I:9     Avg QP:17.12  size:207029
[libx264 @ 0x144806060] frame P:417   Avg QP:19.26  size:   191
[libx264 @ 0x144806060] frame B:1231  Avg QP:29.64  size:    53
[libx264 @ 0x144806060] consecutive B-frames:  0.8%  0.2%  0.2% 98.7%
[libx264 @ 0x144806060] mb I  I16..4:  8.3% 51.1% 40.6%
[libx264 @ 0x144806060] mb P  I16..4:  0.0%  0.0%  0.0%  P16..4:  0.9%  0.1%  0.1%  0.0%  0.0%    skip:99.0%
[libx264 @ 0x144806060] mb B  I16..4:  0.0%  0.0%  0.0%  B16..8:  0.6%  0.0%  0.0%  direct: 0.0%  skip:99.4%  L0:25.7% L1:74.3% BI: 0.0%
[libx264 @ 0x144806060] 8x8 transform intra:51.0% inter:83.2%
[libx264 @ 0x144806060] coded y,u,v intra: 89.1% 69.4% 68.2% inter: 0.0% 0.0% 0.0%
[libx264 @ 0x144806060] i16 v,h,dc,p: 21% 40%  4% 35%
[libx264 @ 0x144806060]