In [1]:
import gradio as gr
import json
import ast
import os
from video_utils import get_transcript, stitch_video_segments, describe_video, download_youtube_video
from ai_utils import model, user_input
from config import GEMINI_API_KEY, ELEVEN_LABS_API_KEY, OPENAI_API_KEY, GROQ_API_KEY
from translation_utils import dub_video_translate, get_video, get_video2

def process_video(video_path, prompt, language, is_new=False):
    video_name = os.path.basename(video_path)
    output_transcript = get_transcript(video_path, language=language)

    print("Output transcript of video: ", output_transcript)

    chat = model.start_chat() if is_new else None
    
    transcript_combined = user_input(chat, output_transcript, prompt, video_name, is_new)
    transcript_combined = ast.literal_eval(transcript_combined)
    
    try:
        transcript_combined2 = json.dumps(transcript_combined, indent=4)
    except:
        transcript_combined2 = str(transcript_combined)
        print("Pretty print failed")
    
    processed_video = stitch_video_segments(video_path, transcript_combined, 'stitched_vid.mp4')  
    
    return processed_video, video_name, transcript_combined2

def process_youtube_video(youtube_url, prompt, language, is_new=False):
    video_path = download_youtube_video(youtube_url)
    if video_path is None:
        return None, "Download failed", "Failed to download YouTube video"
    
    return process_video(video_path, prompt, language, is_new)

with gr.Blocks() as demo:
    gr.Markdown("# ThirteenLabs Smart AI Editor")
    
    with gr.Row():
        with gr.Column():
            video_input = gr.Video(label="Upload Video")
            youtube_url = gr.Textbox(label="YouTube URL")
            download_youtube_button = gr.Button("Download YouTube Video")
            prompt_input = gr.Textbox(label="Prompt-to-edit")
            language_input = gr.Dropdown(choices=["en", "es", "fr", "de", "it", "ja", "ko", "zh"], label="Language", value="en")
            describe_video_button = gr.Button("Describe Video")
            get_transcript_button = gr.Button("Get Transcript")
            process_button = gr.Button("Process Video")
            process_button2 = gr.Button("Process New Video")
            process_youtube_button = gr.Button("Process YouTube Video")
        
        with gr.Column():
            video_output = gr.Video(label="Processed Video")
            video_name_output = gr.Textbox(label="Video Name")
            transcript_output = gr.Textbox(label="Processed Transcript")
            video_description_output = gr.Textbox(label="Video Description")
    
    with gr.Row():
        dub_video_btn = gr.Button("Dub video")
        dub_video_btn2 = gr.Button("Dub video with lip sync")
        target_language = gr.Textbox(label="Target Language")
        response_out = gr.Textbox(label="Translate Status")
    
    with gr.Row():
        translated_video_out = gr.Video(label="Translated Video")
    
    with gr.Row():
        translated_video_out2 = gr.Video(label="Translated Video2")

    dub_video_btn.click(fn=get_video, inputs=[], outputs=[translated_video_out])
    dub_video_btn2.click(fn=get_video2, inputs=[], outputs=[translated_video_out2])
    describe_video_button.click(fn=describe_video, inputs=[video_input], outputs=[video_description_output])
    get_transcript_button.click(fn=get_transcript, inputs=[video_input, language_input], outputs=[transcript_output])
    process_button.click(fn=process_video, inputs=[video_input, prompt_input, language_input], outputs=[video_output, video_name_output, transcript_output])
    process_button2.click(fn=lambda *args: process_video(*args, is_new=True), inputs=[video_input, prompt_input, language_input], outputs=[video_output, video_name_output, transcript_output])
    process_youtube_button.click(fn=process_youtube_video, inputs=[youtube_url, prompt_input, language_input], outputs=[video_output, video_name_output, transcript_output])


In [2]:
from vertexai.generative_models import GenerativeModel, Part

In [17]:
def describe_video(video_path):
    try:

        # genai.configure(api_key=GEMINI_API_KEY)

        # generation_config = {
        #     "temperature": 1,
        #     "top_p": 0,
        #     "top_k": 64,
        #     "max_output_tokens": 8192,
        #     "response_mime_type": "application/json",
        # }

        # safety_settings = [
        #     {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
        #     {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
        #     {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
        #     {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
        # ]

        # model = genai.GenerativeModel(
        #     model_name="gemini-1.5-flash",
        #     safety_settings=safety_settings,
        #     generation_config=generation_config,
        # )

        vertexai.init(project='gen-lang-client-0004068608', location="us-west1")

        model = GenerativeModel(model_name="gemini-1.5-flash")

        # Prepare the prompt
        prompt = """
        Provide a description of the video.
        The description should also contain anything important which people say in the video.
        Include key elements such as:
        
        Main subject or focus
        Setting or environment
        Actions or events occurring
        Notable visual elements or patterns
        Any text or recognizable logos
        Important dialogues or spoken content
        Summarize your observations in 3-4 sentences.
        Provide in JSON format with keys time_start, time_end, description, objects_in_scene, dialog, characters_in_scene
        
        Example JSON format:
        jsonCopy{
          "time_start": "00:00",
          "time_end": "01:30",
          "description": "video_description",
          "objects_in_scene": ["object1", "object2", "object3", ...],
          "dialog": "transcript_of_dialog",
          "characters_in_scene": ["character1", "character2", ...]
        }
        Please provide your video description using this JSON format.
        mime_type=application/json
        Do not provide any explanation.
        Only output list of JSON
        """

        # Read the video file
        with open(video_path, "rb") as file:
            video_content = file.read()

        # Create video part
        video_file = Part.from_data(data=video_content, mime_type="video/mp4")

        # video_file_uri = video_path
        # video_file = Part.from_uri(video_file_uri, mime_type="video/mp4")
        
        # Generate content using Gemini
        contents = [video_file, prompt]
        response = model.generate_content(contents)

        return response.text

    except Exception as e:
        print(f"Error describing video: {e}")
        return "Unable to generate video description."

In [30]:
video_output = describe_video("/Users/dylan/jupyter/side-projects/folder-dl88/trailer-videos/Borderlands (2024) Official Trailer - Cate Blanchett Kevin Hart Jack Black.mp4")

In [31]:
print(video_output)

```json
[
  {
    "time_start": "00:00",
    "time_end": "00:01",
    "description": "A woman with red hair lies on a wooden platform, aiming a gun towards the screen.",
    "objects_in_scene": ["woman", "gun", "wooden platform"],
    "dialog": null,
    "characters_in_scene": ["woman"]
  },
  {
    "time_start": "00:01",
    "time_end": "00:02",
    "description": "A large, dark creature with tentacles stands in the middle of a desert landscape. It is obscured by dust and debris. The word 'TRAILER' appears in gold letters, superimposed on a background of green and blue. ",
    "objects_in_scene": ["creature", "desert landscape", "tentacles", "dust"],
    "dialog": null,
    "characters_in_scene": []
  },
  {
    "time_start": "00:02",
    "time_end": "00:03",
    "description": "A woman leaps over a table with several doll-like figures sitting on it. The setting appears to be a desert wasteland with a large, rusty metal structure in the background.",
    "objects_in_scene": ["woman", 

In [32]:
import cv2
from PIL import Image
import io
import vertexai
from vertexai.generative_models import GenerativeModel, Part
import google.generativeai as genai
from moviepy.editor import VideoFileClip


def describe_video_by_frames(video_path, frame_interval=5):
    try:
        vertexai.init(project='gen-lang-client-0004068608', location="us-west1")

        model = GenerativeModel(model_name="gemini-1.5-flash")
        # genai.configure(api_key=GEMINI_API_KEY)

        # model = genai.GenerativeModel(model_name="gemini-1.5-flash")
        
        # Prepare the prompt
        prompt = f'''
        Provide a description of the video based on the sequence of frames provided.
        The description should also contain anything important which people say in the video.
        Include key elements such as:
        1. Main subject or focus
        2. Setting or environment
        3. Actions or events occurring
        4. Notable visual elements or patterns
        5. Any text or recognizable logos
        6. Important dialogues or spoken content
        Summarize your observations in 3 sentences.

        Each frame is {frame_interval} seconds long.

        Output a list of JSON where each JSON object is the description per frame.
        Do not output any explanation. 
        Example JSON format:
        {
          "time_start": 00:00,
          "time_end": 00:05,
          "description": "frame_description",
          "objects_in_scene": ["object1", "object2", "object3", ...],
          "dialog": "transcript_of_dialog",
          "characters_in_scene": ["character1", "character2", ...]
        }
        Please provide your video description using this JSON format.
        mime_type=application/json
        Do not provide any explanation.
        Only output list of JSON
        
        '''

        # Extract frames from the video using MoviePy
        video = VideoFileClip(video_path)
        frames = []
        
        for t in range(0, int(video.duration), frame_interval):
            frame = video.get_frame(t)
            
            # Convert the frame to PIL Image
            pil_image = Image.fromarray(frame)
            
            # Save the image to a byte stream
            byte_arr = io.BytesIO()
            pil_image.save(byte_arr, format='JPEG')
            image_bytes = byte_arr.getvalue()
            
            # Create a Part object from the image bytes
            image_part = Part.from_data(data=image_bytes, mime_type="image/jpeg")
            frames.append(image_part)

        video.close()

        # Generate content using Gemini
        contents = frames + [prompt]
        response = model.generate_content(contents)
        return response.text
    except Exception as e:
        print(f"Error describing video: {e}")
        return f"Unable to generate video description. Error: {str(e)}"

In [33]:
video_description=describe_video_by_frames("/Users/dylan/Downloads/jasonwei-1.mp4")

Error describing video: Invalid format specifier ' 00:00,
          "time_end": 00:05,
          "description": "frame_description",
          "objects_in_scene": ["object1", "object2", "object3", ...],
          "dialog": "transcript_of_dialog",
          "characters_in_scene": ["character1", "character2", ...]
        ' for object of type 'str'


In [34]:
print(video_description)

Unable to generate video description. Error: Invalid format specifier ' 00:00,
          "time_end": 00:05,
          "description": "frame_description",
          "objects_in_scene": ["object1", "object2", "object3", ...],
          "dialog": "transcript_of_dialog",
          "characters_in_scene": ["character1", "character2", ...]
        ' for object of type 'str'


In [91]:
# model = GenerativeModel(model_name="gemini-1.5-flash")
genai.configure(api_key=GEMINI_API_KEY)

model = genai.GenerativeModel(model_name="gemini-1.5-flash")

model = GenerativeModel(model_name="gemini-1.5-flash")


In [92]:
image_file = Part.from_uri(
    "/Users/dylan/Downloads/00006737.jpg",
    mime_type="image/jpeg"
)

# Query the model
response = model.generate_content([image_file, "what is this image?"])
print(response.text)

PermissionDenied: 403 Permission 'aiplatform.endpoints.predict' denied on resource '//aiplatform.googleapis.com/projects/gen-lang-client-0004068608/locations/us-central1/publishers/google/models/gemini-1.5-flash' (or it may not exist). [reason: "IAM_PERMISSION_DENIED"
domain: "aiplatform.googleapis.com"
metadata {
  key: "resource"
  value: "projects/gen-lang-client-0004068608/locations/us-central1/publishers/google/models/gemini-1.5-flash"
}
metadata {
  key: "permission"
  value: "aiplatform.endpoints.predict"
}
]