## AdClip Gemini Prototype

#### AdClip Gemini uses Gemini to understand long-context videos and video ads, and trims them down to their most important segments. There are two options: automatic trimming for long-context videos (transcript only), or trimming based on the [YouTube ABCDs](https://www.thinkwithgoogle.com/intl/en-apac/future-of-marketing/creativity/youtube-video-ad-creative/) (Attention, Branding, Connection, Direction) for video ads (transcript and visual description).

Contact: adclip-team@google.com

## Install

In [None]:
#@title Install Modules
!pip install google-cloud-aiplatform --quiet
!pip install google-cloud-speech --quiet
!pip install firebase_functions~=0.1.0 --quiet
!pip install google-cloud-videointelligence --quiet
!pip install moviepy --quiet
!pip install pytube --quiet

In [None]:
#@title Initialize the imports
from firebase_functions import https_fn
from firebase_admin import initialize_app, firestore
from google.cloud import speech, storage
from vertexai.preview.language_models import TextGenerationModel
from vertexai.preview.generative_models import GenerativeModel, GenerationConfig, Image, Content, Part #gemini
from google.cloud import videointelligence

import moviepy.editor as moviepy
import re
import itertools
import functools
import copy
import math
import requests
from pytube import YouTube
from urllib.parse import urlparse, parse_qs

In [None]:
#@title GCS & FireStore

# Video files are stored in GCS, json files (transcripts, video shots) are stored in FireStore

def upload_blob(source_file_name: str,
                destination_blob_name: str) -> None:
    """Copy a local file into the bucket and log the transfer.

    Args:
      source_file_name: Path of the local file to upload.
      destination_blob_name: Object name to write inside the bucket.
    """
    # `bucket` is not defined in this function; it is read from the
    # enclosing (notebook/module) scope.
    target = bucket.blob(destination_blob_name)
    target.upload_from_filename(source_file_name)

    message = 'File {} uploaded to {}.'.format(
        source_file_name, destination_blob_name)
    print(message)

def download_blob(source_file_name: str,
                  destination_blob_name: str) -> None:
    """Fetch an object from the bucket into a local file and log it.

    Args:
      source_file_name: Object name inside the bucket.
      destination_blob_name: Local file path the object is written to
        (despite the name, this is passed to `download_to_filename`).
    """
    # `bucket` is read from the enclosing (notebook/module) scope.
    source = bucket.blob(source_file_name)
    source.download_to_filename(destination_blob_name)

    message = 'File {} downloaded to {}.'.format(
        source_file_name, destination_blob_name)
    print(message)

def does_file_exist(file_path: str) -> bool:
  """Report whether an object with the given name is in the bucket.

  Args:
    file_path: Object name to look up in the bucket.

  Returns:
    True when `bucket.get_blob` finds the object, otherwise False.
  """
  return bucket.get_blob(file_path) is not None

def upload_video_shots(file_name: str, video_shots: list) -> None:
  """Store the shot list in Firestore under `video_shots/<file_name>`.

  Args:
    file_name: Document id to write.
    video_shots: Shot list saved under the document's `data` field.
  """
  payload = {'data': video_shots}
  firestore.client().collection('video_shots').document(file_name).set(payload)


def get_video_shots(file_name: str) -> "list | None":
  """Gets video shots from firestore by file name.

  Args:
    file_name: Document id inside the `video_shots` collection.

  Returns:
    The value stored under the document's `data` field, or None when the
    document does not exist (or has no `data` field).
  """
  db = firestore.client()
  doc = db.collection('video_shots').document(file_name).get()
  if not doc.exists:
    return None
  return doc.to_dict().get('data')

def get_transcript(file_name: str) -> "list | None":
  """Gets transcript from firestore by file name.

  Args:
    file_name: Document id inside the `transcripts` collection.

  Returns:
    The value stored under the document's `original` field, or None when
    the document does not exist (or has no `original` field).
  """
  db = firestore.client()
  doc = db.collection('transcripts').document(file_name).get()
  if not doc.exists:
    return None

  return doc.to_dict().get('original')

def upload_transcript(file_name: str, transcript: list) -> None:
  """Store the transcript in Firestore under `transcripts/<file_name>`.

  Args:
    file_name: Document id to write.
    transcript: Transcript saved under the document's `original` field.
  """
  payload = {'original': transcript}
  firestore.client().collection('transcripts').document(file_name).set(payload)

In [None]:
#@title Downloads Videos From YouTube

#import moviepy
#from moviepy import editor as moviepy
#from moviepy.editor import moviepy.editor
#from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip

def get_id(youtube_url):
    """Extract the YouTube video id from a YouTube url.

    https://www.youtube.com/watch?v=EUYpKwgqi1M -> "EUYpKwgqi1M"
    https://youtu.be/9wobcM-WPQk -> "9wobcM-WPQk"

    Args:
      youtube_url: The url to inspect.

    Returns:
      The value of the `v` query parameter when present, otherwise the
      last segment of the url path (an empty string when the path is empty).
    """
    parsed = urlparse(youtube_url)
    query_ids = parse_qs(parsed.query).get('v')
    if query_ids:
        return query_ids[0]
    # No `v` parameter: the id is the last path segment. Strip a trailing
    # slash first so ".../<id>/" does not yield an empty id.
    return parsed.path.rstrip('/').split('/')[-1]

def video_accessible(youtube_url):
    """Check that a YouTube video can be read and return its id and title.

    Args:
      youtube_url: The url of the video to check.

    Returns:
      False when pytube reports the video as unavailable, otherwise a dict
      `{"video_id": ..., "video_title": ...}`.
    """
    # The file's import cell never brings this name into scope, so the
    # `except` clause would itself fail with NameError without this import.
    from pytube.exceptions import VideoUnavailable

    try:
        yt = YouTube(youtube_url)
        # Reading the title is what touches the video metadata, so it has
        # to sit inside the try for the handler to be of any use.
        video_title = yt.title
    except VideoUnavailable:
        return False
    return {"video_id": get_id(youtube_url), "video_title": video_title}

# Download Youtube video from url
def download_video_from_url(youtube_url,video_file):
    """Download a YouTube video into /tmp/ and return the local file path.

    Picks the first progressive mp4 stream after sorting by resolution in
    descending order.
    Pytube: https://pytube.io/en/latest/api.html#pytube.Stream.download

    Args:
      youtube_url: The url of the video to download.
      video_file: File name to give the downloaded video.

    Returns:
      The path string returned by pytube's `download`.
    """
    mp4_streams = YouTube(youtube_url).streams.filter(
        progressive=True, file_extension='mp4')
    best_stream = mp4_streams.order_by('resolution').desc().first()
    return best_stream.download(output_path='/tmp/', filename=video_file)

def download_video(youtube_url): # Main function for CF
    """Download a YouTube video and upload it to the bucket if not there yet.

    Args:
      youtube_url: The url of the video; None yields an error dict.

    Returns:
      `{"error": ...}` when the url is missing or the video is not
      accessible, otherwise `{"video_uri": ..., "full_path": ...}` where
      `full_path` is the object name `videos/<title>_<id>.mp4`.
    """
    if youtube_url is None:
        return {
            "error": "Missing url, sample format: https://youtu.be/9wobcM-WPQk"
        }
    # Look the video up once; each call goes through pytube again.
    video_info = video_accessible(youtube_url)
    if video_info is False:
        return {
            "error": "Video is not accessible"
        }
    video_id = video_info["video_id"]
    # Collapse every run of non-word characters in the title to one space.
    video_title = re.sub(r'\W+', ' ', video_info["video_title"])
    video_file = video_title + "_" + video_id + '.mp4'
    video_path_gcs = 'videos/' + video_file #adclip.appspot.com/videos/mytitle_ZDDH2.mp4
    if does_file_exist(video_path_gcs):
        print(f"Video files already exist in GCS: {video_path_gcs}")
    else:
        video_path_tmp = download_video_from_url(youtube_url=youtube_url,video_file=video_file)
        upload_blob(video_path_tmp, video_path_gcs) #tmp/video.mp4 --> adclip.appspot.com/videos/video.mp4
    return {
        "video_uri": "https://storage.mtls.cloud.google.com/adclip.appspot.com/" + video_path_gcs,
        "full_path": video_path_gcs
    }

In [None]:
#@title Detects Shots (Video Intelligence)

def process_video(video_gcs_uri: str) -> list:
  """Detects the shots of a video and returns their start/end timestamps.

  Runs shot-change detection through the Video Intelligence API and waits
  up to 300 seconds for the result. Nothing is stored by this function;
  persisting the shots is left to the caller.

  Args:
    video_gcs_uri: A video gcs uri for processing.

  Returns:
    A list of video shots metadata, in seconds. The start time is floored
    to one decimal place and the end time is rounded to one decimal place.
    For example:
    [
      {
        'start_time': 0.0,
        'end_time': 4.8
      },
      {
        'start_time': 5.2,
        'end_time': 5.6
      }
    ]
  """
  video_client = videointelligence.VideoIntelligenceServiceClient()

  #TODO: b/306068003 - Add speech-to-text feature here (needs
  # Feature.SPEECH_TRANSCRIPTION plus a SpeechTranscriptionConfig passed
  # through a VideoContext).
  features = [
    videointelligence.Feature.SHOT_CHANGE_DETECTION,
  ]

  operation = video_client.annotate_video(
    request={
      "features": features,
      "input_uri": video_gcs_uri,
    }
  )

  print("\nProcessing video.", operation)

  result = operation.result(timeout=300)

  print("\n finished processing.")

  video_shots = []
  # first result is retrieved because a single video was processed
  for i, shot in enumerate(result.annotation_results[0].shot_annotations):
    start_time = (
      shot.start_time_offset.seconds + shot.start_time_offset.microseconds / 1e6
    )
    end_time = (
      shot.end_time_offset.seconds + shot.end_time_offset.microseconds / 1e6
    )
    video_shots.append(
      {
        "start_time": math.floor(start_time * 10) / 10.0,
        "end_time": round(end_time, 1),
      }
    )
    # The log line shows the unrounded values.
    print("\tShot {}: {} to {}".format(i, start_time, end_time))

  return video_shots

In [None]:
#@title Transcribes Audio & Syncs with Shot

def get_speech_recognition_config(language_code: str):
  """Build the Speech-to-Text recognition config for a language.

  The recognition model is chosen from the language code: 'video' for
  'en-US', 'command_and_search' for 'zh-TW', and 'default' otherwise.

  Supported language codes and models are listed here:
  https://cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages

  Args:
    language_code: A language code for transcribing.

  Return:
    A speech recognition config.
  """
  model_by_language = {
      'en-US': 'video',
      'zh-TW': 'command_and_search',
  }
  return speech.RecognitionConfig(
      enable_word_time_offsets=True,
      audio_channel_count=2,  # 2 is default for wav files
      language_code=language_code,
      model=model_by_language.get(language_code, 'default'),
      # Works for model="video" or "phone call" (en-US only)
      use_enhanced=True,
  )

def extract_audio(video_full_path, file_name, output_name=None) -> str:
  """Extract the audio track of a bucket video into a .wav object.

  If the target .wav object already exists in the bucket, the video is not
  downloaded again and the existing object's path is returned.

  Args:
    video_full_path: Object name of the video inside the bucket.
    file_name: A file name for temp use; its stem also names the .wav file
      when `output_name` is not given.
    output_name: A custom output name (without extension).

  Returns:
    `GS_PATH` followed by the object name of the audio file.
  """
  file_name_without_extension = file_name.rsplit('.', 1)[0]
  if output_name is None:
    audio_output_file = file_name_without_extension + '.wav'
  else:
    audio_output_file = output_name + '.wav'
  gcs_file_path = AUDIO_FOLDER + audio_output_file

  if does_file_exist(gcs_file_path):
    print('File {} exists'.format(gcs_file_path))
    return GS_PATH + gcs_file_path

  # Pull the video next to where the audio file will be written.
  tmp_file_path = TEMP_FOLDER + file_name
  blob = bucket.blob(video_full_path)
  blob.download_to_filename(tmp_file_path)

  audio_output_path = TEMP_FOLDER + audio_output_file
  clip = moviepy.VideoFileClip(tmp_file_path)
  try:
    clip.audio.write_audiofile(audio_output_path)
  finally:
    # Release the reader resources held by the clip even if writing fails.
    clip.close()

  upload_blob(audio_output_path, gcs_file_path)

  return GS_PATH + gcs_file_path


def build_transcript(response) -> list:
  """Convert a speech API response into a list of timed transcript lines.

  Every alternative of every result that has at least one word becomes one
  line. Times are in seconds (taken from `total_seconds()`).

  Args:
    response: A transcript response from speech API.

  Returns:
    A list of transcript lines with per-word metadata. Each word's `gap`
    is its end time minus the end time of the word processed before it
    (0 for the very first word, carried over across lines).
    For example,
    [
      {
        "text": "some sentence",
        "startTime": 0,
        "endTime": 2.8,
        "duration": 2.8,
        "words": [
          {
            "text": "some",
            "startTime": 0,
            "endTime": 1.2,
            "duration": 1.2,
            "gap": 1.2
          },
          {
            "text": "sentence",
            "startTime": 1.2,
            "endTime": 2.8,
            "duration": 1.6,
            "gap": 1.6
          }
        ]
      }
    ]
  """
  lines = []
  previous_word_end = 0
  # Each result is for a consecutive portion of the audio.
  for result in response.results:
    for alternative in result.alternatives:
      if len(alternative.words) == 0:
        continue

      line_start = alternative.words[0].start_time.total_seconds()
      line_end = alternative.words[-1].end_time.total_seconds()

      word_items = []
      for word in alternative.words:
        word_start = word.start_time.total_seconds()
        word_end = word.end_time.total_seconds()
        word_items.append({
            'text': word.word,
            'startTime': word_start,
            'endTime': word_end,
            'duration': word_end - word_start,
            'gap': word_end - previous_word_end,
        })
        previous_word_end = word_end

      lines.append({
          'text': alternative.transcript,
          'startTime': line_start,
          'endTime': line_end,
          'duration': line_end - line_start,
          'words': word_items,
      })
  return lines


def generate_transcript_item(
    words: list, start_time: float = None, end_time: float = None) -> dict:
  """Build one transcript line from a list of word dicts.

  Args:
    words: Word dicts; each needs a 'text' key, and the first/last need
      'startTime'/'endTime' when the matching bound is not supplied.
    start_time: Line start; defaults to the first word's 'startTime'.
    end_time: Line end; defaults to the last word's 'endTime'.

  Returns:
    A dict with 'text' (word texts joined by spaces), 'startTime',
    'endTime', 'duration' and the original 'words' list.
  """
  if start_time is None:
    start_time = words[0]['startTime']
  if end_time is None:
    end_time = words[-1]['endTime']

  joined_text = ' '.join(word['text'] for word in words)
  return {
      'text': joined_text,
      'startTime': start_time,
      'endTime': end_time,
      'duration': end_time - start_time,
      'words': words,
  }


def refine_by_gaps(transcript: list) -> list:
  """Re-split each transcript line wherever a word's gap is unusually large.

  For every line, the average of the word gaps (excluding the first word's)
  is computed; a new item is started before any word from the third onward
  whose gap exceeds that average times GAP_MULTIPLIER. Lines with a single
  word are dropped from the output.

  Args:
    transcript: Transcript lines whose 'words' carry a 'gap' value.

  Returns:
    A new list of transcript items built with generate_transcript_item.
  """
  refined = []

  for line in transcript:
    line_words = line['words']
    gaps = [entry['gap'] for entry in line_words]
    del gaps[0]  # the first word's gap is left out of the average

    if not gaps:
      continue
    mean_gap = sum(gaps) / len(gaps)

    pending = []
    for position, entry in enumerate(line_words):
      if position > 1 and entry['gap'] > mean_gap * GAP_MULTIPLIER:
        refined.append(generate_transcript_item(pending))
        pending = []
      pending.append(entry)
    if pending:
      refined.append(generate_transcript_item(pending))
  return refined



def merge_clips(transcript: list) -> list:
  """Fold neighbouring transcript items into longer clips.

  Walking the transcript in order, the next item is merged into the clip
  being built when the combined span (next end minus clip start) is at
  most MIN_CLIP_DURATION, or when the next item's words fall inside the
  clip's time range. Otherwise the clip is emitted and a new one starts.

  Args:
    transcript: Transcript items with 'text', 'startTime', 'endTime' and
      'words' keys.

  Returns:
    The list of merged clips; empty when the transcript is empty.
  """
  if not transcript:
    return []

  def combine(first, second):
    """Joins two items, keeping the first start and the later end."""
    begin = first['startTime']
    finish = max(first['endTime'], second['endTime'])
    return {
        'text': f"{first['text']} {second['text']}",
        'startTime': begin,
        'endTime': finish,
        'duration': finish - begin,
        'words': first['words'] + second['words'],
    }

  def overlaps(first, second):
    """True when second's first word starts at/after first's start and
    second's last word starts at/before first's end."""
    second_first_word_start = second['words'][0]['startTime']
    second_last_word_start = second['words'][-1]['startTime']
    first_start = first['startTime']
    first_end = first['endTime']
    return (first_start <= second_first_word_start
            and second_last_word_start <= first_end)

  merged = []
  current = transcript[0]

  for upcoming in transcript[1:]:
    within_minimum = (
        upcoming['endTime'] - current['startTime'] <= MIN_CLIP_DURATION)
    if within_minimum or overlaps(current, upcoming):
      current = combine(current, upcoming)
    else:
      merged.append(current)
      current = upcoming
  # The clip still being built when the input runs out is always emitted.
  merged.append(current)
  return merged


def refine_by_video_shots(
    file_name: str, video_gcs_uri: str, transcript: list) -> list:
  """Regroups the transcript words so that items end on video shot changes.

  The shot list is read from firestore; when none is stored it is computed
  with process_video and then saved. All words of the transcript are
  flattened into one sequence and a new item is closed each time a word
  ends after the end of the current shot.

  Args:
    file_name: Key used to look up / store the video shots.
    video_gcs_uri: Video uri passed to process_video when shots are missing.
    transcript: Transcript lines; each line's 'words' list is used.

  Returns:
    A new list of transcript items built with generate_transcript_item.
  """

  new_transcript = []
  video_shots = get_video_shots(file_name)

  # Compute and cache the shots on first use.
  if video_shots is None:
    video_shots = process_video(video_gcs_uri)
    upload_video_shots(file_name, video_shots)

  video_shots_index = 0
  # Flatten the per-line word lists into one ordered list of words.
  list_of_words = list(map(lambda line: line['words'], transcript))
  transcript_words = list(itertools.chain.from_iterable(list_of_words))
  print('\\\\\ Transcript_words ////')
  print(transcript_words)
  words = []  # words collected for the item currently being built

  for index, word in enumerate(transcript_words):
    words.append(word)
    # Skip shots that end at or before the first collected word starts.
    # NOTE(review): video_shots_index is never bounds-checked; it looks like
    # words running past the last shot's end_time would raise IndexError
    # here - confirm against real data.
    while video_shots[video_shots_index]['end_time'] <= words[0]['startTime']:
      video_shots_index = video_shots_index + 1
    video_shot = video_shots[video_shots_index]
    # The word runs past the current shot: close the item here.
    if word['endTime'] > video_shot['end_time']:
      start_time = min(words[0]['startTime'], video_shot['start_time'])
      if index < len(transcript_words) - 1:
        # End at the word's end, or later if both the shot end and the
        # next word's start allow it.
        end_time = max(
            word['endTime'],
            min(
                video_shot['end_time'], transcript_words[index + 1]['startTime']
            ),
        )
      else:
        end_time = max(word['endTime'], video_shot['end_time'])
      video_shots_index = video_shots_index + 1
      new_transcript.append(
          generate_transcript_item(words, start_time, end_time)
      )
      words = []
  # Words left over after the last shot boundary form a final item.
  if len(words) > 0:
    start_time = min(
        words[0]['startTime'], video_shots[video_shots_index]['start_time']
    )
    if len(new_transcript) > 0:
      # Do not start before the previous item's last word has ended.
      previous_last_word = new_transcript[-1]['words'][-1]
      start_time = max(start_time, previous_last_word['endTime'])

    # `word` is still bound to the last word of the loop above.
    end_time = max(word['endTime'], video_shots[video_shots_index]['end_time'])
    video_shots_index = video_shots_index + 1
    new_transcript.append(generate_transcript_item(words, start_time, end_time))

  return new_transcript

# @https_fn.on_call(
#     timeout_sec=600,
#     memory=options.MemoryOption.GB_4,
#     cpu=2,
#     region='asia-southeast1',
# )

#   video_full_path = request.data['full_path'] or VIDEO_FULL_PATH
#   file_name = request.data['file_name'] or FILE_NAME
#   language_code = request.data['language_code'] or LANGUAGE_CODE


def _build_transcription_result(
    file_name: str, video_full_path: str, transcript: list) -> dict:
  """Assembles the three transcript views returned by transcribe_video."""
  return {
      'transcript': merge_clips(
          refine_by_video_shots(
              file_name,
              GS_PATH + video_full_path,
              transcript)),
      'original': transcript,
      'v1': refine_by_gaps(transcript),
  }


def transcribe_video() -> dict:
  """Transcribe the video audio and return the transcript views.

  Takes no arguments: the video path, file name and language code are read
  from VIDEO_FULL_PATH, FILE_NAME and LANGUAGE_CODE. A transcript already
  stored in firestore is reused; otherwise the audio is extracted, sent to
  the speech API (waiting up to 900 seconds), and the new transcript is
  stored in firestore.

  Returns:
    `{'error': ...}` when VIDEO_FULL_PATH is None, otherwise a dict with:
      'transcript': the transcript refined by video shots, then merged,
      'original': the transcript as stored/transcribed,
      'v1': the transcript refined by word gaps.
  """

  video_full_path = VIDEO_FULL_PATH
  file_name = FILE_NAME
  language_code = LANGUAGE_CODE

  if video_full_path is None:
    return {
        'error': (
            'Missing video uri, sample format:'
            ' https://googleapis.com/example.wav'
        )
    }

  # Reuse a stored transcript instead of calling the speech API again.
  transcript_in_firestore = get_transcript(file_name)
  if transcript_in_firestore is not None:
    return _build_transcription_result(
        file_name, video_full_path, transcript_in_firestore)

  audio_gcs_uri = extract_audio(video_full_path, file_name)
  print(f'Extracted audio is stored at {audio_gcs_uri}')

  audio = speech.RecognitionAudio(uri=audio_gcs_uri)
  client = speech.SpeechClient()

  config = get_speech_recognition_config(language_code)

  operation = client.long_running_recognize(config=config, audio=audio)

  print("Waiting for operation to complete...")
  response = operation.result(timeout=900)

  transcript = build_transcript(response)
  upload_transcript(file_name, transcript)

  return _build_transcription_result(file_name, video_full_path, transcript)

In [None]:
#@title Syncs with Shot (After Summarization)

def match_with_video_shots(video_shots: list,
                           transcript: list,
                           words: list) -> list:
  """Adjusts the startTime and endTime of each line in the transcript.

  This implementation helps with "jumpy" transition in the final output video.
  The lines of `transcript` are modified in place and the same list is
  returned.

  Args:
    video_shots: The list containing video shots; this function reads the
    'startTime' and 'endTime' keys of each shot.
    transcript: The lines to adjust; each needs 'startTime' and 'endTime'.
    words: A list containing the startTime and endTime of each word in the full
    transcript.

  Returns:
    The transcript with the adjusted startTime, endTime and duration.
  """
  # NOTE(review): shots are read with camelCase keys here, while
  # process_video in this file emits 'start_time'/'end_time' - presumably
  # the caller converts them; confirm.
  shot_index = 0
  word_index = 0
  for index, line in enumerate(transcript):
    # Find the first shot that ends after the line starts.
    while video_shots[shot_index]['endTime'] <= line['startTime']:
      shot_index += 1
    video_shot = video_shots[shot_index]

    # Pull the start back to the shot start when the shot begins earlier.
    start_time = min(line['startTime'], video_shot['startTime'])
    # Advance to the last word whose successor still ends before the line
    # starts.
    while (
        word_index + 1 < len(words) - 1
        and words[word_index + 1]['endTime'] < line['startTime']
    ):
      word_index += 1
    previous_word = words[word_index]
    # Unless that word is the line's own first word, do not start before
    # it has ended.
    if previous_word['startTime'] != line['startTime']:
      start_time = max(previous_word['endTime'], start_time)

    transcript[index]['startTime'] = start_time

    # Find the first shot that ends at or after the line ends.
    while video_shots[shot_index]['endTime'] < line['endTime']:
      shot_index += 1
    video_shot = video_shots[shot_index]

    # Push the end out to the shot end when the shot finishes later.
    end_time = max(line['endTime'], video_shot['endTime'])

    # Advance to the first word that starts at or after the line end
    # (or the last word of the list).
    while (
        word_index < len(words) - 1
        and words[word_index]['startTime'] < line['endTime']
    ):
      word_index += 1
    next_word = words[word_index]
    # Unless that word is the line's own last word, do not run into it.
    if next_word['endTime'] != line['endTime']:
      end_time = min(end_time, next_word['startTime'])

    if index == len(transcript) - 1:
      # The last line always runs to the end of the last shot.
      end_time = video_shots[-1]['endTime']
    else:
      #manually add 0.3s to end_time for better transitions
      end_time = round(end_time + 0.3, 2)

    transcript[index]['endTime'] = end_time
    transcript[index]['duration'] = end_time - start_time
  return transcript


def extract_words_from_str(summary: str) -> list:
    """Split a summary into lower-cased words, ignoring basic punctuation.

    A leading "transcript:" label (any casing, optionally preceded by
    whitespace) is removed, the characters , . ? ! are deleted, newlines
    are treated as spaces, and the text is split on single spaces. The
    resulting word list is also printed.

    Args:
      summary: A summary of the transcript.

    Return:
      A list of the non-empty, lower-cased words from the given summary.
    """
    lowered = summary.lower()
    # Drop the leading "transcript:" label the LLM sometimes prepends.
    if lowered.lstrip().startswith('transcript:'):
        summary = lowered.replace('transcript:', '', 1)

    cleaned = re.sub('[,.?!]', '', summary).lower().replace('\n', ' ')
    words = [token for token in cleaned.split(' ') if token]
    print(f'words: {words}')
    return words



def get_clips_from_transcript(
      #self,
      transcript_words: list,
      shortened_text: str,
      input_transcript: list) -> list:
    """Identifies the clip from the summarized transcript. This function  minimizes the hallucination when LLM
    doesn't respect the original sentences from the full transcripts by adding  new words or only returning parts
    of the original sentences in its response.

    Example:
      - Original sentence: "MacBook Air for the first time ever in 15 inches  we've been dreaming about making this for years we"
      - Response from LLM: "MacBook Air for the first time ever in 15 inches..."

    Args:
      transcript_words: The words of the original full transcript, as dicts
        read through 'text', 'startTime', 'endTime' and the optional
        'shouldKeep' flag.
      shortened_text: The "summarized" transcript from LLM.
      input_transcript: Not read by this function.

    Returns:
      A list of clips, each a dict with 'text', 'startTime', 'endTime',
      'duration' and 'words', built only from words of transcript_words.
    """
    print("----get_clips_from_transcript-----'")
    print(transcript_words)
    transcript_ptr = 0  # position in transcript_words
    output = []

    summary_words = extract_words_from_str(shortened_text)

    word_ptr = 0  # position in summary_words

    def does_word_match_transcript(transcript_idx: int, word_idx: int):
      # Out-of-range positions never match.
      if (transcript_idx >= len(transcript_words) or
          word_idx >= len(summary_words)):
        return False

      # Compare case-insensitively after dropping , . ? ! from the
      # transcript word.
      transcript_word_text = transcript_words[transcript_idx].get('text')
      transcript_word_text = re.sub('[,.?!]', '', transcript_word_text)
      return (transcript_word_text.lower() ==
              summary_words[word_idx].lower())

    while transcript_ptr < len(transcript_words):
      transcript_builder = []  # transcript words of the clip being built

      # loop until the summary word match with transcript
      # or until the transcript has True shouldKeep flag
      while (transcript_ptr < len(transcript_words)
             and not does_word_match_transcript(transcript_ptr, word_ptr)
             and transcript_words[transcript_ptr].get('shouldKeep') != True):
        transcript_ptr = transcript_ptr + 1

      # append all matched transcript summary
      # or transcript that has True shouldKeep flag
      # The two look-ahead checks tolerate a mismatch at the current
      # position when the next summary word matches one or two transcript
      # words further on.
      while (transcript_ptr < len(transcript_words) and
              (does_word_match_transcript(transcript_ptr, word_ptr)
               or does_word_match_transcript(transcript_ptr + 1, word_ptr + 1)
               or does_word_match_transcript(transcript_ptr + 2, word_ptr + 1)
               or transcript_words[transcript_ptr].get('shouldKeep') == True)):
        transcript_builder.append(transcript_words[transcript_ptr])

        if does_word_match_transcript(transcript_ptr, word_ptr):
          word_ptr += 1

        elif transcript_words[transcript_ptr].get('shouldKeep') != True:
          # Reached through a look-ahead match: also keep the following
          # transcript word.
          transcript_builder.append(transcript_words[transcript_ptr+1])

          # The match was two words ahead: keep that word too and skip
          # one extra transcript position.
          if not does_word_match_transcript(transcript_ptr + 1, word_ptr + 1):
            transcript_builder.append(transcript_words[transcript_ptr+2])
            transcript_ptr += 1

          transcript_ptr += 1
          word_ptr += 2

        transcript_ptr += 1

      if len(transcript_builder) == 0:
        continue
      # A run of a single word is discarded and the summary pointer is
      # stepped back by one.
      # NOTE(review): when that single word was kept only because of
      # shouldKeep, word_ptr was not advanced before this decrement and can
      # become negative - confirm this is intended.
      if len(transcript_builder) == 1:
        word_ptr -= 1
        continue

      new_text = list(map(lambda item: item.get('text'), transcript_builder))
      output.append({
        'text': ' '.join(new_text),
        'startTime': transcript_builder[0].get('startTime'),
        'endTime': transcript_builder[-1].get('endTime'),
        'duration': (transcript_builder[-1].get('endTime') -
               transcript_builder[0].get('startTime')),
        'words': transcript_builder
      })
    return output

In [None]:
#@title Gemini Functions

from logging import Handler
from vertexai.preview.generative_models import GenerativeModel, GenerationConfig, SafetySetting, Image, Content, Part, FinishReason, HarmCategory, HarmBlockThreshold

def upload_segments_to_gemini_bucket(segments: list,
                                    source_file_name: str,
                                    destination_blob_name: str) -> list:
    """Cut the source video into one file per segment and upload each to GCS.

    The source video is taken from VIDEO_FULL_PATH and the segment file
    names are derived from FILE_NAME.

    Args:
      segments: Dicts with 'startTime' and 'endTime' (passed to `subclip`).
      source_file_name: Not read by this function.
      destination_blob_name: Not read by this function.

    Returns:
      A list of uri for segmented videos (`GS_PATH` + 'gemini/<file>').
    """
    # Process video files in Colab's content folder for quicker processing
    local_video_path = f"/content/{VIDEO_FULL_PATH.replace('videos', '')}"
    download_blob(source_file_name=f"{VIDEO_FULL_PATH}", #/videos/video.mp4
                  destination_blob_name=local_video_path) #download to /content/ in codelab --- change later when deploying to CF, use /tmp/
    file_name_without_extension = FILE_NAME.replace('.mp4', '')
    segments_uri = []
    original_clip = moviepy.VideoFileClip(local_video_path)
    try:
        for counter, segment in enumerate(segments):
            segment_file = f"{file_name_without_extension}_{counter}.mp4"
            local_segment_path = f"/content/{segment_file}"
            new_clip = original_clip.subclip(segment['startTime'], segment['endTime'])
            new_clip.write_videofile(local_segment_path, audio_codec="aac", logger=None)
            # Upload from the path that was just written, so this does not
            # depend on the working directory being /content.
            upload_blob(source_file_name=local_segment_path,
                        destination_blob_name=f'gemini/{segment_file}')  # Upload to adclip.appspot.com/gemini/ bucket (separate from video bucket)
            segments_uri.append(f"{GS_PATH}gemini/{segment_file}")
    finally:
        # Release the reader resources held by the source clip.
        original_clip.close()
    return segments_uri

def send_video_to_gemini(video_full_path, root_promt="", context=""):
    """Ask the "gemini-pro-vision" model about a video and return its answer.

    Args:
      video_full_path: Uri of the mp4 video, passed to `Part.from_uri`.
      root_promt: Instruction placed at the start of the prompt. When empty,
        a default asking for a description of the visual elements is used.
      context: Extra text appended to the prompt.

    Returns:
      The text of all streamed response chunks, concatenated.
    """
    model = GenerativeModel("gemini-pro-vision")
    video = Part.from_uri(video_full_path, mime_type="video/mp4")
    # Honour the caller's prompt; it used to be overwritten unconditionally.
    root_prompt = (
        root_promt
        or "Provide a description on the visual elements of the video")
    prompt = f"{root_prompt} '\n' {context}"
    contents = [video, prompt]
    generation_config = GenerationConfig(max_output_tokens=2048,
                                         temperature=0.4,
                                         top_p=1,
                                         top_k=32)
    responses = model.generate_content(contents,
                                       stream = True,
                                       generation_config=generation_config,)
    return "".join(chunk.text for chunk in responses)

def send_transcript_to_llm(text: str,
                           model: str = "gemini-1.5-pro-001",
                           temperature: float = 0.3,
                           max_output_tokens: int = 8192,
                           top_k: int = 32,
                           top_p: int = 1) -> str:
    """Send a transcript, prefixed with `root_prompt`, to a Vertex model.

    The response text is printed and returned. All four harm categories are
    configured with the BLOCK_ONLY_HIGH threshold.

    Args:
        text: The transcript text appended after `root_prompt`.
        model: Name of the generative model to use.
        temperature: Degree of randomness in token selection.
        max_output_tokens: Upper bound on tokens generated in the response.
        top_k: Passed through to the generation config.
        top_p: Passed through to the generation config.

    Returns:
        The text of the model response.
    """
    llm = GenerativeModel(model)
    # `root_prompt` is read from the enclosing (notebook/module) scope.
    prompt = f"{root_prompt} '\n' {text} "
    generation_config = GenerationConfig(
        max_output_tokens=max_output_tokens,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
    )

    harm_categories = (
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        HarmCategory.HARM_CATEGORY_HARASSMENT,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
    )
    safety_config = [
        SafetySetting(
            category=harm_category,
            threshold=HarmBlockThreshold.BLOCK_ONLY_HIGH,
        )
        for harm_category in harm_categories
    ]

    result = llm.generate_content(
        [prompt],
        generation_config=generation_config,
        safety_settings=safety_config,
        stream=False,
    )
    print(result.text)
    return result.text


def select_segments(num_of_lines=3, descriptions=""):
    """Ask "gemini-1.5-pro-001" to pick the top lines per ABCD criterion.

    Args:
      num_of_lines: How many lines to request for each criterion.
      descriptions: Per-line transcript/visual descriptions appended to the
        end of the prompt.

    Returns:
      The text of the model response (the prompt asks for JSON).
    """
    prompt = f"""
    You are an expert video editor. Your task is to trim a video ad based on the following criteria
    - Awareness: Strong message, scene/tease, or conclusion or Showcase the subject, product, animation tightky framed, or close up, or zoomed in
    - Branding: Mention of brand name, logo, products, packages
    - Connection: Visible face or human (body parts, humans, animations, cartoons are acceptable) or Product Interaction (for example: a user holding phone, eating product, using app)
    or Clear messaging on benefits to consumer (show what products can do, what brands want viewers to do) or Competitive claims (awards, review, recommendation) or
    Emotion response through text, speech, music (fear, laughter, sadness, disgust, surprise, delight, etc.)
    - Direction: Call-to-action is detected through text or audio or Path to purchase showing how to buy (physical stores, app, website) is detected
    or A search bar is visible on screen or Mention of limited qualities, price, special offer

    Below are the computer-generated transcript and visual description of each of the line in the video

    Provide the top {num_of_lines} for each of the criteria above in JSON.

    {{'Awareness': {{'Line': Description,
                     'Line': Description}},
    'Branding':}}

    {descriptions}
    """
    llm = GenerativeModel("gemini-1.5-pro-001")
    generation_config = GenerationConfig(
        max_output_tokens=2048,
        temperature=0.2,
        top_k=32,
        top_p=1,
    )
    # Every harm category gets the same BLOCK_ONLY_HIGH threshold.
    harm_categories = (
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        HarmCategory.HARM_CATEGORY_HARASSMENT,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
    )
    safety_config = [
        SafetySetting(
            category=harm_category,
            threshold=HarmBlockThreshold.BLOCK_ONLY_HIGH,
        )
        for harm_category in harm_categories
    ]
    result = llm.generate_content(
        prompt,
        generation_config=generation_config,
        safety_settings=safety_config,
        stream=False,
    )
    return result.text

def get_descriptions_of_all_shots():
    """Render every entry of the global ``segments`` list as prompt text.

    Each segment becomes a ``Line <index>`` paragraph. When the global
    ``has_voice_over`` flag is truthy the paragraph also carries the
    segment's transcript; otherwise only its visual description is emitted.

    Returns:
        A single string with all paragraphs concatenated in segment order.
    """
    rendered = []
    for index, shot in enumerate(segments):
        paragraph = f"Line {index} \n"
        if has_voice_over:
            paragraph += f"Transcript: {shot['text']}\n"
        paragraph += f"Visual Description: {shot['visual_description']}\n"
        rendered.append(paragraph)
    return "".join(rendered)


# Few-shot prompt prefix for shortening a transcript: an instruction paragraph,
# one worked input/output example, and a trailing open "input:" header
# (presumably so the caller can append the transcript to shorten — confirm
# against the function that consumes root_prompt).
# NOTE(review): the instruction asks to "Keep Line number", but the worked
# example below carries no line numbers — confirm this is intended.
root_prompt ="""You are a senior copy writer for an advertising agency who excels at summarizing transcript for video ads.
Shorten the transcript by keeping important lines and removing other lines. Make sure the output is less than 30% of the original input transcript
Keep the format of the output the same with the input. Keep Line number. Do not capitalize sentences, add commas, or rewrite the output.

input:
MacBook Air
for the first time ever in 15 inches
we\'ve been dreaming about making this for years
we designed a big beautiful display
the kept it incredibly thin and super like it\'s all possible because of a basilica
M2 is so efficient but we don\'t need a fan which means you\'ll have a MacBook Air that\'s a thin as ever while running completely
silent and everything comes together inside a design packed so tight that there\'s barely room for an ant\'s
from the side it almost
disappears and the liquid Retina Display
just look at it
that\'s twice the resolution of a comparable 15 inch PC laptop and 25 percent brighter a bigger display means more room and more room means more speakers
double the Boost double the Beast
they have double the base of the
13-inch MacBook Air and because the speakers are located behind the
keyboard the sound reflects off the display towards the viewer so it feels super
immersive
okay tell us more about the chip it\'s a very efficient SOC with an 8 core
CPU tank or
GPU and a 16 Corner oh engine
running fifteen point eight trillion operations per second and comes with up to twenty four gigs of memory which
means killer battery life and crazy fast
of course it also works great with iPhone
and the one thing no one realizes well other laptops that are thin and wide can sometimes feel
flimsy
so we\'re extremely intentional with the are structural design keeping it ultra light but making it as durable as possible
because you know things happen
this is the
an inch laptop but we\'ve always wanted to create its
uncompromising
expansive the in light and Powerful we love making it and we think you\'ll love using it

output:
MacBook Air
for the first time ever in 15 inches
we\'ve been dreaming about making this for years
we designed a big beautiful display
just look at it
they have double the base of the
immersive
GPU and a 16 Corner oh engine
running fifteen point eight trillion operations per second and comes with up to twenty four gigs of memory which
means killer battery life and crazy fast
uncompromising
expansive the in light and Powerful we love making it and we think you\'ll love using it

input:"""

In [None]:
#@title Video Cutting Functions

from moviepy.video.fx.all import crop

def merge_segments(segments):
    """Merge consecutive segments that overlap or touch into single spans.

    Segments are processed in the order given; a segment is folded into the
    previous span when its 'startTime' is not later than that span's
    'endTime'. The input list and its dicts are left untouched.

    Args:
        segments: List of dicts carrying at least 'startTime' and 'endTime'.

    Returns:
        A new list of new dicts. A merged span keeps the remaining keys of
        the first segment that opened it.
    """
    if not segments:
        return []

    # Work on copies: the caller's dicts are shared with the notebook-level
    # segment list, so extending a span in place would corrupt it.
    merged = [dict(segments[0])]

    for segment in segments[1:]:
        current = merged[-1]
        if segment['startTime'] <= current['endTime']:
            # max(): a segment that ends inside the current span must not
            # shrink it.
            current['endTime'] = max(current['endTime'], segment['endTime'])
        else:
            merged.append(dict(segment))
    return merged

def clip_video(video_path, file_name, segments):
    """Cut the selected segments out of a video and export three aspect ratios.

    The segments are merged with ``merge_segments``, cut from the source
    video, joined in order, and written to ``/content`` as a 9:16 vertical
    crop, a 1:1 square crop and the uncropped clip.

    Args:
        video_path: Local path of the source video.
        file_name: Suffix used for the three output file names.
        segments: Dicts with 'startTime' and 'endTime'; the values are passed
            straight to ``subclip``.

    Raises:
        ValueError: If ``segments`` is empty, since there is nothing to cut.
    """
    segments = merge_segments(segments)
    if not segments:
        raise ValueError("clip_video() needs at least one segment to cut")

    # loading original video
    original_clip = moviepy.VideoFileClip(video_path)
    try:
        pieces = [
            original_clip.subclip(
                segment['startTime'],
                # make sure end_time does not exceed the video duration
                min(segment['endTime'], original_clip.duration),
            )
            for segment in segments
        ]
        # Concatenate once instead of re-wrapping the running result for
        # every additional segment.
        if len(pieces) == 1:
            new_clip = pieces[0]
        else:
            new_clip = moviepy.concatenate_videoclips(pieces)

        (w, h) = new_clip.size

        def center_crop(target_width):
            """Crop a full-height, horizontally centered window."""
            # Round down to an even width (presumably required by the
            # video encoder — confirm).
            target_width = target_width // 2 * 2
            x1, x2 = (w - target_width) // 2, (w + target_width) // 2
            return crop(new_clip, x1=x1, y1=0, x2=x2, y2=h)

        cropped_clip = center_crop(h * 9 / 16)  # 9/16 ratio
        cropped_clip_square = center_crop(h)    # 1/1 ratio

        # Name the outputs after the file_name argument rather than the
        # notebook-level FILE_NAME global.
        video_output_path = f"/content/output_vertical_{file_name}"  # 9:16 vertical
        video_output_square = f"/content/output_square_{file_name}"  # 1:1
        video_output_path_original = f"/content/output_horizontal_{file_name}"

        # Write file without wm; audio_codec is set so the clips have sound
        cropped_clip.write_videofile(video_output_path, audio_codec="aac")
        cropped_clip_square.write_videofile(video_output_square, audio_codec="aac")
        new_clip.write_videofile(video_output_path_original, audio_codec="aac")
    finally:
        # Release the file readers held by the source clip.
        original_clip.close()

    # upload to cloud storage
    # upload_blob(video_output_path, 'output/' + 'vertical_tmp_' + file_name)
    # upload_blob(video_output_square, 'output/' + 'square_tmp_' + file_name)
    # upload_blob(video_output_path_original, 'output/' + 'landscape_tmp_' + file_name)

## GCP & Videos Processing

In [None]:
#@title Configuration

#@markdown ##### Note: There will be a pop-up window asking you to authenticate. Feel free to use your own project and enable all required APIs.

# Values filled in through the Colab form; all three are empty by default.
PROJECT_ID = ""   # @param {type:"string"}
LOCATION = "" # @param {type:"string"}
GCLOUD_BUCKET_NAME = "" # @param {type:"string"}
# Root URI of the bucket, with a trailing slash.
GS_PATH = f'gs://{GCLOUD_BUCKET_NAME}/'


# From CF (presumably the Cloud Function version of this tool — confirm)
TEMP_FOLDER = '/tmp/'
AUDIO_FOLDER = 'videos/audio/'  # in Cloud Storage

# Customized for transcribe
# NOTE(review): units and exact meaning are not visible in this cell — presumably
# a gap factor and a minimum clip length in seconds; confirm where they are used.
GAP_MULTIPLIER = 2.5
MIN_CLIP_DURATION = 5

#@title Initialize GCP

# Authenticate the Colab user against the configured project, then point the
# gcloud CLI at the same project and echo it back as a sanity check.
from google.colab import auth as google_auth
google_auth.authenticate_user(project_id=PROJECT_ID)
!gcloud config set project {PROJECT_ID}
!gcloud config get-value project

# Notebook-wide Cloud Storage handles; `bucket` is the configured GCS bucket.
storage_client = storage.Client()
bucket = storage_client.get_bucket(GCLOUD_BUCKET_NAME)

#Initialize Front End; Need to initialize to use Firestore
initialize_app()

In [None]:
#@title Download Videos from YouTube and Upload to GCS

#@markdown ##### If downloading from YouTube does not work, you can upload the video to the GCS bucket and enter its VIDEO_FULL_PATH directly in the box. [Link to GCS bucket](https://pantheon.corp.google.com/storage/browser/adclip.appspot.com/videos?e=13802955&mods=dm_deploy_from_gcs&project=adclip&pageState=(%22StorageObjectListTable%22:(%22f%22:%22%255B%255D%22))) `gs://{GCLOUD_BUCKET_NAME}/{VIDEO_FULL_PATH}`
youtube_url = "" # @param {type:"string"}
VIDEO_FULL_PATH = "" # @param {type:"string"}

#@markdown ##### Not all languages are supported.
has_voice_over = True # @param ["False", "True"] {type:"raw"}
LANGUAGE_CODE = "en-US" # @param ["en-US", "es-ES", "id-ID", "fil-PH", "th-TH", "vi-VN"]

#@markdown ##### Select this option to analyze videos with visual elements. Note: Only do this for videos shorter than 10 minutes, please.
upload_to_gemini_bucket = False # @param ["False", "True"] {type:"raw"}

##@markdown ##### Select the model for Speech-to-Text. Not all models are available (for example, "video" is only available for en-US). If in doubt, use "default". Check [this link](https://cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages) for details.
#MODEL = "default" # @param ["default", "command_and_search", "latest_short", "video"]

# Best effort: try the YouTube download first; on any failure keep the
# VIDEO_FULL_PATH entered in the form above.
try:
    VIDEO_FULL_PATH = download_video(youtube_url)["full_path"] # will return `videos/myclip.mp4`
except Exception as err:
    print(f"YouTube download skipped ({type(err).__name__}: {err}); using VIDEO_FULL_PATH from the form.")

# Fail here with a clear message instead of later with an empty file name.
if not VIDEO_FULL_PATH:
    raise ValueError("No video available: provide a working youtube_url or a VIDEO_FULL_PATH inside the bucket.")

# Last path component, e.g. `myclip.mp4` for `videos/myclip.mp4`.
FILE_NAME = VIDEO_FULL_PATH.split("/")[-1]

# Build `segments`: transcript lines when the video has a voice-over,
# otherwise the detected video shots re-keyed to the transcript convention.
if has_voice_over:
    segments = transcribe_video()["transcript"]  # ['transcript', 'original', 'v1']
else:
    shots = process_video(
        video_gcs_uri=f"gs://{GCLOUD_BUCKET_NAME}/{VIDEO_FULL_PATH}") # returns videoshot only
    # Speech-to-Text and Shot Detection use different conventions for keys
    reformed_keys = {'start_time': 'startTime', 'end_time': 'endTime'}
    segments = [
        {reformed_keys[key]: value for key, value in shot.items()}
        for shot in shots
    ]

# Raw string: prints the same banner as before without relying on the
# invalid "\ " escape sequence.
print(r"\\ Segments ////")
print(segments)
print(f"The video is segmented to {len(segments)} parts.")

# Optionally split the video into per-segment clips under `gemini/` in the
# bucket so each segment can be sent to Gemini on its own.
if upload_to_gemini_bucket:
    file_name_without_extension = FILE_NAME.replace('.mp4', '')
    segment_file_0 = f"{file_name_without_extension}_0.mp4"

    # Only the first clip is probed: if it exists the upload is assumed to
    # have been done already.
    if does_file_exist(f"gemini/{segment_file_0}"):
        print(f"Segments for {FILE_NAME} already exist in {GCLOUD_BUCKET_NAME}; upload skipped")
    else:
        segments_uri = upload_segments_to_gemini_bucket(segments,
                                                    source_file_name=VIDEO_FULL_PATH,
                                                    destination_blob_name=f'gemini/{FILE_NAME}')
        print(f"Completed uploading {len(segments)} segments to {GCLOUD_BUCKET_NAME} buckets")

## Gemini Processing (Visual + Semantic)

### Only run Option 1 or Option 2

In [None]:
# @title Option 1: Select Shots based on Transcript and Visual Description (ABCD framework)

# The per-segment clips and `file_name_without_extension` only exist when the
# upload option was selected in the download cell, so gate on that flag
# (the function object `send_video_to_gemini` is always truthy).
if upload_to_gemini_bucket:
    for counter, segment in enumerate(segments):
        # Read the clips from the configured bucket, where they were uploaded.
        video_full_path = f"gs://{GCLOUD_BUCKET_NAME}/gemini/{file_name_without_extension}_{counter}.mp4"
        try:
            response = send_video_to_gemini(video_full_path)
            segment['visual_description'] = response.strip()
        except Exception as err:
            # Best effort: e.g. the response got blocked. Keep going with an
            # empty description but say which segment was affected.
            print(f"No visual description for segment {counter} ({type(err).__name__}: {err})")
            segment['visual_description'] = ""
    descriptions = get_descriptions_of_all_shots()
    print(descriptions)

    # The higher num of lines, the longer the video will be; ask for at least one.
    num_of_lines = max(1, len(segments) // 2)
    data = select_segments(num_of_lines=num_of_lines, descriptions=descriptions).strip("'")
    print(data)
    print('Data is type string. Please input the line number in the box below')
else:
    print("Option 1 needs the per-segment clips: enable upload_to_gemini_bucket in the download cell and run it again.")

In [None]:
# WRITE THE LINE NUMBERS OF THE SEGMENTS SELECTED BY GEMINI IN THIS LIST
# (sorted() keeps the indices in ascending order).
selected_segments = sorted([0,1,2,3])

In [None]:
# @title Option 2: Select Shots based on Transcript (no Visual Description) -- use for transcript-heavy videos.

input_transcript = segments
# Prefix every line with its index so the model's answer can be mapped back
# to entries of `segments`.
full_text = "".join(
    f"{counter}: {entry['text']}\n"
    for counter, entry in enumerate(input_transcript)
)

shortened_text = send_transcript_to_llm(full_text).strip(" ").strip("'")
print('----shortened_text-----')
print(shortened_text)

picked = set()
for line in shortened_text.split("\n"):
    try:
        counter = int(line.split(": ")[0])
    except ValueError:
        # Not an "<index>: text" line (blank line, commentary, ...).
        continue
    # Ignore indices the model made up; they would fail or pick the wrong
    # segment when indexing `segments` later.
    if 0 <= counter < len(input_transcript):
        picked.add(counter)

# Ascending and de-duplicated, like the manually entered list of Option 1.
selected_segments = sorted(picked)

print(selected_segments)

## Video Processing

In [None]:
# @title Generate Short-Form Videos

import os

# Resolve the chosen indices to their segment dicts.
processed_segments = [segments[index] for index in selected_segments]
print(f"Number of segments from the original videos: {len(segments)} | Number of Segments selected by Gemini: {len(processed_segments)}")

# Fetch the source video from the bucket unless a local copy already exists.
local_video_path = "/content/" + VIDEO_FULL_PATH.replace('videos/', '')
already_downloaded = os.path.exists(local_video_path)
if not already_downloaded:
    download_blob(
        source_file_name=VIDEO_FULL_PATH,
        destination_blob_name=local_video_path,
    )

print("Start shortening videos, this process will take a while ...")
clip_video(
    video_path=local_video_path,
    file_name=FILE_NAME,
    segments=processed_segments,
)

def _rename_shot_keys(raw_shots):
    """Return the shots re-keyed from start_time/end_time to startTime/endTime."""
    # Speech-to-Text and Shot Detection use different conventions for keys
    key_map = {'start_time': 'startTime', 'end_time': 'endTime'}
    return [{key_map[key]: value for key, value in shot.items()}
            for shot in raw_shots]


# FILE_NAME is switched to the "_sync" name only while the synced clip is
# written and restored afterwards, so re-running this cell does not stack
# "_sync" suffixes or look up shots under the wrong name.
original_file_name = FILE_NAME
try:
    ################ Second sync with shots###############
    print("Try processing with shots sync")
    input_transcript = processed_segments
    transcript_words = list(itertools.chain.from_iterable(
        line['words'] for line in input_transcript))
    video_shots = get_video_shots(FILE_NAME)
    ## process video shots to align keys name ###
    if video_shots is None:
        # Fall back to `shots` from the download cell, which is only
        # assigned there when has_voice_over is False.
        video_shots = _rename_shot_keys(shots)
    # NOTE(review): this text is built from all `segments`, not only the
    # selected ones — confirm that is what get_clips_from_transcript expects.
    shortened_text = ''.join(f"{entry['text']}\n" for entry in segments)
    segments_transformed = get_clips_from_transcript(transcript_words=transcript_words,
                                shortened_text=shortened_text,
                                input_transcript=input_transcript)
    # NOTE(review): the result above is overwritten below and `segments`
    # (not `segments_transformed`) is passed on — verify the intended argument.
    try:
        segments_transformed = match_with_video_shots(video_shots, segments, transcript_words)
        print("Matched with Shots successfully")
    except Exception as err:
        print("Retry matching with shots")
        print(f"First attempt failed with {type(err).__name__}: {err}")
        shots = get_video_shots(FILE_NAME)
        video_shots = _rename_shot_keys(shots)
        segments_transformed = match_with_video_shots(video_shots, segments, transcript_words)
    print(f"Segments selected by Gemini and processed with video shots: {len(processed_segments)}")
    FILE_NAME = f"{FILE_NAME.replace('.mp4','')}_sync.mp4"
    clip_video(video_path=local_video_path,
            file_name=FILE_NAME,
            segments=segments_transformed)

except Exception as err:
    # Deliberately best effort: the unsynced clips were already written above.
    print("Can not process with shots matching. This is due to the video does not have voice over or the transcript is too heavy to be match with video shots.")
    print(f"Details: {type(err).__name__}: {err}")
finally:
    FILE_NAME = original_file_name

## Copy files to Google Drive (if needed)

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
# !cp "/content/output.mp4" "/content/drive/MyDrive/"

In [None]:
# Write data to JSON if needed

import json

with open("segments.json", 'w', encoding="utf-8") as f:
    json.dump(segments, f, indent=2)

# `segments_transformed` is only assigned when the shot-sync step ran far
# enough; skip its dump instead of failing with a NameError.
if "segments_transformed" in globals():
    with open("segments_transformed.json", 'w', encoding="utf-8") as f:
        json.dump(segments_transformed, f, indent=2)
else:
    print("segments_transformed is not defined (shot sync did not complete); skipping segments_transformed.json")