In [None]:
# Copyright 2017 The PARSE-ego4D Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [None]:
from tqdm.notebook import tqdm
import random
import json
import pandas as pd
import datetime

In [1]:
# @title Prompt engineering

# Follow-up prompt template for a reply that failed JSON decoding. The
# placeholders {full_prompt}, {response} and {e} are filled with str.format().
FIX_JSON_PROMPT = (
    "{full_prompt}\n"
    "{response}\n"
    "This response returned the following JSON decode error: {e}. "
    "Please re-generate the response to fix the error:\n"
)

# Follow-up prompt template asking the model for a second, different
# suggestion. The placeholders {full_prompt} and {response} are filled with
# str.format().
ANOTHER_SUGGESTION_PROMPT = (
    "{full_prompt}\n"
    "{response}\n"
    "\n"
    "Thank you for generating this response. Please now generate another JSON "
    "response in the same manner as before, but with a different suggestion. "
    "Do not reuse the same response from above, but rather think about a new "
    "response from scratch. Make it an even more useful suggestion for the user.\n"
    "\n"
    "Response:\n"
)

# Catalogue of the apps/actions the AR glasses support, with the API format and
# JSON example actions for each app, followed by proactive and object-centered
# query examples. Every example action line must be valid JSON and must only use
# the enum values listed in the matching "API format" section, because the model
# is expected to imitate these examples verbatim.
sys_prompt_actions = """\
Here is a list of available apps that you can use to recommend actions to the user:

Multimodal search (MMS): This application will take in the current camera input and a text query and run a multimodal search using the text query with the image as context. MMS can recognize objects, identify plants and animals, provide nutritional information, look up information, and answer general knowledge questions. It takes language and image input, and outputs text.

API format:
action: "mms"
params:
  query: str, adapted from the user query

And here are some example actions for this app:
{"action": "mms", "params": {"query": "How much does this cost?"}}
{"action": "mms", "params": {"query": "How much sugar is in this bar?"}}
{"action": "mms", "params": {"query": "Tell me about this book?"}}
{"action": "mms", "params": {"query": "What is this painting? Who is it by? Is it popular?"}}
{"action": "mms", "params": {"query": "What is the common name for this plant?"}}
{"action": "mms", "params": {"query": "Is this gluten free?"}}
{"action": "mms", "params": {"query": "Where can I buy this?"}}

A: This is the Android device assistant that has access to system apps. Basic apps that can be called from the Assistant include: `Notes`, `Timer`, `Stopwatch`, `Alarm`, `Email`, `Music`, `Phone`, `Contacts`, `Messages`, `Settings`, `Calculator`. Additionally, the Assistant can control smart home gadgets, access notifications, and others.

API format:
action: "assistant"
params:
  query: str, adapted from the user query
  hint: str, hint for the assistant about what app/system to use for handling this request (optional)

And here are some example actions for this app:
{"action": "assistant", "params": {"query": "Do pineapples need to be refrigerated?"}}
{"action": "assistant", "params": {"query": "What is memorial day?", "hint": "search"}}
{"action": "assistant", "params": {"query": "What size tank does the Yamaha YZ125 have?", "hint": "search"}}
{"action": "assistant", "params": {"query": "How long does it take a car battery to die if you leave the lights on?", "hint": "search"}}
{"action": "assistant", "params": {"query": "How many carbs are in Buddha gluten-free hamburger buns?", "hint": "search"}}
{"action": "assistant", "params": {"query": "How long does marking spray paint last on cement?", "hint": "search"}}
{"action": "assistant", "params": {"query": "What song is this?"}}
{"action": "assistant", "params": {"query": "flip a coin"}}
{"action": "assistant", "params": {"query": "how much data have I used this month?", "hint": "settings"}}
{"action": "assistant", "params": {"query": "27.1 * 91", "hint": "calculator"}}
{"action": "assistant", "params": {"query": "turn off data", "hint": "settings"}}
{"action": "assistant", "params": {"query": "turn on low battery mode", "hint": "settings"}}
{"action": "assistant", "params": {"query": "remove ibuprofen from the shopping list", "hint": "notes"}}
{"action": "assistant", "params": {"query": "check if <name> responded to my email", "hint": "email"}}
{"action": "assistant", "params": {"query": "check for new emails", "hint": "email"}}
{"action": "assistant", "params": {"query": "meetings last thursday", "hint": "calendar"}}
{"action": "assistant", "params": {"query": "do I have anything scheduled for tonight?", "hint": "calendar"}}
{"action": "assistant", "params": {"query": "play music that complements this view", "hint": "music"}}
{"action": "assistant", "params": {"query": "turn volume to 70%", "hint": "music"}}
{"action": "assistant", "params": {"query": "dim lights to 20%", "hint": "home"}}
{"action": "assistant", "params": {"query": "set AC to 70 degrees", "hint": "home"}}
{"action": "assistant", "params": {"query": "play inception on chromecast", "hint": "home"}}
{"action": "assistant", "params": {"query": "set alarm for 8am", "hint": "alarm"}}
{"action": "assistant", "params": {"query": "check active stopwatch", "hint": "stopwatch"}}
{"action": "assistant", "params": {"query": "start stopwatch", "hint": "stopwatch"}}
{"action": "assistant", "params": {"query": "set timer for 5 minutes", "hint": "timer"}}
{"action": "assistant", "params": {"query": "weather this sunday", "hint": "weather"}}
{"action": "assistant", "params": {"query": "next full moon", "hint": "weather"}}
{"action": "assistant", "params": {"query": "send message to <name>", "hint": "messages"}}
{"action": "assistant", "params": {"query": "tell <name> I'll be 5 minutes late", "hint": "messages"}}
{"action": "assistant", "params": {"query": "call <name>", "hint": "phone"}}
{"action": "assistant", "params": {"query": "show notifications", "hint": "notifications"}}

Memory: The memory app can store memories and retrieve them later. Memories can be enrolled manually in the app, by the user telling the memory app to remember something explicitly. Memories can also be automatically enrolled without requiring any action from the user. For example, if the user is looking at a shopping list, the memory app might automatically remember that shopping list so that it can be retrieved later.

API format:
action: "memory"
params:
  query: str, adapted from the user query
  memory_query_type: str, one of "store", "retrieve"

And here are some example actions for this app:
{"action": "memory", "params": {"query": "remember this", "memory_query_type": "store"}}
{"action": "memory", "params": {"query": "remember to never order this wine again", "memory_query_type": "store"}}
{"action": "memory", "params": {"query": "remember what I ordered here", "memory_query_type": "store"}}
{"action": "memory", "params": {"query": "remember what John is allergic to", "memory_query_type": "store"}}
{"action": "memory", "params": {"query": "remember this place", "memory_query_type": "store"}}
{"action": "memory", "params": {"query": "remember this", "memory_query_type": "retrieve"}}
{"action": "memory", "params": {"query": "what is john allergic to?", "memory_query_type": "retrieve"}}
{"action": "memory", "params": {"query": "what do I have to buy here?", "memory_query_type": "retrieve"}}

Language: The language application is an application that can either transcribe what the user is hearing right now, translate what the user is reading or hearing, determining what language is spoken.

API format:
"action": "language"
  "params"
    "query": str, adapted from the user query
    "language_query_type": str, one of "translate", "transcribe", "detect", "summarize"
    "source_language": str, optional, only specified if the user specified it
    "target_language": str, optional, only specified if the user specified it

And here are some example actions for this app:
{"action": "language", "params": {"query": "What language is this person speaking?", "language_query_type": "detect"}}
{"action": "language", "params": {"query": "Is he speaking Japanese?", "language_query_type": "detect"}}
{"action": "language", "params": {"query": "transcribe from spanish", "language_query_type": "transcribe", "source_language": "spanish"}}
{"action": "language", "params": {"query": "translate from french to english", "language_query_type": "translate", "source_language": "french", "target_language": "english"}}
{"action": "language", "params": {"query": "transcribe", "language_query_type": "transcribe"}}
{"action": "language", "params": {"query": "translate text", "language_query_type": "translate"}}
{"action": "language", "params": {"query": "summarize what we just talked about", "language_query_type": "summarize"}}

Maps: The maps application can help the user find relevant places nearby, plan routes, estimate distances and navigate to places.

API format:
action: "maps"
  params:
    query: str, adapted from the user query
    mode: str, optional, one of "walking", "cycling", "public_transport", "driving", "taxi"

And here are some example actions for this app:
{"action": "maps", "params": {"query": "closest grocery store"}}
{"action": "maps", "params": {"query": "gas station on the way to the airport"}}
{"action": "maps", "params": {"query": "nearest Starbucks", "mode": "walking"}}
{"action": "maps", "params": {"query": "directions home"}}
{"action": "maps", "params": {"query": "What is the ETA?"}}
{"action": "maps", "params": {"query": "Directions to the park"}}

Instructions: This app can give detailed and step-by-step instructions to the user.

API format:
action: "instructions"
params:
  query: str, adapted from the user query

And here are some example actions for this app:
{"action": "instructions", "params": {"query": "how to repair a bike tire"}}
{"action": "instructions", "params": {"query": "How to knit a scarf?"}}
{"action": "instructions", "params": {"query": "How do I bake a sourdough bread?"}}
{"action": "instructions", "params": {"query": "How to perform basic first aid for cuts?"}}

Here is a list of proactive query examples for different contexts:

If taking medicine, then log that medicine was likely taken, potentially store photo (mapping to app: Memory)
If getting home, then notify for reminders/messages/memories that have location triggers  (mapping to app: Notifications, Memory)
If arriving at store where items on shopping list can be found, then open shopping list (mapping to app: Assistant, Memory)
If 2 hours until potluck, and I said I'm bringing apple pie, then show a reminder (mapping to app: Memory, Assistant, Calendar)
If time to take antibiotics, based on doctor's prescription, then show a reminder (mapping to app: Memory, Assistant, Calendar)
If engaging in a conversation, then enable Do Not Disturb in the system  (mapping to app: Assistant)
If eating something, then remember what was eaten (mapping to app: Memory)
If driving, then auto Do-Not-Disturb, store activity log? (mapping to app: Assistant, Memory)
If my child is laughing in my home, then record image/video clip to memory/photos? (mapping to app: Memory)
If leaving the office, then message partner that on the way home (mapping to app: Messages)
If leaving a building, then offer directions to the next likely location (mapping to app: Maps)
If in a meeting with person X / about subject Y, then reminder of unmentioned agenda items (mapping to app: Memory, Notifications)
If looking at a landmark, then ask if user wants to learn about the history (mapping to app: Maps)
If someone shows up at home while not there, then notify the user and allow talking to that person (mapping to app: Home (Nest))
If starting / stopping exercise, then ask if the user wants to log the activity (mapping to app: Fitness)

Here is a list of object-centered queries for different kinds of objects:

Generic: What can I do with this?, Show carbon footprint, Show how to dispose of this, Send a photo of this to <name>, Remember this, Add to to-do list
Smart home (Speaker): Music controls: pause, play, next, previous
Smart home (Lamp): Change color
Smart home (Lamp, AC): Dim, regulate, Turn on/off
Book: Mark book as read, Listen as audiobook, Show reading history, Compare to other books from the same author
Food: What can I make with this?, Show stock level at home, Show which of my saved recipes use this item, Log into nutrition tracker app, Tell me when I last consumed this, Assess freshness or expiry, Show product origin, Show nutrition, ingredients and allergens
Household: Show user manual, Show tutorial for use
Clothes or furniture: Show last cleaned, Mark as cleaned, Check warranty, Show materials, Show cleaning instructions, Change color
Plants: Show care instructions, Predict health, Mark as watered, Show last watered, Mark as trimmed, Show last trimmed, Mark as soiled, Show last soiled
"""

# One-shot example appended to the system prompt: an indexed narration and the
# expected JSON response. The response part must itself be strictly valid JSON
# (no trailing commas), since the model's output is parsed with json.loads and
# the model tends to mirror the example's syntax.
example = """\
Here is an example of this task:

Input:
   0 #C C looks around
   1 #C C interacts with lady x
   2 #C C looks around
   3 #C C walks
   4 #C C interacts
   5 #C C looks around
   6 #C C walks
   7 #C C looks around
   8 #C C walks
   9 #C C looks around
  10 #C C does something #unsure
  11 #C C interacts
  12 #C C walks
  13 #C C interacts with man y
  14 #C C walks
  15 #C C interacts
  16 #C C looks around
  17 #C C walks
  18 #C C interacts
  19 #C C looks around
  20 #C C walks
  21 #C C interacts
  22 #C C looks around
  23 #C C walks
  24 #C C looks around
  25 #C C interacts
  26 #C C looks around
  27 #C C walks
  28 #C C looks around in the supermarket.
  29 #C C walks around next to coffee maker machines.
  30 #C C looks towards a coffee maker machine.
  31 #C C walks around next to coffee maker machines.
  32 #C C looks towards a coffee maker machine.
  33 #C C looks around a cup lid dispenser.
  34 #C C walks around in the supermarket.
  35 #C C looks towards a display counter.
  36 #C C looks around in the supermarket.
  37 #C C lifts a plastic tumbler.
  38 #C C moves around in the supermarket.
  39 #C C looks around a cup lid dispenser.
  40 #C C looks around in the supermarket.
  41 #C C walks around in the supermarket.
  42 #C C views items on supermarket shelving.
  43 #C C walks around in the supermarket.
  44 #C C views items on supermarket shelving.

Response:
{
  "thoughts": "The user is in a supermarket, probably shopping. They have been \
walking around, probably looking for something. At line [28], they first \
looked around in the supermarket, so that would be a natural time for them to \
ask their AR glasses for help. It's unlikely that the AR glasses could help \
navigate the supermarket because I don't know of any service or app that would \
do that. However, they could ask the AR glasses to open their shopping list. \
Then it could display items one by one as the user goes through the \
supermarket.",
  "intent": {
    "timestamp": 28,
    "description": "Open the shopping list",
    "query": "Show me my shopping list"
  },
  "action": {
    "action": "assistant",
    "params": {
      "query": "show me my shopping list",
      "hint": "notes"
    }
  },
  "confidence": {
    "timing_confidence": 0.8,
    "query_confidence": 0.8,
    "action_confidence": 0.9
  },
  "assumptions": {
    "system_assumptions": "the user has a shopping list",
    "user_assumptions": ""
  }
}"""

# Explanation of the narration syntax (#C = camera wearer, #O = someone else)
# that is interpolated into the system prompt. No trailing newline.
narration_format = (
    "We will provide a narrated user journey in the following format:\n"
    "\n"
    "(...)\n"
    "#C C interacts with the man Y\n"
    "#C C raises a boot\n"
    "#C C wears the boot on her left leg\n"
    "#O The man Y walks out of the bedroom\n"
    "#O The man Y walks into the bedroom\n"
    "#O The man Y drops the boots on the floor\n"
    "(...)\n"
    "\n"
    "where #C shows that the sentence is about an action that you are doing, "
    "and #O shows that the sentence is about an action that someone else is doing."
)

# Description of the JSON response schema that is interpolated into the system
# prompt. The confidence fields are floats between 0 and 1, matching the worked
# example given to the model (0.8 / 0.9). No trailing newline.
json_format = """\
The format of your response should be in JSON, in which you first write out \
your thoughts, then write out the user query that the user would be asking in \
their particular situation, then the line of the narrations at which the user \
is asking the query, and finally the action that the AR glasses should take to \
respond to the user query. Here is the JSON response format:

thoughts: str   # analyze situation, rationale for suggestion
intent:
  timestamp: int   # when to ask the query
  description: str   # what the user may want to do in this situation
  query: str   # query that the user might ask the glasses
action:
  action: str   # action that the glasses should take in response to the query
  params: dict   # according to the action specification
confidence:
  timing_confidence: float   # between 0 and 1, how confident that this is a good time
  query_confidence: float   # between 0 and 1, how confident that this is a useful/helpful query
  action_confidence: float   # between 0 and 1, how confident that this is the correct and valid action
assumptions:
  system_assumptions: Optional[str]   # assumptions about the system state (e.g. memory)
  user_assumptions: Optional[str]   # assumptions about the user (e.g. vegetarian)\
"""

# Full system prompt placed ahead of each narration batch. It stitches together
# the narration format description, the action catalogue, the JSON response
# schema and a worked example. Paragraphs are separated by a blank line; a
# trailing backslash is only used to wrap long lines *within* a paragraph (a
# backslash on a paragraph's last line would swallow the blank line after it).
system_prompt = f"""\
You are a user experience researcher and you are helping us collect a dataset \
of useful interactions for augmented reality (AR) glasses. We have a set of \
applications and services on the AR glasses already, and our current goal is \
to effectively link user queries to system actions on the AR glasses, in a \
wide variety of contextual settings and use case scenarios. To do this, we \
have a dataset of narrated user journeys, of what a particular user has been \
doing with their AR glasses in the last 10-30 minutes.

{narration_format}

{sys_prompt_actions.strip()}

Given the narrations of what the user has been doing, your task is to read \
through the situation description and think about when the user would ask the \
AR glasses for help, and what they would ask their AR glasses to do for them, \
given the action list above that the AR glasses support.

You can feel free to imagine additional circumstances that are not explicitly \
mentioned in the situation description. For example, if the situation only \
mentions that the user is walking around in the supermarket, you can imagine \
that the user is looking for a particular product, like cheese - even though \
'cheese' is never mentioned in the narration.

You should pick the single most appropriate time at which the user would ask \
their AR glasses for help, and then respond with a query for the AR glasses. \
This query can be a question or a command.

{json_format}

{example}
"""

## Helper functions

In [None]:
def get_gemini_model(model_name: str):
    """Select the Gemini model for the given name.

    NOTE: the branches for the supported names are placeholders (`pass`) in this
    notebook, so the function currently returns None for "mm" and "lm".

    Args:
      model_name: either "mm" or "lm".
    Returns:
      None for the supported names (placeholder implementation).
    Raises:
      ValueError: if `model_name` is neither "mm" nor "lm".
    """
    if model_name == "mm":
        pass
    elif model_name == "lm":
        pass
    else:
        # Build one formatted message: passing the name as a second positional
        # argument would make str(exc) render as a tuple.
        raise ValueError(f"Unsupported model name: {model_name}")

def get_gemini_response(gemini_model, query: str):
    """Placeholder for querying `gemini_model` with the prompt text `query`.

    No request is made in this notebook version; the stub returns None.
    """
    return None

def load_ego4d_narration_data():
    """Read the Ego4D narration JSON file (`annotations/narration.json`).

    Placeholder in this notebook version: nothing is loaded and None is returned.
    """
    return None

def check_db_connection():
    """Report whether the database connection works (True) or not (False).

    Placeholder in this notebook version: no check is performed and None is
    returned.
    """
    return None

def store_data(df_data):
    """Persist every row of `df_data` to the sheet; raise if the write fails.

    Placeholder in this notebook version: nothing is written and None is
    returned.
    """
    return None

def log_error(video_id: str, error: str):
    """Record a failure for `video_id`; True on success, False otherwise.

    Placeholder in this notebook version: nothing is logged and None is returned.
    """
    return None

def get_existing_video_ids():
    """Return the sorted list of video IDs that have already been processed.

    Placeholder in this notebook version: no lookup happens and None is returned.
    """
    return None

In [None]:
def flatten_mixed_dict(d):
  """Collapse a dict whose values are plain values or nested dicts into one level.

  A top-level value that is not a dict is copied under its own key. For a dict
  value the outer key is dropped and every inner entry is lifted to the top
  level under its stringified inner key; an inner value that is itself a dict
  is serialized with `json.dumps`. On key collisions the entry seen later wins.
  """
  flat = {}
  for key, value in d.items():
    if not isinstance(value, dict):
      # one-level entry: keep as is
      flat[key] = value
      continue
    # two-level entry: hoist the inner items
    for sub_key, sub_value in value.items():
      flat[f"{sub_key}"] = (
          json.dumps(sub_value) if isinstance(sub_value, dict) else sub_value
      )
  return flat


def load_narration_data():
  """Load the Ego4D narrations and keep only the summary and narration texts.

  Returns:
    dict mapping video id -> {narration pass name -> (summaries, narrations)},
    where `summaries` is the list of `summary_text` values and `narrations` the
    list of `narration_text` values of that pass. Only keys containing
    'narration_pass' are kept for each video.
  """
  raw = load_ego4d_narration_data()

  # build a slimmer dictionary that drops all unused metadata
  compact = {}
  for vid_id, entry in raw.items():
    compact[vid_id] = {
        pass_name: (
            [s['summary_text'] for s in entry[pass_name]['summaries']],
            [n['narration_text'] for n in entry[pass_name]['narrations']],
        )
        for pass_name in entry
        if 'narration_pass' in pass_name
    }
  return compact


def sample_narration(
    narration: dict[str, dict[str, tuple[list[str], list[str]]]],
    video_id: str,
    n_tries: int = 1,
):
  """Fetch the first-pass narration of a video, or of a randomly drawn one.

  Args:
    narration: Ego4D narration dictionary mapping video id -> narration pass
      name -> (summaries, narration texts).
    video_id: id of the video to fetch; pass None to draw a random video.
    n_tries: maximum number of random draws when `video_id` is None. A fixed
      `video_id` is checked once, since retrying cannot change the outcome.
  Returns:
    (key, narr, summ): video id, list of narration texts, list of summaries.
    (None, None, None) if no usable entry was found: unknown id, missing
    'narration_pass_1', no narrations, no summaries, joined summaries shorter
    than 5 characters, or an empty `narration` dictionary.
  """
  if video_id is None:
    # Nothing to draw from: report failure instead of crashing in `random`.
    if not narration:
      return None, None, None
    candidates = list(narration.keys())
    attempts = n_tries
  else:
    candidates = None
    attempts = min(n_tries, 1)

  for _ in range(attempts):
    key = random.choice(candidates) if video_id is None else video_id

    passes = narration.get(key)
    if passes is None or 'narration_pass_1' not in passes:
      continue

    summ, narr = passes['narration_pass_1']
    # skip entries without narrations or without a meaningful summary
    if len(narr) == 0 or len(summ) == 0 or len(''.join(summ)) < 5:
      continue

    return key, narr, summ

  return None, None, None


def process_data(data):
  """Convert raw suggestion records into a flat pandas DataFrame.

  Every dict in `data` yields one row built from its flattened "thoughts",
  "intent", "action", "confidence" and "assumptions" entries plus video and
  batch metadata. A record carrying an "another_suggestion" entry yields a
  second row for that suggestion. Entries that are not dicts are skipped.
  """
  fields = ["thoughts", "intent", "action", "confidence", "assumptions"]
  rows = []
  for record in data:
    if not isinstance(record, dict):
      continue

    # row for the primary suggestion
    first = flatten_mixed_dict({k: record[k] for k in fields})
    video = record['video']
    first.update({
        "video_id": video['video_id'],
        "batch_idx": record['batch_idx'],
        "batch_size": record['batch_size'],
        "model_name": record['model_name'],
        "time_added": record['time_added'],
        "user_prompt": video['user_prompt'],
        "summary": video['summary'],
    })
    rows.append(first)

    if "another_suggestion" not in record:
      continue

    # row for the follow-up suggestion
    extra = record["another_suggestion"]
    second = flatten_mixed_dict(
        {k: v for k, v in extra.items() if k not in ("batch_idx", "model_name")}
    )
    second.update({
        "video_id": video['video_id'],
        "batch_idx": extra['batch_idx'],
        "model_name": extra['model_name'],
        "batch_size": extra['batch_size'],
        "time_added": extra['time_added'],
        "user_prompt": video['user_prompt'],
        "summary": video['summary'],
    })
    rows.append(second)

  return pd.DataFrame(rows)

def write_data(data):
  """Flatten `data` and store it; return True on success, False on any error.

  Failures are reported on stdout rather than raised, so the caller can keep
  the records and retry later.
  """
  try:
    store_data(process_data(data))
  except Exception as e:
    print("EXCEPTION when writing data", type(e), e)
    return False
  return True

In [None]:
class DataGenerator:
  """Generates LLM action suggestions for Ego4D narrations and stores them."""

  def __init__(self, model_name: str):
    """Data Generator.

    Loads the narration data, selects the Gemini model and checks that the
    data storage is reachable.

    Args:
      model_name: model name, either "lm" or "mm"
    Raises:
      AssertionError: if the database connection check fails.
    """
    # load narration data
    print("Loading Ego4D narration data", end="\r")
    self.narration = load_narration_data()
    print("Successfully loaded narration data")

    # initialize gemini model
    self.model_name = model_name
    self.gemini_model = get_gemini_model(model_name)

    # make sure data storage is set up
    assert check_db_connection(), "Database connection check failed"


  def get_llm_responses(self, full_prompt: str) -> "dict | str":
    """Get LLM response.

    Queries the model once; if the reply is not valid JSON, the model is asked
    once to repair it. On success a second, different suggestion is requested
    and attached under the key 'another_suggestion' when it parses to a JSON
    object.

    Args:
      full_prompt: full prompt
    Returns:
      response_data: the parsed JSON response (normally a dict in the specified
      JSON format), or an error string if parsing failed with something other
      than a JSON decode error.
    Raises:
      json.JSONDecodeError: if the repaired response is still not valid JSON.
    """

    # get initial response
    response = get_gemini_response(self.gemini_model, full_prompt)

    # try parsing JSON - if it fails, ask LLM to fix it
    try:
      response_data = json.loads(response)
    except json.decoder.JSONDecodeError as e:
      new_prompt = FIX_JSON_PROMPT.format(full_prompt=full_prompt, response=response, e=e)
      response = get_gemini_response(self.gemini_model, new_prompt)
      response_data = json.loads(response)
    except Exception as e:
      return f"Exception when fixing JSON: {type(e)} {e}"

    # a second suggestion can only be attached to a JSON object
    if not isinstance(response_data, dict):
      return response_data

    # ask model for another suggestion - if it fails, don't add it
    try:
      prompt2 = ANOTHER_SUGGESTION_PROMPT.format(full_prompt=full_prompt, response=response)
      response2 = get_gemini_response(self.gemini_model, prompt2)
      response_data2 = json.loads(response2)
    except Exception:
      # TODO: log that the second suggestion failed
      return response_data

    # Only attach JSON objects: callers add metadata keys to the suggestion,
    # which would raise on a list / string / number.
    if isinstance(response_data2, dict):
      response_data['another_suggestion'] = response_data2
    return response_data


  def annotate_video(
    self, video_id: str, max_n_sentences: int = 150, iterator: tqdm = None,
    min_n_sentences: int = 50,
  ) -> "list | None":
    """Annotate a single video from the given video_id, and store the results.

    The narration is split into batches of at most `max_n_sentences` sentences.
    Each batch is sent to the LLM and the parsed suggestions are written to
    storage. Processing stops at the first batch with fewer than
    `min_n_sentences` sentences.

    Args:
      video_id: id of the video to annotate.
      max_n_sentences: maximum number of narration sentences per batch.
      iterator: optional tqdm progress bar whose postfix is updated per batch.
      min_n_sentences: minimum number of sentences a batch needs to be used.
    Returns:
      None if everything was stored (or nothing had to be stored); otherwise
      the list of records that could not be written.
    """
    # load narration & summary
    x, narr, summ = sample_narration(self.narration, video_id)
    if x is None or x != video_id:
      log_error(video_id, "Failed to load narrations")
      return None

    # split narrations into batches of at most `max_n_sentences` sentences;
    # stepping by the batch size never yields an empty trailing batch
    batches = [
        narr[start:start + max_n_sentences]
        for start in range(0, len(narr), max_n_sentences)
    ]

    data = []
    # iterate over narration batches
    for batch_idx, narr_sub in enumerate(batches):
      # set postfix on the tqdm iterator
      if iterator is not None:
        postfix_str = f"video {video_id.split('-')[0]}"
        postfix_str += f", batch {batch_idx+1}/{len(batches)}"
        iterator.set_postfix_str(postfix_str)

      # make sure there are enough sentences in this batch
      if len(narr_sub) < min_n_sentences:
        if batch_idx == 0:
          log_error(video_id, f"Less than {min_n_sentences} sentences in this batch, skipping.")
        break

      # setup prompts; sentences are numbered from 0 within each batch
      indexed_narrations = '\n'.join(f"{i:3} {e}" for i, e in enumerate(narr_sub))
      user_prompt = f"\n{indexed_narrations}\n\nResponse:\n"
      full_prompt = f"\n{system_prompt}\n\n{user_prompt}"

      # generate LLM response
      try:
        d = self.get_llm_responses(full_prompt)
      except Exception as e:
        log_error(video_id, f"Exception raised when generating LLM response: {type(e)} {e}")
      else:
        if isinstance(d, dict):
          # parsed successfully -> store data
          time_added = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
          d['batch_idx'] = batch_idx
          d['model_name'] = self.model_name
          d['batch_size'] = max_n_sentences
          d['time_added'] = time_added
          another = d.get("another_suggestion")
          if isinstance(another, dict):
            another['batch_idx'] = batch_idx
            another['model_name'] = self.model_name
            another['batch_size'] = max_n_sentences
            another['time_added'] = time_added
          elif "another_suggestion" in d:
            # unusable follow-up suggestion: drop it rather than fail later
            del d["another_suggestion"]

          d['video'] = {
              'video_id': video_id,
              'user_prompt': user_prompt,
              'system_prompt': system_prompt,
              'narration_pass': 'narration_pass_1',
              'summary': '\n'.join(summ),
              'max_lines': max_n_sentences,
          }
          data.append(d)
        elif isinstance(d, str):
          log_error(video_id, d)
        else:
          print("ERROR: unrecognized return type from get_llm_responses")

      # write this batch's output to the google sheet; records that could not
      # be written are kept and retried together with the next batch
      if data and write_data(data):
        data = []

    # make sure all data from this video is written to the google sheet
    if data and not write_data(data):
      print("failed to write data to sheet")
      return data

    return None

## Annotate the data

In [None]:
# Build the generator with the "mm" model. Construction loads the narration
# data and asserts that the database connection check passes.
datagen = DataGenerator(model_name="mm")

Loading Ego4D narration dataSuccessfully loaded narration data


In [None]:
# Annotate every video that has not been processed yet, in descending id order.
existing_video_ids = get_existing_video_ids()
keys = sorted(set(datagen.narration).difference(existing_video_ids), reverse=True)
iterator = tqdm(enumerate(keys), total=len(keys), leave=True, ncols=750, desc="Ego4D videos")
for idx, video_id in iterator:
  data = datagen.annotate_video(video_id, iterator=iterator, max_n_sentences=200, min_n_sentences=50)
  if data is None:
    continue
  # failed to save data -> print and continue
  print(json.dumps(data, indent=2), end="\n" + "-"*20 + "\n")

Ego4D videos:   0%|                                                                                           …