## Goal Prediction
### 1. Make vectorstore

In [1]:
import openai
import langchain
import logging
import json
import os
import pandas as pd
import pickle
from IPython.display import Image, display

# Annotation root; every dataset directory and vector-store directory below
# is derived from it (relative to the notebook's working directory).
data_path = '../data/ego4d_annotation/'

# Per-dataset annotation directories. The trailing slash is kept so that
# file/directory names can be appended directly.
GOALSTEP_ANNOTATION_PATH = f"{data_path}goalstep/"
SPATIAL_ANNOTATION_PATH = f"{data_path}spatial/"

# Directories in which each dataset's FAISS index is saved to / loaded from.
GOALSTEP_VECSTORE_PATH = f"{GOALSTEP_ANNOTATION_PATH}goalstep_docarray_faiss"
SPATIAL_VECSTORE_PATH = f"{SPATIAL_ANNOTATION_PATH}spatial_docarray_faiss"


# Embedding Database
import database
from langchain_openai.embeddings import OpenAIEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
import sys
sys.path.append(os.path.abspath('/usr/local/lib/python3.10/dist-packages'))

from langchain.vectorstores import FAISS
from langchain_community.vectorstores import DocArrayInMemorySearch # do not use this!
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
import docarray


# Gather the video entries of each dataset from its annotation directory.
goalstep_videos_list = database.merge_json_video_list(GOALSTEP_ANNOTATION_PATH)
spatial_videos_list = database.merge_json_video_list(SPATIAL_ANNOTATION_PATH)
print(f"goalstep vids: {len(goalstep_videos_list)} and spatial vids: {len(spatial_videos_list)}")

# Turn the video lists into document lists that the vector stores are built from.
# TODO: when making document list, make sure each segment's parents are well listed!
goalstep_document_list = database.make_goalstep_document_list(goalstep_videos_list)
# NOTE: this name has no `_list` suffix although the printed label below does;
# the vector-store cell refers to it as `spatial_document`.
spatial_document = database.make_spatial_document_list(spatial_videos_list)

# Report how many documents each dataset produced.
for label, documents in (
    ("goalstep_document_list", goalstep_document_list),
    ("spatial_document_list", spatial_document),
):
    print(f"{label}: {len(documents)}")

goalstep vids: 717 and spatial vids: 50
goalstep_document_list: 39979
spatial_document_list: 588


### 1-1. LOAD vectorstore and start instantly

In [2]:
# Embedding Database
import database
from langchain_openai.embeddings import OpenAIEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
import sys
sys.path.append(os.path.abspath('/usr/local/lib/python3.10/dist-packages'))

from langchain.vectorstores import FAISS
from langchain_community.vectorstores import DocArrayInMemorySearch # do not use this!
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

embeddings = OpenAIEmbeddings()

# Build each FAISS vector store once and cache it on disk; on later runs the
# cached index is loaded instead (building is the slow part, ~3 min).
# A store that was just built is used directly from memory rather than being
# written out and immediately read back.
# NOTE(security): FAISS.load_local unpickles the saved docstore, which is why
# allow_dangerous_deserialization=True is required. Only point these paths at
# index directories produced by this notebook's own save_local calls.
if os.path.exists(GOALSTEP_VECSTORE_PATH + '/index.faiss'):
    print(f"LOAD FAISS GOALSTEP {GOALSTEP_VECSTORE_PATH}")
    goalstep_vector_store = FAISS.load_local(
        GOALSTEP_VECSTORE_PATH, embeddings, allow_dangerous_deserialization=True
    )
else:
    # Requires `goalstep_document_list` from the "Make vectorstore" cell.
    print(f"MAKE FAISS GOALSTEP {GOALSTEP_VECSTORE_PATH}")
    goalstep_vector_store = FAISS.from_documents(goalstep_document_list, embeddings)
    goalstep_vector_store.save_local(GOALSTEP_VECSTORE_PATH)

if os.path.exists(SPATIAL_VECSTORE_PATH + '/index.faiss'):
    print(f"LOAD FAISS SPATIAL: {SPATIAL_VECSTORE_PATH}")
    spatial_vector_store = FAISS.load_local(
        SPATIAL_VECSTORE_PATH, embeddings, allow_dangerous_deserialization=True
    )
else:
    # Requires `spatial_document` from the "Make vectorstore" cell.
    print(f"MAKE FAISS SPATIAL {SPATIAL_VECSTORE_PATH}")
    spatial_vector_store = FAISS.from_documents(spatial_document, embeddings)
    spatial_vector_store.save_local(SPATIAL_VECSTORE_PATH)

# for doc in goalstep_vector_store.docstore._dict.values():
#     print(f"{doc.page_content}")

LOAD FAISS GOALSTEP ../data/ego4d_annotation/goalstep/goalstep_docarray_faiss
LOAD FAISS SPATIAL: ../data/ego4d_annotation/spatial/spatial_docarray_faiss


### 2. Make Input from Source (WIP)
- extract input sequence and spatial context from test video

In [3]:
print(os.getcwd())
import sys
import input_source

# Ask which video to use as the test input (position in the video lists).
input_video_idx = int(input("what is the video idx for input?: "))
# goalstep_videos_list = database.merge_json_video_list(GOALSTEP_ANNOTATION_PATH)
print(len(goalstep_videos_list))

# The same positional index is applied to BOTH lists, so it must be valid for
# the shorter one. Fail with an explicit message instead of a bare
# "list index out of range" halfway through the cell.
# NOTE(review): the two lists have different lengths (717 vs 50 in the recorded
# run), so position i presumably does not refer to the same video in both;
# pairing by video uid is probably what is wanted — TODO confirm.
shared_len = min(len(goalstep_videos_list), len(spatial_videos_list))
if not -shared_len <= input_video_idx < shared_len:
    raise IndexError(
        f"video idx {input_video_idx} is out of range: it must index both "
        f"goalstep_videos_list (len {len(goalstep_videos_list)}) and "
        f"spatial_videos_list (len {len(spatial_videos_list)})"
    )

goalstep_video = goalstep_videos_list[input_video_idx]
spatial_video = spatial_videos_list[input_video_idx]

# Extract the action segments and the spatial context that form the model input.
input_goalstep_segments = input_source.extract_lower_goalstep_segments(goalstep_video)
input_spatial_context = input_source.extract_spatial_context(spatial_video)

/root/project/script_predict_goal
717


### 3. Query from Database (WIP)

In [19]:
import query

def _search_parent_documents(vector_store, metafilter, label):
    """Run a metadata-filtered search and collect the matching documents.

    Args:
        vector_store: vector store exposing ``similarity_search_with_score``.
        metafilter: metadata filter dict passed as ``filter=``.
        label: dataset name used in the printed diagnostics.

    Returns:
        ``(search_results, parent_documents)``: the raw ``(document, score)``
        pairs and the list of documents taken from them. Both are empty lists
        when nothing matches or when the search raises; the failure is printed
        rather than propagated so the rest of the notebook can continue.
    """
    try:
        search_results = vector_store.similarity_search_with_score(
            query="",
            filter=metafilter
        )
        if not search_results:
            print(f"No match: {label} parent")
            return [], []
        # Keep every matching document; the scores are not needed here.
        parent_documents = [document for document, _score in search_results]
    except Exception as e:
        print(f"Error: {label} parent search: {e}")
        return [], []
    return search_results, parent_documents


# Retriever & database query
goalstep_retriever = goalstep_vector_store.as_retriever()
spatial_retriever = spatial_vector_store.as_retriever()
input_query = query.return_source_input_query(input_goalstep_segments, input_spatial_context)
#print(input_query)


# retrieve from database (`invoke` replaces the deprecated `get_relevant_documents`)
retrieved_goalstep = goalstep_retriever.invoke(input_query)
retrieved_spatial = spatial_retriever.invoke(input_query)
print(retrieved_goalstep)
print(retrieved_spatial)

# Retrieve documents' parent documents with metadata search
# TODO: replace metafilter with real values extracted from input values
goalstep_metafilter = {"level": 1, "parent_id": 10}
spatial_metafilter = {"level": 1, "parent_id": 10}

# Search with filter inside the vector stores
goalstep_search_results, retrieved_goalstep_parent_documents = _search_parent_documents(
    goalstep_vector_store, goalstep_metafilter, "goalstep"
)

# TODO: spatial context only has initial spatial layout. Maybe we should not search this extensively.
spatial_search_results, retrieved_spatial_parent_documents = _search_parent_documents(
    spatial_vector_store, spatial_metafilter, "spatial"
)





[Document(metadata={'type': 'level3', 'video_uid': '28bc1ee7-b0c1-4f30-934a-0ab665779d90', 'parent_level1_start_time': 20.84818, 'start_time': 20.86482, 'end_time': 32.02167, 'step_category': 'General cooking activity: Organize and arrange cooking tools or utensils', 'step_description': 'organize the kitchen utensils '}, page_content='Level 3 Segment 1 for Level 2 Segment 2 in Video 28bc1ee7-b0c1-4f30-934a-0ab665779d90\nStep: organize the kitchen utensils '), Document(metadata={'type': 'level3', 'video_uid': '36248b1e-9ba7-4789-bbf5-b9e33e3ab408', 'parent_level1_start_time': 180.63289, 'start_time': 271.77552, 'end_time': 276.31074, 'step_category': 'Add ingredients to the recipe: Add onion to recipe', 'step_description': 'gather onions on the chopping board '}, page_content='Level 3 Segment 4 for Level 2 Segment 4 in Video 36248b1e-9ba7-4789-bbf5-b9e33e3ab408\nStep: gather onions on the chopping board '), Document(metadata={'type': 'level2', 'video_uid': '3c0dffd0-e38e-4643-bc48-d5139

### 4. Prompt > LLM (WIP)

In [44]:
from dotenv import load_dotenv
from langchain_openai.chat_models import ChatOpenAI

import prompt_source as promptSource
from langchain_core.output_parsers import StrOutputParser
from langchain.output_parsers import ResponseSchema, StructuredOutputParser

from langchain.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate

from langchain.chains import LLMChain
import logging

# Set logging to ERROR level to suppress INFO or DEBUG messages
logging.basicConfig(level=logging.ERROR)

# LLM and output parser. The API key is read from the environment after
# load_dotenv() has loaded any .env file.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
model = ChatOpenAI(openai_api_key=openai.api_key, model="gpt-3.5-turbo")
parser = StrOutputParser()
# parser = StructuredOutputParser.from_response_schemas([
#     ResponseSchema(name="answer", description="The answer to the question")
# ])

# make context # input_goalstep_segments, # input_spatial_context, # retrieved_goalstep, # retrieved_spatial, retrieved_goalstep_parent_documents, retrieved_spatial_parent_documents
# NOTE: `context` is not consumed by the chain below; the chain is fed through `inputs`.
context = promptSource.make_context(input_goalstep_segments, input_spatial_context, retrieved_goalstep, retrieved_spatial)
# print(f"role: {promptSource.role}")
# print(f"question: {promptSource.question}")
# print(f"printing actions {input_goalstep_segments}")
# print(f"This is the context: {context}")

# Make prompt with template.
# Every key supplied in `inputs` has a placeholder here. `relevant_actions`
# previously had none, so the retrieved goal-step documents never reached the LLM.
prompt = PromptTemplate.from_template(
        """You are an assistant for question-answering tasks.
    Use the following pieces of retrieved context to answer the question.
    If you don't know the answer, just say [don't know].

    #Role:
    {role}
    #Question:
    {question}
    #Action_Sequence:
    {action_sequence}
    #Spatial_Layout
    {spatial_layout}
    #Similar_Action_Example
    {relevant_actions}
    #Similar_Space_Example
    {relevant_space}
    """
)

# Define chain, make input, invoke chain.
# `role` and `question` are plain strings: this is a string template, so the
# chat-message dicts used before were rendered as their Python repr.
chain = prompt | model | parser
inputs = {
    "role": 'you are a helpful assisant that predicts the goal of the user inside a scene. You are given the actions of the user and the initial spatial layout of the scene.',
    "question": 'A person has performed the given actions in the form of a sequence of actions. What is the goal of the current user? Answer in one verb and a noun pair. surround the verb and a noun pair with ""',
    "action_sequence": input_goalstep_segments,
    "spatial_layout": input_spatial_context,
    "relevant_actions": retrieved_goalstep,
    "relevant_space": retrieved_spatial
}
response = chain.invoke(inputs)
print(response)

The goal of the current user is "making stew or soup."


### 5. Process Response and Compute Distance (WIP)

In [None]:
# Put the project root on the import path so packages that live outside this
# script folder (here: `util`) can be imported.
project_root = os.path.abspath('/root/project')
sys.path.append(project_root)
from util import metrics

# postprocess answers to get goals (not implemented yet)

# postprocess answers to get lv1 steps? (not implemented yet)

# Smoke check: confirms that a module from another folder imports and runs.
metrics.printtest()


In [None]:
# compute metrics



### 6. Configure Prompt2 for Target Space

In [None]:
import input_source
import input_target
import prompt_target

# target space input: the source-side inputs are reused as-is
source_goalstep_segments = input_goalstep_segments
source_spatial_context = input_spatial_context

# TODO: `target_video` is a placeholder; replace it with the real target video entry.
target_video = []
target_spatial_context = input_source.extract_spatial_context(target_video)

# make target query & retrieve (`invoke` replaces the deprecated `get_relevant_documents`)
input_query = query.return_target_input_query(source_goalstep_segments, source_spatial_context, target_spatial_context)
retrieved_goalstep = goalstep_retriever.invoke(input_query)
retrieved_spatial = spatial_retriever.invoke(input_query)

# Retrieve documents' parent documents for goalstep annotation


# Concat the retrieved results


# define prompt
# NOTE(review): this reuses the *source* template and context; the target-side
# retrieval results above are not fed into the prompt yet — TODO confirm intent.
prompt2 = ChatPromptTemplate.from_template(promptSource.template_source)
prompt2_inputs = {
    "context": promptSource.context,
    "question": promptSource.question,
    "rules": promptSource.rules,
}

# Define chain
parser = StrOutputParser()
chain2 = prompt2 | model | parser

# Get response. The template variables must be passed to `invoke`: calling it
# with no argument raises TypeError, and the result of the earlier
# `prompt2.format(...)` call was discarded, so it never reached the chain.
response2 = chain2.invoke(prompt2_inputs)

### 7. Configure Combined chain for simple one-go-prediction

In [None]:

# # chain can incorporate other chains
# chain_action = (
#     {"source_spatial_context": itemgetter(source_spatial_context), "goalstep": chain1, "target_spatial_context": itemgetter(target_spatial_context)} | prompt_action | model | parser
# )
from dotenv import load_dotenv
from langchain_openai.chat_models import ChatOpenAI

import prompt_source as promptSource
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate


# define llm (API key is read from the environment after load_dotenv())
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
model = ChatOpenAI(openai_api_key=openai.api_key, model="gpt-3.5-turbo")

# define prompt and the values for its template variables
prompt = ChatPromptTemplate.from_template(promptSource.template_source)
prompt_inputs = {
    "context": promptSource.context,
    "question": promptSource.question,
    "rules": promptSource.rules,
}

# Define chain
parser = StrOutputParser()
chain1 = prompt | model | parser

# Get response. The template variables must be passed to `invoke`: calling it
# with no argument raises TypeError, and the result of the earlier
# `prompt.format(...)` call was discarded, so it never reached the chain.
response = chain1.invoke(prompt_inputs)