## Goal Prediction
### 1. Make vectorstore

In [44]:
import openai
import langchain
import logging
import json
import os
import pandas as pd
import pickle
from IPython.display import Image, display

# configure paths
# Everything is derived from data_path, which is relative to the notebook's
# working directory. The *_VECSTORE_PATH values are the folders the FAISS
# vector stores are loaded from further down.
data_path = '../data/ego4d_annotation/'
GOALSTEP_ANNOTATION_PATH = f'{data_path}goalstep/'
SPATIAL_ANNOTATION_PATH = f'{data_path}spatial/'
GOALSTEP_VECSTORE_PATH = f'{GOALSTEP_ANNOTATION_PATH}goalstep_docarray_faiss'
SPATIAL_VECSTORE_PATH = f'{SPATIAL_ANNOTATION_PATH}spatial_docarray_faiss'


# Embedding Database
import database
from langchain_openai.embeddings import OpenAIEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
import sys

# Append the system-wide dist-packages directory to this kernel's import path.
# NOTE(review): presumably required because the packages imported below are
# installed there rather than in the kernel's own environment — confirm.
dist_packages_dir = os.path.abspath('/usr/local/lib/python3.10/dist-packages')
sys.path.append(dist_packages_dir)

from langchain.vectorstores import FAISS
from langchain_community.vectorstores import DocArrayInMemorySearch # do not use this!
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
import docarray


# extract videos list
# One list per annotation set, each built by database.merge_json_video_list
# from that set's annotation directory.
goalstep_videos_list = database.merge_json_video_list(GOALSTEP_ANNOTATION_PATH)
spatial_videos_list = database.merge_json_video_list(SPATIAL_ANNOTATION_PATH)
n_goalstep_videos = len(goalstep_videos_list)
n_spatial_videos = len(spatial_videos_list)
print(f"goalstep vids: {n_goalstep_videos} and spatial vids: {n_spatial_videos}")

# make document list(langchain.schema) from the video list
# TODO: when making document list, make sure each segment's parents are well listed!
# NOTE: the spatial list is bound to `spatial_document` (no `_list` suffix),
# unlike its goalstep counterpart.
goalstep_document_list = database.make_goalstep_document_list(goalstep_videos_list)
spatial_document = database.make_spatial_document_list(spatial_videos_list)
n_goalstep_documents = len(goalstep_document_list)
n_spatial_documents = len(spatial_document)
print(f"goalstep_document_list: {n_goalstep_documents}")
print(f"spatial_document_list: {n_spatial_documents}")

goalstep vids: 717 and spatial vids: 36
goalstep_document_list: 39979
spatial_document_list: 400


### 1-1. LOAD vectorstore and start instantly

In [45]:
# Embedding Database
import database
from langchain_openai.embeddings import OpenAIEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
import sys

# Append the system-wide dist-packages directory to this kernel's import path.
# NOTE(review): presumably required because the packages imported below are
# installed there rather than in the kernel's own environment — confirm.
dist_packages_dir = os.path.abspath('/usr/local/lib/python3.10/dist-packages')
sys.path.append(dist_packages_dir)

from langchain.vectorstores import FAISS
from langchain_community.vectorstores import DocArrayInMemorySearch # do not use this!
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

# Load the two FAISS vector stores from their saved folders; the same
# OpenAIEmbeddings instance is handed to both.
# SECURITY NOTE: allow_dangerous_deserialization=True lets FAISS.load_local
# unpickle the saved store. Only point these paths at index folders that were
# produced locally or come from a trusted source.
embeddings = OpenAIEmbeddings()
faiss_load_options = {"embeddings": embeddings, "allow_dangerous_deserialization": True}
goalstep_vector_store = FAISS.load_local(GOALSTEP_VECSTORE_PATH, **faiss_load_options)
spatial_vector_store = FAISS.load_local(SPATIAL_VECSTORE_PATH, **faiss_load_options)

# for doc in goalstep_vector_store.docstore._dict.values():
#     print(f"{doc.page_content}")

### 2. Make Input from Source (wip)
- extract input sequence and spatial context from test video

In [46]:
print(os.getcwd())
import sys
import input_source

# Ask which test video to use; the answer is a position in the video lists.
# int() raises ValueError when the answer is not an integer.
input_video_idx = int(input("what is the video idx for input?: "))
# goalstep_videos_list = database.merge_json_video_list(GOALSTEP_ANNOTATION_PATH)
print(len(goalstep_videos_list))

# The same index is applied to BOTH lists, so it has to be valid for the
# shorter one. Check this up front so an out-of-range choice produces an
# actionable message instead of a bare "list index out of range".
# NOTE(review): indexing both lists with one position presumably assumes they
# are aligned per video — confirm, or match the two on a shared video id.
n_goalstep_videos = len(goalstep_videos_list)
n_spatial_videos = len(spatial_videos_list)
indexable_video_count = min(n_goalstep_videos, n_spatial_videos)
if not -indexable_video_count <= input_video_idx < indexable_video_count:
    raise IndexError(
        f"video idx {input_video_idx} is out of range: goalstep has "
        f"{n_goalstep_videos} videos and spatial has {n_spatial_videos}; "
        f"the idx must be valid for both lists"
    )

goalstep_video = goalstep_videos_list[input_video_idx]
spatial_video = spatial_videos_list[input_video_idx]

# Inputs for the later query / prompt steps, extracted from the chosen video.
input_goalstep_segments = input_source.extract_lower_goalstep_segments(goalstep_video)
input_spatial_context = input_source.extract_spatial_context(spatial_video)

/root/project/script_predict_goal
717


### 3. Query from Database (WIP)

In [47]:
import query

# Retriever & database query
# One retriever per vector store; both are queried with the same text, which
# query.return_source_input_query builds from the input segments and context.
goalstep_retriever = goalstep_vector_store.as_retriever()
spatial_retriever = spatial_vector_store.as_retriever()
input_query = query.return_source_input_query(input_goalstep_segments, input_spatial_context)
#print(input_query)


# retrieve from database
# Retrievers implement the Runnable interface: invoke() is the supported entry
# point and replaces the deprecated get_relevant_documents().
retrieved_goalstep = goalstep_retriever.invoke(input_query)
retrieved_spatial = spatial_retriever.invoke(input_query)
print(retrieved_goalstep)
print(retrieved_spatial)

# Retrieve documents' parent documents with metadatasearch
# TODO: replace metafilter with real values extracted from input values
goalstep_metafilter = {"level": 1, "parent_id": 10}
spatial_metafilter = {"level": 1, "parent_id": 10}


def _search_parent_documents(vector_store, metafilter, label):
    """Run a metadata-filtered search and collect every matching document.

    Args:
        vector_store: object exposing ``similarity_search_with_score``.
        metafilter: dict passed to the search as ``filter``.
        label: dataset name used in the printed messages.

    Returns:
        A ``(search_results, parent_documents)`` tuple: the raw
        ``(document, score)`` pairs and the documents alone. Both are empty
        lists when nothing matched or when the search raised.
    """
    try:
        # NOTE(review): the query text is empty, so presumably only the
        # metadata filter is meant to narrow the results — confirm that the
        # embedding backend accepts an empty query.
        search_results = vector_store.similarity_search_with_score(
            query="",
            filter=metafilter
        )

        if not search_results:
            print(f"No match: {label} parent")
            return [], []

        # Keep every hit; the score is not needed downstream.
        return search_results, [document for document, _score in search_results]
    except Exception as e:
        # Best effort: report the failure and continue without parent documents.
        print(f"Error: {label} parent search: {e}")
        return [], []


# Search with filter inside vector_Store
goalstep_search_results, retrieved_goalstep_parent_documents = _search_parent_documents(
    goalstep_vector_store, goalstep_metafilter, "goalstep"
)

# TODO: spatial context only has initial spatial layout. Maybe we should not search this extensively.
spatial_search_results, retrieved_spatial_parent_documents = _search_parent_documents(
    spatial_vector_store, spatial_metafilter, "spatial"
)





[Document(metadata={'type': 'level2', 'video_uid': '45dc74e1-c8dd-443a-a7a6-ca4215144e97', 'start_time': 436.82178, 'end_time': 486.56667, 'step_category': 'Make baked goods: Deep fry donuts and pastries', 'step_description': 'Fry some jelabi pastries'}, page_content='Level 2 Segment 5 for Video 45dc74e1-c8dd-443a-a7a6-ca4215144e97\nStep: Fry some jelabi pastries'), Document(metadata={'type': 'level2', 'video_uid': '45dc74e1-c8dd-443a-a7a6-ca4215144e97', 'start_time': 0, 'end_time': 96.82035, 'step_category': 'Make baked goods: Deep fry donuts and pastries', 'step_description': 'Fry some jelabi pastries'}, page_content='Level 2 Segment 1 for Video 45dc74e1-c8dd-443a-a7a6-ca4215144e97\nStep: Fry some jelabi pastries'), Document(metadata={'type': 'level2', 'video_uid': '45dc74e1-c8dd-443a-a7a6-ca4215144e97', 'start_time': 407.20567, 'end_time': 436.68723, 'step_category': 'Make baked goods: Deep fry donuts and pastries', 'step_description': 'Fry some jelabi pastries'}, page_content='Leve

### 4. Prompt > LLM (WIP)

In [48]:
from dotenv import load_dotenv
from langchain_openai.chat_models import ChatOpenAI

import prompt_source as promptSource
from langchain_core.output_parsers import StrOutputParser
from langchain.output_parsers import ResponseSchema, StructuredOutputParser

from langchain.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate

from langchain.chains import LLMChain

# ready the documents for making the Prompt
# input_goalstep_segments
# input_spatial_context
# retrieved_goalstep
# retrieved_spatial
# retrieved_goalstep_parent_documents
# retrieved_spatial_parent_documents

# define llm
# Output parser applied to the chat model's reply at the end of the chain.
parser = StrOutputParser()
# parser = StructuredOutputParser.from_response_schemas([
    # ResponseSchema(name="answer", description="The answer to the question")
# ])

# The API key is read from the environment after load_dotenv() has run, then
# shared with both the openai module and the chat model.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key
model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=openai_api_key)

# make context
# Built from the input video's segments/context and the retrieved documents.
context = promptSource.make_context(
    input_goalstep_segments,
    input_spatial_context,
    retrieved_goalstep,
    retrieved_spatial,
)

# print(f"role: {promptSource.role}")
# print(f"question: {promptSource.question}")
# print(f"printing actions {input_goalstep_segments}")
print(context)



# Short template text; not referenced by the chain defined in this cell.
template_source = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

{role}
{question}
"""

# Template text for the goal-prediction prompt. Placeholders: role, question,
# action_sequence, spatial_layout, relevant_space.
goal_prompt_text = """You are an assistant for question-answering tasks. 
    Use the following pieces of retrieved context to answer the question. 
    If you don't know the answer, just say that you don't know.

    #Role:
    {role}
    #Question: 
    {question} 
    #Action_Sequence: 
    {action_sequence} 
    #Spatial_Layout
    {spatial_layout}

    #Similar_Space_Example
    {relevant_space}

    """
prompt = PromptTemplate.from_template(goal_prompt_text)

# Section currently left out of the template:
# #Similar_Actions_From_other_People
# {relevant_actions}

# Define chain
# prompt -> chat model -> output parser
chain = prompt | model | parser



# Values available for the prompt:
# input_goalstep_segments
# input_spatial_context
# retrieved_goalstep
# retrieved_spatial

# `context` is deliberately not rebound here, so the text built by
# promptSource.make_context stays available under that name.

# role / question are passed as plain strings: `prompt` is a string
# PromptTemplate, so chat-style message dicts would be pasted into the prompt
# as their Python repr (brackets, quotes and 'role'/'content' keys included).
inputs = {
    "role": 'you are a helpful assisant that predicts the goal of the user inside a scene. You are given the actions of the user and the initial spatial layout of the scene.',
    "question": 'A person has performed the given actions in the form of a sequence of actions.What is the goal of the current user? Provide only one answer in a verb and a noun pair. The noun, if possible should be as specific as possible',
    "action_sequence": input_goalstep_segments,
    "spatial_layout": input_spatial_context,
    # Not referenced by the current template (its section is commented out);
    # kept so the section can be re-enabled without touching this dict.
    "relevant_actions": retrieved_goalstep,
    "relevant_space": retrieved_spatial
}

response = chain.invoke(inputs)
print(response)


User performs an action sequence as follows {input_action_sequence}. The scene the user is in had the initial spatial layout as follows {'room1': [{'entity': {'type': 'avatar', 'name': 'player', 'status': 'stand'}, 'relation': 'has', 'target': {'type': 'item', 'id': 1, 'name': 'mustard spinach', 'status': 'unwashed'}}, {'entity': {'entity': {'type': 'item', 'id': 2, 'name': 'bowl', 'status': 'contain water'}, 'relation': 'has', 'target': {'type': 'item', 'id': 3, 'name': 'sieve', 'status': 'in water'}}, 'relation': 'has', 'target': {'type': 'item', 'id': 1, 'name': 'mustard spinach', 'status': 'unwashed'}}, {'entity': {'type': 'item', 'id': 4, 'name': 'pot', 'status': 'heating'}, 'relation': 'has', 'target': {'type': 'item', 'id': 5, 'name': 'water', 'status': 'boiling'}}]}. In database, other people performed similar actions as follows [Document(metadata={'type': 'level2', 'video_uid': '45dc74e1-c8dd-443a-a7a6-ca4215144e97', 'start_time': 436.82178, 'end_time': 486.56667, 'step_catego

### 5. Process Response and Compute Distance (WIP)

In [None]:
# add root path to sys.path for external package
project_root = os.path.abspath('/root/project')
sys.path.append(project_root)
from util import metrics
# postprocess answers to get goals



# postprocess answers to get lv1 steps?
# can import other scripts in other folders fine!
metrics.printtest()


In [None]:
# compute metrics



### 6. Configure Prompt2 for Target Space

In [None]:
import input_source
import input_target
import prompt_target

# target space input
source_goalstep_segments = input_goalstep_segments
source_spatial_context = input_spatial_context

# TODO: placeholder — no target video is selected yet, so the spatial context
# is extracted from an empty list.
target_video = []
target_spatial_context = input_source.extract_spatial_context(target_video)

# make target query & retrieve
# NOTE: this rebinds input_query / retrieved_goalstep / retrieved_spatial from
# the source-space steps; re-run those cells to get the source values back.
input_query = query.return_target_input_query(source_goalstep_segments, source_spatial_context, target_spatial_context)
# invoke() replaces the deprecated get_relevant_documents().
retrieved_goalstep = goalstep_retriever.invoke(input_query)
retrieved_spatial = spatial_retriever.invoke(input_query)

# Retrieve documents' parent documents for goalstep annotation


# Concat the retrieved results


# define prompt
prompt2 = ChatPromptTemplate.from_template(promptSource.template_source)
# Values for the template's placeholders. They must be handed to invoke():
# calling prompt2.format(...) on its own only returns a string and does not
# bind anything to the chain.
prompt2_inputs = {
    "context": promptSource.context,
    "question": promptSource.question,
    "rules": promptSource.rules,
}

# Define chain
parser = StrOutputParser()
chain2 = prompt2 | model | parser

# Get Response
# invoke() requires the input mapping; calling it with no argument raises TypeError.
response2 = chain2.invoke(prompt2_inputs)

### 7. Configure Combined chain for simple one-go-prediction

In [None]:

# # chain can incorporate other chains
# chain_action = (
#     {"source_spatial_context": itemgetter(source_spatial_context), "goalstep": chain1, "target_spatial_context": itemgetter(target_spatial_context)} | prompt_action | model | parser
# )
from dotenv import load_dotenv
from langchain_openai.chat_models import ChatOpenAI

import prompt_source as promptSource
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate


# define llm
# NOTE: this rebinds model / prompt / parser / response used by earlier cells.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
model = ChatOpenAI(openai_api_key=openai.api_key, model="gpt-3.5-turbo")

# define prompt
prompt = ChatPromptTemplate.from_template(promptSource.template_source)
# Values for the template's placeholders. They must be handed to invoke():
# calling prompt.format(...) on its own only returns a string and does not
# bind anything to the chain.
prompt_inputs = {
    "context": promptSource.context,
    "question": promptSource.question,
    "rules": promptSource.rules,
}

# Define chain
parser = StrOutputParser()
chain1 = prompt | model | parser

# Get Response
# invoke() requires the input mapping; calling it with no argument raises TypeError.
response = chain1.invoke(prompt_inputs)