## Goal Prediction
### 1. Make vectorstore
- make document list
- make vectorstore or load vectorstore

In [1]:
import openai
import langchain
import logging
import json
import os
import pandas as pd
import pickle
from IPython.display import Image, display

import database
from langchain_openai.embeddings import OpenAIEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
import sys
sys.path.append(os.path.abspath('/usr/local/lib/python3.10/dist-packages'))
from langchain.vectorstores import FAISS
from langchain_community.vectorstores import DocArrayInMemorySearch # do not use this!
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
import docarray

# configure paths
data_path = '../data/ego4d_annotation/'
GOALSTEP_ANNOTATION_PATH = data_path + 'goalstep/'
SPATIAL_ANNOTATION_PATH = data_path + 'spatial/'
GOALSTEP_VECSTORE_PATH = GOALSTEP_ANNOTATION_PATH + 'goalstep_docarray_faiss'
SPATIAL_VECSTORE_PATH = SPATIAL_ANNOTATION_PATH + 'spatial_docarray_faiss'

# Test scenarios to exclude
test_uid = [
    "dcd09fa4-afe2-4a0d-9703-83af2867ebd3", #make potato soap
    "46e07357-6946-4ff0-ba36-ae11840bdc39", #make tortila soap
    "026dac2d-2ab3-4f9c-9e1d-6198db4fb080", #prepare steak
    "2f46d1e6-2a85-4d46-b955-10c2eded661c", #make steak
    "14bcb17c-f70a-41d5-b10d-294388084dfc", #prepare garlic(peeling done)
    "487d752c-6e22-43e3-9c08-627bc2a6c6d4", #peel garlic
    "543e4c99-5d9f-407d-be75-c397d633fe56", #make sandwich
    "24ba7993-7fc8-4447-afd5-7ff6d548b11a", #prepare sandwich bread
    "e09a667f-04bc-49b5-8246-daf248a29174", #prepare coffee
    "b17ff269-ec2d-4ad8-88aa-b00b75921427", #prepare coffee and bread
    "58b2a4a4-b721-4753-bfc3-478cdb5bd1a8" #prepare tea and pie
]

# extract videos list
goalstep_videos_list = database.merge_json_video_list(GOALSTEP_ANNOTATION_PATH)
spatial_videos_list = database.merge_json_video_list(SPATIAL_ANNOTATION_PATH)
print(f"goalstep vids: {len(goalstep_videos_list)} and spatial vids: {len(spatial_videos_list)}")

# make document list
goalstep_document_list = database.make_goalstep_document_list(goalstep_videos_list)
spatial_document = database.make_spatial_document_list(spatial_videos_list)
print(f"goalstep_document_list: {len(goalstep_document_list)}")
print(f"spatial_document_list: {len(spatial_document)}")

# MAKE EMBEDDING
embeddings = OpenAIEmbeddings()

# MAKE FAISS VECSTORE
if not os.path.exists(GOALSTEP_VECSTORE_PATH + '/index.faiss'):
    print(f"MAKE FAISS GOALSTEP {GOALSTEP_VECSTORE_PATH}")
    goalstep_vector_store =  FAISS.from_documents(goalstep_document_list, embeddings)
    goalstep_vector_store.save_local(GOALSTEP_VECSTORE_PATH)
else:
    print(f"LOAD FAISS GOALSTEP {GOALSTEP_VECSTORE_PATH}")

if not os.path.exists(SPATIAL_VECSTORE_PATH + '/index.faiss'):
    print(f"MAKE FAISS SPATIAL {SPATIAL_VECSTORE_PATH}")
    spatial_vector_store = FAISS.from_documents(spatial_document, embeddings)
    spatial_vector_store.save_local(SPATIAL_VECSTORE_PATH)
else:
    print(f"LOAD FAISS SPATIAL: {SPATIAL_VECSTORE_PATH}")


# LOAD FAISS VECSTORE
goalstep_vector_store = FAISS.load_local(GOALSTEP_VECSTORE_PATH, embeddings, allow_dangerous_deserialization=True)
spatial_vector_store = FAISS.load_local(SPATIAL_VECSTORE_PATH, embeddings, allow_dangerous_deserialization=True)

# for doc in goalstep_vector_store.docstore._dict.values():
#     print(f"{doc.page_content}")

goalstep vids: 717 and spatial vids: 50
goalstep_document_list: 39979
spatial_document_list: 588
LOAD FAISS GOALSTEP ../data/ego4d_annotation/goalstep/goalstep_docarray_faiss
LOAD FAISS SPATIAL: ../data/ego4d_annotation/spatial/spatial_docarray_faiss


### 2. Make Input from Source (wip)
- extract input sequence and spatial context from test video

In [3]:
import input_source

# DEF INPUT VIDEO INDEX
input_video_idx = int(input("what is the video idx for input?: "))
print(len(goalstep_videos_list))

# SELECT INPUT VIDEO
goalstep_video = goalstep_videos_list[input_video_idx]
spatial_video = spatial_videos_list[input_video_idx]
print(goalstep_video["video_uid"])

# EXTRACT (LV3 action sequence) & (Spatial context)
input_goalstep_segments = input_source.extract_lower_goalstep_segments(goalstep_video)
input_spatial_context = input_source.extract_spatial_context(spatial_video)

717
grp-88bae242-7f3a-45d4-b129-5d69b1a1e15a


### 3. Query from Database (WIP)

In [4]:
import query

# DEF (Retriever) & (Database Query)
goalstep_retriever = goalstep_vector_store.as_retriever()
spatial_retriever = spatial_vector_store.as_retriever()
database_query = query.return_source_database_query(input_goalstep_segments, input_spatial_context) 

# GET search result
retrieved_goalstep = goalstep_retriever.get_relevant_documents(database_query)
retrieved_spatial = spatial_retriever.get_relevant_documents(database_query)
print(f"{retrieved_goalstep} \n {retrieved_spatial}")



## TODO: FIX parent document search
# DEF metafilter for parent document retrieval
goalstep_metafilter = {"level": 1, "parent_id": 10}
spatial_metafilter = {"level": 1, "parent_id": 10}

# GET parent documents
retrieved_goalstep_parent_documents = []
try:
    goalstep_search_results = goalstep_vector_store.similarity_search_with_score(
        query="",
        filter=goalstep_metafilter
    )

    if not goalstep_search_results:
        print("No match: goalstep parent")
        goalstep_search_results = []  # Ensure results is a null (empty) array
    else:
        for result, score in goalstep_search_results:
            retrieved_goalstep_parent_documents = result
except Exception as e:
    print(f"Error : goaistep parent search: {e}")
    goalstep_search_results = []  # Ensure results is a null (empty) array

# TODO: spatial context only has initial spatial layout. Maybe we should not search this extensively.
retrieved_spatial_parent_documents = []
try:
    spatial_search_results = spatial_vector_store.similarity_search_with_score(
        query="",
        filter=spatial_metafilter
    )

    if not spatial_search_results:
        print("No match: spatial parent")
        spatial_search_results = []  # Ensure results is a null (empty) array    
    else:
        for result, score in spatial_search_results:
            retrieved_spatial_parent_documents = result        
except Exception as e:
    print(f"Error: spatial parent search: {e}")
    spatial_search_results = []  # Ensure results is a null (empty) array


  retrieved_goalstep = goalstep_retriever.get_relevant_documents(database_query)


[Document(metadata={'type': 'level2', 'video_uid': '5cdf77b8-7bf8-421b-99b6-19fa6429aeb4', 'start_time': 151.9339, 'end_time': 177.60814, 'step_category': 'General cooking activity: Organize and arrange cooking tools or utensils', 'step_description': 'organize kitchen items'}, page_content='Level 2 Segment 1 for Video 5cdf77b8-7bf8-421b-99b6-19fa6429aeb4\nStep: organize kitchen items'), Document(metadata={'type': 'level2', 'video_uid': '3c0dffd0-e38e-4643-bc48-d513943dc20b', 'start_time': 338.33889, 'end_time': 402.0589, 'step_category': 'General cooking activity: Organize and arrange cooking tools or utensils', 'step_description': 'Organise kitchen utensils'}, page_content='Level 2 Segment 10 for Video 3c0dffd0-e38e-4643-bc48-d513943dc20b\nStep: Organise kitchen utensils'), Document(metadata={'type': 'level3', 'video_uid': '28bc1ee7-b0c1-4f30-934a-0ab665779d90', 'parent_level1_start_time': 20.84818, 'start_time': 20.86482, 'end_time': 32.02167, 'step_category': 'General cooking activi

### 4. Prompt > LLM (WIP)

In [None]:
from dotenv import load_dotenv
from langchain_openai.chat_models import ChatOpenAI

import prompt_source as promptSource
from langchain_core.output_parsers import StrOutputParser
from langchain.output_parsers import ResponseSchema, StructuredOutputParser

from langchain.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate

from langchain.chains import LLMChain
import logging

# SET logging to ERROR level to suppress INFO or DEBUG messages
logging.basicConfig(level=logging.ERROR)

# DEF (LLM MODEL) & (PARSER)
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
model1 = ChatOpenAI(openai_api_key=openai.api_key, model="gpt-3.5-turbo")
parser1 = StrOutputParser()
# parser = StructuredOutputParser.from_response_schemas([
#     ResponseSchema(name="answer", description="The answer to the question")
# ])

# DEF PROMPT, CHAIN
prompt1 = PromptTemplate.from_template(promptSource.template1)
chain1 = prompt1 | model1 | parser1

# DEF INPUT
inputs1 = {
    "role":'you are a helpful assisant that predicts the goal of the user inside a scene. You are given the actions of the user and the initial spatial layout of the scene.',
    "question": 'A person performs a sequence of actions. What is the goal of the current user? Answer in one verb and a noun pair. surround the verb and a noun pair with "". Make the noun as specific as possible',
    "action_sequence": input_goalstep_segments,
    "spatial_layout": input_spatial_context,
    "relevant_actions": retrieved_goalstep,
    "relevant_space": retrieved_spatial
}

# INVOKE chain
response1 = chain1.invoke(inputs1)
print(response1)

"Serve dish"


### 5. Process Predicted Goals and Compute Distance ()

In [None]:
sys.path.append(os.path.abspath('/root/project')) # add root path to sys.path for external package
from util import metrics
metrics.printtest()

# TODO: Extract Goal=Activity of the test video

# Distance Metric is used using Bert Score


### 6. Activity Transfer Module

In [None]:
print("transfer activity")

# input (source-spatial context)
# input (target-spatial context)

# compare distance between the two spaces and determine whether spatial-transfer is necessary?

# query from database
# make prompt and chain and invoke

# output (same, or transferred) activity





### 7. Action Sequence Prediction

In [None]:
print("predict action sequence")
# input (target-spatial context)
# input (target-sequence)

# query from database
# make prompt and chain and invoke

# output (action sequence)

### 8. Evalutate Action Sequence Results

In [None]:
print("evaluate Action sequence")




### 9. ETC - DELETE WHEN NOT NEEDED

In [None]:
import input_source
import input_target
import prompt_target

# target space input
source_goalstep_segments = input_goalstep_segments
source_spatial_context = input_spatial_context

target_video = []
target_spatial_context = input_source.extract_spatial_context(target_video)

# make target query & retrieve
input_query = query.return_target_input_query(source_goalstep_segments, source_spatial_context, target_spatial_context)
retrieved_goalstep = goalstep_retriever.get_relevant_documents(input_query)
retrieved_spatial = spatial_retriever.get_relevant_documents(input_query)

# Retrieve documents' parent documents for goalstep annotation


# Concat the retrieved results


# define prompt
prompt2 = ChatPromptTemplate.from_template(promptSource.template_source)
prompt2.format(context=promptSource.context, question=promptSource.question, rules = promptSource.rules)

# Define chain
parser1 = StrOutputParser()
chain2 = prompt2 | model1 | parser1 

# Get Respone
response2 = chain2.invoke()





# # chain can incorpoate other chains
# chain_action = (
#     {"source_spatial_context": itemgetter(source_spatial_context), "goalstep": chain1, "target_spatial_context": itemgetter(target_spatial_context)} | prompt_action | model | parser
# )
from dotenv import load_dotenv
from langchain_openai.chat_models import ChatOpenAI

import prompt_source as promptSource
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate


# define llm
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
model1 = ChatOpenAI(openai_api_key=openai.api_key, model="gpt-3.5-turbo")

# define prompt
prompt1 = ChatPromptTemplate.from_template(promptSource.template_source)
prompt1.format(context=promptSource.context, question=promptSource.question, rules = promptSource.rules)

# Define chain
parser1 = StrOutputParser()
chain1 = prompt1 | model1 | parser1 

# Get Respone
response1 = chain1.invoke()