# 0-dataset-module1-vectorstoreretriever

- parent 관련된 모듈
    1. https://asidefine.tistory.com/298
    2. https://wikidocs.net/234164
- multi-vector retriever
    1. https://wikidocs.net/234281

### 1. datapath & read

json을 일단 읽어준다.

In [7]:
import os
from dotenv import load_dotenv
import openai
import langchain
import json
import logging #logging for queries
# from langchain_community.document_loaders import TextLoader
# from langchain.document_loaders import JSONLoader

# Load API keys from a local .env file into the process environment and
# hand the OpenAI key to the client library.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# Annotation files, one per dataset split.
PATH_TEST = "../egogoal/v2/annotations/goalstep_test_unannotated.json"
PATH_TRAIN = "../egogoal/v2/annotations/goalstep_train.json"
PATH_VAL = "../egogoal/v2/annotations/goalstep_val.json"
PATH_TRAINVAL = "../egogoal/v2/annotations/goalstep_trainval.json"

# Read each split inside a context manager so the file handle is closed
# deterministically, and decode as UTF-8 explicitly instead of relying on
# the platform's locale default.
with open(PATH_TEST, encoding="utf-8") as f:
    data_test = json.load(f)
with open(PATH_TRAIN, encoding="utf-8") as f:
    data_train = json.load(f)
with open(PATH_VAL, encoding="utf-8") as f:
    data_val = json.load(f)

# train_video_uids = set(x.get("video_id", x.get("video_uid", None)) for x in data_train["annotations"])
# val_video_uids = set(x.get("video_id", x.get("video_uid", None)) for x in data_val["annotations"])
# test_video_uids = set(x.get("video_id", x.get("video_uid", None)) for x in data_test["annotations"])


### 2.1 Manual Method

어떠한 chunk에 대한 유사도가 감지되었다면, 그 chunk를 가지고 있는 원본 document를 모두 참고하는 방법이다.
- 각 비디오는 separate document로 documents에 들어가야 한다.
- 각 비디오가 많이 길 수가 있으니까 역시 chunk를 만들어서 줘야 한다.
- 다만, 맨 처음의 spatial context는 무조건 줘야 한다.

### Plan
- `Parse Input JSON`:

Extract the "segments_sequence" and other required data structures from the JSON.

- `Create Vector Representations`:

Convert the segments' context and change information into vector representations (e.g., using embeddings from a pre-trained model such as OpenAI's or Hugging Face's).

- `Index Segments into a Vectorstore:`

Each level 3 segment will be a document with its context and change attributes.
Store metadata for parent relationships (e.g., level 2 and level 1 parent information).

- `Query the Vectorstore:`

Match the input sequence to the most similar level 3 segment using embeddings.
Retrieve metadata to locate parent level 2 and level 1 segments.

- `Return Results:`

Return the level 3 segment, its parent level 2 segment, and the level 1 parent.


In [8]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
import json

def extract_segments(data):
    """Flatten the nested segment tree of every video into a flat list.

    Args:
        data: Mapping with a "videos" list. Each video, and each entry of a
            node's optional "segments" list, is a mapping that may carry
            "number", "context", "video_uid" and nested "segments".

    Returns:
        A list of ``{"text": ..., "metadata": ...}`` records in depth-first
        pre-order. ``text`` is the JSON dump of the node's "context" value;
        ``metadata`` holds the nesting ``level`` (videos are level 1), the
        node's ``segment_id`` (its "number"), the ``parent_id`` (the
        parent's "number", None for videos) and the node's own ``video_uid``.
    """
    flattened = []

    for video in data["videos"]:
        # Explicit stack of (node, parent number, depth) instead of recursion.
        pending = [(video, None, 1)]
        while pending:
            node, parent_number, depth = pending.pop()
            number = node.get("number")
            flattened.append(
                {
                    "text": json.dumps(node.get("context")),
                    "metadata": {
                        "level": depth,
                        "segment_id": number,
                        "parent_id": parent_number,
                        "video_uid": node.get("video_uid"),
                    },
                }
            )
            # Push children in reverse so they are popped in their original
            # order, which keeps the output in pre-order.
            children = list(node.get("segments", []))
            pending.extend(
                (child, number, depth + 1) for child in reversed(children)
            )

    return flattened

In [None]:
# Load the segment database. The file is decoded as UTF-8 explicitly so the
# result does not depend on the platform's locale default encoding.
with open("database.json", encoding="utf-8") as f:
    data = json.load(f)
segments = extract_segments(data)

# Embedding model used both for the indexed texts and for queries.
embeddings = OpenAIEmbeddings()

# Build the FAISS index: texts[i] is stored together with metadata[i], so a
# search hit carries the level / segment_id / parent_id of its segment.
texts = [segment["text"] for segment in segments]
metadata = [segment["metadata"] for segment in segments]
vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadata)

In [None]:
# query and traverse
def _find_parent(child_metadata, all_segments):
    """Return the record in *all_segments* that is the parent of a child.

    A candidate is accepted when its ``segment_id`` equals the child's
    ``parent_id`` and it sits exactly one level above the child. If both
    sides carry a ``video_uid`` they must also agree. Returns None when the
    child has no parent (``parent_id`` is None) or no candidate qualifies.
    """
    parent_id = child_metadata.get("parent_id")
    if parent_id is None:
        # Top-level record: without this guard a None parent_id would match
        # any record whose segment_id happens to be None.
        return None

    child_level = child_metadata.get("level")
    child_video = child_metadata.get("video_uid")
    for candidate in all_segments:
        meta = candidate["metadata"]
        if meta.get("segment_id") != parent_id:
            continue
        # Segment numbers alone are not unique across levels, so require the
        # candidate to be exactly one level above the child.
        if child_level is not None and meta.get("level") != child_level - 1:
            continue
        parent_video = meta.get("video_uid")
        if (
            child_video is not None
            and parent_video is not None
            and parent_video != child_video
        ):
            continue
        return candidate
    return None


def query_hierarchy(query_sequence, vectorstore, all_segments=None):
    """Find the best-matching segment for a query and walk up to its parents.

    Args:
        query_sequence: JSON-serializable query; it is dumped to a string and
            used as the similarity-search text.
        vectorstore: Object exposing ``similarity_search(text, k=...)`` whose
            hits have a ``metadata`` mapping with "level", "segment_id" and
            "parent_id" entries.
        all_segments: Flat list of ``{"text", "metadata"}`` records to search
            for parents. Defaults to the module-level ``segments`` list.

    Returns:
        None when the search yields no hit; otherwise a dict with keys
        "level3" (the hit itself), "level2" (its parent record or None) and
        "level1" (the parent's parent record or None).

    Note:
        Parents are resolved by segment number and level, so records from
        different videos that share both (and carry no ``video_uid``) are
        still indistinguishable; the first one in the list wins.
    """
    if all_segments is None:
        # Backward-compatible default: the list built when indexing.
        all_segments = segments

    # Only the single best hit is used.
    query_text = json.dumps(query_sequence)
    hits = vectorstore.similarity_search(query_text, k=1)
    if not hits:
        return None

    level3_match = hits[0]
    level2_match = _find_parent(level3_match.metadata, all_segments)
    level1_match = (
        _find_parent(level2_match["metadata"], all_segments)
        if level2_match
        else None
    )

    return {
        "level3": level3_match,
        "level2": level2_match,
        "level1": level1_match,
    }

# Placeholder: replace [...] with your level 3 segment sequence. The literal
# Ellipsis is not JSON-serializable, so the query fails until it is replaced.
query_sequence = [...]  # Input your level 3 segment sequence
result_hierarchy = query_hierarchy(query_sequence, vectorstore)

# Display results. query_hierarchy returns None when the search finds
# nothing, so guard before subscripting.
if result_hierarchy is None:
    print("No matching segment found.")
else:
    print("Level 3 Match:", result_hierarchy["level3"])
    print("Level 2 Parent:", result_hierarchy["level2"])
    print("Level 1 Parent:", result_hierarchy["level1"])

### 2.2 Make Hierarchical

어떠한 chunk에 대한 유사도가 감지되었다면, 그 chunk를 가지고 있는 원본 document를 모두 참고하는 방법이다.
- 각 비디오는 separate document로 documents에 들어가야 한다.
- 각 비디오가 많이 길 수가 있으니까 역시 chunk를 만들어서 줘야 한다.
- 다만, 맨 처음의 spatial context는 무조건 줘야 한다.

In [None]:
# Loading and Splitting
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_community.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever


# Documents to index; left empty here and expected to be filled before
# they are added to the retriever.
documents = []
embeddings = OpenAIEmbeddings()

# Text splitters for the parent documents and for the child chunks that get
# embedded.
# NOTE(review): both splitters use chunk_size=1000, so child chunks are as
# large as parent chunks; a smaller child size is presumably intended.
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)

# Vector store holding the embedded child chunks. Chroma's constructor takes
# the embedding model as `embedding_function`; `embedding` is not one of its
# parameters.
vectorstore = Chroma(
    collection_name="full_docs", embedding_function=embeddings
)

# Storage layer for the parent documents.
store = InMemoryStore()
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

### 3. Retrieve from documents list

In [None]:
QUERY = ""

# Add the documents to the retriever and report how many keys the docstore
# now holds. The count is printed because a bare expression in the middle of
# a cell is discarded.
retriever.add_documents(documents)
print(len(list(store.yield_keys())))

# Check that searching the vector store directly returns the smaller chunks.
sub_docs = vectorstore.similarity_search(QUERY)
if sub_docs:
    print(sub_docs[0].page_content)
    print(len(sub_docs[0].page_content))
else:
    # Nothing indexed or nothing matched: avoid an IndexError on sub_docs[0].
    print("No child chunks matched the query.")

# Retrieve through the retriever. `invoke` is the current entry point; the
# older `get_relevant_documents(QUERY)` is deprecated, and calling both only
# repeated the same lookup.
retrieved_docs = retriever.invoke(QUERY)

# Print the retrieved document and its length.
if retrieved_docs:
    print(retrieved_docs[0].page_content)
    print(len(retrieved_docs[0].page_content))
else:
    print("No parent documents retrieved for the query.")

### 4. Store VectorStore