In [1]:
from dotenv import load_dotenv
load_dotenv()
import os


os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_KEY')

In [2]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from datetime import datetime, timedelta

documents = []

for i in range(1, 4):
  loader = CSVLoader(
    encoding="utf8",
    file_path=f"data/john_wick_{i}.csv",
    metadata_columns=["Review_Date", "Review_Title", "Review_Url", "Author", "Rating"]
  )

  movie_docs = loader.load()
  for doc in movie_docs:

    # We add metadate about the number of the movi
    doc.metadata["Movie_Title"] = f"John Wick {i}"

  documents.extend(movie_docs)

Este tipo de retriaver se usa cuando el input que proporciona el usuario está compuesto por un número reducido de palabras o la precisión de la consulta debe de ser impecable. Veamos un ejemplo para entenderlo mejor:

Imagina que hemos creado un RAG para para reconocer posibles enfermedades sabiendo alguno de sus sintomas. En el caso que tengamos un Naive RAG, posiblemente recolectemos una serie de posibles enfermedades que solo coincidan en uno o dos sintimas, dejando a nuestra herramienta un poco en mal lugar. 

Este es un caso ideal para usar Parent Doc Retriever. Y es que tipo de tecnica consiste en trocear grandes chunks (parent chunk) en aun trozos más pequeños (child chunk). Al tener unos chunks con reducido tamaño, hace que la información que contienen esté más concentrada y por lo tanto, su valor informativo no se diluya entre parrafos de texto.

Pero se sabe que el contexto es importante para una buena respuesta en lenguaje natural. Por lo tanto, en vez de devolver los child chunk, se obtiene los parent chunks a los que pertenece los child chink que más similitud tienen con la consulta del usuario.

In [11]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

parent_docs = documents

# Embedding Model
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")


# Splitters
child_splitter = RecursiveCharacterTextSplitter(chunk_size=200)
# We don't need a parent splitter because the data cames from CSV file, and each row is a parent doc.


# Stores
store = InMemoryStore()
vectorstore = Chroma(embedding_function=embeddings, collection_name="fullDoc", persist_directory="./JohnWick_db_parentsRD")



parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    # parent_splitter =parent_splitter
)

parent_document_retriever.add_documents(parent_docs, ids=None)

In [12]:
print(f"Number of parent chunks  is: {len(list(store.yield_keys()))}")

print(f"Number of child chunks is: {len(parent_document_retriever.vectorstore.get()['ids'])}")

Number of parent chunks  is: 75
Number of child chunks is: 3701


In [10]:
parent_document_retriever.vectorstore.get()

{'ids': [],
 'embeddings': None,
 'metadatas': [],
 'documents': [],
 'uris': None,
 'data': None}

## We are going to do a Naive RAG.

## Remember:

- R -> Retrieval
- A -> Augmented
- G -> Generation

# Retrieval

In [5]:
# We have already created the retriever object
parent_document_retriever

ParentDocumentRetriever(vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000002BD7AC879A0>, docstore=<langchain.storage.in_memory.InMemoryBaseStore object at 0x000002BD7AC86A40>, child_splitter=<langchain_text_splitters.character.RecursiveCharacterTextSplitter object at 0x000002BD7AC87610>)

# Augmented

In [6]:
from langchain_core.prompts import ChatPromptTemplate

TEMPLATE = """\
You are happy assistant. Use the context provided below to answer the question.

If you do not know the answer, or are unsure, say you don't know.

Query:
{question}

Context:
{context}
"""

rag_prompt = ChatPromptTemplate.from_template(TEMPLATE)

# Generation

In [7]:
from langchain_openai import ChatOpenAI

chat_model = ChatOpenAI()

## Finally, we are going to create a Rag Parent doc Retrieval. For that, we are going to use LCEL (LangChain Expression Language)
If you want to learn more about LCEL, check this good tutorial: https://www.youtube.com/watch?v=O0dUOtOIrfs

In [8]:
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

setup_and_retrieval = RunnableParallel({"question": RunnablePassthrough(), "context": parent_document_retriever })
output_parser = StrOutputParser()


parent_retrieval_chain = setup_and_retrieval | rag_prompt | chat_model | output_parser


parent_retrieval_chain.invoke("Did people generally like John Wick?")

'Yes, people generally like John Wick.'

In [9]:
from langchain.globals import set_verbose, set_debug

set_debug(True)
parent_retrieval_chain.invoke("Did people generally like John Wick?")

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "Did people generally like John Wick?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context>] Entering Chain run with input:
[0m{
  "input": "Did people generally like John Wick?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context> > 3:chain:RunnablePassthrough] Entering Chain run with input:
[0m{
  "input": "Did people generally like John Wick?"
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context> > 3:chain:RunnablePassthrough] [0ms] Exiting Chain run with output:
[0m{
  "output": "Did people generally like John Wick?"
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context>] [233ms] Exiting Chain run with output:
[0m{
  "question": "Did people generally like John 

'Yes, people generally like John Wick.'