In [2]:
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
import os
import pprint
from dotenv import load_dotenv



# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# This project keeps the key under OPENAI_KEY, while the OpenAI client reads
# OPENAI_API_KEY. Copy it across, and fail with a clear message when neither is
# available (assigning None to os.environ would raise an opaque TypeError).
if os.getenv("OPENAI_KEY"):
    os.environ["OPENAI_API_KEY"] = os.environ["OPENAI_KEY"]
elif not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OpenAI API key not found: set OPENAI_KEY (or OPENAI_API_KEY) "
        "in your .env file or environment."
    )

# Embedding model handed to the vector store as its embedding function.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Chroma collection "doc_jonhWick" persisted under ./jonhWick_db.
vectordb = Chroma(
    persist_directory="./jonhWick_db",
    embedding_function=embeddings,
    collection_name="doc_jonhWick",
)


In [3]:
# Fetch one stored record to inspect what the collection holds.
# `limit=1` asks Chroma for a single entry instead of loading the whole
# collection only to read its first id.
doc_example = vectordb.get(limit=1)

# Print every field of the returned record (ids, metadatas, documents, ...).
for key, value in doc_example.items():
    print(f" ----  Start {key} --- \n")
    pprint.pprint(value)
    print(f" ----  End {key} --- \n")

 ----  Start ids --- 

['04862bf7-2f07-4273-bd00-5fefe76256ce']
 ----  End ids --- 

 ----  Start embeddings --- 

None
 ----  End embeddings --- 

 ----  Start metadatas --- 

[{'Author': 'Coventry',
  'Movie_Title': 'John Wick 1',
  'Rating': 5,
  'Review_Date': '5 May 2023',
  'Review_Title': " You don't mess with another person's dog. It's as simple "
                  'as that!\n',
  'Review_Url': '/review/rw9033669/?ref_=tt_urv',
  'row': 2,
  'source': 'data/john_wick_1.csv'}]
 ----  End metadatas --- 

 ----  Start documents --- 

[': 2\n'
 "Review: With the fourth installment scoring immensely at the cinemas as I'm "
 'submitting this review, and after three previous films that are apparently '
 'loved by everyone else in the world, I thought perhaps it would be time for '
 'me check out "John Wick".']
 ----  End documents --- 

 ----  Start uris --- 

None
 ----  End uris --- 

 ----  Start data --- 

None
 ----  End data --- 



### As you can see, each document has 8 metadata fields

### We are going to create a Self-Query Retriever. Este tipo de retriever es muy útil cuando la fuente de datos de nuestra vector store tiene metadatos clave para una búsqueda óptima de la respuesta.

Veamos el siguiente ejemplo. Imaginemos que tenemos almacenada en nuestra base de datos vectorial una gran cantidad de experiencias y ofertas de ocio (p. ej.: clases de surf, tirolina, ruta gastronómica, etc.). La descripción de la experiencia es lo que hemos codificado usando nuestro modelo de embedding. Además, cada oferta tiene 3 valores clave o metadatos: fecha, precio y lugar.

Imaginemos que un usuario busca una experiencia de este estilo: una experiencia en la naturaleza, que sea segura y para toda la familia. Además, el precio debe ser inferior a 50 $ y el lugar debe ser California.

Algo está claro aquí: "NO QUEREMOS QUE NOS DEVUELVA ACTIVIDADES/EXPERIENCIAS QUE NO CUMPLAN EL PRECIO NI EL LUGAR QUE EL USUARIO PIDE". Por ello, no tiene sentido calcular similitudes con chunks/experiencias que no cumplan con el filtro de los metadatos.

Este caso es ideal para aplicar "self Query Retriever". 

Volviendo a nuestro ejemplo anterior, lo que nos permite este tipo de retriever es realizar un primer filtro a través de los metadatos y, después, realizar el cálculo de similitud entre el input del usuario y los chunks que cumplan los requisitos de los metadatos.




La pregunta que nos tenemos que hacer es la siguiente: ¿cómo sabe el agente usado en el SQR (Self-Query Retriever) cuáles son los filtros de la vector store y qué significa cada uno?

Para eso necesitamos darle un contexto a este agente, donde le mostraremos cuáles son los metadatos y la descripción de cada uno de ellos. Al proporcionarle este conocimiento, el agente podrá saber cuándo y cómo aplicar el filtro al hacer las consultas. Además, le tendremos que dar una descripción de la información que va a encontrarse en la vector store.

In [25]:
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_openai import ChatOpenAI
from langchain.retrievers.self_query.chroma import ChromaTranslator



# Metadata fields the self-query LLM is told about, so it knows which
# attributes it may filter on. Each row is (name, description, type).
metadata_field_info = [
    AttributeInfo(name=field_name, description=field_description, type=field_type)
    for field_name, field_description, field_type in (
        ("Movie_Title", "The title of the movie", "string"),
        ("Review_Date", "The date of the review", "string"),
        ("Review_Title", "The title of the review", "string"),
        ("Review_Url", "The URL of the review", "string"),
        ("Author", "The author of the review", "string"),
        ("Rating", "A 1 to 10 rating for the movie", "integer"),
    )
]

# Short description of what each stored document contains; it is passed to the
# query-constructor LLM, so the movie name must be spelled correctly.
document_content_desription = "A review of the John Wick movie."

# `embeddings` (first cell) and `chat_model` (Generation cell) are created
# elsewhere in the notebook; this cell does not need to rebuild them.

# Build the self-query retriever: the LLM (temperature=0) turns a user question
# into a semantic query plus a metadata filter, and ChromaTranslator converts
# that structured query into a form the Chroma store can execute.
self_query_retriever = SelfQueryRetriever.from_llm(
    llm=ChatOpenAI(temperature=0),
    vectorstore=vectordb,
    document_contents=document_content_desription,
    metadata_field_info=metadata_field_info,
    verbose=True,
    structured_query_translator=ChromaTranslator(),
)

## We are going to do a Naive RAG.

## Remember:

- R -> Retrieval
- A -> Augmented
- G -> Generation

# Retrieval

In [5]:
# The retriever object was already created above; display it to inspect its
# configuration (the repr shows the vector store and query constructor it wraps).
self_query_retriever

SelfQueryRetriever(vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001EAC42E1750>, query_constructor=RunnableBinding(bound=FewShotPromptTemplate(input_variables=['query'], examples=[{'i': 1, 'data_source': '```json\n{{\n    "content": "Lyrics of a song",\n    "attributes": {{\n        "artist": {{\n            "type": "string",\n            "description": "Name of the song artist"\n        }},\n        "length": {{\n            "type": "integer",\n            "description": "Length of the song in seconds"\n        }},\n        "genre": {{\n            "type": "string",\n            "description": "The song genre, one of "pop", "rock" or "rap""\n        }}\n    }}\n}}\n```', 'user_query': 'What are songs by Taylor Swift or Katy Perry about teenage romance under 3 minutes long in the dance pop genre', 'structured_request': '```json\n{{\n    "query": "teenager love",\n    "filter": "and(or(eq(\\"artist\\", \\"Taylor Swift\\"), eq(\\"artist\\", \\"Katy Perry\\"))

# Augmented

In [6]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt text for the answer step. The {question} and {context} placeholders
# are filled in when the prompt is formatted.
TEMPLATE = (
    "You are happy assistant. Use the context provided below to answer the question.\n"
    "\n"
    "If you do not know the answer, or are unsure, say you don't know.\n"
    "\n"
    "Query:\n"
    "{question}\n"
    "\n"
    "Context:\n"
    "{context}\n"
)

# Wrap the template text in a chat prompt object so it can be used as a chain step.
rag_prompt = ChatPromptTemplate.from_template(TEMPLATE)

# Generation

In [7]:
from langchain_openai import ChatOpenAI

# Chat model used as the generation step of the chain; built with no explicit arguments.
chat_model = ChatOpenAI()

## Finally, we are going to build a RAG chain on top of the Self-Query Retriever. For that, we are going to use LCEL (LangChain Expression Language)
If you want to learn more about LCEL, check this good tutorial: https://www.youtube.com/watch?v=O0dUOtOIrfs

In [8]:
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

# Final step: turn the chat model's reply into a plain string.
output_parser = StrOutputParser()

# First step: fan the incoming question out to two branches. "question" passes
# it through unchanged; "context" sends it to the self-query retriever.
setup_and_retrieval = RunnableParallel(
    question=RunnablePassthrough(),
    context=self_query_retriever,
)

# LCEL pipeline: retrieve -> fill the prompt -> call the chat model -> parse to str.
selfq_retrieval_chain = (
    setup_and_retrieval
    | rag_prompt
    | chat_model
    | output_parser
)

selfq_retrieval_chain.invoke("Did people generally like John Wick?")

'Yes, people generally liked John Wick.'

In [9]:
# Ask a question that involves the review score, i.e. one where a filter on the
# "Rating" metadata field is useful.
selfq_retrieval_chain.invoke("What are the reviews with a score greater than 7?")

'The reviews with a score greater than 7 are:\n1. Review by subxerogravity for the movie "John Wick 2" with a rating of 9.\n2. Review by danielmanson for the movie "John Wick 2" with a rating of 8.\n3. Review by ThomasDrufke for the movie "John Wick 2" with a rating of 8.'

In [17]:
# Call the retriever directly (skipping the answer-generation step) so the
# metadata of each returned document can be inspected. `response` is defined
# here so the cell does not depend on state left over from an earlier session.
response = self_query_retriever.invoke("What are the reviews with a score greater than 7?")

for doc in response:
    print(doc.metadata)


{'Author': 'subxerogravity', 'Movie_Title': 'John Wick 2', 'Rating': 9, 'Review_Date': '10 February 2017', 'Review_Title': " Man! I didn't think Chapter 2 could out do the original, but man! This was Fantastic!\n", 'Review_Url': '/review/rw3637449/?ref_=tt_urv', 'row': 24, 'source': 'data/john_wick_2.csv'}
{'Author': 'danielmanson', 'Movie_Title': 'John Wick 2', 'Rating': 8, 'Review_Date': '28 November 2020', 'Review_Title': " It's just a great action film\n", 'Review_Url': '/review/rw6316364/?ref_=tt_urv', 'row': 1, 'source': 'data/john_wick_2.csv'}
{'Author': 'ThomasDrufke', 'Movie_Title': 'John Wick 2', 'Rating': 8, 'Review_Date': '14 February 2017', 'Review_Title': ' Professional Courtesy\n', 'Review_Url': '/review/rw3640053/?ref_=tt_urv', 'row': 2, 'source': 'data/john_wick_2.csv'}
{'Author': 'Palidan400', 'Movie_Title': 'John Wick 1', 'Rating': 8, 'Review_Date': '25 October 2014', 'Review_Title': " Yeah I'm Thinking He's Back\n", 'Review_Url': '/review/rw3111220/?ref_=tt_urv', 'r

### The rating values are always greater than 7

### I want to look deeper into what is happening inside the self query retriever

In [26]:
from langchain.globals import set_verbose, set_debug

# Turn on LangChain's global debug logging only for this one call, so the
# intermediate steps of the self-query retriever (query-constructor prompt,
# LLM call, structured query) are printed. The flag is process-wide, so it is
# switched back off afterwards even if the call fails.
set_debug(True)
try:
    debug_docs = self_query_retriever.invoke(
        "What are the reviews with a score greater than 7 and say bad things about the movie?"
    )
finally:
    set_debug(False)

# Last expression of the cell: display the retrieved documents.
debug_docs

[32;1m[1;3m[chain/start][0m [1m[1:retriever:Retriever > 2:chain:query_constructor] Entering Chain run with input:
[0m{
  "query": "What are the reviews with a score greater than 7 and say bad things about the movie?"
}
[32;1m[1;3m[chain/start][0m [1m[1:retriever:Retriever > 2:chain:query_constructor > 3:prompt:FewShotPromptTemplate] Entering Prompt run with input:
[0m{
  "query": "What are the reviews with a score greater than 7 and say bad things about the movie?"
}
[36;1m[1;3m[chain/end][0m [1m[1:retriever:Retriever > 2:chain:query_constructor > 3:prompt:FewShotPromptTemplate] [1ms] Exiting Prompt run with output:
[0m[outputs]
[32;1m[1;3m[llm/start][0m [1m[1:retriever:Retriever > 2:chain:query_constructor > 4:llm:ChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: Your goal is to structure the user's query to match the request schema provided below.\n\n<< Structured Request Schema >>\nWhen responding use a markdown code snippet with a JSON obj

[Document(page_content=": 24\nReview: John Wick: Chapter 3 - Parabellum is quite literally about consequences, dealing with the fallout of John's actions at the end of the previous film and sending him on an even bigger odyssey of violence that continues to explore this world of assassination and deliver beautifully clean action sequences.", metadata={'Author': 'masonsaul', 'Movie_Title': 'John Wick 3', 'Rating': 10, 'Review_Date': '17 May 2019', 'Review_Title': ' Makes John Wick a superb trilogy\n', 'Review_Url': '/review/rw4860603/?ref_=tt_urv', 'row': 24, 'source': 'data/john_wick_3.csv'}),
 Document(page_content=': 17\nReview: There are actually quite a handful reasons why "John Wick" could have become a failure. The two directors have never made a film before and almost exclusively worked in the stunt department so far. The writer is not exactly experienced either. Lead actor Keanu Reeves usually scores more through boyish charm than through realistic portrayal of gritty badass ch

### Finally, the query is "talk bad about the movie" and the filter is "Rating" greater than 7