In [44]:
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
import os
import pprint
from dotenv import load_dotenv



# Load variables from a local .env file into the process environment.
load_dotenv()

# The project keeps the key under OPENAI_KEY; it is re-exported as
# OPENAI_API_KEY for the OpenAI-backed objects created below. An already
# exported OPENAI_API_KEY is accepted as a fallback. Fail early with a clear
# message instead of the opaque "str expected, not NoneType" TypeError that
# os.environ raises when the variable is missing.
openai_key = os.getenv("OPENAI_KEY") or os.getenv("OPENAI_API_KEY")
if not openai_key:
    raise RuntimeError(
        "OPENAI_KEY is not set; add it to your .env file or export it before running this notebook."
    )
os.environ["OPENAI_API_KEY"] = openai_key

# Embedding model handed to the vector store as its embedding function.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Open the Chroma collection persisted on disk.
# NOTE(review): the directory and collection names are spelled "jonhWick";
# presumably they must match the names used when the store was built, so they
# are left untouched - confirm before renaming.
vectordb = Chroma(
    persist_directory="./jonhWick_db",
    embedding_function=embeddings,
    collection_name="doc_jonhWick",
)


In [45]:
# Fetch one stored record to inspect the structure of the collection.
# limit=1 asks the store for a single entry instead of loading every record
# just to read the first id, and it no longer raises IndexError when the
# collection is empty (the lists are simply empty in that case).
doc_example = vectordb.get(limit=1)

# Print every field of the returned record (ids, metadatas, documents, ...)
# between Start/End markers so the layout is easy to read.
for key in doc_example:
    print(f" ----  Start {key} --- \n")
    pprint.pprint(doc_example[key])
    print(f" ----  End {key} --- \n")

 ----  Start ids --- 

['04862bf7-2f07-4273-bd00-5fefe76256ce']
 ----  End ids --- 

 ----  Start embeddings --- 

None
 ----  End embeddings --- 

 ----  Start metadatas --- 

[{'Author': 'Coventry',
  'Movie_Title': 'John Wick 1',
  'Rating': 5,
  'Review_Date': '5 May 2023',
  'Review_Title': " You don't mess with another person's dog. It's as simple "
                  'as that!\n',
  'Review_Url': '/review/rw9033669/?ref_=tt_urv',
  'row': 2,
  'source': 'data/john_wick_1.csv'}]
 ----  End metadatas --- 

 ----  Start documents --- 

[': 2\n'
 "Review: With the fourth installment scoring immensely at the cinemas as I'm "
 'submitting this review, and after three previous films that are apparently '
 'loved by everyone else in the world, I thought perhaps it would be time for '
 'me check out "John Wick".']
 ----  End documents --- 

 ----  Start uris --- 

None
 ----  End uris --- 

 ----  Start data --- 

None
 ----  End data --- 



### As you can see, there are 8 fields inside the metadata

### We are going to create a self Query Retriever.



In [46]:
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_openai import ChatOpenAI
from langchain.retrievers.self_query.chroma import ChromaTranslator



# Description of the metadata fields the self-query retriever is told about.
# Each (name, description, type) triple below becomes one AttributeInfo, in
# the same order as listed.
metadata_field_info = [
    AttributeInfo(name=field_name, description=field_description, type=field_type)
    for field_name, field_description, field_type in (
        ("Movie_Title", "The title of the movie", "string"),
        ("Review_Date", "The date of the review", "string"),
        ("Review_Title", "The title of the review", "string"),
        ("Review_Url", "The URL of the review", "string"),
        ("Author", "The author of the review", "string"),
        ("Rating", "A 1 to 10 rating for the movie", "integer"),
    )
]

# Short description of what each stored document contains. It is passed to the
# query-constructor LLM, so the movie name is spelled correctly here.
# (The variable keeps its original spelling so any other cell that refers to
# it still works.)
document_content_desription = "A review of the John Wick movie."

# Build the self-query retriever on top of the existing vector store.
# - llm: the model that turns a natural-language question into a structured
#   query (search text + metadata filter); temperature=0 to keep that
#   translation as repeatable as possible.
# - document_contents / metadata_field_info: the schema shown to that model.
# - structured_query_translator is not passed, leaving the choice of
#   translator to from_llm.
self_query_retriever = SelfQueryRetriever.from_llm(
    llm=ChatOpenAI(temperature=0),
    vectorstore=vectordb,
    document_contents=document_content_desription,
    metadata_field_info=metadata_field_info,
    verbose=True,
)

## We are going to do a Naive RAG.

## Remember:

- R -> Retrieval
- A -> Augmented
- G -> Generation

# Retrieval

In [47]:
# The retriever object was already created in the previous cell. Evaluating
# the bare name as the last expression of the cell makes the notebook display
# its repr (vector store, query-constructor prompt, ...), which is a quick way
# to inspect how it is configured.
self_query_retriever

SelfQueryRetriever(vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000002AFA58DFAC0>, query_constructor=RunnableBinding(bound=FewShotPromptTemplate(input_variables=['query'], examples=[{'i': 1, 'data_source': '```json\n{{\n    "content": "Lyrics of a song",\n    "attributes": {{\n        "artist": {{\n            "type": "string",\n            "description": "Name of the song artist"\n        }},\n        "length": {{\n            "type": "integer",\n            "description": "Length of the song in seconds"\n        }},\n        "genre": {{\n            "type": "string",\n            "description": "The song genre, one of "pop", "rock" or "rap""\n        }}\n    }}\n}}\n```', 'user_query': 'What are songs by Taylor Swift or Katy Perry about teenage romance under 3 minutes long in the dance pop genre', 'structured_request': '```json\n{{\n    "query": "teenager love",\n    "filter": "and(or(eq(\\"artist\\", \\"Taylor Swift\\"), eq(\\"artist\\", \\"Katy Perry\\"))

# Augmented

In [48]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt text for the answer-generation step. It has two placeholders:
# {question} (the user's query) and {context} (the retrieved reviews).
# Written one literal per output line; the resulting string is unchanged.
TEMPLATE = (
    "You are happy assistant. Use the context provided below to answer the question.\n"
    "\n"
    "If you do not know the answer, or are unsure, say you don't know.\n"
    "\n"
    "Query:\n"
    "{question}\n"
    "\n"
    "Context:\n"
    "{context}\n"
)

# Wrap TEMPLATE in a chat prompt. Its {question} and {context} placeholders
# are the two inputs that the chains built later in the notebook supply.
rag_prompt = ChatPromptTemplate.from_template(TEMPLATE)

# Generation

In [49]:
from langchain_openai import ChatOpenAI

# Chat model used for the generation step. No arguments are passed, so the
# library defaults apply (the run output further down reports
# 'gpt-3.5-turbo' as the model name).
chat_model = ChatOpenAI()

## Finally, we are going to build the RAG chain on top of the self-query retriever. For that, we are going to use LCEL (LangChain Expression Language)
If you want to learn more about LCEL, check this good tutorial: https://www.youtube.com/watch?v=O0dUOtOIrfs

In [50]:
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

# First stage: fan the raw question string out to the two prompt inputs.
# "question" passes the string through unchanged; "context" sends it to the
# self-query retriever and receives the matching documents.
setup_and_retrieval = RunnableParallel(
    question=RunnablePassthrough(),
    context=self_query_retriever,
)

# Converts the chat model's message into a plain string.
output_parser = StrOutputParser()

# Full pipeline: retrieve -> fill the prompt -> generate -> parse to text.
self_retrieval_chain = (
    setup_and_retrieval
    | rag_prompt
    | chat_model
    | output_parser
)

# Run the chain on a plain question string.
self_retrieval_chain.invoke("Did people generally like John Wick?")

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "Did people generally like John Wick?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context>] Entering Chain run with input:
[0m{
  "input": "Did people generally like John Wick?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context> > 3:chain:RunnablePassthrough] Entering Chain run with input:
[0m{
  "input": "Did people generally like John Wick?"
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context> > 3:chain:RunnablePassthrough] [0ms] Exiting Chain run with output:
[0m{
  "output": "Did people generally like John Wick?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context> > 4:retriever:Retriever > 5:chain:query_constructor] Entering Chain run with input:
[0m{


'Based on the reviews provided, it seems that people generally liked John Wick.'

In [51]:

# Variant of the retrieval stage that takes a dict input ({"question": ...})
# instead of a bare string.
# - "question": the question text, passed along untouched.
# - "context":  the documents the self-query retriever returns for that text.
# The original also piped through RunnablePassthrough() and an
# assign(context=itemgetter("context")) step; both only copied values onto
# themselves, so they are dropped without changing the result.
setup_and_retrieval = RunnableParallel(
    question=itemgetter("question"),
    context=itemgetter("question") | self_query_retriever,
)

# Final chain returns a dict with both the model's answer ("response") and the
# retrieved documents ("context"), so the sources can be inspected afterwards.
naive_retrieval_chain = setup_and_retrieval | {
    "response": rag_prompt | chat_model,
    "context": itemgetter("context"),
}

In [52]:
# Ask a question that mixes a semantic request (summarise the reviews) with
# metadata constraints (movie title, rating), so the self-query retriever has
# something to turn into a filter. The chain takes a dict with a "question"
# key and returns a dict with "response" and "context".
naive_retrieval_chain.invoke({"question" : "Make a summary of the reviews that talk about John Wick 3 and have a score higher than 7"})

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "question": "Make a summary of the reviews that talk about John Wick 3 and have a score higher than 7"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context>] Entering Chain run with input:
[0m{
  "question": "Make a summary of the reviews that talk about John Wick 3 and have a score higher than 7"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context> > 3:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "question": "Make a summary of the reviews that talk about John Wick 3 and have a score higher than 7"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context> > 3:chain:RunnableSequence > 4:chain:RunnableLambda] Entering Chain run with input:
[0m{
  "question": "Make a summary of the reviews that talk about John Wic

{'response': AIMessage(content='Summary:\n- masonsaul gave John Wick 3 a rating of 10 on 17 May 2019, praising it as a superb trilogy.\n- themadmovieman rated the movie 8 on 15 May 2019, describing it as pure, delirious joy and brilliant entertainment.\n- Leofwine_draca awarded the film a 9 on 22 September 2019, highlighting the sheer enjoyment, inventive action sequences, and top-notch choreography.\n- Dannyboi94 also rated the movie 8 on 16 May 2019, calling it the best action movie in years with clear, extraordinary stunts.', response_metadata={'token_usage': {'completion_tokens': 132, 'prompt_tokens': 975, 'total_tokens': 1107}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': 'fp_c2295e73ad', 'finish_reason': 'stop', 'logprobs': None}, id='run-fc0bf7ba-1ee4-4382-b5cf-7c28ea7f3b52-0'),
 'context': [Document(page_content=": 24\nReview: John Wick: Chapter 3 - Parabellum is quite literally about consequences, dealing with the fallout of John's actions at the end of the previous fi

In [53]:
# Disabled sketch: print the metadata of each retrieved document to check the
# ratings. NOTE(review): `response` is never assigned in the cells above, so
# this loop cannot run as written - presumably it was meant to iterate over
# the "context" documents returned by the chain; confirm before re-enabling.
# for docs in response:
#     print(docs.metadata)


### The rating values are always greater than 7

### I want to look deeper into what is happening inside the self query retriever

In [54]:
from langchain.globals import set_verbose, set_debug
# Trace the retriever's internals (query-constructor prompt, LLM call,
# resulting documents) for this one call only. set_debug() flips a
# process-wide flag, so it is switched back off afterwards; otherwise every
# other chain in the notebook keeps printing debug traces on later or
# out-of-order runs.
set_debug(True)
try:
    debug_docs = self_query_retriever.invoke(
        "Make a summary of the reviews that talk about John Wick 3 and have a score higher than 7"
    )
finally:
    set_debug(False)

# Last expression of the cell: display the retrieved documents.
debug_docs

[32;1m[1;3m[chain/start][0m [1m[1:retriever:Retriever > 2:chain:query_constructor] Entering Chain run with input:
[0m{
  "query": "Make a summary of the reviews that talk about John Wick 3 and have a score higher than 7"
}
[32;1m[1;3m[chain/start][0m [1m[1:retriever:Retriever > 2:chain:query_constructor > 3:prompt:FewShotPromptTemplate] Entering Prompt run with input:
[0m{
  "query": "Make a summary of the reviews that talk about John Wick 3 and have a score higher than 7"
}
[36;1m[1;3m[chain/end][0m [1m[1:retriever:Retriever > 2:chain:query_constructor > 3:prompt:FewShotPromptTemplate] [1ms] Exiting Prompt run with output:
[0m[outputs]
[32;1m[1;3m[llm/start][0m [1m[1:retriever:Retriever > 2:chain:query_constructor > 4:llm:ChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: Your goal is to structure the user's query to match the request schema provided below.\n\n<< Structured Request Schema >>\nWhen responding use a markdown code snippet with a 

[Document(page_content=": 24\nReview: John Wick: Chapter 3 - Parabellum is quite literally about consequences, dealing with the fallout of John's actions at the end of the previous film and sending him on an even bigger odyssey of violence that continues to explore this world of assassination and deliver beautifully clean action sequences.", metadata={'Author': 'masonsaul', 'Movie_Title': 'John Wick 3', 'Rating': 10, 'Review_Date': '17 May 2019', 'Review_Title': ' Makes John Wick a superb trilogy\n', 'Review_Url': '/review/rw4860603/?ref_=tt_urv', 'row': 24, 'source': 'data/john_wick_3.csv'}),
 Document(page_content=": 13\nReview: Following on from two deliriously entertaining, visually gorgeous and blissfully simplistic thrillers, John Wick: Chapter 3 - Parabellum keeps up the franchise's unique appeal in stunning fashion. Complete with electrifying action, beautiful cinematography, a pulsating score and a great sense of humour, the film is pretty much as purely joyful as action thril

### Finally, the query is "talk bad about the movie" and the filter is "Rating" greater than 7