This notebook focuses on testing the Self Query Retriever from LangChain. The goal is to be able to filter through the metadata of the vector store before going through the actual chunked documents to enable faster information retrieval. We used this link as the main reference of the notebook.
https://python.langchain.com/docs/modules/data_connection/retrievers/self_query/

In [None]:
import os
import pandas as pd
import json
from langchain.document_loaders import PyPDFLoader  # pip install langchain; pip install pypdf
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings  # pip install sentence-transformers
from langchain.vectorstores import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.llms.sagemaker_endpoint import LLMContentHandler
from langchain import SagemakerEndpoint
from langchain.schema import Document
# pip install chromadb
# pip install lark

In [65]:
class ContentHandler(LLMContentHandler):
    content_type = "application/json"
    accepts = "application/json"
 
    def transform_input(self, prompt: str, model_kwargs: dict) -> bytes:
        input_str = json.dumps({"inputs" : [[{"role" : "system",
        "content" : "You are a kind robot."},
        {"role" : "user", "content" : prompt},]],
        "parameters" : {**model_kwargs}})
        return input_str.encode("utf-8")
    
    def transform_output(self, output: bytes) -> str:
        global response_json
        response_json = json.loads(output.read().decode("utf-8"))
        print('print out the OUTPUT1', response_json[0]["generation"]["content"])
        print('print out the OUTPUT2', json.dumps(response_json))
        return response_json[0]["generation"]["content"]
        
        
        # print('print out the OUTPUT2', json.dumps(response_json))
        '''
        json_extract = response_json[0]['generation']['content'].split('}')[0]+'}'
        query_ = json.loads(json_extract)['query']
        filter_ = json.loads(json_extract)['filter']
        query_=query_.replace('"','\\\"')
        filter_=filter_.replace('"','\\\"')
        output_format = " {\n\"query\": \"queryInput\",\n\"filter\": \"filterInput\"\n}"
        output_format = output_format.replace('queryInput',query_).replace('filterInput',filter_)
        print("output_format", output_format)
        print(" {\n\"query\": \"\",\n\"filter\": \"gt(\\\"rating\\\", 8.5)\"\n}")
        '''
        # return " {\n\"query\": \"\",\n\"filter\": \"gt(\\\"rating\\\", 8.5)\"\n}"
        # return output_format

Make a few modifications to the original content handler class. What we discovered is that, when we hardcode the function output to: return " {\n\"query\": \"\",\n\"filter\": \"gt(\\\"rating\\\", 8.5)\"\n}", we will not see an error when calling the self query retriever. It is reasonable to assume that the Invalid JSON error (when calling Langchain's self query retriever) came from the fact that Llama2 produces output that does not fit the requirement of LangChain's self query retriever. 

In [3]:
'''
class ContentHandler(LLMContentHandler):
    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs: dict) -> bytes:
        # print(prompt)
        input_str = json.dumps({"inputs" : [[{"role" : "system",
        "content" : "You are a kind robot."},
        {"role" : "user", "content" : prompt},]],
        "parameters" : {**model_kwargs}})
        # print(input_str)
        return input_str.encode("utf-8")
    
    def transform_output(self, output: bytes) -> str:
        # print(output.read().decode("utf-8"))
        response_json = json.loads(output.read().decode("utf-8"))
        print('print out the OUTPUT', response_json[0]["generation"]["content"])
        return response_json[0]["generation"]["content"]
'''

Below is the original content handler class where we invoke our Llama2 endpoint.

In [56]:
endpoint_name = "JumpStartModel-Llama2-70b-Chat-network-isolated"
region = os.environ["AWS_REGION"]

content_handler = ContentHandler()
llm = SagemakerEndpoint(
     endpoint_name = endpoint_name,
     region_name = region, 
     model_kwargs = {"max_new_tokens": 700, "top_p": 0.9, "temperature": 0.6, "return_full_text":True},
     endpoint_kwargs = {"CustomAttributes": 'accept_eula = true'},
     content_handler = content_handler
 )

In [6]:
docs_exp = [
    Document(
        page_content="A bunch of scientists bring back dinosaurs and mayhem breaks loose",
        metadata={"year": 1993, "rating": 7.7, "genre": "science fiction"}),
    Document(
        page_content="Leo DiCaprio gets lost in a dream within a dream within a dream within a ...",
        metadata={"year": 2010, "director": "Christopher Nolan", "rating": 8.2}),
    Document(
        page_content="A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea",
        metadata={"year": 2006, "director": "Satoshi Kon", "rating": 8.6})]

In [7]:
embedding_function = SentenceTransformerEmbeddings(model_name = "all-MiniLM-L6-v2")
vectorstore = Chroma.from_documents(persist_directory = 'db', documents = docs_exp, embedding = embedding_function)

In [8]:
metadata_field_info = [
    AttributeInfo(name="genre", description="The genre of the movie", type="string or list[string]"),
    AttributeInfo(name="year", description="The year the movie was released", type="integer"),
    AttributeInfo(name="director", description="The name of the movie director", type="string"),
    AttributeInfo(name="rating", description="A 1-10 rating for the movie", type="float")]
document_content_description = "Brief summary of a movie"

In [66]:
retriever = SelfQueryRetriever.from_llm(
    llm, vectorstore, document_content_description, metadata_field_info, verbose=True
)

In [67]:
retriever.invoke("I want to watch a movie rated higher than 8.5")

print out the OUTPUT1  {
"query": "",
"filter": "and(gt(\"rating\", 8.5)"
}

Please provide the actual data source and user query you would like me to read and I will be happy to assist you in structuring the user's query to match the request schema.
print out the OUTPUT2 [{"generation": {"role": "assistant", "content": " {\n\"query\": \"\",\n\"filter\": \"and(gt(\\\"rating\\\", 8.5)\"\n}\n\nPlease provide the actual data source and user query you would like me to read and I will be happy to assist you in structuring the user's query to match the request schema."}}]


OutputParserException: Parsing text
 {
"query": "",
"filter": "and(gt(\"rating\", 8.5)"
}

Please provide the actual data source and user query you would like me to read and I will be happy to assist you in structuring the user's query to match the request schema.
 raised following error:
Got invalid JSON object. Error: Extra data: line 6 column 1 (char 54)

In [None]:
'''
from langchain.schema import Document
docs_exp = [
    Document(
        page_content="A bunch of scientists bring back dinosaurs and mayhem breaks loose",
        metadata={"year": 1993, "rating": 7.7, "genre": "science fiction"},
    ),
    Document(
        page_content="Leo DiCaprio gets lost in a dream within a dream within a dream within a ...",
        metadata={"year": 2010, "director": "Christopher Nolan", "rating": 8.2},
    ),
    Document(
        page_content="A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea",
        metadata={"year": 2006, "director": "Satoshi Kon", "rating": 8.6},
    ),
    Document(
        page_content="A bunch of normal-sized women are supremely wholesome and some men pine after them",
        metadata={"year": 2019, "director": "Greta Gerwig", "rating": 8.3},
    ),
    Document(
        page_content="Toys come alive and have a blast doing so",
        metadata={"year": 1995, "genre": "animated"},
    ),
    Document(
        page_content="Three men walk into the Zone, three men walk out of the Zone",
        metadata={
            "year": 1979,
            "director": "Andrei Tarkovsky",
            "genre": "science fiction",
            "rating": 9.9,
        },
    ),
]