In [2]:
# LLM loading

from langchain.chat_models import ChatOpenAI

# Chat model reused by the multi-query retriever, the self-query retriever
# and the conversational chain further down. Temperature is pinned to 0.
llm_name = "gpt-3.5-turbo"
llm = ChatOpenAI(
    temperature=0,
    model_name=llm_name,
)

In [3]:
# Vectorstore loading

from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

# Embedding function handed to the Chroma store below.
embeddings = OpenAIEmbeddings()

# Chroma store backed by the ./vectorstore_nltk directory.
# The path is a plain string literal: the previous f-string had no
# placeholders, so the f prefix was dropped (same value at runtime).
vectorstore = Chroma(
    persist_directory="./vectorstore_nltk",
    embedding_function=embeddings,
)

In [4]:
# BaseRetriever

from langchain.retrievers import ContextualCompressionRetriever

# Plain vector-store retriever: MMR search with k=2.
# It is wrapped by the multi-query retriever in the next cell.
base_retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 2},
)

In [5]:
# MultiQueryRetriever

from langchain.retrievers.multi_query import MultiQueryRetriever

# Multi-query retriever built from the shared LLM on top of base_retriever.
multi_query = MultiQueryRetriever.from_llm(
    llm=llm,
    retriever=base_retriever,
)

In [6]:
# SelfQueryRetriever

from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever

# Metadata schema passed to SelfQueryRetriever.from_llm below. Each entry
# declares one metadata attribute (name, description, type). The description
# strings are part of what the retriever is given, so typos in them were
# corrected ("te" -> "the", "possible value are" -> "possible values are").
metadata_field_info = [
    AttributeInfo(
        name="category",
        description="Category of the text content - possible values are NarrativeText and Title",
        type="string",
    ),
    AttributeInfo(
        name="year",
        description="Year when the document was published",
        type="integer",
    ),
    AttributeInfo(
        name="applicant",
        description="Name of the person that filed the patent",
        type="string",
    ),
    AttributeInfo(
        name="day",
        description="Day when the document was published",
        type="integer",
    ),
    AttributeInfo(
        name="month",
        description="Month when the document was published",
        type="integer",
    ),
    AttributeInfo(
        name="filename",
        description="Name of the file",
        type="string",
    ),
    AttributeInfo(
        name="id",
        description="Document ID",
        type="string",
    ),
    # NOTE(review): page_number is declared as a string while year/day/month
    # are integers - confirm this matches how the metadata was stored.
    AttributeInfo(
        name="page_number",
        description="Page number from the original document",
        type="string",
    ),
    AttributeInfo(
        name="register_num",
        description="Patent registration number",
        type="string",
    ),
    AttributeInfo(
        name="source",
        description="Source that published the document.",
        type="string",
    ),
    AttributeInfo(
        name="title",
        description="Document title",
        type="string",
    ),
    AttributeInfo(
        name="type",
        description="Type of the document - possible values are article, lecture and patent",
        type="string",
    ),
]

# Short description of what the stored documents contain.
document_content_description = "Document content"

# Self-query retriever over the same vector store, using the shared LLM and
# the metadata schema above. verbose=True is kept from the original.
self_query_retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectorstore,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    verbose=True,
)

In [7]:
# EnsembleRetriever

from langchain.retrievers import EnsembleRetriever

# Combine the multi-query and self-query retrievers into one retriever.
# No weights are passed, so EnsembleRetriever's own defaults apply.
ensemble_retriever = EnsembleRetriever(
    retrievers=[
        multi_query,
        self_query_retriever,
    ],
)

In [8]:
# Prompt template

from langchain import PromptTemplate

# Question-answering prompt with two placeholders: {context} and {question}.
# NOTE(review): within the visible notebook this template is never passed to
# the ConversationalRetrievalChain built in the next cell, so it currently has
# no effect on the answers - confirm whether it was meant to be wired in.
prompt_template = PromptTemplate.from_template(
    """Let's think step by step. Use the following pieces of context to answer the question at the end.
    If you don't know the answer, just say that you don't know, don't try to
    make up an answer.

    Context:
    {context}

    Question: {question}
    Helpful Answer:"""
)


In [9]:
# Chain definition

from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

# Buffer memory exposed to the chain under the "chat_history" key.
memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")

# Conversational retrieval chain over the ensemble retriever, combining the
# retrieved documents with the 'map_reduce' chain type.
# NOTE(review): prompt_template from the previous cell is not passed here -
# confirm whether the chain was meant to use it.
chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=ensemble_retriever,
    chain_type="map_reduce",
    memory=memory,
)

In [12]:
# Test

import datetime
from pathlib import Path

# Take the timestamp once so the file name and the header line always agree.
run_started = str(datetime.datetime.now())

# Spaces and colons are replaced with underscores to get a portable file name.
file_name = f"responses/responses_{run_started.replace(' ', '_').replace(':', '_')}.txt"

# Create the output directory up front; open() would otherwise raise
# FileNotFoundError on a fresh checkout without a responses/ folder.
Path(file_name).parent.mkdir(parents=True, exist_ok=True)

queries = ["What did Nikola Tesla think about Mars?",
           "How many patents did Nikola Tesla file between 1890. and 1895.?",
           "How did Nikola Tesla envision the future of travel?",
           "What is Tesla's description of the human eye?",
           "Where was Nikola Tesla born?"]

# Explicit UTF-8 so model output with non-ASCII characters cannot fail on
# platforms whose default text encoding is narrower.
with open(file_name, 'w', encoding='utf-8') as file:
    file.write(f"{run_started}\n\n")

# NOTE(review): `chain` was built with a ConversationBufferMemory, so chat
# history carries over from one query to the next - confirm that is intended
# for what look like independent test questions.
for q in queries:
    # Re-open in append mode per query so finished answers are already on disk
    # if a later query fails hard or the kernel is interrupted.
    with open(file_name, 'a', encoding='utf-8') as file:
        file.write(f"- Query: {q}\n")
        file.write("   - Split: nltk\n")
        file.write("      - Retriever: ensemble\n")
        try:
            file.write(f"         - Response: {chain.run(q)}\n")
        except Exception as e:
            # Best-effort run: record the failure and continue with the next query.
            file.write(f"         - ERROR: {e}\n")
        file.write("\n")



query='Nikola Tesla Mars' filter=None limit=None
query=' ' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='applicant', value='Nikola Tesla'), Comparison(comparator=<Comparator.GTE: 'gte'>, attribute='year', value=1890), Comparison(comparator=<Comparator.LTE: 'lte'>, attribute='year', value=1895), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='type', value='patent')]) limit=None
query='Nikola Tesla vision future travel' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='applicant', value='Nikola Tesla') limit=None
query='human eye' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='applicant', value='Nikola Tesla') limit=None
query='Nikola Tesla' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='title', value='Birth') limit=None
