In [1]:
# LLM loading

from langchain.chat_models import ChatOpenAI

# Chat model shared by every retriever and chain in this notebook.
# The temperature is fixed at 0.
llm_name = "gpt-3.5-turbo"
llm = ChatOpenAI(
    model_name=llm_name,
    temperature=0,
)

In [11]:
# Vectorstore loading

from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

# One embedding function is shared by all persisted Chroma stores.
embeddings = OpenAIEmbeddings()

# Each name maps to a persisted store directory `./vectorstore_<name>`.
names = ['elements', 'nltk', 'recursive']

# Store name -> Chroma vector store loaded from its persist directory.
all_bases = {
    name: Chroma(
        persist_directory=f"./vectorstore_{name}",
        embedding_function=embeddings,
    )
    for name in names
}

# Store name -> {retriever kind -> retriever}; filled in by the cells below.
all_retrievers = {name: {} for name in names}

In [14]:
# BaseRetriever

# Register a plain vector-store retriever (MMR search) for every store.
for name in names:
    all_retrievers[name].update(
        base=all_bases[name].as_retriever(search_type='mmr'),
    )

In [16]:
# MultiQueryRetriever

from langchain.retrievers.multi_query import MultiQueryRetriever

# Register a MultiQueryRetriever per store; each one wraps its own freshly
# created MMR retriever over that store and uses the shared `llm`.
for name in names:
    mmr_retriever = all_bases[name].as_retriever(search_type='mmr')
    all_retrievers[name]['multi_query'] = MultiQueryRetriever.from_llm(
        retriever=mmr_retriever,
        llm=llm,
    )

In [None]:
# ContextualCompressionRetriever
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# Register a ContextualCompressionRetriever per store: an LLMChainExtractor
# built from `llm` is layered on top of the store's already-registered
# 'base' retriever (so the BaseRetriever cell must have run first).
for name in names:
    all_retrievers[name]['compression_retriever'] = ContextualCompressionRetriever(
        base_compressor=LLMChainExtractor.from_llm(llm),
        base_retriever=all_retrievers[name]['base'],
    )

In [17]:
# Display the nested registry built so far: store name -> retriever kind -> retriever object.
# Only retriever kinds whose cells have already been executed appear here.
all_retrievers

{'elements': {'base': VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], metadata=None, vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x7f47f3b1fc10>, search_type='mmr', search_kwargs={}),
  'multi_query': MultiQueryRetriever(tags=None, metadata=None, retriever=VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], metadata=None, vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x7f47f3b1fc10>, search_type='mmr', search_kwargs={}), llm_chain=LLMChain(memory=None, callbacks=None, callback_manager=None, verbose=False, tags=None, metadata=None, prompt=PromptTemplate(input_variables=['question'], output_parser=None, partial_variables={}, template='You are an AI language model assistant. Your task is \n    to generate 3 different versions of the given user \n    question to retrieve relevant documents from a vector  database. \n    By generating multiple perspectives on the user question, \n    your goal is to help the user overcome some of the limitations

In [None]:
# SelfQueryRetriever

from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever

# (attribute name, description) pairs for the metadata fields exposed to the
# self-query retriever. Every attribute is declared with type "string".
# NOTE(review): `page_number` is also declared as a string — confirm this
# matches how the value is stored in the vector store metadata.
_attribute_specs = (
    ("category", "Category of te text content - possible values are NarrativeText and Title"),
    ("date", "Date when the document was published"),
    ("filename", "Name of the file"),
    ("id", "Document ID"),
    ("page_number", "Page number from the original document"),
    ("register_num", "Patent registration number"),
    ("source", "Source that published the document"),
    ("title", "Document title"),
    ("type", "Type of the document - possible value are article, lecture and patent"),
)

metadata_field_info = [
    AttributeInfo(name=attr_name, description=attr_description, type="string")
    for attr_name, attr_description in _attribute_specs
]

document_content_description = "Document content"


def build_self_query_retriever(vectorstore):
    """Create a verbose SelfQueryRetriever over ``vectorstore``.

    Args:
        vectorstore: The vector store the retriever should query.

    Returns:
        A ``SelfQueryRetriever`` built with the notebook-level ``llm``,
        ``document_content_description`` and ``metadata_field_info``.
    """
    return SelfQueryRetriever.from_llm(
        llm=llm,
        vectorstore=vectorstore,
        document_contents=document_content_description,
        metadata_field_info=metadata_field_info,
        verbose=True,
    )


# The stores are kept in `all_bases`; no `vectorstore_<name>` variables are
# defined anywhere in this notebook, so referencing them fails on a fresh
# kernel. Build one self-query retriever per store and register it next to
# the other retriever kinds.
for name in names:
    all_retrievers[name]['self_query'] = build_self_query_retriever(all_bases[name])

# Per-store names kept so that cells referring to them keep working.
self_query_retriever_elements = all_retrievers['elements']['self_query']
self_query_retriever_nltk = all_retrievers['nltk']['self_query']
self_query_retriever_recursive = all_retrievers['recursive']['self_query']

In [None]:
# `retriever` is not assigned anywhere before this cell, so bind it explicitly.
# The recorded output below is a structured query with a filter on `type`,
# i.e. it was produced by a self-query retriever.
# NOTE(review): the 'elements' store is an assumption — switch to
# self_query_retriever_nltk / self_query_retriever_recursive if another store was intended.
retriever = self_query_retriever_elements
docs = retriever.get_relevant_documents("What lectures did Nikola Tesla give")



query='Nikola Tesla' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='type', value='lecture') limit=None


In [None]:
# Source file name of every retrieved document, in retrieval order.
[document.metadata['filename'] for document in docs]

['On Light and Other High Frequency Phenomena.docx',
 'Experiments With Alternate Currents of High Potential and High Frequency (lecture).docx',
 'A New System of Alternate Current Motors and Transformers (lecture).docx',
 '8 High Frequency Oscillators for Electro-Therapeutic and Other Purposes (lecture).docx']

In [None]:
# Basic Retriever example

from langchain.chains import RetrievalQA

question = "How many poles shoul my electromotor have, and what should I do if I have the wrong number?"

# Build a RetrievalQA chain on top of the current `retriever`, run the
# question through it and print the raw result dict.
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
)
qa_result = qa_chain({"query": question})
print(qa_result)

query='electromotor poles' filter=None limit=None
{'query': 'How many poles shoul my electromotor have, and what should I do if I have the wrong number?', 'result': 'Based on the given context, the number of poles in an electromotor can vary. The motor X has eight poles, the motor Y has six poles, and the motor Z has four poles. The number of poles determines the speed of the motor. If you have the wrong number of poles, you can change the electrical connections to achieve the desired speed. For example, in motor X, you can alternate between two like and two opposite poles to effectively reduce the number of poles by half and double the speed of the motor.'}


In [None]:
# Retriever with memory example

from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

# Conversation memory stored under the "chat_history" key and passed to the chain below.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# No variable named `vectorstore` is defined in this notebook; the persisted
# stores live in `all_bases`, so the retriever is built from there.
# NOTE(review): the 'elements' store is an assumption — change the key if a
# different store was intended.
retriever = all_bases['elements'].as_retriever()
qa = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

# Two interactive turns: read a question from stdin, echo it, print the answer.
# The second turn goes through the same chain and therefore the same memory.
for turn in range(2):
    question = input()
    print(f"Question: {question}")
    print(f"Answer: {qa({'question': question})['answer']}")

In [18]:
# Question-answering prompt template with `{context}` and `{query}` placeholders.
# The `// ...` annotations are inside the string, so they are part of the
# prompt text rather than Python comments.
q = (
    "Use the following pieces of context to answer the question at the end.\n"
    "If you don't know the answer, just say that you don't know, don't try to\n"
    "make up an answer.\n"
    "\n"
    "{context} // i.e the pdf text content\n"
    "\n"
    "Question: {query} // i.e our actualy query, 'Who is the CV about?'\n"
    "Helpful Answer:"
)