# Using a Retriever

This example showcases question answering over an index.

In [1]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import RetrievalQA
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.document_loaders import TextLoader
from langchain.embeddings import LlamaCppEmbeddings
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

In [30]:
# GGUF model file; the same path is reused for the LLM further down.
# Alternative quantization: "../../models/zephyr-7b-beta.Q4_K_M.gguf"
llama_model_path = "../../models/zephyr-7b-beta.Q8_0.gguf"

# Context window size handed to llama.cpp (shared by the embedder and the LLM).
n_ctx = 3096

# Use the Llama model itself to produce the embeddings.
embeddings = LlamaCppEmbeddings(
    model_path=llama_model_path,
    n_ctx=n_ctx,
)

llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from ../../models/zephyr-7b-beta.Q8_0.gguf (version unknown)
llama_model_loader: - tensor    0:                token_embd.weight q8_0     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q8_0     [ 14336,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q8_0     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q8_0     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q8_0     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q8_0     [  4096,  4096,     1,     1 

In [31]:
# Per the original note: with Metal, a value of 1 is enough.
n_gpu_layers = 1
# Keep between 1 and n_ctx; size it to the RAM available on your Apple Silicon chip.
n_batch = 512
# Stream generated tokens to stdout as they are produced.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Double-check that the model path is valid on your machine before running.
llm = LlamaCpp(
    model_path=llama_model_path,
    n_ctx=n_ctx,
    n_batch=n_batch,
    n_gpu_layers=n_gpu_layers,
    # Must stay True; otherwise problems show up after a couple of calls.
    f16_kv=True,
    callback_manager=callback_manager,
    verbose=True,
)

llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from ../../models/zephyr-7b-beta.Q8_0.gguf (version unknown)
llama_model_loader: - tensor    0:                token_embd.weight q8_0     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q8_0     [ 14336,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q8_0     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q8_0     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q8_0     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q8_0     [  4096,  4096,     1,     1 

In [4]:
# Load the speech as a single document. The text contains non-ASCII punctuation
# (curly apostrophes, em dashes), so the encoding is stated explicitly instead of
# relying on the platform default.
# NOTE(review): the recorded outputs further down report the source as
# '../../datasets/state_of_the_union.txt' — confirm this relative path matches the
# directory the notebook is run from.
loader = TextLoader("datasets/state_of_the_union.txt", encoding="utf-8")
documents = loader.load()

# Split into chunks with a target size of 1000 characters and no overlap.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)


In [5]:
# Preview only the first few chunks; displaying the whole list floods the notebook output.
texts[:3]

[Document(page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.', metadata={'source'

In [6]:

# Embed every chunk with the Llama embedder and index the vectors in Chroma.
docsearch = Chroma.from_documents(documents=texts, embedding=embeddings)




llama_print_timings:        load time =  9188.67 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time = 18179.35 ms /   232 tokens (   78.36 ms per token,    12.76 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 18210.96 ms

llama_print_timings:        load time =  9188.67 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =  9395.42 ms /   232 tokens (   40.50 ms per token,    24.69 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =  9425.73 ms

llama_print_timings:        load time =  9188.67 ms
llama_print_timings:   

In [7]:
# Retrieval QA using the "stuff" chain type and the vector store's default retriever.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
)

In [8]:
# Ask the chain a question; the returned answer string is shown as the cell output.
query = "What did the president say about Ketanji Brown Jackson"
qa.run(query)


llama_print_timings:        load time =  9188.67 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =   527.11 ms /    13 tokens (   40.55 ms per token,    24.66 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =   528.95 ms


 The president nominated Ketanji Brown Jackson to serve on the United States Supreme Court and praised her qualifications as a top legal mind with a background in private practice, public defense, and consensus building, as well as support from both law enforcement organizations and former judges appointed by Democrats and Republicans.


llama_print_timings:        load time =  2514.62 ms
llama_print_timings:      sample time =    80.11 ms /    60 runs   (    1.34 ms per token,   749.01 tokens per second)
llama_print_timings: prompt eval time =  4080.24 ms /   825 tokens (    4.95 ms per token,   202.19 tokens per second)
llama_print_timings:        eval time =  3103.61 ms /    59 runs   (   52.60 ms per token,    19.01 tokens per second)
llama_print_timings:       total time =  7451.88 ms


' The president nominated Ketanji Brown Jackson to serve on the United States Supreme Court and praised her qualifications as a top legal mind with a background in private practice, public defense, and consensus building, as well as support from both law enforcement organizations and former judges appointed by Democrats and Republicans.'

# Chain Type
You can easily specify different chain types to load and use in the RetrievalQA chain. For a more detailed walkthrough of these types, please see [this notebook](https://python.langchain.com/docs/modules/chains/additional/question_answering).

There are two ways to load different chain types. First, you can specify the chain type argument in the `from_chain_type` method. This allows you to pass in the name of the chain type you want to use. For example, in the cell below we change the chain type to `map_reduce`.

In [13]:
# Same retriever as before, but answers are produced with the "map_reduce" chain type.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="map_reduce",
    retriever=docsearch.as_retriever(),
)

In [14]:
# Ask the same question through the map_reduce chain; the final answer string is the cell output.
query = "What did the president say about Ketanji Brown Jackson"
qa.run(query)


llama_print_timings:        load time =  9188.67 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =  1564.43 ms /    13 tokens (  120.34 ms per token,     8.31 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =  1567.46 ms


 The president said nothing about Ketanji Brown Jackson in this portion of the document.


llama_print_timings:        load time =  1221.49 ms
llama_print_timings:      sample time =    21.53 ms /    19 runs   (    1.13 ms per token,   882.33 tokens per second)
llama_print_timings: prompt eval time =  1221.45 ms /   171 tokens (    7.14 ms per token,   140.00 tokens per second)
llama_print_timings:        eval time =   891.01 ms /    18 runs   (   49.50 ms per token,    20.20 tokens per second)
llama_print_timings:       total time =  2188.72 ms
Llama.generate: prefix-match hit


 President Biden recently nominated Judge Ketanji Brown Jackson to fill a vacancy on the United States Supreme Court. In remarks at the White House Rose Garden, Biden praised Jackson's experience and intellect, adding that "she will be an outstanding Justice of the Supreme Court."


llama_print_timings:        load time =  1221.49 ms
llama_print_timings:      sample time =    74.70 ms /    56 runs   (    1.33 ms per token,   749.64 tokens per second)
llama_print_timings: prompt eval time =   940.36 ms /   215 tokens (    4.37 ms per token,   228.64 tokens per second)
llama_print_timings:        eval time =  2737.58 ms /    55 runs   (   49.77 ms per token,    20.09 tokens per second)
llama_print_timings:       total time =  3922.74 ms
Llama.generate: prefix-match hit


 "A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans."
Answer: This text describes Ketanji Brown Jackson's background and qualifications for serving as a judge. It highlights her experience as both a private litigator and a federal public defender, as well as her family's connection to education and law enforcement. The text also notes that she has received support from various organizations and former judges across the political spectrum since being nominated. However, it does not explicitly state what the president said about Jackson.


llama_print_timings:        load time =  1221.49 ms
llama_print_timings:      sample time =   234.73 ms /   162 runs   (    1.45 ms per token,   690.16 tokens per second)
llama_print_timings: prompt eval time =  1057.72 ms /   230 tokens (    4.60 ms per token,   217.45 tokens per second)
llama_print_timings:        eval time =  8017.20 ms /   161 runs   (   49.80 ms per token,    20.08 tokens per second)
llama_print_timings:       total time =  9844.73 ms
Llama.generate: prefix-match hit


 The confirmation of Ketanji Brown Jackson to the Supreme Court was not specifically mentioned in this portion of the document.


llama_print_timings:        load time =  1221.49 ms
llama_print_timings:      sample time =    34.61 ms /    25 runs   (    1.38 ms per token,   722.36 tokens per second)
llama_print_timings: prompt eval time =  1199.98 ms /   267 tokens (    4.49 ms per token,   222.50 tokens per second)
llama_print_timings:        eval time =  1187.68 ms /    24 runs   (   49.49 ms per token,    20.21 tokens per second)
llama_print_timings:       total time =  2497.96 ms
Llama.generate: prefix-match hit


 The president did not mention the confirmation of Ketanji Brown Jackson in this portion of the document.


llama_print_timings:        load time =  1221.49 ms
llama_print_timings:      sample time =    28.60 ms /    22 runs   (    1.30 ms per token,   769.18 tokens per second)


' The president did not mention the confirmation of Ketanji Brown Jackson in this portion of the document.'

llama_print_timings: prompt eval time =  9742.19 ms /  1790 tokens (    5.44 ms per token,   183.74 tokens per second)
llama_print_timings:        eval time =  1232.04 ms /    21 runs   (   58.67 ms per token,    17.04 tokens per second)
llama_print_timings:       total time = 11079.12 ms


# Custom Prompts

You can pass in custom prompts to do question answering. These are the same prompts that you can pass into the [base question answering chain](https://python.langchain.com/docs/modules/chains/additional/question_answering).

In [18]:
# Custom prompt with two placeholders, {context} and {question}; it instructs the
# model to admit when it does not know and to answer in Chinese.
prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Answer in Chinese:"""
PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

In [19]:
# Hand the custom prompt to the underlying "stuff" chain via chain_type_kwargs.
chain_type_kwargs = {"prompt": PROMPT}
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
    chain_type_kwargs=chain_type_kwargs,
)

In [20]:
# Re-ask the question through the chain that uses the custom prompt.
query = "What did the president say about Ketanji Brown Jackson"
qa.run(query)


llama_print_timings:        load time =  9188.67 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =  1871.98 ms /    13 tokens (  144.00 ms per token,     6.94 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =  1875.99 ms
Llama.generate: prefix-match hit


 总统在谈论了哪一位法官？
(President spoke highly of Judge Ketanji Brown Jackson and nominated her for a position on the United States Supreme Court.)


llama_print_timings:        load time =  1221.49 ms
llama_print_timings:      sample time =    53.48 ms /    43 runs   (    1.24 ms per token,   804.11 tokens per second)
llama_print_timings: prompt eval time =   470.35 ms /     2 tokens (  235.18 ms per token,     4.25 tokens per second)
llama_print_timings:        eval time =  2210.14 ms /    42 runs   (   52.62 ms per token,    19.00 tokens per second)
llama_print_timings:       total time =  2850.54 ms


' 总统在谈论了哪一位法官？\n(President spoke highly of Judge Ketanji Brown Jackson and nominated her for a position on the United States Supreme Court.)'

# Vectorstore Retriever Options

You can adjust how documents are retrieved from your vectorstore depending on the specific task.

There are two main ways to retrieve documents relevant to a query: Similarity Search and Max Marginal Relevance Search (MMR Search). Similarity Search is the default, but you can use MMR by setting the `search_type` parameter:

```python
docsearch.as_retriever(search_type="mmr")
```

You can also modify the search by passing specific search arguments through the retriever to the search function, using the search_kwargs keyword argument.

* `k` defines how many documents are returned; defaults to 4.
* `score_threshold` allows you to set a minimum relevance for documents returned by the retriever, if you are using the "similarity_score_threshold" search type.
* `fetch_k` determines the number of documents to pass to the MMR algorithm; defaults to 20.
* `lambda_mult` controls the diversity of results returned by the MMR algorithm, with 1 being minimum diversity and 0 being maximum. Defaults to 0.5.
* `filter` allows you to define a filter on what documents should be retrieved, based on the documents' metadata. This has no effect if the Vectorstore doesn't store any metadata.

Some examples of how these parameters can be used:

In [21]:
# MMR with six results and a low lambda_mult for more diverse hits;
# helpful when the dataset contains many similar documents.
docsearch.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 6, "lambda_mult": 0.25},
)

# Let MMR consider 50 candidates, but return only the top 5.
docsearch.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 5, "fetch_k": 50},
)

# Keep only documents whose relevance score clears the threshold.
docsearch.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.8},
)

# Return just the single most similar document.
docsearch.as_retriever(search_kwargs={"k": 1})

# Restrict retrieval by metadata, here to documents from one specific paper.
docsearch.as_retriever(
    search_kwargs={"filter": {"paper_title": "GPT-4 Technical Report"}},
)

VectorStoreRetriever(tags=['Chroma', 'LlamaCppEmbeddings'], vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x11764c150>, search_kwargs={'filter': {'paper_title': 'GPT-4 Technical Report'}})

# Return Source Documents

Additionally, we can return the source documents used to answer the question by specifying an optional parameter when constructing the chain.

In [22]:
# "stuff" chain over an MMR retriever (30 candidates fetched); the chain also
# returns the documents it used alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(
        search_type="mmr",
        search_kwargs={"fetch_k": 30},
    ),
    return_source_documents=True,
)

In [23]:
# Call the chain with a dict input and keep the result so its fields can be
# inspected in the following cells.
query = "What did the president say about Ketanji Brown Jackson"
result = qa({"query": query})


llama_print_timings:        load time =  9188.67 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =  1811.20 ms /    13 tokens (  139.32 ms per token,     7.18 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =  1814.76 ms
Llama.generate: prefix-match hit


 The president nominated Judge Ketanji Brown Jackson for a position on the Supreme Court, calling her "one of our nation's top legal minds" who will continue Justice Stephen Breyer's legacy of excellence.


llama_print_timings:        load time =  1221.49 ms
llama_print_timings:      sample time =    68.44 ms /    47 runs   (    1.46 ms per token,   686.72 tokens per second)
llama_print_timings: prompt eval time =  4965.19 ms /   902 tokens (    5.50 ms per token,   181.66 tokens per second)
llama_print_timings:        eval time =  2478.09 ms /    46 runs   (   53.87 ms per token,    18.56 tokens per second)
llama_print_timings:       total time =  7674.72 ms


In [24]:
# The generated answer text.
result["result"]

' The president nominated Judge Ketanji Brown Jackson for a position on the Supreme Court, calling her "one of our nation\'s top legal minds" who will continue Justice Stephen Breyer\'s legacy of excellence.'

In [25]:
# The source documents returned with the answer.
result["source_documents"]

[Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': '../../datasets/state_of_the_union.txt'}),
 Document(page_content='Last month, I announced our plan to supercharge  \nthe Cancer Moonshot that President Obama a

Alternatively, if our documents have a "source" metadata key, we can use the `RetrievalQAWithSourcesChain` to cite our sources:

In [27]:
# NOTE(review): left disabled — the recorded output for this cell is a TypeError
# ("got multiple values for keyword argument 'metadatas'"). `texts` holds Document
# objects rather than plain strings, which may be the cause — TODO confirm.
# docsearch = Chroma.from_texts(texts, embeddings, metadatas=[{"source": f"{i}-pl"} for i in range(len(texts))])

TypeError: langchain.vectorstores.chroma.Chroma.from_texts() got multiple values for keyword argument 'metadatas'

In [33]:
# QA chain variant that reports the sources it drew on together with the answer.
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
)

In [34]:
# Return only the chain's outputs (answer and sources), without echoing the inputs.
chain(
    {"question": "What did the president say about Justice Breyer"},
    return_only_outputs=True,
)


llama_print_timings:        load time =  9188.67 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =   664.35 ms /    11 tokens (   60.40 ms per token,    16.56 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =   666.85 ms


 The president announced the formation of a task force to go after the crimes of Russian oligarchs and seize their ill-gotten gains, including their yachts, luxury apartments, and private jets.
SOURCES: ../../datasets/


llama_print_timings:        load time =  2452.31 ms
llama_print_timings:      sample time =    92.38 ms /    55 runs   (    1.68 ms per token,   595.39 tokens per second)
llama_print_timings: prompt eval time = 12490.83 ms /  2493 tokens (    5.01 ms per token,   199.59 tokens per second)
llama_print_timings:        eval time =  3415.14 ms /    54 runs   (   63.24 ms per token,    15.81 tokens per second)
llama_print_timings:       total time = 16224.80 ms


{'answer': ' The president announced the formation of a task force to go after the crimes of Russian oligarchs and seize their ill-gotten gains, including their yachts, luxury apartments, and private jets.\n',
 'sources': '../../datasets/'}