Retrieving answers based on relevant vectors

In [1]:
import os
import streamlit as st
import pickle
import time
import langchain
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS



In [2]:
# Use the key already exported in the environment when there is one; the
# placeholder below is only a fallback, so a real key is never overwritten.
# Replace the placeholder locally if needed, and never commit a real key.
os.environ.setdefault('OPENAI_API_KEY', 'Your OPEN AI API Key')

In [3]:
# Initialise the LLM used for every question-answering call in this notebook.
llm = OpenAI(
    max_tokens=500,
    temperature=0.9,
)

  warn_deprecated(


<h3>Loading Data</h3>

In [23]:
# Build a loader over the two source articles and load them as documents.
loaders = UnstructuredURLLoader(
    urls=[
        "https://www.theguardian.com/us-news/2024/jan/10/nikki-haley-ron-desantis-iowa-republican-debate-candidates",
        "https://www.theguardian.com/us-news/2024/jan/05/nikki-haley-trump-cnn-town-hall-iowa",
    ],
)
data = loaders.load()

# Show how many documents were loaded.
len(data)

2

<h3>Creating Chunks</h3>

In [24]:
# Splitter configuration: chunk_size=1000 with chunk_overlap=200 between
# neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)

# `data` already holds documents, so split_documents is used here rather than
# split_text to produce the chunks.
docs = text_splitter.split_documents(data)

In [25]:
# Display the second chunk (index 1) as a quick look at the splitter output.
docs[1]

Document(page_content='The Florida governor slamming Haley for running “to do her donors’ bidding” and the former UN ambassador calling DeSantis a habitual liar. The tone early in the Iowa debate matched prior GOP debates, which were frequently hostile, with candidates hurling personal attacks at one another.\n\nTrump has repeatedly declined to debate his party’s opponents, and skipped this debate as well, instead participating in a town hall hosted by Fox News, also in Iowa.\n\nUS won’t survive four more years of Trump ‘chaos’, Nikki Haley saysRead more\n\nUnlike the prior debates, this one was not coordinated by the Republican National Committee (RNC), which decided in December to stop hosting GOP debates for the rest of the primary season.\n\nThe RNC debates narrowed the field of Republican contenders to five, and CNN’s debate requirement that candidates poll at 10% in at least three national or Iowa-based surveys has left only Haley, DeSantis and Trump qualifying.', metadata={'sour

<h3>Creating Embeddings and Storing it in FAISS index</h3>

In [26]:
# Embedding model handed to FAISS when building the index.
embeddings = OpenAIEmbeddings()

# Build the FAISS vector index from the chunks and the embedding model.
vectorindex_openai = FAISS.from_documents(
    docs,
    embeddings,
)

  warn_deprecated(


Storing Vector index locally

In [33]:
# Location of the pickled vector index; this pickle file acts as the vector database.
file_path = "vector_index.pkl"

# NOTE(review): FAISS stores may not pickle cleanly on every LangChain release;
# the library also offers save_local/load_local - confirm against the installed version.
with open(file_path, "wb") as index_file:
    pickle.dump(vectorindex_openai, index_file)

In [30]:
# Restore the vector index from the pickle file on disk.
# NOTE: pickle.load can execute arbitrary code while deserialising, so only
# load a file that this notebook itself produced - never an untrusted one.
if not os.path.exists(file_path):
    # Fail here with a clear message rather than silently leaving `vectorIndex`
    # undefined (or stale from an earlier run), which would otherwise only show
    # up later as a confusing NameError when the chain is built.
    raise FileNotFoundError(
        f"{file_path} not found - run the cell that stores the vector index first."
    )
with open(file_path, "rb") as f:
    vectorIndex = pickle.load(f)  # Now the vector index is loaded into memory.
# Our vector index will have the knowledge from the URL of articles provided.

<h3>Information Retrieval part</h3>

In [35]:
# The retriever determines how entries are fetched from the vector database
# for a given question.
chain = RetrievalQAWithSourcesChain.from_llm(
    retriever=vectorIndex.as_retriever(),
    llm=llm,
)
chain



In [36]:
# Question to ask against the indexed articles.
query = "What does Nikki Haley say about Donald Trump"

# Enable LangChain's debug flag so the intermediate chain/LLM steps are printed.
langchain.debug = True

# Run the chain and return only its outputs.
chain(
    {"question": query},
    return_only_outputs=True,
)

  warn_deprecated(


[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "What does Nikki Haley say about Donald Trump"
}


  warn_deprecated(
  warn_deprecated(


[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain > 3:chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain > 3:chain:MapReduceDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "Republican presidential candidates Nikki Haley and Donald Trump. Composite: Getty Images\n\nRepublican presidential candidates Nikki Haley and Donald Trump. Composite: Getty Images\n\nNikki Haley\n\nUS won’t survive four more years of Trump ‘chaos’, Nikki Haley says\n\n‘We have a country to save – and that means no more drama,’ top Republican rival for 2024 nomination tells Iowa audience\n\nMartin Pengelly\n\nin Washington\n\n@MartinPengelly\n\nFri 5 Jan 2024 16.35 GMT\n\n\n\n\n\n\n\nThe re-election of Donald Trump would bring “four more years of chaos” the US “won’t survive”, the former president’s closest challenger for the Republican

  warn_deprecated(


[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQAWithSourcesChain > 3:chain:MapReduceDocumentsChain > 9:chain:LLMChain > 10:llm:OpenAI] [1.33s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": " Nikki Haley says that the US \"won't survive\" another 4 years of Donald Trump and also suggests that he is his own worst enemy and the country needs to be saved from \"drama\" and taking things personally. She is also described as a potential vice-president and an \"imitation\" of Trump who may stray but will ultimately be loyal.",
        "generation_info": {
          "finish_reason": "stop",
          "logprobs": null
        },
        "type": "Generation"
      }
    ]
  ],
  "llm_output": {
    "token_usage": {
      "total_tokens": 1759,
      "completion_tokens": 71,
      "prompt_tokens": 1688
    },
    "model_name": "gpt-3.5-turbo-instruct"
  },
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQAWithSourcesChain > 3:chain:MapRed

{'answer': ' Nikki Haley says that the US "won\'t survive" another 4 years of Donald Trump and also suggests that he is his own worst enemy and the country needs to be saved from "drama" and taking things personally. She is also described as a potential vice-president and an "imitation" of Trump who may stray but will ultimately be loyal.',
 'sources': ''}

What happens when we try to run the above code?<br>
There is some internal debugging.<br>
At first, the 4 chunks most relevant to our question are retrieved.<br>
Then the same question is asked of each of the 4 chunks.<br>
So there are 4 LLM calls.<br>
Hence, we get 4 answers.<br>
Then all 4 answers are combined into a single summary, and the question is asked once more against that combined summary.<br>
Thus, at the end we get our final answer.
