In [16]:
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline
from langchain_core.output_parsers import StrOutputParser
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from langchain_core.runnables import RunnablePassthrough

In [17]:
urls = ['https://www.livemint.com/economy/budget-2024-key-highlights-live-updates-nirmala-sitharaman-infrastructure-defence-income-tax-modi-budget-23-july-11721654502862.html',
        'https://cleartax.in/s/budget-2024-highlights',
        'https://www.hindustantimes.com/budget',
        'https://economictimes.indiatimes.com/news/economy/policy/budget-2024-highlights-india-nirmala-sitharaman-capex-fiscal-deficit-tax-slab-key-announcement-in-union-budget-2024-25/articleshow/111942707.cms?from=mdr']


In [3]:
loader = UnstructuredURLLoader(urls=urls)
data = loader.load()  

In [4]:
len(data)

4

In [5]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
docs = text_splitter.split_documents(data)
print("Total number of documents: ",len(docs))

Total number of documents:  175


In [6]:
docs[5]
#to see content of any document

Document(metadata={'source': 'https://www.livemint.com/economy/budget-2024-key-highlights-live-updates-nirmala-sitharaman-infrastructure-defence-income-tax-modi-budget-23-july-11721654502862.html'}, page_content='“With our markets now past key domestic events, it is crucial to shift our focus back to global events and monitor counters with upcoming earnings declarations. Additionally, we should watch for those likely to benefit from the budget.”\n\n24 Jul 2024, 10:25 PM IST\n\nBudget 2024 Key Highlights Live Updates: Navneet Nagpal, Princpal Consultant and Director, Spectra Hospitality Services, said\n\nBudget 2024 Key Highlights Live Updates: "The Budget 2024\'s focus on developing new religious circuits is a significant step forward for the hospitality sector. The Finance Minister\'s plan to replicate the Kashi Vishwanath corridor model in Gaya’s Vishnupad Temple and Mahabodhi Temple in Bodhgaya is expected to transform these spiritually important sites into world-class tourist desti

Text to Vector Embeddings

In [7]:
embeddings = HuggingFaceEmbeddings()

  embeddings = HuggingFaceEmbeddings()
  embeddings = HuggingFaceEmbeddings()


In [8]:
vector = embeddings.embed_query("hello, world!")
vector[:5]
#vector

[0.034922655671834946,
 0.018830018118023872,
 -0.017854738980531693,
 0.0001388440141454339,
 0.0740736871957779]

In [9]:
vectorstore = Chroma.from_documents(documents=docs, embedding=HuggingFaceEmbeddings())

  vectorstore = Chroma.from_documents(documents=docs, embedding=HuggingFaceEmbeddings())


In [10]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

retrieved_docs = retriever.invoke("Budget highlights")
len(retrieved_docs)

3

In [11]:
print(retrieved_docs[0].page_content)

24 Jul 2024, 02:01 PM IST

Budget 2024 Key Highlights Live Updates: Manish Shah praises balanced budget supporting MSMEs and urban housing

Manish Shah, MD & CEO at Godrej Capital, commended the 2024 budget for striking a perfect balance between short-term needs and long-term goals, laying the foundation for a resilient and thriving economy. He praised the government's strong commitment to bolstering the MSME sector, recognizing its pivotal role in the country’s economic advancement.


In [12]:
model_id = "Qwen/Qwen3-0.6B"

text_generation_pipeline = pipeline(
    "text-generation", model=model_id, model_kwargs={"torch_dtype": torch.bfloat16}, max_new_tokens=400, device=0)

llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cpu


In [14]:
prompt_template = """
<|system|>
Answer the question based on your knowledge. Use the following context to help:

{context}

</s>
<|user|>
{question}
</s>
<|assistant|>

 """


In [15]:
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

llm_chain = prompt | llm | StrOutputParser()

In [19]:
question = "Nirmala Sitaraman?"

rag_chain = {"context": retriever, "question": RunnablePassthrough()} | llm_chain

rag_chain.invoke(question)

'\n<|system|>\nAnswer the question based on your knowledge. Use the following context to help:\n\n[Document(id=\'87f060bd-3ab2-4395-94b2-04705173c424\', metadata={\'source\': \'https://www.livemint.com/economy/budget-2024-key-highlights-live-updates-nirmala-sitharaman-infrastructure-defence-income-tax-modi-budget-23-july-11721654502862.html\'}, page_content="23 Jul 2024, 07:22 PM IST\\n\\nBudget 2024 Key Highlights Live: Capex target to tax hikes, a glance at Nirmala Sitharaman\'s budget announcement\\n\\nBudget 2024 Key Highlights Live: Key Highlights from Modi 3.0\'s first Union Budget Finance Minister Nirmala Sitharaman focused on the following in Budget 2024 -\\n\\n1. Productivity and resilience in agriculture\\n\\n2. Employment and skilling\\n\\n3. Inclusive human resource development\\n\\n4. Social justice\\n\\n5. Urban development\\n\\n6. Energy security\\n\\n7. Infrastructure\\n\\n8. Innovation and Research Development\\n\\n9. Next generation reforms\\n\\nMint reported Sitharam