In [36]:
# Importing necessary libraries
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import pipeline
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.prompts import PromptTemplate

# Load the PDF file
pdf_path = "/content/For RAG Techzone Data.pdf"  # Update the path accordingly
data = PyPDFLoader(pdf_path).load()

# Set the directory for storing the vector database
persist_directory = "/content/vector_db"  # Update the path for Colab

# Split the document into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=750, chunk_overlap=150)
splits = text_splitter.split_documents(data)

# Initialize embeddings using Hugging Face model (use sentence-transformers for better embeddings)
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Initialize Chroma vector store
vector_db = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    persist_directory=persist_directory
)

# Initialize RoBERTa-based question-answering pipeline from Hugging Face
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

# Initialize the retriever with multi-query ability
retriever = vector_db.as_retriever()

# Function to execute the query with context retrieved

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [37]:
def execute_query(question):
    # Retrieve relevant documents
    retrieved_docs = retriever.get_relevant_documents(question)

    # Check if any documents are retrieved
    if not retrieved_docs:
        print("No relevant documents found.")
        return "No relevant documents were retrieved."

    # Prepare context from the retrieved documents
    context = " ".join([doc.page_content.strip() for doc in retrieved_docs])  # Remove extra spaces and newlines
    context = " ".join(context.split())  # Ensure it's properly formatted as a paragraph
    print(f"Retrieved context:\n{context}\n")

    # Use the QA pipeline to answer the question based on context
    response = qa_pipeline(question=question, context=context)

    # Check if the model returned an answer
    if not response or 'answer' not in response:
        print("No valid answer generated by the model.")
        return "No valid answer was generated."

    return response['answer']

# Test the function with a simple question
response = execute_query("What is Techzone Academy?")
print(response)


Retrieved context:
Techzone Is An Academy Which Offers Best Courses Which Is Taught By Industry Experienced Faculty .Where You Are Prepared To Solve Real-T ime Business Challenges And Solving Industry Problems And Even Provide You Job Assistance. TechZone Software Academy Empowering Your Future in Emerging Technologies TechZone Software Academy is a premier institution dedicated to providing top-notch training and education in cutting-edge technologies. With a mission to empower individuals and businesses, we offer a wide range of courses designed to prepare you for the challenges and opportunities in today's rapidly evolving tech landscape. Our commitment to excellence, experienced faculty , and industry-relevant curriculum set us apart as a leader in Techzone Is An Academy Which Offers Best Courses Which Is Taught By Industry Experienced Faculty .Where You Are Prepared To Solve Real-T ime Business Challenges And Solving Industry Problems And Even Provide You Job Assistance. TechZone 

In [38]:
response = execute_query("What are the courses provided by Techzone Academy?")
print(response)

Retrieved context:
Techzone Is An Academy Which Offers Best Courses Which Is Taught By Industry Experienced Faculty .Where You Are Prepared To Solve Real-T ime Business Challenges And Solving Industry Problems And Even Provide You Job Assistance. TechZone Software Academy Empowering Your Future in Emerging Technologies TechZone Software Academy is a premier institution dedicated to providing top-notch training and education in cutting-edge technologies. With a mission to empower individuals and businesses, we offer a wide range of courses designed to prepare you for the challenges and opportunities in today's rapidly evolving tech landscape. Our commitment to excellence, experienced faculty , and industry-relevant curriculum set us apart as a leader in Techzone Is An Academy Which Offers Best Courses Which Is Taught By Industry Experienced Faculty .Where You Are Prepared To Solve Real-T ime Business Challenges And Solving Industry Problems And Even Provide You Job Assistance. TechZone 