In [1]:
import os

import PyPDF2
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_groq import ChatGroq




In [2]:
# Set up Groq API key.
# setdefault() only installs the placeholder when GROQ_API_KEY is not already
# present, so a real key exported in the shell / kernel environment is never
# clobbered by this cell. Prefer exporting the key over pasting it here, so it
# does not end up saved (and shared) with the notebook.
os.environ.setdefault("GROQ_API_KEY", "Replace with your actual Groq API key")  # Replace with your actual Groq API key

In [3]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The extracted page texts joined with a newline, in page order.
        A page that yields no text contributes an empty string.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction happens while the file is still open, because the reader
        # is handed the open file object.
        # `or ""` keeps the join safe if a page yields None instead of a str.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    # Joining with "\n" stops the last word of one page from fusing with the
    # first word of the next, and avoids repeated string concatenation.
    return "\n".join(page_texts)

In [4]:
def create_qa_system(pdf_paths, model_name="llama-3.1-70b-versatile",
                     chunk_size=1000, chunk_overlap=0, top_k=1):
    """Build a RetrievalQA chain over the text of one or more PDF files.

    The PDFs are read with extract_text_from_pdf, concatenated, split into
    chunks, embedded with HuggingFaceEmbeddings, stored in a Chroma vector
    store and wired to a ChatGroq model through a "stuff" RetrievalQA chain.

    Args:
        pdf_paths: Iterable of PDF paths. A single str / os.PathLike is also
            accepted and treated as a one-element list.
        model_name: Groq model identifier passed to ChatGroq.
            NOTE(review): providers retire model IDs over time - confirm the
            default is still served before relying on it.
        chunk_size: Target maximum chunk length handed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks.
        top_k: Number of chunks the retriever returns per query.

    Returns:
        The RetrievalQA chain.

    Raises:
        ValueError: If pdf_paths is empty, or no text could be extracted
            from any of the given files.
    """
    # A bare path would otherwise be iterated character by character.
    if isinstance(pdf_paths, (str, os.PathLike)):
        pdf_paths = [pdf_paths]
    pdf_paths = list(pdf_paths)
    if not pdf_paths:
        raise ValueError("pdf_paths must contain at least one PDF path")

    # Combine text from all PDFs, each document followed by a newline.
    all_texts = "".join(extract_text_from_pdf(pdf_path) + "\n" for pdf_path in pdf_paths)

    # Split combined text into chunks.
    # CharacterTextSplitter only cuts on its "\n\n" separator, which this
    # newline-joined text may not contain, so chunks could greatly exceed
    # chunk_size. The recursive splitter falls back to finer separators.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    texts = [chunk for chunk in text_splitter.split_text(all_texts) if chunk.strip()]
    if not texts:
        # e.g. scanned PDFs without a text layer; fail here with a clear
        # message instead of inside the vector store.
        raise ValueError("No extractable text found in: %s" % ", ".join(map(str, pdf_paths)))

    # Create embeddings
    embeddings = HuggingFaceEmbeddings()

    # Create vector store
    db = Chroma.from_texts(texts, embeddings)

    # Initialize ChatGroq
    llm = ChatGroq(
        model=model_name,
        temperature=0,
    )

    # Create a retrieval chain
    retriever = db.as_retriever(search_kwargs={"k": top_k})
    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

    return qa

In [None]:
def main(pdf_paths=None):
    """Run an interactive question/answer loop over the given PDFs.

    Args:
        pdf_paths: PDF paths forwarded to create_qa_system. Defaults to
            ["Pakistan.pdf"] when omitted.

    Returns:
        A list of {'query': ..., 'answer': ...} dicts, one per answered
        question, in the order they were asked.
    """
    if pdf_paths is None:
        pdf_paths = ["Pakistan.pdf"]
    qa_system = create_qa_system(pdf_paths)

    responses = []

    while True:
        try:
            query = input("Enter your question (or 'quit' to exit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel ends the session and
            # still returns what was collected so far.
            break
        if query.lower() == 'quit':
            break
        if not query:
            # Do not spend an LLM call on a blank line.
            continue
        answer = qa_system.run(query)
        print("Answer:", answer)
        responses.append({'query': query, 'answer': answer})

    return responses


# Start the interactive session only when this code runs as the top-level
# program; the condition is false when the file is imported as a module.
if __name__ == "__main__":
    main()