In [12]:
import pdfplumber
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_ollama import OllamaEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM

In [13]:
PROMPT_TEMPLATE = """
You are an expert research assistant. Use the provided context to answer the query. 
If unsure, state that you don't know. Be concise and factual (max 3 sentences).

Query: {user_query} 
Context: {document_context} 
Answer:
"""

# Configuration
PDF_STORAGE_PATH = 'E:/projects/python/genai/RAG/deepseek/document_store/'  # Ensure trailing slash
EMBEDDING_MODEL = OllamaEmbeddings(model="deepseek-r1:1.5b")
DOCUMENT_VECTOR_DB = InMemoryVectorStore(EMBEDDING_MODEL)
LANGUAGE_MODEL = OllamaLLM(model="deepseek-r1:1.5b")

In [14]:
# Save the uploaded file to the specified path
def save_uploaded_file(file_path):
    import os
    os.makedirs(PDF_STORAGE_PATH, exist_ok=True)
    return file_path

# Load PDF documents
def load_pdf_documents(file_path):
    document_loader = PDFPlumberLoader(file_path)
    return document_loader.load()

# Split documents into chunks
def chunk_documents(raw_documents):
    text_processor = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        add_start_index=True
    )
    return text_processor.split_documents(raw_documents)

In [15]:
# Index document chunks into the vector store
def index_documents(document_chunks):
    DOCUMENT_VECTOR_DB.add_documents(document_chunks)

# Find related documents based on a query
def find_related_documents(query):
    return DOCUMENT_VECTOR_DB.similarity_search(query)

# Generate an answer using the language model
def generate_answer(user_query, context_documents):
    context_text = "\n\n".join([doc.page_content for doc in context_documents])
    conversation_prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    response_chain = conversation_prompt | LANGUAGE_MODEL
    return response_chain.invoke({"user_query": user_query, "document_context": context_text})

In [None]:
# Main program
if __name__ == "__main__":
    # Prompt user to upload a PDF file
    pdf_path = input("Enter the path to your PDF file: ")
    saved_path = save_uploaded_file(pdf_path)

    # Load, process, and index the PDF
    raw_docs = load_pdf_documents(saved_path)
    processed_chunks = chunk_documents(raw_docs)
    index_documents(processed_chunks)

    print("✅ Document processed successfully! Ask your questions below.")

    # Interactive loop for user queries
    while True:
        user_input = input("Enter your question about the document: ")
        print("Responding to", user_input)
        if not user_input.strip():  # Exit if the input is empty
            break

        # Find related documents and generate an answer
        related_docs = find_related_documents(user_input)
        answer = generate_answer(user_input, related_docs)
        print("Answer:", answer)

✅ Document processed successfully! Ask your questions below.
Answer: <think>
Okay, so I'm trying to figure out the role of the encoder in the Transformer model as explained in this context. From what I remember, Transformers are a type of neural network architecture developed by Google. They're known for their efficiency and effectiveness in various tasks like machine translation.

In the provided context, it seems that the Transformer has an encoder-decoder structure throughout the sections. The encoder is described as mapping an input sequence of symbol representations to a sequence of continuous representations z. This sounds familiar. I think the encoder processes the input data and transforms it into a form that can be used later by the decoder.

The decoder then generates an output sequence based on this transformed data. Each step in the model is auto-regressive, meaning each generated element uses the previously generated ones as additional inputs. That makes sense because in m