In [18]:
import os
import glob
import json
import requests
from typing import List

from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [8]:
PDF_GLOB = "data/pdfs/*.pdf"
PERSIST_DIR="temp"

In [6]:
def get_embedding_function():
    """
    Returns an OpenAIEmbeddings instance configured for a local server.
    """
    return OpenAIEmbeddings(
        # 1. Point to your local server (exclude '/embeddings' from the URL here)
        base_url="http://localhost:36912/v1", 
        
        # 2. Local servers usually ignore the key, but it cannot be empty
        api_key="sk-no-key-required", 
        
        # 3. Some local servers require a model name (can often be anything)
        model="Qwen3-Embedding-8B",
        
        # 4. Disable validtion checks that might fail on local servers
        check_embedding_ctx_length=False 
    )

In [None]:
def load_pdfs(pdf_paths: List[str]):
    """Load PDFs into LangChain Documents (1 doc per page by default)."""
    docs = []
    for path in pdf_paths:
        loader = PyPDFLoader(path)  # uses pypdf under the hood
        docs.extend(loader.load())
    return docs


def build_vectorstore_from_pdfs(
    pdf_glob: str,
    persist_dir: str,
    *,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
):
    pdf_paths = sorted(glob.glob(pdf_glob))
    if not pdf_paths:
        raise FileNotFoundError(f"No PDFs matched: {pdf_glob}")

    raw_docs = load_pdfs(pdf_paths)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""],
    )
    chunks = splitter.split_documents(raw_docs)

    embeddings = get_embedding_function()
    vs = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=persist_dir,
        collection_name="pdf_rag",
    )
    return vs

In [11]:
vs = build_vectorstore_from_pdfs(PDF_GLOB, PERSIST_DIR)

Ignoring wrong pointing object 8 0 (offset 0)


In [15]:
vs._collection.count()

52

In [30]:
QUESTION = "Ut  consectetur  dui  ipsum,  a  rhoncus  lacus  gravida vitae. In mollis tortor in libero lobortis molestie?"

def format_docs(docs):
    """Helper to join document content into a single string."""
    return "\n\n".join(doc.page_content for doc in docs)


def load_vectorstore(persist_dir: str):
    embeddings = get_embedding_function()
    return Chroma(
        persist_directory=persist_dir,
        embedding_function=embeddings,
        collection_name="pdf_rag",
    )

def chat(prompt_value):
    """
    This function acts as the LLM.
    It receives a LangChain 'PromptValue' and must return a string.
    """    
    # Convert the prompt object (containing system/human messages) to a single string
    prompt_text = prompt_value.to_string()

    print(prompt_text)

    try:
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": prompt_text,
            },
        ]

        data = {"messages": messages, "temperature": 0.7}
        response = requests.post(
            url="http://localhost:32001/v1/chat/completions",
            headers={
                "Content-Type": "application/json",
            },
            data=json.dumps(data),
        )
        response.raise_for_status()

        return response.json()['choices'][0]['message']['content']

    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")


def make_rag_chain(vectorstore: Chroma):
    # Create a prompt template using langchain_core.prompts
    template = """Answer the question based only on the provided context.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    Always return a 'SOURCES' section in your answer.
    The 'SOURCES' section should contain the references from the source documents that you used to answer the question.

    Context:
    {context}

    Question:
    {input}
    """
    prompt = ChatPromptTemplate.from_template(template)

    llm = RunnableLambda(chat)

    retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

    # Build the RAG chain using RunnablePassthrough and StrOutputParser
    rag_chain = (
        {
            "context": retriever | format_docs,  # Ensure context is string for the prompt
            "input": RunnablePassthrough(),
        }
        | prompt
        | llm
        | StrOutputParser()
    )
    return rag_chain


vs = load_vectorstore(PERSIST_DIR)


chain = make_rag_chain(vs)
result = chain.invoke(QUESTION)  # Pass the question directly

Human: Answer the question based only on the provided context.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    Always return a 'SOURCES' section in your answer.
    The 'SOURCES' section should contain the references from the source documents that you used to answer the question.

    Context:
    Interdum et malesuada fames ac ante ipsum primis in faucibus. Nunc a feugiat urna. Integer sit 
amet tincidunt mauris. Donec id urna nec turpis laoreet aliquam ac id ante. Curabitur faucibus 
consectetur velit, vitae efficitur odio facilisis sed. 
 
Aenean pulvinar euismod ligula at lacinia. Ut consectetur dui ipsum, a rhoncus lacus gravida 
vitae. In mollis tortor in libero lobortis molestie. Integer tempor justo ligula, eu euismod lectus 
fringilla eu. Proin vestibulum sodales tr istique. Pellentesque pretium, nibh et aliquet 
scelerisque, felis nulla lobortis tellus, at tristique libero ipsum a leo. Nulla mauris turpis, feugiat 
eu lacus 

In [31]:
result

'The text provided contains the sentence: "Ut consectetur dui ipsum, a rhoncus lacus gravida vitae. In mollis tortor in libero lobortis molestie." This appears to be a fragment from a larger document, possibly a style guide or a sample of placeholder text (like Lorem Ipsum), but it does not provide enough context to answer any specific question about its meaning or purpose.\n\nTherefore, based on the information given, I cannot provide a meaningful answer to the question.\n\nSOURCES\n- The provided context text, which includes the sentence in question.'