In [5]:
!pip install requests
!pip install beautifulsoup4
!pip install langchain-community
!pip install langchain-text-splitters
!pip install langchain-chroma
!pip install -U langchain-ollama
!pip install -U pypdf

Collecting langchain-ollama
  Downloading langchain_ollama-0.2.3-py3-none-any.whl.metadata (1.9 kB)
Collecting ollama<1,>=0.4.4 (from langchain-ollama)
  Downloading ollama-0.4.7-py3-none-any.whl.metadata (4.7 kB)
Downloading langchain_ollama-0.2.3-py3-none-any.whl (19 kB)
Downloading ollama-0.4.7-py3-none-any.whl (13 kB)
Installing collected packages: ollama, langchain-ollama
Successfully installed langchain-ollama-0.2.3 ollama-0.4.7


In [6]:
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_ollama import OllamaEmbeddings
from langchain_ollama.llms import OllamaLLM
import sys

In [7]:
import os

# Point Ollama at the server that is running on another machine.
os.environ.update({"OLLAMA_BASE_URL": "http://ollama:11434"})

# Read the variable back to confirm it is set; this prints http://ollama:11434
print(os.environ.get("OLLAMA_BASE_URL"))

http://ollama:11434


In [8]:
# Connection and model settings shared by the ingestion and query cells.
# The base URL is taken from the OLLAMA_BASE_URL environment variable (set in
# the previous cell) so the server address is defined in one place; the literal
# is only a fallback for when the variable is missing or empty.
OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL") or "http://ollama:11434"
OLLAMA_EMBEDDING_MODEL = "bge-m3"  # model name passed to OllamaEmbeddings
OLLAMA_LLM_MODEL = "deepseek-r1:1.5b"  # model name passed to OllamaLLM

In [9]:
import chromadb
# Call chromadb's SharedSystemClient.clear_system_cache() before any vector
# store is opened by the cells below.
# NOTE(review): presumably this avoids stale cached-client errors when the
# notebook re-opens the same persist directory within one kernel session —
# confirm against the chromadb documentation.
chromadb.api.client.SharedSystemClient.clear_system_cache()

In [10]:
# import os
import shutil
import time

def ingest(file_path: str):
    """Load a PDF, split it into chunks, embed them and persist them in Chroma.

    Args:
        file_path: Path of the PDF file to index.

    Returns:
        None. Progress is reported with ``print``.

    Side effects:
        Creates ``./embeddings`` if it does not exist and writes the chunk
        embeddings into the Chroma store persisted there.
    """
    # Load one Document per PDF page. ``load()`` is used rather than
    # ``load_and_split()`` because the latter already applies a default text
    # splitter, which would split the text twice (once with default settings,
    # then again below).
    loader = PyPDFLoader(file_path)
    pages = loader.load()

    # Split the pages by character count.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=51,
        length_function=len,
        strip_whitespace=True,
    )
    chunks = text_splitter.split_documents(pages)
    print(f"Split {len(pages)} documents into {len(chunks)} chunks.")

    embedding = OllamaEmbeddings(
        model=OLLAMA_EMBEDDING_MODEL,
        base_url=OLLAMA_BASE_URL
    )

    # Directory in which the embeddings are persisted.
    embeddings_dir = "./embeddings"
    os.makedirs(embeddings_dir, exist_ok=True)

    # Give every chunk a deterministic id derived from the file path and its
    # position. Without explicit ids each call stores the chunks under fresh
    # random ids, so re-running this cell for the same file keeps appending
    # duplicate chunks to the persisted store and skews retrieval.
    chunk_ids = [f"{file_path}::chunk-{index}" for index in range(len(chunks))]

    # Build the vector store and persist the embeddings to disk.
    Chroma.from_documents(
        documents=chunks,
        embedding=embedding,
        ids=chunk_ids,
        persist_directory=embeddings_dir,
    )

    print("--- Read File Success ---")


In [64]:
# Index doc1.pdf; ingest() prints the page/chunk counts and a success message.
ingest("doc1.pdf")

Split 66 documents into 268 chunks.
--- Read File Success ---


In [56]:
def rag_chain():
    """Assemble the retrieval chain over the vector store persisted in ./embeddings.

    Returns:
        The chain produced by ``create_retrieval_chain``, combining a top-3
        similarity retriever with a stuff-documents chain that feeds the
        retrieved context and the question to the Ollama LLM.
    """
    # Re-open the persisted Chroma store using the configured embedding model.
    store = Chroma(
        persist_directory="./embeddings",
        embedding_function=OllamaEmbeddings(
            model=OLLAMA_EMBEDDING_MODEL,
            base_url=OLLAMA_BASE_URL,
        ),
    )
    print("---- Vector Store Loaded ----")

    # Prompt with two placeholders: {input} (the question) and {context}
    # (the retrieved documents).
    template_text = """
        <s> [Instructions] You are a friendly assistant. Answer the question based only on the following context. 
        If you don't know the answer, then reply, No Context availabel for this question. Your response must be in English only.[/Instructions] </s> 
        [Instructions] Question: {input}
        Context: {context} 
        Answer: Your response must be in English only[/Instructions]
        """
    answer_prompt = PromptTemplate.from_template(template_text)

    # Retrieve the three most similar chunks for each query.
    top_k_retriever = store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 3},
    )

    print("---- Gen AI Answer... ----")
    llm = OllamaLLM(model=OLLAMA_LLM_MODEL, base_url=OLLAMA_BASE_URL)

    # Stuff the retrieved documents into the prompt, then wrap with retrieval.
    return create_retrieval_chain(
        top_k_retriever,
        create_stuff_documents_chain(llm, answer_prompt),
    )

In [57]:
def ask(question: str, show_sources: bool = False):
    """Run a question through the RAG chain and print the prompt and the answer.

    Args:
        question: The question to ask; passed to the chain under the
            ``"input"`` key.
        show_sources: When True, additionally print the source and text of
            every retrieved document. Defaults to False, which prints only
            the prompt and the answer.

    Returns:
        None. All results are reported with ``print``.
    """
    # Build the chain and invoke it with the query.
    chain = rag_chain()
    result = chain.invoke({"input": question})

    # Named question_text (not `input`) so the builtin input() is not shadowed.
    question_text = result["input"]

    # result["context"] holds the retrieved Document objects. Their text is the
    # `page_content` attribute; subscripting a Document (doc["page_content"])
    # raises TypeError.
    retrieved_docs = result["context"]
    context = " ".join(doc.page_content for doc in retrieved_docs)

    # Rebuild the prompt text for display only; it mirrors the template used
    # inside rag_chain().
    prompt = f"""
        <s> [Instructions] You are a friendly assistant. Answer the question based only on the following context. 
        If you don't know the answer, then reply, No Context availabel for this question. Your response must be in English only.[/Instructions] </s> 
        [Instructions] Question: {question_text}
        Context: {context} 
        Answer: Your response must be in English only[/Instructions]
        """
    separator = "---- ------ ----"

    print("---- PROMPT ----")
    print(prompt)
    for _ in range(3):
        print(separator)

    # Print the result
    print("---- ANSWER ----")
    print(result["answer"])
    for _ in range(3):
        print(separator)

    # Optionally show the source documents the answer was based on.
    if show_sources:
        for doc in retrieved_docs:
            print("Source: ", doc.metadata.get("source"))
            print("PageContent: ", doc.page_content)

In [58]:
# The question is in Thai; in English: "What does this research aim to develop
# to help break down rice straw?"
ask("งานวิจัยนี้ต้องการพัฒนาอะไรเพื่อช่วยย่อยฟางข้าว?")

---- Vector Store Loaded ----
---- ANSWER ----
<think>
Alright, I need to figure out what this research project is about. The user mentioned a project titled "การพัฒนาหัวเชื้อย่อยส_COLUMNสลายฟางข้าวจากแบคทีเรียที่ผลิตเซลลูเลส". Let me break it down.

First, the title translates to "Development of Heads and Legs of Water Columns in Failing Fields from Groundwater Sources". So, the project is about modifying or developing structures called water columns in areas where water can't make it to the surface. These are typically deep underground or under water, hence the term 'falying fields'.

The user provided some context: a project led by Sursuthai with specific names like Rakunrasri and others. The goal seems to be creating water columns that can serve as substitutes for traditional water sources. This would make use of groundwater, which is often difficult to access or extract in hilly or mountainous regions.

I should consider the potential applications. These water columns could be use