In [88]:
# !pip install requests
# !pip install beautifulsoup4
# !pip install langchain-community
# !pip install langchain-text-splitters
# !pip install langchain-chroma
# !pip install -U langchain-ollama
# !pip install -U pypdf

In [56]:
# !pip install pythainlp


Collecting pythainlp
  Downloading pythainlp-5.0.5-py3-none-any.whl.metadata (7.5 kB)
Downloading pythainlp-5.0.5-py3-none-any.whl (17.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.9/17.9 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pythainlp
Successfully installed pythainlp-5.0.5


In [42]:
import os

# ตั้งค่าให้ Ollama ใช้เซิร์ฟเวอร์ที่รันอยู่บนเครื่องอื่น
os.environ["USER_AGENT"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"

# ตรวจสอบว่าตัวแปรถูกตั้งค่าแล้วหรือไม่
print(os.getenv("USER_AGENT"))

Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36


In [58]:
import bs4
import requests
from langchain_community.document_loaders import WebBaseLoader

# Monkey Patch requests.get เพื่อใส่ User-Agent
original_get = requests.get

def patched_get(url, *args, **kwargs):
    headers = kwargs.pop("headers", {})
    headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
    return original_get(url, headers=headers, *args, **kwargs)

requests.get = patched_get  # ใช้ patched version ของ requests.get

# ใช้ WebBaseLoader ตามปกติ
loader = WebBaseLoader(
    web_paths=("http://ollama:11434",),
)
docs = loader.load()

# คืนค่า requests.get กลับเป็นปกติ (ถ้าต้องการ)
requests.get = original_get

print(docs)

[Document(metadata={'source': 'http://ollama:11434'}, page_content='Ollama is running')]


In [59]:
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_ollama import OllamaEmbeddings
from langchain_ollama.llms import OllamaLLM
import sys

In [60]:
import os

# ตั้งค่าให้ Ollama ใช้เซิร์ฟเวอร์ที่รันอยู่บนเครื่องอื่น
os.environ["OLLAMA_BASE_URL"] = "http://ollama:11434"

# ตรวจสอบว่าตัวแปรถูกตั้งค่าแล้วหรือไม่
print(os.getenv("OLLAMA_BASE_URL"))  # ควรแสดง http://server-ip:11434

http://ollama:11434


In [61]:
OLLAMA_BASE_URL = "http://ollama:11434"
OLLAMA_LLM_MODEL = "deepseek-r1:8b"
OLLAMA_EMBED_MODEL = "nomic-embed-text"

In [62]:
import chromadb
chromadb.api.client.SharedSystemClient.clear_system_cache()

In [72]:
import os
import shutil
import time

def ingest(file_path: str):
    chunk_size = 500
    db_path = "./sql_chroma_db"
    # Delete the folder if it exists
    if os.path.exists(db_path):
        shutil.rmtree(db_path)
        print(f"Deleted existing folder: {db_path}")

    # Add a 1-second delay before loading the PDF
    time.sleep(3)
        
    # Get the doc
    loader = PyPDFLoader(file_path)
    pages = loader.load_and_split()
    
    # Split the pages by char
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=50,
        length_function=len,
        separators=["\n\n", "\n", " ", ""],
        add_start_index=True,
    )
    
    chunks = text_splitter.split_documents(pages)
    
    print(f"Split {len(pages)} documents into {len(chunks)} chunks.")
    
    
    local_embeddings = OllamaEmbeddings(
        model=OLLAMA_EMBED_MODEL,
        base_url=OLLAMA_BASE_URL,
    )
    
    # Create vector store
    Chroma.from_documents(
        documents=chunks,  
        embedding=local_embeddings, 
        collection_name="thai_docs",
        persist_directory=db_path,
        collection_metadata={
            "language": "thai",
            "chunk_size": chunk_size
        }
    )

In [73]:
# only run this once to generate vector store
# ingest("sql_tutorial.pdf")

ingest("law.pdf")

Deleted existing folder: ./sql_chroma_db
Split 9 documents into 44 chunks.


OperationalError: attempt to write a readonly database

In [57]:
from langchain.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
import pythainlp

def create_thai_embeddings(documents, chunk_size=500):
    """
    Create embeddings for Thai text documents and store in Chroma
    
    Parameters:
    documents (list): List of Thai text documents
    chunk_size (int): Size of text chunks for splitting
    """
    
    # 1. Configure text splitter for Thai
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=50,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    
    # 2. Split documents
    texts = text_splitter.create_documents(documents)
    
    # 3. Configure Ollama embeddings
    embeddings = OllamaEmbeddings(
        model="nomic-embed-text",  # หรือใช้ mistral ก็ได้
        base_url=OLLAMA_BASE_URL,
        model_kwargs={
            "temperature": 0.0,  # ให้ผลลัพธ์คงที่
            "num_ctx": 2048,     # ความยาวบริบทสูงสุด
        }
    )
    
    # 4. Create and configure Chroma
    vectorstore = Chroma.from_documents(
        documents=texts,
        embeddings=embeddings,
        collection_name="thai_docs",
        persist_directory="./chroma_db",
        collection_metadata={
            "language": "thai",
            "chunk_size": chunk_size
        }
    )
    
    return vectorstore

# ตัวอย่างการใช้งาน
# documents = [
#     "เอกสารภาษาไทยฉบับที่ 1",
#     "เอกสารภาษาไทยฉบับที่ 2"
# ]

# vectorstore = create_thai_embeddings(documents)

In [32]:
# Create a RAG chain that retreives relevent chunks and prepares a response

In [50]:
def rag_chain():
    model = OllamaLLM(
        model=OLLAMA_LLM_MODEL,
        base_url=OLLAMA_BASE_URL
    )
    
    #Load vector store
    embedding = OllamaEmbeddings(
        model=OLLAMA_EMBED_MODEL,
        base_url=OLLAMA_BASE_URL
    )
    vector_store = Chroma(persist_directory="./sql_chroma_db", embedding_function=embedding)

    #Create chain
    retriever = vector_store.as_retriever(
        search_type="similarity",
        # search_type="similarity_score_threshold",
        search_kwargs={
            "k": 3,
            # "score_threshold": 0.7, # คืนค่าเอกสารที่มีคะแนนความคล้ายคลึง ≥ 0.7
            # "fetch_k": 10,  # ดึงผลลัพธ์มา 10 รายการก่อนเลือก 3 รายการที่หลากหลายที่สุด
            # "lambda_mult": 0.5,  # ปรับสมดุลระหว่างความคล้ายคลึงและความหลากหลาย
        },
    )
    
    # # Get documents from the vector store
    # results = retriever.invoke("ถ้าต้องถูกทำงานล่วงเวลาในวันหยุด เราจะได้ค่าจ้างเท่าไหร?")
    
    # # Print the results to see what documents are returned
    # print("Vector Store Results:")
    # for idx, result in enumerate(results):
    #     print(f"Result {idx+1}: {result}")

    prompt = PromptTemplate.from_template(
        """
        <s> [Instructions] You are a friendly assistant. Answer the question based only on the following context. 
        If you don't know the answer, then reply, No Context availabel for this question {input}. [/Instructions] </s> 
        [Instructions] Question: {input}
        Context: {context} 
        Answer: [/Instructions]
        """
    )

    document_chain = create_stuff_documents_chain(model, prompt)
    chain = create_retrieval_chain(retriever, document_chain)
    
    return chain, prompt


In [51]:
def ask(query: str):
    # Initialize chain and prompt
    chain, prompt = rag_chain()

    # Display the prompt before invoking
    print("Prompt used in the chain:")
    print(prompt.format(input=query, context=""))

    # Invoke chain with the query
    result = chain.invoke({"input": query})

    # Print the result
    print("Answer: ", result["answer"])
    
    # Show the source documents
    for doc in result["context"]:
        print("Source: ", doc.metadata["source"])


In [70]:
  
#Load vector store
embedding = OllamaEmbeddings(
      model=OLLAMA_EMBED_MODEL,
    base_url=OLLAMA_BASE_URL
)
vector_store = Chroma(persist_directory="./sql_chroma_db", embedding_function=embedding)

retriever = vector_store.as_retriever(
    search_type="mmr",
     search_kwargs={
        "k": 5,
        "lambda_mult": 0.8
         
     },
)

question = "ไม่สามารถรับเด็กอายุต่ำกว่ากี่ปีให้มาทำงานได้?"

retrieved_docs = retriever.invoke(question)

context = ' '.join([doc.page_content for doc in retrieved_docs])


prompt = f"""Answer the question according to the context given very briefly:
           Question: {question}.
           Context: {context}
"""


page_contents = []
for doc in retrieved_docs:
    try:
         print(doc.page_content)
         print("------------------")
    except AttributeError:
        print(f"Document missing 'page_content': {doc}") # หรือจะข้ามไปเลยก็ได้
print(page_contents)

print(prompt)

[]
Answer the question according to the context given very briefly:
           Question: ไม่สามารถรับเด็กอายุต่ำกว่ากี่ปีให้มาทำงานได้?.
           Context: 



In [53]:
model = OllamaLLM(
    model=OLLAMA_LLM_MODEL,
    base_url=OLLAMA_BASE_URL
)

response = model.invoke(prompt)
print(response)

<think>
Alright, let me try to figure this out. The question is asking what the minimum age is for a child to be able to work according to the given context.

First, I'll read through the context provided. It talks about overtime, notice periods, labor contracts without specific durations, and termination procedures. Then it moves on to cases where an employer and employee can terminate the contract mutually.

Next, there's a part discussing protective measures for child workers. It mentions that young workers have rights to leave for training organized by the Department of Labor, and employers must allow this leave with overtime pay equivalent to regular days. This seems relevant because it directly refers to child workers' rights regarding work hours and leave.

Then, it lists situations where a worker loses their ability to work due to accidents or diseases. Points (j), (k), and (l) detail conditions under which an employee can't work anymore. These include intoxication, simple labo