In [1]:
# !pip install transformers torch
# !pip install 'accelerate>=0.26.0'

# นำเข้าโมดูลที่ต้องการ
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

def askTyphoon2(prompt: str):
    print("--Chat with Typhoon2--");
    
    # โหลดโมเดลและ tokenizer
    model_id = "scb10x/llama3.2-typhoon2-1b-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
    # model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")

    # เตรียมข้อความนำเข้า
    messages = [
        {"role": "system", "content": "You are a friendly assistant. Answer the question based only on the following context. If you don't know the answer, then reply, No Context availabel for this question."},
        {"role": "user", "content": prompt}
    ]
    # แปลงข้อความเป็น token (คืนค่าเป็น tensor โดยตรง)
    input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
    
    # สร้าง attention_mask จาก input_ids (1 สำหรับ token จริง, 0 สำหรับ padding)
    attention_mask = (input_ids != tokenizer.pad_token_id).long().to(model.device)
    
    # กำหนดพารามิเตอร์สำหรับการสร้างข้อความ
    terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
    
    # สร้างข้อความตอบกลับ
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,  # ส่ง attention_mask เข้าไป
        max_new_tokens=512,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        pad_token_id=tokenizer.pad_token_id  # กำหนด pad_token_id อย่างชัดเจน
    )
    
    # แปลงผลลัพธ์กลับเป็นข้อความ
    response = outputs[0][input_ids.shape[-1]:]
    answer = tokenizer.decode(response, skip_special_tokens=True)
    print(answer)

In [2]:
# !pip install -U transformers langchain langchain_huggingface langchain_community sentence-transformers langchain_huggingface langchain_core chromadb ipywidgets

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document  # นำเข้า Document

def textSplitter(text: str):
    # สร้าง Document
    documents = [Document(page_content=text, metadata={
        "source":"website.txt"
    })]
    
    # Split the text by char
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=51,
        length_function=len,
        strip_whitespace=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split text into {len(chunks)} chunks.")
    return chunks

def documentsSplitter(documents: Document):
    # Split the text by char
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=51,
        length_function=len,
        strip_whitespace=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split text into {len(chunks)} chunks.")
    return chunks

In [None]:
# !pip install -U langchain-chroma

import os
from typing import List
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

def ingest(chunks: List[Document], embeddings_dir:str):
    """
    ฟังก์ชันที่รับรายการของ Document (chunks) และนำไปสร้าง Vector database
    """

    model_name = "BAAI/bge-m3"
    model_kwargs = {'device': 'cpu'}
    encode_kwargs = {'normalize_embeddings': False}
    
    embedding = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs
    )

    
    
   # ตรวจสอบว่ามี vector store อยู่แล้วหรือไม่
    if os.path.exists(os.path.join(embeddings_dir, "chroma.sqlite3")):
        print("--append data to vector store--")
        # โหลด vector store ที่มีอยู่
        vector_store = Chroma(persist_directory=embeddings_dir, embedding_function=embedding)
        # เพิ่มเอกสารใหม่
        vector_store.add_documents(documents=chunks)
    else:
        print("--new vector store--")
        # สร้างไดเรกทอรีสำหรับบันทึก embeddings
        os.makedirs(embeddings_dir, exist_ok=True)
        # สร้าง vector store ใหม่และบันทึก embeddings ลงในไฟล์
        Chroma.from_documents(documents=chunks, embedding=embedding, persist_directory=embeddings_dir)

    print("--- Ingest to Vector Database Success ---")

In [4]:
# !pip install requests
# !pip install beautifulsoup4

In [None]:
import requests

original_get = requests.get

def patched_get(url, *args, **kwargs):
    headers = kwargs.pop("headers", {})
    headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
    return original_get(url, headers=headers, *args, **kwargs)

In [None]:
import bs4
import requests
from langchain_community.document_loaders import WebBaseLoader

def loadContentFromWebsite(url: str, targetClassName: str):
    requests.get = patched_get  # ใช้ patched version ของ requests.get
    bs4_strainer = bs4.SoupStrainer(class_=targetClassName)
    loader = WebBaseLoader(
        web_paths=(url,), # , ที่อยู่ในวงเล็บสำคัญนะห้ามลบ
        bs_kwargs={"parse_only": bs4_strainer},
    )
    docs = loader.load()
    # คืนค่า requests.get กลับเป็นปกติ (ถ้าต้องการ)
    requests.get = original_get
    
    return docs

In [None]:
# !pip install -U langchain-chroma
from langchain_chroma import Chroma  # แก้ไขการนำเข้า

def loadVectorStore(embeddings_dir: str):
    model_name = "BAAI/bge-m3"
    model_kwargs = {'device': 'cpu'}
    encode_kwargs = {'normalize_embeddings': False}
    
    embedding = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs
    )
    vector_store = Chroma(persist_directory=embeddings_dir, embedding_function=embedding)
    print("--Vector Store Loaded--")
    return vector_store

In [20]:
def getRetriever(vector_store: Chroma):
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3})
    return retriever

In [21]:
from langchain_core.retrievers import BaseRetriever

def genPrompt(question : str, retriever: BaseRetriever):
    retrieved_docs = retriever.invoke(question )
    context = ' '.join([doc.page_content for doc in retrieved_docs])
    prompt = f"""
        <s> [Instructions] 
                You are a friendly assistant. Answer the question based only on the following context. 
                If you don't know the answer, then reply, No Context availabel for this question.
            [/Instructions] 
        </s> 
        [Instructions] 
            Question: {question}
            Context: {context} 
            Answer:
        [/Instructions]
        """
    return prompt

In [55]:
documents = loadContentFromWebsite(
    url="https://www.sanook.com/news/9735274/",
    targetClassName= "EntryReaderInner"
)
print(documents)

[Document(metadata={'source': 'https://www.sanook.com/news/9735274/'}, page_content='เด็กสาววัย 16 ไปโรงพยาบาลเพราะท้องอืด ช็อก หมอบอกมีชีวิตอยู่ได้อีกแค่ 1 สัปดาห์ ต้องผ่าตัดด่วน\nอเล็กซ์ อาร์เคล อายุ 16 ปี จากสติริง สกอตแลนด์ ไปพบแพทย์เพื่อตรวจสอบอาการท้องอืดและปวดท้องที่เธอประสบอยู่ในช่วงหลัง ซึ่งไม่เคยคาดคิดว่าเธอจะเหลือเวลาอีกแค่ 1 สัปดาห์ในการมีชีวิตอยู่จากการมีเนื้องอกที่หายากมากในโลก\nอเล็กซ์สังเกตเห็นอาการไม่สบายตัวครั้งแรกในระหว่างทริปท่องเที่ยวกับครอบครัว และเมื่อกลับบ้าน เธอก็รีบไปนัดหมายตรวจสุขภาพกับแพทย์ เธอได้รับการตรวจเลือดและเพียงแค่สัปดาห์เดียวหลังจากนั้นก็ถูกนำตัวไปโรงพยาบาลเพื่อทำการอัลตราซาวด์ ขณะนั้นเธอสูญเสียการมองเห็นในสายตาซ้ายและได้รับการแจ้งว่าเนื้องอกของเธอเติบโตขึ้นอีก 13 เซนติเมตร\nอเล็กซ์ได้รับการแจ้งว่าเธอเป็นโรคเนื้องอกต่อมใต้สมองที่หายาก ซึ่งในโลกนี้มีการบันทึกกรณีเช่นนี้เพียง 20 ราย และเธอเหลือเวลาแค่ 1 สัปดาห์เท่านั้น "เมื่ออายุ 16 ปี ชีวิตของฉันเหมือนจะผ่านไปต่อหน้าต่อตา สิ่งที่ฉันเคยห่วงใยมันไม่สำคัญสำหรับฉันอีกต่อจากวันนั้น" นักเรียนหญิงเผยว่าเธอไ

In [68]:
chunks = documentsSplitter(documents)
print(chunks)

Split text into 6 chunks.
[Document(metadata={'source': 'https://www.sanook.com/news/9735274/'}, page_content='เด็กสาววัย 16 ไปโรงพยาบาลเพราะท้องอืด ช็อก หมอบอกมีชีวิตอยู่ได้อีกแค่ 1 สัปดาห์ ต้องผ่าตัดด่วน\nอเล็กซ์ อาร์เคล อายุ 16 ปี จากสติริง สกอตแลนด์ ไปพบแพทย์เพื่อตรวจสอบอาการท้องอืดและปวดท้องที่เธอประสบอยู่ในช่วงหลัง ซึ่งไม่เคยคาดคิดว่าเธอจะเหลือเวลาอีกแค่ 1 สัปดาห์ในการมีชีวิตอยู่จากการมีเนื้องอกที่หายากมากในโลก'), Document(metadata={'source': 'https://www.sanook.com/news/9735274/'}, page_content='อเล็กซ์สังเกตเห็นอาการไม่สบายตัวครั้งแรกในระหว่างทริปท่องเที่ยวกับครอบครัว และเมื่อกลับบ้าน เธอก็รีบไปนัดหมายตรวจสุขภาพกับแพทย์ เธอได้รับการตรวจเลือดและเพียงแค่สัปดาห์เดียวหลังจากนั้นก็ถูกนำตัวไปโรงพยาบาลเพื่อทำการอัลตราซาวด์ ขณะนั้นเธอสูญเสียการมองเห็นในสายตาซ้ายและได้รับการแจ้งว่าเนื้องอกของเธอเติบโตขึ้นอีก 13 เซนติเมตร'), Document(metadata={'source': 'https://www.sanook.com/news/9735274/'}, page_content='อเล็กซ์ได้รับการแจ้งว่าเธอเป็นโรคเนื้องอกต่อมใต้สมองที่หายาก ซึ่งในโลกนี้มีการบัน

In [69]:
embeddings_dir="./embeddings-reg-web-typhoon2"

ingest(
    chunks=chunks,
    embeddings_dir=embeddings_dir
)

--- Ingest Text Successc ---


In [70]:
vectorstore = loadVectorStore(embeddings_dir)

--Vector Store Loaded--


In [71]:
retriever = getRetriever(vectorstore)

In [73]:
prompt = genPrompt("อเล็กมีอายุกี่ปี",retriever)

print(prompt)


        <s> [Instructions] 
                You are a friendly assistant. Answer the question based only on the following context. 
                If you don't know the answer, then reply, No Context availabel for this question.
            [/Instructions] 
        </s> 
        [Instructions] 
            Question: อเล็กมีอายุกี่ปี
            Context: เด็กสาววัย 16 ไปโรงพยาบาลเพราะท้องอืด ช็อก หมอบอกมีชีวิตอยู่ได้อีกแค่ 1 สัปดาห์ ต้องผ่าตัดด่วน
อเล็กซ์ อาร์เคล อายุ 16 ปี จากสติริง สกอตแลนด์ ไปพบแพทย์เพื่อตรวจสอบอาการท้องอืดและปวดท้องที่เธอประสบอยู่ในช่วงหลัง ซึ่งไม่เคยคาดคิดว่าเธอจะเหลือเวลาอีกแค่ 1 สัปดาห์ในการมีชีวิตอยู่จากการมีเนื้องอกที่หายากมากในโลก เด็กสาววัย 16 ไปโรงพยาบาลเพราะท้องอืด ช็อก หมอบอกมีชีวิตอยู่ได้อีกแค่ 1 สัปดาห์ ต้องผ่าตัดด่วน
อเล็กซ์ อาร์เคล อายุ 16 ปี จากสติริง สกอตแลนด์ ไปพบแพทย์เพื่อตรวจสอบอาการท้องอืดและปวดท้องที่เธอประสบอยู่ในช่วงหลัง ซึ่งไม่เคยคาดคิดว่าเธอจะเหลือเวลาอีกแค่ 1 สัปดาห์ในการมีชีวิตอยู่จากการมีเนื้องอกที่หายากมากในโลก เด็กสาววัย 16 ไปโรงพยาบา

In [74]:
askTyphoon2(prompt)

16


In [79]:
askTyphoon2(genPrompt("อเล็กมีลูกกี่คน",retriever))


16


In [81]:
askTyphoon2(genPrompt("อเล็กกินกับใคร",retriever))


เธอได้รับอาหารในโรงพยาบาลและได้รับการผ่าตัดที่ดี


In [15]:
def ingestWebsite(url: str,targetClassName: str):
    documents = loadContentFromWebsite(
        url=url,
        targetClassName= targetClassName
    )
    chunks = documentsSplitter(documents)
    embeddings_dir="./embeddings-website"
    ingest(
        chunks=chunks,
        embeddings_dir=embeddings_dir
    )
    
def askWebsite(question: str):
    embeddings_dir="./embeddings-website"
    vectorstore = loadVectorStore(embeddings_dir)
    retriever = getRetriever(vectorstore)
    prompt = genPrompt(question= question, retriever= retriever)
    askTyphoon2(prompt)

In [24]:
ingestWebsite(
    url="https://www.sanook.com/travel/1451675/",
    targetClassName= "EntryReaderInner"
)

Split text into 8 chunks.
--append data to vector store--
--- Ingest to Vector Database Success ---


In [25]:
askWebsite("ต่อพาสปอร์ต ใช้เอกสารอะไรบ้าง");

--Vector Store Loaded--
--Chat with Typhoon2--
ต่อพาสปอร์ตใช้เอกสารอะไรบ้าง
สำหรับผู้ที่มีอายุ 20 ปีขึ้นไป บัตรประชาชนตัวจริงที่ยังไม่หมดอายุ

สำหรับผู้ที่อายุต่ำกว่า 20 ปี

บัตรประชาชนตัวจริงที่ยังไม่หมดอายุ
สูติบัตร (กรณีอายุต่ำกว่า 15 ปี)
บัตรประชาชนของพ่อและแม่
หนังสือยินยอมจากพ่อแม่ให้เดินทางไปต่างประเทศ นำบัตรประชาชนเสียบเข้าไปในเครื่อง Kiosk เพื่อดึงข้อมูล
ตรวจสอบและกรอกข้อมูลเพิ่มเติมหากจำเป็น
ถ่ายรูปด้วยตนเอง สามารถถ่ายใหม่จนกว่าจะพอใจ
สแกนลายนิ้วมือและม่านตา (ควรงดใส่คอนแทคเลนส์สี)
เลือกอายุการใช้งานของพาสปอร์ต (5 ปี หรือ 10 ปี)
รับสลิปที่มี QR Code
นำสลิปไปชำระค่าธรรมเนียมที่เคาน์เตอร์
เลือกวิธีรับหนังสือเดินทาง (รับด้วยตนเองหรือจัดส่งทางไปรษณีย์)


In [26]:
askWebsite("เครื่องดื่มบูสต์พลังงานมีอะไรบ้าง")

--Vector Store Loaded--
--Chat with Typhoon2--
เครื่องดื่มบูสต์พลังงานจากธรรมชาติมีดังนี้:
1. คอมบูชา
2. น้ำเมล็ดเจีย
3. น้ำผึ้ง
4. โยเกิร์ตหมักด้วยจุลินทรีย์
5. น้ำผลไม้แห้ง
6. น้ำผลไม้ที่มีไฟเบอร์สูง
7. ชาเขียว
8. ชาดำ
9. น้ำมะพร้าว
10. น้ำมะนาว
11. น้ำส้ม
12. น้ำแครนเบอร์รี่
