## Configuration

In [3]:
"""Chunking strategy to divide text file into main topics."""
from KG_builder.extract.extract_triples import extract_triples
from KG_builder.utils.llm_utils import load_model


In [5]:

LLM_MODEL = "gemini-2.0-flash-lite"
EMBEDDING_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
THRESHOLE = 0.95
DATA = "./data"

In [15]:
import os

list_file = os.listdir(DATA)

## Tạo bảng lưu

In [3]:
from KG_builder.models.db_engine import *

create_tables()

Đã tạo bảng thành công!


## Load model

In [6]:

from KG_builder.embedding.load.free import QwenEmbedding

embed_model = QwenEmbedding(model_name=EMBEDDING_MODEL)
llm = load_model(LLM_MODEL)

In [8]:
embed_model.encode_sync(["Hello"]).shape

(1, 896)

## Lấy triples

In [7]:
triples = []

In [17]:

for file_name in list_file:
    path = os.path.join(DATA, file_name)
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()

    cleaned_text = clean_vn_text(text)

    section_boundaries = [
        ("THÔNG TIN CÁ NHÂN", "7. Quá trình công tác"),
        ("7. Quá trình công tác", "B. TỰ KHAI THEO")
    ]

    section_prompts = [
        (EXTRACT_TRIPLE_PERSONAL_INFO_PROMPT, EXTRACT_TRIPLE_PERSONAL_INFO_USER_PROMPT),
        (EXTRACT_TRIPLE_WORKING_INFO_PROMPT, EXTRACT_TRIPLE_WORKING_INFO_USER_PROMPT)
    ]

    main_subject = None

    for i, ((start_keyword, end_keyword), (system_instruction, context)) in enumerate(zip(section_boundaries, section_prompts)):
        section = extract_specific_sections(
            text=text,
            start_keyword=start_keyword,
            end_keyword=end_keyword
        )
        
        context_kwargs = {"context": section}
        if main_subject:
            context_kwargs["main_subject"] = main_subject

        messages = {
            "system_instruction": system_instruction.format(),
            "context": context.format(**context_kwargs)
        }
        
        response = extract_triples(
            messages=messages,
            llm=llm,
            response_format=response_format
        )
        
        for triple in response["triples"]:
            if not isinstance(triple, dict):
                continue
            add_triple_data(session=session, triple_data=triple)
        
        
        if i == 0:
            main_subject = response.get("triples")[0]["subject"]

{
  "triples": [
    {
      "subject": "TRẦN QUỐC HÒA",
      "predicate": "has_birth_date",
      "object": "24-07-1976",
      "metadata": {
        "source": "2. Ngày tháng năm sinh: 24/07/1976 ; Nam þ ; Nữ ; Quốc tịch: Việt Nam ;"
      }
    },
    {
      "subject": "TRẦN QUỐC HÒA",
      "predicate": "gender",
      "object": "Nam",
      "metadata": {
        "source": "2. Ngày tháng năm sinh: 24/07/1976 ; Nam þ ; Nữ ; Quốc tịch: Việt Nam ;"
      }
    },
    {
      "subject": "TRẦN QUỐC HÒA",
      "predicate": "nationality",
      "object": "Việt Nam",
      "metadata": {
        "source": "2. Ngày tháng năm sinh: 24/07/1976 ; Nam þ ; Nữ ; Quốc tịch: Việt Nam ;"
      }
    },
    {
      "subject": "TRẦN QUỐC HÒA",
      "predicate": "ethnicity",
      "object": "Kinh",
      "metadata": {
        "source": "Dân tộc: Kinh ; Tôn giáo: không"
      }
    },
    {
      "subject": "TRẦN QUỐC HÒA",
      "predicate": "religion",
      "object": "không",
      "metadata": {
  

ERROR:root:Message: Unterminated string starting at: line 16 column 19 (char 347)
Traceback (most recent call last):
  File "/Users/huynhnguyen/WorkDir/bachoc_1/src/KG_builder/extract/extract_triples.py", line 16, in extract_triples
    res = json.loads(response)
  File "/opt/homebrew/Cellar/python@3.13/3.13.7/Frameworks/Python.framework/Versions/3.13/lib/python3.13/json/__init__.py", line 346, in loads
    return _default_decoder.decode(s)
           ~~~~~~~~~~~~~~~~~~~~~~~^^^
  File "/opt/homebrew/Cellar/python@3.13/3.13.7/Frameworks/Python.framework/Versions/3.13/lib/python3.13/json/decoder.py", line 345, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
               ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Cellar/python@3.13/3.13.7/Frameworks/Python.framework/Versions/3.13/lib/python3.13/json/decoder.py", line 361, in raw_decode
    obj, end = self.scan_once(s, idx)
               ~~~~~~~~~~~~~~^^^^^^^^
json.decoder.JSONDecodeError: Unterminated 

{
  "triples": [
    {
      "subject": "ĐỖ VĂN CHIẾN",
      "predicate": "has_birth_date",
      "object": "17-11-1980",
      "metadata": {
        "source": "Ngày tháng năm sinh: 17 - 11 - 1980"
      }
    },
    {
      "subject": "ĐỖ VĂN CHIẾN",
      "predicate": "has_gender",
      "object": "Nam",
      "metadata": {
        "source": "Ngày tháng năm sinh: 17 - 11 - 1980; \n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\

UnboundLocalError: cannot access local variable 'res' where it is not associated with a value

In [9]:
len(triples)

47