In [2]:
#!pip install docling

Collecting docling
  Downloading docling-2.44.0-py3-none-any.whl.metadata (10 kB)
Collecting docling-core<3.0.0,>=2.42.0 (from docling-core[chunking]<3.0.0,>=2.42.0->docling)
  Downloading docling_core-2.44.2-py3-none-any.whl.metadata (6.5 kB)
Collecting docling-parse<5.0.0,>=4.0.0 (from docling)
  Downloading docling_parse-4.1.0-cp311-cp311-win_amd64.whl.metadata (9.6 kB)
Collecting docling-ibm-models<4,>=3.9.0 (from docling)
  Downloading docling_ibm_models-3.9.0-py3-none-any.whl.metadata (6.7 kB)
Collecting filetype<2.0.0,>=1.2.0 (from docling)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting pypdfium2!=4.30.1,<5.0.0,>=4.30.0 (from docling)
  Downloading pypdfium2-4.30.0-py3-none-win_amd64.whl.metadata (48 kB)
Collecting pydantic-settings<3.0.0,>=2.3.0 (from docling)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting huggingface_hub<1,>=0.23 (from docling)
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (

In [None]:
import os, re, json
from typing import List, Dict, Any
from docling.document_converter import DocumentConverter

# -------- regex helpers --------
# Patterns used to classify the ordinal ("STT") cell of a table row and to
# recognise dash-bulleted lines inside a documents cell.
ROMAN  = re.compile(r"^[IVXLCDM]+$")
DIGIT  = re.compile(r"^\d+$")
LETTER = re.compile(r"^[a-z]$")
BULLET = re.compile(r"^\s*-\s+")


def is_roman(s):
    """Return True if ``s`` is made up only of the letters I, V, X, L, C, D, M.

    This is a character-set check, not a numeral validator: "IIII" passes.
    ``None`` and the empty string yield False.
    """
    candidate = s if s else ""
    return ROMAN.fullmatch(candidate) is not None


def is_digit(s):
    """Return True if ``s`` is a non-empty run of digits; False for None/empty."""
    candidate = s if s else ""
    return DIGIT.fullmatch(candidate) is not None


def is_letter(s):
    """Return True if ``s`` is exactly one lower-case letter a-z; False for None/empty."""
    candidate = s if s else ""
    return LETTER.fullmatch(candidate) is not None

def clean(x):
    """Normalise a cell value to a single-line, single-spaced string.

    ``None`` becomes ``""``. Any other value is converted with ``str()``,
    carriage returns are removed, and every run of whitespace (including
    line breaks) is collapsed to one space with the ends trimmed.
    """
    if x is None:
        return ""
    text = str(x).replace("\r", "").strip()
    # First drop blanks hanging in front of a line break, then squeeze all
    # remaining whitespace runs down to a single space.
    for pattern, replacement in ((r"[ \t]+\n", "\n"), (r"\s+", " ")):
        text = re.sub(pattern, replacement, text)
    return text.strip()

def split_docs(text: str) -> List[str]:
    """Split a documents cell into a list of entries.

    A line matching ``BULLET`` (optional indent, a dash, whitespace) starts a
    new entry. Any other line is glued onto the entry currently being built,
    separated by a space; text before the first bullet forms an entry of its
    own. Whitespace inside each entry is collapsed and empty entries are
    dropped.

    Returns an empty list for empty/None input.
    """
    if not text:
        return []

    entries: List[str] = []
    pending = None  # entry under construction; None until the first line is seen
    for raw_line in text.replace("\r", "").split("\n"):
        if BULLET.match(raw_line):
            # A bullet closes the previous entry (if it has content) ...
            if pending:
                entries.append(pending.strip())
            # ... and opens a new one without the bullet marker.
            pending = BULLET.sub("", raw_line).strip()
        elif pending is None:
            pending = raw_line.strip()
        else:
            pending += " " + raw_line.strip()

    if pending and pending.strip():
        entries.append(pending.strip())

    return [re.sub(r"\s+", " ", entry).strip() for entry in entries if entry and entry.strip()]

def split_title(text: str) -> list:
    """Split a content cell into its individual entries.

    Entries are separated by commas or line breaks. Each entry has its
    internal whitespace collapsed to single spaces and is trimmed; empty
    entries are discarded.

    Args:
        text: raw cell text (any truthy non-string is converted with ``str``).

    Returns:
        List of non-empty entry strings; ``[]`` for empty/None input.
    """
    if not text:
        return []
    # Split BEFORE normalising whitespace. Collapsing whitespace first turns
    # every line break into a space, so the newline separator could never
    # match and multi-line cells came back as a single entry.
    raw_parts = re.split(r"[,\n]", str(text).replace("\r", ""))
    # " ".join(s.split()) squeezes whitespace runs and trims both ends, the
    # same normalisation clean() applies to a whole cell.
    normalised = (" ".join(part.split()) for part in raw_parts)
    return [part for part in normalised if part]

def force_labels(schema: Dict[str, Any]) -> None:
    """Stamp node labels onto a parsed schema, in place.

    Every section (Roman-numeral level) has its label overwritten with
    "Phamvi". Groups and items receive "Thutuc" (renamed from "Loaithutuc")
    only when they carry no "label" key yet; an existing label is kept.
    """
    for section in schema.get("sections", []):
        section["label"] = "Phamvi"
        for group in section.get("groups", []):
            if "label" not in group:
                group["label"] = "Thutuc"
            for item in group.get("items", []):
                if "label" not in item:
                    item["label"] = "Thutuc"

# -------- core: parse ONE table -> schema --------
def table_to_schema(header: list, rows: list) -> Dict[str, Any]:
    """Fold the body rows of one table into a section -> group -> item tree.

    Every row is reduced to four cells: ordinal ("STT"), content, documents
    and notes. Rows with fewer cells are padded with empty strings; rows with
    more cells keep the first cell as ordinal, the last two as documents and
    notes, and join everything in between into the content.

    The ordinal decides the role of the row:

    * only upper-case Roman letters -> new section
    * only digits                   -> new group
    * empty                         -> continuation of the latest item; if
      there is none, of the latest group title; if there is none either, of
      the latest section title
    * anything else (a letter, "-", ...) -> new item

    A group or item that appears before any parent gets a placeholder parent
    (section "I" / group "1", titled "Chưa rõ").

    Args:
        header: header cells of the table. Accepted for interface
            compatibility; the parser does not read it.
        rows: body rows, each an iterable of cell values. ``None`` cells are
            treated as empty, other non-strings are converted with ``str``.

    Returns:
        ``{"sections": [...]}`` where each section holds "code", "title",
        "label" and "groups"; each group holds "code", "title", "label" and
        "items"; each item holds "code", "Thanhphandutoan" (list),
        "label", "Hosochungtu" (de-duplicated list) and "Ghichu" (string).
    """
    schema: Dict[str, Any] = {"sections": []}
    current_section = None
    current_group = None
    current_item = None

    def new_section(code, title):
        """Open a section and reset the group/item cursors."""
        nonlocal current_section, current_group, current_item
        current_section = {"code": code, "title": title, "label": "Phamvi", "groups": []}
        schema["sections"].append(current_section)
        current_group = None
        current_item = None

    def new_group(code, title):
        """Open a group under the current section and reset the item cursor."""
        nonlocal current_group, current_item
        # Groups carry the label "Thutuc".
        current_group = {"code": code, "title": title, "label": "Thutuc", "items": []}
        current_section["groups"].append(current_group)
        current_item = None

    def new_item(code, title, docs, notes):
        """Append an item to the current group."""
        nonlocal current_item
        current_item = {
            "code": code,
            "Thanhphandutoan": split_title(title),  # list
            "label": "Thutuc",
            "Hosochungtu": split_docs(docs),        # list
            "Ghichu": clean(notes)
        }
        current_group["items"].append(current_item)

    def ensure_parents(need_group):
        """Create placeholder section (and group) when a child row comes first."""
        if current_section is None:
            new_section("I", "Chưa rõ")
        if need_group and current_group is None:
            new_group("1", "Chưa rõ")

    def append_to_last(content=None, docs=None, notes=None):
        """Merge a continuation row (empty ordinal) into the latest node."""
        if current_item is not None:
            if content:
                more = split_title(content)
                if more:
                    current_item["Thanhphandutoan"].extend(more)
            if docs:
                extra = split_docs(docs)
                if not extra and docs.strip():
                    extra = [clean(docs)]
                current_item["Hosochungtu"].extend(extra)
            if notes:
                cur = current_item.get("Ghichu", "")
                current_item["Ghichu"] = (cur + " " + clean(notes)).strip() if cur else clean(notes)
        elif content:
            if current_group is not None:
                current_group["title"] = (current_group["title"] + " " + content).strip()
            elif current_section is not None:
                # Continuation text right after a section header used to be
                # dropped; keep it by extending the section title, mirroring
                # what is done for groups.
                current_section["title"] = (current_section["title"] + " " + content).strip()

    for row in rows:
        # None must become "", not the literal string "None" (which would be
        # parsed as an item code).
        cells = ["" if c is None else (c if isinstance(c, str) else str(c)) for c in row]

        if len(cells) < 4:
            cells = cells + [""] * (4 - len(cells))
        elif len(cells) > 4:
            # Extra columns: everything between the ordinal and the last two
            # cells is treated as content.
            cells = [cells[0], " ".join(cells[1:-2]), cells[-2], cells[-1]]

        stt, content, docs, notes = cells
        stt = stt.strip()
        content = clean(content)
        docs = docs.strip()
        notes = notes.strip()

        if is_roman(stt):
            new_section(stt, content)
        elif is_digit(stt):
            ensure_parents(need_group=False)
            new_group(stt, content)
        elif stt == "":
            append_to_last(content=content, docs=docs, notes=notes)
        else:
            # Single letters, "-" and any other non-empty marker all open an item.
            ensure_parents(need_group=True)
            new_item(stt, content, docs, notes)

    # De-duplicate Hosochungtu while preserving first-seen order.
    for sec in schema["sections"]:
        for grp in sec["groups"]:
            for it in grp["items"]:
                it["Hosochungtu"] = list(dict.fromkeys(it["Hosochungtu"]))

    force_labels(schema)
    return schema

# -------- main: DOCX -> structured JSON --------
def convert_docx_to_structured_json(file_path, out_path):
    """Convert a document's tables to structured JSON and write it to disk.

    The file is converted with docling's ``DocumentConverter``; the resulting
    document is dumped to a dict and every entry of its "tables" list that has
    a non-empty ``data.grid`` is parsed with ``table_to_schema`` (first grid
    row = header, remaining rows = body).

    The output is wrapped under the top-level key "Quytrinh" with the
    document title (file name without extension) and the list of parsed
    tables, written to ``out_path`` as UTF-8 JSON, and also returned.

    Args:
        file_path: path of the source document.
        out_path: path of the JSON file to write; its parent directory is
            created when missing.

    Returns:
        The dict that was written to ``out_path``.
    """
    converter = DocumentConverter()
    result = converter.convert(file_path)
    doc_dict = result.document.model_dump()

    def cell_text(cell):
        # Tolerate a missing or null "text" value instead of crashing on .strip().
        return (cell.get("text") or "").strip()

    structured_all = []
    for tb in doc_dict.get("tables") or []:
        grid = (tb.get("data") or {}).get("grid") or []
        if not grid:
            continue
        header = [cell_text(c) for c in grid[0]]
        rows = [[cell_text(c) for c in r] for r in grid[1:]]
        structured_all.append(table_to_schema(header, rows))

    # Wrap the whole output under the top-level key "Quytrinh".
    out = {
        "Quytrinh": {
            "title": os.path.splitext(os.path.basename(file_path))[0],
            "tables_structured": structured_all,
        }
    }

    # os.makedirs("") raises, so only create the directory when out_path
    # actually has a directory component.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)
    print(f"{file_path} -> {out_path}")

    return out

# -------- run demo --------
if __name__ == "__main__":
    # Demo run. Both paths are absolute and specific to one machine; adjust
    # them before running anywhere else.
    input_path = r"C:/Users/heheh/Desktop/Convert-docling/doc/Quy trinh Kiem soat chi va Thanh toan cua UET (03.01.2021).docx"
    # The parent folder of the output file is created by the converter if missing.
    output_path = r"C:/Users/heheh/Desktop/Convert-docling/Final Json/Quy trình/chapter_3.json"
    convert_docx_to_structured_json(input_path, output_path)


✅ C:/Users/heheh/Desktop/Convert-docling/doc/Quy trinh Kiem soat chi va Thanh toan cua UET (03.01.2021).docx -> C:/Users/heheh/Desktop/Convert-docling/Final Json/Quy trình/chapter_3.json


In [6]:
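# Quote the version specifiers: an unquoted ">=" is treated by the shell as
# output redirection, so the constraint would be silently ignored.
#%pip install "neo4j>=5.21"
#%pip install "python-dotenv>=1.0"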


In [22]:
import os, re, shutil
from pathlib import Path
from neo4j import GraphDatabase, basic_auth

# ===================== CONFIGURATION =====================
# Connection settings can be overridden with environment variables of the
# same name, so credentials need not be written into the notebook. The
# defaults reproduce the original local setup.
NEO4J_URI      = os.environ.get("NEO4J_URI", "neo4j://127.0.0.1:7687")
NEO4J_USER     = os.environ.get("NEO4J_USER", "neo4j")
# An empty password makes main() connect without authentication.
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "")
NEO4J_DATABASE = os.environ.get("NEO4J_DATABASE", "neo4j")

# JSON produced by the conversion step.
SOURCE_JSON = "C:/Users/heheh/Desktop/Convert-docling/Final Json/Quy trình/chapter_3.json"
# Directory the JSON is copied into before the import runs.
IMPORT_DIR = r"C:/Users/heheh/.Neo4jDesktop2/Data/dbmss/dbms-1145d760-e398-4bc3-b64c-ea5ffc49ec22/import"
# URL handed to apoc.load.json as $jsonUrl.
JSON_URL = "file:///chapter_3.json"

# Schema constraints executed one by one before the import. Every statement
# uses IF NOT EXISTS, so running the cell again does not fail on constraints
# that are already present.
CONSTRAINTS = [
    # One Quytrinh node per title.
    "CREATE CONSTRAINT IF NOT EXISTS FOR (q:Quytrinh) REQUIRE q.title IS UNIQUE",
    # Phamvi / Thutuc nodes are identified by composite keys.
    # NOTE(review): NODE KEY constraints are, to my knowledge, an Enterprise
    # Edition feature — confirm the target DBMS supports them.
    "CREATE CONSTRAINT IF NOT EXISTS FOR (s:Phamvi)   REQUIRE (s.proc, s.tableIdx, s.code) IS NODE KEY",
    "CREATE CONSTRAINT IF NOT EXISTS FOR (t:Thutuc)   REQUIRE (t.proc, t.tableIdx, t.sectionCode, t.code) IS NODE KEY",
    # Thanhphandutoan and Hosochungtu nodes are unique by name, Ghichu nodes by key.
    "CREATE CONSTRAINT IF NOT EXISTS FOR (x:Thanhphandutoan) REQUIRE x.name IS UNIQUE",
    "CREATE CONSTRAINT IF NOT EXISTS FOR (x:Hosochungtu)     REQUIRE x.name IS UNIQUE",
    "CREATE CONSTRAINT IF NOT EXISTS FOR (x:Ghichu)          REQUIRE x.key  IS UNIQUE",
]

# Single-statement import. Stages, in order:
#   1. load the JSON at $jsonUrl and MERGE the Quytrinh root by title;
#   2. walk tables_structured by index, then each table's sections; keep only
#      sections whose upper-cased code is made of Roman-numeral letters and
#      MERGE them as Phamvi nodes linked by HAS_SECTION;
#   3. keep groups whose code is all digits and MERGE them as Thutuc nodes
#      linked from their section by HAS_ITEM;
#   4. for each item with a non-empty code, MERGE its Thanhphandutoan and
#      Hosochungtu entries and link them from the group's Thutuc node with
#      REQUIRES {item: <item code>};
#   5. MERGE a Ghichu node for every meaningful note and link it with NOTE.
#
# The literal is a RAW string on purpose: the backslash sequences belong to
# Cypher, not Python. Without the r-prefix Python turned '\r' and '\n' into
# real control characters inside the Cypher literals and warned about the
# invalid escape '\d'. The digit check is written as [0-9] so that it needs
# no escaping at either level.
IMPORT_CYPHER = r"""
CALL apoc.load.json($jsonUrl) YIELD value
WITH value.Quytrinh AS root
MERGE (q:Quytrinh {title: root.title})

WITH q, coalesce(root.tables_structured, []) AS tbs
UNWIND range(0, size(tbs)-1) AS tblIdx
WITH q, tblIdx, tbs[tblIdx] AS tbl
UNWIND coalesce(tbl.sections, []) AS secMap

WITH q, tblIdx,
     toUpper(coalesce(secMap.code,'')) AS secCode,
     coalesce(secMap.title,'Chưa rõ')  AS secTitle,
     coalesce(secMap.label,'Phamvi')   AS secLabel,
     secMap
WHERE secCode =~ '^[IVXLCDM]+$'
MERGE (s:Phamvi {proc:q.title, tableIdx:tblIdx, code:secCode})
  ON CREATE SET s.title = secTitle, s.label = secLabel
  ON MATCH  SET s.title = secTitle, s.label = secLabel
MERGE (q)-[:HAS_SECTION]->(s)

WITH q, s, tblIdx, secMap
UNWIND coalesce(secMap.groups, []) AS grpMap
WITH q, s, tblIdx,
     coalesce(grpMap.code,'')         AS grpCode,
     coalesce(grpMap.title,'Chưa rõ') AS grpTitle,
     grpMap
WHERE grpCode =~ '^[0-9]+$'
MERGE (t:Thutuc {proc:q.title, tableIdx:tblIdx, sectionCode:s.code, code:grpCode})
  ON CREATE SET t.title = grpTitle, t.label = 'Thutuc', t.level = 'group'
  ON MATCH  SET t.title = grpTitle, t.label = 'Thutuc', t.level = 'group'
MERGE (s)-[:HAS_ITEM]->(t)

WITH q, s, t, tblIdx, grpMap
UNWIND coalesce(grpMap.items, []) AS itMap
WITH q, s, t, tblIdx,
     toLower(trim(coalesce(itMap.code,''))) AS itemCode,
     itMap
WHERE itemCode <> ''

FOREACH (_ IN [1] |
  FOREACH (tpName IN [x IN coalesce(itMap.Thanhphandutoan,[]) WHERE trim(x) <> ''] |
    MERGE (tp:Thanhphandutoan {name: trim(tpName)})
    MERGE (t)-[:REQUIRES {item:itemCode}]->(tp)
  )
)

FOREACH (_ IN [1] |
  FOREACH (hsName IN [x IN coalesce(itMap.Hosochungtu,[]) WHERE trim(x) <> ''] |
    MERGE (hs:Hosochungtu {name: trim(hsName)})
    MERGE (t)-[:REQUIRES {item:itemCode}]->(hs)
  )
)

WITH t, itemCode, itMap,
     trim(replace(replace(coalesce(itMap.Ghichu,''),'\r',' '),'\n',' ')) AS noteText
WHERE noteText <> '' AND NOT noteText IN ['-','—','N/A','n/a','None','null']
MERGE (gh:Ghichu { key: t.proc+'|'+toString(t.tableIdx)+'|'+t.sectionCode+'|'+t.code+'|'+itemCode+'|'+noteText })
  ON CREATE SET gh.text = noteText
MERGE (t)-[:NOTE {item:itemCode}]->(gh)
"""

def main():
    """Copy the JSON into the import directory and load it into Neo4j.

    Steps:
      1. copy ``SOURCE_JSON`` into ``IMPORT_DIR`` under the file name that
         ``JSON_URL`` refers to;
      2. open a driver (no authentication when ``NEO4J_PASSWORD`` is empty)
         and check the connection with ``RETURN 1``;
      3. run every statement in ``CONSTRAINTS``;
      4. run ``IMPORT_CYPHER`` with ``jsonUrl=JSON_URL``.

    The driver is closed even when one of the statements raises.
    """
    # Name the copy after the last path segment of JSON_URL so the copied
    # file and the URL passed to the import query cannot drift apart.
    # NOTE(review): this assumes file:/// URLs are resolved against IMPORT_DIR
    # by the server's APOC configuration — confirm.
    import_name = JSON_URL.rsplit("/", 1)[-1]

    os.makedirs(IMPORT_DIR, exist_ok=True)
    dest_path = os.path.join(IMPORT_DIR, import_name)
    print(f"Copying {SOURCE_JSON} to {dest_path}")
    shutil.copy2(SOURCE_JSON, dest_path)
    print("File copied successfully!")

    auth = None if NEO4J_PASSWORD == "" else basic_auth(NEO4J_USER, NEO4J_PASSWORD)
    driver = GraphDatabase.driver(NEO4J_URI, auth=auth)
    try:
        with driver.session(database=NEO4J_DATABASE) as s:
            # Connectivity check.
            s.run("RETURN 1").consume()
            print("Connected.")

            # Create the constraints.
            for i, stmt in enumerate(CONSTRAINTS, 1):
                s.run(stmt).consume()
                print(f"  ✓ constraint {i}/{len(CONSTRAINTS)} OK")

            # Run the import (a single statement).
            print("Importing from:", JSON_URL)
            s.run(IMPORT_CYPHER, jsonUrl=JSON_URL).consume()
            print("Import hoàn tất!")
    finally:
        # Release the connection pool on failure as well as on success.
        driver.close()

if __name__ == "__main__":
    main()

Copying C:/Users/heheh/Desktop/Convert-docling/Final Json/Quy trình/chapter_3.json to C:/Users/heheh/.Neo4jDesktop2/Data/dbmss/dbms-1145d760-e398-4bc3-b64c-ea5ffc49ec22/import\chapter_3.json
File copied successfully!
Connected.
  ✓ constraint 1/6 OK
  ✓ constraint 2/6 OK
  ✓ constraint 3/6 OK
  ✓ constraint 4/6 OK
  ✓ constraint 5/6 OK
  ✓ constraint 6/6 OK
Importing from: file:///chapter_3.json
Import hoàn tất!
