In [46]:
import json
import re
import os
import uuid
from typing import Dict, List, Any


In [1]:
class EntityType:
    """Định nghĩa các loại thực thể được hỗ trợ."""
    COURSE = "course"
    SESSION = "session"
    CLO = "clo"
    ASSESSMENT = "assessment"
    MATERIAL = "material"

In [2]:
def handle_entity(entity_type):
    if entity_type == EntityType.COURSE:
        print("Đang xử lý khóa học")
    elif entity_type == EntityType.SESSION:
        print("Đang xử lý buổi học")


In [None]:
def clean_text(text):
    """Làm sạch văn bản cơ bản: loại bỏ khoảng trắng thừa, chuẩn hóa newline."""
    if not isinstance(text, str):
        return str(text)
    text = text.replace("\n", " ")
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [6]:
text = """   Đây là một đoạn văn bản   \n với nhiều khoảng trắng. 
alo
123  
"""
cleaned_text = clean_text(text)
print(f"Văn bản đã được làm sạch: '{cleaned_text}'")

Văn bản đã được làm sạch: 'Đây là một đoạn văn bản với nhiều khoảng trắng. alo 123'


In [57]:
def extract_subject_areas(syllabus_data: Dict[str, Any]) -> List[str]:
    """Trích xuất các lĩnh vực chủ đề của môn học."""
    common = {
        "bartending": ["pha chế","cocktail","bartending","đồ uống"],
        "math": ["math", "mathematics", "toán học", "toán", "đại số", "giải tích", "xác suất", "thống kê"],
        "programming": ["programming", "lập trình", "coding", "development", "phát triển", "software", "phần mềm"],
        "ai": ["ai", "artificial intelligence", "trí tuệ nhân tạo", "machine learning", "học máy", "deep learning", "học sâu"],
        "database": ["database", "cơ sở dữ liệu", "data", "dữ liệu", "sql", "nosql"],
        "networking": ["network", "mạng", "internet", "web", "protocol", "giao thức"],
        "security": ["security", "bảo mật", "cryptography", "mã hóa", "privacy", "riêng tư"],
        "business": ["business", "kinh doanh", "marketing", "management", "quản lý", "finance", "tài chính"],
        "english": ["english", "tiếng anh", "language", "ngôn ngữ"],
        "chinese": ["chinese", "tiếng trung", "language", "ngôn ngữ"],
        "japanese": ["japanese", "tiếng nhật", "language", "ngôn ngữ"],
        "graphic": ["graphic", "đồ họa", "graphics", "animation", "hoạt hình"],
        "design": ["design", "thiết kế", "ui", "ux", "interface", "giao diện"],
        "mobile": ["mobile", "di động", "android", "ios", "app", "ứng dụng"],
        "vovinam": ["vovinam", "võ", "martial arts", "nghệ thuật chiến đấu"],
        "bartending": ["bartending", "pha chế", "cocktail", "đồ uống"],
        "cooking": ["cooking", "nấu ăn", "ẩm thực", "food", "đồ ăn"],
    }
    text = " ".join([
        syllabus_data.get("general_details", {}).get("Syllabus Name","").lower(),
        syllabus_data.get("general_details", {}).get("Syllabus English","").lower(),
        syllabus_data.get("general_details", {}).get("Description","").lower()
    ])
    areas = [area for area, kws in common.items() if any(kw in text for kw in kws)]
    return list(set(areas))

In [58]:
syllabus_data = {
    "general_details": {
        "Syllabus Name": "Lập trình căn bản",
        "Syllabus English": "Introduction to Programming",
        "Description": "Học viên sẽ học cách viết chương trình đơn giản bằng Python."
    }
}

subject_areas = extract_subject_areas(syllabus_data)
print(f"Các lĩnh vực chủ đề của môn học: {subject_areas}")

Các lĩnh vực chủ đề của môn học: ['programming']


In [59]:
def extract_important_metadata(syllabus_data):
    """Trích xuất các metadata quan trọng từ syllabus để sử dụng trong nhiều chunks."""
    metadata = {}
    
    if "general_details" in syllabus_data:
        gd = syllabus_data["general_details"]
        
        metadata["syllabus_name"] = gd.get("Syllabus Name", "N/A")
        metadata["syllabus_english"] = gd.get("Syllabus English", "")
        metadata["subject_code"] = gd.get("Subject Code", "N/A")
        metadata["credits"] = gd.get("NoCredit", "N/A")
        metadata["degree_level"] = gd.get("Degree Level", "N/A")
        metadata["time_allocation"] = gd.get("Time Allocation", "N/A")
        metadata["pre_requisite"] = gd.get("Pre-Requisite", "")
        
        # Break down description into short and full
        description = gd.get("Description", "N/A")
        metadata["description"] = description
        metadata["description_full"] = description
        metadata["description_short"] = description[:150] + "..." if len(description) > 150 else description
        
        metadata["student_tasks"] = gd.get("StudentTasks", "N/A")
        metadata["tools"] = gd.get("Tools", "")
        metadata["scoring_scale"] = gd.get("Scoring Scale", "N/A")
        metadata["min_avg_mark_to_pass"] = gd.get("MinAvgMarkToPass", "N/A")
        metadata["note"] = gd.get("Note", "")
        
    # Extract assessment summary if available
    if "assessments" in syllabus_data and syllabus_data["assessments"]:
        assessment_weights = []
        for assessment in syllabus_data["assessments"]:
            category = assessment.get("Category", "")
            weight = assessment.get("Weight", "")
            if category and weight:
                assessment_weights.append(f"{category}: {weight}")
        
        if assessment_weights:
            metadata["assessment_summary"] = ", ".join(assessment_weights)
    
    # Count total materials
    if "materials_table" in syllabus_data:
        metadata["total_materials"] = len(syllabus_data["materials_table"])
    else:
        metadata["total_materials"] = 0
    
    # Count total CLOs
    if "clos" in syllabus_data:
        metadata["total_clos"] = len(syllabus_data["clos"])
    else:
        metadata["total_clos"] = 0
    
    # Count total sessions
    if "sessions" in syllabus_data:
        metadata["total_sessions"] = len(syllabus_data["sessions"])
    else:
        metadata["total_sessions"] = 0
    
    # Extract subject area keywords
    subject_areas = extract_subject_areas(syllabus_data)
    if subject_areas:
        metadata["subject_areas"] = subject_areas
    else:
        metadata["subject_areas"] = []
    
    return metadata

In [60]:
syllabus_data = {
  "RES222": {
    "page_title": "FPT University Learning Materials",
    "page_url": "https://flm.fpt.edu.vn/gui/role/student/SyllabusDetails?sylID=10372",
    "subject_code": "RES222",
    "syllabus_id": "10372",
    "general_details": {
      "Syllabus ID": "10372",
      "Syllabus Name": "Wines, Beers, Spirits 2_Nghiệp vụ pha chế đồ uống 2",
      "Syllabus English": "Wines, Beers, Spirits 2",
      "Subject Code": "RES222",
      "NoCredit": "3",
      "Degree Level": "Bachelor",
      "Time Allocation": "Study hour (150h) = 45h (60 sessions) contact hours + 1h final exam + 104h self-study",
      "Pre-Requisite": "RES213",
      "Description": "-Objective: To provides the basic knowledge about the berverage industry as well as the knowledge of the world beverage culture in operating the Restaurant - Bar.\n\n-Description: This course is the continued part of RES213. This course focus on the beverage service of the Hospitality Industry including spirits, wines, beers and mixology beverages. Topics include purchasing, resource control, beverage storage and issuing, bar stocks and sales control, bar customer care, safety and security, managing bar wastage, the law and sale of liquor.\n\n-Teaching method: lecture, discussion, demonstration, roleplay, project based learning, group work\n\n- Facility and equipment requirement : Depending on  the training condition of each campus, the course will be implemented in school lab designed up to 4-5* hotel standard, real hotel or other places, which fully met with requirements.",
      "StudentTasks": "- Class attendance is strongly encouraged. Attend at least 80% of class hours in order to be accepted to the final examination \n\n- Actively participate in class activities\n\n- Fulfill tasks given by instructor after class\n\n- Use their own laptop in class only for learning purpose\n\n- Read the textbook in advance\n\n- Access the course website (http://flm.fpt.edu.vn) for up-to-date information and material of the course, for online supports from teachers and other students and for practicing and assessment.",
      "Tools": "Internet access",
      "Scoring Scale": "10",
      "DecisionNo MM/dd/yyyy": "402/QĐ-ĐHFPT dated 04/22/2024",
      "IsApproved": "True",
      "Note": "",
      "MinAvgMarkToPass": "5",
      "IsActive": "True",
      "ApprovedDate": "4/22/2024"
    },
    "materials_table": [
      {
        "MaterialDescription": "The Bar & Beverage Book",
        "Author": "Costas, K., & Chris, T.",
        "Publisher": "John Wiley & Sons Publisher",
        "PublishedDate": "2012",
        "Edition": "5th edition",
        "ISBN": "9780470248454",
        "IsMainMaterial": "",
        "IsHardCopy": "",
        "IsOnline": "",
        "Note": "Wiley - Vital Source"
      }
    ],
    "clos": [
      {
        "CLO Name": "1",
        "CLO Details": "CLO1",
        "LO Details": "Manage and  correctly arrange sanitation standards and routines for the bar setup."
      },
      {
        "CLO Name": "2",
        "CLO Details": "CLO10",
        "LO Details": "Present basic knowledge about tea and coffee beverages"
      },
      {
        "CLO Name": "10",
        "CLO Details": "CLO9",
        "LO Details": "Develop ability to work quietly; with cooperation; patience, carefulness, cleanliness and aesthetic values."
      }
    ],
    "clo_plo_mapping_link": "https://flm.fpt.edu.vn/CLOMapping/View?syllabusID=10372",
    "sessions": [
      {
        "Session": "1",
        "Topic": "Chapter 9: Sanitation and Bar setup\n\n- Sanitation & liquor supplies \n\n- Mixes \n\n- Garnishes and condiments \n\n- Service accessories",
        "Learning-Teaching Type": "Offline",
        "LO": "LO1",
        "ITU": "",
        "Student Materials": "Text book, Slides",
        "S-Download": {
          "text": "RES222",
          "link": "https://flm.fpt.edu.vn/download/6252/S/1_RES222.zip"
        },
        "Student's Tasks": "Read powerpoint slides and textbook before lecture",
        "URLs": ""
      },
      {
        "Session": "2",
        "Topic": "Chapter 9: Sanitation and Bar setup\n\n- Sanitation & liquor supplies \n\n- Mixes \n\n- Garnishes and condiments \n\n- Service accessories",
        "Learning-Teaching Type": "Offline",
        "LO": "LO1",
        "ITU": "",
        "Student Materials": "Text book, Slides",
        "S-Download": {
          "text": "RES222",
          "link": "https://flm.fpt.edu.vn/download/6252/S/2_RES222.zip"
        },
        "Student's Tasks": "Read powerpoint slides and textbook before lecture",
        "URLs": ""
      },
      {
        "Session": "59",
        "Topic": "GROUP PRESENTATION AND REPORT SUBMISSION",
        "Learning-Teaching Type": "Offline",
        "LO": "",
        "ITU": "",
        "Student Materials": "",
        "S-Download": "",
        "Student's Tasks": "",
        "URLs": ""
      },
      {
        "Session": "60",
        "Topic": "GROUP PRESENTATION AND REPORT SUBMISSION",
        "Learning-Teaching Type": "Offline",
        "LO": "",
        "ITU": "",
        "Student Materials": "",
        "S-Download": "",
        "Student's Tasks": "",
        "URLs": ""
      }
    ],
    "assessments": [
      {
        "Category": "Class participation and preparation",
        "Type": "on-going",
        "Part": "1",
        "Weight": "10.0%",
        "Completion Criteria": ">0",
        "Duration": "During all course",
        "CLO": "",
        "Question Type": "Each student shoud make at least 1 individual presentation, case analysis during the course in front of the class. Topics will be varied based on the topics in the textbook.",
        "No Question": "",
        "Knowledge and Skill": "",
        "Grading Guide": "",
        "Note": "Participation grade will be given based on students'  preparation, and quantity as well as quality of participation in classroom; quality of participation is demonstrated through asking relevant questions, making statements that add to and facilitate class discussion, and builds upon others’ comments. Class preparation and pre-reading before lecture will be assessed upon individuals' understanding and ability to answer questions in the class. Individuals will be called up randomly to be assessed."
      },
      {
        "Category": "Group assignment",
        "Type": "on-going",
        "Part": "1",
        "Weight": "25.0%",
        "Completion Criteria": ">0",
        "Duration": "Present on session 29,30",
        "CLO": "All Los",
        "Question Type": "Four or five students per group.\n\nGroups and groups ideas are determined on the third session of the course. Work will be break down into stages and examined during the course. Final work to be submitted prior to the presentation of the topic.",
        "No Question": "Minimum length of the group assignment should be 10 pages, excluding appendices",
        "Knowledge and Skill": "Based on topics of studied units",
        "Grading Guide": "Group presentation in class room",
        "Note": "All assignments should be typed and double-spaced with 13-point font, and must be submitted in soft copy. A deduction of 10% of the assignment mark will be applied for each day of late submission\n\nEvaluation will be based on both presentation and written report (reccommded 30%, 70% accordingly)"
      },
      {
        "Category": "Group practice",
        "Type": "on-going",
        "Part": "1",
        "Weight": "15.0%",
        "Completion Criteria": ">0",
        "Duration": "- Session 27, 28",
        "CLO": "All Los",
        "Question Type": "Four or five students per group. Competency is to be demonstrated by effectively preparing and serving beverage in accordance with the performance criteria and the range listed within the practical learning sessions.",
        "No Question": "",
        "Knowledge and Skill": "Based on the practical learning sessions of the course.",
        "Grading Guide": "Bar study room",
        "Note": "Competency in this unit may be assessed through: Demonstration of skills on mixing different (in quantity) international alcoholic beverages in a prescribed timeframe; utilizing systematic liquor pouring and proper bar measurement; mixing of alcoholic concoction showmanship skill techniques.\n\nThe mark will be given based on each student's performance, not on the whole group's. It means each student will get their own mark."
      },
      {
        "Category": "Individual practice",
        "Type": "on-going",
        "Part": "2",
        "Weight": "20.0%",
        "Completion Criteria": ">0",
        "Duration": "-Session 10,11,12\n\n- Session 21,22,23",
        "CLO": "LO2,3,8,9",
        "Question Type": "Each student must be assessed through practical demonstration in a simulated workplace environment where beverage ingredients and equipment are provided. Competency is to be demonstrated by effectively preparing and serving beverage in accordance with the performance criteria and the range listed within the practical learning sessions.",
        "No Question": "2 practical assessments",
        "Knowledge and Skill": "Based on the practical learning sessions of the course.",
        "Grading Guide": "Bar study room",
        "Note": "Competency in this unit may be assessed through: \n\n- Observation shall be done on the proper classification of alcoholic beverages and glasses; proper mixing procedures on several cocktails and the uses and maintenance of different bar tools and equipment.\n\n- Direct observation of student providing advice to customers or colleagues on cocktail and its proper serving procedures."
      },
      {
        "Category": "Quizzes",
        "Type": "on-going",
        "Part": "2",
        "Weight": "10.0%",
        "Completion Criteria": ">0",
        "Duration": "- Session 10\n\n- Session 21",
        "CLO": "LO1-3",
        "Question Type": "Multiple choice, exercises or essay questions",
        "No Question": "Appx. 15",
        "Knowledge and Skill": "Quizzes will test for knowledge of subjects in the chapter assigned for that day.",
        "Grading Guide": "Class room",
        "Note": "These quizzes will be given at various times during class (beginning, middle or end). Total number of quizzes: 2"
      },
      {
        "Category": "Final exam",
        "Type": "final exam",
        "Part": "1",
        "Weight": "20.0%",
        "Completion Criteria": "4",
        "Duration": "60'",
        "CLO": "All Los",
        "Question Type": "Multiple choice .  Other types of questions may be included.",
        "No Question": "50",
        "Knowledge and Skill": "The questions/ exercises will cover the lectures, class discussions, activities, exercises, and textbook.",
        "Grading Guide": "Exam room",
        "Note": "Not allow to use any document/information in any form in exam room"
      }
    ],
    "extraction_errors": [],
    "extraction_time": "2025-05-15T18:07:06.605Z",
    "materials_info": "1 material(s)"
  }
}

import json
syllabus_data = json.dumps(syllabus_data, ensure_ascii=False, indent=2)

In [61]:
def create_entity_id(entity_type: str, subject_code: str, identifier: str = None) -> str:
    """Tạo ID duy nhất cho thực thể."""
    if identifier:
        return f"{entity_type}_{subject_code}_{identifier}"
    return f"{entity_type}_{subject_code}"

In [62]:
def create_enhanced_chunks_from_syllabus(subject_code: str, sd: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Tạo các chunk từ dữ liệu syllabus với metadata phong phú và liên kết thực thể."""
    chunks: List[Dict[str, Any]] = []
    md = extract_important_metadata(sd)

    # Entity IDs
    course_id = create_entity_id(EntityType.COURSE, subject_code)
    session_ids = {
        s.get("Session"): create_entity_id(EntityType.SESSION, subject_code, s.get("Session"))
        for s in sd.get("sessions", [])
    }
    clo_ids = {
        c.get("CLO Name"): create_entity_id(EntityType.CLO, subject_code, f"CLO{c.get('CLO Name')}")
        for c in sd.get("clos", [])
    }
    assess_ids = {
        a.get("Category"): create_entity_id(EntityType.ASSESSMENT, subject_code, str(i+1))
        for i,a in enumerate(sd.get("assessments", []))
    }
    mat_ids = {
        m.get("MaterialDescription"): create_entity_id(EntityType.MATERIAL, subject_code, str(i+1))
        for i,m in enumerate(sd.get("materials_table", []))
    }

    base_meta = {
        "subject_code": subject_code,
        "syllabus_id": sd.get("syllabus_id",""),
        "entity_id": course_id,
        "syllabus_name": md["syllabus_name"],
        "syllabus_english": md["syllabus_english"],
        "credits": md["credits"],
        "total_clos": md["total_clos"],
        "total_sessions": md["total_sessions"],
        "total_materials": md["total_materials"],
    }
    if md["subject_areas"]:
        base_meta["subject_areas"] = md["subject_areas"]

    # --- Chunk: Overview (rút gọn) ---
    ov = [
        f"Môn học {subject_code} - {md['syllabus_name']}.",
        f"Tên tiếng Anh: {md['syllabus_english']}.",
        f"Số tín chỉ: {md['credits']}.",
    ]
    if md["pre_requisite"]:
        ov.append(f"Tiên quyết: {md['pre_requisite']}.")
    ov.append(f"Mô tả ngắn: {md['description_short']}")
    if md.get("assessment_summary"):
        ov.append(f"Đánh giá: {md['assessment_summary']}.")
    ov.append(f"Thang điểm: {md['scoring_scale']}.")
    ov.append(f"Điểm tối thiểu: {md['min_avg_mark_to_pass']}.")

    chunks.append({
        "type": "overview",
        "content": clean_text(" ".join(ov)),
        "metadata": {
            **base_meta,
            "chunk_id": str(uuid.uuid4()),
            "source_section": "overview",
            "title": f"Tổng quan - {subject_code}",
            "entity_type": EntityType.COURSE,
            "related_entities": {
                "sessions": list(session_ids.values()),
                "clos": list(clo_ids.values()),
                "assessments": list(assess_ids.values()),
                "materials": list(mat_ids.values())
            }
        }
    })

    # --- Chunk: General Info (chi tiết) ---
    gd = sd.get("general_details", {})
    gi_texts = [
        f"Tên môn học: {gd.get('Syllabus Name')} ({gd.get('Syllabus English')}).",
        f"Mã môn: {gd.get('Subject Code')}. Số tín chỉ: {gd.get('NoCredit')}.",
        f"Bậc đào tạo: {gd.get('Degree Level')}.",
        f"Phân bổ thời gian: {gd.get('Time Allocation')}.",
        f"Mô tả: {md['description_full']}"
    ]
    if md["pre_requisite"]:
        gi_texts.append(f"Tiên quyết: {md['pre_requisite']}.")

    chunks.append({
        "type": "general_info",
        "content": clean_text(" ".join(gi_texts)),
        "metadata": {
            **base_meta,
            "chunk_id": str(uuid.uuid4()),
            "source_section": "general_details",
            "title": f"Thông tin chung - {subject_code}",
            "entity_type": EntityType.COURSE,
            "related_entities": {
                "sessions": list(session_ids.values()),
                "clos": list(clo_ids.values()),
                "assessments": list(assess_ids.values()),
                "materials": list(mat_ids.values())
            }
        }
    })

    # --- Chunk: Student Tasks & Tools (tập trung) ---
    st = []
    if md["student_tasks"]:
        st.append(f"Nhiệm vụ sinh viên: {md['student_tasks']}")
    if md["tools"]:
        st.append(f"Công cụ: {md['tools']}")
    chunks.append({
        "type": "student_tasks",
        "content": clean_text(" ".join(st)),
        "metadata": {
            **base_meta,
            "chunk_id": str(uuid.uuid4()),
            "source_section": "student_tasks",
            "title": f"Nhiệm vụ & Công cụ - {subject_code}",
            "entity_type": EntityType.COURSE
        }
    })

    # --- Chunk: Structure ---
    st_texts = [
        f"Cấu trúc môn {subject_code} - {md['syllabus_name']}.",
        f"{md['credits']} tín chỉ.",
        f"{md['total_clos']} CLO.",
        f"{md['total_sessions']} buổi học.",
        f"{md['total_materials']} tài liệu."
    ]
    if md.get("assessment_summary"):
        st_texts.append(f"Đánh giá: {md['assessment_summary']}.")
    st_texts.append(f"Điểm tối thiểu: {md['min_avg_mark_to_pass']}.")

    chunks.append({
        "type": "structure",
        "content": clean_text(" ".join(st_texts)),
        "metadata": {
            **base_meta,
            "chunk_id": str(uuid.uuid4()),
            "source_section": "structure",
            "title": f"Cấu trúc môn học - {subject_code}",
            "entity_type": EntityType.COURSE,
            "related_entities": {
                "sessions": list(session_ids.values()),
                "clos": list(clo_ids.values()),
                "assessments": list(assess_ids.values()),
                "materials": list(mat_ids.values())
            }
        }
    })

    # --- Chunk: CLO Overview ---
    clo_over = [f"Tổng quan CLO môn {subject_code} ({md['credits']} tín chỉ):"]
    for c in sd.get("clos", []):
        name, det, lo = c.get("CLO Name"), c.get("CLO Details"), c.get("LO Details")
        label = f"CLO{name} ({det})" if det and det != name else f"CLO{name}"
        clo_over.append(f"{label}: {lo}")
    chunks.append({
        "type": "clos_overview",
        "content": clean_text(" ".join(clo_over)),
        "metadata": {
            **base_meta,
            "chunk_id": str(uuid.uuid4()),
            "source_section": "clos_overview",
            "title": f"Tổng quan CLO - {subject_code}",
            "entity_type": EntityType.COURSE,
            "related_entities": {"clos": list(clo_ids.values())}
        }
    })

    # --- Chunk: Individual CLOs ---
    for c in sd.get("clos", []):
        name, det, lo = c.get("CLO Name"), c.get("CLO Details"), c.get("LO Details")
        texts = [
            f"Môn {subject_code} ({md['credits']} tín chỉ).",
            f"CLO{name}{f' ({det})' if det and det!=name else ''}: {lo}."
        ]
        # assessments
        related_a = []
        for a in sd.get("assessments", []):
            if name in a.get("CLO","") or det in a.get("CLO",""):
                related_a.append(f"{a['Category']} ({a['Weight']})")
        if related_a:
            texts.append(f"Đánh giá qua: {', '.join(related_a)}.")
        # sessions
        related_s = []
        for s in sd.get("sessions", []):
            if name in s.get("LO","") or det in s.get("LO",""):
                related_s.append(s.get("Session"))
        if related_s:
            texts.append(f"Dạy tại buổi: {', '.join(related_s)}.")
        chunks.append({
            "type": "clo",
            "content": clean_text(" ".join(texts)),
            "metadata": {
                **base_meta,
                "chunk_id": str(uuid.uuid4()),
                "source_section": "clos",
                "title": f"CLO{name} - {subject_code}",
                "entity_type": EntityType.CLO,
                "entity_id": clo_ids.get(name),
                "related_entities": {
                    "course": course_id,
                    "assessments": [assess_ids[a['Category']] for a in sd.get("assessments", []) if name in a.get("CLO","")],
                    "sessions": [session_ids[s['Session']] for s in sd.get("sessions", []) if name in s.get("LO","")]
                }
            }
        })

    # --- Chunk: Sessions Overview ---
    topic_groups: Dict[str, List[str]] = {}
    for s in sd.get("sessions", []):
        topic_groups.setdefault(s.get("Topic",""), []).append(s.get("Session"))
    so = [f"Tổng quan buổi học {subject_code} ({md['credits']} tín chỉ):"]
    for topic, nums in topic_groups.items():
        so.append(f"Buổi {', '.join(nums)}: {topic}")
    chunks.append({
        "type": "sessions_overview",
        "content": clean_text(" ".join(so)),
        "metadata": {
            **base_meta,
            "chunk_id": str(uuid.uuid4()),
            "source_section": "sessions_overview",
            "title": f"Tổng quan buổi học - {subject_code}",
            "entity_type": EntityType.COURSE,
            "related_entities": {"sessions": list(session_ids.values())}
        }
    })

    # --- Chunk: Individual Sessions ---
    for s in sd.get("sessions", []):
        texts = [
            f"Môn {subject_code} ({md['credits']} tín chỉ).",
            f"Buổi {s.get('Session')}: {s.get('Topic')}.",
            f"Loại: {s.get('Learning-Teaching Type')}.",
            f"LO: {s.get('LO')}.",
        ]
        if s.get("Student Materials"):
            texts.append(f"Tài liệu: {s.get('Student Materials')}.")
        dl = s.get("S-Download", {})
        if isinstance(dl, dict):
            texts.append(f"Link tải: {dl.get('text')} - {dl.get('link')}.")
        if s.get("Student's Tasks"):
            texts.append(f"Nhiệm vụ: {s.get('Student\'s Tasks')}.")
        if s.get("URLs"):
            texts.append(f"URLs: {s.get('URLs')}.")
        # related CLOs
        related_clo = [clo_ids[name] for name in clo_ids if name in s.get("LO","")]
        chunks.append({
            "type": "session",
            "content": clean_text(" ".join(texts)),
            "metadata": {
                **base_meta,
                "chunk_id": str(uuid.uuid4()),
                "source_section": "sessions",
                "session_number": s.get("Session"),
                "title": f"Buổi {s.get('Session')} - {subject_code}",
                "entity_type": EntityType.SESSION,
                "entity_id": session_ids.get(s.get("Session")),
                "related_entities": {
                    "course": course_id,
                    "clos": related_clo
                },
                **({"download_link": dl.get("link")} if isinstance(dl, dict) else {})
            }
        })

    # --- Chunk: Assessments Overview ---
    ao = [f"Tổng quan đánh giá {subject_code} ({md['credits']} tín chỉ):"]
    for a in sd.get("assessments", []):
        ao.append(f"{a.get('Category')}: {a.get('Weight')}")
    chunks.append({
        "type": "assessments_overview",
        "content": clean_text(" ".join(ao)),
        "metadata": {
            **base_meta,
            "chunk_id": str(uuid.uuid4()),
            "source_section": "assessments_overview",
            "title": f"Tổng quan đánh giá - {subject_code}",
            "entity_type": EntityType.COURSE,
            "related_entities": {"assessments": list(assess_ids.values())}
        }
    })

    # --- Chunk: Individual Assessments ---
    for i,a in enumerate(sd.get("assessments", [])):
        texts = [
            f"Môn {subject_code} ({md['credits']} tín chỉ).",
            f"Đánh giá: {a.get('Category')}. Trọng số: {a.get('Weight')}.",
            f"CLO: {a.get('CLO')}.",
        ]
        if a.get("Part"):
            texts.append(f"Phần: {a.get('Part')}.")
        if a.get("Duration"):
            texts.append(f"Thời lượng: {a.get('Duration')}.")
        if a.get("Note"):
            texts.append(f"Ghi chú: {a.get('Note')}.")
        related = [clo_ids[n] for n in clo_ids if n in a.get("CLO","")]
        chunks.append({
            "type": "assessment",
            "content": clean_text(" ".join(texts)),
            "metadata": {
                **base_meta,
                "chunk_id": str(uuid.uuid4()),
                "source_section": "assessments",
                "assessment_category": a.get("Category"),
                "assessment_weight": a.get("Weight"),
                "title": f"Đánh giá {a.get('Category')} - {subject_code}",
                "entity_type": EntityType.ASSESSMENT,
                "entity_id": assess_ids.get(a.get("Category")),
                "related_entities": {
                    "course": course_id,
                    "clos": related
                }
            }
        })

    # --- Chunk: Materials Overview ---
    mo = [f"Tổng quan tài liệu {subject_code} ({md['credits']} tín chỉ):"]
    for m in sd.get("materials_table", []):
        desc, auth, pub = m.get("MaterialDescription"), m.get("Author",""), m.get("Publisher","")
        txt = desc
        if auth: txt += f", tác giả: {auth}"
        if pub:  txt += f", NXB: {pub}"
        mo.append(txt)
    chunks.append({
        "type": "materials_overview",
        "content": clean_text(" ".join(mo)),
        "metadata": {
            **base_meta,
            "chunk_id": str(uuid.uuid4()),
            "source_section": "materials_overview",
            "title": f"Tổng quan tài liệu - {subject_code}",
            "entity_type": EntityType.COURSE,
            "related_entities": {"materials": list(mat_ids.values())}
        }
    })

    # --- Chunk: Individual Materials ---
    for m in sd.get("materials_table", []):
        texts = [
            f"Môn {subject_code} ({md['credits']} tín chỉ).",
            f"Tài liệu: {m.get('MaterialDescription')}.",
        ]
        for field in ["Author","Publisher","PublishedDate","Edition","ISBN","Note"]:
            if m.get(field):
                texts.append(f"{field}: {m.get(field)}.")
        cid = mat_ids.get(m.get("MaterialDescription"))
        chunks.append({
            "type": "material",
            "content": clean_text(" ".join(texts)),
            "metadata": {
                **base_meta,
                "chunk_id": str(uuid.uuid4()),
                "source_section": "materials",
                "title": f"Tài liệu {m.get('MaterialDescription')} - {subject_code}",
                "entity_type": EntityType.MATERIAL,
                "entity_id": cid,
                "related_entities": {"course": course_id},
                **({"is_coursera": True} if "coursera" in m.get("MaterialDescription","").lower() or "coursera" in m.get("Note","").lower() else {})
            }
        })

    return chunks

In [63]:
def process_syllabus_data(syllabus_data_file):
    """Process syllabus data and create enhanced chunks."""
    # Giải mã JSON string thành dict nếu cần
    if isinstance(syllabus_data_file, str):
        syllabus_data = json.loads(syllabus_data_file)
    else:
        syllabus_data = syllabus_data_file

    all_chunks = []

    for subject_code, subject_data in syllabus_data.items():
        print(f"Processing subject: {subject_code}")
        subject_data["syllabus_id"] = subject_data.get("syllabus_id", "")
        chunks = create_enhanced_chunks_from_syllabus(subject_code, subject_data)
        all_chunks.extend(chunks)
        print(f"Created {len(chunks)} chunks for {subject_code}")

    print(all_chunks)


In [64]:
process_syllabus_data(syllabus_data)

Processing subject: RES222
Created 22 chunks for RES222
[{'type': 'overview', 'content': 'Môn học RES222 - Wines, Beers, Spirits 2_Nghiệp vụ pha chế đồ uống 2. Tên tiếng Anh: Wines, Beers, Spirits 2. Số tín chỉ: 3. Tiên quyết: RES213. Mô tả ngắn: -Objective: To provides the basic knowledge about the berverage industry as well as the knowledge of the world beverage culture in operating the Resta... Đánh giá: Class participation and preparation: 10.0%, Group assignment: 25.0%, Group practice: 15.0%, Individual practice: 20.0%, Quizzes: 10.0%, Final exam: 20.0%. Thang điểm: 10. Điểm tối thiểu: 5.', 'metadata': {'subject_code': 'RES222', 'syllabus_id': '10372', 'entity_id': 'course_RES222', 'syllabus_name': 'Wines, Beers, Spirits 2_Nghiệp vụ pha chế đồ uống 2', 'syllabus_english': 'Wines, Beers, Spirits 2', 'credits': '3', 'total_clos': 3, 'total_sessions': 4, 'total_materials': 1, 'subject_areas': ['security', 'ai', 'bartending', 'design'], 'chunk_id': '202f53f4-d624-445e-af17-074ec292ef3