### Load the document

In [130]:
from langchain_community.document_loaders import PyPDFLoader

# Point the loader at the labor-code PDF (path is relative to the working directory).
loader = PyPDFLoader("data/labor_codex.pdf")

# Read the PDF. The result is indexed and read through `.page_content` below
# (presumably one entry per PDF page — confirm against the PyPDFLoader docs).
docs = loader.load()

### Split into chunks

In [45]:
# Merge the text of all loaded entries into a single string.
# A newline is placed between entries so that a heading which opens one entry
# is not glued onto the last line of the previous one; the chapter/article
# regexes used later rely on headings and body lines starting after "\n".
# NOTE(review): assumes each entry of `docs` is one PDF page whose text does
# not already end with a newline — confirm against the loader output.
text = "\n".join(page.page_content for page in docs)

#### Custom Splitter

In [212]:
from langchain.schema import Document
from langchain.text_splitter import TextSplitter, RecursiveCharacterTextSplitter
import re
from typing import List

class RegexTextSplitter(TextSplitter):
    """
    Splits a structured legal text into token-bounded ``Document`` chunks.

    The text is processed in three passes:
    1) it is cut into chapters using ``chapter_pattern``;
    2) every chapter is cut into articles (modda) using ``section_pattern``;
    3) every article body is chunked by token count.

    Each resulting chunk carries the chapter title, article number and article
    name in its ``metadata``.
    """

    def __init__(self, chapter_pattern: str, section_pattern: str, chunk_size: int, chunk_overlap: int):
        """
        :param chapter_pattern: Regex matching a chapter heading (e.g., r"[IVXLCDM]+ BO‘LIM\\.")
        :param section_pattern: Regex matching an article (modda) number (e.g., r"\\d+-modda")
        :param chunk_size: Max tokens per chunk
        :param chunk_overlap: Overlap tokens between consecutive chunks

        Both patterns may be written with or without their own capturing
        groups: matches are read through named groups, not by position.
        """
        # Initialise the base splitter so that inherited helpers work with the
        # same size/overlap limits as the token splitter created below.
        super().__init__(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

        self.chapter_pattern = chapter_pattern
        self.section_pattern = section_pattern
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # Splitting text into chunks using GPT-4 tokenizer (can be replaced with another model)
        self.recursive_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            model_name="gpt-4",
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )

    def split_by_chapter(self, text: str) -> List[dict]:
        """
        Splits the entire text by chapters according to chapter_pattern.

        :param text: Full text of the document
        :return: A list of dicts like:
            [
            {"chapter_title": "II BO‘LIM. MEHNAT SOHASIDAGI IJTIMOIY SHERIKLIK", "content": "Text of that chapter"},
            ...
            ]
            Text before the first chapter heading is not returned.
        """
        # Wrapped in a non-capturing group so that an alternation inside the
        # user pattern cannot leak into the surrounding expression.
        heading = f"(?:{self.chapter_pattern})"
        pattern = (
            rf"(?P<heading>{heading})\s*"             # chapter number, e.g. "II BO‘LIM."
            rf"(?P<name>[\s\S]*?)(?=\n[0-9])"         # chapter name, lazily up to a newline followed by a digit
            rf"\n(?P<body>[\s\S]*?)(?={heading}|$)"   # chapter content until the next chapter heading or EOF
        )

        # Named groups keep the extraction correct no matter how many capturing
        # groups the user-supplied pattern contains.
        return [
            {
                "chapter_title": f"{match['heading'].strip()} {match['name'].strip()}",
                "content": match["body"].strip(),
            }
            for match in re.finditer(pattern, text, flags=re.DOTALL)
        ]

    def split_by_section(self, chapter: dict) -> List[dict]:
        """
        Splits one chapter into its articles (modda).

        We assume each article can look like:

            1-modda. Article Title
            The rest of the article content...

        where the title may wrap over several lines. The title is taken to end
        at the first newline that is followed by an uppercase Latin or Cyrillic
        letter; the body runs from there to the next article heading at the
        start of a line, or to the end of the chapter.

        :param chapter: Dict with "chapter_title" and "content", as produced by split_by_chapter
        :return: A list of dicts with keys "chapter_title", "section_title"
            (e.g. "1-modda"), "section_name" (the article title with newlines
            removed) and "content" (the article body)
        """
        # Same wrapping as for chapters: protects against alternations and
        # makes the pattern independent of the user's own capturing groups.
        number = f"(?:{self.section_pattern})"
        pattern = (
            rf"(?P<number>{number})\.\s*"             # modda number, e.g. "578-modda"
            rf"(?P<title>[\s\S]*?)(?=\n[A-ZА-Я])"     # title, lazily up to newline + uppercase letter
            rf"(?P<body>.*?)(?=\n{number}|$)"         # body until the next modda or end of chapter
        )

        return [
            {
                "chapter_title": chapter["chapter_title"],
                "section_title": match["number"].strip(),
                # Titles may wrap over several lines; join them into one line.
                "section_name":  match["title"].strip().replace("\n", ""),
                "content":       match["body"].strip(),
            }
            for match in re.finditer(pattern, chapter["content"], flags=re.DOTALL)
        ]

    def split_with_chunks(self, section: dict) -> List[Document]:
        """
        Breaks the content of a single article (section) into token-based chunks
        using RecursiveCharacterTextSplitter.

        :param section: Dict as produced by split_by_section
        :return: One Document per chunk, each tagged with the chapter title,
            article number and article name
        """
        return [
            Document(
                page_content=chunk,
                metadata={
                    "chapter":       section["chapter_title"],
                    "section":       section["section_title"],
                    "section_name":  section["section_name"],
                },
            )
            for chunk in self.recursive_splitter.split_text(section["content"])
        ]

    def split_text(self, text: str) -> List[Document]:
        """
        1) Split by chapters -> 2) for each chapter, find all modda -> 3) chunk each modda

        :param text: Full text of the document
        :return: All chunks of all articles, in document order
        """
        all_documents = []
        for chapter in self.split_by_chapter(text):
            for section in self.split_by_section(chapter):
                all_documents.extend(self.split_with_chunks(section))
        return all_documents

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """
        Runs the splitting process for each document in the list.

        Metadata of the source document is copied onto every chunk produced
        from it; keys already set by the splitter ("chapter", "section",
        "section_name") are not overwritten.

        :param documents: Documents whose page_content should be split
        :return: All chunks of all input documents, in input order
        """
        split_documents = []
        for doc in documents:
            for chunk in self.split_text(doc.page_content):
                # Carry over the source metadata without clobbering the
                # chapter/section keys set in split_with_chunks.
                for key, value in doc.metadata.items():
                    chunk.metadata.setdefault(key, value)
                split_documents.append(chunk)
        return split_documents

In [169]:
from langchain_community.document_loaders import PyPDFLoader

# Load the PDF again: `docs` is rebound to the split chunks further down, so
# re-running the split from this point needs the raw loader output back.
loader = PyPDFLoader("data/labor_codex.pdf")

# Read the PDF. The result is indexed and read through `.page_content` below
# (presumably one entry per PDF page — confirm against the PyPDFLoader docs).
docs = loader.load()

In [170]:
# Merge the text of all loaded entries into a single string.
# A newline is placed between entries so that a heading which opens one entry
# is not glued onto the last line of the previous one; the chapter/article
# regexes used later rely on headings and body lines starting after "\n".
# NOTE(review): assumes each entry of `docs` is one PDF page whose text does
# not already end with a newline — confirm against the loader output.
text = "\n".join(page.page_content for page in docs)

In [213]:
# Heading patterns handed to RegexTextSplitter.
chapter_pattern = r"(?:[IVXLCDM]+ BO‘LIM\.)"   # Roman numeral + "BO‘LIM.", e.g. "II BO‘LIM."
section_pattern = r"(\d+-modda)"  # article number, e.g. "1-modda"

# chunk_size / chunk_overlap are token counts (see the splitter's docstring).
splitter = RegexTextSplitter(
    chapter_pattern=chapter_pattern,
    section_pattern=section_pattern,
    chunk_size=800,
    chunk_overlap=400
)

# Wrap the merged text in a single Document and split it.
# Note: `docs` is rebound here — from now on it holds the chunks, not the
# loader output.
doc = Document(page_content=text)
docs = splitter.split_documents([doc])

In [214]:
docs[2:6]

[Document(metadata={'chapter': 'I BO‘LIM. UMUMIY QOIDALAR', 'section': '3-modda', 'section_name': 'Yakka tartibdagi mehnatga oid munosabatlarni va ular bilan bevosita bog‘liq bo‘lgan ijtimoiy munosabatlarni huquqiy jihatdan tartibga solishning asosiy prinsiplari'}, page_content='Yakka tartibdagi mehnatga oid munosabatlarni va ular bilan bevosita bog‘liq bo‘lgan \nijtimoiy munosabatlarni huquqiy jihatdan tartibga solishning asosiy prinsiplari quyidagilardan iborat: \nmehnat huquqlarining tengligi, mehnat va mashg‘ulotlar sohasida kamsitishni taqiqlash; \nmehnat erkinligi va majburiy mehnatni taqiqlash; \nmehnat sohasidagi ijtimoiy sheriklik; \nmehnat huquqlari ta’minlanishining va mehnat majburiyatlari bajarilishining \nkafolatlanganligi; \nxodimning huquqiy holati yomonlashishiga yo‘l qo‘yilmasligi.'),
 Document(metadata={'chapter': 'I BO‘LIM. UMUMIY QOIDALAR', 'section': '4-modda', 'section_name': 'Mehnat huquqlarining tengligi, mehnat va mashg‘ulotlar sohasida kamsitishni taqiqlash p

In [197]:
docs[-2:]

[Document(metadata={'chapter': 'VII BO‘LIM. XODIMLARNING MEHNAT HUQUQLARINI HIMOYA QILISH. MEHNAT \nNIZOLARINI KO‘RIB CHIQISH 31-bob. Umumiy qoidalar', 'section': '580-modda', 'section_name': 'Jamoaviy mehnat nizosini hal etish ( tartibga solish) jarayonida mehnat nizosining taraflari erishgan kelishuvlar va ularning bajarilishi ustidan nazorat'}, page_content='Nizoni hal etish (tartibga solish) jarayonida jamoaviy mehnat nizosi taraflari erishgan \nkelishuvlar yozma shaklda rasmiylashtiriladi va jam oaviy mehnat nizosi taraflari uchun majburiy \nkuchga ega. Ularning bajarilishi ustidan nazorat jamoaviy mehnat nizosining taraflari, shuningdek \nmehnat nizolarini tartibga solish bo‘yicha davlat organlari tomonidan amalga oshiriladi.'),
 Document(metadata={'chapter': 'VII BO‘LIM. XODIMLARNING MEHNAT HUQUQLARINI HIMOYA QILISH. MEHNAT \nNIZOLARINI KO‘RIB CHIQISH 31-bob. Umumiy qoidalar', 'section': '581-modda', 'section_name': 'Da’vo xususiyatiga ega jamoaviy mehnat nizolarini sudda ko‘rib

In [217]:
# replace 31-bob. Umumiy qoidalar with "" from chapter in metadata
# The chapter title of the last section picked up the first sub-heading
# ("31-bob. Umumiy qoidalar"); remove it and trim the whitespace the removal
# leaves behind so the metadata value has no trailing space.
for doc in docs:
    doc.metadata["chapter"] = doc.metadata["chapter"].replace("31-bob. Umumiy qoidalar", "").strip()

In [218]:
docs[-2:]

[Document(metadata={'chapter': 'VII BO‘LIM. XODIMLARNING MEHNAT HUQUQLARINI HIMOYA QILISH. MEHNAT \nNIZOLARINI KO‘RIB CHIQISH ', 'section': '580-modda', 'section_name': 'Jamoaviy mehnat nizosini hal etish ( tartibga solish) jarayonida mehnat nizosining taraflari erishgan kelishuvlar va ularning bajarilishi ustidan nazorat'}, page_content='Nizoni hal etish (tartibga solish) jarayonida jamoaviy mehnat nizosi taraflari erishgan \nkelishuvlar yozma shaklda rasmiylashtiriladi va jam oaviy mehnat nizosi taraflari uchun majburiy \nkuchga ega. Ularning bajarilishi ustidan nazorat jamoaviy mehnat nizosining taraflari, shuningdek \nmehnat nizolarini tartibga solish bo‘yicha davlat organlari tomonidan amalga oshiriladi.'),
 Document(metadata={'chapter': 'VII BO‘LIM. XODIMLARNING MEHNAT HUQUQLARINI HIMOYA QILISH. MEHNAT \nNIZOLARINI KO‘RIB CHIQISH ', 'section': '581-modda', 'section_name': 'Da’vo xususiyatiga ega jamoaviy mehnat nizolarini sudda ko‘rib chiqish'}, page_content='Qonunchilikni, jamoa

### Try to generate

In [189]:
import os
import json
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser

In [190]:
# read .env file
from dotenv import load_dotenv

load_dotenv()

True

In [226]:
# Prompt for generating Uzbek question-answer pairs from one chunk.
# Single-brace fields ({chunk}, {chapter}, {section}, {section_name}) are
# filled from the dict passed to `label_chain.invoke`; doubled braces are the
# template escape for literal "{" / "}".
# NOTE(review): the "Answer Example" uses doubled braces around chapter/section/
# section_name, so the model sees the literal placeholder names there, while
# the paraphrasing example above it uses the substituted values — confirm
# which of the two is intended.
# The JSON code fence in "Output Format" is closed explicitly so the example
# is well-formed Markdown.
label_template = """
You are an AI assistant tasked with generating 10 highly focused question-answer pairs **in Uzbek** based **strictly** on the provided document chunk.

Context:
- Document: "{chunk}"
- Chapter Name: "{chapter}"
- Section Name: "{section}. {section_name}"

Instructions:
1. Carefully analyze the given chunk of the document and extract key facts, topics, or concepts.
2. Generate **10 questions** that are maximally semantically close to the specific content of this chunk.
    - All questions must be **strictly tied** to the content of the chunk, chapter, and section.
    - Reuse or closely paraphrase phrases from the chunk wherever possible to maintain high relevance.
    - Incorporate synonyms or light rephrasings to capture different ways a user might query this content (e.g., "mehnat huquqlari" ↔ "ish huquqlari").
    - Avoid any extrapolation, assumptions, or references to external knowledge.
3. Write the **answers** based only on the given chunk. **Do not invent or assume additional information.**
4. Explicitly reference the **chapter name** and **section name** in both the questions and answers for clarity and traceability.
5. Use natural, conversational Uzbek language when generating questions. Occasionally include mild typos or colloquialisms to mimic real user queries, but ensure they remain understandable.
6. Include a reference link in each answer pointing to the document, chapter, and section for traceability.
7. All questions must revolve around a **common theme** in the chunk (e.g., “mehnat huquqlari,” “ish beruvchilarning huquqlari,” etc.).

**Example of how to use phrases from the chunk (paraphrasing)**:
- If the chunk states: "Mehnat qilish, erkin ish tanlash va ishsizlikdan himoyalanish huquqi davlat kafolatlari bilan belgilanadi."
  - A suitable question might be: "Mehnat qilish va erkin ish tanlash huquqlari haqida {chapter} va {section} nima deyilgan?"
  - The answer must be grounded in the exact text: "Bu huquqlar davlat kafolatlari orqali belgilanadi. Havola: O‘ZBEKISTON RESPUBLIKASINING MEHNAT KODEKSI. {chapter}. {section}."

**Answer Example**:
If the document says: "Mehnat huquqlari himoyasi mehnat kodeksining asosiy maqsadi hisoblanadi."
- Question: "Mehnat kodeksining asosiy maqsadi nima?"
- Answer: "Mehnat huquqlarining himoyasi. Havola: O‘ZBEKISTON RESPUBLIKASINING MEHNAT KODEKSI. {{chapter}}, {{section}}. {{section_name}}."

Output Format:
Return a valid JSON object with the following structure:
```json
{{
  "question_1": "Generated question text",
  "answer_1": "Generated answer text",
  "question_2": "Generated question text",
  ...
}}
```
"""

label_prompt = ChatPromptTemplate.from_template(label_template)
# Low temperature keeps the generated pairs close to the chunk text.
llm = ChatOpenAI(temperature=0.2, model="gpt-4o-mini")

# prompt -> model -> JSON parsing of the model reply
label_chain = label_prompt | llm | JsonOutputParser()

In [220]:
print(docs[1].page_content)

Ushbu Kodeksning asosiy vazifalari quyidagilardan iborat: 
xodimlar mehnat huquqlari va erkinliklarining, shu jumladan mehnat qilishga, erkin ish 
tanlashga, adolatli va xavfsiz mehnat sharoitlariga hamda ishsizlikdan himoyalanishga bo‘lgan 
huquqining davlat kafolatlarini belgilash; 
ish beruvchilarning ka drlarni tanlash, joy -joyiga qo‘yish va samarali mehnat jarayonini 
tashkil etish sohasidagi huquqlari amalga oshirilishini ta’minlash; 
mehnat sohasida ijtimoiy sheriklikni rag‘batlantirish va rivojlantirish; 
xodimlar va ish beruvchilarning huquqlari hamda q onuniy manfaatlari himoya qilinishini 
ta’minlash; 
mehnat bozorining samarali faoliyat ko‘rsatishiga ko‘maklashish.


In [196]:
# Index of the chunk to label (1 = the second chunk in `docs`).
doc_index = 1

# Generate questions for the second document chunk: the chunk text and its
# chapter/article metadata fill the matching fields of the prompt template.
output = label_chain.invoke({
    "chunk": docs[doc_index].page_content,
    "chapter": docs[doc_index].metadata["chapter"],
    "section": docs[doc_index].metadata["section"],
    "section_name": docs[doc_index].metadata["section_name"]
})

In [200]:
output

{'question_1': 'Ushbu Kodeksning asosiy vazifalari nimalardan iborat?',
 'answer_1': 'Ushbu Kodeksning asosiy vazifalari xodimlar mehnat huquqlari va erkinliklarini, ish beruvchilarning huquqlarini ta’minlash, mehnat sohasida ijtimoiy sheriklikni rivojlantirish va mehnat bozorining samarali faoliyat ko‘rsatishini ko‘maklashishdan iborat. Havola: O‘ZBEKISTON RESPUBLIKASINING MEHNAT KODEKSI. I BO‘LIM. UMUMIY QOIDALAR. 2-modda.',
 'question_2': 'Xodimlar mehnat huquqlari qanday himoyalanadi?',
 'answer_2': 'Xodimlar mehnat huquqlari davlat kafolatlari orqali himoyalanadi. Havola: O‘ZBEKISTON RESPUBLIKASINING MEHNAT KODEKSI. I BO‘LIM. UMUMIY QOIDALAR. 2-modda.',
 'question_3': 'Ish beruvchilarning huquqlari qanday ta’minlanadi?',
 'answer_3': 'Ish beruvchilarning huquqlari amalga oshirilishini ta’minlash Kodeksning asosiy vazifalaridan biridir. Havola: O‘ZBEKISTON RESPUBLIKASINING MEHNAT KODEKSI. I BO‘LIM. UMUMIY QOIDALAR. 2-modda.',
 'question_4': 'Mehnat sharoitlari qanday bo‘lishi kerak

#### Another example

In [221]:
len(docs)

693

In [229]:
doc_index = 42
docs[doc_index]

Document(metadata={'chapter': 'II BO‘LIM. MEHNAT SOHASIDAGI IJTIMOIY SHERIKLIK', 'section': '35-modda', 'section_name': 'Mehnat sohasidagi ijtimoiy sheriklikning darajalari'}, page_content='Mehnat sohasidagi ijtimoiy sheriklik: \nboshlang‘ich darajada (tashkilotda yoki jismoniy shaxs bo‘lgan ish beruvchida); \nhududiy darajada;  \ntarmoq darajasida;  \nrespublika darajasida amalga oshiriladi.')

In [228]:
# Generate questions for the chunk at `doc_index`: the chunk text and its
# chapter/article metadata fill the matching fields of the prompt template.
output = label_chain.invoke({
    "chunk": docs[doc_index].page_content,
    "chapter": docs[doc_index].metadata["chapter"],
    "section": docs[doc_index].metadata["section"],
    "section_name": docs[doc_index].metadata["section_name"]
})

In [230]:
output

{'question_1': 'Mehnat sohasidagi ijtimoiy sheriklikning darajalari qanday belgilangan?',
 'answer_1': 'Mehnat sohasidagi ijtimoiy sheriklik boshlang‘ich darajada, hududiy darajada, tarmoq darajasida va respublika darajasida amalga oshiriladi. Havola: O‘ZBEKISTON RESPUBLIKASINING MEHNAT KODEKSI. II BO‘LIM. MEHNAT SOHASIDAGI IJTIMOIY SHERIKLIK. 35-modda.',
 'question_2': 'II BO‘LIM. MEHNAT SOHASIDAGI IJTIMOIY SHERIKLIKda mehnat sohasidagi ijtimoiy sheriklikning darajalari qanday ko‘rsatilgan?',
 'answer_2': 'Mehnat sohasidagi ijtimoiy sheriklik boshlang‘ich, hududiy, tarmoq va respublika darajasida amalga oshiriladi. Havola: O‘ZBEKISTON RESPUBLIKASINING MEHNAT KODEKSI. II BO‘LIM. MEHNAT SOHASIDAGI IJTIMOIY SHERIKLIK. 35-modda.',
 'question_3': 'Mehnat sohasidagi ijtimoiy sheriklikning boshlang‘ich darajasi nimani anglatadi?',
 'answer_3': 'Boshlang‘ich darajada mehnat sohasidagi ijtimoiy sheriklik tashkilotda yoki jismoniy shaxs bo‘lgan ish beruvchida amalga oshiriladi. Havola: O‘ZBEKIS