### Load the document

In [1]:
from langchain_community.document_loaders import PyPDFLoader

# Point the loader at the source PDF (path is relative to the working directory).
loader = PyPDFLoader("data/labor_codex.pdf")

# Load the PDF; later cells read `page_content` from each element of `docs`.
docs = loader.load()

### Split into chunks

In [2]:
# Concatenate the text of every loaded element, in order, into one string.
text = "".join(page.page_content for page in docs)

#### Custom Splitter

In [3]:
from langchain.schema import Document
from langchain.text_splitter import TextSplitter, RecursiveCharacterTextSplitter
import re
from typing import List

class RegexTextSplitter(TextSplitter):
    """
    Hierarchical splitter: chapters -> articles (modda) -> token-sized chunks.

    Chapter and article headings are located with caller-supplied regular
    expressions; each article body is then cut into overlapping chunks by a
    tiktoken-based RecursiveCharacterTextSplitter. Every chunk is returned as a
    Document whose metadata carries the chapter title, the article number and
    the article name.
    """

    def __init__(self, chapter_pattern: str, section_pattern: str, chunk_size: int, chunk_overlap: int):
        """
        :param chapter_pattern: Regex for a chapter heading (e.g., r"[IVXLCDM]+ BO‘LIM\\.").
            It may contain capturing groups or none; results are read through named groups.
        :param section_pattern: Regex for an article (modda) heading without the trailing dot
            (e.g., r"\\d+-modda"). It may contain capturing groups or none.
        :param chunk_size: Max tokens per chunk
        :param chunk_overlap: Overlap tokens between consecutive chunks
        """
        # Initialise the TextSplitter base class with the same sizes.
        super().__init__(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

        self.chapter_pattern = chapter_pattern
        self.section_pattern = section_pattern
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # Splitting text into chunks using GPT-4 tokenizer (can be replaced with another model)
        self.recursive_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            model_name="gpt-4",
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )

    def split_by_chapter(self, text: str) -> List[dict]:
        """
        Splits the entire text by chapters according to chapter_pattern.

        Returns a list of dicts like:
        [
        {"chapter_title": "II BO‘LIM. MEHNAT SOHASIDAGI IJTIMOIY SHERIKLIK", "content": "Text of that chapter"},
        ...
        ]
        """
        # Named groups keep the extraction independent of how many capturing
        # groups the caller's pattern contains; the (?:...) wrapper keeps any
        # top-level alternation in that pattern self-contained.
        pattern = (
            rf"(?P<chapter_number>(?:{self.chapter_pattern}))\s*"  # heading, e.g. "II BO‘LIM."
            rf"(?P<chapter_name>[\s\S]*?)(?=\n[0-9])"             # name, lazily up to a newline followed by a digit
            rf"\n(?P<chapter_body>[\s\S]*?)"                      # content ...
            rf"(?=(?:{self.chapter_pattern})|$)"                  # ... until the next chapter heading or EOF
        )

        chapters = []
        for match in re.finditer(pattern, text, flags=re.DOTALL):
            number = match.group("chapter_number").strip()
            name = match.group("chapter_name").strip()
            chapters.append({
                "chapter_title": f"{number} {name}",  # Combine number and title
                "content": match.group("chapter_body").strip(),
            })
        return chapters

    def split_by_section(self, chapter: dict) -> List[dict]:
        """
        Splits one chapter (a dict produced by split_by_chapter) into articles.

        We assume each article (modda) can look like:

            1-modda. Article Title
            The rest of the article content...

        The title may span several lines; it is taken to end at the first
        newline that is followed by an uppercase Latin or Cyrillic letter.

        Returns a list of dicts with keys:
        - "chapter_title": copied from the chapter
        - "section_title": the article number, e.g. "1-modda"
        - "section_name":  the article title with newlines removed
        - "content":       the rest of the article text (until the next modda or EOF)
        """
        text = chapter["content"]

        # Named groups: the original positional indexing only worked when
        # section_pattern contained exactly one capturing group.
        pattern = (
            rf"(?P<section_number>(?:{self.section_pattern}))\.\s*"  # article number, e.g. "578-modda"
            rf"(?P<section_name>[\s\S]*?)(?=\n[A-ZА-Я])"            # title, lazily up to newline + uppercase
            rf"(?P<section_body>.*?)"                               # body ...
            rf"(?=\n(?:{self.section_pattern})|$)"                  # ... until the next article or EOF
        )

        sections = []
        for match in re.finditer(pattern, text, flags=re.DOTALL):
            sections.append({
                "chapter_title": chapter["chapter_title"],
                "section_title": match.group("section_number").strip(),
                "section_name":  match.group("section_name").strip().replace("\n", ""),
                "content":       match.group("section_body").strip(),
            })
        return sections

    def split_with_chunks(self, section: dict) -> List[Document]:
        """
        Breaks the content of a single article (section) into token-based chunks
        using RecursiveCharacterTextSplitter, attaching the chapter/article
        identifiers to every chunk as metadata.
        """
        return [
            Document(
                page_content=chunk,
                metadata={
                    "chapter":       section["chapter_title"],
                    "section":       section["section_title"],
                    "section_name":  section["section_name"],
                },
            )
            for chunk in self.recursive_splitter.split_text(section["content"])
        ]

    def split_text(self, text: str) -> List[Document]:
        """
        1) Split by chapters -> 2) for each chapter, find all modda -> 3) chunk each modda

        Returns the chunks as Document objects, in document order.
        """
        all_documents = []
        for chapter in self.split_by_chapter(text):
            for section in self.split_by_section(chapter):
                all_documents.extend(self.split_with_chunks(section))
        return all_documents

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """
        Runs the splitting process on the page_content of each document in the
        list and returns all resulting chunks in one flat list.
        """
        split_documents = []
        for doc in documents:
            split_documents.extend(self.split_text(doc.page_content))
        return split_documents

In [6]:
# Rebuild the full text by joining the page_content of every element of `docs`.
text = "".join(page.page_content for page in docs)

In [7]:
chapter_pattern = r"(?:[IVXLCDM]+ BO‘LIM\.)"   # Roman numeral + " BO‘LIM.", e.g. "II BO‘LIM."
section_pattern = r"(\d+-modda)"  # "1-modda"

# chunk_size / chunk_overlap are token counts (see RegexTextSplitter.__init__).
splitter = RegexTextSplitter(
    chapter_pattern=chapter_pattern,
    section_pattern=section_pattern,
    chunk_size=800,
    chunk_overlap=400
)

# Wrap the concatenated text in a single Document and split it.
# NOTE: this rebinds `docs` (until now the loaded PDF pages) to the list of chunk Documents.
doc = Document(page_content=text)
docs = splitter.split_documents([doc])

In [8]:
# Inspect a few of the resulting chunks (metadata and text).
docs[2:6]

[Document(metadata={'chapter': 'I BO‘LIM. UMUMIY QOIDALAR', 'section': '3-modda', 'section_name': 'Yakka tartibdagi mehnatga oid munosabatlarni va ular bilan bevosita bog‘liq bo‘lgan ijtimoiy munosabatlarni huquqiy jihatdan tartibga solishning asosiy prinsiplari'}, page_content='Yakka tartibdagi mehnatga oid munosabatlarni va ular bilan bevosita bog‘liq bo‘lgan \nijtimoiy munosabatlarni huquqiy jihatdan tartibga solishning asosiy prinsiplari quyidagilardan iborat: \nmehnat huquqlarining tengligi, mehnat va mashg‘ulotlar sohasida kamsitishni taqiqlash; \nmehnat erkinligi va majburiy mehnatni taqiqlash; \nmehnat sohasidagi ijtimoiy sheriklik; \nmehnat huquqlari ta’minlanishining va mehnat majburiyatlari bajarilishining \nkafolatlanganligi; \nxodimning huquqiy holati yomonlashishiga yo‘l qo‘yilmasligi.'),
 Document(metadata={'chapter': 'I BO‘LIM. UMUMIY QOIDALAR', 'section': '4-modda', 'section_name': 'Mehnat huquqlarining tengligi, mehnat va mashg‘ulotlar sohasida kamsitishni taqiqlash p

In [9]:
# Inspect the last two chunks and their chapter metadata.
docs[-2:]

[Document(metadata={'chapter': 'VII BO‘LIM. XODIMLARNING MEHNAT HUQUQLARINI HIMOYA QILISH. MEHNAT \nNIZOLARINI KO‘RIB CHIQISH 31-bob. Umumiy qoidalar', 'section': '580-modda', 'section_name': 'Jamoaviy mehnat nizosini hal etish ( tartibga solish) jarayonida mehnat nizosining taraflari erishgan kelishuvlar va ularning bajarilishi ustidan nazorat'}, page_content='Nizoni hal etish (tartibga solish) jarayonida jamoaviy mehnat nizosi taraflari erishgan \nkelishuvlar yozma shaklda rasmiylashtiriladi va jam oaviy mehnat nizosi taraflari uchun majburiy \nkuchga ega. Ularning bajarilishi ustidan nazorat jamoaviy mehnat nizosining taraflari, shuningdek \nmehnat nizolarini tartibga solish bo‘yicha davlat organlari tomonidan amalga oshiriladi.'),
 Document(metadata={'chapter': 'VII BO‘LIM. XODIMLARNING MEHNAT HUQUQLARINI HIMOYA QILISH. MEHNAT \nNIZOLARINI KO‘RIB CHIQISH 31-bob. Umumiy qoidalar', 'section': '581-modda', 'section_name': 'Da’vo xususiyatiga ega jamoaviy mehnat nizolarini sudda ko‘rib

In [10]:
# Remove the stray "31-bob. Umumiy qoidalar" sub-heading from the chapter metadata
# and trim the whitespace that the removal would otherwise leave at the end.
for doc in docs:
    doc.metadata["chapter"] = doc.metadata["chapter"].replace("31-bob. Umumiy qoidalar", "").strip()

In [11]:
# Show the last two chunks again to check their chapter metadata.
docs[-2:]

[Document(metadata={'chapter': 'VII BO‘LIM. XODIMLARNING MEHNAT HUQUQLARINI HIMOYA QILISH. MEHNAT \nNIZOLARINI KO‘RIB CHIQISH ', 'section': '580-modda', 'section_name': 'Jamoaviy mehnat nizosini hal etish ( tartibga solish) jarayonida mehnat nizosining taraflari erishgan kelishuvlar va ularning bajarilishi ustidan nazorat'}, page_content='Nizoni hal etish (tartibga solish) jarayonida jamoaviy mehnat nizosi taraflari erishgan \nkelishuvlar yozma shaklda rasmiylashtiriladi va jam oaviy mehnat nizosi taraflari uchun majburiy \nkuchga ega. Ularning bajarilishi ustidan nazorat jamoaviy mehnat nizosining taraflari, shuningdek \nmehnat nizolarini tartibga solish bo‘yicha davlat organlari tomonidan amalga oshiriladi.'),
 Document(metadata={'chapter': 'VII BO‘LIM. XODIMLARNING MEHNAT HUQUQLARINI HIMOYA QILISH. MEHNAT \nNIZOLARINI KO‘RIB CHIQISH ', 'section': '581-modda', 'section_name': 'Da’vo xususiyatiga ega jamoaviy mehnat nizolarini sudda ko‘rib chiqish'}, page_content='Qonunchilikni, jamoa

### Try to generate

In [12]:
import os
import json
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser

In [72]:
# Load environment variables from a local .env file (HF_KEY is read from the
# environment further down; presumably the OpenAI key lives there too — confirm).
from dotenv import load_dotenv

load_dotenv()

True

In [14]:
# Prompt for generating Uzbek question-answer pairs from one chunk.
# Single braces are template variables filled in at invoke time; double braces
# are escaped and reach the model as literal braces.
label_template = """
You are an AI assistant tasked with generating 10 highly focused question-answer pairs **in Uzbek** based **strictly** on the provided document chunk.

Context:
- Document: "{chunk}"
- Chapter Name: "{chapter}"
- Section: "{section}"
- Section Name: "{section_name}"

Instructions:
1. Carefully analyze the given chunk of the document and extract key facts, topics, or concepts.
2. Generate **10 questions** that are maximally semantically close to the specific content of this chunk.
    - All questions must be **strictly tied** to the content of the chunk, chapter, and section.
    - Reuse or closely paraphrase phrases from the chunk wherever possible to maintain high relevance.
    - Incorporate synonyms or light rephrasings to capture different ways a user might query this content (e.g., "mehnat huquqlari" ↔ "ish huquqlari").
    - Avoid any extrapolation, assumptions, or references to external knowledge.
3. Write the **answers** based only on the given chunk. **Do not invent or assume additional information.**
4. Explicitly reference the **chapter name** and **section name** in both the questions and answers for clarity and traceability.
5. Use natural, conversational Uzbek language when generating questions. Occasionally include mild typos or colloquialisms to mimic real user queries, but ensure they remain understandable.
6. Include a reference link in each answer pointing to the document, chapter, and section for traceability.
7. All questions must revolve around a **common theme** in the chunk (e.g., “mehnat huquqlari,” “ish beruvchilarning huquqlari,” etc.).

**Example of how to use phrases from the chunk (paraphrasing)**:
- If the chunk states: "Mehnat qilish, erkin ish tanlash va ishsizlikdan himoyalanish huquqi davlat kafolatlari bilan belgilanadi."
  - A suitable question might be: "Mehnat qilish va erkin ish tanlash huquqlari haqida {{chapter}} va {{section}} nima deyilgan?"
  - The answer must be grounded in the exact text: "Bu huquqlar davlat kafolatlari orqali belgilanadi. Havola: O‘ZBEKISTON RESPUBLIKASINING MEHNAT KODEKSI. {{chapter}}. {{section}}. {{section_name}}."

**Answer Example**:
If the document says: "Mehnat huquqlari himoyasi mehnat kodeksining asosiy maqsadi hisoblanadi."
- Question: "Mehnat kodeksining asosiy maqsadi nima?"
- Answer: "Mehnat huquqlarining himoyasi. Havola: O‘ZBEKISTON RESPUBLIKASINING MEHNAT KODEKSI. {{chapter}}, {{section}}. {{section_name}}."

Output Format:
Return a valid JSON object with the following structure:
```json
{{
  "question_1": "Generated question text",
  "answer_1": "Generated answer text",
  "question_2": "Generated question text",
  ...
}}
```
"""

label_prompt = ChatPromptTemplate.from_template(label_template)
llm = ChatOpenAI(temperature=0.2, model="gpt-4o-mini")

# prompt -> chat model -> JSON parsing of the model's reply
label_chain = label_prompt | llm | JsonOutputParser()

In [15]:
# Print the raw text of the second chunk (index 1).
print(docs[1].page_content)

Ushbu Kodeksning asosiy vazifalari quyidagilardan iborat: 
xodimlar mehnat huquqlari va erkinliklarining, shu jumladan mehnat qilishga, erkin ish 
tanlashga, adolatli va xavfsiz mehnat sharoitlariga hamda ishsizlikdan himoyalanishga bo‘lgan 
huquqining davlat kafolatlarini belgilash; 
ish beruvchilarning ka drlarni tanlash, joy -joyiga qo‘yish va samarali mehnat jarayonini 
tashkil etish sohasidagi huquqlari amalga oshirilishini ta’minlash; 
mehnat sohasida ijtimoiy sheriklikni rag‘batlantirish va rivojlantirish; 
xodimlar va ish beruvchilarning huquqlari hamda q onuniy manfaatlari himoya qilinishini 
ta’minlash; 
mehnat bozorining samarali faoliyat ko‘rsatishiga ko‘maklashish.


In [16]:
doc_index = 1

# Run the labelling chain on the second chunk: its text goes in as "chunk",
# and the three metadata fields are passed through under their own names.
output = label_chain.invoke(
    {
        "chunk": docs[doc_index].page_content,
        **{
            field: docs[doc_index].metadata[field]
            for field in ("chapter", "section", "section_name")
        },
    }
)

In [17]:
# Display the parsed question/answer mapping returned by the chain.
output

{'question_1': 'Ushbu Kodeksning asosiy vazifalari nimalardan iborat?',
 'answer_1': 'Ushbu Kodeksning asosiy vazifalari xodimlar mehnat huquqlari va erkinliklarini, ish beruvchilarning huquqlarini, mehnat sohasida ijtimoiy sheriklikni rag‘batlantirish va mehnat bozorining samarali faoliyat ko‘rsatishini ta’minlashdan iborat. Havola: O‘ZBEKISTON RESPUBLIKASINING MEHNAT KODEKSI. I BO‘LIM. 2-modda. Ushbu Kodeksning asosiy vazifalari.',
 'question_2': 'Mehnat qilish va erkin ish tanlash huquqlari qanday kafolatlanadi?',
 'answer_2': 'Mehnat qilish va erkin ish tanlash huquqlari davlat kafolatlari orqali belgilanadi. Havola: O‘ZBEKISTON RESPUBLIKASINING MEHNAT KODEKSI. I BO‘LIM. 2-modda. Ushbu Kodeksning asosiy vazifalari.',
 'question_3': 'Ish beruvchilarning huquqlari qanday amalga oshiriladi?',
 'answer_3': 'Ish beruvchilarning huquqlari amalga oshirilishini ta’minlash uchun ularning kadrlarni tanlash, joy-joyiga qo‘yish va samarali mehnat jarayonini tashkil etish sohasidagi huquqlari h

#### Another example

In [18]:
# Total number of chunks produced by the splitter.
len(docs)

693

In [19]:
# Pick another chunk to try the chain on, and display it.
doc_index = 421
docs[doc_index]

Document(metadata={'chapter': 'IV BO‘LIM. YAKKA TARTIBDAGI MEHNATGA OID MUNOSABATLAR', 'section': '345-modda', 'section_name': 'Yetkazilgan zararning miqdorini aniqlash'}, page_content='Ish beruvchiga yetkazilgan zarar miqdori buxgalteriya hisobi ma’lumotlari asosidagi haqiqiy \nyo‘qotishlar bo‘yicha belgilanadi. \nIsh beruvchining asosiy fondlarga (vositalarga) taalluqli bo‘lgan mol -mulkiga yetkazilgan \nzarar miqdori belgilangan normalarga ko‘ra eskirish chegirib tashlangan, m oddiy qimmatliklarning \nbalans qiymatidan (tannarxidan) kelib chiqqan holda hisoblab chiqariladi. \nIsh beruvchining asosiy fondlarga (vositalarga) taalluqli mol-mulki o‘g‘irlangan, kamomad \nbo‘lgan, qasddan yo‘q qilingan yoki qasddan buzilgan taqdirda zararni ng miqdori zarar aniqlangan \nkunda ushbu hududda amalda bo‘lgan bozor narxlari bo‘yicha hisoblab chiqariladi. Boshqa hollarda \nzararning miqdori u yetkazilgan kuni mazkur hududda amalda bo‘lgan bozor narxlari bo‘yicha \nhisoblab chiqariladi. \nQonunc

In [20]:
# Run the labelling chain on the chunk selected by `doc_index`.
output = label_chain.invoke(
    {
        "chunk": docs[doc_index].page_content,
        **{
            field: docs[doc_index].metadata[field]
            for field in ("chapter", "section", "section_name")
        },
    }
)

In [21]:
# Display the parsed question/answer mapping for this chunk.
output

{'question_1': 'Ish beruvchiga yetkazilgan zarar miqdori qanday belgilanadi?',
 'answer_1': 'Ish beruvchiga yetkazilgan zarar miqdori buxgalteriya hisobi ma’lumotlari asosidagi haqiqiy yo‘qotishlar bo‘yicha belgilanadi. Havola: O‘ZBEKISTON RESPUBLIKASINING MEHNAT KODEKSI. IV BO‘LIM. 345-modda. Yetkazilgan zararning miqdorini aniqlash.',
 'question_2': 'Asosiy fondlarga taalluqli mol-mulkga yetkazilgan zarar qanday hisoblanadi?',
 'answer_2': 'Ish beruvchining asosiy fondlarga taalluqli mol-mulkiga yetkazilgan zarar miqdori belgilangan normalarga ko‘ra eskirish chegirib tashlangan, oddiy qimmatliklarning balans qiymatidan kelib chiqqan holda hisoblab chiqariladi. Havola: O‘ZBEKISTON RESPUBLIKASINING MEHNAT KODEKSI. IV BO‘LIM. 345-modda. Yetkazilgan zararning miqdorini aniqlash.',
 'question_3': 'O‘g‘irlangan yoki qasddan yo‘q qilingan mol-mulk uchun zarar miqdori qanday aniqlanadi?',
 'answer_3': 'Ish beruvchining asosiy fondlarga taalluqli mol-mulki o‘g‘irlangan, kamomad bo‘lgan, qasdd

### Generate for all

In [22]:
# Run the labelling chain over every chunk, collecting the results keyed by "doc_<index>".
dataset = {}

for i, doc in enumerate(docs):
    # Chunk text plus its metadata; the same four fields feed the prompt and are stored in the row.
    row = {
        "chunk": doc.page_content,
        "chapter": doc.metadata["chapter"],
        "section": doc.metadata["section"],
        "section_name": doc.metadata["section_name"],
    }
    output = label_chain.invoke(dict(row))
    row["questions"] = output
    dataset[f"doc_{i}"] = row

### Save to dataframe

In [26]:
import json
import pandas as pd


# Flatten the per-chunk LLM output into one row per question-answer pair.
dataset_rows = []
skipped_entries = 0  # non-dict outputs and questions without a matching answer

for doc_id, doc_data in dataset.items():
    questions = doc_data["questions"]
    if not isinstance(questions, dict):
        # Nothing to pair up if the parsed output is not a JSON object.
        skipped_entries += 1
        continue

    # Columns shared by every pair generated from this chunk.
    shared = {key: doc_data[key] for key in ("chunk", "chapter", "section", "section_name")}

    # Pair by the numeric suffixes actually present rather than assuming exactly
    # len(questions) // 2 complete pairs: an extra or missing key in the model
    # output would otherwise raise KeyError.
    suffixes = sorted(
        (
            key[len("question_"):]
            for key in questions
            if key.startswith("question_") and key[len("question_"):].isdecimal()
        ),
        key=int,
    )
    for suffix in suffixes:
        answer_key = f"answer_{suffix}"
        if answer_key not in questions:
            skipped_entries += 1
            continue
        dataset_rows.append({
            **shared,
            "question": questions[f"question_{suffix}"],
            "answer": questions[answer_key],
        })

if skipped_entries:
    print(f"Skipped {skipped_entries} malformed or incomplete question-answer entries")

# Convert the dataset into a DataFrame
df = pd.DataFrame(dataset_rows)

In [27]:
# Preview the first rows of the flattened question-answer table.
df.head()

Unnamed: 0,chunk,chapter,section,section_name,question,answer
0,"Ushbu Kodeks xodimlar, ish beruvchilar va davl...",I BO‘LIM. UMUMIY QOIDALAR,1-modda,Ushbu Kodeks bilan tartibga solinadigan munosa...,Ushbu Kodeks qaysi munosabatlarni tartibga sol...,Ushbu Kodeks yakka tartibdagi mehnatga oid mun...
1,"Ushbu Kodeks xodimlar, ish beruvchilar va davl...",I BO‘LIM. UMUMIY QOIDALAR,1-modda,Ushbu Kodeks bilan tartibga solinadigan munosa...,Ushbu Kodeks kimlar o'rtasidagi munosabatlarni...,"Ushbu Kodeks xodimlar, ish beruvchilar va davl..."
2,"Ushbu Kodeks xodimlar, ish beruvchilar va davl...",I BO‘LIM. UMUMIY QOIDALAR,1-modda,Ushbu Kodeks bilan tartibga solinadigan munosa...,Ish beruvchilar va xodimlar o'rtasidagi munosa...,Ushbu Kodeks ish beruvchilar va xodimlar o'rta...
3,"Ushbu Kodeks xodimlar, ish beruvchilar va davl...",I BO‘LIM. UMUMIY QOIDALAR,1-modda,Ushbu Kodeks bilan tartibga solinadigan munosa...,Ushbu Kodeksda ijtimoiy munosabatlar qanday ko...,Ushbu Kodeks ijtimoiy munosabatlarni yakka tar...
4,"Ushbu Kodeks xodimlar, ish beruvchilar va davl...",I BO‘LIM. UMUMIY QOIDALAR,1-modda,Ushbu Kodeks bilan tartibga solinadigan munosa...,Ushbu Kodeksda xodimlar va ish beruvchilar o'r...,"Ushbu Kodeks xodimlar, ish beruvchilar va davl..."


### Import data from script

In [60]:
# Load a previously saved dataset from disk; this rebinds `df`, replacing the frame built above.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
df = pd.read_pickle("data/labor_codex_rag_dataset.pkl")

In [61]:
# Preview the first rows of the loaded dataset.
df.head()

Unnamed: 0,chunk,chapter,section,section_name,question,answer
0,"Ushbu Kodeks xodimlar, ish beruvchilar va davl...",I BO‘LIM. UMUMIY QOIDALAR,1-modda,Ushbu Kodeks bilan tartibga solinadigan munosa...,Ushbu Kodeksda xodimlar va ish beruvchilar o'r...,"Ushbu Kodeks xodimlar, ish beruvchilar va davl..."
1,"Ushbu Kodeks xodimlar, ish beruvchilar va davl...",I BO‘LIM. UMUMIY QOIDALAR,1-modda,Ushbu Kodeks bilan tartibga solinadigan munosa...,Ushbu Kodeksda qanday ijtimoiy munosabatlar ta...,Ushbu Kodeks yakka tartibdagi mehnatga oid mun...
2,"Ushbu Kodeks xodimlar, ish beruvchilar va davl...",I BO‘LIM. UMUMIY QOIDALAR,1-modda,Ushbu Kodeks bilan tartibga solinadigan munosa...,Ushbu Kodeksda davlat manfaatlari qanday muvoz...,"Ushbu Kodeks xodimlar, ish beruvchilar va davl..."
3,"Ushbu Kodeks xodimlar, ish beruvchilar va davl...",I BO‘LIM. UMUMIY QOIDALAR,1-modda,Ushbu Kodeks bilan tartibga solinadigan munosa...,Ushbu Kodeksda mehnat munosabatlari qanday muv...,"Ushbu Kodeks xodimlar, ish beruvchilar va davl..."
4,"Ushbu Kodeks xodimlar, ish beruvchilar va davl...",I BO‘LIM. UMUMIY QOIDALAR,1-modda,Ushbu Kodeks bilan tartibga solinadigan munosa...,Ushbu Kodeksda xodimlar va ish beruvchilar o'r...,Ushbu Kodeks xodimlar va ish beruvchilar o'rta...


In [62]:
# (rows, columns) of the loaded dataset.
df.shape

(6890, 6)

In [63]:
# Store each section name's length in a helper column, then report
# the shape of the subset whose section name is longer than 300 characters.
df["section_name_len"] = df["section_name"].map(len)

df.loc[df["section_name_len"] > 300].shape

(310, 7)

In [64]:
# Keep only the rows whose section name is at most 300 characters long.
df = df.loc[df["section_name_len"] <= 300]

In [65]:
# Number of distinct section names remaining after the filter.
df['section_name'].nunique()

528

## Upload to Hugging Face

In [75]:
# Drop the helper length column. Reassigning instead of using inplace=True avoids
# mutating the filtered frame in place, and errors="ignore" keeps the cell safe to
# re-run once the column is already gone.
df = df.drop(columns=["section_name_len"], errors="ignore")

In [76]:
# Split into train and test sets
from sklearn.model_selection import train_test_split

# 20% of the rows go to the test set; random_state is fixed at 42.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [77]:
# Preview the training split.
train_df.head()

Unnamed: 0,chunk,chapter,section,section_name,question,answer
5257,Kasanachi tomonidan ishlarni bajarish ish beru...,VI BO‘LIM. AYRIM TOIFADAGI XODIMLAR MEHNATINI ...,447-modda,Mehnat shartnomasida shart qilib ko‘rsatilgan ...,Kasanachi oila a’zolari tomonidan bajariladiga...,Kasanachi oila a’zolari tomonidan bajariladiga...
3350,"Mehnat normalarining joriy etilishi, almashtir...",IV BO‘LIM. YAKKA TARTIBDAGI MEHNATGA OID MUNOS...,274-modda,"Mehnat normalarini ishlab chiqish, joriy etish...",Mehnat normalarining amal qilish muddati tugag...,Mazkur muddat tugaganidan keyin vaqtinchalik n...
1939,qonunda nazarda tutilgan hollarda boshqa xodim...,IV BO‘LIM. YAKKA TARTIBDAGI MEHNATGA OID MUNOS...,160-modda,Mehnat shartnomasini xodimning tashabbusiga ko...,Ish beruvchi ogohlantirish muddati tugagandan ...,Ish beruvchi xodimga mehnat shartnomasini beko...
4456,Mehnat sharoiti noqulay ishlarda band bo‘lgan ...,IV BO‘LIM. YAKKA TARTIBDAGI MEHNATGA OID MUNOS...,363-modda,"Xodimlarni sut, davolash -profilaktika oziq-ov...",Ish beruvchi tomonidan taqdim etiladigan oziq-...,"Oziq-ovqat mahsulotlari jamoa kelishuvlarida, ..."
5697,Noqulay tabiiy -iqlim sharoitlaridagi ish uchu...,VI BO‘LIM. AYRIM TOIFADAGI XODIMLAR MEHNATINI ...,483-modda,Noqulay tabiiy-iqlim sharoitlaridagi ish uchun...,Noqulay tabiiy-iqlim sharoitlarida ishlovchila...,"Ushbu ta’til jamoa kelishuvlarida, jamoa shart..."


In [78]:
from datasets import Dataset

# preserve_index=False matches the train/test conversion further down: `df` was
# row-filtered earlier, so its index would otherwise be exported as an extra column.
rag_ds = Dataset.from_pandas(df, preserve_index=False)

In [82]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

# 80/20 train/test split with a fixed random_state.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Wrap each split as a Hugging Face Dataset, leaving the pandas index out.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame, preserve_index=False) for frame in (train_df, test_df)
)

# Bundle both splits under "train" / "test".
dataset_dict = DatasetDict({"train": train_dataset, "test": test_dataset})

# Write the bundle to a local directory.
dataset_dict.save_to_disk("data/rag_dataset_hf")

Saving the dataset (1/1 shards): 100%|██████████| 5264/5264 [00:00<00:00, 835843.89 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1316/1316 [00:00<00:00, 112663.12 examples/s]


In [84]:
# Push to Hugging Face Hub
from huggingface_hub import login

# Authenticate with the token taken from the HF_KEY environment variable.
login(token=os.environ.get("HF_KEY"))

# Target dataset repository on the Hub.
repo_name = "fitlemon/rag-labor-codex-dataset"

# Upload the train/test bundle, then print a confirmation.
dataset_dict.push_to_hub(repo_name)

print("Dataset pushed to the Hugging Face Hub under " + repo_name)

Creating parquet from Arrow format: 100%|██████████| 6/6 [00:00<00:00, 161.98ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.15s/it]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 398.36ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.47s/it]


Dataset pushed to the Hugging Face Hub under fitlemon/rag-labor-codex-dataset
