# Processing dataset -- Chunking

In [24]:
import lzma
import json
import pandas
from chunkipy import TextChunker, TokenEstimator
import os

In [26]:
file_path = "data/train.doj_guidance.jsonl.xz"

docs = []
with lzma.open(file_path, "rt", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= 100:
            break
        docs.append(json.loads(line))

print(f"Loaded {len(docs)} documents.")
print("First document preview:", docs[0])

Loaded 100 documents.
First document preview: {'url': 'https://www.justice.gov/d9/2022-11/interagency_veterans_resource_0.pdf', 'text': 'Protections Against Employment Discrimination for Service Members\nand Veterans\nJointly Authored By:\nU.S. Department of Labor Office of Federal Contract Compliance Programs (OFCCP)\nU.S. Department of Labor Veterans’ Employment and Training Service (VETS)\nU.S. Department of Justice Civil Rights Division (CRT)\nU.S. Equal Employment Opportunity Commission (EEOC)\nService members and veterans have made great sacrifices to guarantee the freedoms and\nliberties that all Americans enjoy. And service members and veterans themselves are also\nguaranteed rights, such as the right to be free from discrimination in civilian employment. This\ndocument helps explain where to get help if you are a service member or veteran and you feel\nthat you have been discriminated against in employment because of your military status,\nveteran status, or another protected 

In [27]:
# Print the keys (fields) of the first document
print(docs[0].keys())

dict_keys(['url', 'text', 'downloaded_timestamp', 'created_timestamp'])


In [33]:
def chunk_and_save(text, chunk_size, tokens, overlap_percent, docId, output_dir="resources/data/chunks/guidance"):
    """
    Splits the input text into overlapping chunks and saves each chunk as a JSON file.

    Args:
        text (str): The text to be chunked.
        chunk_size (int): The size of each chunk.
        tokens (bool): Whether to chunk by tokens.
        overlap_percent (float): Percentage of overlap between chunks.
        document_name (str): Name of the original document.
        output_dir (str): Directory to save the chunk files.
    """
    os.makedirs(output_dir, exist_ok=True)
    text_chunker = TextChunker(chunk_size, tokens=tokens, overlap_percent=overlap_percent)
    chunks = text_chunker.chunk(text)
    for i, chunk in enumerate(chunks):
        chunk_data = {
            "document_category": "guidance", #change here
            "docId": docId,
            "chunk_index": i + 1,
            "chunk_text": chunk
        }
        chunk_filename = f"{docId}_chunk_{i + 1}.json"
        chunk_path = os.path.join(output_dir, chunk_filename)
        with open(chunk_path, "w", encoding="utf-8") as f:
            json.dump(chunk_data, f, ensure_ascii=False, indent=2)
        print(f"Saved {chunk_filename} ({len(chunk)} chunk characters)")

In [None]:
#loop over docs
for idx, doc in enumerate(docs):
    url = doc['url']
    docId = "guidance_"+str(idx)
    chunk_and_save(doc['text'], chunk_size=500, tokens=True, overlap_percent=0.1, docId=docId, output_dir= "resources/data/chunks/guidance")
    print("--------------")

Saved guidance_0_chunk_1.json (3747 chunk characters)
Saved guidance_0_chunk_2.json (3370 chunk characters)
Saved guidance_0_chunk_3.json (3343 chunk characters)
Saved guidance_0_chunk_4.json (2022 chunk characters)
--------------
