# Processing dataset -- Chunking

In [8]:
import json
import lzma
import os
import posixpath
from urllib.parse import urlparse

import pandas
from chunkipy import TextChunker, TokenEstimator

In [17]:
file_path = "data/train.doj_guidance.jsonl.xz"

# The dataset is an xz-compressed JSON Lines file: one JSON document per line.
docs = []
with lzma.open(file_path, "rt", encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads.
        if not line.strip():
            continue
        docs.append(json.loads(line))

print(f"Loaded {len(docs)} documents.")
if docs:
    # Show only the start of the first record; a whole document floods the cell output.
    preview = str(docs[0])
    print("First document preview:", preview[:500] + ("..." if len(preview) > 500 else ""))

Loaded 335 documents.
First document preview: {'url': 'https://www.justice.gov/d9/2022-11/interagency_veterans_resource_0.pdf', 'text': 'Protections Against Employment Discrimination for Service Members\nand Veterans\nJointly Authored By:\nU.S. Department of Labor Office of Federal Contract Compliance Programs (OFCCP)\nU.S. Department of Labor Veterans’ Employment and Training Service (VETS)\nU.S. Department of Justice Civil Rights Division (CRT)\nU.S. Equal Employment Opportunity Commission (EEOC)\nService members and veterans have made great sacrifices to guarantee the freedoms and\nliberties that all Americans enjoy. And service members and veterans themselves are also\nguaranteed rights, such as the right to be free from discrimination in civilian employment. This\ndocument helps explain where to get help if you are a service member or veteran and you feel\nthat you have been discriminated against in employment because of your military status,\nveteran status, or another protected 

In [18]:
# Inspect which fields a record carries, using the first document as the sample
first_doc = docs[0]
print(first_doc.keys())

dict_keys(['url', 'text', 'downloaded_timestamp', 'created_timestamp'])


In [None]:
def chunk_and_save(text, chunk_size, tokens, overlap_percent, document_name, output_dir="chunks/guidance", document_category="guidance"):
    """
    Splits the input text into overlapping chunks and saves each chunk as a JSON file.

    Every chunk is written to ``<output_dir>/chunk_<n>_<document_name>.json``
    (``n`` starts at 1) as an object with the keys ``document_category``,
    ``document_name``, ``chunk_index`` and ``chunk_text``.

    Args:
        text (str): The text to be chunked.
        chunk_size (int): The size of each chunk, passed to ``TextChunker``.
        tokens (bool): Whether to chunk by tokens, passed to ``TextChunker``.
        overlap_percent (float): Percentage of overlap between chunks.
        document_name (str): Name of the original document. Stored unchanged in
            the JSON; path separators are replaced by ``_`` in the file name only.
        output_dir (str): Directory to save the chunk files (created if missing).
        document_category (str): Value stored under ``document_category`` in
            every chunk record. Defaults to ``"guidance"``.

    Returns:
        list[str]: Paths of the chunk files written, in chunk order.
    """
    os.makedirs(output_dir, exist_ok=True)
    text_chunker = TextChunker(chunk_size, tokens=tokens, overlap_percent=overlap_percent)
    chunks = text_chunker.chunk(text)

    # A separator inside the name would make os.path.join point into a
    # (possibly missing) subdirectory or outside output_dir, so flatten it.
    safe_name = document_name
    for separator in filter(None, {"/", os.sep, os.altsep}):
        safe_name = safe_name.replace(separator, "_")

    written_paths = []
    for chunk_index, chunk in enumerate(chunks, start=1):
        chunk_data = {
            "document_category": document_category,
            "document_name": document_name,
            "chunk_index": chunk_index,
            "chunk_text": chunk
        }
        chunk_filename = f"chunk_{chunk_index}_{safe_name}.json"
        chunk_path = os.path.join(output_dir, chunk_filename)
        # ensure_ascii=False keeps non-ASCII text readable in the saved file.
        with open(chunk_path, "w", encoding="utf-8") as f:
            json.dump(chunk_data, f, ensure_ascii=False, indent=2)
        written_paths.append(chunk_path)
        print(f"Saved {chunk_filename} ({len(chunk)} chunk characters)")
    return written_paths

In [23]:
# Chunk every loaded document and write its chunks to chunks/guidance.
seen_document_names = set()
for idx, doc in enumerate(docs):
    # Take the name from the URL *path* only, so a query string or fragment
    # never ends up in a file name; a URL is not a local filesystem path.
    url_path = urlparse(doc['url']).path
    document_name = posixpath.basename(url_path.rstrip("/")) or f"document_{idx}"
    # Two URLs can share a basename (same file name under different folders);
    # prefix the document index so the later one does not overwrite the earlier.
    if document_name in seen_document_names:
        document_name = f"{idx}_{document_name}"
    seen_document_names.add(document_name)
    chunk_and_save(doc['text'], chunk_size=500, tokens=True, overlap_percent=0.1, document_name=document_name, output_dir="chunks/guidance")
    print("--------------")

Saved chunk_1_interagency_veterans_resource_0.pdf.json (3747 chunk characters)
Saved chunk_2_interagency_veterans_resource_0.pdf.json (3370 chunk characters)
Saved chunk_3_interagency_veterans_resource_0.pdf.json (3343 chunk characters)
Saved chunk_4_interagency_veterans_resource_0.pdf.json (2022 chunk characters)
--------------
Saved chunk_1_know_your_voting_rights_spanish.pdf.json (3450 chunk characters)
Saved chunk_2_know_your_voting_rights_spanish.pdf.json (3318 chunk characters)
Saved chunk_3_know_your_voting_rights_spanish.pdf.json (3153 chunk characters)
Saved chunk_4_know_your_voting_rights_spanish.pdf.json (3329 chunk characters)
Saved chunk_5_know_your_voting_rights_spanish.pdf.json (2385 chunk characters)
--------------
Saved chunk_1_know_your_voting_rights.pdf.json (3365 chunk characters)
Saved chunk_2_know_your_voting_rights.pdf.json (3180 chunk characters)
Saved chunk_3_know_your_voting_rights.pdf.json (3215 chunk characters)
Saved chunk_4_know_your_voting_rights.pdf.json

KeyboardInterrupt: 