# Processing dataset -- Chunking

In [27]:
import lzma
import json
import pandas
from chunkipy import TextChunker, TokenEstimator
import os

ModuleNotFoundError: No module named 'chunkipy'

In [26]:
file_path = "data/train.doj_guidance.jsonl.xz"

docs = []
with lzma.open(file_path, "rt", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= 100:
            break
        docs.append(json.loads(line))

print(f"Loaded {len(docs)} documents.")
print("First document preview:", docs[0])

Loaded 100 documents.
First document preview: {'url': 'https://www.justice.gov/d9/2022-11/interagency_veterans_resource_0.pdf', 'text': 'Protections Against Employment Discrimination for Service Members\nand Veterans\nJointly Authored By:\nU.S. Department of Labor Office of Federal Contract Compliance Programs (OFCCP)\nU.S. Department of Labor Veterans’ Employment and Training Service (VETS)\nU.S. Department of Justice Civil Rights Division (CRT)\nU.S. Equal Employment Opportunity Commission (EEOC)\nService members and veterans have made great sacrifices to guarantee the freedoms and\nliberties that all Americans enjoy. And service members and veterans themselves are also\nguaranteed rights, such as the right to be free from discrimination in civilian employment. This\ndocument helps explain where to get help if you are a service member or veteran and you feel\nthat you have been discriminated against in employment because of your military status,\nveteran status, or another protected 

In [27]:
# Print the keys (fields) of the first document
print(docs[0].keys())

dict_keys(['url', 'text', 'downloaded_timestamp', 'created_timestamp'])


In [33]:
def chunk_and_save(text, chunk_size, tokens, overlap_percent, docId, output_dir="resources/data/chunks/guidance"):
    """
    Splits the input text into overlapping chunks and saves each chunk as a JSON file.

    Args:
        text (str): The text to be chunked.
        chunk_size (int): The size of each chunk.
        tokens (bool): Whether to chunk by tokens.
        overlap_percent (float): Percentage of overlap between chunks.
        document_name (str): Name of the original document.
        output_dir (str): Directory to save the chunk files.
    """
    os.makedirs(output_dir, exist_ok=True)
    text_chunker = TextChunker(chunk_size, tokens=tokens, overlap_percent=overlap_percent)
    chunks = text_chunker.chunk(text)
    for i, chunk in enumerate(chunks):
        chunk_data = {
            "document_category": "guidance", #change here
            "docId": docId,
            "chunk_index": i + 1,
            "chunk_text": chunk
        }
        chunk_filename = f"{docId}_chunk_{i + 1}.json"
        chunk_path = os.path.join(output_dir, chunk_filename)
        with open(chunk_path, "w", encoding="utf-8") as f:
            json.dump(chunk_data, f, ensure_ascii=False, indent=2)
        print(f"Saved {chunk_filename} ({len(chunk)} chunk characters)")

In [None]:
#loop over docs
for idx, doc in enumerate(docs):
    url = doc['url']
    docId = "guidance_"+str(idx)
    chunk_and_save(doc['text'], chunk_size=500, tokens=True, overlap_percent=0.1, docId=docId, output_dir= "resources/data/chunks/guidance")
    print("--------------")

Saved guidance_0_chunk_1.json (3747 chunk characters)
Saved guidance_0_chunk_2.json (3370 chunk characters)
Saved guidance_0_chunk_3.json (3343 chunk characters)
Saved guidance_0_chunk_4.json (2022 chunk characters)
--------------


## Query generation

In [None]:
# TODO

## Dataset creation

In [11]:

import os
import json
import pandas as pd

base_Folder = "resources/data/querygen"
filenames = next(os.walk(base_Folder), (None, None, []))[2]  # [] if no file


chunks = []
queries = []
labels = []
query_count = 0
for filename in filenames:
    with open(os.path.join(base_Folder, filename), "r") as file:
        chunks_batch = json.load(file)
        for chunk in chunks_batch:
            labels_chunk = []
            chunk_queries = [q for q, s in zip(chunk["querygen"], chunk["querygen_score"]) if float(s) >= 1.5]
            for chunk_query in chunk_queries:
                queries.append({"query_id": query_count, "query": chunk_query})
                labels_chunk.append({"query_id": query_count, 
                           "doc_id": chunk["doc_id"],  
                           "chunk_id": chunk["chunk_id"],
                           "label": "Relevant"})
                query_count += 1
            del chunk["querygen"]
            del chunk["querygen_score"]
            chunks.append(chunk)
            labels.extend(labels_chunk)
            
            
documents_df = pd.DataFrame(chunks)
documents_df.rename(columns={"document_category": "category", "text": "content"}, inplace=True)
documents_df.to_csv("dataset/legal-docs/document.csv", index=False, encoding="utf-8")


queries_df = pd.DataFrame(queries)
queries_df.set_index('query_id', inplace=True)
queries_df.to_csv("dataset/legal-docs/query.csv", index=True, encoding="utf-8")


labels_df = pd.DataFrame(labels)
labels_df.to_csv("dataset/legal-docs/label.csv", index=False, encoding="utf-8")        

In [3]:
documents_df

Unnamed: 0,id,doc_id,chunk_id,category,content
0,legal-advice_98__48,legal-advice_98,48,legal-advice,(f) Effective/applicability date. This section...
1,legal-advice_31__26,legal-advice_31,26,legal-advice,The\ncircumstances described in this paragraph...
2,legal-advice_25__137,legal-advice_25,137,legal-advice,2. 3. Add lines 1 and 2 . . . . . . . . . . . ...
3,legal-advice_64__147,legal-advice_64,147,legal-advice,If the premium tax credit you can\nclaim excee...
4,legal-advice_37__14,legal-advice_37,14,legal-advice,"However, if you were a nonresident alien or a ..."
...,...,...,...,...,...
3195,legal-advice_68__126,legal-advice_68,126,legal-advice,You can either pay the\npremiums yourself or t...
3196,legal-advice_40__60,legal-advice_40,60,legal-advice,Line 25c—Other Forms\nInclude on line 25c any ...
3197,legal-advice_61__26,legal-advice_61,26,legal-advice,The\ncircumstances described in this paragraph...
3198,legal-advice_93__18,legal-advice_93,18,legal-advice,"Upon\nthe applicant’s written request, the Int..."


In [12]:
labels_df

Unnamed: 0,query_id,doc_id,chunk_id,label
0,0,legal-advice_74,1,Relevant
1,1,legal-advice_67,28,Relevant
2,2,legal-advice_67,28,Relevant
3,3,legal-advice_36,35,Relevant
4,4,legal-advice_78,8,Relevant
...,...,...,...,...
862,862,legal-advice_61,20,Relevant
863,863,legal-advice_45,8,Relevant
864,864,legal-advice_0,27,Relevant
865,865,legal-advice_61,26,Relevant


In [9]:
labels_df[["doc_id","chunk_id"]].value_counts()

doc_id           chunk_id
legal-advice_65  27          3
legal-advice_92  18          3
legal-advice_59  1           3
legal-advice_26  55          3
legal-advice_77  1           3
                            ..
legal-advice_45  14          1
                 10          1
                 8           1
legal-advice_27  22          1
legal-advice_99  156         1
Name: count, Length: 645, dtype: int64

In [None]:
# create negative labels

import random
from pyterrier_doc2query import QueryScorer
from pyterrier_dr import ElectraScorer

def create_negative_labels(labels_df, quires_df, query_scorer):
    labels_df_copy = labels_df.copy()

    query_id = labels_df_copy["query_id"].tolist()
    random.shuffle(query_id)

    labels_df_copy["query_id"] = query_id

    # build df for Query Scorer
    query_df = pd.DataFrame([

    ])

    


    return labels_df_copy


In [33]:
quires_df = pd.read_csv("dataset/legal-docs/query.csv")

In [34]:
quires_df

Unnamed: 0,query_id,query
0,0,internal revenue service memo release number
1,1,who must qualify for the required continuing e...
2,2,requirements for a continuing education program
3,3,statement of fee information concerning matter...
4,4,what is the purpose of bea payments
...,...,...
862,862,when do you renew your social security numbers
863,863,who has the right to redeem a gift card at any...
864,864,who can prepare tax returns
865,865,who may represent employer


In [31]:
create_negative_labels(labels_df)

Unnamed: 0,query_id,doc_id,chunk_id,label
0,177,legal-advice_74,1,Relevant
1,257,legal-advice_67,28,Relevant
2,357,legal-advice_67,28,Relevant
3,498,legal-advice_36,35,Relevant
4,79,legal-advice_78,8,Relevant
...,...,...,...,...
862,58,legal-advice_61,20,Relevant
863,266,legal-advice_45,8,Relevant
864,264,legal-advice_0,27,Relevant
865,120,legal-advice_61,26,Relevant
