# Processing dataset -- Chunking

In [None]:
import lzma
import json
import pandas
from chunkipy import TextChunker, TokenEstimator
import os

In [None]:
file_path = "data/train.doj_guidance.jsonl.xz"

# Parse the xz-compressed JSON-lines file, keeping at most the first 100 records.
docs = []
with lzma.open(file_path, "rt", encoding="utf-8") as f:
    for line in f:
        if len(docs) >= 100:
            break
        docs.append(json.loads(line))

print(f"Loaded {len(docs)} documents.")
print("First document preview:", docs[0])

In [None]:
# Print the keys (fields) of the first document; later cells read the
# 'url' and 'text' fields of every document.
print(docs[0].keys())

In [None]:
def chunk_and_save(text, chunk_size, tokens, overlap_percent, docId, output_dir="resources/data/chunks/guidance"):
    """
    Chunk ``text`` with a ``TextChunker`` and write one JSON file per chunk.

    Files are named ``<docId>_chunk_<n>.json`` with ``n`` starting at 1. Each
    file holds the document category, ``docId``, the 1-based chunk index and
    the chunk text. A line is printed for every file written.

    Args:
        text (str): The text to be chunked.
        chunk_size (int): Chunk size handed to ``TextChunker``.
        tokens (bool): Forwarded to ``TextChunker`` as ``tokens``.
        overlap_percent (float): Forwarded to ``TextChunker`` as ``overlap_percent``.
        docId (str): Identifier of the source document; stored in every record
            and used as the file-name prefix.
        output_dir (str): Directory to save the chunk files; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)
    chunker = TextChunker(chunk_size, tokens=tokens, overlap_percent=overlap_percent)
    for chunk_number, chunk in enumerate(chunker.chunk(text), start=1):
        chunk_filename = f"{docId}_chunk_{chunk_number}.json"
        record = {
            "document_category": "guidance",  # hard-coded; change when chunking another category
            "docId": docId,
            "chunk_index": chunk_number,
            "chunk_text": chunk,
        }
        destination = os.path.join(output_dir, chunk_filename)
        with open(destination, "w", encoding="utf-8") as out_file:
            json.dump(record, out_file, ensure_ascii=False, indent=2)
        print(f"Saved {chunk_filename} ({len(chunk)} chunk characters)")

In [None]:
# Chunk every loaded document; each one gets the id "guidance_<position in docs>".
for idx, doc in enumerate(docs):
    url = doc['url']  # read here but not used further in this cell
    docId = f"guidance_{idx}"
    chunk_and_save(
        doc['text'],
        chunk_size=500,
        tokens=True,
        overlap_percent=0.1,
        docId=docId,
        output_dir="resources/data/chunks/guidance",
    )
    print("--------------")

## Query generation

In [None]:
%pip install chunkipy 
%pip install python-terrier
%pip install pyterrier_pisa
%pip install git+https://github.com/terrierteam/pyterrier_dr.git
%pip install git+https://github.com/terrierteam/pyterrier_t5.git

In [None]:
import pandas as pd
from pyterrier_doc2query import Doc2Query, QueryScorer, QueryFilter
from pyterrier_dr import ElectraScorer
import json
import os

def append_queries(input_data):
    """Split each row's newline-joined ``querygen`` string into a list, in place.

    Args:
        input_data: Iterable of dict-like rows, each holding a ``"querygen"``
            string whose entries are separated by ``"\\n"``.

    Returns:
        None. The rows are modified in place.
    """
    for record in input_data:
        joined_queries = record["querygen"]
        record["querygen"] = joined_queries.split("\n")



def list_all_files(directory):
    """Return the paths of all files found under ``directory``, recursively.

    Paths are built by joining each walked directory with the file name, in
    the order ``os.walk`` reports them. A directory that does not exist
    yields an empty list.
    """
    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(directory)
        for name in names
    ]

def batch_processing(batch, pipeline):
    """Run ``pipeline`` on ``batch`` and make its output JSON-friendly.

    Args:
        batch: Documents handed to ``pipeline`` as-is.
        pipeline: Callable that returns an iterable of dict-like rows.

    Returns:
        The pipeline output, with every ``"querygen_score"`` entry converted to
        a list of strings and every ``"querygen"`` string split into a list
        (via ``append_queries``).
    """
    # generate queries
    generated = pipeline(batch)

    # scores are stringified so the rows can be dumped to JSON by the caller
    for entry in generated:
        if "querygen_score" not in entry:
            continue
        entry["querygen_score"] = list(map(str, entry["querygen_score"]))

    append_queries(generated)
    return generated

def process(input_data_path, output_data_path, pipeline, batch_size):
    """Generate queries for every chunk file, one resumable batch at a time.

    Chunk files under ``input_data_path`` are grouped into consecutive batches
    of ``batch_size``. Batch ``n`` is written to
    ``<output_data_path>/batch_<n>.json``; a batch whose output file already
    exists is skipped, so an interrupted run can be restarted.

    Args:
        input_data_path (str): Directory holding the chunk JSON files (searched
            recursively). Each file must provide ``docId``, ``chunk_index``,
            ``document_category`` and ``chunk_text``.
        output_data_path (str): Directory for the batch outputs; created if
            it does not exist.
        pipeline: Query-generation pipeline passed on to ``batch_processing``.
        batch_size (int): Number of chunk files per batch. Changing it between
            runs changes which chunks belong to which batch number.
    """
    # Sort so the chunk -> batch assignment is the same on every run; os.walk
    # gives no ordering guarantee, and resuming relies on stable batch numbers.
    docs = sorted(list_all_files(input_data_path))

    n_batches = (len(docs) + batch_size - 1) // batch_size  # ceiling division

    print(f"Total number of docs: {len(docs)}")

    os.makedirs(output_data_path, exist_ok=True)

    # enumerate keeps batch_num in step with the slice even when a batch is
    # skipped (previously the counter stalled on the first skipped batch, so
    # every following batch was skipped as well).
    for batch_num, start in enumerate(range(0, len(docs), batch_size), start=1):
        batch_path = os.path.join(output_data_path, f"batch_{batch_num}.json")

        # check if batch already processed
        if os.path.exists(batch_path):
            continue

        docs_batch = []
        for chunk_path in docs[start:start + batch_size]:
            # chunk files are written as UTF-8 with non-ASCII characters kept
            with open(chunk_path, "r", encoding="utf-8") as chunk_file:
                chunk = json.load(chunk_file)
            docs_batch.append({
                "id": f"{chunk['docId']}__{chunk['chunk_index']}",
                "doc_id": chunk["docId"],
                "chunk_id": chunk["chunk_index"],
                "document_category": chunk["document_category"],
                "text": chunk["chunk_text"],
            })

        # process batch
        batch_queries = batch_processing(docs_batch, pipeline)

        # save batch
        with open(batch_path, "w", encoding="utf-8") as outfile:
            json.dump(batch_queries, outfile, indent=2)

        print(f"Batch processed : {batch_num} / {n_batches}")

In [None]:
# Directory with the chunk JSON files to generate queries for.
# NOTE(review): the chunking cell writes to "resources/data/chunks/guidance";
# confirm that "./chunks" points at that data.
INPUT_DATA_PATH = "./chunks"
# NOTE(review): the dataset-creation cell reads "resources/data/querygen";
# confirm how the batches written here end up there.
OUTPUT_DATA_PATH = "./queries" # TO BE CREATED IN ADVANCE (or programmatically)
# process() skips a batch when its batch_<n>.json already exists, so the
# batch size has to stay fixed for batch numbers to keep their meaning.
BATCH_SIZE = 32 # DO NOT CHANGE AFTER FIRST RUN
# Passed to Doc2Query as num_samples.
N_QUERIES = 3

# pipeline
doc2query = Doc2Query(num_samples=N_QUERIES)
scorer = ElectraScorer()

# inspection
# Query generation followed by scoring; the QueryFilter stage is commented
# out, so it is not part of the pipeline that runs.
pipeline = doc2query >> QueryScorer(scorer) # >> QueryFilter(append=False, t=3.21484375) # 30% electra filter

process(INPUT_DATA_PATH, OUTPUT_DATA_PATH, pipeline, BATCH_SIZE)

## Dataset creation

In [None]:

import os
import json
import pandas as pd

base_Folder = "resources/data/querygen"
# Directory entries come back in arbitrary order; sort them so query ids are
# assigned identically on every run.
filenames = sorted(next(os.walk(base_Folder), (None, None, []))[2])  # [] if no file

# Minimum score a generated query needs to be kept as a positive example.
min_query_score = 1.5

chunks = []
queries = []
labels = []
query_count = 0
for filename in filenames:
    with open(os.path.join(base_Folder, filename), "r", encoding="utf-8") as file:
        chunks_batch = json.load(file)
    for chunk in chunks_batch:
        # Remove the query fields so the chunk record written to document.csv
        # only carries the document columns.
        generated_queries = chunk.pop("querygen")
        generated_scores = chunk.pop("querygen_score")
        for chunk_query, score in zip(generated_queries, generated_scores):
            # scores are stored as strings in the batch files
            if float(score) >= min_query_score:
                queries.append({"query_id": query_count, "query": chunk_query})
                labels.append({"query_id": query_count,
                               "doc_id": chunk["doc_id"],
                               "chunk_id": chunk["chunk_id"],
                               "label": "Relevant"})
                query_count += 1
        chunks.append(chunk)


# to_csv does not create missing directories
os.makedirs("dataset/legal-docs", exist_ok=True)

documents_df = pd.DataFrame(chunks)
documents_df.rename(columns={"document_category": "category", "text": "content"}, inplace=True)
documents_df.to_csv("dataset/legal-docs/document.csv", index=False, encoding="utf-8")


# Explicit columns keep set_index working when no query passed the threshold.
queries_df = pd.DataFrame(queries, columns=["query_id", "query"])
queries_df.set_index('query_id', inplace=True)
queries_df.to_csv("dataset/legal-docs/query.csv", index=True, encoding="utf-8")


labels_df = pd.DataFrame(labels, columns=["query_id", "doc_id", "chunk_id", "label"])
labels_df.to_csv("dataset/legal-docs/positive-label.csv", index=False, encoding="utf-8")

In [None]:
# Reload the three tables from the CSV files written by the dataset-creation cell.
documents_df = pd.read_csv("dataset/legal-docs/document.csv")
# query.csv was saved with query_id as its index, so query_id is read back
# here as an ordinary column.
queries_df = pd.read_csv("dataset/legal-docs/query.csv")
positive_labels_df = pd.read_csv("dataset/legal-docs/positive-label.csv")


In [None]:
# Preview the first rows of the documents table.
documents_df.head()

In [None]:
# Preview the first rows of the positive labels table.
positive_labels_df.head()

In [None]:
# Number of positive label rows per (doc_id, chunk_id) pair.
positive_labels_df[["doc_id","chunk_id"]].value_counts()

In [None]:
# Preview the first rows of the queries table.
queries_df.head()

In [None]:
# create negative labels

import random
from pyterrier_doc2query import QueryScorer
from pyterrier_dr import ElectraScorer

def create_negative_labels(labels_df, documents_df, quires_df, query_scorer):
    """Pair every labelled chunk with a randomly reassigned query and score the pairs.

    The query ids of ``labels_df`` are shuffled and dealt back out to its rows,
    so each (doc_id, chunk_id) is matched with a query drawn from the same
    pool. A query may land back on its own chunk. No label is assigned here;
    the caller decides relevance from the scores.

    Args:
        labels_df: DataFrame with ``query_id``, ``doc_id`` and ``chunk_id``
            columns. Rows are read by position, so any index is accepted.
        documents_df: DataFrame with ``id`` (``"<doc_id>__<chunk_id>"``) and
            ``content`` columns.
        quires_df: DataFrame with ``query_id`` and ``query`` columns.
        query_scorer: Object whose ``transform`` receives a DataFrame with the
            columns ``query_id``, ``doc_id``, ``chunk_id``, ``text`` and
            ``querygen``.

    Returns:
        Whatever ``query_scorer.transform`` returns for the candidate pairs.

    Raises:
        KeyError: If a chunk or a query id has no entry in ``documents_df`` /
            ``quires_df``.
    """
    shuffled_query_ids = labels_df["query_id"].tolist()
    random.shuffle(shuffled_query_ids)

    # One pass to build lookup tables instead of scanning the whole frame for
    # every row. setdefault keeps the first occurrence of a duplicated key.
    content_by_id = {}
    for doc_key, content in zip(documents_df["id"], documents_df["content"]):
        content_by_id.setdefault(doc_key, content)

    query_by_id = {}
    for known_query_id, query_text in zip(quires_df["query_id"], quires_df["query"]):
        query_by_id.setdefault(known_query_id, query_text)

    # build df for Query Scorer
    query_list = [
        {
            "query_id": query_id,
            "doc_id": doc_id,
            "chunk_id": chunk_id,
            "text": content_by_id[f"{doc_id}__{chunk_id}"],
            "querygen": query_by_id[query_id],
        }
        for query_id, doc_id, chunk_id in zip(
            shuffled_query_ids, labels_df["doc_id"], labels_df["chunk_id"]
        )
    ]

    scores_df = query_scorer.transform(pd.DataFrame.from_records(query_list))

    return scores_df


In [None]:
n_runs = 1

# Build the scorer once; creating it inside the loop would instantiate a new
# QueryScorer/ElectraScorer for every run.
query_scorer = QueryScorer(ElectraScorer())

# Collect one frame per run and concatenate once, instead of repeatedly
# concatenating onto an initially empty DataFrame.
run_frames = [
    create_negative_labels(positive_labels_df, documents_df, queries_df, query_scorer)
    for _ in range(n_runs)
]
negative_labels_df = pd.concat(run_frames) if run_frames else pd.DataFrame()
# Rows from different runs share index values; reset so each row has a unique
# position-based index (the old index is kept as an "index" column).
negative_labels_df.reset_index(inplace=True)

In [None]:
# Display the scored candidate pairs.
negative_labels_df

In [None]:
# create relevance
# Same inclusive cut-off as the one used when keeping generated queries as
# positives (score >= 1.5), so a pair scoring exactly at the threshold is
# labelled the same way in both places.
relevance_threshold = 1.5

# Each querygen_score entry is a sequence of scores; only the first is used.
relevance = [
    "Relevant" if scores[0] >= relevance_threshold else "Irrelevant"
    for scores in negative_labels_df["querygen_score"]
]

negative_labels_df["label"] = relevance
# Keep only the columns of the positive labels, in the same order, so both
# tables can be concatenated.
negative_labels_df = negative_labels_df[positive_labels_df.columns]

In [None]:
# concat with positive labels
# ignore_index renumbers the rows 0..n-1 directly. A plain reset_index() would
# keep the old, duplicated index values as an extra "index" column, which
# would then be written into label.csv.
labels_df = pd.concat((positive_labels_df, negative_labels_df), ignore_index=True)

In [None]:
# Display the combined label table.
labels_df

In [None]:
# Write the combined labels next to the other dataset CSV files.
labels_df.to_csv("dataset/legal-docs/label.csv", index=False, encoding="utf-8") 