# Processing dataset

In [None]:
%pip install chunkipy
%pip install --upgrade git+https://github.com/terrierteam/pyterrier_dr.git 
%pip install --upgrade git+https://github.com/terrierteam/pyterrier_t5.git
%pip install --upgrade git+https://github.com/terrierteam/pyterrier_doc2query.git

## Chunking

In [2]:
import lzma
import json
import hashlib


files = {
    "guidance": "train.doj_guidance.jsonl.xz",
    "memos": "train.olc_memos.jsonl.xz",
    "eurlex": "train.eurlex.jsonl.xz",
    "legal-advice": "train.irs_legal_advice_memos.jsonl.xz"
}
base_folder = "resources/data/documents"

document_categories = {}
for category, file_name in files.items():
    file_path = f"{base_folder}/{file_name}"
    print(f"Loading {file_path}...")
    document_categories[category] = {}
    
    # Open the compressed JSONL file and read lines
    with lzma.open(file_path, "rt", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if len(document_categories[category]) >= 100:
                break
            document = json.loads(line)
            hash = hashlib.sha256(document["text"].encode('utf-8')).hexdigest()
            document_categories[category][hash] = document
        document_categories[category] = list(document_categories[category].values())

    print(f"Loaded {len(document_categories[category])} documents.")

Loading resources/data/documents/train.doj_guidance.jsonl.xz...
Loaded 100 documents.
Loading resources/data/documents/train.olc_memos.jsonl.xz...
Loaded 100 documents.
Loading resources/data/documents/train.eurlex.jsonl.xz...
Loaded 100 documents.
Loading resources/data/documents/train.irs_legal_advice_memos.jsonl.xz...
Loaded 100 documents.


In [8]:
import os
from chunkipy import TextChunker

def chunk_and_save(text, category, chunk_size, tokens, overlap_percent, doc_id, output_dir="resources/data/chunks/guidance"):
    """
    Splits the input text into overlapping chunks and saves each chunk as a JSON file.

    Args:
        text (str): The text to be chunked.
        chunk_size (int): The size of each chunk.
        tokens (bool): Whether to chunk by tokens.
        overlap_percent (float): Percentage of overlap between chunks.
        document_name (str): Name of the original document.
        output_dir (str): Directory to save the chunk files.
    """
    os.makedirs(output_dir, exist_ok=True)
    text_chunker = TextChunker(chunk_size, tokens=tokens, overlap_percent=overlap_percent)
    chunks = text_chunker.chunk(text)
    document_data = {
        "doc_id": doc_id,
        "category": category,
        "chunks": []
    }
    for i, chunk in enumerate(chunks):
        chunk_data = {
            "id": f"{doc_id}__{str(i)}",
            "chunk_id": str(i),
            "content": chunk
        }
        document_data["chunks"].append(chunk_data)
        print(f"Added chunk #{i} ({len(chunk_data['content'])} characters)")
    document_filename = f"{doc_id}_chunks.json"
    document_path = os.path.join(output_dir, document_filename)
    with open(document_path, "w", encoding="utf-8") as f:
        json.dump(document_data, f, ensure_ascii=False, indent=2)
        print(f"Saved {document_filename} ({len(document_data['chunks'])} chunks)")

In [9]:
# loop over docs
for category, documents in document_categories.items():
    for idx, document in enumerate(documents):
        url = document['url']
        doc_id = f"{category}_{str(idx)}"
        chunk_and_save(document['text'], category=category, chunk_size=500, tokens=True, overlap_percent=0.1, doc_id=doc_id, output_dir=f"resources/data/chunks/{category}")
        print("--------------")

Added chunk #0 (3747 characters)
Added chunk #1 (3370 characters)
Added chunk #2 (3343 characters)
Added chunk #3 (2022 characters)
Saved guidance_0_chunks.json (4 chunks)
--------------
Added chunk #0 (3450 characters)
Added chunk #1 (3318 characters)
Added chunk #2 (3153 characters)
Added chunk #3 (3329 characters)
Added chunk #4 (2385 characters)
Saved guidance_1_chunks.json (5 chunks)
--------------
Added chunk #0 (3365 characters)
Added chunk #1 (3180 characters)
Added chunk #2 (3215 characters)
Added chunk #3 (2797 characters)
Saved guidance_2_chunks.json (4 chunks)
--------------
Added chunk #0 (3754 characters)
Added chunk #1 (3558 characters)
Added chunk #2 (3402 characters)
Added chunk #3 (957 characters)
Saved guidance_3_chunks.json (4 chunks)
--------------
Added chunk #0 (3235 characters)
Added chunk #1 (3536 characters)
Added chunk #2 (3504 characters)
Added chunk #3 (3100 characters)
Added chunk #4 (2591 characters)
Saved guidance_4_chunks.json (5 chunks)
--------------


## Query generation

In [2]:
from pyterrier_doc2query import Doc2Query, QueryScorer, QueryFilter
from pyterrier_dr import ElectraScorer
import json
from glob import glob
import os


def list_all_files(directory):
    all_paths = glob(directory + '/**/*', recursive=True)
    all_files = [f.replace("\\", "/") for f in all_paths if os.path.isfile(f)]
    return all_files

def queries_processing(pipeline, input):
    # generate queries
    results = pipeline(input)

    # modify data structure
    for i, result in enumerate(results):
        if "querygen_score" in result:
            result["querygen_score"] = [str(num) for num in result["querygen_score"]]
   
        result["querygen"] = result["querygen"].split("\n") 
        if len(result["querygen_score"]) == 0:
            result["querygen"] = []
        results[i] = result

    return results

def process(input_data_path, output_data_path, pipeline):
    # list all docs
    chunks_document_files = list_all_files(input_data_path)

    print(f"Total number of docs: {len(chunks_document_files)}")
    destination_files = list_all_files(output_data_path)

    for i, chunks_document_file in enumerate(chunks_document_files):
        with open(chunks_document_file, 'r', encoding="utf-8") as file:
            chunks_document = json.load(file)
            
        doc_id = chunks_document["doc_id"]
        category = chunks_document["category"]
        
        # skip if the file has been already processed
        if f"{output_data_path}/{category}/{doc_id}_queries.json" in destination_files:
            print(f"Skipping already processed file: {doc_id}")
            continue
        
        chunks = []
        for chunk in chunks_document["chunks"]:
            chunk_data = {
                "text": chunk["content"]
            }
            chunks.append(chunk_data)

        # process batch
        chunks_queries = queries_processing(pipeline, chunks)
        
        chunks_document_queries = {
            "doc_id": doc_id,
            "category": category,
            "chunks": [
                {
                    "id": cd["id"],
                    "chunk_id": cd["chunk_id"],
                    "content": cd["content"],
                    "queries": cq["querygen"],
                    "queries_score": cq.get("querygen_score", []),
                } for cd, cq in zip(chunks_document["chunks"], chunks_queries)],
        }

        # save batch
        with open(f"{output_data_path}/{category}/{doc_id}_queries.json", "w") as outfile:
            json.dump(chunks_document_queries, outfile, indent=2)
        
        print(f"Document processed: {doc_id} - Progress: {i+1} / {len(chunks_document_files)}")


  from .autonotebook import tqdm as notebook_tqdm


In [71]:
chunks_data_path = "resources/data/chunks"
queries_data_path = "resources/data/queries" 
n_queries = 3
threshold = 1.5

os.makedirs(queries_data_path, exist_ok=True)
# pipeline
doc2query = Doc2Query(num_samples=n_queries, fast_tokenizer=True)
scorer = ElectraScorer()

# inspection 
pipeline = doc2query >> QueryScorer(scorer) >> QueryFilter(append=False, t=threshold)  # 30% electra filter

process(chunks_data_path, queries_data_path, pipeline)


Total number of docs: 400
Skipping already processed file: eurlex_0
Skipping already processed file: eurlex_10
Skipping already processed file: eurlex_11
Skipping already processed file: eurlex_12
Skipping already processed file: eurlex_13
Skipping already processed file: eurlex_14
Skipping already processed file: eurlex_15
Skipping already processed file: eurlex_16
Skipping already processed file: eurlex_17
Skipping already processed file: eurlex_18
Skipping already processed file: eurlex_19
Skipping already processed file: eurlex_1
Skipping already processed file: eurlex_20
Skipping already processed file: eurlex_21
Skipping already processed file: eurlex_22
Skipping already processed file: eurlex_23
Skipping already processed file: eurlex_24
Skipping already processed file: eurlex_25
Skipping already processed file: eurlex_26
Skipping already processed file: eurlex_27
Skipping already processed file: eurlex_28
Skipping already processed file: eurlex_29
Skipping already processed fil

## Dataset creation

In [5]:
import os
import json
import pandas as pd

dataset_folder = "dataset/legal-docs"
base_folder = "resources/data/negqueries"
filenames = list_all_files(base_folder)
os.makedirs(dataset_folder, exist_ok=True)

chunks = []
queries_map = {}
labels = []
query_count = 0
for filename in filenames:
    with open(filename, "r", encoding="utf-8") as file:
        chunks_with_queries_document = json.load(file)
        doc_id = chunks_with_queries_document["doc_id"]
        for chunk in chunks_with_queries_document["chunks"]:
            labels_chunk = []
            for query, query_score in zip(chunk["queries"], chunk["queries_score"]):
                if query not in queries_map:
                    queries_map[query] = query_count
                    query_count += 1
                labels_chunk.append(
                    {
                        "query_id": queries_map[query], 
                        "id": chunk["id"], 
                        "doc_id": doc_id,  
                        "chunk_id": chunk["chunk_id"],
                        "label": "Relevant" if float(query_score) >= threshold else "Irrelevant"
                    }
                )
     
            del chunk["queries"]
            del chunk["queries_score"]
            chunk["id"] = chunk["id"]
            chunk["chunk_id"] = chunk["chunk_id"]
            chunk["doc_id"] = doc_id
            chunk["category"] = chunks_with_queries_document["category"]
            chunk["content"] = chunk["content"].replace("\t, ", " ").replace("\n", " ")
            chunks.append(chunk)
            labels.extend(labels_chunk)
            
            
documents_df = pd.DataFrame(chunks)
documents_df.set_index('id', inplace=True)
documents_df.to_csv(f"{dataset_folder}/document.csv", index=True, sep="\t")


queries = [{"query_id": query_count, "query": query} for query, query_count in queries_map.items()]
queries_df = pd.DataFrame(queries)
queries_df.set_index('query_id', inplace=True)
queries_df.to_csv(f"{dataset_folder}/query.csv", index=True, encoding="utf-8", sep="\t")


labels_df = pd.DataFrame(labels)
labels_df.to_csv(f"{dataset_folder}/label.csv", index=False, encoding="utf-8", sep="\t")

 

In [114]:
documents_df = pd.read_csv(f"{dataset_folder}/document.csv", sep="\t")
queries_df = pd.read_csv(f"{dataset_folder}/query.csv", sep="\t")
labels_df = pd.read_csv(f"{dataset_folder}/label.csv", sep="\t")


In [115]:
documents_df.head()

Unnamed: 0,id,chunk_id,content,doc_id,category
0,eurlex_0__0,0,Name: Decision (EU) 2019/276 of the European P...,eurlex_0,eurlex
1,eurlex_0__1,1,(6) In order to allow for the quick use of the...,eurlex_0,eurlex
2,eurlex_10__0,0,Name: Decision (EU) 2018/515 of the European P...,eurlex_10,eurlex
3,eurlex_10__1,1,For the general budget of the Union for the fi...,eurlex_10,eurlex
4,eurlex_11__0,0,Name: Decision (EU) 2018/508 of the European P...,eurlex_11,eurlex


In [118]:
labels_df.head()

Unnamed: 0,query_id,id,doc_id,chunk_id,label
0,0,eurlex_0__0,eurlex_0,0,Relevant
1,1,eurlex_0__0,eurlex_0,0,Irrelevant
2,2,eurlex_0__0,eurlex_0,0,Irrelevant
3,3,eurlex_0__1,eurlex_0,1,Relevant
4,4,eurlex_0__1,eurlex_0,1,Irrelevant


In [None]:
positive_labels_df[["doc_id", "chunk_id"]].value_counts()

doc_id     chunk_id
eurlex_0   0           3
           1           2
eurlex_11  0           2
eurlex_12  0           2
eurlex_11  1           1
eurlex_12  1           1
Name: count, dtype: int64

In [95]:
from glob import glob

def list_all_files(directory):
    all_paths = glob(directory + '/**/*', recursive=True)
    all_files = [f.replace("\\", "/") for f in all_paths if os.path.isfile(f)]
    return all_files

def load_all_chunks(base_folder):
    filenames = list_all_files(base_folder)
    
    chunks = {}
    for filename in filenames:
        with open(filename, "r", encoding="utf-8") as file:
            chunks_document = json.load(file)
            for chunk in chunks_document["chunks"]:
                chunk["doc_id"] = chunks_document["doc_id"]
                chunks[chunk["id"]] = chunk
    return chunks



base_folder = "resources/data/queries"
chunks_with_queries = load_all_chunks(base_folder)

In [None]:
output_folder = "resources/data/negqueries"

def save_data_with_queries(chunks_with_queries, output_folder):
    os.makedirs(output_folder, exist_ok=True)
    doc_ids = list(set([c["doc_id"] for c in chunks_with_queries.values()]))

    for doc_id in doc_ids:
        chunks = [c for c in chunks_with_queries.values() if c["doc_id"] == doc_id]
        category = doc_id.split("_")[0] 
        chunks_data = []
        for chunk in chunks:
            chunk_data = {
                "id": chunk["id"],
                "chunk_id": chunk["chunk_id"],
                "content": chunk["content"],
                "queries": chunk["queries"],
                "queries_score": chunk["queries_score"]
            }
            chunks_data.append(chunk_data)
        chunks_document = {
            "doc_id": doc_id,
            "category": category,
            "chunks": chunks_data
        }

        outfile_name = f"{output_folder}/{category}/{doc_id}_negqueries.json"
        with open(outfile_name, "w", encoding="utf-8") as file:
            json.dump(chunks_document, file, indent=2)
            print(f"Saved {outfile_name} with {len(chunks_document['chunks'])} chunks.")

Saved resources/data/negqueries/legal-advice/legal-advice_56_negqueries.json with 22 chunks.
Saved resources/data/negqueries/eurlex/eurlex_28_negqueries.json with 1 chunks.
Saved resources/data/negqueries/memos/memos_75_negqueries.json with 16 chunks.
Saved resources/data/negqueries/eurlex/eurlex_75_negqueries.json with 2 chunks.
Saved resources/data/negqueries/legal-advice/legal-advice_16_negqueries.json with 7 chunks.
Saved resources/data/negqueries/eurlex/eurlex_7_negqueries.json with 2 chunks.
Saved resources/data/negqueries/memos/memos_61_negqueries.json with 9 chunks.
Saved resources/data/negqueries/memos/memos_71_negqueries.json with 3 chunks.
Saved resources/data/negqueries/memos/memos_78_negqueries.json with 7 chunks.
Saved resources/data/negqueries/legal-advice/legal-advice_97_negqueries.json with 5 chunks.
Saved resources/data/negqueries/memos/memos_83_negqueries.json with 9 chunks.
Saved resources/data/negqueries/memos/memos_52_negqueries.json with 2 chunks.
Saved resources

# Negative labels

In [None]:
# create negative labels

import random
from pyterrier_doc2query import QueryScorer
from pyterrier_dr import ElectraScorer

def create_negative_labels(labels_df, documents_df, queries_df, query_scorer):
    query_ids = labels_df["query_id"].tolist()
    random.shuffle(query_ids)

    # build df for Query Scorer
    query_list = [
        {
            "query_id": query_ids[i],
            "doc_id": labels_df['doc_id'][i],
            "chunk_id": labels_df['chunk_id'][i],
            "text": documents_df[documents_df["id"] == f"{labels_df['doc_id'][i]}__{labels_df['chunk_id'][i]}"]["content"].values[0],
            "querygen": queries_df[queries_df["query_id"] == query_ids[i]]["query"].values[0]
        }
        for i in range(len(labels_df))
    ]

    scores_df = query_scorer.transform(pd.DataFrame.from_records(query_list))
    return scores_df


In [51]:
positive_labels_df

Unnamed: 0,query_id,id,doc_id,chunk_id,label
0,0,eurlex_0__0,eurlex_0,0,Relevant
1,1,eurlex_0__0,eurlex_0,0,Relevant
2,2,eurlex_0__0,eurlex_0,0,Relevant
3,3,eurlex_0__1,eurlex_0,1,Relevant
4,4,eurlex_0__1,eurlex_0,1,Relevant
5,5,eurlex_11__0,eurlex_11,0,Relevant
6,6,eurlex_11__0,eurlex_11,0,Relevant
7,7,eurlex_11__1,eurlex_11,1,Relevant
8,8,eurlex_12__0,eurlex_12,0,Relevant
9,9,eurlex_12__0,eurlex_12,0,Relevant


In [58]:
positive_labels_df[["query_id", "id"]].values.tolist()

[[0, 'eurlex_0__0'],
 [1, 'eurlex_0__0'],
 [2, 'eurlex_0__0'],
 [3, 'eurlex_0__1'],
 [4, 'eurlex_0__1'],
 [5, 'eurlex_11__0'],
 [6, 'eurlex_11__0'],
 [7, 'eurlex_11__1'],
 [8, 'eurlex_12__0'],
 [9, 'eurlex_12__0'],
 [10, 'eurlex_12__1']]

In [54]:
query_ids = positive_labels_df[["query_id", "id"]].tolist()
#random.shuffle(query_ids)

AttributeError: 'DataFrame' object has no attribute 'tolist'

In [49]:
query_ids

[3, 10, 4, 9, 7, 0, 8, 6, 1, 5, 2]

In [None]:
n_runs = 1

negative_labels_df = pd.DataFrame()

for _ in range(n_runs):
     negative_labels_df = pd.concat(
          (
               negative_labels_df,
               create_negative_labels(
                    positive_labels_df, 
                    documents_df, 
                    queries_df, 
                    QueryScorer(ElectraScorer()))
               )
          )
negative_labels_df.reset_index(inplace=True)

ELECTRA scoring: 100%|██████████| 11/11 [00:00<?, ?record/s]


In [47]:
negative_labels_df

Unnamed: 0,index,query_id,doc_id,chunk_id,text,querygen,querygen_score
0,0,8,eurlex_0,0,Name: Decision (EU) 2019/276 of the European P...,when was the european commission decision on s...,[-4.6685557]
1,1,3,eurlex_0,0,Name: Decision (EU) 2019/276 of the European P...,when does the flexibility instrument apply for...,[1.41791]
2,2,0,eurlex_0,0,Name: Decision (EU) 2019/276 of the European P...,why is the flexibility instrument necessary,[2.5053818]
3,3,9,eurlex_0,1,(6) In order to allow for the quick use of the...,when is eu's decision on sustainable development,[-4.5256705]
4,4,6,eurlex_0,1,(6) In order to allow for the quick use of the...,when will the european solidarity fund be mobi...,[-3.3839595]
5,5,10,eurlex_11,0,Name: Decision (EU) 2018/508 of the European P...,what is the payment appropriation for the flex...,[-4.1214275]
6,6,5,eurlex_11,0,Name: Decision (EU) 2018/508 of the European P...,when was the european solidarity fund allocate...,[1.7820455]
7,7,2,eurlex_11,1,Article 2 This Decision shall enter into force...,what is the purpose of the flexibility instrument,[-10.536338]
8,8,1,eurlex_12,0,Name: Decision (EU) 2018/51 of the European Pa...,what is the purpose of the flexibility instrum...,[0.5726434]
9,9,7,eurlex_12,0,Name: Decision (EU) 2018/51 of the European Pa...,when is this decision in force?,[-2.5245578]


In [None]:
# create relevance
relevance = []
for i in range(len(negative_labels_df)):
    if negative_labels_df['querygen_score'][i][0] > threshold:
        relevance.append("Relevant")
    else:
        relevance.append("Irrelevant")


negative_labels_df["label"] = relevance
negative_labels_df = negative_labels_df[positive_labels_df.columns]

In [None]:
# concat with positive labels
labels_df = pd.concat((positive_labels_df, negative_labels_df))
labels_df.reset_index(inplace=True)

In [None]:
labels_df

In [None]:
labels_df.to_csv("dataset/legal-docs/label.csv", index=False, encoding="utf-8") 