# Processing dataset

In [1]:
from glob import glob

def list_all_files(directory):
    """Recursively list every regular file under ``directory``.

    Args:
        directory: Root folder to search. The pattern ``<directory>/**/*``
            is expanded with a recursive glob, so glob's default matching
            rules apply (names starting with a dot are not matched).

    Returns:
        A sorted list of file paths with every backslash replaced by a
        forward slash. Directories are filtered out. A folder that is empty
        or does not exist yields an empty list.
    """
    # Imported here so the function does not rely on a different notebook
    # cell having imported ``os`` before it is called.
    import os

    all_paths = glob(directory + '/**/*', recursive=True)
    # Sorting makes the listing independent of the order in which the file
    # system happens to return directory entries.
    all_files = sorted(f.replace("\\", "/") for f in all_paths if os.path.isfile(f))
    return all_files

In [None]:
import json
import os
import random 

def load_all_chunks(base_folder):
    """Build a ``{chunk id: chunk}`` map from every file under ``base_folder``.

    Every file returned by ``list_all_files(base_folder)`` is parsed as JSON.
    Each entry of its ``"chunks"`` list is tagged with the file's ``"doc_id"``
    and stored under the entry's own ``"id"``.

    Args:
        base_folder: Folder that is searched recursively for input files.

    Returns:
        dict mapping chunk id to the chunk dict (with ``"doc_id"`` added).
        If two chunks share an id, the one read last replaces the earlier one.
    """
    chunk_index = {}
    for path in list_all_files(base_folder):
        with open(path, "r", encoding="utf-8") as handle:
            document = json.load(handle)
        for entry in document["chunks"]:
            # Remember which document the chunk came from.
            entry["doc_id"] = document["doc_id"]
            chunk_index[entry["id"]] = entry
    return chunk_index


def random_select_chunks(chunks_with_queries, num_samples=5, exclude_doc_id=None):
    """Draw ``num_samples`` distinct chunks at random, optionally skipping one document.

    Args:
        chunks_with_queries: Mapping of chunk id -> chunk dict.
        num_samples: Number of chunks to draw (without replacement).
        exclude_doc_id: When not ``None``, chunks of this document are never
            drawn. A chunk belongs to the document when its ``"doc_id"``
            equals ``exclude_doc_id`` exactly; chunks that carry no
            ``"doc_id"`` fall back to a prefix test on the chunk id.

    Returns:
        List of the selected chunk dicts, in the order they were sampled.

    Raises:
        ValueError: If fewer than ``num_samples`` chunks remain after the
            exclusion.
    """
    def _is_excluded(chunk_id, chunk):
        if exclude_doc_id is None:
            return False
        doc_id = chunk.get("doc_id") if isinstance(chunk, dict) else None
        if doc_id is not None:
            # Exact comparison: a bare prefix test on the chunk id would make
            # "guidance_5" also exclude the chunks of "guidance_56".
            return doc_id == exclude_doc_id
        # Fallback for chunks without a "doc_id": keep the id-prefix rule.
        return chunk_id.startswith(exclude_doc_id)

    # Candidates stay in the mapping's iteration order (no detour through a
    # set), so seeding ``random`` gives the same draw for the same mapping.
    available_ids = [
        chunk_id
        for chunk_id, chunk in chunks_with_queries.items()
        if not _is_excluded(chunk_id, chunk)
    ]

    if len(available_ids) < num_samples:
        raise ValueError("Not enough unique chunks available to select the requested number of samples.")

    selected_ids = random.sample(available_ids, num_samples)
    return [chunks_with_queries[chunk_id] for chunk_id in selected_ids]


def compute_query_content_matches(chunks_with_queries, num_samples=5):
    """Pair every existing query with a fresh random draw of chunks.

    Chunks without a ``"queries"`` entry contribute no queries. For each
    query of the remaining chunks, ``num_samples`` chunks are drawn from the
    whole map via ``random_select_chunks`` with no document excluded, so the
    chunk a query came from may itself be drawn.

    Args:
        chunks_with_queries: Mapping of chunk id -> chunk dict.
        num_samples: Number of chunks drawn for each query.

    Returns:
        A list with one inner list per query. Each inner list holds one
        record per drawn chunk with the keys ``"text"`` (the chunk content),
        ``"querygen"`` (the query) and ``"id"`` (the chunk id).
    """
    batches = []
    for source_chunk in chunks_with_queries.values():
        if "queries" not in source_chunk:
            continue
        for query in source_chunk["queries"]:
            sampled = random_select_chunks(chunks_with_queries, num_samples=num_samples, exclude_doc_id=None)
            batch = []
            for candidate in sampled:
                batch.append({"text": candidate["content"], "querygen": query, "id": candidate["id"]})
            batches.append(batch)
    return batches



In [None]:
import pandas as pd
from pyterrier_doc2query import QueryScorer
from pyterrier_dr import ElectraScorer

query_scorer = QueryScorer(ElectraScorer())
base_folder = "resources/data/queries"

# Load every chunk from the query files under base_folder into a map
# {chunk id: chunk data}.
chunks_with_queries = load_all_chunks(base_folder)
# For each existing query, draw num_samples random chunks (content and id).
# NOTE(review): the sampling is not seeded, so each run draws different chunks.
query_content_lists = compute_query_content_matches(chunks_with_queries, num_samples=5)

total_queries = len(query_content_lists)
for i, query_content_list in enumerate(query_content_lists):

    # Score one query against its randomly drawn chunks; the result rows are
    # read back as dicts carrying "id", "querygen" and "querygen_score".
    scores = query_scorer.transform(pd.DataFrame.from_records(query_content_list)).to_dict(orient='records')
    for score in scores:
        # Append the query and its score to the drawn chunk. setdefault is
        # used because a chunk without a "queries" entry can be drawn too
        # and would otherwise raise KeyError here.
        scored_chunk = chunks_with_queries[score["id"]]
        scored_chunk.setdefault("queries", []).append(score["querygen"])
        scored_chunk.setdefault("queries_score", []).append(float(score["querygen_score"][0]))

    print(f"Processing {i+1}/{total_queries} query...")
    
# At the end, every chunk in the map carries its original queries plus the randomly matched ones, each with a score that may be positive or negative.
# A score <= 1.5 (the threshold) is considered Irrelevant; anything above it is considered Relevant.

In [10]:
import os 
# Root folder under which this cell writes the CSV export and the per-document JSON files.
output_folder = "resources/data/negqueries"

def save_data_with_queries(chunks_with_queries, output_folder):
    """Write one ``<doc_id>_negqueries.json`` file per document.

    Chunks are grouped by their ``"doc_id"``. The category of a document is
    the part of its id before the first underscore, and the file is written
    to ``<output_folder>/<category>/<doc_id>_negqueries.json``. Missing
    folders (the root and each category folder) are created.

    Args:
        chunks_with_queries: Mapping of chunk id -> chunk dict. Each chunk
            must provide ``"doc_id"``, ``"id"``, ``"chunk_id"``,
            ``"content"``, ``"queries"`` and ``"queries_score"``.
        output_folder: Root folder for the generated files.

    Returns:
        None. One line is printed per file written.

    Raises:
        KeyError: If a chunk lacks one of the required keys.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Group the chunks by document in a single pass instead of rescanning
    # the whole map once per document.
    chunks_by_doc = {}
    for chunk in chunks_with_queries.values():
        chunks_by_doc.setdefault(chunk["doc_id"], []).append(chunk)

    for doc_id, chunks in chunks_by_doc.items():
        category = doc_id.split("_")[0]
        # Only these fields are persisted; "doc_id" is stored once at the
        # document level rather than on every chunk.
        chunks_data = [
            {
                "id": chunk["id"],
                "chunk_id": chunk["chunk_id"],
                "content": chunk["content"],
                "queries": chunk["queries"],
                "queries_score": chunk["queries_score"]
            }
            for chunk in chunks
        ]
        chunks_document = {
            "doc_id": doc_id,
            "category": category,
            "chunks": chunks_data
        }

        # The category sub-folder must exist before the file can be opened;
        # creating only output_folder is not enough.
        category_folder = f"{output_folder}/{category}"
        os.makedirs(category_folder, exist_ok=True)

        outfile_name = f"{category_folder}/{doc_id}_negqueries.json"
        with open(outfile_name, "w", encoding="utf-8") as file:
            json.dump(chunks_document, file, indent=2)
        print(f"Saved {outfile_name} with {len(chunks_document['chunks'])} chunks.")
            
            
# The CSV is written before save_data_with_queries runs, so the output folder
# has to be created here; otherwise to_csv fails when the folder is missing.
os.makedirs(output_folder, exist_ok=True)
# Flat export of the whole map: one row per chunk, without the index column.
pd.DataFrame.from_dict(chunks_with_queries, orient='index').to_csv(f"{output_folder}/chunks_with_queries123.csv", index=False)
# Per-document JSON files, grouped into one sub-folder per category.
save_data_with_queries(chunks_with_queries, output_folder)

Saved resources/data/negqueries/legal-advice/legal-advice_72_negqueries.json with 4 chunks.
Saved resources/data/negqueries/eurlex/eurlex_62_negqueries.json with 2 chunks.
Saved resources/data/negqueries/guidance/guidance_25_negqueries.json with 78 chunks.
Saved resources/data/negqueries/eurlex/eurlex_4_negqueries.json with 2 chunks.
Saved resources/data/negqueries/memos/memos_43_negqueries.json with 3 chunks.
Saved resources/data/negqueries/legal-advice/legal-advice_13_negqueries.json with 4 chunks.
Saved resources/data/negqueries/guidance/guidance_5_negqueries.json with 10 chunks.
Saved resources/data/negqueries/guidance/guidance_63_negqueries.json with 19 chunks.
Saved resources/data/negqueries/eurlex/eurlex_43_negqueries.json with 2 chunks.
Saved resources/data/negqueries/eurlex/eurlex_1_negqueries.json with 2 chunks.
Saved resources/data/negqueries/memos/memos_20_negqueries.json with 14 chunks.
Saved resources/data/negqueries/guidance/guidance_56_negqueries.json with 6 chunks.
Sav