In [63]:
import os
import json
from tqdm import tqdm
import html

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re


In [64]:
# Create the "DocTag_LongEval_conf" directory if it doesn't already exist
output_dir = "DocTag_LongEval_conf_french"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

In [65]:
def clean_text(text):
    # Replace escape characters with their corresponding characters
    cleaned_text = html.unescape(text)

    # Remove \u00 sequences
    cleaned_text = re.sub(r"\\u00([a-zA-Z0-9]{2})", lambda m: chr(int(m.group(1), 16)), cleaned_text)

    # Replace multiple spaces with a single space
    cleaned_text = re.sub(r"\s+", " ", cleaned_text)

    # Remove special characters and numbers
    cleaned_text = re.sub(r"[^a-zA-Z]", " ", cleaned_text)

    tokens = word_tokenize(cleaned_text)

    cleaned_text = " ".join(tokens)

    return cleaned_text.strip()

In [66]:
def preprocess_Json_files(input_dir, output_file, doc_ids):
    preprocessed_data = []

    # Get the list of JSON files in the input directory
    input_files = [f for f in os.listdir(input_dir) if f.endswith('.json')]

    # Iterate through each file in the input directory with a progress bar
    for filename in tqdm(input_files, desc="Processing JSON files"):
        # Open the file and load the data into a Python object
        with open(os.path.join(input_dir, filename), 'r') as infile:
            data = json.load(infile)

        # Preprocess and convert the relevant data into the format expected by DocTAG
        for item in data:
            doc_id = item['id']
            if doc_id in doc_ids:
                contents = item['contents']
                contents = clean_text(contents)
                preprocessed_data.append({
                    "document_id": doc_id,   
                    "text": contents,
                })

    # Wrap the preprocessed data in a dictionary with the "collection" key
    output_data = {"collection": preprocessed_data}

    # Save the preprocessed data to a new JSON file
    with open(output_file, 'w') as outfile:
        json.dump(output_data, outfile, indent=2)

    print(f"Preprocessed data saved to {output_file}")

In [67]:
# 1) Save LongEval/documents collection
qrels_file = "./publish/French/stratified-pool-train-1.txt" # path to file with queries_id, documents_id 
qrels_file_description = "./publish/French/Queries/queries_description.txt" # path to file with queries_id, queries_description
input_dir = "./publish/French/Documents/Json/"     # Directory with Json files of the collection
output_file = "collection.json"     # path to save the collection for Doctag

doc_ids = []

with open(qrels_file, 'r') as f:
    for line in f:
        topic_id, doc_id= line.strip().split()
        doc_ids.append((doc_id))

doc_ids = list(set(doc_ids))  # remove duplicates from documents list
doc_ids = sorted(doc_ids, key=str.lower) 
    
preprocess_Json_files(input_dir, output_file, doc_ids)

# Move the preprocessed data file to the "DocTag_LongEval_conf" directory
os.rename(output_file, os.path.join(output_dir, output_file))


Processing JSON files: 100%|█████████████████████████████████████████████████████████| 158/158 [01:09<00:00,  2.26it/s]

Preprocessed data saved to collection.json





In [68]:
# 2) Save LongEval topics

topic_ids = []

with open(qrels_file, 'r') as f:   # open the files with queries_id, documents_id to extract the queries id
    for line in f:
        topic_id, doc_id= line.strip().split()
        topic_ids .append((topic_id))
        
topic_ids = sorted(set(topic_ids))  # Sort and remove duplicates


filtered_queries = []

with open(qrels_file_description, "r") as file:  # open the files with queries_id and description
    for line in file:
        line = line.strip() 
        if line:  # Skip empty lines
            id_, text = line.split("\t", 1)
            if id_ in topic_ids:
                filtered_queries.append((id_, text))

print(filtered_queries) # Our queries and description filtered from the full list




topics = []
for topic_id, text in filtered_queries:
    topics.append({
        "topic_id": topic_id,
        "title": text,
        "description": "",
        "narrative": ""
    })

output_data = {"topics": topics}

output_file = os.path.join(output_dir, "topics.json")
with open(output_file, 'w') as outfile:
    json.dump(output_data, outfile, indent=2)

print(f"Topics saved to {output_file}")

[('q06221312', 'groupama espace client'), ('q062217180', 'fuseau horaire'), ('q06224677', 'impots gouv simulateur'), ('q06227537', 'gateau a la banane'), ('q06227968', 'location voiture gare de lyon'), ('q062216056', 'achat voiture Ã©lectrique'), ('q062216960', 'emploi store'), ('q062217348', 'hotel blois'), ('q062217472', "jeux d'argent"), ('q062220278', 'franÃ§ois sureau')]
Topics saved to DocTag_LongEval_conf_french\topics.json


In [69]:
# 3) Save LongEval runs

# Read the topic_doc_pairs list from the qrels file
doc_topic_pairs = []
with open(qrels_file, 'r') as f:
    for line in f:
        topic_id, doc_id= line.strip().split()
        doc_topic_pairs.append((doc_id, topic_id))
        
runs = []
for topic_id in set([pair[1] for pair in doc_topic_pairs]):
    documents = [{"document_id": pair[0], "language": "french"} for pair in doc_topic_pairs if pair[1] == topic_id]
    runs.append({"topic_id": topic_id, "documents": documents})

output_data = {"run": runs}

# Save the output to a file in the "DocTag_LongEval_conf" directory
output_file = os.path.join(output_dir, "run.json")
with open(output_file, 'w') as outfile:
    json.dump(output_data, outfile, indent=2)

print(f"Output saved to {output_file}")


Output saved to DocTag_LongEval_conf_french\run.json


In [70]:
# 4) LongEval label have been done manually