In [1]:
import os
import json


def load_jsonl(file_path):
    """Read a JSONL file into a dictionary keyed by each entry's 'question'.

    Args:
        file_path: Path to a UTF-8 encoded JSON Lines file. Every non-blank
            line must be a JSON object containing a 'question' field.

    Returns:
        A dict mapping each question to the rest of its entry (the 'question'
        key itself is removed from the stored value). If the same question
        appears more than once, the last occurrence wins.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an entry has no 'question' field.
    """
    data = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. extra trailing newlines) are not records;
                # skip them instead of letting json.loads fail on "".
                continue
            entry = json.loads(line)
            entry_id = entry.pop('question')  # each entry is expected to have a unique 'question' field
            data[entry_id] = entry
    return data

In [None]:
# Model names; each one is used as the sub-directory of a dataset folder
# when building the training-data path.
model_names = [
    'contriever-base-msmarco',
    'e5-base-v2',
    'msmarco-distilbert-base-tas-b',
]

# Dataset folder names looked up under ../beir_datasets/.
datasets = [
    "scifact",
    "nfcorpus",
    "fiqa",
    "legalbenchrag",
    "hotpotqa",
    "fever",
    "nq-train",
]

In [3]:
# Sanity check: within one entry, a passage should not be listed both as a
# positive and as a hard negative. Scan every (model, dataset) training file
# that exists and report how many entries violate this.
for model_name in model_names:
    for dataset in datasets:
        path = f"../beir_datasets/{dataset}/{model_name}/training_data.jsonl"
        if not os.path.exists(path):
            print(f"{dataset} not preprocessed with {model_name}. Skipping...")
            continue
        data = load_jsonl(path)

        # Number of entries with at least one positive that also appears among
        # the hard negatives. Counting once per entry (and scanning the whole
        # file rather than stopping at the first hit) is what makes the
        # "% of the dataset" figure below meaningful.
        problematic_matches = 0
        for elem in data.values():
            clash = None
            for p in elem['positive_ctxs']:
                # Passages are compared as "<title> <text>"; build the
                # positive's key once instead of once per hard negative.
                positive_key = p['title'] + " " + p['text']
                for hn in elem.get('hard_negative_ctxs', []):
                    if positive_key == hn['title'] + " " + hn['text']:
                        clash = (p, hn)
                        break
                if clash is not None:
                    break
            if clash is None:
                continue

            # Print only the first offending pair to keep the output short,
            # but keep scanning so the final count covers the whole file.
            if problematic_matches == 0:
                p, hn = clash
                print("Found a problematic match!")
                print("POSITIVE:\n",p['title'][:50] + " | " + p["text"][:50])
                print("HN:\n",hn['title'][:50] + " | " + hn["text"][:50])
            problematic_matches += 1

        # An empty training file would otherwise raise ZeroDivisionError.
        percentage = problematic_matches / len(data) * 100 if data else 0.0
        print(f"[{model_name}] [{dataset}] Number of problematic matches: {problematic_matches}, which is {percentage:.2f}% of the dataset.")

[contriever-base-msmarco] [scifact] Number of problematic matches: 0, which is 0.00% of the dataset.
[e5-base-v2] [scifact] Number of problematic matches: 0, which is 0.00% of the dataset.
[msmarco-distilbert-base-tas-b] [scifact] Number of problematic matches: 0, which is 0.00% of the dataset.


In [None]:
# `data` is whatever the scan cell loaded last, i.e. the final
# (model, dataset) pair whose training file exists.
print(f"The training data is composed of {len(data)} entries.")
# Peek at one entry without copying every item into a list; evaluates to
# None (no output) for an empty dataset instead of raising IndexError.
next(iter(data.items()), None)

The training data is composed of 646 entries.


('70% of burn patients are admitted for hospitalization and further treatment after appearing at hospital emergency wards or outpatient clinics.',
 {'positive_ctxs': [{'title': 'ABC of burns. Introduction.',
    'text': 'Burns are one of the most devastating conditions encountered in medicine. The injury represents an assault on all aspects of the patient, from the physical to the psychological. It affects all ages, from babies to elderly people, and is a problem in both the developed and developing world. All of us have experienced the severe pain that even a small burn can bring. However the pain and distress caused by a large burn are not limited to the immediate event. The visible physical and the invisible psychological scars are long lasting and often lead to chronic disability. Burn injuries represent a diverse and varied challenge to medical and paramedical staff. Correct management requires a skilled multidisciplinary approach that addresses all the problems facing a burn pati