In [1]:
%pip install beir

Collecting beir
  Downloading beir-2.0.0.tar.gz (53 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.6/53.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pytrec_eval (from beir)
  Downloading pytrec_eval-0.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting faiss_cpu (from beir)
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting elasticsearch==7.9.1 (from beir)
  Downloading elasticsearch-7.9.1-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting datasets (from beir)
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->beir)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets->beir)
  Downloa

# make prompt

In [2]:
import random

def get_messages(query, docs, labels, max_passage_length=2500):
    """
    Build a two-message chat sample that asks a model to rank passages for a query.

    Every passage is rendered as ``[<id>] <text>``. Text longer than
    ``max_passage_length`` characters is cut at that length and suffixed with
    ``...``. The rendered passages are shuffled with ``random.shuffle`` before
    they are joined into the prompt; ``labels`` is neither reordered nor copied.

    Parameters:
        query (str): The search query.
        docs (dict): Maps each document identifier to its passage text.
        labels (dict): Maps each document identifier to its relevance score.
            The same object is stored as the assistant message content.
        max_passage_length (int): Maximum number of characters kept per passage.

    Returns:
        list: ``[user_message, assistant_message]``, each a dict with the keys
        ``"role"`` and ``"content"``.
    """
    # Render one "[id] text" line per document, truncating over-long passages.
    rendered = []
    for doc_id, text in docs.items():
        ellipsis = '...' if len(text) > max_passage_length else ''
        rendered.append(f"[{doc_id}] {text[:max_passage_length]}{ellipsis}")

    # Randomise presentation order so position carries no relevance signal.
    random.shuffle(rendered)
    passage_count = len(rendered)

    instructions = (
        "You are RankLLM, an intelligent assistant that can rank passages based on their relevancy to the query. "
        f"I will provide you with {passage_count} passages, each indicated by a numerical identifier []. "
        f"Rank the passages based on their relevance to the search query: {query}.\n\n"
    )
    closing = (
        f"\n\nSearch Query: {query}\nRank the {passage_count} passages above based on their relevance to the search query. "
        "All the passages should be included and listed using identifiers, in descending order of relevance. "
        "The output format should be [] > [], e.g., [B] > [A]. Only respond with the ranking results; do not add any explanation."
    )
    prompt = instructions + "\n".join(rendered) + closing

    return [
        {"role": "user", "content": prompt},
        # The raw label mapping is the target; callers may convert it later.
        {"role": "assistant", "content": labels},
    ]

# generate validation set

In [3]:
def rank_passages_and_replace(data):
    """
    Turn the score mapping in the last message into a ranking string, in place.

    The last entry of ``data['messages']`` is expected to hold a dict of
    ``identifier -> score`` under ``'content'``. Identifiers are ordered by
    score from highest to lowest, with ties broken by ascending identifier,
    and joined as ``"[X] > [Y] > ..."``. That string overwrites the mapping.

    Parameters:
    data (dict): A record with a ``'messages'`` list whose last item carries the scores.

    Returns:
    dict: The same ``data`` object, after its last message content was replaced.
    """
    final_message = data['messages'][-1]

    def ordering(pair):
        # Negate the score so an ascending sort yields descending relevance;
        # the identifier is the tie-breaker.
        identifier, score = pair
        return (-score, identifier)

    ordered_pairs = list(final_message['content'].items())
    ordered_pairs.sort(key=ordering)

    final_message['content'] = " > ".join(
        f"[{identifier}]" for identifier, _ in ordered_pairs
    )
    return data



# general function

In [4]:
# prompt: make above a full function with input as the dataset name

import os
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
import json
import random
from tqdm import tqdm
import string

def process_dataset(dataset_name, data_dir=""):
    """
    Build a passage-ranking JSONL file from the train split of a BEIR dataset.

    The dataset is fetched with ``util.download_and_unzip`` into
    ``<data_dir>/datasets`` unless that folder already contains it. For every
    query in the qrels, up to three of its highest-scored judged documents are
    combined with randomly drawn documents that have no qrels entry for that
    query (labelled 0), giving five passages whenever the corpus is large
    enough. Each passage is cut to its first 100 characters, assigned a random
    uppercase-letter identifier, and turned into chat messages by
    ``get_messages``. One JSON object per query is written to
    ``<dataset_name>_data.jsonl`` in the current working directory.

    Parameters:
        dataset_name (str): BEIR dataset name, used in the download URL, the
            local folder name, the output file name and the row ids.
        data_dir (str): Directory under which the ``datasets`` folder lives.

    Returns:
        None. If loading the dataset raises, the error is printed and the
        function returns without writing a file.
    """
    max_relevant = 3      # judged documents kept per query
    docs_per_query = 5    # target number of passages per prompt
    snippet_chars = 100   # characters of passage text kept

    out_dir = os.path.join(data_dir, "datasets")
    data_path = os.path.join(out_dir, dataset_name)

    url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset_name}.zip"
    if not os.path.exists(data_path):
        data_path = util.download_and_unzip(url, out_dir)

    split = "train"
    try:
        corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split=split)
    except Exception as e:
        print(f"Error loading dataset {dataset_name}: {e}")
        return  # Nothing to process if the dataset cannot be loaded

    # Materialise the corpus ids once. Rebuilding a set difference against the
    # whole corpus for every query costs O(|corpus|) per query, and set
    # iteration order makes seeded runs non-reproducible.
    corpus_ids = list(corpus.keys())

    output_file = f"{dataset_name}_data.jsonl"
    with open(output_file, 'w', encoding="utf-8") as file:
        for i, (query_id, relevant_docs) in enumerate(
            tqdm(qrels.items(), desc="Processing Queries")
        ):
            query_text = queries[query_id]

            # Keep the highest-scored judged documents that exist in the corpus.
            top_docs = sorted(relevant_docs.items(), key=lambda x: x[1], reverse=True)[:max_relevant]
            docs = {doc_id: corpus[doc_id]['text'][:snippet_chars] for doc_id, _ in top_docs if doc_id in corpus}
            labels = {doc_id: score for doc_id, score in top_docs if doc_id in corpus}

            # Draw filler documents that are not judged for this query.
            # Over-drawing by len(relevant_docs) guarantees enough candidates
            # survive the filter; capping at the corpus size means a small
            # corpus yields fewer fillers instead of raising ValueError.
            needed = max(0, docs_per_query - len(docs))
            draw = min(len(corpus_ids), needed + len(relevant_docs))
            candidates = random.sample(corpus_ids, draw)
            random_non_relevant = [
                doc_id for doc_id in candidates if doc_id not in relevant_docs
            ][:needed]

            for doc_id in random_non_relevant:
                docs[doc_id] = corpus[doc_id]['text'][:snippet_chars]
                labels[doc_id] = 0  # Filler documents are labelled non-relevant

            # Replace the real document ids with distinct random capital letters.
            random_ids = random.sample(string.ascii_uppercase, len(docs))
            id_map = {old_id: new_id for old_id, new_id in zip(docs.keys(), random_ids)}

            updated_docs = {id_map[doc_id]: text for doc_id, text in docs.items()}
            updated_labels = {id_map[doc_id]: score for doc_id, score in labels.items()}

            conversation = get_messages(query_text, updated_docs, updated_labels)
            row = {
                "dataset": f"{dataset_name}",
                "id": f"{dataset_name}_{i}",  # running index over the qrels
                "messages": conversation
            }

            # One JSON object per line (JSONL).
            json.dump(row, file)
            file.write('\n')

  from tqdm.autonotebook import tqdm


In [5]:
import os
import json
import random
from tqdm import tqdm
import string
from beir import util
from beir.datasets.data_loader import GenericDataLoader


def process_dataset_with_dev(dataset_name, data_dir=""):
    """
    Build ranking examples from a BEIR train split and write dev and data files.

    The dataset is fetched with ``util.download_and_unzip`` into
    ``<data_dir>/datasets`` unless that folder already contains it. For every
    query in the qrels, up to three of its highest-scored judged documents are
    combined with randomly drawn documents that have no qrels entry for that
    query (labelled 0), giving five passages whenever the corpus is large
    enough. Passages are cut to 100 characters, relabelled with random
    uppercase letters and passed to ``get_messages``.

    The first 10 examples are written to ``<dataset_name>_dev.jsonl`` after
    ``rank_passages_and_replace`` has converted their label mapping into a
    ranking string. All remaining examples are written unchanged to
    ``<dataset_name>_data.jsonl``.

    Parameters:
        dataset_name (str): BEIR dataset name, used in the download URL, the
            local folder name, the output file names and the example ids.
        data_dir (str): Directory under which the ``datasets`` folder lives.

    Returns:
        None. If loading the dataset raises, the error is printed and the
        function returns without writing any file.
    """
    max_relevant = 3      # judged documents kept per query
    docs_per_query = 5    # target number of passages per prompt
    snippet_chars = 100   # characters of passage text kept
    dev_size = 10         # leading examples routed to the dev file

    out_dir = os.path.join(data_dir, "datasets")
    data_path = os.path.join(out_dir, dataset_name)

    url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset_name}.zip"
    if not os.path.exists(data_path):
        data_path = util.download_and_unzip(url, out_dir)

    split = "train"
    try:
        corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split=split)
    except Exception as e:
        print(f"Error loading dataset {dataset_name}: {e}")
        return

    # Materialise the corpus ids once. Rebuilding a set difference against the
    # whole corpus for every query costs O(|corpus|) per query, and set
    # iteration order makes seeded runs non-reproducible.
    corpus_ids = list(corpus.keys())

    examples = []

    for query_id, relevant_docs in tqdm(qrels.items(), desc="Processing Queries"):
        query_text = queries[query_id]

        # Keep the highest-scored judged documents that exist in the corpus.
        top_docs = sorted(relevant_docs.items(), key=lambda x: x[1], reverse=True)[:max_relevant]
        docs = {doc_id: corpus[doc_id]['text'][:snippet_chars] for doc_id, _ in top_docs if doc_id in corpus}
        labels = {doc_id: score for doc_id, score in top_docs if doc_id in corpus}

        # Draw filler documents that are not judged for this query.
        # Over-drawing by len(relevant_docs) guarantees enough candidates
        # survive the filter; capping at the corpus size means a small corpus
        # yields fewer fillers instead of raising ValueError.
        needed = max(0, docs_per_query - len(docs))
        draw = min(len(corpus_ids), needed + len(relevant_docs))
        candidates = random.sample(corpus_ids, draw)
        random_non_relevant = [
            doc_id for doc_id in candidates if doc_id not in relevant_docs
        ][:needed]

        for doc_id in random_non_relevant:
            docs[doc_id] = corpus[doc_id]['text'][:snippet_chars]
            labels[doc_id] = 0  # Filler documents are labelled non-relevant

        # Replace the real document ids with distinct random capital letters.
        random_ids = random.sample(string.ascii_uppercase, len(docs))
        id_map = {old_id: new_id for old_id, new_id in zip(docs.keys(), random_ids)}

        updated_docs = {id_map[doc_id]: text for doc_id, text in docs.items()}
        updated_labels = {id_map[doc_id]: score for doc_id, score in labels.items()}

        conversation = get_messages(query_text, updated_docs, updated_labels)

        examples.append({
            "dataset": f"{dataset_name}",
            "id": f"{dataset_name}_{len(examples)}",
            "messages": conversation
        })

    # First `dev_size` examples -> <dataset>_dev.jsonl, with the label mapping
    # replaced by the expected ranking string.
    with open(f"{dataset_name}_dev.jsonl", "w", encoding="utf-8") as dev_file:
        for example in examples[:dev_size]:
            example = rank_passages_and_replace(example)
            json.dump(example, dev_file)
            dev_file.write("\n")

    # Remaining examples -> <dataset>_data.jsonl, labels left as a mapping.
    with open(f"{dataset_name}_data.jsonl", "w", encoding="utf-8") as data_file:
        for example in examples[dev_size:]:
            json.dump(example, data_file)
            data_file.write("\n")

# Example usage: writes vihealthqa_data.jsonl using process_dataset.
# NOTE(review): this cell defines process_dataset_with_dev but calls
# process_dataset — confirm which function was meant to run here.
process_dataset("vihealthqa")


datasets/vihealthqa.zip:   0%|          | 0.00/2.13M [00:00<?, ?iB/s]

  0%|          | 0/9896 [00:00<?, ?it/s]

Processing Queries: 100%|██████████| 7009/7009 [00:10<00:00, 639.61it/s]


# modified dataset with random ids

In [6]:
# prompt: make above a full function with input as the dataset name
import string
import os
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
import json
import random
from tqdm import tqdm

def process_dataset_modified(dataset_name, data_dir=""):
    """
    Build a passage-ranking JSONL file whose passage ids carry a random prefix.

    The dataset is fetched with ``util.download_and_unzip`` into
    ``<data_dir>/datasets`` unless that folder already contains it. For every
    query in the qrels, up to three of its highest-scored judged documents are
    combined with randomly drawn documents that have no qrels entry for that
    query (labelled 0), giving five passages whenever the corpus is large
    enough. Each passage is cut to its first 100 characters. Its identifier is
    the original document id prefixed with 9 random characters drawn from
    ASCII letters and punctuation. The result is passed to ``get_messages``
    and one JSON object per query is written to ``<dataset_name>_data.jsonl``
    in the current working directory.

    Parameters:
        dataset_name (str): BEIR dataset name, used in the download URL, the
            local folder name, the output file name and the row ids.
        data_dir (str): Directory under which the ``datasets`` folder lives.

    Returns:
        None. If loading the dataset raises, the error is printed and the
        function returns without writing a file.
    """
    max_relevant = 3      # judged documents kept per query
    docs_per_query = 5    # target number of passages per prompt
    snippet_chars = 100   # characters of passage text kept

    # NOTE(review): string.punctuation contains '[', ']' and '>', which the
    # prompt also uses as identifier/ranking delimiters — confirm that ids
    # containing them are intended.
    char_pool = string.ascii_letters + string.punctuation

    def generate_random_prefix(length=9):
        # Defined once here instead of being re-created on every query.
        return ''.join(random.choices(char_pool, k=length))

    out_dir = os.path.join(data_dir, "datasets")
    data_path = os.path.join(out_dir, dataset_name)

    url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset_name}.zip"
    if not os.path.exists(data_path):
        data_path = util.download_and_unzip(url, out_dir)

    split = "train"
    try:
        corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split=split)
    except Exception as e:
        print(f"Error loading dataset {dataset_name}: {e}")
        return  # Nothing to process if the dataset cannot be loaded

    # Materialise the corpus ids once. Rebuilding a set difference against the
    # whole corpus for every query costs O(|corpus|) per query, and set
    # iteration order makes seeded runs non-reproducible.
    corpus_ids = list(corpus.keys())

    output_file = f"{dataset_name}_data.jsonl"
    with open(output_file, 'w', encoding="utf-8") as file:
        for i, (query_id, relevant_docs) in enumerate(
            tqdm(qrels.items(), desc="Processing Queries")
        ):
            query_text = queries[query_id]

            # Keep the highest-scored judged documents that exist in the corpus.
            top_docs = sorted(relevant_docs.items(), key=lambda x: x[1], reverse=True)[:max_relevant]
            docs = {doc_id: corpus[doc_id]['text'][:snippet_chars] for doc_id, _ in top_docs if doc_id in corpus}
            labels = {doc_id: score for doc_id, score in top_docs if doc_id in corpus}

            # Draw filler documents that are not judged for this query.
            # Over-drawing by len(relevant_docs) guarantees enough candidates
            # survive the filter; capping at the corpus size means a small
            # corpus yields fewer fillers instead of raising ValueError.
            needed = max(0, docs_per_query - len(docs))
            draw = min(len(corpus_ids), needed + len(relevant_docs))
            candidates = random.sample(corpus_ids, draw)
            random_non_relevant = [
                doc_id for doc_id in candidates if doc_id not in relevant_docs
            ][:needed]

            for doc_id in random_non_relevant:
                docs[doc_id] = corpus[doc_id]['text'][:snippet_chars]
                labels[doc_id] = 0  # Filler documents are labelled non-relevant

            # New id = random prefix + original id. The original ids are
            # distinct, so the prefixed ids are distinct as well.
            id_map = {old_id: f"{generate_random_prefix()}{old_id}" for old_id in docs.keys()}

            updated_docs = {id_map[doc_id]: text for doc_id, text in docs.items()}
            updated_labels = {id_map[doc_id]: score for doc_id, score in labels.items()}

            conversation = get_messages(query_text, updated_docs, updated_labels)
            row = {
                "dataset": f"{dataset_name}",
                "id": f"{dataset_name}_{i}",  # running index over the qrels
                "messages": conversation
            }

            # One JSON object per line (JSONL).
            json.dump(row, file)
            file.write('\n')

In [7]:
import json

def read_jsonl(file_path):
    """
    Read a JSONL file and return its records as a list.

    Each non-blank line is parsed with ``json.loads``. Blank or
    whitespace-only lines (for example a trailing empty line) are skipped
    instead of raising ``json.JSONDecodeError``. The file is decoded as UTF-8
    regardless of the platform's default encoding.

    Args:
        file_path (str): Path to the JSONL file.

    Returns:
        list: One parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    data = []
    with open(file_path, 'r', encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if not line:
                continue  # tolerate empty lines between or after records
            data.append(json.loads(line))
    return data



# generate all datasets

In [None]:
# Datasets to convert; each one produces <name>_data.jsonl and <name>_dev.jsonl.
datasets = ["vihealthqa", "nfcorpus", "scifact", "fiqa"]

for dataset in datasets:
    process_dataset_with_dev(dataset)
    # Sanity check: show the first record of the data file, then of the dev file.
    for file_path in (f"{dataset}_data.jsonl", f"{dataset}_dev.jsonl"):
        content = read_jsonl(file_path)
        print(content[0])



  0%|          | 0/9896 [00:00<?, ?it/s]

Processing Queries: 100%|██████████| 7009/7009 [00:08<00:00, 864.83it/s]


{'dataset': 'vihealthqa', 'id': 'vihealthqa_10', 'messages': [{'role': 'user', 'content': 'You are RankLLM, an intelligent assistant that can rank passages based on their relevancy to the query. I will provide you with 5 passages, each indicated by a numerical identifier []. Rank the passages based on their relevance to the search query: Lệch vách ngăn mũi phải làm sao?.\n\n[S] Theo những mô tả về tình trạng của anh, chị vẫn có thể tiêm chủng vaccine Covid-19 bình thường\n[R] Vách ngăn mũi là cấu trúc gồm sụn và xương để chia hốc mũi làm hai bên. Thông thường ai cũng có sự l\n[E] Trường hợp của anh/chị, bệnh nền đã được điều trị ổn định thì hoàn toàn có thể tiêm chủng vaccine ng\n[C] Đây là một trường hợp khó và hiếm gặp.\n[X] Trường hợp của bạn đã tiêm vaccine Covid-19 cách đây 10 ngày, sau tiêm một ngày bắt đầu có biểu hiện\n\nSearch Query: Lệch vách ngăn mũi phải làm sao?\nRank the 5 passages above based on their relevance to the search query. All the passages should be included and

datasets/nfcorpus.zip:   0%|          | 0.00/2.34M [00:00<?, ?iB/s]

  0%|          | 0/3633 [00:00<?, ?it/s]

Processing Queries: 100%|██████████| 2590/2590 [00:00<00:00, 2790.77it/s]


{'dataset': 'nfcorpus', 'id': 'nfcorpus_10', 'messages': [{'role': 'user', 'content': 'You are RankLLM, an intelligent assistant that can rank passages based on their relevancy to the query. I will provide you with 5 passages, each indicated by a numerical identifier []. Rank the passages based on their relevance to the search query: Phytates in Beans: Anti-Nutrient or Anti-Cancer?.\n\n[Q] Adequate fruit and vegetable intake was suggested to protect against colorectal cancer and colorecta\n[C] Background Prospective, randomized, pilot clinical study was conducted to evaluate the beneficial ef\n[N] The transforming growth factor-beta (TGF-beta) superfamily encompasses a large group of growth and d\n[F] Seven cyclists exercised at 70% of maximal O2 uptake (VO2max) until fatigue (170 +/- 9 min) on three\n[J] Introduction Matrix metalloproteinases (MMPs) have repeatedly been shown to play a very active role \n\nSearch Query: Phytates in Beans: Anti-Nutrient or Anti-Cancer?\nRank the 5 pass

datasets/scifact.zip:   0%|          | 0.00/2.69M [00:00<?, ?iB/s]

  0%|          | 0/5183 [00:00<?, ?it/s]

Processing Queries: 100%|██████████| 809/809 [00:00<00:00, 1490.09it/s]


{'dataset': 'scifact', 'id': 'scifact_10', 'messages': [{'role': 'user', 'content': 'You are RankLLM, an intelligent assistant that can rank passages based on their relevancy to the query. I will provide you with 5 passages, each indicated by a numerical identifier []. Rank the passages based on their relevance to the search query: 53% of perinatal mortality is due to low birth weight..\n\n[O] Type 1 diabetes (T1D) is characterized by the destruction of the insulin-producing β-cells of pancre\n[C] Members of the cationic host defense (antimicrobial) peptide family are widely distributed in nature\n[F] Limited neural input results in muscle weakness in neuromuscular disease because of a reduction in t\n[D] Satellite cells are skeletal muscle stem cells capable of self-renewal and differentiation after tra\n[H] CONTEXT One key target of the United Nations Millennium Development goals is to reduce the prevalenc\n\nSearch Query: 53% of perinatal mortality is due to low birth weight.\nRank 

datasets/fiqa.zip:   0%|          | 0.00/17.1M [00:00<?, ?iB/s]

  0%|          | 0/57638 [00:00<?, ?it/s]

Processing Queries:   5%|▌         | 290/5500 [00:02<00:39, 131.97it/s]