## LIMIT dataset construction

In [3]:
import pandas as pd

# Attribute list originally generated with gemini-pro-2.5 by the creators of the LIMIT dataset.
liked_items = pd.read_csv(
    "/kaggle/input/generated-attributes/generated_attributes.csv"
)["liked_item"].to_list()

# Pools of random first and last names, the ones originally used by the creators of the LIMIT dataset.
# First names, source: https://gist.github.com/ruanbekker/a1506f06aa1df06c5a9501cb393626ea#file-array-names-py
first_names = pd.read_csv(
    "/kaggle/input/random-names/first_names.csv"
)["first_name"].to_list()
# Most common last names, source: https://gist.github.com/craigh411/19a4479b289ae6c3f6edb95152214efc
last_names = pd.read_csv(
    "/kaggle/input/random-names/last_names.csv"
)["last_name"].to_list()


In [4]:
from itertools import combinations
import random

def generate_dataset(
    liked_items: list[str] = liked_items,
    first_names: list[str] = first_names,
    last_names: list[str] = last_names,
    num_of_docs: int = 46,
    max_num_of_docs: int = 50000,
    num_of_queries: int = 1000,
    limit: int = 46,
    top_k: int = 2,
    seed=None,
) -> None:
    """Build a LIMIT-style retrieval dataset and write it as three JSONL files.

    Every query has the form "Who likes <item>?" and is answered by exactly
    ``top_k`` of the ``num_of_docs`` "relevant" persons: queries are assigned,
    in order, to the ``top_k``-combinations of those persons. Each person
    document lists ``limit`` liked items. Distractor persons, whose items are
    never queried, pad the corpus up to ``max_num_of_docs`` documents.

    Files written to the current working directory:
        corpus.jsonl  -- one record per person: ``_id``, ``title``, ``text``
        queries.jsonl -- one record per query: ``_id``, ``text``
        qrels.jsonl   -- one record per (query, relevant person) pair:
                         ``query-id``, ``corpus-id``, ``score``

    Args:
        liked_items: Pool of attribute strings; duplicates are ignored.
        first_names: Pool of first names.
        last_names: Pool of last names.
        num_of_docs: Number of persons that are relevant to at least one query.
        max_num_of_docs: Total corpus size including distractors.
        num_of_queries: Number of items sampled as query targets. If there are
            fewer ``top_k``-combinations than this, only as many queries as
            combinations are produced.
        limit: Number of liked items listed in every document.
        top_k: Number of relevant documents per query.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so repeated calls write identical files; when ``None`` the
            global ``random`` state is used.

    Raises:
        ValueError: If the name pools cannot supply enough distinct full names,
            if a person is relevant to more than ``limit`` queries, or if an
            item pool is too small to sample from (raised by ``random.sample``).
    """
    rng = random if seed is None else random.Random(seed)

    # Upper bound on distinct "<first> <last>" strings. Without this guard the
    # rejection loop below would spin forever once the pool is exhausted.
    name_pool_bound = len(set(first_names)) * len(set(last_names))
    if max(num_of_docs, max_num_of_docs) > name_pool_bound:
        raise ValueError(
            f"Need {max(num_of_docs, max_num_of_docs)} distinct names but the name "
            f"lists can produce at most {name_pool_bound}."
        )

    used_names = set()

    def draw_unique_name() -> str:
        # Rejection-sample a full name that has not been handed out yet.
        while True:
            candidate = f"{rng.choice(first_names)} {rng.choice(last_names)}"
            if candidate not in used_names:
                used_names.add(candidate)
                return candidate

    # Relevant persons must be distinct: a repeated name would collapse two
    # documents into one and produce combinations pairing a person with itself.
    relevant_names = [draw_unique_name() for _ in range(num_of_docs)]
    liked_by_name = {name: [] for name in relevant_names}

    # De-duplicate while keeping input order so two queries never share an item.
    unique_items = list(dict.fromkeys(liked_items))
    selected_items = rng.sample(unique_items, num_of_queries)

    queries = []
    qrels = []
    query_plan = zip(combinations(relevant_names, top_k), selected_items)
    for index, (comb_of_names, item) in enumerate(query_plan):
        query_id = f"query_{index}"
        queries.append({"_id": query_id, "text": f"Who likes {item}?"})
        for name in comb_of_names:
            qrels.append({"query-id": query_id, "corpus-id": name, "score": 1})
            liked_by_name[name].append(item)

    # Filler items are never query targets. An ordered filter (rather than a set
    # difference) keeps the sampling population in a deterministic order.
    selected_lookup = set(selected_items)
    remaining_items = [item for item in unique_items if item not in selected_lookup]

    # Pad every relevant document up to `limit` items.
    # NOTE: queried items stay in front of the filler items in the final text.
    for name, items in liked_by_name.items():
        missing = limit - len(items)
        if missing < 0:
            raise ValueError(
                f"{name!r} is relevant to {len(items)} queries, which exceeds limit={limit}."
            )
        items += rng.sample(remaining_items, missing)

    # Distractor documents: unique names with filler-only items.
    for _ in range(max_num_of_docs - num_of_docs):
        liked_by_name[draw_unique_name()] = rng.sample(remaining_items, limit)

    corpus = pd.DataFrame([
        {
            "_id": name,
            "title": "",
            "text": f"{name} likes {', '.join(items)}."
        }
        for name, items in liked_by_name.items()
    ])
    corpus.to_json("corpus.jsonl", orient="records", lines=True)

    pd.DataFrame(queries).to_json("queries.jsonl", orient="records", lines=True)
    pd.DataFrame(qrels).to_json("qrels.jsonl", orient="records", lines=True)
    
    
# Build the dataset with the default arguments; this writes corpus.jsonl,
# queries.jsonl and qrels.jsonl into the current working directory.
generate_dataset()

## LIMIT vs other datasets

In [1]:
import pandas as pd
from itertools import combinations

def query_graph_metrics(qrels: pd.DataFrame):
    """Compute density and mean node strength of the query graph induced by qrels.

    Queries are the vertices. Two queries are joined by an edge when they share
    at least one relevant document (a row with ``score > 0``); the edge weight
    is the Jaccard similarity of their relevant-document sets.

    Args:
        qrels: Frame with the columns ``query-id``, ``corpus-id`` and ``score``.

    Returns:
        A ``(density, avg_strength)`` tuple where
        ``density`` is ``edges / (V * (V - 1) / 2)`` and ``avg_strength`` is the
        sum of incident edge weights per query, averaged over all ``V`` queries.
        Both are ``0`` when fewer than two queries have relevant documents.
    """
    # Relevant documents per query
    relevant_df = qrels[qrels['score'] > 0]
    query_to_docs = relevant_df.groupby('query-id')['corpus-id'].apply(set).to_dict()

    query_ids = list(query_to_docs)
    vertex_count = len(query_ids)

    # With zero or one vertex there is no possible pair; returning early also
    # avoids dividing by V * (V - 1) == 0 in the density formula.
    if vertex_count < 2:
        return 0, 0

    strength_of_query = dict.fromkeys(query_ids, 0)

    # Add an edge between every pair of queries that share relevant documents
    edge_count = 0
    for q1_id, q2_id in combinations(query_ids, 2):
        docs1 = query_to_docs[q1_id]
        docs2 = query_to_docs[q2_id]
        shared = len(docs1 & docs2)
        if shared == 0:
            continue

        # Jaccard similarity; |A u B| = |A| + |B| - |A n B| is positive here.
        jaccard_score = shared / (len(docs1) + len(docs2) - shared)
        strength_of_query[q1_id] += jaccard_score
        strength_of_query[q2_id] += jaccard_score
        edge_count += 1

    # Graph density: realised edges over the V * (V - 1) / 2 possible ones
    density = 2 * edge_count / (vertex_count * (vertex_count - 1))

    # Mean node strength (weighted degree); isolated queries count as 0
    if edge_count > 0:
        avg_strength = sum(strength_of_query.values()) / vertex_count
    else:
        avg_strength = 0

    return density, avg_strength
    

In [3]:
from datasets import load_dataset

# (label, reader, location, reader keyword arguments) for every benchmark whose
# qrels are compared; the order here is the order the results are printed in.
qrels_sources = [
    ("LIMIT", pd.read_json, "hf://datasets/orionweller/LIMIT/qrels.jsonl", {"lines": True}),
    ("core 17 ir", pd.read_parquet, "hf://datasets/mteb/Core17InstructionRetrieval/qrels/test-00000-of-00001.parquet", {}),
    ("hotpotqa", pd.read_json, "hf://datasets/mteb/hotpotqa/qrels/test.jsonl", {"lines": True}),
    ("scifact", pd.read_json, "hf://datasets/mteb/scifact/qrels/test.jsonl", {"lines": True}),
    ("nq", pd.read_json, "hf://datasets/mteb/nq/qrels/test.jsonl", {"lines": True}),
]

# Load each qrels table and report its (density, average strength) pair.
for dataset_label, read_qrels, qrels_location, reader_options in qrels_sources:
    qrels = read_qrels(qrels_location, **reader_options)
    print(dataset_label, query_graph_metrics(qrels))

LIMIT (0.08548148148148148, 28.4653333333333)
core 17 ir (0.02564102564102564, 0.5911717092249338)
hotpotqa (3.735411739771666e-05, 0.11037587215845267)
scifact (0.0014492753623188406, 0.4222222222222222)
nq (0.0, 0)


In [4]:
# Read the three JSONL files of the custom LIMIT-style dataset and convert
# each one to a pandas DataFrame.
custom_corpus = load_dataset(
    "json", data_files="/kaggle/input/custom-limit/corpus.jsonl", split="all"
).to_pandas()
custom_qrels = load_dataset(
    "json", data_files="/kaggle/input/custom-limit/qrels.jsonl", split="all"
).to_pandas()
custom_queries = load_dataset(
    "json", data_files="/kaggle/input/custom-limit/queries.jsonl", split="all"
).to_pandas()

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
# Query-graph metrics for the custom qrels loaded from /kaggle/input/custom-limit.
print("custom_limit", query_graph_metrics(custom_qrels))

custom_limit (0.08548148148148148, 28.4653333333333)
