In [1]:
import ir_datasets

# pip install ir_datasets watermark


In [2]:
%load_ext watermark


In [3]:
%watermark -v -m -p ir_datasets


Python implementation: CPython
Python version       : 3.11.0
IPython version      : 8.14.0

ir_datasets: 0.5.5

Compiler    : GCC 11.3.0
OS          : Linux
Release     : 5.15.0-89-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 16
Architecture: 64bit



## BEIR

In [4]:
# BEIR datasets to draw examples from. Every identifier has the form
# "beir/<suffix>", so only the suffixes are spelled out below.
beir_dataset_names = [
    f"beir/{suffix}"
    for suffix in (
        "arguana",
        "climate-fever",
        "cqadupstack/android",
        "dbpedia-entity/test",
        "fever/test",
        "fiqa/test",
        "hotpotqa/test",
        "msmarco/test",
        "nfcorpus/test",
        "nq",
        "quora/test",
        "scidocs",
        "scifact/test",
        "trec-covid",
        "webis-touche2020/v2",
    )
]
print(len(beir_dataset_names))

# BEIR datasets not covered by the list above:
#  signal1m
#  bioasq
#  trec-news
#  robust04


15


In [5]:

def get_qrel_examples(dataset_name: str, num_examples: int = 2) -> dict:
    """Pick one top-relevance document for each of the first `num_examples` queries.

    The qrels of the dataset are scanned in iteration order. Only judgements
    whose relevance equals the highest relevance level listed in the qrels
    metadata are considered, and for every query the first such document wins.

    Args:
        dataset_name: Identifier passed to `ir_datasets.load`.
        num_examples: Maximum number of distinct queries to collect.

    Returns:
        A dictionary query_id -> doc_id. It holds fewer than `num_examples`
        entries when the qrels run out first, and is empty when
        `num_examples` is zero or negative.
    """
    if num_examples <= 0:
        # Nothing requested: avoid scanning the whole qrels file.
        return {}

    dataset = ir_datasets.load(dataset_name)
    relevance_counts = dataset.qrels.metadata["fields"]["relevance"]["counts_by_value"]
    # Convert every key to int *before* taking the maximum. The keys may be
    # strings (e.g. when the metadata is loaded from JSON), and max() on
    # strings compares lexicographically, so "2" would beat "10".
    highest_relevance = max(int(value) for value in relevance_counts)

    query2doc = {}  # query_id -> one relevant doc_id
    for qrel in dataset.qrels_iter():
        if qrel.relevance != highest_relevance or qrel.query_id in query2doc:
            continue
        query2doc[qrel.query_id] = qrel.doc_id
        if len(query2doc) >= num_examples:
            break
    return query2doc

def doc_as_text(dataset, doc):
    """Render a document as text, one `field: value` line per field.

    The field names are the annotated attributes of the class returned by
    `dataset.docs_cls()`; each value is read from `doc` with `getattr`.
    The lines are joined with newlines, without a trailing newline.
    """
    field_names = dataset.docs_cls().__annotations__
    lines = []
    for name in field_names:
        value = getattr(doc, name)
        lines.append(f"{name}: {value}")
    return "\n".join(lines)

def query_as_text(dataset, doc):
    """Render a query as text, one `field: value` line per field.

    Despite its name, the `doc` argument is the query record to render.
    The field names are the annotated attributes of the class returned by
    `dataset.queries_cls()`; each value is read from the record with
    `getattr`. The lines are joined with newlines.
    """
    rendered = []
    for field in dataset.queries_cls().__annotations__:
        rendered.append(f"{field}: {getattr(doc, field)}")
    return "\n".join(rendered)

def get_qrel_texts(dataset, qrel_dict: dict) -> list[tuple[str, str]]:
    """Turn query_id -> doc_id pairs into (query_text, doc_text) pairs.

    Args:
        dataset: Dataset object providing `queries.lookup` and `docs.lookup`.
        qrel_dict: Dictionary mapping a query_id to one relevant doc_id.

    Returns:
        A list with one `(query_text, doc_text)` tuple per entry of
        `qrel_dict`, in the dictionary's iteration order. The texts are
        produced by `query_as_text` and `doc_as_text`.
    """
    examples = []
    for query_id, doc_id in qrel_dict.items():
        # Resolve both ids to full records before rendering them.
        query = dataset.queries.lookup(query_id)
        doc = dataset.docs.lookup(doc_id)
        examples.append((query_as_text(dataset, query), doc_as_text(dataset, doc)))
    return examples

def get_dataset_examples(dataset_name: str, num_examples: int = 2) -> list[tuple[str, str]]:
    """Get up to `num_examples` (query_text, relevant_doc_text) pairs for a dataset.

    Args:
        dataset_name: Identifier passed to `ir_datasets.load`.
        num_examples: Number of queries to sample, forwarded to
            `get_qrel_examples`.

    Returns:
        The list of `(query_text, doc_text)` tuples built by `get_qrel_texts`
        from the query_id -> doc_id pairs chosen by `get_qrel_examples`.
    """
    qrel_dict = get_qrel_examples(dataset_name, num_examples)
    dataset = ir_datasets.load(dataset_name)
    return get_qrel_texts(dataset, qrel_dict)


In [6]:
# print N examples for each dataset:

for dataset_name in beir_dataset_names:
    examples = get_dataset_examples(dataset_name, 2)
    for i, (query_text, doc_text) in enumerate(examples):
        # A single print emits the whole example; the trailing empty string
        # produces the blank line that separates consecutive examples.
        print(
            f"— Example #{i+1} from {dataset_name}",
            " [Query]",
            query_text,
            " [One relevant document]",
            doc_text,
            "",
            sep="\n",
        )
    # Extra blank line between datasets.
    print()


— Example #1 from beir/arguana
 [Query]
query_id: test-environment-aeghhgwpe-pro02a
text: Being vegetarian helps the environment  Becoming a vegetarian is an environmentally friendly thing to do. Modern farming is one of the main sources of pollution in our rivers. Beef farming is one of the main causes of deforestation, and as long as people continue to buy fast food in their billions, there will be a financial incentive to continue cutting down trees to make room for cattle. Because of our desire to eat fish, our rivers and seas are being emptied of fish and many species are facing extinction. Energy resources are used up much more greedily by meat farming than my farming cereals, pulses etc. Eating meat and fish not only causes cruelty to animals, it causes serious harm to the environment and to biodiversity. For example consider Meat production related pollution and deforestation  At Toronto’s 1992 Royal Agricultural Winter Fair, Agriculture Canada displayed two contrasting statist