Put all datasets in appropriate format:

* Folder data/dataset_corpus/ with docs-**.jsonl.gz files for each dataset, with fields docid, title, text. (FILL TITLE WITH AN EMPTY STRING IF MISSING)
* Folder data/dataset_eval/ with:
    * queries.dataset.csv with cols id, query, WITH header. COMMA SEP.
    * topics.dataset.tsv with qid, query, NO header. TAB SEP.
    * qrels.dataset.tsv with qid, Q0, docid, rel (only 1s), NO header. TAB SEP.

In [3]:
import os
import gzip
import json
from pathlib import Path

import pandas as pd
import datasets
from tqdm import tqdm

In [4]:
# Maximum number of documents written to a single docs-NN.jsonl.gz shard.
batch_size = 500_000

def write_batch_to_jsonl_gz(docs, outdir, file_index):
    """Write one batch of documents to ``<outdir>/docs-<file_index>.jsonl.gz``.

    Every document becomes one JSON object on its own line in a
    gzip-compressed UTF-8 text file; non-ASCII characters are written
    verbatim (``ensure_ascii=False``).

    Args:
        docs: Iterable of JSON-serialisable records (e.g. dicts).
        outdir: Destination directory. It is created, including parents,
            when it does not exist yet.
        file_index: Integer shard number, zero-padded to at least two digits
            in the file name.

    Returns:
        None. A file with the same name is overwritten.
    """
    out_dir = Path(outdir)
    # Creating the directory here lets the function be used on its own,
    # without relying on the caller having prepared the folder.
    out_dir.mkdir(parents=True, exist_ok=True)
    out_file = out_dir / f"docs-{file_index:02d}.jsonl.gz"
    with gzip.open(out_file, "wt", encoding="utf-8") as f_out:
        for doc in docs:
            f_out.write(json.dumps(doc, ensure_ascii=False) + "\n")


## mmarco

In [8]:
# Output folders for the mMARCO evaluation files and the converted corpus.
eval_outdir = "../../data/mmarco_eval"
docs_outdir = "../../data/mmarco_corpus"

In [7]:
# DOCS (fields: text)
mmarco_collection = datasets.load_dataset(
    'unicamp-dl/mmarco', 'collection-spanish', trust_remote_code=True
)
ds_docs = mmarco_collection["collection"].rename_column("id", "docid")
n_docs = len(ds_docs)
# The target format requires a title field; fill it with empty strings.
ds_docs = ds_docs.add_column("title", [""] * n_docs)

# Write the corpus in shards of at most `batch_size` documents each.
Path(docs_outdir).mkdir(parents=True, exist_ok=True)
for file_index, start in enumerate(range(0, n_docs, batch_size)):
    stop = min(start + batch_size, n_docs)
    write_batch_to_jsonl_gz(ds_docs.select(range(start, stop)), docs_outdir, file_index)

Repo card metadata block was not found. Setting CardData to empty.


In [9]:
# QUERIES (dev split); the "text" column becomes "query".
ds_queries = datasets.load_dataset(
    'unicamp-dl/mmarco', 'queries-spanish', trust_remote_code=True
)
df_queries = ds_queries["dev"].to_pandas().rename(columns={"text": "query"})

Path(eval_outdir).mkdir(parents=True, exist_ok=True)
# Same frame written twice: comma-separated with header, tab-separated without.
df_queries.to_csv(f"{eval_outdir}/queries.mmarco.csv", index=False, header=True)
df_queries.to_csv(f"{eval_outdir}/topics.mmarco.tsv", index=False, header=False, sep="\t")

Repo card metadata block was not found. Setting CardData to empty.


In [10]:
# QRELS (dev.small judgments); only rows with positive relevance are kept.
df_qrels = pd.read_csv(
    "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/qrels.dev.small.tsv",
    # "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/qrels.dev.tsv",
    names=["id", "q0", "docid", "rel"],
    header=None,
    sep="\t",
)
df_qrels = df_qrels[df_qrels["rel"] > 0]

df_qrels.to_csv(f"{eval_outdir}/qrels.mmarco.tsv", index=False, header=False, sep="\t")

## PRES

In [5]:
# Output folders for the PRES evaluation files and the converted corpus.
eval_outdir = "../../data/pres_eval"
docs_outdir = "../../data/pres_corpus"

In [6]:
# DOCS (fields: text)
pres_corpus = datasets.load_dataset(
    "jinaai/spanish_passage_retrieval", "corpus.documents", trust_remote_code=True
)
ds_docs = pres_corpus["test"].rename_column("_id", "docid")
n_docs = len(ds_docs)
# The target format requires a title field; fill it with empty strings.
ds_docs = ds_docs.add_column("title", [""] * n_docs)

# Write the corpus in shards of at most `batch_size` documents each.
Path(docs_outdir).mkdir(parents=True, exist_ok=True)
for file_index, start in enumerate(range(0, n_docs, batch_size)):
    stop = min(start + batch_size, n_docs)
    write_batch_to_jsonl_gz(ds_docs.select(range(start, stop)), docs_outdir, file_index)

Repo card metadata block was not found. Setting CardData to empty.


In [8]:
# Sanity check: number of repeated docids in the corpus.
ds_docs.to_pandas().duplicated(subset="docid").sum()

0

In [9]:
# QUERIES (test split), renamed to the id / query column convention.
pres_queries = datasets.load_dataset(
    "jinaai/spanish_passage_retrieval", "queries", trust_remote_code=True
)
df_queries = pres_queries["test"].to_pandas()
df_queries = df_queries.rename(columns={"_id": "id", "text": "query"})

Path(eval_outdir).mkdir(parents=True, exist_ok=True)
# Same frame written twice: comma-separated with header, tab-separated without.
df_queries.to_csv(f"{eval_outdir}/queries.pres.csv", index=False, header=True)
df_queries.to_csv(f"{eval_outdir}/topics.pres.tsv", index=False, header=False, sep="\t")

Repo card metadata block was not found. Setting CardData to empty.


In [11]:
# Sanity check: number of repeated query ids.
df_queries.duplicated(subset="id").sum()

0

In [14]:
# Qrels: the "text" column is split on single spaces into docids,
# producing one (query id, docid) row per entry.
pres_qrels = datasets.load_dataset(
    "jinaai/spanish_passage_retrieval", "qrels.s2p", trust_remote_code=True
)
df_qrels = (
    pres_qrels["test"]
    .to_pandas()
    .assign(text=lambda frame: frame["text"].str.split(" "))
    .explode("text")
    .reset_index(drop=True)
    .rename(columns={"_id": "id", "text": "docid"})
    .assign(q0="Q0", rel=1)
    [["id", "q0", "docid", "rel"]]
    .query("rel > 0")
    .drop_duplicates(["id", "docid"])
)

df_qrels.to_csv(f"{eval_outdir}/qrels.pres.tsv", index=False, header=False, sep="\t")

Repo card metadata block was not found. Setting CardData to empty.


In [15]:
# Sanity check: number of repeated (query id, docid) pairs.
print(df_qrels.duplicated(subset=["id", "docid"]).sum())


0


## MIRACL

In [25]:
# Output folders for the MIRACL evaluation files and the converted corpus.
eval_outdir = "../../data/miracl_eval"
docs_outdir = "../../data/miracl_corpus"

In [26]:
### DOCS (fields: title, text)
miracl_corpus = datasets.load_dataset("miracl/miracl-corpus", "es", trust_remote_code=True)
ds_docs = miracl_corpus["train"]
n_docs = len(ds_docs)

# Write the corpus in shards of at most `batch_size` documents each.
Path(docs_outdir).mkdir(parents=True, exist_ok=True)
for file_index, start in enumerate(range(0, n_docs, batch_size)):
    stop = min(start + batch_size, n_docs)
    write_batch_to_jsonl_gz(ds_docs.select(range(start, stop)), docs_outdir, file_index)


In [28]:
### QUERIES (dev topics read from the local MIRACL checkout)
df_queries = pd.read_csv(
    "../../data/miracl/miracl-v1.0-es/topics/topics.miracl-v1.0-es-dev.tsv",
    names=["id", "query"],
    header=None,
    sep="\t",
)

Path(eval_outdir).mkdir(parents=True, exist_ok=True)
# Same frame written twice: comma-separated with header, tab-separated without.
df_queries.to_csv(f"{eval_outdir}/queries.miracl.csv", index=False, header=True)
df_queries.to_csv(f"{eval_outdir}/topics.miracl.tsv", index=False, header=False, sep="\t")


In [29]:
### QRELS (dev judgments read from the local MIRACL checkout)
df_qrels = pd.read_csv(
    "../../data/miracl/miracl-v1.0-es/qrels/qrels.miracl-v1.0-es-dev.tsv",
    names=["id", "q0", "docid", "rel"],
    header=None,
    sep="\t",
)
# Only rows with positive relevance are kept.
df_qrels = df_qrels[df_qrels["rel"] > 0]

df_qrels.to_csv(f"{eval_outdir}/qrels.miracl.tsv", index=False, header=False, sep="\t")

## Multi-EuParl

In [30]:
# Output folders for the Multi-EuP evaluation files and the converted corpus.
eval_outdir = "../../data/meup_eval"
docs_outdir = "../../data/meup_corpus"

In [35]:
# Load Multi-EuP, keep only the needed columns and the rows whose LANGUAGE is 'ES'.
meup = datasets.load_dataset("unimelb-nlp/Multi-EuP", keep_default_na=False)["full"].select_columns(
    ["TEXT", "did", "title_ES", "qid_ES", "LANGUAGE"]
)
df_meup = meup.to_pandas()
df_meup = (
    df_meup[df_meup["LANGUAGE"] == "ES"]
    .drop(columns="LANGUAGE")
    .rename(columns={"TEXT": "text", "did": "docid", "title_ES": "query", "qid_ES": "id"})
)

### DOCS
# One row per distinct (docid, text) pair, with an empty title to match the
# docid / title / text corpus format.
df_docs = df_meup[["docid", "text"]].drop_duplicates().assign(title="")
# preserve_index=False: after filtering and de-duplication the frame no longer
# has a RangeIndex, so the default would add an "__index_level_0__" column that
# ends up in every JSONL record.
ds_docs = datasets.Dataset.from_pandas(df_docs, preserve_index=False)
n_docs = len(ds_docs)
end_start_indices = [(i, i + batch_size) for i in range(0, n_docs, batch_size)]
Path(docs_outdir).mkdir(exist_ok=True, parents=True)
for file_index, (start, end) in enumerate(end_start_indices):
    end_ = min(end, n_docs)  # the last shard may be shorter than batch_size
    docs = ds_docs.select(range(start, end_))
    write_batch_to_jsonl_gz(docs, docs_outdir, file_index)

### QUERIES
# Distinct (id, query) pairs from the Spanish subset.
df_queries = df_meup.loc[:, ["id", "query"]].drop_duplicates()
Path(eval_outdir).mkdir(parents=True, exist_ok=True)
# Same frame written twice: comma-separated with header, tab-separated without.
df_queries.to_csv(f"{eval_outdir}/queries.meup.csv", index=False, header=True)
df_queries.to_csv(f"{eval_outdir}/topics.meup.tsv", index=False, header=False, sep="\t")

### QRELS
# Every distinct (id, docid) pair becomes a judgment with relevance 1.
df_qrels = (
    df_meup[["id", "docid"]]
    .drop_duplicates()
    .assign(q0="Q0", rel=1)
    [["id", "q0", "docid", "rel"]]
    .query("rel > 0")
)

df_qrels.to_csv(f"{eval_outdir}/qrels.meup.tsv", index=False, header=False, sep="\t")

## SQAC

In [41]:
# Output folders for the SQAC evaluation files and the converted corpus.
eval_outdir = "../../data/sqac_eval"
docs_outdir = "../../data/sqac_corpus"

In [37]:
def sqac_examples(sqac_data):
    """Yield one ``(id, example)`` pair per question found in ``sqac_data``.

    ``sqac_data`` is an iterable of article dicts, each with a ``paragraphs``
    list (and optionally a ``title``); every paragraph has a ``context`` and a
    ``qas`` list whose items carry ``question``, ``id`` and ``answers``.
    Title, context, question and answer texts are whitespace-stripped.
    """
    for article in sqac_data:
        article_title = article.get("title", "").strip()
        for paragraph in article["paragraphs"]:
            passage = paragraph["context"].strip()
            for qa in paragraph["qas"]:
                # Only "context", "question" and "answers" are used at the
                # moment; the remaining fields are kept for future expansion.
                record = {
                    "title": article_title,
                    "context": passage,
                    "question": qa["question"].strip(),
                    "id": qa["id"],
                    "answers": {
                        "answer_start": [ans["answer_start"] for ans in qa["answers"]],
                        "text": [ans["text"].strip() for ans in qa["answers"]],
                    },
                }
                yield record["id"], record

def get_sqac(split):
    """Download one SQAC split and return it as a flat DataFrame.

    The split's JSON file is read from the Hugging Face hub, flattened with
    ``sqac_examples``, and returned with one row per question and the columns
    ``id``, ``title``, ``text`` (the paragraph context) and ``query`` (the
    question).
    """
    url = f"https://huggingface.co/datasets/PlanTL-GOB-ES/SQAC/resolve/main/{split}.json"
    articles = pd.read_json(url)["data"].tolist()
    rows = []
    for example_id, example in list(sqac_examples(articles)):
        rows.append(
            {
                'id': example_id,
                'title': example['title'],
                'text': example['context'],
                'query': example['question'],
            }
        )
    return pd.json_normalize(rows)

# Load every split, tag each row with the split it came from, and stack them
# in a single concat. This avoids growing a frame from an empty DataFrame
# inside a loop (deprecated empty-entry concatenation), and ignore_index=True
# gives the combined frame a unique 0..n-1 index instead of repeating each
# split's own row numbers.
df_sqac = pd.concat(
    [get_sqac(split).assign(split=split) for split in ["train", "dev", "test"]],
    ignore_index=True,
)

In [38]:
# Preview the first two rows.
df_sqac.iloc[:2]

Unnamed: 0,id,title,text,query,split
0,6cf3dcd6-b5a3-4516-8f9e-c5c1c6b66628,Historia de Japón,"La historia de Japón (日本の歴史 o 日本史, Nihon no re...",¿Qué influencia convirtió Japón en una nación ...,train
1,2663226e-e652-43a2-a6ba-c1fd02a1df31,Historia de Japón,"La historia de Japón (日本の歴史 o 日本史, Nihon no re...",¿Cuándo se detuvo el expansionismo de Japón?,train


In [40]:
# NOTE: ids are generated here because it is not clear what the dataset's own
# ID column stands for. Identical query texts share one id, and identical
# "title text" strings share one docid.
query_codes, _ = pd.factorize(df_sqac['query'])
doc_codes, _ = pd.factorize(df_sqac['title'] + " " + df_sqac['text'])
df_sqac['id'] = query_codes
df_sqac['docid'] = doc_codes

In [42]:
### DOCS
# One row per distinct (docid, title, text) triple across all splits.
df_docs = df_sqac[["docid", "title", "text"]].drop_duplicates()
# preserve_index=False: after de-duplication the frame no longer has a
# RangeIndex, so the default would add an "__index_level_0__" column that ends
# up in every JSONL record.
ds_docs = datasets.Dataset.from_pandas(df_docs, preserve_index=False)
n_docs = len(ds_docs)
end_start_indices = [(i, i + batch_size) for i in range(0, n_docs, batch_size)]
Path(docs_outdir).mkdir(exist_ok=True, parents=True)
for file_index, (start, end) in enumerate(end_start_indices):
    end_ = min(end, n_docs)  # the last shard may be shorter than batch_size
    docs = ds_docs.select(range(start, end_))
    write_batch_to_jsonl_gz(docs, docs_outdir, file_index)

In [43]:
### QUERIES
# Only the test split supplies evaluation queries.
df_queries = df_sqac.loc[df_sqac["split"] == "test", ["id", "query"]].drop_duplicates()
Path(eval_outdir).mkdir(parents=True, exist_ok=True)
# Same frame written twice: comma-separated with header, tab-separated without.
df_queries.to_csv(f"{eval_outdir}/queries.sqac.csv", index=False, header=True)
df_queries.to_csv(f"{eval_outdir}/topics.sqac.tsv", index=False, header=False, sep="\t")

In [44]:
### QRELS
# Every distinct (id, docid) pair of the test split becomes a judgment with relevance 1.
df_qrels = (
    df_sqac.loc[df_sqac["split"] == "test", ["id", "docid"]]
    .drop_duplicates()
    .assign(q0="Q0", rel=1)
    [["id", "q0", "docid", "rel"]]
    .query("rel > 0")
)

df_qrels.to_csv(f"{eval_outdir}/qrels.sqac.tsv", index=False, header=False, sep="\t")

In [9]:
# for split in ["train", "dev", "test"]:
#     n_queries = df_sqac[df_sqac["split"] == split]["query"].nunique()
#     n_docs = len(df_sqac[df_sqac["split"] == split]["text"])
#     print(f"{split}: {n_queries} queries, {n_docs} docs, J/D: {n_docs / n_queries:.2f}")

train: 14934 queries, 15036 docs, J/D: 1.01
dev: 1861 queries, 1864 docs, J/D: 1.00
test: 1908 queries, 1910 docs, J/D: 1.00
