Datasets comparison

In [1]:
import os
import gzip
import json
from pathlib import Path

import pandas as pd
import datasets
from tqdm import tqdm

### Data

Create one df_* for each dataset.

#### MIRACL

To download the whole dataset:

```bash
# Requires Access Token authentication:
mkdir -p data &&
cd data && 
git clone https://huggingface.co/datasets/miracl/miracl-corpus && 
git clone https://huggingface.co/datasets/miracl/miracl &&
cd -
```

NOTE: this downloads all subsets, not only the Spanish files read below; there may be a lighter way to fetch just what is needed.

In [3]:
# miracl_corpus = datasets.load_dataset("../data/miracl-corpus", "es", trust_remote_code=True) # NOTE this fails... so we use a function
# miracl_corpus = datasets.load_dataset('miracl/miracl-corpus', "es", trust_remote_code=True) # this also fails...
# df_docs = miracl_corpus["train"].to_pandas()

def read_miracl_corpus(corpus_dir="../data/miracl-corpus/miracl-corpus-v1.0-es/"):
    """Read every MIRACL corpus shard in ``corpus_dir`` into a single DataFrame.

    Shards are the ``docs-*.jsonl.gz`` files (gzip-compressed JSON Lines, one
    JSON record per line). They are read in sorted filename order.

    Args:
        corpus_dir: Directory containing the shards. Defaults to the Spanish
            corpus inside the local clone of ``miracl/miracl-corpus``.

    Returns:
        pd.DataFrame with one row per JSON record, in file/line order.

    Raises:
        FileNotFoundError: if no ``docs-*.jsonl.gz`` file is found, instead of
            silently returning an empty frame that fails later in a merge.
    """
    shard_paths = sorted(Path(corpus_dir).glob("docs-*.jsonl.gz"))
    if not shard_paths:
        raise FileNotFoundError(
            f"No 'docs-*.jsonl.gz' shards found in {corpus_dir!r}; "
            "clone the miracl-corpus repository first."
        )

    records = []
    for shard_path in tqdm(shard_paths):
        # Text mode lets gzip handle the UTF-8 decoding for us.
        with gzip.open(shard_path, "rt", encoding="utf-8") as shard:
            for raw_line in shard:
                raw_line = raw_line.strip()
                if not raw_line:
                    # Tolerate blank lines (e.g. a trailing newline) rather than
                    # crashing in json.loads.
                    continue
                records.append(json.loads(raw_line))
    return pd.DataFrame(records)


def get_miracl_dataset(split: str, df_docs: "pd.DataFrame | None" = None) -> pd.DataFrame:
    """Build the judged (query, passage) table for one MIRACL-es split.

    Reads the qrels and topics TSV files of ``split`` from the local clone of
    ``miracl/miracl`` and inner-joins them with the passage corpus.

    Args:
        split: Split name used in the file names (e.g. ``"train"`` or ``"dev"``).
        df_docs: Corpus frame with a ``docid`` column. When omitted, the corpus
            is read with ``read_miracl_corpus()``. Pass a frame that is already
            loaded to avoid re-reading the whole corpus for every split.

    Returns:
        pd.DataFrame with the qrels columns (``id``, ``q0``, ``docid``, ``rel``),
        the ``query`` text and the corpus columns of each judged passage.
    """
    df_qrels = pd.read_csv(
        f"../data/miracl/miracl-v1.0-es/qrels/qrels.miracl-v1.0-es-{split}.tsv",
        sep="\t", header=None, names=["id", "q0", "docid", "rel"],
    )
    df_queries = pd.read_csv(
        f"../data/miracl/miracl-v1.0-es/topics/topics.miracl-v1.0-es-{split}.tsv",
        sep="\t", header=None, names=["id", "query"],
    )
    if df_docs is None:
        df_docs = read_miracl_corpus()
    print(f"# docs: {df_docs.shape[0]}")
    # Inner joins: only judgments whose query and passage are both found are kept.
    df_miracl = df_qrels.merge(df_queries, on="id").merge(df_docs, on="docid")
    return df_miracl


# Read the corpus once and share it across splits (it is large and slow to load).
df_miracl_docs = read_miracl_corpus()
miracl_frames = []
for split in ["train", "dev"]:
    miracl_frames.append(get_miracl_dataset(split, df_docs=df_miracl_docs).assign(split=split))
# Single concat instead of growing a DataFrame inside the loop.
df_miracl = pd.concat(miracl_frames)

del df_miracl_docs, miracl_frames
# df_miracl_test = get_miracl_dataset("test") # NOTE we don't have the test set

100%|██████████| 21/21 [00:46<00:00,  2.22s/it]


# docs: 10373953


100%|██████████| 21/21 [00:45<00:00,  2.15s/it]


# docs: 10373953


In [4]:
# Judgments per query (#J / #Q) for MIRACL-es, from the counts reported in the
# paper https://arxiv.org/pdf/2210.09984:
paper_counts = {"Train": (21531, 2162), "Dev": (6443, 648), "Test": (15074, 1515)}
for split_label, (n_paper_judgments, n_paper_queries) in paper_counts.items():
    print(f"{split_label} {n_paper_judgments / n_paper_queries:.2f}")

Train 9.96
Dev 9.94
Test 9.95


In [5]:
df_miracl.head(2)

Unnamed: 0,id,q0,docid,rel,query,title,text,split
0,10000698#0,Q0,541735#4,1,¿Cómo se forman los nombres coreanos?,Onomástica coreana,"En el curso de la historia coreana, el uso de ...",train
1,10000698#0,Q0,541735#17,1,¿Cómo se forman los nombres coreanos?,Onomástica coreana,Antes de la adopción de nombres al estilo chin...,train


In [6]:
# Number of distinct query texts per MIRACL split.
for split in ["train", "dev"]:
    split_queries = df_miracl.loc[df_miracl["split"] == split, "query"]
    print(f"{split} queries: {split_queries.nunique()}")

train queries: 2161
dev queries: 648


#### SQAC (test)

In [7]:
# sqac = datasets.load_dataset("PlanTL-GOB-ES/SQAC", split="test", trust_remote_code=True) # NOTE this fails
# sqac = datasets.load_dataset(
#     "PlanTL-GOB-ES/SQAC", split="test", revision="95f30249a0b52dd9eb67b2423cd20fcdcb8be79d",
#     trust_remote_code=True) # NOTE this also fails

# We replicate loading script from HuggingFace:
def sqac_examples(sqac_data):
    """Yield ``(id, example)`` pairs from SQuAD-style SQAC articles, as raw text.

    Each question of each paragraph produces one example dict with the keys
    ``title``, ``context``, ``question``, ``id`` and ``answers`` (a dict holding
    the parallel lists ``answer_start`` and ``text``). Title, context, question
    and answer texts are whitespace-stripped; a missing title becomes ``""``.
    """
    for article in sqac_data:
        article_title = article.get("title", "").strip()
        for paragraph in article["paragraphs"]:
            paragraph_context = paragraph["context"].strip()
            for qa in paragraph["qas"]:
                question_text = qa["question"].strip()
                example_id = qa["id"]

                qa_answers = qa["answers"]
                start_offsets = [item["answer_start"] for item in qa_answers]
                answer_texts = [item["text"].strip() for item in qa_answers]

                # Only "context", "question" and "answers" are used today; the
                # remaining fields are kept to ease future extensions.
                example = {
                    "title": article_title,
                    "context": paragraph_context,
                    "question": question_text,
                    "id": example_id,
                    "answers": {
                        "answer_start": start_offsets,
                        "text": answer_texts,
                    },
                }
                yield example_id, example

def get_sqac(split):
    """Download one SQAC split from the Hugging Face hub as a flat DataFrame.

    The ``{split}.json`` file is fetched over HTTP, expanded into one example
    per question with ``sqac_examples`` and reduced to the columns ``id``,
    ``title``, ``text`` (the paragraph context) and ``query`` (the question).
    """
    split_url = f"https://huggingface.co/datasets/PlanTL-GOB-ES/SQAC/resolve/main/{split}.json"
    articles = pd.read_json(split_url)["data"].tolist()

    rows = []
    for example_id, example in sqac_examples(articles):
        rows.append(
            {
                'id': example_id,
                'title': example['title'],
                'text': example['context'],
                'query': example['question'],
            }
        )
    return pd.json_normalize(rows)

# Collect the per-split frames and concatenate once, instead of growing a
# DataFrame inside the loop starting from an empty frame.
sqac_frames = []
for split in ["train", "dev", "test"]:
    sqac_frames.append(get_sqac(split).assign(split=split))
df_sqac = pd.concat(sqac_frames)

del sqac_frames

# #docs is the number of unique docs in train+test+dev, according to paper 6,247
# del sqac, sqac_data, examples, formatted_data

In [8]:
df_sqac.head(2)

Unnamed: 0,id,title,text,query,split
0,6cf3dcd6-b5a3-4516-8f9e-c5c1c6b66628,Historia de Japón,"La historia de Japón (日本の歴史 o 日本史, Nihon no re...",¿Qué influencia convirtió Japón en una nación ...,train
1,2663226e-e652-43a2-a6ba-c1fd02a1df31,Historia de Japón,"La historia de Japón (日本の歴史 o 日本史, Nihon no re...",¿Cuándo se detuvo el expansionismo de Japón?,train


In [9]:
# Per split: distinct query texts, number of (query, context) rows (judgments),
# and judgments per query. The row count is not the number of unique documents.
for split in ["train", "dev", "test"]:
    df_split = df_sqac[df_sqac["split"] == split]
    n_queries = df_split["query"].nunique()
    n_judgments = len(df_split)
    print(f"{split}: {n_queries} queries, {n_judgments} judgments, J/Q: {n_judgments / n_queries:.2f}")

train: 14934 queries, 15036 docs, J/D: 1.01
dev: 1861 queries, 1864 docs, J/D: 1.00
test: 1908 queries, 1910 docs, J/D: 1.00


#### mMARCO

In [13]:
# Relevance judgments (qrels) of the small dev set
df_qrels = pd.read_csv(
    "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/qrels.dev.small.tsv",
    # "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/qrels.dev.tsv",
    sep="\t",
    header=None,
    names=["id", "q0", "docid", "rel"],
)
# df_qrels["id"].value_counts() # NOTE some queries have multiple relevant docs

# Passage collection (Spanish); its "id" column becomes "docid" to match the qrels
mmarco_collection = datasets.load_dataset('unicamp-dl/mmarco', 'collection-spanish', trust_remote_code=True)
df_docs = mmarco_collection["collection"].to_pandas()
df_docs = df_docs.rename(columns={"id": "docid"})
print(f"# docs: {len(df_docs)}")

# Dev queries (Spanish); the "text" column becomes "query"
mmarco_queries = datasets.load_dataset('unicamp-dl/mmarco', 'queries-spanish', trust_remote_code=True)
df_queries = mmarco_queries["dev"].to_pandas()
df_queries = df_queries.rename(columns={"text": "query"})

# Left joins: every dev query is kept, even without a judgment or a matching passage.
df_mmarco = (
    df_queries
    .merge(df_qrels, on="id", how="left")
    .merge(df_docs, on="docid", how="left")
)
print(df_mmarco.shape)

# Free the large intermediates; only df_mmarco is used afterwards.
del df_qrels, df_docs, df_queries, mmarco_collection, mmarco_queries

Repo card metadata block was not found. Setting CardData to empty.


# docs: 8841823


Repo card metadata block was not found. Setting CardData to empty.


(7437, 6)


In [8]:
df_mmarco.head(2)

Unnamed: 0,id,query,q0,docid,rel,text
0,1048585,que es el hermano de paula deen,0,7187158,1,Paula Deen y su hermano Earl W. Bubba Hiers es...
1,2,Definición del receptor de andrógenos,0,4339068,1,"El receptor de andrógenos (AR), también conoci..."


In [61]:
# mMARCO dev: distinct queries (#Q), judgment rows (#J) and judgments per query.
print("Dev")
n_dev_queries = df_mmarco["query"].nunique()
n_dev_judgments = len(df_mmarco["docid"])
print(f"#Q: {n_dev_queries}")
print(f"#J: {n_dev_judgments}")
print(f"#J/#Q: {n_dev_judgments / n_dev_queries:.2f}")

# df_tmp = pd.read_csv(
#     "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/qrels.dev.small.tsv",
#     sep="\t", header=None, names=["id", "q0", "docid", "rel"],
# )
# print(f"#Q: {df_tmp['id'].nunique()}, #D: {df_tmp['docid'].__len__()}")
# print(f"#J/#Q: {df_tmp['docid'].__len__() / df_tmp['id'].nunique():.2f}")

Dev
#Q: 6979
#J: 7437
#J/#Q: 1.07


In [35]:
print("Train")
# Training triples as ids only: one (query id, positive passage id, negative passage id) per row.
# NOTE: this file is large; the download and parse are slow.
df_tmp = pd.read_csv(
    "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/triples.train.ids.small.tsv",
    sep="\t", header=None, names=["query", "positive", "negative"],
)
# Spanish training queries (columns: id, text), used to attach query text to the triples.
df_queries_tmp = datasets.load_dataset(
    "unicamp-dl/mmarco", "queries-spanish", trust_remote_code=True
)["train"].to_pandas()

Train


Repo card metadata block was not found. Setting CardData to empty.


In [40]:
df_tmp.sort_values("query") # ~39.8M triples (39,780,811 rows)

Unnamed: 0,query,positive,negative
34073092,3,1142680,3709694
27777536,3,1142680,1689342
39427775,3,1142680,7726790
18737611,3,1142680,7773621
10129377,3,1142680,6340510
...,...,...,...
38831317,1185869,0,4757034
28805795,1185869,0,987970
19803504,1185869,0,4153975
3296599,1185869,0,7527611


In [42]:
df_tmp.drop_duplicates(["query", "positive"]).shape # 415,938 unique (query, positive) pairs

(415938, 3)

In [41]:
# Attach the query text to each triple (inner join on the query id), then keep
# one row per distinct (query id, positive passage id) pair.
df_queries_tmp.merge(df_tmp, left_on="id", right_on="query", how="inner").drop_duplicates(
    ["id", "positive"]
)

Unnamed: 0,id,text,query,positive,negative
0,121352,definir extremo,121352,2912794,3933233
119,634306,¿Qué significa bienes muebles en el historial ...,634306,1668221,4248288
217,920825,¿Cuál fue el gran salto hacia adelante mentalm...,920825,3285660,6617428
314,737889,qué es el proceso de descentralización.,737889,189333,2327459
423,303205,¿Cuánto puedo contribuir a la ira no deducible?,303205,6487240,821461
...,...,...,...,...,...
39780324,155334,¿Los helechos de Boston tienen que estar a la ...,155334,5482817,7674719
39780406,421294,¿Son las palomitas de maíz un bocadillo saluda...,421294,2687906,5469267
39780510,1059728,Al por mayor precio de lularoe,1059728,5279595,5046717
39780599,210839,¿Cómo puedo ver el día siguiente?,210839,5272393,3078695


In [25]:
# Compare the number of training triples with the number of training queries.
# `mmarco_queries` was deleted at the end of the dev cell, so use the training
# queries frame loaded together with the triples instead.
df_tmp.shape, df_queries_tmp.shape

((39780811, 3), (808731, 2))

In [22]:
# First training query as a dict. `mmarco_queries` no longer exists at this
# point (deleted in the dev cell), so read it from the training queries frame.
df_queries_tmp.iloc[0].to_dict()

{'id': 121352, 'text': 'definir extremo'}

In [1]:
# NOTE this is too slow:
# ds_tmp = datasets.load_dataset('unicamp-dl/mmarco', 'spanish', trust_remote_code=True) 

#### PRES (test)

In [2]:
# NOTE we use "documents" because the full set of documents is larger (10k)
# than the set of sentences (265), as the dataset is laid out by Jina.

# ['corpus.sentences', 'corpus.documents', 'queries', 'qrels.s2s', 'qrels.s2p']
# Document collection
pres_corpus = datasets.load_dataset("jinaai/spanish_passage_retrieval", "corpus.documents", trust_remote_code=True)
df_docs = pres_corpus["test"].to_pandas()
df_docs = df_docs.rename(columns={"_id": "docid"})
print(f"# docs: {df_docs['docid'].nunique()}")

# Queries
pres_queries = datasets.load_dataset("jinaai/spanish_passage_retrieval", "queries", trust_remote_code=True)
df_queries = pres_queries["test"].to_pandas()
df_queries = df_queries.rename(columns={"_id": "id", "text": "query"})

# Qrels: "text" holds space-separated document ids; expand to one row per (query, document).
pres_qrels = datasets.load_dataset("jinaai/spanish_passage_retrieval", "qrels.s2p", trust_remote_code=True)
df_qrels = (
    pres_qrels["test"].to_pandas()
    .assign(text=lambda frame: frame["text"].str.split(" "))
    .explode("text")
    .reset_index(drop=True)
    .rename(columns={"_id": "id", "text": "docid"})
)

# Inner joins: keep only judgments whose query and document are both present.
df_pres = df_queries.merge(df_qrels, on="id").merge(df_docs, on="docid")
print(df_pres.shape)

del pres_corpus, pres_queries, pres_qrels, df_docs, df_queries, df_qrels

Repo card metadata block was not found. Setting CardData to empty.


# docs: 10037


Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.


(1289, 4)


In [3]:
df_pres.head(2)

Unnamed: 0,id,query,docid,text
0,PR_ES_1_1,¿Es importante controlar el peso del bebé y su...,doc_8561,El crecimiento de los niños en los primeros añ...
1,PR_ES_1_2,¿Es importante controlar el peso del bebé y cu...,doc_8561,El crecimiento de los niños en los primeros añ...


In [4]:
# Number of distinct queries, judgment rows and judgments-per-query ratio for df_pres:
n_pres_queries = df_pres["query"].nunique()
print(f"{n_pres_queries} queries")
n_pres_judgments = len(df_pres["docid"])
print(f"{n_pres_judgments} judgments")
print(f"{n_pres_judgments / n_pres_queries:.2f} J/Q ratio")

167 queries
1289 judgments
7.72 J/Q ratio


In [9]:
print(df_pres["query"])
# print(df_pres["text"][3])

0       ¿Es importante controlar el peso del bebé y su...
1       ¿Es importante controlar el peso del bebé y cu...
2       ¿Es importante controlar el peso y el crecimie...
3       ¿Es importante controlar el peso y la medida d...
4       ¿El niño debería ser medido y pesado durante e...
                              ...                        
1284    ¿Cuáles son los números de teléfono a los que ...
1285    ¿Quién tengo que contactar en caso de emergencia?
1286     ¿A quién tengo que llamar en caso de emergencia?
1287    ¿Existen algunos números de teléfono de emerge...
1288    ¿Cuáles son los números de teléfono a los que ...
Name: query, Length: 1289, dtype: object


#### Multi-EuP

In [53]:
# Load Multi-EuP and keep only the columns needed for the comparison.
# keep_default_na=False is forwarded to the loader (presumably so that literal
# strings such as "NA" are not parsed as missing values; confirm against the loader).
meup = datasets.load_dataset("unimelb-nlp/Multi-EuP", keep_default_na=False)
meup = meup["full"].select_columns(["TEXT", "TEXTID", "title_ES", "qid_ES", "did", "LANGUAGE"])

# Spanish rows only, renamed to the column names shared by the other datasets.
meup_all = meup.to_pandas()
df_meup = (
    meup_all[meup_all["LANGUAGE"] == "ES"]
    .rename(columns={"TEXT": "text", "TEXTID": "docid", "title_ES": "query", "qid_ES": "id"})
    .drop(columns="LANGUAGE")
)
print(df_meup.shape)

del meup, meup_all

(2371, 5)


In [12]:
df_meup.head(2)

Unnamed: 0,text,docid,query,id,did
36,"– Señora presidenta, lamento profundamente lo...",d8275c0f-a8f5-4b51-ae6b-dad05b210bf3,Incendios forestales devastadores en Australia...,qid0#ES,doc36
58,"– Señora presidenta, el tema que nos trae hoy...",3ccfdbae-da90-4e97-bebf-2248b28ced73,Delincuencia organizada transfronteriza y su i...,qid1#ES,doc58


In [54]:
# Distinct queries, number of rows (one document per row) and rows per query.
# The ratio is judgments per query, labelled J/Q as in the mMARCO and PRES cells.
n_meup_queries = df_meup["query"].nunique()
n_meup_docs = len(df_meup)
print(f"{n_meup_queries} queries")
print(f"{n_meup_docs} docs")
print(f"J/Q: {n_meup_docs / n_meup_queries:.2f}")
# NOTE paper says 680 queries...

633 queries
2371 docs
J/D: 3.75


#### Ours (messirve)

In [None]:
hf_access_token = os.getenv("HF_READ_TOKEN") # None if not set

# Load the dataset once (all splits) instead of once per loop iteration.
messirve = datasets.load_dataset("spanish-ir/messirve", "full", token=hf_access_token)

messirve_frames = []
for split in ["train", "test"]:
    df_split = (
        messirve[split]
        .to_pandas()[['id', 'query', 'docid', 'docid_text']]
        .rename(columns={"docid_text": "text"})
        .assign(split=split)
    )
    messirve_frames.append(df_split)
# Single concat instead of growing a DataFrame inside the loop.
df_messirve = pd.concat(messirve_frames)

del messirve, messirve_frames, df_split


In [3]:
# article_id is the part of docid before the first "#"
# (presumably docid is "<article>#<passage>"; TODO confirm against the dataset card).
df_messirve["article_id"] = df_messirve["docid"].str.split("#").str[0]

In [4]:
# Per split: unique documents, unique query texts and judgments (rows) per query.
for split in ["train", "test"]:
    # Filter once per split instead of repeating the same boolean mask four times.
    df_split = df_messirve[df_messirve["split"] == split]
    n_split_queries = df_split["query"].nunique()
    print(split)
    print(f"# docs: {df_split['docid'].nunique()}")
    print(f"# queries: {n_split_queries}")
    print(f"J/Q: {len(df_split) / n_split_queries:.2f}")

train
# docs: 96935
# queries: 537730


J/D: 1.06
test
# docs: 63557
# queries: 156528
J/D: 1.02


In [6]:
# total unique queries in messirve:
df_messirve["query"].nunique()

694258

### Articles comparison (MIRACL, SQAC, ours)

In [18]:
# Distinct source articles per dataset: MIRACL and SQAC are counted by title,
# ours by the article id derived from docid.
article_counts = {
    "MIRACL": df_miracl["title"].nunique(),
    "SQAC": df_sqac["title"].nunique(),
    "Ours": df_messirve["article_id"].nunique(),
}
for dataset_label, n_articles in article_counts.items():
    print(f"{dataset_label} # articles: {n_articles}")

MIRACL # articles: 17640
SQAC # articles: 3823
Ours # articles: 84284
