In [1]:
import os

import numpy as np
import pandas as pd
import datasets
from datasets import load_dataset
from tqdm import tqdm

## Prefixes for Google API

In [1]:
import sys 

# Add the sibling scripts directory to the import path so that the
# `google_questions` package can be imported below (presumably it lives in
# ../scripts — confirm). The path is relative, so it is resolved against the
# kernel's current working directory.
sys.path.append('../scripts')

from google_questions.helpers import QUERY_PATTERNS, add_accented_patterns

In [2]:
# Seed prefixes built by passing QUERY_PATTERNS through add_accented_patterns
# (presumably adds accented/unaccented spelling variants — confirm in
# google_questions.helpers).
seed_prefixes = add_accented_patterns(QUERY_PATTERNS)

In [8]:
# Wrap every stripped prefix in LaTeX-style quotes (``...'') and print them as
# one comma-separated line.
# NOTE(review): the list is sorted after quoting, so the surrounding quote
# characters take part in the string comparison.
formatted_prefixes = ["``{}''".format(prefix.strip()) for prefix in seed_prefixes]
formatted_prefixes.sort()

print(", ".join(formatted_prefixes))

``a cual'', ``a cuales'', ``a cuantas'', ``a cuantos'', ``a cuál'', ``a cuáles'', ``a cuántas'', ``a cuántos'', ``a donde'', ``a dónde'', ``a que'', ``a quien'', ``a quienes'', ``a quién'', ``a quiénes'', ``a qué'', ``adonde debe'', ``adónde debe'', ``ante cual'', ``ante cuales'', ``ante cuál'', ``ante cuáles'', ``ante que'', ``ante quien'', ``ante quienes'', ``ante quién'', ``ante quiénes'', ``ante qué'', ``bajo cual'', ``bajo cuales'', ``bajo cuál'', ``bajo cuáles'', ``bajo que'', ``bajo qué'', ``buenas razones para'', ``buenos motivos para'', ``como'', ``con cual'', ``con cuales'', ``con cuanta'', ``con cuantas'', ``con cuanto'', ``con cuantos'', ``con cuál'', ``con cuáles'', ``con cuánta'', ``con cuántas'', ``con cuánto'', ``con cuántos'', ``con que'', ``con quien'', ``con quienes'', ``con quién'', ``con quiénes'', ``con qué'', ``cual'', ``cuales'', ``cuando'', ``cuanta'', ``cuantas'', ``cuanto'', ``cuantos'', ``cuál'', ``cuáles'', ``cuándo'', ``cuánta'', ``cuántas'', ``cuánto'', `

## Stats

In [2]:
# Hugging Face access token, read from the HF_READ_TOKEN environment variable
# and later passed as `token=` to load_dataset.
hf_access_token = os.getenv("HF_READ_TOKEN") # None if not set

In [3]:
# Display name for every subset code. The two non-country subsets
# ('full' and 'no_country') are shown as '-'.
iso2country = dict([
    ('ar', 'Argentina'),
    ('bo', 'Bolivia'),
    ('cl', 'Chile'),
    ('co', 'Colombia'),
    ('cr', 'Costa Rica'),
    ('cu', 'Cuba'),
    ('do', 'Dominican Rep.'),
    ('ec', 'Ecuador'),
    ('es', 'Spain'),
    ('gt', 'Guatemala'),
    ('hn', 'Honduras'),
    ('mx', 'Mexico'),
    ('ni', 'Nicaragua'),
    ('pa', 'Panama'),
    ('pe', 'Peru'),
    ('pr', 'Puerto Rico'),
    ('py', 'Paraguay'),
    ('sv', 'El Salvador'),
    ('us', 'United States'),
    ('uy', 'Uruguay'),
    ('ve', 'Venezuela'),
    ('full', '-'),
    ('no_country', '-'),
])

# Subset codes in processing order: the two non-country subsets first,
# followed by the country codes in the order they appear in the mapping.
isos = ['full', 'no_country'] + [
    code for code in iso2country if code not in ('full', 'no_country')
]

In [None]:
# Statistics collected for each subset code in `isos`:
#   n_train            number of unique queries in train
#   n_test             number of unique queries in test
#   n_unique_docs      number of unique relevant documents (docid) in test
#   n_unique_articles  number of unique relevant articles in test
#   avg_query_length   average query length in test
#   avg_doc_length     average relevant document length in test
# Lengths are measured as (number of spaces + 1), see extract_data.
# Turn off the `datasets` library progress bars; the loop below shows its own tqdm bar.
datasets.disable_progress_bars()

def extract_data(examples):
    """Compute length features and article ids for a batch of examples.

    Written for ``Dataset.map(..., batched=True)``: ``examples`` maps column
    names to lists of values. The input batch is not modified.

    Args:
        examples: batch providing the "query", "docid_text" and "docid"
            columns, each a list of strings.

    Returns:
        dict with three lists aligned with the input rows:
            "query_len": number of spaces in the query + 1,
            "doc_len": number of spaces in the document text + 1,
            "articleid": the part of the docid before the first "#".
    """
    # Length is (number of single spaces + 1), a cheap proxy for a word count.
    query_counts = [text.count(" ") + 1 for text in examples["query"]]
    doc_counts = [text.count(" ") + 1 for text in examples["docid_text"]]
    # get articleid from docid (format: "articleid#passagenumber")
    article_ids = [docid.split("#")[0] for docid in examples["docid"]]
    return {"query_len": query_counts, "doc_len": doc_counts, "articleid": article_ids}

# Per-subset statistics, keyed by subset code.
stats = {}
for iso in tqdm(isos):
    dataset = load_dataset("spanish-ir/messirve", iso, token=hf_access_token)
    # Counts of distinct values in the raw columns.
    res = {
        "n_train": pd.Series(dataset["train"]["query"]).nunique(),
        "n_test": pd.Series(dataset["test"]["query"]).nunique(),
        "n_unique_docs": pd.Series(dataset["test"]["docid"]).nunique(),
    }
    # Replace the test split with the derived features only. The columns to
    # drop are taken from the test split itself (not from train), so the call
    # does not depend on both splits sharing the same schema.
    dataset["test"] = dataset["test"].map(
        extract_data,
        batched=True,
        remove_columns=dataset["test"].column_names,
    )
    res["n_unique_articles"] = pd.Series(dataset["test"]["articleid"]).nunique()
    res["avg_query_length"] = np.mean(dataset["test"]["query_len"])
    res["avg_doc_length"] = np.mean(dataset["test"]["doc_len"])
    stats[iso] = res


100%|██████████| 23/23 [01:18<00:00,  3.43s/it]


In [9]:
# Summary table with one row per subset code and the columns
# country, iso, n_train, n_test, n_unique_docs, n_unique_articles,
# avg_query_length, avg_doc_length.
df = pd.DataFrame(stats).T
df = (
    df.assign(country=df.index.map(iso2country), iso=df.index)
    .loc[:, [
        "country", "iso",
        "n_train", "n_test", "n_unique_docs", "n_unique_articles",
        "avg_query_length", "avg_doc_length",
    ]]
)
# Count columns (prefixed "n_") are cast to integers; the averages stay floats.
int_cols = [name for name in df.columns if name.startswith("n_")]
df = df.astype(dict.fromkeys(int_cols, int)).reset_index(drop=True)
df.head(2)

Unnamed: 0,country,iso,n_train,n_test,n_unique_docs,n_unique_articles,avg_query_length,avg_doc_length
0,-,full,537730,156528,63557,47127,5.690465,79.937415
1,-,no_country,356040,101359,44869,34306,5.760228,80.806648


In [10]:
# Two-level header: the first level groups columns by split ('' for the
# identifying columns, then 'train' / 'test'); the second level is the label.
df.columns = pd.MultiIndex.from_tuples(
    [('', 'Country'), ('', 'Code'), ('train', '#Unique Q')]
    + [
        ('test', label)
        for label in (
            '#Unique Q',
            '#Unique Docs',
            '#Unique Articles',
            'Avg Query Length',
            'Avg Doc Length',
        )
    ]
)

In [11]:
# Render the table as LaTeX, working on a copy so `df` itself is left untouched.
df_tmp = df.copy()
# Column 1 holds the subset code: show "no_country" as "none" and wrap every
# code in \texttt{...}.
df_tmp.iloc[:, 1] = df_tmp.iloc[:, 1].replace("no_country", "none")
df_tmp.iloc[:, 1] = df_tmp.iloc[:, 1].apply(lambda x: f"\\texttt{{{x}}}")

print(
    df_tmp.style.format(decimal='.', thousands=' ', precision=1).hide()
    # Escape '#' in the generated LaTeX. The backslash is doubled so the
    # Python literal is a valid escape sequence; the resulting string is the
    # same backslash + '#' as before.
    .to_latex().replace("#", "\\#")
)

\begin{tabular}{llrrrrrr}
\multicolumn{2}{r}{} & train & \multicolumn{5}{r}{test} \\
Country & Code & \#Unique Q & \#Unique Q & \#Unique Docs & \#Unique Articles & Avg Query Length & Avg Doc Length \\
- & \texttt{full} & 537 730 & 156 528 & 63 557 & 47 127 & 5.7 & 79.9 \\
- & \texttt{none} & 356 040 & 101 359 & 44 869 & 34 306 & 5.8 & 80.8 \\
Argentina & \texttt{ar} & 22 560 & 5 481 & 3 829 & 3 498 & 5.4 & 80.4 \\
Bolivia & \texttt{bo} & 24 912 & 4 810 & 3 230 & 2 866 & 5.3 & 79.7 \\
Chile & \texttt{cl} & 22 486 & 5 408 & 3 694 & 3 381 & 5.4 & 79.3 \\
Colombia & \texttt{co} & 25 914 & 5 667 & 3 845 & 3 464 & 5.6 & 79.8 \\
Costa Rica & \texttt{cr} & 23 662 & 5 690 & 4 047 & 3 693 & 5.5 & 79.3 \\
Cuba & \texttt{cu} & 22 071 & 4 787 & 3 374 & 3 071 & 5.4 & 80.9 \\
Dominican Rep. & \texttt{do} & 27 830 & 5 359 & 3 725 & 3 320 & 5.6 & 79.8 \\
Ecuador & \texttt{ec} & 27 599 & 6 074 & 4 214 & 3 734 & 5.9 & 81.1 \\
Spain & \texttt{es} & 23 476 & 7 148 & 5 004 & 4 654 & 5.5 & 80.0 \\
Guatemala 