### Queries comparison

In [1]:
import datasets
from datasets import load_dataset
import pandas as pd

#### reading queries (load all splits for each)

In [2]:
# MIRACL
dfs = []
for split in ["train", "dev"]:
    df_ = pd.read_csv(
        f"../data/miracl/miracl-v1.0-es/topics/topics.miracl-v1.0-es-{split}.tsv",
        sep="\t", header=None, names=["id", "query"],
    )
    df_["split"] = split
    dfs.append(df_)

df_miracl = pd.concat(dfs)
df_miracl["dataset"] = "miracl"

In [3]:
# MMARCO
ds_queries = datasets.load_dataset('unicamp-dl/mmarco', 'queries-spanish', trust_remote_code=True)
dfs = []
for k, v in ds_queries.items():
    df_ = v.to_pandas()
    df_["split"] = k
    dfs.append(df_)
df_mmarco = pd.concat(dfs)
df_mmarco = df_mmarco.rename(columns={"text": "query"})
df_mmarco["dataset"] = "mmarco"

Repo card metadata block was not found. Setting CardData to empty.


In [4]:
# PRES
pres_queries = datasets.load_dataset("jinaai/spanish_passage_retrieval", "queries", trust_remote_code=True)
dfs = []
for k, v in pres_queries.items():
    df_ = v.to_pandas()
    df_["split"] = k
    dfs.append(df_)
df_pres = pd.concat(dfs)
df_pres["dataset"] = "pres"
df_pres = df_pres.rename(columns={"_id": "id", "text": "query"})

Repo card metadata block was not found. Setting CardData to empty.


In [5]:
# MEUP
meup = datasets.load_dataset("unimelb-nlp/Multi-EuP", keep_default_na=False)
meup = meup["full"].select_columns(["TEXT", "did", "title_ES", "qid_ES", "LANGUAGE"])
df_tmp = meup.to_pandas().query("LANGUAGE == 'ES'").rename(
    columns={"TEXT": "text", "did": "docid", "title_ES": "query", "qid_ES": "id"}
).drop(columns="LANGUAGE")
df_meup = df_tmp[["id", "query"]].drop_duplicates()
df_meup["split"] = "test"
df_meup["dataset"] = "meup"

In [6]:
# SQAC
def sqac_examples(sqac_data):
    """This function returns the examples in the raw (text) form."""
    for article in sqac_data:
        title = article.get("title", "").strip()
        for paragraph in article["paragraphs"]:
            context = paragraph["context"].strip()
            for qa in paragraph["qas"]:
                question = qa["question"].strip()
                id_ = qa["id"]

                answer_starts = [answer["answer_start"] for answer in qa["answers"]]
                answers = [answer["text"].strip() for answer in qa["answers"]]

                # Features currently used are "context", "question", and "answers".
                # Others are extracted here for the ease of future expansions.
                yield id_, {
                    "title": title,
                    "context": context,
                    "question": question,
                    "id": id_,
                    "answers": {
                        "answer_start": answer_starts,
                        "text": answers,
                    },
                }

def get_sqac(split):
    sqac = pd.read_json(f"https://huggingface.co/datasets/PlanTL-GOB-ES/SQAC/resolve/main/{split}.json")
    sqac_data = sqac["data"].tolist()
    examples = list(sqac_examples(sqac_data))
    formatted_data = [
        {
            'id': item[0],
            'title': item[1]['title'],
            'text': item[1]['context'],
            'query': item[1]['question'],
        }
        for item in examples
    ]
    df_sqac = pd.json_normalize(formatted_data)
    return df_sqac

df_sqac = pd.DataFrame()
for split in ["train", "dev", "test"]:
    df_tmp = get_sqac(split)
    df_tmp["split"] = split
    df_sqac = pd.concat([df_sqac, df_tmp])

df_sqac = df_sqac[["id", "query", "split"]].drop_duplicates()
df_sqac["dataset"] = "sqac"

In [7]:
# MESSIRVE
ds = load_dataset("spanish-ir/messirve", "full", trust_remote_code=True)
for k, v in ds.items():
    df_ = v.to_pandas()
    df_["split"] = k
    dfs.append(df_)
df_messirve = pd.concat(dfs)
df_messirve = df_messirve[["id", "query", "split"]].drop_duplicates()
df_messirve["dataset"] = "messirve"

In [8]:
# Concatenate all queries:
df_queries = pd.concat(
    [
        df_miracl,
        df_mmarco,
        df_pres,
        df_meup,
        df_sqac,
        df_messirve,
    ],
    ignore_index=True,
)

In [9]:
df_queries.head()
# df_queries.query("_id.notnull()")

Unnamed: 0,id,query,split,dataset
0,1769696#0,¿Cuáles son las principales plantas fanerógamas?,train,miracl
1,919762#0,¿Qué significa Cuauhtémoc?,train,miracl
2,287784#0,¿Quién fue George Armstrong Custer?,train,miracl
3,7576513#0,¿Cuál es la iglesia más antigua de Polonia?,train,miracl
4,1166707#0,¿Cuál es la diferencia entre las tormentas elé...,train,miracl


In [10]:
df_queries["query"].isnull().sum()

1

In [11]:
df_queries = df_queries.dropna(subset=["query"])

In [12]:
import string
import unidecode

df_queries["clean_query"] = (
    df_queries["query"]
    .str.lower()
    .str.translate(str.maketrans("", "", string.punctuation + "¿¡"))
    .apply(unidecode.unidecode)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)


In [13]:
df_queries.head(2)

Unnamed: 0,id,query,split,dataset,clean_query
0,1769696#0,¿Cuáles son las principales plantas fanerógamas?,train,miracl,cuales son las principales plantas fanerogamas
1,919762#0,¿Qué significa Cuauhtémoc?,train,miracl,que significa cuauhtemoc


#### Length analysis

In [14]:
df_queries["word_count"] = df_queries["clean_query"].str.count(" ") + 1

In [15]:
# describe each dataset:
df_stats = df_queries.groupby("dataset").agg(
    q1=("word_count", lambda x: x.quantile(0.25)),
    m=("word_count", lambda x: x.quantile(0.5)),
    q3=("word_count", lambda x: x.quantile(0.75)),
    mean=("word_count", "mean"),
    std=("word_count", "std"),
).reset_index()
df_stats

Unnamed: 0,dataset,q1,m,q3,mean,std
0,messirve,5.0,6.0,7.0,5.919933,1.906939
1,meup,8.0,12.0,18.0,14.533965,10.659541
2,miracl,6.0,8.0,9.0,7.94911,2.300937
3,mmarco,5.0,7.0,9.0,7.114734,2.955962
4,pres,8.0,12.0,15.0,11.610778,3.998268
5,sqac,6.0,8.0,11.0,8.762414,3.430986


In [16]:
stats_dict = df_stats[["dataset", "mean", "std"]].set_index("dataset").to_dict(orient="index")
# sort by mean:
stats_dict = dict(sorted(stats_dict.items(), key=lambda x: x[1]["mean"]))
for k, v in stats_dict.items():
    print(f"- {k:10}: {v['mean']:5.1f} ({v['std']:.1f})")

- messirve  :   5.9 (1.9)
- mmarco    :   7.1 (3.0)
- miracl    :   7.9 (2.3)
- sqac      :   8.8 (3.4)
- pres      :  11.6 (4.0)
- meup      :  14.5 (10.7)


In [17]:
datasets_names_map = {
    "meup": "Multi-EuP",
    "pres": "PRES",
    "sqac": "SQAC",
    "miracl": "MIRACL",
    "mmarco": "mMARCO",
    "messirve": "MessIRve",
}

In [40]:
# boxplots of word count:
import plotnine as p9

df_plot = df_queries[["dataset", "word_count"]].copy()
df_plot["dataset"] = df_plot["dataset"].map(datasets_names_map)
df_plot["dataset"] = pd.Categorical(df_plot["dataset"], categories=datasets_names_map.values())

df_plot_stats = df_stats.copy()
df_plot_stats["dataset"] = df_plot_stats["dataset"].map(datasets_names_map)
df_plot_stats["dataset"] = pd.Categorical(df_plot_stats["dataset"], categories=datasets_names_map.values())

p = (
    p9.ggplot(df_plot, p9.aes(x="dataset", y="word_count"))
    + p9.geom_boxplot(fill="lightblue", alpha=0.5, outlier_shape="")
    + p9.geom_point(p9.aes(x="dataset", y="mean"), data=df_plot_stats, color="red", size=2, shape="x")
    + p9.theme_bw()
    + p9.labs(x="", y="query word count")
    # + p9.coord_flip()
    + p9.theme(
        legend_position="bottom",
        text=p9.element_text(size=9),
        legend_title=p9.element_blank(),
        axis_text_x=p9.element_text(rotation=15),
    )
)
# p.show()

p.save("../runs/plots/datasets_query_len.pdf", width=3.5, height=2.0, dpi=300)



In [41]:
!cp ../runs/plots/datasets_query_len.pdf ../latex/plots/

#### Prefix analysis

In [22]:
# Most common 1-word, 2-word, 3-word prefixes:
df_queries["words"] = df_queries["clean_query"].str.split(" ")
for i in range(1, 7):
    df_queries[f"prefix_{i}"] = df_queries["words"].apply(lambda x: " ".join(x[:i]).strip())

In [23]:
# Compute entropy of each distribution of prefixes:
import numpy as np

def gini_index(probas: np.ndarray) -> float:
    """Compute the Gini index of a numpy array of class probabilities."""
    terms = probas * (1 - probas)
    return np.sum(terms)

def entropy(probas: np.ndarray) -> float:
    terms = probas[probas > 0] * np.log(probas[probas > 0])
    return -np.sum(terms)

In [24]:
df_tmp = df_queries.copy()
df_tmp["dataset"] = df_tmp["dataset"].astype("category")
for i in range(1, 7):
    df_tmp[f"prefix_{i}"] = df_tmp[f"prefix_{i}"].astype("category")

In [25]:
from IPython.display import display

# df_distrib = df_tmp.groupby(["dataset", "prefix_1"], observed=False).size().reset_index(name="count")
# df_distrib["rel_count"] = df_distrib["count"] / df_distrib.groupby("dataset", observed=False)["count"].transform("sum")

df_res = pd.DataFrame()
for i in range(1, 7):
    df_distrib = df_tmp.groupby(["dataset", f"prefix_{i}"], observed=False).size().reset_index(name="count")
    df_distrib["rel_count"] = df_distrib["count"] / df_distrib.groupby("dataset", observed=False)["count"].transform("sum")
    df_res_ = df_distrib.groupby("dataset", observed=False)["rel_count"].agg(
        entropy=entropy,
        # gini=gini_index,
        # std="std",
        n=lambda x: (x > 0).sum(),
    ).reset_index().sort_values("entropy", ascending=False)
    df_res_["prefix_len"] = i
    df_res = pd.concat([df_res, df_res_], ignore_index=True)
    # print(f"Prefix {i}:")
    # display(df_res_)


In [26]:
df_plot = df_res.copy()

df_plot["dataset"] = df_plot["dataset"].map(datasets_names_map)
df_plot["dataset"] = pd.Categorical(df_plot["dataset"], categories=datasets_names_map.values())

p = (
    p9.ggplot(df_plot, p9.aes(x="dataset", y="entropy"))
    + p9.geom_col(p9.aes(fill="dataset"), alpha=0.5, color="gray", width=0.8)
    + p9.facet_wrap("prefix_len", ncol=1, scales="free_y", labeller=p9.labeller(prefix_len=lambda x: f"Prefix length = {x}"))
    + p9.theme_bw()
    + p9.scale_fill_brewer(type="qual", palette="Set1", direction=-1)
    + p9.labs(x="", y="Entropy")
    + p9.coord_flip()
    + p9.theme(
        # figure_size=(3.5, 6.8),
        legend_position="none", 
        text=p9.element_text(size=10),
    )
)
# p.show()

p.save("../runs/plots/datasets_query_entropy.pdf", width=2.65, height=6.8, dpi=300)
# p.save("../runs/plots/datasets_query_entropy.png", dpi=300)



In [27]:
df_plot = df_res.copy()

df_plot["dataset"] = df_plot["dataset"].map(datasets_names_map)
df_plot["dataset"] = pd.Categorical(df_plot["dataset"], categories=datasets_names_map.values())

p = (
    p9.ggplot(df_plot, p9.aes(x="dataset", y="n"))
    + p9.geom_col(p9.aes(fill="dataset"), alpha=0.5, color="gray", width=0.8)
    + p9.facet_wrap("prefix_len", ncol=1, scales="free_y", labeller=p9.labeller(prefix_len=lambda x: f"Prefix length = {x}"))
    + p9.theme_bw()
    + p9.labs(x="", y="Unique prefixes")
    + p9.scale_fill_brewer(type="qual", palette="Set1", direction=-1)
    + p9.scale_y_log10(labels=lambda l: [f"$10^{int(np.log10(v))}$" if v > 0 else "0" for v in l])
    + p9.coord_flip()
    + p9.theme(
        figure_size=(2.9, 6.2),
        legend_position="none",
        axis_text_y=p9.element_blank(),
        text=p9.element_text(size=10),
    #     legend_title=p9.element_blank(),
    )
)
# p.show()
p.save("../runs/plots/datasets_query_unique.pdf", width=2.0, height=6.8, dpi=300)



In [None]:
!cp ../runs/plots/datasets_query_entropy.pdf ../latex/plots/
!cp ../runs/plots/datasets_query_unique.pdf ../latex/plots/

In [42]:
from IPython.display import display

for i in range(1, 4):
    df_res = df_queries.groupby([f"prefix_{i}", "dataset"]).agg(
        count=("query", "count"),
    ).reset_index()
    df_res["rel_count"] = df_res["count"] / df_res.groupby("dataset")["count"].transform("sum")
    # top by dataset:
    display(
        (
            df_res
            .sort_values("rel_count", ascending=False)
            .groupby("dataset", as_index=False)
            .head(3)
            .sort_values(["dataset", "rel_count"], ascending=[True, False])
            .query("dataset not in ['meup', 'pres']")
        )
    )

Unnamed: 0,prefix_1,dataset,count,rel_count
21753,que,messirve,137235,0.197671
6034,como,messirve,52640,0.075822
9394,en,messirve,46986,0.067678
21754,que,miracl,648,0.230605
7016,cual,miracl,458,0.162989
6035,como,miracl,428,0.152313
21755,que,mmarco,232532,0.253633
6036,como,mmarco,65041,0.070943
7017,cual,mmarco,57491,0.062708
21757,que,sqac,3475,0.184742


Unnamed: 0,prefix_2,dataset,count,rel_count
93292,por que,messirve,37835,0.054497
53720,en que,messirve,33711,0.048557
90121,para que,messirve,24000,0.034569
23003,cual es,miracl,413,0.146975
23992,cuales son,miracl,192,0.068327
18865,como se,miracl,189,0.06726
103351,que es,mmarco,86740,0.094611
23004,cual es,mmarco,51585,0.056266
30960,cuanto tiempo,mmarco,26880,0.029319
23006,cual es,sqac,951,0.050558


Unnamed: 0,prefix_3,dataset,count,rel_count
268422,para que sirve,messirve,7918,0.011405
74927,cual es el,messirve,6664,0.009599
75117,cual es la,messirve,5927,0.008537
75118,cual es la,miracl,234,0.083274
74928,cual es el,miracl,179,0.063701
79133,cuales son las,miracl,98,0.034875
74929,cual es el,mmarco,27139,0.029602
75119,cual es la,mmarco,22320,0.024345
344094,que es un,mmarco,19378,0.021136
74931,cual es el,sqac,552,0.029346


In [43]:
q_words = ["quien", "quienes", "que", "cual", "cuales", "cuando", "donde", "cuanto", "cuanta", "cuantos", "cuantas", "por que", "como"]
df_queries["startswith_q"] = df_queries["clean_query"].str.startswith(tuple(q_words))

# who, what, which, when, where, how many, how much, why, how

In [44]:
df_queries.groupby("dataset")["startswith_q"].mean().sort_values(ascending=False).round(3)

dataset
miracl      0.920
sqac        0.822
messirve    0.655
mmarco      0.597
pres        0.365
meup        0.000
Name: startswith_q, dtype: float64

In [50]:
# Queries not starting with typical question words:
from IPython.display import display

df_tmp = df_queries.query("startswith_q == False & dataset == 'messirve'").copy()
for i in range(1, 4):
    df_res = df_tmp.groupby([f"prefix_{i}", "dataset"]).agg(
        count=("query", "count"),
    ).reset_index()
    df_res["rel_count"] = df_res["count"] / df_res.groupby("dataset")["count"].transform("sum")
    # top by dataset:
    display(
        (
            df_res
            .sort_values("rel_count", ascending=False)
            .groupby("dataset", as_index=False)
            .head(3)
            .sort_values(["dataset", "rel_count"], ascending=[True, False])
            .query("dataset not in ['meup', 'pres']")
        )
    )

Unnamed: 0,prefix_1,dataset,count,rel_count
1356,en,messirve,46986,0.196387
3313,porque,messirve,42046,0.175739
1108,de,messirve,38847,0.162369


Unnamed: 0,prefix_2,dataset,count,rel_count
3431,en que,messirve,33711,0.140902
7171,para que,messirve,24000,0.100313
2418,de que,messirve,21508,0.089897


Unnamed: 0,prefix_3,dataset,count,rel_count
18846,para que sirve,messirve,7918,0.033095
18556,para que es,messirve,4264,0.017822
10489,en que consiste,messirve,4000,0.016719


In [55]:
df_queries.head(2)

Unnamed: 0,id,query,split,dataset,clean_query,word_count,words,prefix_1,prefix_2,prefix_3,prefix_4,prefix_5,prefix_6,startswith_q,startswith_p
0,1769696#0,¿Cuáles son las principales plantas fanerógamas?,train,miracl,cuales son las principales plantas fanerogamas,6,"[cuales, son, las, principales, plantas, faner...",cuales,cuales son,cuales son las,cuales son las principales,cuales son las principales plantas,cuales son las principales plantas fanerogamas,True,True
1,919762#0,¿Qué significa Cuauhtémoc?,train,miracl,que significa cuauhtemoc,3,"[que, significa, cuauhtemoc]",que,que significa,que significa cuauhtemoc,que significa cuauhtemoc,que significa cuauhtemoc,que significa cuauhtemoc,True,True


In [58]:
prefixes = ["para que sirve", "para que es", "en que consiste"]

df_tmp = df_queries.copy()
mask = df_queries["prefix_3"].isin(prefixes)
df_tmp["flag"] = mask

df_res = df_tmp.groupby(["dataset"]).agg(
    count=("flag", "sum"),
    rel_count=("flag", "mean"),
).reset_index()
df_res["perc_count"] = df_res["rel_count"] * 100
df_res

Unnamed: 0,dataset,count,rel_count,perc_count
0,messirve,16182,0.023308,2.330834
1,meup,0,0.0,0.0
2,miracl,8,0.002847,0.284698
3,mmarco,1387,0.001513,0.151286
4,pres,0,0.0,0.0
5,sqac,15,0.000797,0.079745


In [45]:
# Queries not starting with seed prefixes:
import sys
sys.path.append("../scripts")


from google_questions.helpers import QUERY_PATTERNS

In [46]:
df_queries["startswith_p"] = df_queries["clean_query"].str.startswith(tuple(QUERY_PATTERNS))

In [47]:
# df_queries.query("startswith_p == False & dataset == 'messirve'") #["clean_query"].iloc[0]

In [48]:
from IPython.display import display

df_tmp = df_queries.query("startswith_p == False & dataset == 'messirve'").copy()
for i in range(1, 4):
    df_res = df_tmp.groupby([f"prefix_{i}", "dataset"]).agg(
        count=("query", "count"),
    ).reset_index()
    df_res["rel_count"] = df_res["count"] / df_res.groupby("dataset")["count"].transform("sum")
    # top by dataset:
    display(
        (
            df_res
            .sort_values("rel_count", ascending=False)
            .groupby("dataset", as_index=False)
            .head(10)
            .sort_values(["dataset", "rel_count"], ascending=[True, False])
            .query("dataset not in ['meup', 'pres']")
        )
    )

Unnamed: 0,prefix_1,dataset,count,rel_count
1371,en,messirve,1697,0.072007
25,a,messirve,1514,0.064242
3765,se,messirve,899,0.038147
3137,para,messirve,548,0.023253
1125,de,messirve,379,0.016082
3523,razones,messirve,352,0.014936
3322,por,messirve,304,0.012899
1349,el,messirve,270,0.011457
2494,los,messirve,247,0.010481
2761,motivos,messirve,246,0.010438


Unnamed: 0,prefix_2,dataset,count,rel_count
8407,se puede,messirve,658,0.02792
3289,en la,messirve,266,0.011287
268,a los,messirve,254,0.010778
8408,se pueden,messirve,225,0.009547
7986,razones por,messirve,186,0.007892
1411,cada cuanto,messirve,176,0.007468
329,a partir,messirve,166,0.007044
260,a las,messirve,141,0.005983
184,a cuanto,messirve,127,0.005389
5816,lugar donde,messirve,107,0.00454


Unnamed: 0,prefix_3,dataset,count,rel_count
562,a los cuantos,messirve,211,0.008953
674,a partir de,messirve,166,0.007044
524,a las cuantas,messirve,139,0.005898
13546,razones por las,messirve,91,0.003861
5567,en la biblia,messirve,77,0.003267
14401,segun la biblia,messirve,49,0.002079
6540,factores que inciden,messirve,45,0.001909
6541,factores que influyen,messirve,45,0.001909
14224,se puede embarazar,messirve,44,0.001867
14262,se puede morir,messirve,38,0.001612


---------------------------