In [1]:
import pandas as pd

# Show up to 100 columns when a wide frame is displayed.
pd.set_option("display.max_columns", 100)

# Load the qrels table for the google_questions run.
df_qrels = pd.read_csv("../runs/google_questions/qrels.csv")

In [2]:
# Preview the first two rows to check which columns are available.
df_qrels.head(2)

Unnamed: 0,country,id,query,docid,docid_text,query_date,answer_date,match_score,expanded_search,answer_type,split,id_country
0,mx,4,1 kg a cuantos miligramos equivale,1595#34,"1 ""kilogramo"" es equivalente a:",2024-04-10,2024-04-17,0.6829,False,feat_snip,train,
1,mx,5,1 kilo a cuantos miligramos equivale,1595#34,"1 ""kilogramo"" es equivalente a:",2024-04-10,2024-04-17,0.6829,False,feat_snip,train,


### Unigram counts

In [3]:
# Keep only the columns needed for the vocabulary statistics, then separate the
# aggregated "full" rows from the per-country rows.
df_tmp = df_qrels.loc[:, ["id", "country", "query"]].copy()
is_full = df_tmp["country"] == "full"
df_general = df_tmp[is_full].copy()
df_countries = df_tmp[~is_full].copy()

In [4]:
# Order the country codes with the "no_country" bucket first, then alphabetically.
unique_countries = df_countries["country"].unique()


def _no_country_first(code):
    """Sort key: (False, code) for "no_country", (True, code) for every other code."""
    return (code != "no_country", code)


sorted_countries = sorted(unique_countries, key=_no_country_first)

In [5]:
# "general" vocabulary should have all possible words in the queries
import unidecode


def _unigram_counts(queries: pd.Series) -> pd.Series:
    """Count whitespace-separated tokens over a Series of query strings.

    Each query is lower-cased, stripped of every character that is neither a
    word character nor whitespace, passed through ``unidecode.unidecode``, and
    split on whitespace; the resulting tokens are counted with ``value_counts``.
    """
    lowered = queries.str.lower()
    without_punct = lowered.str.replace(r'[^\w\s]', '', regex=True)
    transliterated = without_punct.apply(unidecode.unidecode)
    tokens = transliterated.str.split().explode()
    return tokens.value_counts()


# The "full" column is assigned first, so its words become the row index that
# the per-country columns are aligned to.
df_vocab = pd.DataFrame()
df_vocab["full"] = _unigram_counts(df_general["query"])

for country in sorted_countries:
    in_country = df_countries["country"] == country
    df_vocab[country] = _unigram_counts(df_countries.loc[in_country, "query"])

In [6]:
# Entries with no count in a given column are missing (NaN); treat them as zero
# occurrences and cast every column back to integer counts.
df_vocab = df_vocab.fillna(0).astype(int)

In [7]:
# Preview the first two rows of the count table (one row per word, one column
# per vocabulary: "full" plus each country).
df_vocab.head(2)

Unnamed: 0_level_0,full,no_country,ar,bo,cl,co,cr,cu,do,ec,es,gt,hn,mx,ni,pa,pe,pr,py,sv,us,uy,ve
query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
que,331791,214847,11144,12161,10652,12788,12512,10611,13886,14677,11390,11569,14434,16526,15115,13404,13077,13788,12290,13918,11396,10949,13617
de,173711,105743,6554,6127,6083,7551,6943,5366,6936,7861,7380,6069,7231,10045,8260,7396,7130,7403,6725,7655,6456,5822,7540


### Log odds ratio

In [8]:
# Log-odds ratio of each word in each country relative to the "full" vocabulary.
import numpy as np


def _log_odds(counts: pd.Series) -> pd.Series:
    """Return log(c / (total - c)) for every entry c of ``counts``."""
    return np.log(counts / (counts.sum() - counts))


# Add 0.1 to every count so that zero counts do not end up inside a log.
df_vocab_smoothed = df_vocab.copy() + 0.1

# The reference term is the same for every country, so compute it once.
full_log_odds = _log_odds(df_vocab_smoothed["full"])

df_lor = pd.DataFrame()
for country in sorted_countries:
    df_lor[country] = _log_odds(df_vocab_smoothed[country]) - full_log_odds
del df_vocab_smoothed

In [9]:
top_n = 10     # number of words kept per country
min_freq = 10  # minimum in-country count for a word to be considered

# For every country, rank the words that occur at least `min_freq` times in that
# country's queries by log-odds ratio and keep the `top_n` highest.
top_words_by_country = {}
for country in sorted_countries:
    # Use a boolean mask rather than interpolating the column name into a
    # `DataFrame.query` string: that breaks for column names that are Python
    # keywords or not valid identifiers (e.g. country codes such as "in" or "is").
    frequent_words = df_vocab.index[df_vocab[country] >= min_freq]
    ranked = df_lor.loc[frequent_words, country].sort_values(ascending=False)
    # Wrap the words in a Series with a fresh 0..k-1 index so that a country with
    # fewer than `top_n` qualifying words is padded with NaN instead of raising a
    # length-mismatch error on assignment.
    top_words_by_country[country] = pd.Series(ranked.head(top_n).index.to_numpy())

# One column per country (in `sorted_countries` order), one row per rank.
df_top_words = pd.DataFrame(top_words_by_country, columns=sorted_countries)


In [10]:
# Top words per country, one column per country; row 0 holds the word with the
# highest log-odds ratio.
df_top_words

Unnamed: 0,no_country,ar,bo,cl,co,cr,cu,do,ec,es,gt,hn,mx,ni,pa,pe,pr,py,sv,us,uy,ve
0,dicta,antula,samaipata,huilo,eliecer,puntarenas,maceo,enriquillo,montubios,croquetas,mazatenango,sula,maluma,masaya,delfia,boluarte,decadron,paraguaya,pital,taxes,artigas,camejo
1,servian,1982,bulo,pichilemu,gaitan,alajuela,cienfuegos,bani,veintimilla,declinaciones,monterrico,tegucigalpa,decanato,gueguense,panamena,quispe,scan,itaipu,parlacen,apocalypto,penarol,hipolita
2,elegancia,cuit,pagador,pasamos,covenas,bifonazol,baragua,balaguer,guayasamin,protestan,pacaya,morazan,redactaron,matagalpa,cortez,chiclayo,micoplasma,solano,siguanaba,millions,uruguaya,morrocoy
3,orificio,pampita,willka,apuros,nevados,guanacaste,monilia,luperon,proponen,supervivientes,tikal,copan,kilataje,sandino,rolo,benavides,coqui,plaqueta,lempa,contenedor,montevideo,avanzadora
4,kripton,renga,katari,chiloe,frailejones,rica,yate,dominicano,ecuatoriana,naiara,verapaz,lempira,kumbia,managua,martinelli,chimbote,baclofen,paraguay,fulcro,million,sambayon,ribas
5,interactua,rivadavia,evo,colocolo,caldas,hidroxizina,bayamo,altagracia,roldos,euskera,xincas,yojoa,checar,esteli,torrijos,pisco,mycoplasma,paraguayo,salvador,green,sapitos,cardenales
6,meterse,titila,yacuiba,tirita,uribe,cartago,granma,mirabal,yasuni,lorca,esquipulas,xiomara,refinerias,nicaragua,matador,lapadula,albuterol,amoniaco,butilbromuro,calefaccion,bodegas,gallegos
7,zonda,mitre,tumusla,pinera,cundinamarca,costa,pcc,bosch,matilde,mandaba,atitlan,motagua,hormiguea,provenian,panama,dina,azithromycin,mariscal,ollantay,card,uruguay,liberto
8,aprendizajes,peron,abya,temuco,cumbia,clorhidrato,habana,ameba,dinastias,agricultores,irtra,honduras,difieren,radiactividad,tejada,fujimori,neurontin,almeida,llanuras,mega,rescataron,bolivariana
9,tonalidad,dnu,plurinacional,talca,baloto,heredia,platt,prm,chimborazo,empenos,huehuetenango,controlaba,ocurrira,dario,lotto,piero,mofongo,criptomoneda,monsenor,toneladas,franquicias,lvbp


In [11]:
# LaTeX table with countries in rows and no column names: each row is the
# country code in \texttt{} followed by its first six top words.
df_table = df_top_words.drop("no_country", axis=1).head(6).T

print("\\begin{tabular}{l" + "c" * len(df_table.columns) + "}")
print("\\toprule")
for row_number, (i, row) in enumerate(df_table.iterrows()):
    # Rows are separated by \midrule, so emit it only *between* rows; printing it
    # after every row leaves a stray \midrule directly above \bottomrule.
    if row_number > 0:
        print("\\midrule")
    print(f"\\texttt{{{i}}}", end=" & ")
    # Tokens may contain "_" (it is a word character), which LaTeX rejects in
    # text mode unless escaped.
    print(" & ".join(word.replace("_", "\\_") for word in row), end=" \\\\ \n")
print("\\bottomrule")
print("\\end{tabular}")

\begin{tabular}{lcccccc}
\toprule
\texttt{ar} & antula & 1982 & cuit & pampita & renga & rivadavia \\ 
\midrule
\texttt{bo} & samaipata & bulo & pagador & willka & katari & evo \\ 
\midrule
\texttt{cl} & huilo & pichilemu & pasamos & apuros & chiloe & colocolo \\ 
\midrule
\texttt{co} & eliecer & gaitan & covenas & nevados & frailejones & caldas \\ 
\midrule
\texttt{cr} & puntarenas & alajuela & bifonazol & guanacaste & rica & hidroxizina \\ 
\midrule
\texttt{cu} & maceo & cienfuegos & baragua & monilia & yate & bayamo \\ 
\midrule
\texttt{do} & enriquillo & bani & balaguer & luperon & dominicano & altagracia \\ 
\midrule
\texttt{ec} & montubios & veintimilla & guayasamin & proponen & ecuatoriana & roldos \\ 
\midrule
\texttt{es} & croquetas & declinaciones & protestan & supervivientes & naiara & euskera \\ 
\midrule
\texttt{gt} & mazatenango & monterrico & pacaya & tikal & verapaz & xincas \\ 
\midrule
\texttt{hn} & sula & tegucigalpa & morazan & copan & lempira & yojoa \\ 
\midrule
\

### query-doc examples

For each country, one sampled query–document pair whose query contains that country's top log-odds word.

In [None]:
# The passage text in `docid_text` has no title, so prepend the article title
# taken from the corpus: doc = "<title>. <docid_text>".
from datasets import load_dataset

ds_docs = load_dataset("../data/eswiki_20240401_corpus")
df_docs = ds_docs["train"].to_pandas()

# Make the cell safe to re-run: without this, a second execution merges "title"
# again, yielding `title_x` / `title_y` and a KeyError on "title" below.
df_qrels = df_qrels.drop(columns=["title", "doc"], errors="ignore")

# Keep a single title per docid; duplicated docids on the right-hand side of a
# left merge would duplicate qrels rows.
doc_titles = df_docs[["docid", "title"]].drop_duplicates(subset="docid")

df_qrels = df_qrels.merge(doc_titles, on="docid", how="left")
# NOTE(review): rows whose docid is missing from the corpus get a NaN title and
# therefore a NaN "doc" — confirm that cannot happen for this qrels file.
df_qrels["doc"] = df_qrels["title"] + ". " + df_qrels["docid_text"]


Resolving data files:   0%|          | 0/29 [00:00<?, ?it/s]

NameError: name 'df_docs' is not defined

In [15]:
import re

# For each country, sample one query-doc pair whose query contains that
# country's top log-odds word, truncating long documents for display.
max_doc_words = 30  # documents longer than this are cut and suffixed with " [...]"

# Row 0 of df_top_words holds the highest-ranked word of every country.
country2word = df_top_words.head(1).T[0].to_dict()
np.random.seed(34)  # fixes which row `.sample(1)` picks
query_doc_samples = []
for country, word in country2word.items():
    df_tmp = df_qrels[df_qrels['country'] == country].copy()
    # `word` is a literal token, not a pattern, so match it with regex=False;
    # na=False keeps a missing query from producing a NaN mask entry.
    # NOTE(review): the vocabulary words were accent-stripped but the queries
    # here are raw, so accented spellings of `word` are not matched — confirm
    # this is intended.
    mask = df_tmp['query'].str.contains(word, case=False, regex=False, na=False)
    df_sample = df_tmp[mask].sample(1)
    query = df_sample['query'].values[0]
    doc = df_sample['doc'].values[0]
    # Count actual words. Counting whitespace runs instead gives (words - 1), so
    # a document of exactly max_doc_words + 1 words would escape truncation.
    doc_words = doc.split()
    if len(doc_words) > max_doc_words:
        # truncate doc:
        doc = ' '.join(doc_words[:max_doc_words]) + ' [...]'
    query_doc_samples.append({"country": country, "query": query, "doc": doc, "word": word})

In [16]:
# Tabulate the sampled pairs and leave out the "no_country" bucket.
df_query_doc_samples = pd.DataFrame(query_doc_samples).loc[
    lambda samples: samples["country"] != "no_country"
]

In [17]:
# Replacement for every character that LaTeX treats specially in text mode.
_LATEX_ESCAPES = str.maketrans({
    "\\": r"\textbackslash{}",
    "&": r"\&",
    "%": r"\%",
    "$": r"\$",
    "#": r"\#",
    "_": r"\_",
    "{": r"\{",
    "}": r"\}",
    "~": r"\textasciitilde{}",
    "^": r"\textasciicircum{}",
})


def safe_latex_string(s: str) -> str:
    """Escape LaTeX special characters so ``s`` can be placed in a table cell.

    Handles ``_``, ``&`` and ``%`` as well as ``#``, ``$``, ``{``, ``}``, ``~``,
    ``^`` and the backslash. The substitution is done in a single pass with
    ``str.translate``, so characters introduced by one replacement (for example
    the braces of ``\\textbackslash{}``) are never escaped a second time.

    Args:
        s: Plain (non-LaTeX) text.

    Returns:
        The text with every special character replaced by its LaTeX escape.
    """
    return s.translate(_LATEX_ESCAPES)

In [18]:
# List the raw sampled queries; these exact strings are the keys of the
# `translations` dict defined in the next cell.
df_query_doc_samples["query"].values

array(['quien era mama antula',
       'a cuantos kilometros esta samaipata de santa cruz',
       'en qué región queda huilo huilo',
       'donde murio jorge eliecer gaitan', 'que visitar en puntarenas',
       'que hizo antonio maceo por cuba', 'qué hizo enriquillo',
       'de que region son los montubios',
       'de qué se pueden hacer las croquetas', 'dónde queda mazatenango',
       'quien y en que año se fundo la villa de san pedro sula',
       'maluma carin leon - según quién versuri romana',
       'cuándo hizo erupción el volcán masaya',
       'que año nacio delfia cortez', 'quién es dina boluarte',
       'que hace el decadron inyectable',
       'cómo surgió la nación paraguaya',
       'a cuántos grados está el pital chalatenango',
       'cuando debo declarar taxes', 'de que trabajo artigas',
       'que hizo pedro camejo'], dtype=object)

In [19]:
# GPT translations:
translations = {
    "quien era mama antula": "who was Mama Antula?",
    "a cuantos kilometros esta samaipata de santa cruz": "how many kilometers is Samaipata from Santa Cruz?",
    "en qué región queda huilo huilo": "in which region is Huilo Huilo located?",
    "donde murio jorge eliecer gaitan": "where did Jorge Eliécer Gaitán die?",
    "que visitar en puntarenas": "what to visit in Puntarenas?",
    "que hizo antonio maceo por cuba": "what did Antonio Maceo do for Cuba?",
    "qué hizo enriquillo": "what did Enriquillo do?",
    "de que region son los montubios": "from which region are the Montubios?",
    "de qué se pueden hacer las croquetas": "what can croquettes be made from?",
    "dónde queda mazatenango": "where is Mazatenango?",
    "quien y en que año se fundo la villa de san pedro sula": "who and in which year was the town of San Pedro Sula founded?",
    "maluma carin leon - según quién versuri romana": "Maluma Carin León - according to whom, Roman lyrics?",
    "cuándo hizo erupción el volcán masaya": "when did Masaya Volcano erupt?",
    "que año nacio delfia cortez": "in what year was Delfia Cortez born?",
    "quién es dina boluarte": "who is Dina Boluarte?",
    "que hace el decadron inyectable": "what does injectable Decadron do?",
    "cómo surgió la nación paraguaya": "how did the Paraguayan nation arise?",
    "a cuántos grados está el pital chalatenango": "how many degrees is El Pital in Chalatenango?",
    "cuando debo declarar taxes": "when should I file taxes?",
    "de que trabajo artigas": "what was Artigas' profession?",
    "que hizo pedro camejo": "what did Pedro Camejo do?"
}


In [20]:
# Print the sampled pairs as LaTeX table rows with the columns:
# country, query (+ English translation), relevant document.
df_table = df_query_doc_samples[["country", "query", "doc"]].copy()


def _query_with_translation(raw_query: str) -> str:
    """Return the LaTeX-escaped query, followed by its translation in emphasis if one exists."""
    escaped_query = safe_latex_string(raw_query)
    if raw_query not in translations:
        return escaped_query
    escaped_translation = safe_latex_string(translations[raw_query])
    return f"{escaped_query} (\\emph{{{escaped_translation}}})"


# `translations` is keyed by the raw query, so look the translation up *before*
# escaping; escaping first makes any query that contains an escaped character
# miss its entry.
df_table["query"] = df_table["query"].map(_query_with_translation)
df_table["country"] = df_table["country"].map(safe_latex_string)
df_table["doc"] = df_table["doc"].map(safe_latex_string)
df_table.columns = ["Country", "Query", "Relevant document"]

# Header row. The enclosing tabular environment is not emitted by this cell.
print("\\toprule")
print(" & ".join([f"\\textbf{{{col}}}" for col in df_table.columns]) + " \\\\")
print("\\midrule")

df_table = df_table.set_index("Country")

for row_number, (i, row) in enumerate(df_table.iterrows()):
    # The header already ends with \midrule, so add the separator only between
    # data rows; otherwise the first row is preceded by two consecutive rules.
    if row_number > 0:
        print("\\midrule")
    print(f"\\texttt{{{i}}}", end=" & ")
    print(" & ".join(row), end=" \\\\ \n")
print("\\bottomrule")


\toprule
\textbf{Country} & \textbf{Query} & \textbf{Relevant document} \\
\midrule
\midrule
\texttt{ar} & quien era mama antula (\emph{who was Mama Antula?}) & María Antonia de Paz y Figueroa. Mama Antula se convirtió en la novena persona de nacionalidad argentina en ser beatificada. Esto ocurrió entre casi medio centenar de causas para canonizar [...] \\ 
\midrule
\texttt{bo} & a cuantos kilometros esta samaipata de santa cruz (\emph{how many kilometers is Samaipata from Santa Cruz?}) & Samaipata. La ciudad de Samaipata se encuentra a 119 km por carretera al suroeste de la capital departamental, Santa Cruz de la Sierra. Por Samaipata discurre la ruta troncal Ruta [...] \\ 
\midrule
\texttt{cl} & en qué región queda huilo huilo (\emph{in which region is Huilo Huilo located?}) & Reserva biológica Huilo Huilo. La Reserva Biológica Huilo Huilo es un área natural protegida privada que se ubica en medio de Los Andes, a 860 km al sur de Santiago [...] \\ 
\midrule
\texttt{co} & donde murio 