## Carga de datos

In [1]:
# prefix that is concatenated with the names in `d` when calling datasets.load_dataset
repo = 'spanish-ir/'
# names appended to `repo`: 'dataset' is the query dataset, 'corpus' the document corpus
d = { 'dataset': 'messirve', 'corpus': 'eswiki_20240401_corpus' }

In [2]:
import os.path
import datasets

# None means "not loaded by this cell"; a later cell falls back to load_from_disk
dataset, corpus = None, None

# only call load_dataset when there is no local 'dataset' directory
if not os.path.isdir('dataset'):
    revision = '1.2'
    country = 'full'
    dataset = datasets.load_dataset(repo + d['dataset'],
        revision=revision, country=country)

# same check for the corpus: skipped when a local 'corpus' directory exists
if not os.path.isdir('corpus'):
    corpus = datasets.load_dataset(repo + d['corpus'])

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# reuse what the previous cell loaded; when a name is still None,
# read it from the local 'dataset' / 'corpus' directory instead
dataset = dataset if dataset is not None else datasets.load_from_disk('dataset')
# convert the 'train' and 'test' splits to pandas
train, test = dataset['train'].to_pandas(), dataset['test'].to_pandas()

corpus = corpus if corpus is not None else datasets.load_from_disk('corpus')
# keep only the 'corpus' split, converted to pandas
# NOTE(review): `corpus` is rebound from the loaded object to the converted frame,
# so re-running this cell alone would index the frame with 'corpus' — presumably a KeyError
corpus = corpus['corpus'].to_pandas()

In [4]:
# summary of the training split (columns, non-null counts, dtypes, memory usage)
train.info()

<class 'pandas.DataFrame'>
RangeIndex: 766296 entries, 0 to 766295
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               766296 non-null  int64  
 1   query            766296 non-null  str    
 2   docid            766296 non-null  str    
 3   docid_text       766296 non-null  str    
 4   docid_title      766296 non-null  str    
 5   query_date       766296 non-null  object 
 6   answer_date      766296 non-null  object 
 7   match_score      766296 non-null  float32
 8   expanded_search  766296 non-null  bool   
 9   answer_type      766296 non-null  str    
 10  id_country       766296 non-null  float64
dtypes: bool(1), float32(1), float64(1), int64(1), object(2), str(5)
memory usage: 463.5+ MB


In [5]:
# preview the first three rows of the training split
train.head(3)

Unnamed: 0,id,query,docid,docid_text,docid_title,query_date,answer_date,match_score,expanded_search,answer_type,id_country
0,7397857,cuántos inning se juegan en el kickingball,1869086#17,El juego de kitball tiene 6 entradas y cada un...,Kickball,2024-04-07,2024-05-06,0.8829,False,feat_snip,976827.0
1,7397858,cómo beneficia la biodiversidad a la salud de...,16208#36,La biodiversidad es importante ya que cada esp...,Biodiversidad,2024-04-06,2024-05-09,1.0,False,feat_snip,976828.0
2,7397861,quienes somos,3328953#1,"Wikipedia es una enciclopedia libre, políglota...",Wikipedia,2024-05-18,2024-06-24,1.0,False,feat_snip,6328791.0


In [6]:
# summary of the test split (columns, non-null counts, dtypes, memory usage)
test.info()

<class 'pandas.DataFrame'>
RangeIndex: 174078 entries, 0 to 174077
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               174078 non-null  int64  
 1   query            174078 non-null  str    
 2   docid            174078 non-null  str    
 3   docid_text       174078 non-null  str    
 4   docid_title      174078 non-null  str    
 5   query_date       174078 non-null  object 
 6   answer_date      174078 non-null  object 
 7   match_score      174078 non-null  float32
 8   expanded_search  174078 non-null  bool   
 9   answer_type      174078 non-null  str    
 10  id_country       174078 non-null  float64
dtypes: bool(1), float32(1), float64(1), int64(1), object(2), str(5)
memory usage: 105.9+ MB


In [7]:
# preview the first three rows of the test split
test.head(3)

Unnamed: 0,id,query,docid,docid_text,docid_title,query_date,answer_date,match_score,expanded_search,answer_type,id_country
0,7397859,en grecia quién aplico la democracia radical,87525#2,Efialtes es considerado por muchos historiador...,Efialtes de Atenas,2024-04-07,2024-05-06,1.0,False,feat_snip,976830.0
1,7397860,que conoces de la familia arduino,1337914#0,"Arduino es una compañía de desarrollo de ""soft...",Arduino,2024-05-20,2024-06-20,1.0,False,feat_snip,5870869.0
2,7397866,1 arroba cuantas kilogramos tiene,77666#7,En Bolivia y Perú hojas de coca se comercializ...,Arroba (unidad de masa),2024-04-09,2024-05-07,1.0,False,feat_snip,976874.0


In [8]:
# summary of the full corpus frame (columns, dtypes, memory usage)
corpus.info()

<class 'pandas.DataFrame'>
RangeIndex: 14047759 entries, 0 to 14047758
Data columns (total 3 columns):
 #   Column  Dtype
---  ------  -----
 0   docid   str  
 1   title   str  
 2   text    str  
dtypes: str(3)
memory usage: 5.3 GB


In [9]:
# preview the first three corpus rows
corpus.head(3)

Unnamed: 0,docid,title,text
0,7#0,Andorra,"Para otros usos de este término, véase Andorra..."
1,7#1,Andorra,"Andorra, oficialmente Principado de Andorra ()..."
2,7#2,Andorra,"Con sus 468 km² de extensión territorial, Ando..."


In [10]:
# delete the dataset unused columns to save memory.
# The frames are reassigned (no inplace=True) and missing labels are ignored,
# so re-running this cell after the columns are gone is a no-op instead of a KeyError.
columns = ['id', 'docid_text', 'query_date', 'answer_date', 'expanded_search', 'answer_type', 'id_country']
train = train.drop(columns=columns, errors='ignore')
test = test.drop(columns=columns, errors='ignore')

In [11]:
import gc

# gather every docid referenced by a train or test row
docids = set(train['docid']).union(test['docid'])
# keep just the corpus rows whose docid is referenced (as an independent copy)
subcorpus = corpus.loc[corpus['docid'].isin(docids)].copy()

# the full corpus is not used below: drop the name and trigger a collection
del corpus
gc.collect()

43

### Preprocesado

In [12]:
import re
import nltk
from nltk.corpus import stopwords

# fetch the NLTK 'stopwords' resource (quiet=True is passed to silence the download)
nltk.download('stopwords', quiet=True)
# Spanish stopwords as a set; preprocess_tfidf tests word membership against it
stops = set(stopwords.words('spanish'))

def preprocess_tfidf(text: str) -> str:
    """ Applies strong preprocessing to a text.

    Lowercases the text, removes punctuation, collapses whitespace and drops
    every word contained in the module-level ``stops`` set.

    Args:
        text: raw text. Any non-string value (e.g. None or NaN coming from a
            missing cell) is treated as empty text.

    Returns:
        The remaining words joined by single spaces ('' when nothing is left).
    """
    # missing values have no .lower(); map them to an empty document instead of crashing
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # remove punctuation signs (anything that is neither a word character nor whitespace)
    text = re.sub(r'[^\w\s]', '', text)
    # remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    # remove stopwords
    words = [w for w in text.split() if w not in stops]
    return ' '.join(words)


def preprocess_embeddings(text: str) -> str:
    """ Applies light preprocessing to a text.

    Lowercases the text, removes punctuation and collapses every whitespace
    run into a single space. Stopwords are kept.

    Args:
        text: raw text. Any non-string value (e.g. None or NaN coming from a
            missing cell) is treated as empty text.

    Returns:
        The cleaned text ('' for a missing value).
    """
    # missing values have no .lower(); map them to an empty document instead of crashing
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # remove punctuation signs (anything that is neither a word character nor whitespace)
    text = re.sub(r'[^\w\s]', '', text)
    # remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    return text

In [13]:
from joblib import Parallel, delayed

def parallel_apply(series, f):
    """ Applies ``f`` to every element of ``series`` through joblib.

    One delayed call is built per element and the whole batch is handed to a
    ``Parallel`` runner created with ``n_jobs=-1``; its result is returned as is.
    """
    tasks = (delayed(f)(item) for item in series)
    runner = Parallel(n_jobs=-1)
    return runner(tasks)

In [14]:
# two cleaned variants are added for each text column, on both splits:
#   <column>_raw   -> light preprocessing, for embeddings
#   <column>_tfidf -> strong preprocessing, for tf-idf
variants = (('_raw', preprocess_embeddings), ('_tfidf', preprocess_tfidf))
for column in ['query', 'docid_title']:
    for suffix, preprocess in variants:
        for frame in (train, test):
            frame[column + suffix] = parallel_apply(frame[column], preprocess)

In [15]:
for column in ['text', 'title']:
    # same source column, two cleaned variants: light (embeddings) and strong (tf-idf)
    source = subcorpus[column]
    subcorpus[column + '_raw'] = parallel_apply(source, preprocess_embeddings)
    subcorpus[column + '_tfidf'] = parallel_apply(source, preprocess_tfidf)

## TF-IDF

### Vectorizado de documentos y consultas

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas

vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),     # unigrams and bigrams
    sublinear_tf=True,
    max_features=50_000,    # cap on the vocabulary size
    norm='l2',
    max_df=0.85,
    min_df=2,
)

# vectorize the corpus using tf-idf: one document = preprocessed title + text.
# Both parts are null-filled so a missing value cannot turn the sum into NaN.
corpus_combined = subcorpus['title_tfidf'].fillna('') + ' ' + subcorpus['text_tfidf'].fillna('')
# CSR (row-oriented) layout: cosine_similarity normalises rows, so a CSC matrix
# would have to be converted again on every ranking batch
corpus_tfidf = vectorizer.fit_transform(corpus_combined).tocsr()

# vectorize the dataset using the same vectorizer (train queries first, then test)
queries = pandas.concat([train['query_tfidf'], test['query_tfidf']])
# CSR as well: the ranking loop slices this matrix by rows, which is slow on CSC
queries_tfidf = vectorizer.transform(queries).tocsr()

In [17]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def rank_batch(query_matrix, corpus_tfidf, top_k=100):
    """ Gets the top k most similar documents for each query.

    Args:
        query_matrix: tf-idf matrix with one row per query. Expected to be
            sparse, since each score row is read through .nnz/.indices/.data.
        corpus_tfidf: tf-idf matrix with one row per document (sparse as well).
        top_k: maximum number of documents returned per query.

    Returns:
        A list with one array per query, holding corpus row indices ordered by
        decreasing cosine similarity. An array is shorter than ``top_k``
        (possibly empty) when fewer similarities are stored for that query.
    """
    # obtain similarity (query - corpus), kept sparse
    scores = cosine_similarity(
        query_matrix,
        corpus_tfidf,
        dense_output=False)
    results = []

    for i in range(scores.shape[0]):
        # for each query, get the top k results
        row = scores[i]
        # only the stored similarities are candidates
        k = min(top_k, row.nnz)
        if k <= 0:
            # nothing to rank (e.g. a query left empty by preprocessing);
            # argpartition cannot be asked for 0 elements of an empty array
            results.append(row.indices[:0])
            continue
        # get the top k indices, then sort just those by score
        top_idx = np.argpartition(row.data, -k)[-k:]
        doc_idx = row.indices[top_idx]
        doc_scores = row.data[top_idx]
        order = np.argsort(-doc_scores)
        results.append(doc_idx[order])
    return results

In [18]:
if os.path.isfile('rankings_tfidf.npy'):
    # a cached ranking file exists: load the tf-idf rankings from disk
    rankings_tfidf = np.load('rankings_tfidf.npy')
else:
    batch_size = 256
    rankings_tfidf = []
    top_k = 100

    # rank the queries with tf-idf, one slice of batch_size rows at a time
    for start in range(0, queries_tfidf.shape[0], batch_size):
        batch = queries_tfidf[start:start + batch_size]
        rankings_tfidf.extend(rank_batch(batch, corpus_tfidf, top_k=top_k))

    # rankings shorter than top_k are right-padded with -1 before saving.
    # Note: rankings_tfidf itself stays the unpadded list in this branch.
    normalized_ranking = [
        np.pad(ranking, (0, top_k - ranking.shape[0]), constant_values=-1)
        if ranking.shape[0] < top_k else ranking
        for ranking in rankings_tfidf
    ]
    np.save('rankings_tfidf.npy', np.array(normalized_ranking))

## Embeddings

In [19]:
# text to embed per document: lightly preprocessed title and text joined by a space
# (a missing title is replaced by '' before concatenating)
corpus_combined_emb = subcorpus['title_raw'].fillna('') + ' ' + subcorpus['text_raw']

In [73]:
from sentence_transformers import SentenceTransformer
# sentence-embedding model used below to encode both the corpus and the queries
model = SentenceTransformer('hiiamsid/sentence_similarity_spanish_es')

Loading weights: 100%|██████████| 199/199 [00:00<00:00, 1286.82it/s, Materializing param=pooler.dense.weight]                               
[1mBertModel LOAD REPORT[0m from: hiiamsid/sentence_similarity_spanish_es
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [None]:
if os.path.isfile('corpus_embeddings.npy'):
    # cached embeddings exist: load the corpus embeddings from disk
    corpus_embeddings = np.load('corpus_embeddings.npy')
else:
    # encode every combined document with the sentence transformer, then cache the result
    corpus_embeddings = model.encode(
        corpus_combined_emb.tolist(),
        batch_size=64,
        show_progress_bar=True,
        normalize_embeddings=True,
    )
    np.save('corpus_embeddings.npy', corpus_embeddings)

In [22]:
# lightly preprocessed queries as one list: all train queries first, then the test ones
all_queries = pandas.concat([train['query_raw'], test['query_raw']]).tolist()

In [23]:
if os.path.isfile('query_embeddings.npy'):
    # cached embeddings exist: load the query embeddings from disk
    query_embeddings = np.load('query_embeddings.npy')
else:
    # encode the queries with the sentence transformer, then cache the result
    query_embeddings = model.encode(
        all_queries,
        batch_size=64,
        show_progress_bar=True,
        normalize_embeddings=True,
    )
    np.save('query_embeddings.npy', query_embeddings)

## Obtener similitud y rankings

In [61]:
def rank_batch_embeddings(query_emb, corpus_emb, top_k=100):
    """ Gets the top k most similar documents for each query using embeddings.

    Args:
        query_emb: 2-D array, one embedding per row (one row per query).
        corpus_emb: 2-D array, one embedding per row (one row per document).
        top_k: maximum number of documents returned per query; it is capped
            at the number of documents in ``corpus_emb``.

    Returns:
        A list with one index array per query: rows of ``corpus_emb`` ordered
        by decreasing dot product with the query.
    """
    scores = query_emb @ corpus_emb.T
    # never ask argpartition for more documents than the corpus holds
    k = min(top_k, scores.shape[1])
    rankings = []

    for row in scores:
        if k <= 0:
            # a slice [-0:] would select everything, so handle "no results" explicitly
            rankings.append(np.empty(0, dtype=np.intp))
            continue
        # pick the k best candidates (unordered), then sort only those by score
        part_idx = np.argpartition(row, -k)[-k:]
        top_scores = row[part_idx]
        sorted_order = np.argsort(-top_scores)
        rankings.append(part_idx[sorted_order])
    return rankings

In [62]:
if not os.path.isfile('embedding_rankings.npy'):
    batch_size = 256
    embedding_rankings = []

    # rank the queries using embeddings & save the results to disk
    for i in range(0, query_embeddings.shape[0], batch_size):
        batch = query_embeddings[i:i+batch_size]
        batch_rankings = rank_batch_embeddings(batch, corpus_embeddings)
        embedding_rankings.extend(batch_rankings)
    # every ranking has the same length, so this is a regular 2-D index array
    np.save('embedding_rankings.npy', np.array(embedding_rankings))
else:
    # load the embedding rankings from disk.
    # The file written above is a plain numeric array, so pickle support is not
    # needed; keeping it disabled prevents a tampered file from running code on load.
    embedding_rankings = np.load('embedding_rankings.npy')

In [63]:
# positional lookup used to turn ranking indices into docids: position in subcorpus -> docid
idx_to_docid = subcorpus['docid'].values

In [64]:
# translate every tf-idf ranking from corpus positions to docids.
# Rankings loaded from disk are right-padded with -1; a negative index would
# silently select the LAST corpus document, so padding is mapped to None instead
# (None never equals a real docid, so padded slots cannot count as hits).
rankings_tdidf_final = []
for query_ranking in rankings_tfidf:
    top_docids = [idx_to_docid[idx] if idx >= 0 else None
        for idx in query_ranking]
    rankings_tdidf_final.append(top_docids)

In [65]:
# translate every embedding ranking from corpus positions to docids
rankings_emb_final = [
    [idx_to_docid[idx] for idx in query_indices]
    for query_indices in embedding_rankings
]

## Evaluar

In [66]:
import math

def evaluate(y_true, rankings, k_values=(1, 5, 10)):
    """ Evaluates an IR system in which every query has one relevant document.

    Args:
        y_true: sequence with the relevant document id of each query.
        rankings: iterable with the ranked document ids predicted for each
            query, best first, aligned with ``y_true``.
        k_values: cut-offs at which the metrics are computed.

    Returns:
        A one-row pandas DataFrame with the columns ``Precision@k``,
        ``Recall@k`` and ``nDCG@k`` for every k, averaged over
        ``len(y_true)`` queries (all 0.0 when ``y_true`` is empty).
    """
    k_values = list(k_values)
    metrics_results = {k: {'precision': 0.0, 'recall': 0.0, 'ndcg': 0.0} for k in k_values}
    n_queries = len(y_true)
    # positions beyond the largest cut-off can never score
    max_k = max(k_values) if k_values else 0

    for true_doc_id, predicted_docs in zip(y_true, rankings):
        # 1-based position of the FIRST occurrence of the relevant document;
        # a later duplicate must not push the rank down
        rank = float('inf')
        for position, doc in enumerate(predicted_docs, start=1):
            if position > max_k:
                break
            if doc == true_doc_id:
                rank = position
                break

        for k in k_values:
            if rank <= k:
                k_metric = metrics_results[k]
                k_metric['precision'] += 1.0 / k
                k_metric['recall'] += 1.0
                # single relevant document => ideal DCG is 1, so DCG is already nDCG
                k_metric['ndcg'] += 1.0 / math.log2(rank + 1)

    # avoid dividing by zero when there are no queries
    denominator = n_queries if n_queries else 1
    final_metrics = {}
    for k in k_values:
        k_metric = metrics_results[k]
        final_metrics[f'Precision@{k}'] = k_metric['precision'] / denominator
        final_metrics[f'Recall@{k}'] = k_metric['recall'] / denominator
        final_metrics[f'nDCG@{k}'] = k_metric['ndcg'] / denominator
    return pandas.DataFrame([final_metrics])

In [67]:
# relevant docid of every test query, in test row order
y_true = test['docid'].tolist()
# number of train queries; used to skip the train part of the concatenated rankings
n_train = len(train)

In [68]:
# queries were vectorized as train followed by test, so the test rankings start at n_train
rankings_tfidf_test = rankings_tdidf_final[n_train:]
# tf-idf metrics on the test queries
evaluate(y_true, rankings_tfidf_test)

Unnamed: 0,Precision@1,Recall@1,nDCG@1,Precision@5,Recall@5,nDCG@5,Precision@10,Recall@10,nDCG@10
0,0.100168,0.100168,0.100168,0.051695,0.258476,0.181666,0.034316,0.343157,0.209103


In [69]:
# queries were encoded as train followed by test, so the test rankings start at n_train
rankings_emb_test = rankings_emb_final[n_train:]
# embedding metrics on the test queries
evaluate(y_true, rankings_emb_test)

Unnamed: 0,Precision@1,Recall@1,nDCG@1,Precision@5,Recall@5,nDCG@5,Precision@10,Recall@10,nDCG@10
0,0.174945,0.174945,0.174945,0.069731,0.348654,0.266843,0.041863,0.418628,0.289532


## 
