# In [8]:
from pathlib import Path
from typing import List

from gensim import corpora, similarities
from gensim.models import TfidfModel

from documents import load_docs, Document

# In [9]:
# Load the document collection to be indexed from the raw-text data directory.
all_docs = load_docs(Path("./data/docs-raw-texts"))

# In [10]:
import pandas as pd


class GensimSearch:
    """TF-IDF similarity search over a fixed collection of documents.

    The constructor builds a gensim dictionary, a bag-of-words corpus, a
    TF-IDF model and a dense similarity index from the documents it is given.
    Each document is passed directly to ``Dictionary.doc2bow`` and must expose
    a ``name`` attribute, which is used to label the search results.
    """

    def __init__(self, docs: List[Document]):
        """Index ``docs`` for similarity search.

        Args:
            docs: Documents to index. Row ``i`` of the similarity index
                corresponds to ``docs[i]``.
        """
        self.docs = docs
        # Build every structure from the ``docs`` argument (not from a
        # module-level variable) so the index rows always line up with
        # ``self.docs`` when results are assembled in ``search``.
        self.dictionary = corpora.Dictionary(docs)
        self.corpus = [self.dictionary.doc2bow(doc) for doc in docs]
        self.model = TfidfModel(self.corpus)
        self.index = similarities.MatrixSimilarity(self.model[self.corpus])

    def search(
        self, query_document: Document, min_similarity: float = 0.0
    ) -> pd.DataFrame:
        """Rank the indexed documents by similarity to ``query_document``.

        Args:
            query_document: Document to use as the query.
            min_similarity: Exclusive lower bound; only rows whose similarity
                is strictly greater than this value are returned.

        Returns:
            A DataFrame indexed by document name with the columns
            ``similarity`` and ``doc``, sorted by similarity in descending
            order.
        """
        query_bow = self.dictionary.doc2bow(query_document)
        query_tfidf = self.model[query_bow]
        sims = self.index[query_tfidf]

        # One row per indexed document, labelled with the document name.
        results = pd.DataFrame(
            {
                'similarity': sims,
                'doc': self.docs,
            },
            index=[doc.name for doc in self.docs],
        )

        # Most similar documents first.
        results = results.sort_values(by='similarity', ascending=False)

        # Keep only the rows strictly above the similarity threshold.
        return results[results['similarity'] > min_similarity]

    def evaluate_search(self, queries: List[Document], output_path: Path) -> None:
        """Run every query and write the ranked results to ``output_path``.

        One line is written per query, in the form
        ``<query name>\\t<doc name>:<similarity>,<doc name>:<similarity>,...``.
        The file is overwritten if it already exists.

        Args:
            queries: Query documents to evaluate.
            output_path: Destination file; any path accepted by ``open``
                works.
        """
        # Explicit UTF-8 so names with non-ASCII characters are written the
        # same way on every platform instead of depending on the locale.
        with open(output_path, 'w', encoding='utf-8') as output_file:
            for query in queries:
                relevant_docs = self.search(query_document=query)
                result_texts = [
                    f'{doc_name}:{row.similarity}'
                    for doc_name, row in relevant_docs.iterrows()
                ]
                output_file.write(f"{query.name}\t{','.join(result_texts)}\n")
    

# In [11]:
# Build the TF-IDF search index over the loaded document collection.
gensim_search = GensimSearch(docs=all_docs)

# In [12]:
# Load the query documents and write one ranked result line per query.
all_queries = load_docs(Path("./data/queries-raw-texts"))
# ``evaluate_search`` declares ``output_path`` as a ``Path``, so pass one
# rather than a bare string.
gensim_search.evaluate_search(
    all_queries,
    output_path=Path('./data/GESIM-consultas_resultados'),
)