# Sistema de Recuperación de Información

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_similarity
from tabulate import tabulate

In [2]:
# Helper that loads the precomputed data from disk
def load_precomputed_data(file_path):
    """Read a pickled object from ``file_path`` and return it.

    Parameters
    ----------
    file_path : str or path-like
        Location of the pickle file; it is opened in binary read mode.

    Returns
    -------
    object
        Whatever object ``pickle.load`` reconstructs from the file.

    Notes
    -----
    Unpickling can execute arbitrary code, so only load files that come
    from a trusted source.
    """
    with open(file_path, 'rb') as pickle_handle:
        return pickle.load(pickle_handle)

In [3]:
# Path to the precomputed data file.
# The default below is an absolute path on the original author's machine, so it
# can be overridden without editing the notebook by setting the
# IR_PRECOMPUTED_DATA_PATH environment variable before starting the kernel.
# NOTE(review): "preprocessd" looks like a typo, but the name has to match the
# file on disk, so it is left untouched -- confirm against the data folder.
precomputed_data_path = os.environ.get(
    'IR_PRECOMPUTED_DATA_PATH',
    r'C:\Users\usuario\Fer-Pc\Escritorio\EPN\2024-A\SEPTIMO_SEMESTRE\RECUPERACION_DE_INFORMACION\ir24a\proyecto1Bimestre\data\preprocessd_data.pkl',
)

In [4]:
# Load the precomputed data and pull out the entries used by the rest of the notebook
precomputed_data = load_precomputed_data(precomputed_data_path)

# Document identifiers and the preprocessed documents stored with them.
filenames, processed_docs = (
    precomputed_data[key] for key in ('filenames', 'processed_docs')
)
# Precomputed document vectors for each representation.
bow_vectors, tfidf_vectors = (
    precomputed_data[key] for key in ('bow_vectors', 'tfidf_vectors')
)
# Vectorizers stored alongside the vectors; they are reused to transform queries.
bow_vectorizer, tfidf_vectorizer = (
    precomputed_data[key] for key in ('bow_vectorizer', 'tfidf_vectorizer')
)

In [5]:
# Helper that vectorizes a query
def vectorize_query(query, bow_vectorizer, tfidf_vectorizer):
    """Transform a single query with both vectorizers.

    Parameters
    ----------
    query : str
        Query text; it is wrapped in a one-element list before being
        handed to each vectorizer.
    bow_vectorizer, tfidf_vectorizer : objects exposing ``transform``
        Called in that order, each with ``[query]``.

    Returns
    -------
    tuple
        ``(bow_query_vector, tfidf_query_vector)``: the values returned by
        ``bow_vectorizer.transform`` and ``tfidf_vectorizer.transform``.
    """
    bow_query_vector, tfidf_query_vector = (
        vectorizer.transform([query])
        for vectorizer in (bow_vectorizer, tfidf_vectorizer)
    )
    return bow_query_vector, tfidf_query_vector

In [6]:
# Document search function
def sistema_RI(bow_query_vector, bow_vectors, tfidf_query_vector, tfidf_vectors):
    """Score every document against a query with Jaccard and cosine similarity.

    The Jaccard score is computed on term presence/absence of the
    bag-of-words vectors: (number of terms shared by query and document)
    divided by (number of terms present in either). It is evaluated for all
    documents at once with sparse operations, so the document matrix is never
    densified and no per-document Python loop is needed. For 0/1 vectors the
    values match ``jaccard_score(..., average='binary')``; vectors holding
    counts greater than 1 are treated as "term present" instead of raising.

    Parameters
    ----------
    bow_query_vector : sparse matrix
        Bag-of-words vector of the query; only its first row is used.
    bow_vectors : sparse matrix
        Bag-of-words vectors, one row per document.
    tfidf_query_vector : array-like or sparse matrix
        TF-IDF vector of the query, passed to ``cosine_similarity``.
    tfidf_vectors : array-like or sparse matrix
        TF-IDF vectors of the documents, passed to ``cosine_similarity``.

    Returns
    -------
    tuple
        ``(jaccard_similarities, cosine_similarities)``: a list of floats and
        a flattened array, each with one entry per document row.
    """
    # Reduce both sides to presence/absence so repeated terms do not matter.
    query_presence = (bow_query_vector.toarray()[0] != 0).astype(float)
    doc_presence = (bow_vectors != 0).astype(float)

    # |query AND doc| for every document: sparse matrix times dense vector.
    intersection_sizes = np.asarray(doc_presence @ query_presence, dtype=float).ravel()
    # |doc| for every document (row sums come back 2-D for sparse matrices).
    doc_sizes = np.asarray(doc_presence.sum(axis=1), dtype=float).ravel()
    # |query OR doc| = |query| + |doc| - |query AND doc|
    union_sizes = doc_sizes + query_presence.sum() - intersection_sizes

    # An empty query against an empty document has an empty union; score it 0.0,
    # which is also what jaccard_score reports for that case.
    jaccard = np.divide(
        intersection_sizes,
        union_sizes,
        out=np.zeros_like(union_sizes),
        where=union_sizes > 0,
    )
    jaccard_similarities = jaccard.tolist()

    cosine_similarities = cosine_similarity(tfidf_query_vector, tfidf_vectors).flatten()
    return jaccard_similarities, cosine_similarities

In [7]:
# Try out the search with a sample query
query = "earn"

# Vectorize the query with the same vectorizers that produced the document vectors.
bow_query_vector, tfidf_query_vector = vectorize_query(
    query,
    bow_vectorizer,
    tfidf_vectorizer,
)

# Score every document against the query (Jaccard on BoW, cosine on TF-IDF).
jaccard_similarities, cosine_similarities = sistema_RI(
    bow_query_vector=bow_query_vector,
    bow_vectors=bow_vectors,
    tfidf_query_vector=tfidf_query_vector,
    tfidf_vectors=tfidf_vectors,
)

In [8]:
# Build the results DataFrame: one row per document, with its file name and
# both similarity scores for the current query.
results_df = pd.DataFrame(
    data={
        'id': filenames,
        'Jaccard_Similarity': jaccard_similarities,
        'Cosine_Similarity': cosine_similarities,
    },
)

In [9]:
# Show the first 10 rows of the results as a grid table.
# Rows keep the order of `filenames`; they are not sorted by score.
limited_results_df = results_df.head(n=10)
results_table = tabulate(limited_results_df, headers='keys', tablefmt='grid')
print(results_table)

+----+-----------+----------------------+---------------------+
|    | id        |   Jaccard_Similarity |   Cosine_Similarity |
|  0 | 1.txt     |            0         |            0        |
+----+-----------+----------------------+---------------------+
|  1 | 10.txt    |            0         |            0        |
+----+-----------+----------------------+---------------------+
|  2 | 100.txt   |            0         |            0        |
+----+-----------+----------------------+---------------------+
|  3 | 1000.txt  |            0         |            0        |
+----+-----------+----------------------+---------------------+
|  4 | 10000.txt |            0.0285714 |            0.167332 |
+----+-----------+----------------------+---------------------+
|  5 | 10002.txt |            0         |            0        |
+----+-----------+----------------------+---------------------+
|  6 | 10005.txt |            0         |            0        |
+----+-----------+----------------------