In [None]:
# base
import numpy as np
import matplotlib.pyplot as plt
import random

# database
import pickle

# models
from models.baselines_models import random_search, tfidf_search, bm25_search

from models.bert_model import bert_search
from models.multi_staging_model import primary_stage, secondary_stage
from utils.save_embeddings import save_embeddings, save_embeddings_choose

# metrics
from metrics.metrics_functions import mean_precision_at_k, mean_average_precision_at_k, mean_recall_at_k, mean_reciprocal_rank, mean_ndcg_at_k

Base de dados

In [None]:
# File name of the pickled dataset subset; the raw and the cleaned copies
# share this name and differ only in their directory.
PATH = 'subset_msmarco_train_0.01_99.pkl'
# Raw dataset location, relative to the notebook's working directory.
PATH_DATA = f'../data/{PATH}'
# Location of the cleaned counterpart of the same file.
PATH_DATA_CLEAN = f'../data/data_clean/{PATH}'

In [None]:
# Load the raw dataset bundle from disk; the following cells read its
# 'queries', 'docs' and 'qrels' entries.
# NOTE(review): pickle.load can execute arbitrary code while deserialising,
# so this file must come from a trusted source.
with open(PATH_DATA, mode='rb') as raw_file:
    data = pickle.load(raw_file)

Queries:

In [None]:
# Convert the queries mapping to the {query_id: query_text} format.
queries_dict = {qid: query.text for qid, query in data['queries'].items()}

# Show only the first few entries: printing the whole dictionary floods the
# notebook output and bloats the saved file without adding information.
queries_preview = dict(list(queries_dict.items())[:5])
print(queries_preview)
print(f'Quantidade de queries: {len(queries_dict)}')

Docs:

In [None]:
# Convert the docs mapping to the {doc_id: doc_text} format.
docs_dict = {did: doc.text for did, doc in data['docs'].items()}

# Show only the first few entries: printing every document text floods the
# notebook output and bloats the saved file without adding information.
docs_preview = dict(list(docs_dict.items())[:5])
print(docs_preview)
print(f'Quantidade de docs: {len(docs_dict)}')

Qrels:

In [None]:
# Map each query id to the list of document ids that appear with it in the
# qrels, keeping the order in which the qrels are iterated.
qrels_dict = {}

# setdefault creates the empty list the first time a query id is seen and
# returns the existing list afterwards, so one append covers both cases.
for qrel in data['qrels']:
    qrels_dict.setdefault(qrel.query_id, []).append(qrel.doc_id)

# Show only the first few entries instead of dumping the whole mapping into
# the notebook output.
qrels_preview = dict(list(qrels_dict.items())[:5])
print(qrels_preview)
# Note: this is the number of distinct query ids in the qrels, not the number
# of individual qrel rows.
print(f'Quantidade de qrels: {len(qrels_dict)}')

Limpeza de dados

In [None]:
# To rebuild the cleaned texts from scratch instead of loading them, first
# download the NLTK resources (uncomment if they were never downloaded):
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('rslp')
# nltk.download('punkt_tab')
# and then run:
# queries_dict_clean, docs_dict_clean = clean_texts(queries_dict, docs_dict)
# NOTE(review): neither nltk nor clean_texts is imported in the imports cell
# of this notebook; both must be imported before uncommenting the lines above.

# Default path: load the cleaned texts that were already saved to disk.
# NOTE(review): pickle.load can execute arbitrary code while deserialising,
# so this file must come from a trusted source.
with open(PATH_DATA_CLEAN, mode='rb') as clean_file:
    data_clean = pickle.load(clean_file)

docs_dict_clean = data_clean['docs_dict']
queries_dict_clean = data_clean['queries_dict']

# Report how many cleaned queries and docs were loaded; sep="\n" keeps each
# label and its count on separate lines.
print("Queries limpas:", len(queries_dict_clean), sep="\n")
print("\nDocs limpos:", len(docs_dict_clean), sep="\n")

Split

In [None]:
# Seed the global RNG so the shuffle below is reproducible across runs.
random.seed(42)

# Collect every query id and shuffle the list in place.
query_ids = list(queries_dict)
random.shuffle(query_ids)

# Everything after the 80% mark of the shuffled ids becomes the test set.
# Only the test portion is materialised here; the first 80% is not stored.
split_ratio = 0.8
split_index = int(len(query_ids) * split_ratio)
test_query_ids = query_ids[split_index:]

# Raw and cleaned views of the test queries, built over the same ids.
test_queries_dict = dict((qid, queries_dict[qid]) for qid in test_query_ids)
test_queries_dict_clean = dict((qid, queries_dict_clean[qid]) for qid in test_query_ids)

# Sizes, one per line: all queries, all cleaned queries, test queries,
# cleaned test queries.
print(
    len(queries_dict),
    len(queries_dict_clean),
    len(test_queries_dict),
    len(test_queries_dict_clean),
    sep='\n',
)

Saving embeddings

In [None]:
# Run the embedding export once per model name, in the original order
# ('bert' first, then 'tevatron'). The raw (uncleaned) docs and queries are
# passed in both calls.
# NOTE(review): presumably the third argument selects the encoder -- confirm
# in utils.save_embeddings.
for embedding_model in ('bert', 'tevatron'):
    save_embeddings_choose(docs_dict, queries_dict, embedding_model)