In [1]:
from vespa.package import ApplicationPackage, Schema, Document, Field, RankProfile
from vespa.application import Vespa
from vespa.deployment import VespaDocker
import numpy as np
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import time
import pickle

  from tqdm.autonotebook import tqdm, trange


In [2]:
# 2. Configuração do modelo de embedding (Semantic Search)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2") 

In [3]:
# 3. Definição do Schema
document = Document(
    fields=[
        Field(name="id", type="string", indexing=["summary", "attribute"]),
        Field(name="text", type="string", indexing=["index", "summary"]),
        Field(name="embedding", type="tensor<float>(d[384])", indexing=["attribute", "summary"], ann="true", hnsw={"distance-metric": "dotproduct"})
    ]
)

schema = Schema(
    name="document",
    document=document,
    rank_profiles=[
        RankProfile(name="bm25", first_phase="nativeRank(text)"),
        RankProfile(name="dot_product", first_phase="closeness(embedding)"),
        RankProfile(name="hybrid", first_phase="nativeRank(text) + closeness(embedding)")
    ]
)

In [7]:
# 4. Criando o app e subindo via Docker
app_package = ApplicationPackage(name="vespasearch1", schema=None)
vespa_docker = VespaDocker()
app = vespa_docker.deploy(application_package=app_package)

Waiting for configuration server, 0/60 seconds...
Waiting for configuration server, 5/60 seconds...
Waiting for configuration server, 10/60 seconds...
Waiting for configuration server, 15/60 seconds...
Waiting for application to come up, 0/300 seconds.
Waiting for application to come up, 5/300 seconds.
Waiting for application to come up, 10/300 seconds.
Waiting for application to come up, 15/300 seconds.
Waiting for application to come up, 20/300 seconds.
Waiting for application to come up, 25/300 seconds.
Waiting for application to come up, 30/300 seconds.
Application is up!
Finished deployment.


In [8]:
# 5. Indexação de documentos
def index_documents(app, docs):
    for doc in docs:
        doc_id = doc["id"]
        text = doc["text"]
        embedding = embedding_model.encode(text).tolist()
        app.feed_data_point(schema="document", data_id=doc_id, fields={"id": doc_id, "text": text, "embedding": embedding})

In [10]:
# 6. Funções de Busca
def lexical_search(app, query):
    return app.query(
        yql="select * from sources * where userQuery();",
        query=query,
        ranking="bm25"
    )

def semantic_search(app, query):
    query_vec = embedding_model.encode(query).tolist()
    return app.query(
        yql="select * from sources * where ([{\"targetNumHits\":10}]nearestNeighbor(embedding, query_embedding));",
        query_tensor={"query_embedding": query_vec},
        ranking="dot_product"
    )

def hybrid_search(app, query):
    query_vec = embedding_model.encode(query).tolist()
    return app.query(
        yql="select * from sources * where userQuery() or ([{\"targetNumHits\":10}]nearestNeighbor(embedding, query_embedding));",
        query=query,
        query_tensor={"query_embedding": query_vec},
        ranking="hybrid"
    )

In [None]:
PATH = 'subset_msmarco_train_0.01_9.pkl'
PATH_DATA = '../data/' + PATH
PATH_DATA_CLEAN = '../data_clean/' + PATH

In [14]:
# 7 Carregando e formatando dados do MSMARCO
with open("./subset_msmarco_train_0/subset_msmarco_train_0.01_9.pkl", "rb") as f:
    data = pickle.load(f)

# Extraindo documentos únicos dos dados
unique_docs = {}
for item in data:
    for doc in item.get("relevant_docs", []):
        unique_docs[doc["id"]] = doc["text"]

docs = [{"id": doc_id, "text": text} for doc_id, text in unique_docs.items()]

FileNotFoundError: [Errno 2] No such file or directory: './subset_msmarco_train_0/subset_msmarco_train_0.01_9.pkl'