In [19]:
from vespa.package import ApplicationPackage, Schema, Document, Field, RankProfile
from vespa.application import Vespa
from vespa.deployment import VespaDocker
import numpy as np
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import time
import pickle
import os

In [2]:
# 2. Configuração do modelo de embedding (Semantic Search)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2") 

In [3]:
# 3. Definição do Schema
document = Document(
    fields=[
        Field(name="id", type="string", indexing=["summary", "attribute"]),
        Field(name="text", type="string", indexing=["index", "summary"]),
        Field(name="embedding", type="tensor<float>(d[384])", indexing=["attribute", "summary"], ann="true", hnsw={"distance-metric": "dotproduct"})
    ]
)

schema = Schema(
    name="document",
    document=document,
    rank_profiles=[
        RankProfile(name="bm25", first_phase="nativeRank(text)"),
        RankProfile(name="dot_product", first_phase="closeness(embedding)"),
        RankProfile(name="hybrid", first_phase="nativeRank(text) + closeness(embedding)")
    ]
)

In [28]:
# 4. Criando o app e subindo via Docker
app_package = ApplicationPackage(name="vespasearch1", schema=None)
vespa_docker = VespaDocker()
app = vespa_docker.deploy(application_package=app_package)

Waiting for configuration server, 0/60 seconds...
Waiting for configuration server, 5/60 seconds...
Waiting for configuration server, 10/60 seconds...
Waiting for configuration server, 15/60 seconds...
Waiting for application to come up, 0/300 seconds.
Waiting for application to come up, 5/300 seconds.
Waiting for application to come up, 10/300 seconds.
Waiting for application to come up, 15/300 seconds.
Waiting for application to come up, 20/300 seconds.
Waiting for application to come up, 25/300 seconds.
Waiting for application to come up, 30/300 seconds.
Waiting for application to come up, 35/300 seconds.
Waiting for application to come up, 40/300 seconds.
Application is up!
Finished deployment.


In [25]:
# 5. Indexação de documentos
def index_documents(app, docs):
    for doc in docs:
        doc_id = doc["id"]
        text = doc["text"]
        embedding = embedding_model.encode(text).tolist()
        app.feed_data_point(
            schema="document",
            data_id=doc_id,
            fields={"id": doc_id, "text": text, "embedding": embedding}
        )

In [6]:
# 6. Funções de Busca
def lexical_search(app, query):
    return app.query(
        yql="select * from sources * where userQuery();",
        query=query,
        ranking="bm25"
    )

def semantic_search(app, query):
    query_vec = embedding_model.encode(query).tolist()
    return app.query(
        yql="select * from sources * where ([{\"targetNumHits\":10}]nearestNeighbor(embedding, query_embedding));",
        query_tensor={"query_embedding": query_vec},
        ranking="dot_product"
    )

def hybrid_search(app, query):
    query_vec = embedding_model.encode(query).tolist()
    return app.query(
        yql="select * from sources * where userQuery() or ([{\"targetNumHits\":10}]nearestNeighbor(embedding, query_embedding));",
        query=query,
        query_tensor={"query_embedding": query_vec},
        ranking="hybrid"
    )

In [7]:
PATH = 'subset_msmarco_train_0.01_9.pkl'
PATH_DATA = '../data/' + PATH
PATH_DATA_CLEAN = '../data_clean/' + PATH

In [8]:
# 7 Carregando e formatando dados do MSMARCO
with open("../subset_msmarco_train_0/subset_msmarco_train_0.01_9.pkl", "rb") as f:
    data = pickle.load(f)

# Extraindo lista de documentos
docs_raw = data["docs"]
docs = [{"id": doc_id, "text": doc_obj.text} for doc_id, doc_obj in docs_raw.items()]

In [23]:
response = app.get_application_status()
print(response.status_code)
print(response.json())


200
{'application': {'vespa': {'version': '8.520.18'}, 'meta': {'name': 'default', 'user': 'unknown', 'path': '', 'generation': 2, 'timestamp': 1747368002275, 'date': 'Fri May 16 04:00:02 UTC 2025', 'checksum': '7230ec67ce2cefcbb4c41b6b20ad51d7'}, 'user': {'version': ''}}, 'abstractComponents': [{'id': 'com.yahoo.search.searchers.InputCheckingSearcher@vespasearch1_content', 'class': 'com.yahoo.search.searchers.InputCheckingSearcher', 'bundle': 'container-search-and-docproc:8.520.18'}, {'id': 'com.yahoo.container.logging.FileConnectionLog', 'class': 'com.yahoo.container.logging.FileConnectionLog', 'bundle': 'container-disc:8.520.18'}, {'id': 'com.yahoo.container.jdisc.state.StateMonitor', 'class': 'com.yahoo.container.jdisc.state.StateMonitor', 'bundle': 'container-disc:8.520.18'}, {'id': 'com.yahoo.search.searchers.OpportunisticWeakAndSearcher@vespa', 'class': 'com.yahoo.search.searchers.OpportunisticWeakAndSearcher', 'bundle': 'container-search-and-docproc:8.520.18'}, {'id': 'com.yaho

In [18]:
import requests

r = requests.get("http://localhost:8080/document/v1/document/document/docid/")
print(r.status_code, r.text)

400 {"pathId":"/document/v1/document/document/docid/","message":"There is no document type 'document' in cluster 'vespasearch1_content', only 'vespasearch1'"}


In [27]:
import os
print("Local esperado:", os.path.abspath("schemas/document.sd"))
print("Existe?", os.path.exists("schemas/document.sd"))

with open("schemas/document.sd", "r") as f:
    print("Conteúdo:\n")
    print(f.read())

Local esperado: c:\Users\ramyr\OneDrive\Área de Trabalho\FGV\5periodo\ProjectsInDataScience\data-sci-project\src\schemas\document.sd
Existe? True
Conteúdo:

schema document {
    document document {
        field id type string {
            indexing: summary | attribute
        }

        field text type string {
            indexing: index | summary
        }

        field embedding type tensor<float>(d[384]) {
            indexing: attribute | summary
            attribute {
                distance-metric: dotproduct
            }
        }
    }

    rank-profile bm25 inherits default {
        first-phase {
            expression: nativeRank(text)
        }
    }

    rank-profile dot_product inherits default {
        first-phase {
            expression: closeness(embedding)
        }
    }

    rank-profile hybrid inherits default {
        first-phase {
            expression: nativeRank(text) + closeness(embedding)
        }
    }
}


In [29]:
index_documents(app, docs)

VespaError: No field 'id' in the structure of type 'document', which has the fields: []