In [1]:
from vespa.package import ApplicationPackage, Schema, Document, Field, RankProfile, HNSW
from vespa.application import Vespa
from vespa.deployment import VespaDocker
import numpy as np
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import time
import pickle
import os

  from tqdm.autonotebook import tqdm, trange


In [2]:
# 2. Configuração do modelo de embedding (Semantic Search)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2") 

In [3]:
# 3. Definição do App

app_package= ApplicationPackage(name="vespasearch1")

app_package.schema.add_fields(
        Field(name="id", type="string", indexing=["summary", "attribute"]),
        Field(name="text", type="string", indexing=["index", "summary"]),
        Field(name="embedding", type="tensor<float>(d[384])", indexing=["attribute", "summary", "index"],
              ann=HNSW(distance_metric="angular")
        #       ann="true",
        #       attribute={"distance-metric": "euclidean"
                                #      , "method": "hnsw"
                                #      }
        ), 
)

app_package.schema.add_rank_profile(
        RankProfile(name="bm25", first_phase="nativeRank(text)"),
)

app_package.schema.add_rank_profile(
        RankProfile(name="dot_product", first_phase="closeness(embedding)"),
)

app_package.schema.add_rank_profile(
        RankProfile(name="hybrid", first_phase="nativeRank(text) + closeness(embedding)")
)

In [4]:
app_package.schema.inputs = [("query_embedding", "tensor<float>(d[384])")]

In [5]:
print(app_package.schema.schema_to_text)

schema vespasearch1 {
    document vespasearch1 {
        field id type string {
            indexing: summary | attribute
        }
        field text type string {
            indexing: index | summary
        }
        field embedding type tensor<float>(d[384]) {
            indexing: attribute | summary | index
            attribute {
                distance-metric: angular
            }
            index {
                hnsw {
                    max-links-per-node: 16
                    neighbors-to-explore-at-insert: 200
                }
            }
        }
    }
    rank-profile bm25 {
        first-phase {
            expression {
                nativeRank(text)
            }
        }
    }
    rank-profile dot_product {
        first-phase {
            expression {
                closeness(embedding)
            }
        }
    }
    rank-profile hybrid {
        first-phase {
            expression {
                nativeRank(text) + closeness(embedding)
      

In [6]:
# 4. Criando o app e subindo via Docker
# app_package = ApplicationPackage(name="vespasearch1", schema=None)
vespa_docker = VespaDocker(port=8080, container_memory="8G")
app = vespa_docker.deploy(application_package=app_package)

Waiting for configuration server, 0/60 seconds...
Waiting for configuration server, 5/60 seconds...
Waiting for configuration server, 10/60 seconds...
Waiting for configuration server, 15/60 seconds...
Waiting for configuration server, 20/60 seconds...
Waiting for configuration server, 25/60 seconds...
Waiting for configuration server, 30/60 seconds...
Waiting for configuration server, 35/60 seconds...
Waiting for application to come up, 0/300 seconds.
Waiting for application to come up, 5/300 seconds.
Waiting for application to come up, 10/300 seconds.
Waiting for application to come up, 15/300 seconds.
Waiting for application to come up, 20/300 seconds.
Waiting for application to come up, 25/300 seconds.
Waiting for application to come up, 30/300 seconds.
Waiting for application to come up, 35/300 seconds.
Waiting for application to come up, 40/300 seconds.
Waiting for application to come up, 45/300 seconds.
Waiting for application to come up, 50/300 seconds.
Application is up!
Finis

In [7]:
# 5. Indexação de documentos
from tqdm import tqdm
def index_documents(app, docs):
    for doc in tqdm(docs, desc="Indexing documents", unit="document"):
        doc_id = doc["id"]
        text = doc["text"]
        embedding = embedding_model.encode(text).tolist()
        app.feed_data_point(
            schema="vespasearch1",
            data_id=doc_id,
            fields={"id": doc_id, "text": text, "embedding": embedding}
        )

In [8]:
# 6. Funções de Busca
def lexical_search(app, query):
    return app.query(
        yql="select * from sources * where userQuery();",
        query=query,
        ranking="bm25"
    )

def semantic_search(app, query):
    query_vec = embedding_model.encode(query).tolist()
    return app.query(
        yql="select * from sources * where ([{\"targetNumHits\":10}]nearestNeighbor(embedding, query_embedding));",
        query_tensor={"query_embedding": query_vec},
        ranking="dot_product"
    )

def hybrid_search(app, query):
    query_vec = embedding_model.encode(query).tolist()
    return app.query(
        body={
            "yql": """
                select * from sources * where 
                userQuery() or ([{"targetNumHits":10}]nearestNeighbor(embedding, query_embedding));
            """,
            "query": query,
            "ranking": "hybrid",
            "hits": 10,
            "input.query(query_embedding)": query_vec
        }
    )

In [9]:
PATH = 'subset_msmarco_train_0.01_9.pkl'
PATH_DATA = '../data/' + PATH
PATH_DATA_CLEAN = '../data_clean/' + PATH

In [10]:
# 7 Carregando e formatando dados do MSMARCO
with open("../subset_msmarco_train_0/subset_msmarco_train_0.01_9.pkl", "rb") as f:
    data = pickle.load(f)

# Extraindo lista de documentos
docs_raw = data["docs"]
docs = [{"id": doc_id, "text": doc_obj.text} for doc_id, doc_obj in docs_raw.items()]

In [11]:
response = app.get_application_status()
print(response.status_code)
print(response.json())


200
{'application': {'vespa': {'version': '8.520.18'}, 'meta': {'name': 'default', 'user': 'unknown', 'path': '', 'generation': 12, 'timestamp': 1747691043909, 'date': 'Mon May 19 21:44:03 UTC 2025', 'checksum': '4aea6142f0514374a0ca30b13504bc6b'}, 'user': {'version': ''}}, 'abstractComponents': [{'id': 'com.yahoo.search.searchers.InputCheckingSearcher@vespasearch1_content', 'class': 'com.yahoo.search.searchers.InputCheckingSearcher', 'bundle': 'container-search-and-docproc:8.520.18'}, {'id': 'com.yahoo.container.logging.FileConnectionLog', 'class': 'com.yahoo.container.logging.FileConnectionLog', 'bundle': 'container-disc:8.520.18'}, {'id': 'com.yahoo.container.jdisc.state.StateMonitor', 'class': 'com.yahoo.container.jdisc.state.StateMonitor', 'bundle': 'container-disc:8.520.18'}, {'id': 'com.yahoo.search.searchers.OpportunisticWeakAndSearcher@vespa', 'class': 'com.yahoo.search.searchers.OpportunisticWeakAndSearcher', 'bundle': 'container-search-and-docproc:8.520.18'}, {'id': 'com.yah

In [12]:
import requests

r = requests.get("http://localhost:8080/document/v1/document/document/docid/0")
print(r.status_code, r.text)

404 {"pathId":"/document/v1/document/document/docid/0","id":"id:document:document::0","message":"[UNKNOWN(251001) @ tcp/vespasearch1:19115/default]: Unknown bucket space mapping for document type 'document' in id: 'id:document:document::0' "}


In [13]:
import os
print("Local esperado:", os.path.abspath("schemas/document.sd"))
print("Existe?", os.path.exists("schemas/document.sd"))

with open("schemas/document.sd", "r") as f:
    print("Conteúdo:\n")
    print(f.read())

Local esperado: c:\Users\ramyr\OneDrive\Área de Trabalho\FGV\5periodo\ProjectsInDataScience\data-sci-project\src\schemas\document.sd
Existe? True
Conteúdo:

schema document {
    document document {
        field id type string {
            indexing: summary | attribute
        }

        field text type string {
            indexing: index | summary
        }

        field embedding type tensor<float>(d[384]) {
            indexing: attribute | summary
            attribute {
                distance-metric: dotproduct
            }
        }
    }

    rank-profile bm25 inherits default {
        first-phase {
            expression: nativeRank(text)
        }
    }

    rank-profile dot_product inherits default {
        first-phase {
            expression: closeness(embedding)
        }
    }

    rank-profile hybrid inherits default {
        first-phase {
            expression: nativeRank(text) + closeness(embedding)
        }
    }
}


In [14]:
index_documents(app, docs)

Indexing documents:  59%|█████▉    | 16492/27778 [26:49<18:21, 10.25document/s]  


KeyboardInterrupt: 

In [20]:
result.json

{'pathId': '/document/v1/vespasearch1/vespasearch1/docid/doc1',
 'id': 'id:vespasearch1:vespasearch1::doc1'}

In [35]:
response = app.query(
    body={
        "yql": "select * from sources * where true;",
        "hits": 10
    }, timeout = 1000
)

for hit in response.hits:
    print(hit["fields"])


{'sddocname': 'vespasearch1', 'documentid': 'id:vespasearch1:vespasearch1::msmarco_passage_31_15605486', 'id': 'msmarco_passage_31_15605486', 'text': 'Summary. Factor VII deficiency is a rare genetic bleeding disorder characterized by a deficiency or reduced activity of clotting factor VII. Clotting factors are specialized proteins that are essential for the blood to clot normally. Individuals with factor VII deficiency can experience prolonged, uncontrolled bleeding episodes.', 'embedding': {'type': 'tensor<float>(d[384])', 'values': [-0.02649758942425251, -0.04880521073937416, -0.037282705307006836, 0.04838443547487259, 0.07315978407859802, 0.05858995392918587, -0.04715218022465706, 0.08747385442256927, -0.0536402091383934, 0.0022039851173758507, -0.044738803058862686, -0.04408184811472893, -0.054344192147254944, 0.056642740964889526, -0.09955909848213196, -0.06986312568187714, -0.016022592782974243, 0.03696442395448685, -0.08723299205303192, -0.036453329026699066, -0.076594926416873

In [38]:
response = app.query(
    body={
        "yql": "select * from sources * where userQuery();",
        "query": "If my name is Ramyro, what is the capital of France?",
        "hits": 5,
        "ranking": "bm25"
    }, timeout = 10000
)
print(response.hits)

[]


In [16]:
hybrid_search(app, "What is the capital of France?")


VespaError: [{'code': 12, 'summary': 'Timed out', 'source': 'vespasearch1_content', 'message': "Error in execution of chain 'vespasearch1_content': Chain timed out."}]