In [1]:
from vespa.package import ApplicationPackage, Schema, Document, Field, RankProfile, HNSW
from vespa.application import Vespa
from vespa.deployment import VespaDocker
import numpy as np
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import time
import pickle
import os

  from tqdm.autonotebook import tqdm, trange


In [2]:
# 2. Configuração do modelo de embedding (Semantic Search)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2") 

In [3]:
# 3. Definição do App

app_package= ApplicationPackage(name="vespasearch1")

app_package.schema.add_fields(
        Field(name="id", type="string", indexing=["summary", "attribute"]),
        Field(name="text", type="string", indexing=["index", "summary"]),
        Field(name="embedding", type="tensor<float>(d[384])", indexing=["attribute", "summary", "index"],
              ann=HNSW(distance_metric="angular")
        #       ann="true",
        #       attribute={"distance-metric": "euclidean"
                                #      , "method": "hnsw"
                                #      }
        ), 
)

app_package.schema.add_rank_profile(
        RankProfile(name="bm25", first_phase="nativeRank(text)"),
)

app_package.schema.add_rank_profile(
        RankProfile(name="dot_product", first_phase="closeness(embedding)"),
)

app_package.schema.add_rank_profile(
        RankProfile(name="hybrid", first_phase="nativeRank(text) + closeness(embedding)")
)

In [4]:
app_package.schema.inputs = [("query_embedding", "tensor<float>(d[384])")]

In [5]:
print(app_package.schema.schema_to_text)

schema vespasearch1 {
    document vespasearch1 {
        field id type string {
            indexing: summary | attribute
        }
        field text type string {
            indexing: index | summary
        }
        field embedding type tensor<float>(d[384]) {
            indexing: attribute | summary | index
            attribute {
                distance-metric: angular
            }
            index {
                hnsw {
                    max-links-per-node: 16
                    neighbors-to-explore-at-insert: 200
                }
            }
        }
    }
    rank-profile bm25 {
        first-phase {
            expression {
                nativeRank(text)
            }
        }
    }
    rank-profile dot_product {
        first-phase {
            expression {
                closeness(embedding)
            }
        }
    }
    rank-profile hybrid {
        first-phase {
            expression {
                nativeRank(text) + closeness(embedding)
      

In [6]:
# 4. Criando o app e subindo via Docker
# app_package = ApplicationPackage(name="vespasearch1", schema=None)
vespa_docker = VespaDocker(port=8080, container_memory="8G")
app = vespa_docker.deploy(application_package=app_package)

Waiting for configuration server, 0/60 seconds...
Waiting for configuration server, 5/60 seconds...
Waiting for configuration server, 10/60 seconds...
Waiting for configuration server, 15/60 seconds...
Waiting for configuration server, 20/60 seconds...
Waiting for configuration server, 25/60 seconds...
Waiting for configuration server, 30/60 seconds...
Waiting for configuration server, 35/60 seconds...
Waiting for application to come up, 0/300 seconds.
Waiting for application to come up, 5/300 seconds.
Waiting for application to come up, 10/300 seconds.
Waiting for application to come up, 15/300 seconds.
Waiting for application to come up, 20/300 seconds.
Waiting for application to come up, 25/300 seconds.
Waiting for application to come up, 30/300 seconds.
Waiting for application to come up, 35/300 seconds.
Waiting for application to come up, 40/300 seconds.
Waiting for application to come up, 45/300 seconds.
Waiting for application to come up, 50/300 seconds.
Application is up!
Finis

In [7]:
# 5. Indexação de documentos
from tqdm import tqdm
def index_documents(app, docs):
    for doc in tqdm(docs, desc="Indexing documents", unit="document"):
        doc_id = doc["id"]
        text = doc["text"]
        embedding = embedding_model.encode(text).tolist()
        app.feed_data_point(
            schema="vespasearch1",
            data_id=doc_id,
            fields={"id": doc_id, "text": text, "embedding": embedding}
        )

In [8]:
# 6. Funções de Busca
def lexical_search(app, query):
    return app.query(
        yql="select * from sources * where userQuery();",
        query=query,
        ranking="bm25"
    )

def semantic_search(app, query):
    query_vec = embedding_model.encode(query).tolist()
    return app.query(
        yql="select * from sources * where ([{\"targetNumHits\":10}]nearestNeighbor(embedding, query_embedding));",
        query_tensor={"query_embedding": query_vec},
        ranking="dot_product"
    )

def hybrid_search(app, query):
    query_vec = embedding_model.encode(query).tolist()
    return app.query(
        body={
            "yql": """
                select * from sources * where 
                userQuery() or ([{"targetNumHits":10}]nearestNeighbor(embedding, query_embedding));
            """,
            "query": query,
            "ranking": "hybrid",
            "hits": 10,
            "input.query(query_embedding)": query_vec
        }
    )

In [9]:
PATH = 'subset_msmarco_train_0.01_9.pkl'
PATH_DATA = '../data/' + PATH
PATH_DATA_CLEAN = '../data_clean/' + PATH

In [10]:
# 7 Carregando e formatando dados do MSMARCO
with open("../subset_msmarco_train_0/subset_msmarco_train_0.01_9.pkl", "rb") as f:
    data = pickle.load(f)

# Extraindo lista de documentos
docs_raw = data["docs"]
docs = [{"id": doc_id, "text": doc_obj.text} for doc_id, doc_obj in docs_raw.items()]

In [11]:
response = app.get_application_status()
print(response.status_code)
print(response.json())


200
{'application': {'vespa': {'version': '8.520.18'}, 'meta': {'name': 'default', 'user': 'unknown', 'path': '', 'generation': 12, 'timestamp': 1747691043909, 'date': 'Mon May 19 21:44:03 UTC 2025', 'checksum': '4aea6142f0514374a0ca30b13504bc6b'}, 'user': {'version': ''}}, 'abstractComponents': [{'id': 'com.yahoo.search.searchers.InputCheckingSearcher@vespasearch1_content', 'class': 'com.yahoo.search.searchers.InputCheckingSearcher', 'bundle': 'container-search-and-docproc:8.520.18'}, {'id': 'com.yahoo.container.logging.FileConnectionLog', 'class': 'com.yahoo.container.logging.FileConnectionLog', 'bundle': 'container-disc:8.520.18'}, {'id': 'com.yahoo.container.jdisc.state.StateMonitor', 'class': 'com.yahoo.container.jdisc.state.StateMonitor', 'bundle': 'container-disc:8.520.18'}, {'id': 'com.yahoo.search.searchers.OpportunisticWeakAndSearcher@vespa', 'class': 'com.yahoo.search.searchers.OpportunisticWeakAndSearcher', 'bundle': 'container-search-and-docproc:8.520.18'}, {'id': 'com.yah

In [12]:
import requests

r = requests.get("http://localhost:8080/document/v1/document/document/docid/0")
print(r.status_code, r.text)

404 {"pathId":"/document/v1/document/document/docid/0","id":"id:document:document::0","message":"[UNKNOWN(251001) @ tcp/vespasearch1:19115/default]: Unknown bucket space mapping for document type 'document' in id: 'id:document:document::0' "}


In [13]:
import os
print("Local esperado:", os.path.abspath("schemas/document.sd"))
print("Existe?", os.path.exists("schemas/document.sd"))

with open("schemas/document.sd", "r") as f:
    print("Conteúdo:\n")
    print(f.read())

Local esperado: c:\Users\ramyr\OneDrive\Área de Trabalho\FGV\5periodo\ProjectsInDataScience\data-sci-project\src\schemas\document.sd
Existe? True
Conteúdo:

schema document {
    document document {
        field id type string {
            indexing: summary | attribute
        }

        field text type string {
            indexing: index | summary
        }

        field embedding type tensor<float>(d[384]) {
            indexing: attribute | summary
            attribute {
                distance-metric: dotproduct
            }
        }
    }

    rank-profile bm25 inherits default {
        first-phase {
            expression: nativeRank(text)
        }
    }

    rank-profile dot_product inherits default {
        first-phase {
            expression: closeness(embedding)
        }
    }

    rank-profile hybrid inherits default {
        first-phase {
            expression: nativeRank(text) + closeness(embedding)
        }
    }
}


In [None]:
index_documents(app, docs)

Indexing documents:   6%|▌         | 1572/27778 [02:51<1:33:11,  4.69document/s] 

In [None]:
hybrid_search(app, "What is the capital of France?")


VespaError: [{'code': 4, 'summary': 'Invalid query parameter', 'source': 'vespasearch1_content', 'message': "Expected 'query(query_embedding)' to be a tensor, but it is the string '[0.08204811066389084, 0.03605553135275841, -0.0038928852882236242, -0.0048810457810759544, 0.02565113641321659, -0.05714348703622818, 0.012191606685519218, 0.004678904078900814, 0.03494987264275551, -0.0224219411611557, -0.008005237206816673, -0.10935354232788086, 0.022724784910678864, -0.02932087890803814, -0.04352205619215965, -0.12024123221635818, -0.000848641328047961, -0.018150122836232185, 0.056129537522792816, 0.003085229778662324, 0.0023363472428172827, -0.01683923974633217, 0.06362469494342804, -0.023660214617848396, 0.03149356320500374, -0.034797921776771545, -0.0205488633364439, -0.002790951170027256, -0.011037975549697876, -0.03612672537565231, 0.0541410930454731, -0.036617133766412735, -0.02500864863395691, -0.03817041590809822, -0.04960364103317261, -0.015148096717894077, 0.02131503075361252, -0.012740420177578926, 0.07670091837644577, 0.04435573145747185, -0.010834861546754837, -0.029760034754872322, -0.016970466822385788, -0.0246918722987175, 0.00808711163699627, 0.04358769208192825, 0.007177512627094984, 0.07550127804279327, 0.0328066311776638, -0.062046367675065994, 0.0667789950966835, 0.027091365307569504, -0.045689500868320465, -0.031441159546375275, -0.03115525096654892, 0.09153684973716736, -0.0017882229294627905, -0.01128263957798481, 0.03649929165840149, 0.056927137076854706, 0.0022999553475528955, -0.03775055706501007, -0.015484667383134365, 0.05239144340157509, 0.06036447361111641, -0.01664833165705204, 0.008809935301542282, -0.006622296292334795, -0.1062970981001854, 0.001715893275104463, -0.04830581322312355, -0.029768725857138634, 0.004325534217059612, -0.08567411452531815, 0.06620791554450989, -0.05518355220556259, -0.11332660168409348, 0.05084019899368286, -0.009317240677773952, 0.006006753537803888, 0.02101273089647293, -0.022515468299388885, 0.00047272659139707685, 0.05638972297310829, 0.045443445444107056, -0.005277520976960659, 0.09359359741210938, 0.0274602510035038, 0.02944193407893181, -0.045696649700403214, -0.04894433543086052, 0.0013615117641165853, -0.012853391468524933, 0.07980718463659286, -0.11903545260429382, 0.06876879930496216, -0.02271835133433342, 0.044857025146484375, -0.08129198849201202, 0.044057779014110565, 0.0029563619755208492, 0.01762099377810955, 0.08311296254396439, -0.01805495098233223, -0.04792353883385658, 0.05866710841655731, 0.0062464564107358456, -0.014656787738204002, -0.007337239105254412, -0.07807920128107071, -0.10076916217803955, -0.033526722341775894, -0.0009018618729896843, -0.051131170243024826, 0.027221741154789925, 0.07086150348186493, 0.04740171507000923, -0.10456675291061401, 0.004401118028908968, -0.028793739154934883, -0.01835579238831997, -0.050585903227329254, -0.03154190629720688, -0.009517701342701912, -0.06064473092556, 0.0211639404296875, -0.046602193266153336, -7.755118268038648e-33, -0.03129624202847481, 0.056345123797655106, 0.07738033682107925, 0.06391443312168121, -0.04664720594882965, -0.007570476736873388, -0.05532645061612129, 0.040277574211359024, -0.031523946672677994, -0.007102947682142258, 0.039592377841472626, -0.13171198964118958, -0.06614522635936737, 0.021774929016828537, 0.09698943793773651, 0.011799264699220657, 0.08900413662195206, 0.0346858948469162, -0.04387175291776657, -0.00016681858687661588, 0.014680847525596619, -0.002709325635805726, -0.0033176513388752937, 0.017399972304701805, 0.06010518968105316, 0.03949518874287605, -0.0017327648820355535, 0.07728354632854462, 0.014559616334736347, -0.0021933307871222496, -0.001845336752012372, 0.01501469872891903, 0.021672876551747322, 0.007331332191824913, 0.01799950562417507, 0.04974411055445671, 0.01258815173059702, -0.0026322028134018183, 0.04346171021461487, 0.0629749596118927, 0.06660725921392441, -0.03639741241931915, -0.03872961923480034, 0.0440126471221447, 0.005643450655043125, 0.005692597012966871, -0.0348784476518631, -0.07138054817914963, 0.10089900344610214, -0.02475626766681671, 0.0146844033151865, -0.02591957151889801, -0.07273469120264053, -0.017434269189834595, 0.026018859818577766, 0.11413373798131943, -0.07092969119548798, 0.018040675669908524, -0.0033645560033619404, 0.00846823025494814, -0.003198227845132351, 0.005925266537815332, -0.022993484511971474, 0.07761334627866745, 0.03472593426704407, 0.0873919129371643, 0.0462610237300396, 0.018758686259388924, 0.01104749646037817, -0.04582411050796509, -0.046474356204271317, 0.026539446786046028, 0.07402201741933823, 0.06560054421424866, 0.06272177398204803, 0.07237666845321655, -0.008960560895502567, -0.035324882715940475, -0.005384561140090227, -0.0032188789919018745, -0.03802558034658432, -0.04136471450328827, -0.09670209139585495, 0.044219259172677994, -0.033506326377391815, -0.07136603444814682, -0.011642826721072197, -0.0071111684665083885, 0.0006453814567066729, -0.0883803591132164, -0.11334280669689178, -0.12120426446199417, -0.0013210880570113659, -0.04424311965703964, -0.08665942400693893, 3.997687309833669e-33, 0.025276146829128265, -0.0026350277476012707, -0.08113003522157669, 0.02546190284192562, 0.0013292377116158605, 0.016038019210100174, 0.09549157321453094, 0.033216968178749084, -0.012048942968249321, 0.01698562502861023, -0.08307889103889465, -0.12452160567045212, 0.04390957951545715, 0.012151076458394527, 0.06574594229459763, 0.10052962601184845, 0.07295701652765274, -0.026920173317193985, -0.032184746116399765, -0.05346688628196716, -0.12637241184711456, 0.005398081615567207, -0.0353909432888031, -0.004279972054064274, -0.02503949962556362, 0.04162561893463135, -0.09993342310190201, -0.04765276983380318, -0.023976009339094162, 0.0026398019399493933, -0.055191002786159515, 0.013548419810831547, 0.04904061555862427, 0.08499690145254135, -0.04202461242675781, 0.07673398405313492, 0.03321313112974167, 0.001265265978872776, 0.03999499976634979, 0.06455174088478088, -0.04337263107299805, -0.04965050518512726, 0.05795809254050255, 0.11267867684364319, 0.070699043571949, 0.0082264868542552, 0.043815385550260544, -0.022527918219566345, -0.007248697802424431, 0.049857787787914276, 0.0386049821972847, 0.067911796271801, -0.04107007011771202, 0.005732240621000528, 0.017908021807670593, 0.049305807799100876, -0.05145525932312012, 0.05103078484535217, -0.0938098132610321, -0.06816750019788742, 0.0652627944946289, 0.0754573717713356, -0.016841884702444077, 0.06612509489059448, -0.0028971005231142044, -0.020738182589411736, -0.12700854241847992, 0.06160473823547363, -0.009813124313950539, -0.014706097543239594, 0.13544605672359467, 0.034136854112148285, -0.06481858342885971, 0.05101701244711876, -0.06637551635503769, 0.029188334941864014, 0.07939164340496063, 0.01444028876721859, -0.02731001190841198, 0.005267046857625246, -0.06761958450078964, -0.02049446292221546, -0.027144690975546837, -0.026149794459342957, -0.07054667919874191, 0.03471790999174118, 0.00761255482211709, -0.10216667503118515, 0.058427758514881134, -0.0747859999537468, -0.021967999637126923, -0.00680866464972496, -0.05130329728126526, -0.0369698628783226, 0.025690114125609398, -1.7501513127626822e-08, 0.06809661537408829, 0.045000918209552765, -0.04408636316657066, 0.012878764420747757, -0.05775945261120796, -0.09547636657953262, 0.062199465930461884, -0.004272679332643747, -0.008670173585414886, 0.00025499105686321855, -0.07361151278018951, 0.05606214702129364, -0.06970255076885223, -0.051116351038217545, -0.041022807359695435, -0.004761005751788616, -0.03246324509382248, 0.04304740950465202, 0.008683184161782265, 0.02270779199898243, -0.004905323963612318, 0.023358002305030823, -0.04563939943909645, -0.05810337886214256, 0.012541470117866993, -0.09903230518102646, 0.04062919691205025, 0.04566897451877594, 0.002715941285714507, -0.005313000176101923, 0.06640290468931198, -0.027287563309073448, -0.05007484182715416, -0.09029500186443329, -0.03612228482961655, 0.012680093757808208, -0.0058304728008806705, -0.005093270912766457, 0.009507527574896812, -0.02905244007706642, 0.09497947990894318, 0.06199074164032936, 0.012536657974123955, -0.011961031705141068, 0.024525701999664307, 0.0453830286860466, 0.05382111668586731, -0.03517719730734825, 0.11464711278676987, -0.0890202447772026, -0.11148509383201599, 0.09941159188747406, 0.003938918001949787, 0.004478453192859888, 0.0034466348588466644, 0.07089649885892868, -0.051293618977069855, -0.012674218975007534, 0.021874740719795227, -0.02001200243830681, -0.014911333099007607, 0.04920439049601555, 0.08929188549518585, -0.011127782054245472]', this usually means that 'query(query_embedding)' is not defined in the schema. See https://docs.vespa.ai/en/tensor-user-guide.html#querying-with-tensors"}]