In [13]:
import os
import pandas as pd
import numpy as np
import random

from vespa.package import ApplicationPackage, Component, Parameter
from vespa.package import Field, FieldSet, RankProfile
from vespa.application import Vespa

In [14]:
# Folder that holds the per-table CSV exports read by this notebook.
DATA_DIR = "/datasets/csv"
# Path of the merged "full" export, located inside DATA_DIR.
DATA_FULL_PATH = f"{DATA_DIR}/beers_full.csv"

# Regenerate the full beers dataset for indexing

In [15]:
# Load every source table found in DATA_DIR, keyed by the file name without
# its extension (e.g. "beers.csv" -> dfs["beers"]).
dfs = {}
for fname in sorted(os.listdir(DATA_DIR)):
    # Skip previously generated "full" exports, and anything that is not a CSV
    # file (sub-directories such as `.ipynb_checkpoints`, stray files, ...),
    # which would otherwise make `pd.read_csv` fail.
    if "full" in fname or not fname.lower().endswith(".csv"):
        continue
    table_name = os.path.splitext(fname)[0]
    dfs[table_name] = pd.read_csv(os.path.join(DATA_DIR, fname), encoding="utf-8")

# Left-join the lookup tables onto the beers table. Each lookup table exposes
# its primary key as "id", so it is renamed to the matching foreign-key column
# of `beers` first; clashing column names receive the per-table suffix.
df = dfs["beers"]
for lookup_table, key_column, suffix in (
    ("breweries", "brewery_id", "_breweries"),
    ("categories", "cat_id", "_categories"),
    ("styles", "style_id", "_styles"),
):
    df = pd.merge(
        df,
        dfs[lookup_table].rename(columns={"id": key_column}),
        how="left",
        on=key_column,
        suffixes=("", suffix),
    )

# The geocode table is joined on `brewery_id` as-is, so no rename is needed.
df = pd.merge(
    df, dfs["breweries_geocode"], how="left", on="brewery_id", suffixes=("", "_geo")
)

# Drop the columns that are not carried over to the merged dataset.
df = df.drop(
    columns=[
        "address2",
        "filepath_breweries",
        "filepath",
        "last_mod",
        "last_mod_styles",
        "last_mod_categories",
        "add_user",
        "add_user_breweries",
        "last_mod_breweries",
    ]
)
# Give the description / brewery-name columns self-explanatory names.
df = df.rename(
    columns={
        "descript_breweries": "description_brewery",
        "descript": "description_beer",
        "name_breweries": "brewery",
    }
)

# NOTE(review): DATA_FULL_PATH is defined in the config cell but the export is
# written relative to the working directory instead — confirm which is intended.
df.to_csv("../beers_full.csv", index=False)

In [16]:
def craft_global_description(row: pd.Series) -> str:
    """Build a plain-text description of one beer from a row of the merged dataset.

    Args:
        row: A row holding at least the `name`, `brewery`, `description_beer`,
            `description_brewery`, `style_name`, `abv` and `ibu` entries.

    Returns:
        A multi-line string: a header naming the beer and its brewery, one
        bullet per available attribute (beer description, brewery description,
        style), and a final "specs" bullet with ABV and IBU.

    Notes:
        When `abv` or `ibu` is not strictly positive (this includes NaN), a
        random value is substituted, so the output is not deterministic for
        such rows unless `random` has been seeded.
    """
    # Use item access: `row.name` is the Series label (the DataFrame index of
    # the row), not the value of the "name" column.
    name = row["name"]
    brewer = row["brewery"]
    beer_descr = row["description_beer"]
    brewer_descr = row["description_brewery"]
    style = "" if pd.isna(row["style_name"]) else row["style_name"]

    # Missing / zero ABV: draw a value in [3, 12). Kept to one decimal (truncated).
    abv = row["abv"] if row["abv"] > 0 else (random.random() * 9 + 3)
    abv = int(abv * 10) / 10

    # Missing / zero IBU: draw a value in [20, 80), with an extra [0, 60) for IPAs.
    if row["ibu"] > 0:
        ibu = row["ibu"]
    else:
        ibu = 20 + (
            random.random() * 60
            + (random.random() * 60 if "ipa" in style.lower() else 0)
        )
    ibu = int(ibu * 10) / 10

    descr = f"The beer {name} from brewery {brewer} has the following attributes:\n"
    if not pd.isna(beer_descr):
        descr += f"- beer description: {beer_descr}\n"
    if not pd.isna(brewer_descr):
        descr += f"- brewery description: {brewer_descr}\n"
    # `style` was already coerced to "" when missing, so test for emptiness
    # rather than NaN; otherwise an empty "beer style" bullet is emitted.
    if style:
        descr += f"- beer style: {style}\n"
    descr += f"- specs: Alcohol by volume: {abv}, bitterness unit: {ibu}"
    return descr

#df = df.assign(global_description=df.apply(craft_global_description, axis=1))

# Prepare a Vespa doctype

In [17]:
# Directory that receives the generated Vespa application package files.
VESPA_CONFIG_DIR = "/vespa-config"
# Path of the zipped application package, located inside VESPA_CONFIG_DIR.
VESPA_CONFIG_ZIP = os.path.join(VESPA_CONFIG_DIR, "app_package.zip")

### Define Vespa doctype

In [6]:
# Declarative description of the `beer` document fields. Each entry is passed
# as keyword arguments to `vespa.package.Field`.
fields_spec = [
    {"name": "id", "type": "string"},
    {"name": "brewery_id", "type": "string"},
    {
        "name": "name",
        "type": "string",
        "indexing": ["index", "summary", "attribute"],
        # Required for the `bm25(name)` term used by the
        # "rank-brewery-and-descr" rank profile: the bm25 rank feature is only
        # available on fields whose index enables it.
        "index": "enable-bm25",
    },
    {
        "name": "description_beer",
        "type": "string",
        "indexing": ["index", "summary", "attribute"],
        "index": "enable-bm25",
    },
    {
        "name": "brewery",
        "type": "string",
        "indexing": ["index", "summary", "attribute"],
        "index": "enable-bm25",
    },
    {
        "name": "description_brewery",
        "type": "string",
        "indexing": ["index", "summary", "attribute"],
        "index": "enable-bm25",
    },
    {"name": "cat_id", "type": "string"},
    {"name": "style_id", "type": "string"},
    {"name": "abv", "type": "float", "indexing": ["attribute"]},
    {"name": "ibu", "type": "float", "indexing": ["attribute"]},
    {"name": "srm", "type": "float", "indexing": ["attribute"]},
    {"name": "upc", "type": "int", "indexing": ["attribute"]},
    {"name": "address1", "type": "string", "indexing": ["index"]},
    {"name": "city", "type": "string", "indexing": ["index"]},
    {"name": "state", "type": "string", "indexing": ["index"]},
    {"name": "code", "type": "string"},
    {"name": "country", "type": "string", "indexing": ["index", "summary", "attribute"]},
    {"name": "phone", "type": "string"},
    {"name": "website", "type": "string"},
    {"name": "cat_name", "type": "string", "indexing": ["index", "summary", "attribute"]},
    {"name": "cat_id_styles", "type": "string"},
    {"name": "style_name", "type": "string", "indexing": ["index", "summary"]},
    {"name": "id_geo", "type": "string"},
    # NOTE(review): no "indexing" is declared for this position field; the
    # geo-search guide linked below presumably expects an attribute — confirm
    # before relying on geo queries.
    {
        "name": "geoloc",
        "type": "position",
    },  # see https://docs.vespa.ai/en/geo-search.html
    {"name": "accuracy", "type": "string"},
]

In [7]:
# Download locations of the embedder's model files, keyed by the component
# parameter that receives each of them.
embedder_model_urls = {
    "transformer-model": "https://huggingface.co/mixedbread-ai/mxbai-embed-xsmall-v1/resolve/main/onnx/model_q4.onnx",
    "tokenizer-model": "https://huggingface.co/mixedbread-ai/mxbai-embed-xsmall-v1/resolve/main/tokenizer.json",
}

# Hugging Face embedder component registered with the application package.
embedder = Component(
    id="mxbai-old",
    type="hugging-face-embedder",
    parameters=[
        Parameter(parameter_name, {"url": model_url})
        for parameter_name, model_url in embedder_model_urls.items()
    ],
)

In [8]:
# Name of the Vespa application.
app_name = "beer"

# Application package carrying the embedder component; no default query
# profile is generated and stateless model evaluation is switched on.
app_package = ApplicationPackage(
    name=app_name,
    components=[embedder],
    stateless_model_evaluation=True,
    create_query_profile_by_default=False,
)

In [9]:
beer_schema = app_package.schema

# Declare one schema field per entry of the spec table.
beer_schema.add_fields(*(Field(**spec) for spec in fields_spec))

# Field set named "default", grouping the beer name and its description.
beer_schema.add_field_set(
    FieldSet(name="default", fields=["name", "description_beer"])
)

# Rank profiles: profile name -> first-phase ranking expression.
for profile_name, first_phase_expression in (
    ("root", "bm25(description_beer)"),
    ("rank-brewery-and-descr", "bm25(name) + bm25(description_beer)"),
):
    beer_schema.add_rank_profile(
        RankProfile(name=profile_name, first_phase=first_phase_expression)
    )

In [10]:
# Export the application package twice: as files under VESPA_CONFIG_DIR and
# as a zip archive at VESPA_CONFIG_ZIP.
app_package.to_files(VESPA_CONFIG_DIR)
app_package.to_zipfile(VESPA_CONFIG_ZIP)

In [11]:
# A validation override is only honoured while its `until` date lies in the
# future, so a hard-coded date stops working once it has passed. Compute a
# short-lived date at run time instead (kept well under the 30-day horizon
# documented by Vespa for overrides).
override_until = (pd.Timestamp.now(tz="UTC") + pd.Timedelta(days=14)).strftime("%Y-%m-%d")
validation_overrides_str = f"""
<validation-overrides>
    <allow until='{override_until}'>indexing-change</allow>
</validation-overrides>
"""
# Written next to the exported application package files.
with open(
    os.path.join(VESPA_CONFIG_DIR, "validation-overrides.xml"), "w", encoding="utf-8"
) as f:
    f.write(validation_overrides_str)

**NOW** open a shell in the Vespa container and manually deploy the new app:
```
cd /vespa-config
chmod +x vespa-deploy.sh
./vespa-deploy.sh
```

# Push data to Vespa

In [18]:
# Client for the Vespa endpoint at http://vespa:8080.
client = Vespa(url="http://vespa", port=8080)
# NOTE(review): 5 is presumably the maximum wait in seconds — confirm against the pyvespa API.
client.wait_for_application_up(5)

Application is up!


In [19]:
# (field name, declared Vespa type) pairs, in the order the fields are emitted.
_VESPA_FIELD_TYPES = (
    ("id", "string"),
    ("brewery_id", "string"),
    ("name", "string"),
    ("cat_id", "string"),
    ("style_id", "string"),
    ("abv", "float"),
    ("ibu", "float"),
    ("srm", "float"),
    ("upc", "int"),
    ("description_beer", "string"),
    ("brewery", "string"),
    ("address1", "string"),
    ("city", "string"),
    ("state", "string"),
    ("code", "string"),
    ("country", "string"),
    ("phone", "string"),
    ("website", "string"),
    ("description_brewery", "string"),
    ("cat_name", "string"),
    ("cat_id_styles", "string"),
    ("style_name", "string"),
    ("id_geo", "string"),
    ("geoloc", "position"),  # built from the `latitude` / `longitude` entries
    ("accuracy", "string"),
)


def _is_missing(value) -> bool:
    """Return True when `value` is a missing scalar (None, NaN, NA, NaT)."""
    return pd.api.types.is_scalar(value) and bool(pd.isna(value))


def _normalise_value(value, vespa_type: str):
    """Convert one non-missing cell into a JSON-friendly value of the declared type."""
    # Unwrap numpy scalars: the standard `json` module cannot serialise e.g. np.int64.
    if isinstance(value, np.generic):
        value = value.item()
    # Integer columns are up-cast to float by pandas when they contain missing
    # values; undo that so ids read "11" rather than "11.0".
    if vespa_type in ("string", "int") and isinstance(value, float) and value.is_integer():
        value = int(value)
    return str(value) if vespa_type == "string" else value


def _craft_fields(s: pd.Series) -> dict:
    """Build the `fields` payload shared by both document builders.

    Missing values are left out instead of being emitted as NaN/null, and
    `geoloc` is only emitted when both coordinates are present.
    """
    fields = {}
    for field_name, vespa_type in _VESPA_FIELD_TYPES:
        if vespa_type == "position":
            lat, lng = s["latitude"], s["longitude"]
            if not (_is_missing(lat) or _is_missing(lng)):
                fields[field_name] = {
                    "lat": _normalise_value(lat, "float"),
                    "lng": _normalise_value(lng, "float"),
                }
            continue
        value = s[field_name]
        if not _is_missing(value):
            fields[field_name] = _normalise_value(value, vespa_type)
    return fields


def craft_vespa_fields(s: pd.Series) -> dict:
    """Turn one row of the merged dataset into a document for the pyvespa feed.

    Args:
        s: A row of the merged beers DataFrame.

    Returns:
        A dict with the document id under "id" (formatted "beer:<id>") and the
        field values under "fields".
    """
    return {"id": f"beer:{s['id']}", "fields": _craft_fields(s)}


def craft_vespa_fields_for_cli(s: pd.Series) -> dict:
    """Turn one row of the merged dataset into a document for the Vespa CLI feed.

    Args:
        s: A row of the merged beers DataFrame.

    Returns:
        A dict with the full document id under "id" (formatted
        "id:beer_content:beer::<id>") and the field values under "fields".
    """
    return {"id": f"id:beer_content:beer::{s['id']}", "fields": _craft_fields(s)}
    
    

In [20]:
# Write one Vespa CLI document per beer into a single JSON Lines file.
cli_documents = df.apply(craft_vespa_fields_for_cli, axis=1)
cli_documents.to_json("./beers_vespa.json", orient="records", lines=True)

# Remind the reader of the manual feeding steps.
print(
    "Now do:\n"
    "- cp beers_vespa.json /datasets\n"
    "- docker exec -it vespa bash\n"
    "- cd /datasets\n"
    "- vespa feed --verbose beers_vespa.json\n"
    "You should have every data into Vespa"
)

Now do:
- cp beers_vespa.json /datasets
- docker exec -it vespa bash
- cd /datasets
- vespa feed --verbose beers_vespa.json
You should have every data into Vespa


In [8]:
# Alternative push through pyvespa (reported as not working well).
# The documents are built once and reused for both the sample and the feed.
pyvespa_documents = df.apply(craft_vespa_fields, axis=1)
point = pyvespa_documents.iloc[0]  # first document, kept for inspection
client.feed_iterable(pyvespa_documents.to_list(), schema="beer")

## Final check

In [22]:
# Smoke test: run a free-text query for "stout" and display the fields of the
# first hit.
stout_query = {
    "yql": "select * from beer where userQuery()",
    "hits": 10,
    "query": "stout",
}
resp = client.query(stout_query)
resp.json["root"]["children"][0]["fields"]

{'sddocname': 'beer',
 'documentid': 'id:beer_content:beer::4933',
 'mrl_embedding': {'type': 'tensor<float>(x[384])',
  'values': [-0.002487529069185257,
   0.03732241690158844,
   0.07591114938259125,
   0.01705878973007202,
   -0.05777906998991966,
   0.026051253080368042,
   0.012164727784693241,
   0.04527696594595909,
   -0.08018425107002258,
   0.05988828465342522,
   0.02091125398874283,
   -0.03639068454504013,
   0.0402924120426178,
   -0.07223512977361679,
   0.039725352078676224,
   -0.08293665945529938,
   0.01137600652873516,
   0.019361568614840508,
   -0.013785398565232754,
   -0.029221994802355766,
   0.10936593264341354,
   -0.006100562866777182,
   -0.08122404664754868,
   -0.05353588983416557,
   -0.03830980136990547,
   0.006362958811223507,
   0.003033282235264778,
   0.018886558711528778,
   0.09863639622926712,
   0.10103290528059006,
   -0.005894968751817942,
   0.08185400068759918,
   -0.021013090386986732,
   -0.06748958677053452,
   -0.045515164732933044,
  