In [30]:
import os
from datetime import date, timedelta

import numpy as np
import pandas as pd

from vespa.application import Vespa
from vespa.package import ApplicationPackage
from vespa.package import Field, FieldSet, RankProfile

In [2]:
# Directory that holds the source CSV tables.
DATA_DIR = "/data"
# Merged dataset read/written by the regeneration cell. Derived from DATA_DIR
# so that changing the data directory cannot leave this path pointing elsewhere.
DATA_FULL_PATH = os.path.join(DATA_DIR, "beers_full.csv")

# Regenerate the full beers dataset for indexing

In [3]:
# True: rebuild the merged dataset from the source CSVs and overwrite DATA_FULL_PATH.
# False: load the previously generated DATA_FULL_PATH as-is.
REGENERATE: bool = False

In [4]:
if REGENERATE:
    # Load every source table found in DATA_DIR, keyed by file name without extension.
    dfs = {}
    for fname in sorted(os.listdir(DATA_DIR)):
        table_name, extension = os.path.splitext(fname)
        # Only CSV tables are inputs; anything with "full" in its name is the
        # output of a previous run of this cell and must not be merged into itself.
        if extension.lower() != ".csv" or "full" in fname:
            continue
        dfs[table_name] = pd.read_csv(os.path.join(DATA_DIR, fname), encoding="utf-8")

    # Left-join the lookup tables onto the beers table. Each lookup exposes its
    # primary key as "id", so it is renamed to the matching foreign key first;
    # clashing column names from the lookup get the given suffix.
    lookups = [
        ("breweries", "brewery_id", "_breweries"),
        ("categories", "cat_id", "_categories"),
        ("styles", "style_id", "_styles"),
    ]
    df = dfs["beers"]
    for table_name, key, suffix in lookups:
        df = pd.merge(
            df,
            dfs[table_name].rename(columns={"id": key}),
            how="left",
            on=key,
            suffixes=("", suffix),
        )
    # The geocode table already carries a "brewery_id" column, so no rename is needed.
    df = pd.merge(
        df, dfs["breweries_geocode"], how="left", on="brewery_id", suffixes=("", "_geo")
    )

    # Drop bookkeeping columns that are not fed to Vespa.
    df = df.drop(
        columns=[
            "address2",
            "filepath_breweries",
            "filepath",
            "last_mod",
            "last_mod_styles",
            "last_mod_categories",
            "add_user",
            "add_user_breweries",
            "last_mod_breweries",
        ]
    )
    # Give the remaining ambiguous columns the names used by the Vespa schema.
    df = df.rename(
        columns={
            "descript_breweries": "description_brewery",
            "descript": "description_beer",
            "name_breweries": "brewery",
        }
    )

    df.to_csv(DATA_FULL_PATH, index=False)
else:
    # Reuse the dataset produced by an earlier regeneration.
    df = pd.read_csv(DATA_FULL_PATH)

In [5]:
# Eyeball three randomly chosen rows of the merged dataset.
df.sample(n=3)

Unnamed: 0,id,brewery_id,name,cat_id,style_id,abv,ibu,srm,upc,description_beer,...,phone,website,description_brewery,cat_name,cat_id_styles,style_name,id_geo,latitude,longitude,accuracy
3870,3827,1326,Moonglow Weizenbock,4,55,8.7,0.0,0.0,0,This dark amber wheat beer features fruity and...,...,1-610-873-0881,http://www.victorybeer.com,,German Ale,4.0,South German-Style Weizenbock,1255.0,40.0061,-75.6942,RANGE_INTERPOLATED
627,615,1085,Blonde Ale,-1,-1,0.0,0.0,0.0,0,,...,1-604-599-1190,,,,,,,,,
4410,4362,963,Lone Star Light,8,96,3.85,0.0,0.0,0,"""Lone Star Light mimics its full-bodied counte...",...,1-800-935-2337,http://www.pabst.com/,"At Pabst Brewing Company, we make beer with hi...",North American Lager,8.0,American-Style Light Lager,907.0,29.43,-98.49,APPROXIMATE


# Prepare a Vespa doctype

In [82]:
# Directory that receives the exported application package files.
VESPA_CONFIG_DIR = "/vespa-config"
# Path of the zipped application package, located inside VESPA_CONFIG_DIR.
VESPA_CONFIG_ZIP = os.path.join(VESPA_CONFIG_DIR, "app_package.zip")

### Define Vespa doctype

In [83]:
# One dict per schema field; each dict is passed as keyword arguments to
# vespa.package.Field. Fields listed without "indexing" get no indexing statement.
fields_spec = [
    {"name": "id", "type": "string"},
    {"name": "brewery_id", "type": "string"},
    {
        "name": "name",
        "type": "string",
        "indexing": ["index", "summary"],
        # The default rank profile scores bm25(name), which needs BM25 enabled
        # on the field's index just like the description fields below.
        "index": "enable-bm25",
    },
    {"name": "cat_id", "type": "string"},
    {"name": "style_id", "type": "string"},
    {"name": "abv", "type": "float", "indexing": ["attribute"]},
    {"name": "ibu", "type": "float", "indexing": ["attribute"]},
    {"name": "srm", "type": "float", "indexing": ["attribute"]},
    {"name": "upc", "type": "int", "indexing": ["attribute"]},
    {
        "name": "description_beer",
        "type": "string",
        "indexing": ["index", "summary"],
        "index": "enable-bm25",
    },
    {
        "name": "brewery",
        "type": "string",
        "indexing": ["index", "summary"],
        "index": "enable-bm25",
    },
    {"name": "address1", "type": "string", "indexing": ["index"]},
    {"name": "city", "type": "string", "indexing": ["index"]},
    {"name": "state", "type": "string", "indexing": ["index"]},
    {"name": "code", "type": "string"},
    {"name": "country", "type": "string", "indexing": ["index", "summary"]},
    {"name": "phone", "type": "string"},
    {"name": "website", "type": "string"},
    {
        "name": "description_brewery",
        "type": "string",
        "indexing": ["index"],
        "index": "enable-bm25",
    },
    {"name": "cat_name", "type": "string", "indexing": ["index", "summary"]},
    {"name": "cat_id_styles", "type": "string"},
    {"name": "style_name", "type": "string", "indexing": ["index", "summary"]},
    {"name": "id_geo", "type": "string"},
    {
        "name": "geoloc",
        "type": "position",
    },  # see https://docs.vespa.ai/en/geo-search.html
    {"name": "accuracy", "type": "string"},
]

In [84]:
# Application name; the feed and query cells further down address a schema called "beer".
app_name = "beer"
# Build the package with create_query_profile_by_default switched off.
app_package = ApplicationPackage(name=app_name, create_query_profile_by_default=False)

In [85]:
# Register one Field per entry of fields_spec on the package schema.
app_package.schema.add_fields(*(Field(**spec) for spec in fields_spec))

# "default" fieldset: covers the beer name and the beer description.
app_package.schema.add_field_set(
    FieldSet(
        name="default",
        fields=["name", "description_beer"],
    )
)

# "default" rank profile: sum of the BM25 scores of those same two fields.
app_package.schema.add_rank_profile(
    RankProfile(
        name="default",
        first_phase="bm25(name) + bm25(description_beer)",
    )
)

In [86]:
# Export the application package twice: as files under VESPA_CONFIG_DIR and
# as a zip archive at VESPA_CONFIG_ZIP.
app_package.to_files(VESPA_CONFIG_DIR)
app_package.to_zipfile(VESPA_CONFIG_ZIP)

In [89]:
# An override is only honoured up to its `until` date, and Vespa rejects dates
# more than 30 days ahead. Compute the date at run time (29 days out, leaving a
# one-day margin) so re-running the notebook never writes a stale override.
override_until = (date.today() + timedelta(days=29)).isoformat()
validation_overrides_str = f"""
<validation-overrides>
    <allow until='{override_until}'>indexing-change</allow>
</validation-overrides>
"""
# Written next to the exported application package files.
with open(
    os.path.join(VESPA_CONFIG_DIR, "validation-overrides.xml"), "w", encoding="utf-8"
) as f:
    f.write(validation_overrides_str)

**NOW** open a shell in the Vespa container and manually deploy the new application package.

## Push data to Vespa

In [46]:
# Client for the Vespa endpoint at http://vespa:8080.
client = Vespa(url="http://vespa", port=8080)
# Wait for the application to report that it is up; the argument is presumably
# the maximum wait in seconds (TODO confirm against the pyvespa docs).
client.wait_for_application_up(5)

Using http Authentication against endpoint http://vespa:8080/ApplicationStatus
Application is up!


In [79]:
def _is_missing(value) -> bool:
    """Return True for None and for pandas/NumPy missing markers (NaN, NA, NaT)."""
    return value is None or bool(pd.isna(value))


def _to_text(value) -> str:
    """Render a scalar as text, writing integral floats such as 4.0 as "4".

    Key columns that contain missing values are held as floats by pandas, so a
    plain str() would produce "4.0" for what is really the identifier 4.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# (column, converter) pairs in schema order; the converter yields the Python
# type that matches the field type declared in the Vespa schema.
_FIELD_CONVERTERS = (
    ("id", _to_text),  # string
    ("brewery_id", _to_text),  # string
    ("name", _to_text),  # string
    ("cat_id", _to_text),  # string
    ("style_id", _to_text),  # string
    ("abv", float),  # float
    ("ibu", float),  # float
    ("srm", float),  # float
    ("upc", int),  # int
    ("description_beer", _to_text),  # string
    ("brewery", _to_text),  # string
    ("address1", _to_text),  # string
    ("city", _to_text),  # string
    ("state", _to_text),  # string
    ("code", _to_text),  # string
    ("country", _to_text),  # string
    ("phone", _to_text),  # string
    ("website", _to_text),  # string
    ("description_brewery", _to_text),  # string
    ("cat_name", _to_text),  # string
    ("cat_id_styles", _to_text),  # string
    ("style_name", _to_text),  # string
    ("id_geo", _to_text),  # string
    ("accuracy", _to_text),  # string
)


def craft_vespa_fields(s: pd.Series) -> dict:
    """Turn one row of the merged beers dataset into a Vespa feed operation.

    Args:
        s: A row holding every column named in ``_FIELD_CONVERTERS`` plus
            ``latitude`` and ``longitude``.

    Returns:
        ``{"id": "beer:<id>", "fields": {...}}``. Each present value is
        converted to the type of its schema field. Missing values (NaN/None)
        are left out instead of being sent as NaN, which is not valid JSON.
        ``geoloc`` is included only when both coordinates are present.

    Raises:
        KeyError: If ``s`` lacks one of the expected columns.
    """
    fields = {}
    for column, convert in _FIELD_CONVERTERS:
        value = s[column]
        if not _is_missing(value):
            fields[column] = convert(value)

    # A position needs both coordinates; rows without a geocode get no geoloc.
    latitude, longitude = s["latitude"], s["longitude"]
    if not (_is_missing(latitude) or _is_missing(longitude)):
        fields["geoloc"] = {"lat": float(latitude), "lng": float(longitude)}  # position

    return {"id": f"beer:{_to_text(s['id'])}", "fields": fields}
    

In [90]:
def report_feed_failure(response, doc_id):
    """Print the Vespa response for every document that could not be fed.

    Passed to ``feed_iterable`` as its callback. Without a callback, rejected
    documents go unnoticed and simply end up missing from the index.
    """
    if not response.is_successful():
        print(f"Failed to feed document {doc_id}: {response.get_json()}")


# Convert every row into a feed operation and push them all to the "beer" schema.
client.feed_iterable(
    df.apply(craft_vespa_fields, axis=1).to_list(),
    schema="beer",
    callback=report_feed_failure,
)

## Final check

In [91]:
# Smoke test: run the user query "stout" against the beer schema, asking for one hit.
resp = client.query(
    {
        "yql": "select * from beer where userQuery()",
        "hits": 1,
        "query": "stout",
    }
)
# Show the fields of the first hit; this lookup fails if the response holds no hits.
resp.json["root"]["children"][0]["fields"]

{'sddocname': 'beer',
 'documentid': 'id:beer:beer::beer:4134',
 'name': 'Kalamazoo Stout',
 'description_beer': 'A full-bodied stout with plenty of roast flavor. Kalamazoo Stout is available year round, leading our vast portfolio of stouts.',
 'brewery': "Bell's Brewery Inc.",
 'country': 'United States',
 'cat_name': 'British Ale',
 'style_name': 'Sweet Stout'}