In [1]:
# Show disk usage of every mounted filesystem (shell escape, human-readable sizes).
!df -h

Filesystem            Size  Used Avail Use% Mounted on
overlay               110G   84G   21G  81% /
tmpfs                  64M     0   64M   0% /dev
shm                    64M     0   64M   0% /dev/shm
/run/host_mark/Users  461G  251G  211G  55% /datasets
/dev/vda1             110G   84G   21G  81% /data
tmpfs                 8.8G     0  8.8G   0% /sys/firmware


In [1]:
!pip install pandas numpy pyvespa

Defaulting to user installation because normal site-packages is not writeable


In [54]:
import datetime
import os

import numpy as np
import pandas as pd

from vespa.application import Vespa
from vespa.package import ApplicationPackage
from vespa.package import Field, FieldSet, RankProfile

In [55]:
# Folder holding the per-table CSV exports.
DATA_DIR = "/datasets/csv"
# Location of the pre-joined "full" export, which lives in the same folder.
DATA_FULL_PATH = f"{DATA_DIR}/beers_full.csv"

# Regenerate the full beers dataset for indexing

In [56]:
# Load every per-table CSV export found in DATA_DIR into a dict keyed by the
# table name (file name without its extension).
dfs = {}
for fname in sorted(os.listdir(DATA_DIR)):
    # Skip the pre-joined "full" export, and anything that is not a CSV file
    # (e.g. `.ipynb_checkpoints`, `.DS_Store`), which would make read_csv fail.
    if "full" in fname or not fname.lower().endswith(".csv"):
        continue
    table_name = os.path.splitext(fname)[0]
    dfs[table_name] = pd.read_csv(os.path.join(DATA_DIR, fname), encoding="utf-8")

# Lookup tables joined onto the beers table, as
# (table name, join key on the beers side, suffix for clashing lookup columns).
# Each lookup table's own `id` column is renamed to the join key before merging.
lookup_joins = [
    ("breweries", "brewery_id", "_breweries"),
    ("categories", "cat_id", "_categories"),
    ("styles", "style_id", "_styles"),
]

df = dfs["beers"]
for lookup_table, join_key, lookup_suffix in lookup_joins:
    # Left joins: every beer row is kept even when the lookup has no match.
    df = df.merge(
        dfs[lookup_table].rename(columns={"id": join_key}),
        how="left",
        on=join_key,
        suffixes=("", lookup_suffix),
    )

# The geocode table is joined on `brewery_id` directly, without renaming; its
# own `id` clashes with the beer `id` and therefore ends up as `id_geo`.
df = df.merge(
    dfs["breweries_geocode"], how="left", on="brewery_id", suffixes=("", "_geo")
)

# Drop bookkeeping columns that are not indexed, then give the remaining
# suffixed columns descriptive names.
df = df.drop(
    columns=[
        "address2",
        "filepath_breweries",
        "filepath",
        "last_mod",
        "last_mod_styles",
        "last_mod_categories",
        "add_user",
        "add_user_breweries",
        "last_mod_breweries",
    ]
).rename(
    columns={
        "descript_breweries": "description_brewery",
        "descript": "description_beer",
        "name_breweries": "brewery",
    }
)

In [57]:
# Display three randomly drawn rows of `df` (no seed is set, so the rows differ between runs).
df.sample(3)

Unnamed: 0,id,brewery_id,name,cat_id,style_id,abv,ibu,srm,upc,description_beer,...,phone,website,description_brewery,cat_name,cat_id_styles,style_name,id_geo,latitude,longitude,accuracy
525,513,204,Coronator Helle Doppelbock,7,90,7.5,0.0,0.0,0,,...,49-(0)7542-/-7452,,,German Lager,7.0,Traditional German-Style Bock,191.0,47.6715,9.5888,ROOFTOP
3413,3390,1095,Amarossa,-1,-1,0.0,0.0,0.0,0,,...,,,,,,,1034.0,29.4241,-98.4936,APPROXIMATE
2982,2959,907,Norski Honey Bock,8,95,0.0,0.0,0.0,0,,...,608-527-5850,http://www.newglarusbrewing.com/,"The brewery is run by an enthusiastic couple, ...",North American Lager,8.0,American-Style Lager,856.0,42.8171,-89.6306,ROOFTOP


0       Burlington microbrewers of Humble Patience, Fa...
1                                                     NaN
2                                                     NaN
3                                                     NaN
4                                                     NaN
                              ...                        
5951    Founded in 1986, the Abita Brewing Company is ...
5952    Founded in 1986, the Abita Brewing Company is ...
5953    Founded in 1986, the Abita Brewing Company is ...
5954                                                  NaN
5955                     La Brasserie du Bouffay est situ
Name: description_brewery, Length: 5956, dtype: object

# Prepare a Vespa doctype

In [11]:
# Directory used for the Vespa application package files.
VESPA_CONFIG_DIR = "/vespa-config"
# Path of the zipped application package, placed inside that directory.
VESPA_CONFIG_ZIP = os.path.join(VESPA_CONFIG_DIR, "app_package.zip")

### Define Vespa doctype

In [12]:
# Keyword arguments for one `vespa.package.Field` per schema field.
# Entries without an "indexing" key declare the field only; it is not given
# index, attribute or summary behaviour here.
fields_spec = [
    {"name": "id", "type": "string"},
    {"name": "brewery_id", "type": "string"},
    {
        "name": "name",
        "type": "string",
        "indexing": ["index", "summary"],
        # bm25(name) is used for ranking, which requires BM25 to be enabled
        # on this field's index.
        "index": "enable-bm25",
    },
    {"name": "cat_id", "type": "string"},
    {"name": "style_id", "type": "string"},
    {"name": "abv", "type": "float", "indexing": ["attribute"]},
    {"name": "ibu", "type": "float", "indexing": ["attribute"]},
    {"name": "srm", "type": "float", "indexing": ["attribute"]},
    {"name": "upc", "type": "int", "indexing": ["attribute"]},
    {
        "name": "description_beer",
        "type": "string",
        "indexing": ["index", "summary"],
        "index": "enable-bm25",
    },
    {
        "name": "brewery",
        "type": "string",
        "indexing": ["index", "summary", "attribute"],
        "index": "enable-bm25",
    },
    {"name": "address1", "type": "string", "indexing": ["index"]},
    {"name": "city", "type": "string", "indexing": ["index"]},
    {"name": "state", "type": "string", "indexing": ["index"]},
    {"name": "code", "type": "string"},
    {"name": "country", "type": "string", "indexing": ["index", "summary", "attribute"]},
    {"name": "phone", "type": "string"},
    {"name": "website", "type": "string"},
    {
        "name": "description_brewery",
        "type": "string",
        "indexing": ["index"],
        "index": "enable-bm25",
    },
    {"name": "cat_name", "type": "string", "indexing": ["index", "summary", "attribute"]},
    {"name": "cat_id_styles", "type": "string"},
    {"name": "style_name", "type": "string", "indexing": ["index", "summary"]},
    {"name": "id_geo", "type": "string"},
    {
        "name": "geoloc",
        "type": "position",
        # Position fields must be attributes to be usable in geo queries.
        "indexing": ["attribute"],
    },  # see https://docs.vespa.ai/en/geo-search.html
    {"name": "accuracy", "type": "string"},
]

In [9]:
# Name of the Vespa application; the query in the final check selects from a schema of the same name.
app_name = "beer"
# Create the application package, opting out of the default query profile.
app_package = ApplicationPackage(name=app_name, create_query_profile_by_default=False)

In [10]:
# Register one schema field per entry of `fields_spec`.
app_package.schema.add_fields(*(Field(**spec) for spec in fields_spec))

# Field set named "default", made of the beer name and the beer description.
app_package.schema.add_field_set(
    FieldSet(
        name="default",
        fields=["name", "description_beer"],
    )
)

# Default rank profile: first-phase score is the sum of the BM25 scores of the
# same two fields.
app_package.schema.add_rank_profile(
    RankProfile(
        name="default",
        first_phase="bm25(name) + bm25(description_beer)",
    )
)

In [11]:
# Export the application package twice: as files under VESPA_CONFIG_DIR and
# as a single zip archive at VESPA_CONFIG_ZIP.
app_package.to_files(VESPA_CONFIG_DIR)
app_package.to_zipfile(VESPA_CONFIG_ZIP)

In [12]:
# Allow the `indexing-change` validation to be overridden on redeploy.
# The `until` date is computed at run time instead of being hard-coded, so it
# does not go stale; Vespa only accepts a date a limited number of days ahead
# (30 at most), hence the 29-day window.
override_until = (datetime.date.today() + datetime.timedelta(days=29)).isoformat()
validation_overrides_str = f"""
<validation-overrides>
    <allow until='{override_until}'>indexing-change</allow>
</validation-overrides>
"""
# Written next to the exported application package files.
with open(
    os.path.join(VESPA_CONFIG_DIR, "validation-overrides.xml"), "w", encoding="utf-8"
) as f:
    f.write(validation_overrides_str)

**Now** open a shell in the Vespa container and manually deploy the new app:
```
cd /vespa-config
chmod +x vespa-deploy.sh
./vespa-deploy.sh
```

# Push data to Vespa

In [6]:
# Client for the Vespa endpoint at http://vespa:8080.
client = Vespa(url="http://vespa", port=8080)
# Block until the application answers; the argument is presumably the maximum
# wait in seconds (TODO confirm against the pyvespa documentation).
client.wait_for_application_up(5)

Using http Authentication against endpoint http://vespa:8080/ApplicationStatus
Application is up!


In [7]:
def _is_missing(value) -> bool:
    """Return True when a scalar cell is None/NaN/NaT, i.e. has no value to feed."""
    return pd.api.types.is_scalar(value) and bool(pd.isna(value))


def _to_vespa_string(value) -> str:
    """Render a cell as the text fed to a Vespa `string` field.

    Integral floats (IDs that became floats because a left join introduced
    NaN into the column) are rendered without the trailing ".0".
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# Schema field name -> converter producing a JSON-serializable value of the
# type declared for that field. Every key is also the dataframe column name.
_VESPA_FIELD_CASTS = {
    "id": _to_vespa_string,
    "brewery_id": _to_vespa_string,
    "name": _to_vespa_string,
    "cat_id": _to_vespa_string,
    "style_id": _to_vespa_string,
    "abv": float,
    "ibu": float,
    "srm": float,
    "upc": int,
    "description_beer": _to_vespa_string,
    "brewery": _to_vespa_string,
    "address1": _to_vespa_string,
    "city": _to_vespa_string,
    "state": _to_vespa_string,
    "code": _to_vespa_string,
    "country": _to_vespa_string,
    "phone": _to_vespa_string,
    "website": _to_vespa_string,
    "description_brewery": _to_vespa_string,
    "cat_name": _to_vespa_string,
    "cat_id_styles": _to_vespa_string,
    "style_name": _to_vespa_string,
    "id_geo": _to_vespa_string,
    "accuracy": _to_vespa_string,
}


def _build_vespa_fields(s: pd.Series) -> dict:
    """Build the `fields` payload shared by both document flavours.

    Missing cells are left out instead of being sent as NaN (which is not
    valid JSON), and every value is converted to a native Python type
    matching the schema. `geoloc` is only set when both coordinates exist.

    Raises:
        ValueError: if the row has no beer `id`.
    """
    if _is_missing(s["id"]):
        raise ValueError("cannot build a Vespa document from a row without an 'id'")

    fields = {}
    for name, cast in _VESPA_FIELD_CASTS.items():
        value = s[name]
        if not _is_missing(value):
            fields[name] = cast(value)

    lat, lng = s["latitude"], s["longitude"]
    if not (_is_missing(lat) or _is_missing(lng)):
        fields["geoloc"] = {"lat": float(lat), "lng": float(lng)}  # position
    return fields


def craft_vespa_fields(s: pd.Series) -> dict:
    """Turn one row of the joined beers frame into a document for pyvespa.

    Args:
        s: a row holding the beer, brewery, category, style and geocode columns.

    Returns:
        ``{"id": "beer:<beer id>", "fields": {...}}`` where ``fields`` holds
        only the non-missing values, converted to the schema types.

    Raises:
        ValueError: if the row has no beer `id`.
    """
    fields = _build_vespa_fields(s)
    return {"id": f"beer:{fields['id']}", "fields": fields}


def craft_vespa_fields_for_cli(s: pd.Series) -> dict:
    """Turn one row of the joined beers frame into a document for the Vespa CLI.

    Same payload as `craft_vespa_fields`, but with a fully qualified
    document ID.

    Args:
        s: a row holding the beer, brewery, category, style and geocode columns.

    Returns:
        ``{"id": "id:beer_content:beer::<beer id>", "fields": {...}}``.

    Raises:
        ValueError: if the row has no beer `id`.
    """
    fields = _build_vespa_fields(s)
    return {"id": f"id:beer_content:beer::{fields['id']}", "fields": fields}
    
    

In [None]:
# Serialize every beer as one JSON document per line (JSONL), using the
# CLI-flavoured document IDs, then print the manual feeding procedure.
df.apply(craft_vespa_fields_for_cli, axis=1).to_json(
    "./beers_vespa.json",
    orient="records",
    lines=True,
)
print(
    "\n".join(
        [
            "Now do:",
            "- cp beers_vespa.json /datasets",
            "- docker exec -it vespa bash",
            "- cd /datasets",
            "- vespa feed --verbose beers_vespa.json",
            "You should have every data into Vespa",
        ]
    )
)

In [8]:
# PyVespa-based push (alternative to feeding the JSONL file with the Vespa CLI).
def report_feed_failure(response, id):
    """Print every document Vespa rejected, so failed feeds are not silent."""
    if not response.is_successful():
        print(f"Failed to feed document {id}: {response.get_json()}")


# Convert the dataframe once and reuse the result for both the sample and the feed.
vespa_docs = df.apply(craft_vespa_fields, axis=1).to_list()
point = vespa_docs[0]  # first document, kept around to eyeball the payload
client.feed_iterable(vespa_docs, schema="beer", callback=report_feed_failure)

## Final check

In [10]:
# Smoke test: run the free-text query "stout" against the `beer` schema and
# ask for at most 10 hits.
resp = client.query(
    {
        "yql": "select * from beer where userQuery()",
        "hits": 10,
        "query": "stout",
    }
)
# Show the fields of the first hit; raises if the query returned no hits.
resp.json["root"]["children"][0]["fields"]

{'sddocname': 'beer',
 'documentid': 'id:beer:beer::beer:4134',
 'name': 'Kalamazoo Stout',
 'description_beer': 'A full-bodied stout with plenty of roast flavor. Kalamazoo Stout is available year round, leading our vast portfolio of stouts.',
 'brewery': "Bell's Brewery Inc.",
 'country': 'United States',
 'cat_name': 'British Ale',
 'style_name': 'Sweet Stout',
 'summaryfeatures': {'bm25(brewery)': 0.0,
  'bm25(description_beer)': 3.73974816301847,
  'length_of_descr': 1.0,
  'vespa.summaryFeatures.cached': 0.0}}