In [1]:
# !python3 -m venv venv
!source venv/bin/activate

In [1]:
# pip install pyvespa

In [None]:
# pip install datasets

In [None]:
# pip install docker

In [1]:
# !docker info | grep "Total Memory" 

 Total Memory: 7.469GiB


### Parte apenas para limpar o docker pq ele tava ficando com os dados das outras bases de dados 

In [10]:
import docker
client = docker.from_env()
for container in client.containers.list():
  print(container.id, container.name)


b3e03197362e10830ee5f8414a3b2d4ce768e1251df0998ec48f54e4e769e454 podflix


In [11]:
container = client.containers.get('podflix')
container.stop()  # Parar o contêiner
container.remove()  # Remover o contêiner

## BM25

In [12]:
from vespa.package import ApplicationPackage, Field, Schema, Document, RankProfile, HNSW, RankProfile, Component, Parameter, FieldSet, GlobalPhaseRanking, Function
from vespa.io import VespaResponse, VespaQueryResponse

import pandas as pd

In [13]:

package = ApplicationPackage(
        name="podflix",
        schema=[Schema(
            name="doc",
            document=Document(
                fields=[
                    Field(name="id", type="string", indexing=["summary"]),
                    Field(name="title", type="string", indexing=["index", "summary"], index="enable-bm25"),
                    Field(name="description", type="string", indexing=["index", "summary"], index="enable-bm25", bolding=True),
                    Field(name="transcript", type="string", indexing=["index", "summary"], index="enable-bm25", bolding=True)
                ]
            ),
            fieldsets=[
                FieldSet(name = "default", fields = ["title", "description", "transcript"])
            ],
            rank_profiles=[
                RankProfile(
                    name="bm25",
                    inputs=[("query(q)", "tensor<float>(x[384])")],
                    functions=[Function(
                        name="bm25sum", expression="bm25(title) + bm25(description) + bm25(transcript)"
                    )],
                    first_phase="bm25sum"
                )
            ]
        )
        ],
        components=[Component(id="e5", type="hugging-face-embedder",
            parameters=[
                Parameter("transformer-model", {"url": "https://github.com/vespa-engine/sample-apps/raw/master/simple-semantic-search/model/e5-small-v2-int8.onnx"}),
                Parameter("tokenizer-model", {"url": "https://raw.githubusercontent.com/vespa-engine/sample-apps/master/simple-semantic-search/model/tokenizer.json"})
            ]
        )]
    )

In [14]:
from vespa.deployment import VespaDocker

vespa_docker = VespaDocker()

app = vespa_docker.deploy(application_package=package)

Waiting for configuration server, 0/300 seconds...
Waiting for configuration server, 5/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 0/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 5/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 10/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 15/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 20/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 25/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 30/300 seconds...
Using plain http against endpoin

In [15]:
data = pd.read_csv('data/tal_links_full.csv')
data.head(2)

Unnamed: 0,link,episode,title,date_on_air,description,transcript
0,https://www.thisamericanlife.org/sites/default...,1,New Beginnings,"1 November 17, 1995",Our program's very first broadcast.,Ira Glass: Joe Franklin?\nJoe Franklin: I'm re...
1,https://www.thisamericanlife.org/sites/default...,2,Small Scale Sin,"2 November 24, 1995",Small-scale stories on the nature of small-sca...,"Ira Glass: OK, three boys, aged 13, 15, and 16..."


In [16]:
def transform_row(row):
    return {
        "id": row['episode'],
        "fields": {
            "title": row['title'],
            "description": row['description'],
            "transcript": row['transcript'],
            "id": row['episode']
        }
    }

vespa_feed = data.apply(transform_row, axis=1).tolist()

In [17]:
def callback(response:VespaResponse, id:str):
    if not response.is_successful():
        print(f"Error when feeding document {id}: {response.get_json()}")

app.feed_iterable(vespa_feed, schema="doc", namespace="teste_podflix", callback=callback)


Error when feeding document 6: {'Exception': 'Out of range float values are not JSON compliant', 'id': 6, 'message': 'Exception during feed_data_point'}


Error when feeding document 460: {'Exception': 'Out of range float values are not JSON compliant', 'id': 460, 'message': 'Exception during feed_data_point'}
Error when feeding document 497: {'Exception': 'Out of range float values are not JSON compliant', 'id': 497, 'message': 'Exception during feed_data_point'}


In [18]:
def display_hits_as_df(response:VespaQueryResponse, fields) -> pd.DataFrame:
    records = []
    for hit in response.hits:
        record = {}
        for field in fields:
            record[field] = hit['fields'][field]
        records.append(record)
    return pd.DataFrame(records)

In [19]:
# pd.reset_option('display.max_colwidth')
# pd.set_option('display.max_colwidth', None)

with app.syncio(connections=1) as session:
  query = "Hollywood"
  response:VespaQueryResponse = session.query(
    yql="select * from sources * where userQuery() limit 5",
    query=query,
    ranking="bm25"
  )
  assert(response.is_successful())
  
  print(display_hits_as_df(response, ["id", "title"]))

    id                     title
0  225               Home Movies
1  169      Pursuit of Happiness
2  121     Twentieth Century Man
3  173  Three Kinds of Deception
4   43         Faustian Bargains
