In [2]:
%load_ext autoreload
%autoreload 2
import vap.foo as foo

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [166]:
# The goal is to prepare a Vespa app that models documents with many chunks, where each chunk can mention many entities,
# and then to score each chunk by how many of the queried entities it matched.
# Chunks that match no entities must not be returned.
# The tricks are:
# - using the "layered ranking" https://blog.vespa.ai/introducing-layered-ranking-for-rag-applications/
# - to discard the low scoring chunks we can use filter_subspaces https://docs.vespa.ai/en/reference/ranking/ranking-expressions.html#filter-subspaces

# To achieve that we need a single document type (schema).
# Lucene Linguistics would be a sure way to meet the tokenization and matching requirements, but let's first try to reach the goal with the default linguistics library.

In [13]:
from vespa.package import (ApplicationPackage, Field, Schema, Document, RankProfile, Function, Summary, FirstPhaseRanking, DocumentSummary)

# Ranking feature that scores the `entities` array element by element; the query
# outputs further down show it as a `tensor<float>(i{})` keyed by chunk position.
elementwise_bm25 = 'elementwise(bm25(entities),i,float)'

id_field = Field(name='id', type='int', indexing=['attribute'])

# Chunk texts. The summary only returns the elements selected by the
# `best_chunks` rank function defined in the rank profile below.
chunks_field = Field(
    name='chunks',
    type='array<string>',
    indexing=['index', 'summary'],
    summary=Summary(fields=[('select-elements-by', ['best_chunks'])]),
)

# Entity ids per chunk, positionally aligned with `chunks`.
# Each string lists the entities mentioned in the chunk at the same position,
# e.g. ["EID123 EID321", "", "EID9"] describes a doc with 3 chunks:
# - the first chunk mentions the 'EID123' and 'EID321' entities
# - the second chunk mentions no entities
# - the third chunk mentions the single entity "EID9"
entities_field = Field(
    name='entities',
    type='array<string>',
    indexing=['index', 'summary'],
    match=['text', 'cased'],
    index={'stemming': 'none'},
    summary=Summary(fields=[('select-elements-by', ['best_chunks'])]),
)

default_profile = RankProfile(
    name='default',
    # The document score is the sum of its per-chunk scores.
    first_phase=FirstPhaseRanking(expression='reduce(best_entities, sum, i)'),
    functions=[
        Function(name='best_chunks', expression='top(2, best_entities)'),
        Function(name='best_entities', expression=elementwise_bm25),
    ],
    summary_features=['best_chunks'],
    match_features=['best_entities'],
    # k1 is overridden to 0 for the elementwise feature; in the query outputs
    # below every matched entity then adds exactly 1.0 to its chunk.
    rank_properties=[(f'{elementwise_bm25}.k1', '0')],
)

# Extra summary class that exposes the indexed tokens of `entities`.
tokenization_summary = DocumentSummary(
    name='tokenization',
    from_disk=True,
    summary_fields=[
        Summary(name='entities_tokens', fields=[('source', 'entities'), 'tokens']),
    ],
)

doc_schema = Schema(
    name='docs',
    document=Document(fields=[id_field, chunks_field, entities_field]),
    rank_profiles=[default_profile],
    document_summaries=[tokenization_summary],
)
application_package = ApplicationPackage(name='test', schema=[doc_schema])

In [4]:
from vespa.deployment import VespaDocker

# Pinned image tag so the notebook keeps running against a known Vespa version.
vespa_image = "vespaengine/vespa:8.617.12"
vespa_docker = VespaDocker(container_image=vespa_image)
# Kept as the last expression so the deployed application handle is displayed.
vespa_docker.deploy(application_package=application_package)

Waiting for configuration server, 0/60 seconds...
Waiting for configuration server, 5/60 seconds...
Waiting for application to come up, 0/300 seconds.
Waiting for application to come up, 5/300 seconds.
Waiting for application to come up, 10/300 seconds.
Application is up!
Finished deployment.


Vespa(http://localhost, 8080)

In [14]:
# Push the current `application_package` to the container held by `vespa_docker`
# through the project's `vap.redeploy` helper (it prints the deploy status code).
# The returned object is what the feed and query cells below use as `client`.
client = vap.redeploy(vespa_docker, application_package)

Deploy status code: 200


In [6]:
# One test document with three chunks. `chunk_entities` is aligned with
# `chunk_texts` by position; the empty string is the chunk without entities.
doc_id = 1
chunk_texts = ['chunk_1', 'chunk_2', 'chunk_3']
chunk_entities = ['eid1 eid2', '', 'eid9']

docs = [
    {
        'id': f'{doc_id}',
        'fields': {
            'id': doc_id,
            'chunks': chunk_texts,
            'entities': chunk_entities,
        },
    }
]
client.feed_iterable(docs, schema="docs", namespace="doc", callback=vap.feed_callback)

In [15]:
# Rank chunks by the entities they mention and return the `default` summary.
# `rank(true, ...)`: per Vespa's rank operator, the first operand decides what
# matches, the remaining operands only contribute to ranking.
# Significance annotation: https://docs.vespa.ai/en/ranking/significance.html
ranked_entities_yql = '''
               select *
               from sources docs
               where
                   {significance:1}
                   rank(
                   true
                   , entities contains "eid1"
                   , entities contains "eid2"
                   , entities contains "eid9"
                   )
               '''
default_summary_body = {
    'yql': ranked_entities_yql,
    # Add `'trace.level': 2` here when the query needs debugging.
    'summary': 'default',
}
client.query(body=default_summary_body).json

{'root': {'id': 'toplevel',
  'relevance': 1.0,
  'fields': {'totalCount': 1},
  'coverage': {'coverage': 100,
   'documents': 1,
   'full': True,
   'nodes': 1,
   'results': 1,
   'resultsFull': 1},
  'children': [{'id': 'id:doc:docs::1',
    'relevance': 3.0,
    'source': 'test_content',
    'fields': {'matchfeatures': {'best_entities': {'type': 'tensor<float>(i{})',
       'cells': {'0': 2.0, '2': 1.0}}},
     'sddocname': 'docs',
     'documentid': 'id:doc:docs::1',
     'chunks': ['chunk_1', 'chunk_3'],
     'entities': ['eid1 eid2', 'eid9'],
     'summaryfeatures': {'best_chunks': {'type': 'tensor<float>(i{})',
       'cells': {'0': 2.0, '2': 1.0}},
      'vespa.summaryFeatures.cached': 0.0}}}]}}

In [16]:
# Same ranking query as before, but rendered with the `tokenization` summary
# so the response shows the exact tokens stored in the index for `entities`.
# Significance annotation: https://docs.vespa.ai/en/ranking/significance.html
indexed_tokens_yql = '''
               select *
               from sources docs
               where
                   {significance:1}
                   rank(
                   true
                   , entities contains "eid1"
                   , entities contains "eid2"
                   , entities contains "eid9"
                   )
               '''
tokenization_body = {
    'yql': indexed_tokens_yql,
    # Add `'trace.level': 2` here when the query needs debugging.
    'summary': 'tokenization',
}
client.query(body=tokenization_body).json

{'root': {'id': 'toplevel',
  'relevance': 1.0,
  'fields': {'totalCount': 1},
  'coverage': {'coverage': 100,
   'documents': 1,
   'full': True,
   'nodes': 1,
   'results': 1,
   'resultsFull': 1},
  'children': [{'id': 'index:test_content/0/c4ca4238f19e7628cfd2bf01',
    'relevance': 3.0,
    'source': 'test_content',
    'fields': {'matchfeatures': {'best_entities': {'type': 'tensor<float>(i{})',
       'cells': {'0': 2.0, '2': 1.0}}},
     'sddocname': 'docs',
     'entities_tokens': [['eid1', 'eid2'], {}, ['eid9']],
     'summaryfeatures': {'best_chunks': {'type': 'tensor<float>(i{})',
       'cells': {'0': 2.0, '2': 1.0}},
      'vespa.summaryFeatures.cached': 0.0}}}]}}

In [17]:
# Here the entity id is a real filter instead of a rank-only term.
# `id=1` is only there to demonstrate that the entity filter can be AND-ed with
# any other condition, e.g. nearestNeighbor matching.
entity_filter_yql = '''
               select *
               from sources docs
               where
                   id=1
                   AND
                   ({significance:1}
                   entities contains "eid9")
               '''
entity_filter_body = {
    'yql': entity_filter_yql,
    'summary': 'default',
}
client.query(body=entity_filter_body).json

{'root': {'id': 'toplevel',
  'relevance': 1.0,
  'fields': {'totalCount': 1},
  'coverage': {'coverage': 100,
   'documents': 1,
   'full': True,
   'nodes': 1,
   'results': 1,
   'resultsFull': 1},
  'children': [{'id': 'id:doc:docs::1',
    'relevance': 1.0,
    'source': 'test_content',
    'fields': {'matchfeatures': {'best_entities': {'type': 'tensor<float>(i{})',
       'cells': {'2': 1.0}}},
     'sddocname': 'docs',
     'documentid': 'id:doc:docs::1',
     'chunks': ['chunk_3'],
     'entities': ['eid9'],
     'summaryfeatures': {'best_chunks': {'type': 'tensor<float>(i{})',
       'cells': {'2': 1.0}},
      'vespa.summaryFeatures.cached': 0.0}}}]}}