In [4]:
# For analytics, it is typical to count how many times each entity is mentioned across the documents, and also to get those counts broken down by category.

# E.g.
# not only {'entity_1': 4, 'entity_2': 3, ...}
# but also {'person': {'entity_1': 4, ...}, 'location': {'entity_2': 3, ...}}

# Let's try Vespa structs to get the job done.

In [5]:
%load_ext autoreload
%autoreload 2
import mycode.vap as vap

In [12]:
from vespa.package import (ApplicationPackage, Field, Schema, Document, RankProfile, Function, Summary,
                           FirstPhaseRanking, DocumentSummary, Struct, StructField, ValidationID)

# One entity mention: which entity it is and which category it belongs to.
entity_struct = Struct(
    name='entity',
    fields=[
        Field(name="entity_id", type="string"),
        Field(name="category", type="string"),
    ],
)

# Every document carries a list of entity mentions, e.g.
#   [{"entity_id": "entity_1", "category": "person"},
#    {"entity_id": "entity_3", "category": "company"}]
# Both struct members are declared as attributes; the grouping query later in
# this notebook groups on `entities.category` and `entities.entity_id`.
entities_field = Field(
    name='entities',
    type='array<entity>',
    struct_fields=[
        StructField(name="entity_id", indexing=["attribute"]),
        StructField(name="category", indexing=["attribute"]),
    ],
    indexing=['summary'],  # add to default summary
)

# Schema 'docs': an integer id plus the entity mentions defined above.
# No rank profiles or extra document summaries are configured.
doc_schema = Schema(
    name='docs',
    document=Document(
        structs=[entity_struct],
        fields=[
            Field(name='id', type='int', indexing=['attribute']),
            entities_field,
        ],
    ),
    rank_profiles=[],
    document_summaries=[],
)

# Application package 'entities' containing only the 'docs' schema.
application_package = ApplicationPackage(name='entities', schema=[doc_schema])

In [13]:
from vespa.deployment import VespaDocker

# Pinned image tag, so the notebook always deploys against the same Vespa version.
VESPA_IMAGE = "vespaengine/vespa:8.634.24"

vespa_docker = VespaDocker(container_image=VESPA_IMAGE)

# Deploy the application package; as the last expression of the cell,
# the value returned by deploy() is what the notebook displays.
vespa_docker.deploy(application_package=application_package)

Vespa(http://localhost, 8080)

In [17]:
from vespa.application import Vespa
# Client pointed at the locally deployed application (port taken from vespa_docker).
client = Vespa(url="http://localhost", port=vespa_docker.local_port)

# Toy corpus: document id -> (entity_id, category) mentions in that document.
# Keeping the id in exactly one place means the feed-level 'id' and the
# 'id' field inside 'fields' cannot drift apart.
entity_mentions_by_doc = {
    1: [("entity_1", "person"), ("entity_2", "person"), ("entity_3", "company")],
    2: [("entity_1", "person"), ("entity_4", "company")],
}

# Feed operations: the top-level 'id' is the string form of the document id,
# and 'fields' holds the values for the 'docs' schema.
docs = [
    {
        'id': str(doc_id),
        'fields': {
            'id': doc_id,
            'entities': [
                {"entity_id": entity_id, "category": category}
                for entity_id, category in mentions
            ],
        },
    }
    for doc_id, mentions in entity_mentions_by_doc.items()
]

# vap.feed_callback comes from the project-local mycode.vap module (not shown here).
client.feed_iterable(docs, schema="docs", namespace="doc", callback=vap.feed_callback)

In [25]:
# Two-level grouping over the 'docs' source:
#   - `limit 0` asks for no document hits,
#   - the outer group() buckets by entities.category,
#   - inside each category the inner group() buckets by entities.entity_id,
#   - count() is output for every innermost bucket.
grouping_yql = '''
           select * from sources docs where true limit 0 |
           all(
                group(entities.category)
                each(
                    group(entities.entity_id)
                    each(output(count()))
                )
           )
           '''

grouping_response = client.query(body={'yql': grouping_yql})

# Last expression of the cell: show the raw JSON of the response.
grouping_response.json

{'root': {'id': 'toplevel',
  'relevance': 1.0,
  'fields': {'totalCount': 2},
  'coverage': {'coverage': 100,
   'documents': 2,
   'full': True,
   'nodes': 1,
   'results': 1,
   'resultsFull': 1},
  'children': [{'id': 'group:root:0',
    'relevance': 1.0,
    'continuation': {'this': ''},
    'children': [{'id': 'grouplist:entities.category',
      'relevance': 1.0,
      'label': 'entities.category',
      'children': [{'id': 'group:string:company',
        'relevance': 0.0,
        'value': 'company',
        'children': [{'id': 'grouplist:entities.entity_id',
          'relevance': 1.0,
          'label': 'entities.entity_id',
          'children': [{'id': 'group:string:entity_3',
            'relevance': 0.0,
            'value': 'entity_3',
            'fields': {'count()': 1}},
           {'id': 'group:string:entity_4',
            'relevance': 0.0,
            'value': 'entity_4',
            'fields': {'count()': 1}}]}]},
       {'id': 'group:string:person',
        'relev