In [1]:
import weaviate
from weaviate.classes.config import Property, DataType, Tokenization, Configure
from weaviate.classes.query import MetadataQuery, Filter
import os

In [2]:
# Open a client connection via weaviate.connect_to_local(), using that helper's defaults.
# NOTE(review): no client.close() is visible in this section — confirm the
# connection is closed later in the notebook.
client = weaviate.connect_to_local()

In [3]:
# Tokenization methods to compare side by side.
tkn_options = [
    Tokenization.WORD,
    Tokenization.LOWERCASE,
    Tokenization.WHITESPACE,
    Tokenization.FIELD,
]

# One TEXT property per tokenization method, named "text_<method>".
# The name is built from `.value` rather than from the enum member itself:
# since Python 3.11, f-string formatting of an Enum member follows its
# __str__() (which prints as "Tokenization.WORD" below), so formatting the
# member directly would produce names like "text_Tokenization.WORD" there.
# `.value` gives the plain string on every Python version.
properties = [
    Property(
        name=f"text_{tokenization.value}",
        data_type=DataType.TEXT,
        tokenization=tokenization,
    )
    for tokenization in tkn_options
]

# Show which tokenization each generated property uses.
for p in properties:
    print(p.name, p.tokenization)

text_word Tokenization.WORD
text_lowercase Tokenization.LOWERCASE
text_whitespace Tokenization.WHITESPACE
text_field Tokenization.FIELD


In [4]:
# Collect the property names once; later cells use them as object keys and
# as filter targets. The bare expression on the last line displays the list.
property_names = [prop.name for prop in properties]
property_names

['text_word', 'text_lowercase', 'text_whitespace', 'text_field']

In [5]:
# Delete any collection already named "TokenExample" before creating it again
# (presumably so this cell can be re-run — confirm that delete does not raise
# when the collection is absent).
client.collections.delete("TokenExample")

# Create the collection from the `properties` list (one property per
# tokenization option) with the text2vec_transformers vectorizer configuration.
collection = client.collections.create(
    name="TokenExample",
    properties=properties,
    vectorizer_config=Configure.Vectorizer.text2vec_transformers()
)

In [6]:
# Insert each sample phrase as one object, storing the identical text under
# every property name so the properties differ only in how they are configured.
for phrase in (
    "Lois & Clark: The New Adventures of Superman",
    "Beyoncé - Single Ladies (Put a Ring on It)",
    "15-30",
    "30-15",
):
    # Map every property name to the same phrase (insertion order preserved).
    obj_properties = dict.fromkeys(property_names, phrase)
    print(obj_properties)
    collection.data.insert(properties=obj_properties)

{'text_word': 'Lois & Clark: The New Adventures of Superman', 'text_lowercase': 'Lois & Clark: The New Adventures of Superman', 'text_whitespace': 'Lois & Clark: The New Adventures of Superman', 'text_field': 'Lois & Clark: The New Adventures of Superman'}
{'text_word': 'Beyoncé - Single Ladies (Put a Ring on It)', 'text_lowercase': 'Beyoncé - Single Ladies (Put a Ring on It)', 'text_whitespace': 'Beyoncé - Single Ladies (Put a Ring on It)', 'text_field': 'Beyoncé - Single Ladies (Put a Ring on It)'}
{'text_word': '15-30', 'text_lowercase': '15-30', 'text_whitespace': '15-30', 'text_field': '15-30'}
{'text_word': '30-15', 'text_lowercase': '30-15', 'text_whitespace': '30-15', 'text_field': '30-15'}


In [7]:
# Search strings to try against each property.
query_terms = [
    "Lois",
    "Lois & Clark",
    "lois, clark - new",
    "ladies put",
    "Ladies (Put",
    "15-30",
    "15",
]

for query_term in query_terms:
    print(f"\nHits for: '{query_term}'")
    for name in property_names:
        # Run the same `like` filter against one property at a time, so the
        # printed hits show which properties matched the search string.
        response = collection.query.fetch_objects(
            filters=Filter.by_property(name).like(query_term),
            limit=5,
        )
        # An empty result list prints nothing for this property.
        for obj in response.objects:
            print(f"'{obj.properties[name]}' found in {name}")


Hits for: 'Lois'
'Lois & Clark: The New Adventures of Superman' found in text_word
'Lois & Clark: The New Adventures of Superman' found in text_lowercase
'Lois & Clark: The New Adventures of Superman' found in text_whitespace

Hits for: 'Lois & Clark'
'Lois & Clark: The New Adventures of Superman' found in text_word

Hits for: 'lois, clark - new'
'Lois & Clark: The New Adventures of Superman' found in text_word

Hits for: 'ladies put'
'Beyoncé - Single Ladies (Put a Ring on It)' found in text_word

Hits for: 'Ladies (Put'
'Beyoncé - Single Ladies (Put a Ring on It)' found in text_word
'Beyoncé - Single Ladies (Put a Ring on It)' found in text_lowercase
'Beyoncé - Single Ladies (Put a Ring on It)' found in text_whitespace

Hits for: '15-30'
'15-30' found in text_word
'30-15' found in text_word
'15-30' found in text_lowercase
'15-30' found in text_whitespace
'15-30' found in text_field

Hits for: '15'
'15-30' found in text_word
'30-15' found in text_word
