In [1]:
# Set TOKENIZERS_PARALLELISM=false in the kernel's environment. This runs in
# the first cell, i.e. before any of the imports below are executed.
# NOTE(review): presumably this is read by the tokenizer backend used by the
# embedding model to turn off its parallelism (and the related warning) —
# confirm against the library docs.
# Keep this comment on its own line: trailing text on a %env line would become
# part of the variable's value.
%env TOKENIZERS_PARALLELISM=false

env: TOKENIZERS_PARALLELISM=false


In [2]:
import numpy
import tenseal

from datasets import load_dataset

from search.embedding import Model
from search.index import Index
from search.client import Client

In [3]:
# --- Corpus -----------------------------------------------------------------
# Take the first 100 rows of the IMDB "train" split and keep only the "text"
# column; this is the corpus handed to the index.
first_rows = load_dataset("imdb")["train"][:100]
dataset = first_rows["text"]

# --- Embedding model and index ----------------------------------------------
# The same Model instance is passed to both the Index and the Client.
model = Model(id="paraphrase-MiniLM-L6-v2")
index = Index(model=model, corpus=dataset)

# --- TenSEAL CKKS context ---------------------------------------------------
# Scheme parameters are collected in one place so they are easy to spot/tune.
ckks_params = dict(
    poly_modulus_degree=8192,
    coeff_mod_bit_sizes=[60, 40, 40, 60],
)
context = tenseal.context(tenseal.SCHEME_TYPE.CKKS, **ckks_params)
context.generate_galois_keys()
context.global_scale = 2 ** 40

# --- Client -----------------------------------------------------------------
# The client receives the model, the index's centroids and the CKKS context.
client = Client(
    model=model,
    centroids=index.centroids,
    context=context,
)

In [4]:
# Use the first corpus entry as the query text.
text = dataset[0]

# Round trip: the client builds the query, the index answers it, and the
# client decrypts the index's answer.
query = client.query(text)
search_response = index.search(query)
result = client.decrypt(search_response)

In [5]:
# Rank using the decrypted search result and the original query text; as the
# last expression in the cell, the returned value is displayed as the output.
# NOTE(review): in the saved output, entries 7-11 (about -2.4e3 and 2.0e4) are
# orders of magnitude away from the other values — confirm this is expected.
client.rank(result=result, text=text)

array([ 1.24675271e+01,  4.55197050e+00,  3.05972705e+00,  5.05467643e+00,
        4.17172464e+00,  4.29356623e+00, -2.42126205e+03, -2.42126193e+03,
       -2.42126210e+03, -2.42126197e+03,  1.97270662e+04, -9.62603934e-03,
        1.36890088e-02,  5.48082256e-03, -1.67164575e-03,  9.71143563e-03,
        1.80828172e-01,  3.73622375e-04,  9.06484401e-01,  2.11591527e-01,
        1.13190748e+00])