In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from time import time
from datasets import load_dataset
from perfcounters import PerfCounters

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Fetch the train and test splits of the Rotten Tomatoes dataset and keep
# the raw review strings of each split in plain Python lists.
DATASET_NAME = "rotten_tomatoes"
train = load_dataset(DATASET_NAME, split="train")
test = load_dataset(DATASET_NAME, split="test")

train_texts = [example['text'] for example in train]
test_texts = [example['text'] for example in test]

# Alternative corpus, currently disabled:
# texts = load_dataset('wiki40b', 'en')
# print(f"num: {len(texts)}")

In [4]:
import os
# Select the UniSim backend through the BACKEND environment variable. It is
# deliberately set before the `unisim` import below.
# NOTE(review): presumably unisim reads BACKEND at import time -- confirm, and
# keep this ordering until then.
os.environ['BACKEND'] = 'tf'  # 'tf' = TensorFlow; comment this line out for auto-selection (other value presumably 'onnx' -- TODO confirm)
from unisim import ExactUniSim, ApproxUniSim
# Exact-search instance. store_data=True keeps a copy of the indexed texts
# (the library warns about this for large corpora); batch_size and use_tf_knn
# are echoed back under [Embedder] / [Indexer] by usim.info().
usim = ExactUniSim(store_data=True,
                   batch_size=1024,
                   use_tf_knn=True)
# Approximate-search alternative, currently disabled:
# usim = ApproxUniSim(store_data=False)
# Print the active backend and the embedder/indexer configuration.
usim.info()


Using tf with cpu
UniSim is storing a copy of the indexed data
if you are using large data corpus consider disable this behavior using store_data=False
[Embedder]
|-batch_size:1024
[Indexer]
|-is_exact:True
|-use_tf_knn:True
|-store index data:True


In [5]:
# Smoke test: embed two toy strings, then index the first 20 test reviews.
toy_strings = ['lala', 'lali']
first_reviews = test_texts[:20]

v = usim.text.batch_embed(toy_strings)
idx = usim.text.batch_index(first_reviews)





Computing partial embeddings: 100%|██████████| 2/2 [00:00<00:00, 10.80embeddings/s]
Computing partial embeddings: 100%|██████████| 20/20 [00:00<00:00, 110.75embeddings/s]


ges (20, 256)
bpes (1, 256)

In [6]:
# Warm-up pass before the timed benchmark: one single-string embed, then a
# small index + search round trip on a freshly reset index.
# NOTE(review): presumably this absorbs one-off start-up cost -- confirm.
warmup_texts = test_texts[:20]

usim.text.embed('warm-up')
usim.text.reset_index()
usim.text.batch_index(warmup_texts)
dups = usim.text.batch_search(warmup_texts)

Computing partial embeddings: 100%|██████████| 1/1 [00:00<00:00,  9.11embeddings/s]
Computing partial embeddings: 100%|██████████| 20/20 [00:00<00:00, 290.50embeddings/s]
Computing partial embeddings: 100%|██████████| 20/20 [00:00<00:00, 283.10embeddings/s]


In [7]:
import numpy as np
# Benchmark corpus sizes. The first N_TEST_IN_INDEX test reviews are added to
# the index, so that many of the N_QUERIES queries have an exact duplicate
# in the index; the remaining queries do not come from the indexed texts.
N_TEST_IN_INDEX = 100
N_QUERIES = 500

index_texts = train_texts + test_texts[:N_TEST_IN_INDEX]
query_texts = test_texts[:N_QUERIES]


In [8]:
# Timed benchmark: index `index_texts`, then search `query_texts`, recording
# wall-clock time per phase with PerfCounters.
# Start from an empty index so earlier cells do not affect the measurement.
usim.text.reset_index()

cnts = PerfCounters()
# 'total' brackets both phases below.
cnts.start('total')

# Phase 1: embed and index the whole corpus.
cnts.start('indexing')
usim.text.batch_index(index_texts)
cnts.stop('indexing')

# Phase 2: look up every query; `dups` holds one result per query and is
# inspected in a later cell.
cnts.start('search')
dups = usim.text.batch_search(query_texts)
cnts.stop('search')
cnts.stop('total')


# Print the timing table, then derive throughput as items per second
# (counter values are elapsed seconds, per the reported table).
cnts.report()
iqps = len(index_texts) / cnts.get('indexing')
sqps = len(query_texts) / cnts.get('search')
print(f'indexing {round(iqps)} qps')
print(f'search {round(sqps)} qps')

Computing partial embeddings: 100%|██████████| 8630/8630 [00:31<00:00, 273.82embeddings/s]
Computing partial embeddings: 100%|██████████| 500/500 [00:01<00:00, 266.21embeddings/s]

-=[Timing counters]=-
+----------+----------+
| name     |    value |
|----------+----------|
| total    | 36.3945  |
| indexing | 34.3018  |
| search   |  2.09272 |
+----------+----------+


indexing 252 qps
search 239 qps





In [9]:
# Display the first query result that has at least one global match.
# The index must be built with store_data=True for the content to be shown.
for result in dups:
    if not result.num_global_matches:
        continue
    usim.viz.result(result)
    break

Query 0: "lovingly photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness ."
  idx  is_global      global_sim  is_partial      partial_sim  content
-----  -----------  ------------  ------------  -------------  ------------------------------
 8530  True                 1     True                   1     lovingly photographed in the m
 2173  False                0.66  False                  0.66  it's refreshing to see a movie
 2062  False                0.65  False                  0.65  one hour photo may seem disapp
 7069  False                0.65  False                  0.65  succumbs to the same kind of m
 2601  False                0.65  False                  0.65  the sweetest thing , a romanti


In [10]:
# Extrapolate the measured indexing throughput (iqps, items per second) to a
# 3,000,000-document corpus and report the estimate in seconds and hours.
# NOTE(review): 3_000_000 is presumably the target wiki corpus size -- confirm.
WIKI_NUM_DOCS = 3_000_000
SECONDS_PER_HOUR = 3600

wiki_time = WIKI_NUM_DOCS / iqps
wiki_hour = wiki_time / SECONDS_PER_HOUR
print(f"{round(wiki_time)} sec, {round(wiki_hour, 1)}h")

11924 sec, 3.3h


In [11]:
# Re-index only the query texts on an empty index with verbose=1, which
# prints the per-stage timing tables for embedding and indexing.
text_sim = usim.text
text_sim.reset_index()
g = text_sim.batch_index(query_texts, verbose=1)

Computing partial embeddings: 100%|██████████| 500/500 [00:01<00:00, 284.80embeddings/s]

-=[Timing counters]=-
+-----------+------------+
| name      |      value |
|-----------+------------|
| total     | 1.91406    |
| predict   | 1.75639    |
| binarizer | 0.154915   |
| averaging | 0.00274587 |
+-----------+------------+


-=[Timing counters]=-
+----------------------------+-------------+
| name                       |       value |
|----------------------------+-------------|
| total                      | 1.91514     |
| batch_embed                | 1.91473     |
| flatten_partial_embeddings | 0.000333071 |
| batch_index                | 5.72205e-05 |
| compute_global_idxs        | 1.4782e-05  |
| store_data                 | 2.14577e-06 |
+----------------------------+-------------+







# indexing

## binarizer
- fused operation  0.31305
- baseline         0.325764  

# searching
## TF compile
-=[Timing counters]=-
+----------+---------+
| name     |   value |
|----------+---------|
| total    | 5.63792 |
| indexing | 3.94475 |
| search   | 1.69313 |
+----------+---------+
indexing 254 qps
search 295 qps (vs. 236 qps baseline)


# not useful 
- `store_data=False` is not useful on 1000 examples


# baseline

| name     |   value |
|----------+---------|
| total    | 6.36012 |
| indexing | 4.37516 |
| search   | 1.9849  |

indexing 223 qps
search 236 qps
