In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from time import time
from datasets import load_dataset
from perfcounters import PerfCounters

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the train and test splits of the Rotten Tomatoes dataset.
train = load_dataset("rotten_tomatoes", split="train")
test = load_dataset("rotten_tomatoes", split="test")

# Only the raw review strings are needed. Reading the "text" column directly
# avoids materialising a full dict per example just to pick one field;
# list() keeps the result a plain Python list.
train_texts = list(train["text"])
test_texts = list(test["text"])
print(f"train_texts: {len(train_texts)} - test_texts: {len(test_texts)}")

train_texts: 8530 - test_texts: 1066


In [4]:
import os
# Select the UniSim backend through the BACKEND environment variable. It is set
# before the unisim import below, presumably because it is read at import time
# (verify). 'tf' is used here; the alternative value is presumably 'onnx'
# (TODO confirm). Comment the line out for automatic selection.
os.environ['BACKEND'] = 'tf'
from unisim import ExactUniSim, ApproxUniSim
# Exact index that does not keep a copy of the indexed data (store_data=False),
# so result tables cannot show the matched content.
usim = ExactUniSim(store_data=False, use_tf_knn=True)
# Approximate-index alternative:
# usim = ApproxUniSim(store_data=False)
usim.info()


Using tf with cpu
UniSim is not storing a copy of the index data to save memory
If you want to store it use store_data=True
[Embedder]
|-batch_size:128
[Indexer]
|-is_exact:True
|-use_tf_knn:True
|-store index data:False


In [5]:
# Smoke test: embed two short strings, then index the first 20 test reviews.
v = usim.text.batch_embed(['lala', 'lali'])
idx = usim.text.batch_index(test_texts[:20])





ges (20, 256)
bpes (1, 256)

In [6]:
# Single embed call labelled as a warm-up, presumably so one-time setup cost
# does not leak into the timed cells further down (verify).
usim.text.embed('warm-up')
# Reset the index, index the first 20 test reviews again, then query the index
# with those same 20 texts.
usim.text.reset_index()
usim.text.batch_index(test_texts[:20])
dups = usim.text.batch_search(test_texts[:20])

In [7]:
import numpy as np

# Corpus to index: 900 training reviews followed by the first 100 test reviews.
index_texts = np.asanyarray([*train_texts[:900], *test_texts[:100]])

# Queries: the first 500 test reviews, so the first 100 of them also appear
# verbatim in the indexed corpus.
query_texts = np.asanyarray(test_texts[:500])


In [13]:
# Benchmark indexing and search throughput, starting from a reset index.
usim.text.reset_index()

cnts = PerfCounters()
cnts.start('total')

# The two timed phases run back to back inside the 'total' counter:
# first index the corpus ...
cnts.start('indexing')
usim.text.batch_index(index_texts)
cnts.stop('indexing')

# ... then search it with the query set.
cnts.start('search')
dups = usim.text.batch_search(query_texts)
cnts.stop('search')

cnts.stop('total')
cnts.report()

# Throughput = number of texts handled / value recorded by the matching counter.
n_indexed, n_queried = len(index_texts), len(query_texts)
iqps = n_indexed / cnts.get('indexing')
sqps = n_queried / cnts.get('search')
print(f'indexing {round(iqps)} qps')
print(f'search {round(sqps)} qps')

-=[Timing counters]=-
+----------+---------+
| name     |   value |
|----------+---------|
| total    | 5.69777 |
| indexing | 3.80161 |
| search   | 1.89613 |
+----------+---------+


indexing 263 qps
search 264 qps


In [14]:
# Display the first query result that has at least one global match.
# Build the index with store_data=True to have the content column filled in.
first_hit = next((d for d in dups if d.num_global_matches), None)
if first_hit is not None:
    usim.viz.result(first_hit)

Query 0
  idx  is_global      global_sim  is_partial      partial_sim  content
-----  -----------  ------------  ------------  -------------  ---------
  900  True                 1     True                   1
  534  False                0.54  False                  0.54
   86  False                0.54  False                  0.54
   56  False                0.52  False                  0.52
  866  False                0.52  False                  0.52


In [15]:
# Extrapolate the measured indexing rate to a 3M-document corpus
# (presumably Wikipedia-sized, going by the variable names).
WIKI_DOCS = 3_000_000
SECONDS_PER_HOUR = 3600

wiki_time = WIKI_DOCS / iqps
wiki_hour = wiki_time / SECONDS_PER_HOUR
print(f"{round(wiki_time)} sec, {round(wiki_hour, 1)}h")

11405 sec, 3.2h


In [16]:
# Re-run indexing on a reset index with verbose=1, which makes batch_index
# print its per-stage timing counters (embedding vs. indexing breakdown).
usim.text.reset_index()
g = usim.text.batch_index(index_texts, verbose=1)

-=[Timing counters]=-
+--------------------------+------------+
| name                     |      value |
|--------------------------+------------|
| compute_embeddings_total | 3.56328    |
| predict                  | 3.25436    |
| binarizer                | 0.305219   |
| averaging                | 0.00369477 |
+--------------------------+------------+


-=[Timing counters]=-
+----------------------------+-------------+
| name                       |       value |
|----------------------------+-------------|
| total                      | 3.56512     |
| batch_embed                | 3.56437     |
| flatten_partial_embeddings | 0.000588655 |
| batch_index                | 0.000123024 |
| compute_global_idxs        | 3.19481e-05 |
+----------------------------+-------------+




# indexing

## binarizer
- fused operation  0.31305
- baseline         0.325764  

# searching
## TF compile
-=[Timing counters]=-
+----------+---------+
| name     |   value |
|----------+---------|
| total    | 5.63792 |
| indexing | 3.94475 |
| search   | 1.69313 |
+----------+---------+
indexing 254 qps
search 295 qps (vs 236 qps for the baseline)


# not useful
- `store_data=False` makes no useful difference on 1000 examples


# baseline

| name     |   value |
|----------+---------|
| total    | 6.36012 |
| indexing | 4.37516 |
| search   | 1.9849  |

indexing 223 qps
search 236 qps
