In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from time import time
from datasets import load_dataset
from perfcounters import PerfCounters

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
train = load_dataset("rotten_tomatoes", split="train")
test = load_dataset("rotten_tomatoes", split="test")
train_texts = [t['text'] for t in train]
test_texts = [t['text'] for t in test]
num_train_texts = len(train_texts)
num_test_texts = len(test_texts)
num_texts = num_train_texts + num_test_texts
# texts = load_dataset('wiki40b', 'en')
# print(f"num: {len(texts)}")

In [4]:
import os
BATCH_SIZE = 4096
STORE_DATA = False
TFKNN = False
APPROX = True

os.environ['BACKEND'] = 'tf'  # or tf or comment for auto
from unisim import ExactUniSim, ApproxUniSim
if APPROX:

    usim = ApproxUniSim(store_data=STORE_DATA,
                    batch_size=BATCH_SIZE,
                    use_tf_knn=TFKNN)
else:
    usim = ExactUniSim(store_data=STORE_DATA,
                    batch_size=BATCH_SIZE,
                    use_tf_knn=TFKNN)


# usim = ApproxUniSim(store_data=False)
usim.info()


Using tf with cpu
UniSim is not storing a copy of the index data to save memory
If you want to store it use store_data=True
[Embedder]
|-batch_size:4096
[Indexer]
|-is_exact:False
|-use_tf_knn:False
|-store index data:False


In [5]:
usim.text.embed('warm-up')
usim.text.reset_index()
usim.text.batch_index(test_texts[:20])
dups = usim.text.batch_search(test_texts[:20])





Computing partial embeddings: 100%|██████████| 1/1 [00:00<00:00,  4.82embeddings/s]
Computing partial embeddings: 100%|██████████| 20/20 [00:00<00:00, 81.53embeddings/s]
Computing partial embeddings: 100%|██████████| 20/20 [00:00<00:00, 267.07embeddings/s]


In [22]:
usim.text.reset_index()

cnts = PerfCounters()
cnts.start('total')

cnts.start('indexing-train')
usim.text.batch_index(train_texts)
cnts.stop('indexing-train')

cnts.start('indexing-test')
usim.text.batch_index(test_texts)
cnts.stop('indexing-test')

cnts.start('search-train')
train_dups = usim.text.batch_search(train_texts)
cnts.stop('search-train')

cnts.start('search-test')
train_dups = usim.text.batch_search(test_texts)
cnts.stop('search-test')

cnts.stop('total')

Computing partial embeddings:  96%|█████████▌| 8192/8530 [00:53<00:02, 160.51embeddings/s]



Computing partial embeddings: 100%|██████████| 8530/8530 [00:54<00:00, 155.86embeddings/s]
Computing partial embeddings: 100%|██████████| 1066/1066 [00:04<00:00, 239.74embeddings/s]
Computing partial embeddings: 100%|██████████| 8530/8530 [00:50<00:00, 169.27embeddings/s]
Computing partial embeddings: 100%|██████████| 1066/1066 [00:04<00:00, 239.65embeddings/s]


In [32]:
# timings
counters = {}
for c in cnts.get_all()['Timing counters']:
    print(c)
    counters[c[0]] = c[1]

print("timing")
print(f"total: {round(counters['total'])} sec - {round(num_texts / counters['total'], 2)}  ex/s")
print(f"embedding_time: fixme")
print(f"indexing:fixme")

print("Train")
neardup_train_ratio = train_dups.total_global_matches / num_train_texts
neardup_test_ratio = test_dups.total_global_matches / num

# iqps = num_train_texts / cnts.get('indexing')
# sqps = num_test_texts / cnts.get('search')
# print(f'indexing {round(iqps)} qps')
# print(f'search {round(sqps)} qps')

['total', 121.98661589622498]
['indexing-train', 58.14421033859253]
['search-train', 53.8358268737793]
['search-test', 5.1059370040893555]
['indexing-test', 4.900509834289551]
timing
total: 122 sec - 78.66  ex/s
embedding_time: fixme
indexing:fixme


In [9]:
# set True to store data to show the cotent
for d in dups:
    if d.num_global_matches:
        usim.viz.result(d)
        break

Query 0: "lovingly photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness ."
  idx  is_global      global_sim  is_partial      partial_sim  content
-----  -----------  ------------  ------------  -------------  ------------------------------
 8530  True                 1     True                   1     lovingly photographed in the m
 2173  False                0.66  False                  0.66  it's refreshing to see a movie
 2062  False                0.65  False                  0.65  one hour photo may seem disapp
 7069  False                0.65  False                  0.65  succumbs to the same kind of m
 2601  False                0.65  False                  0.65  the sweetest thing , a romanti


In [10]:
wiki_time = (3_000_000 / iqps)
wiki_hour = wiki_time / 3600
print(f"{round(wiki_time)} sec, {round(wiki_hour, 1)}h")

11924 sec, 3.3h


In [10]:
usim.text.reset_index()
g = usim.text.batch_index(train_texts, verbose=1)
print(num_test_texts)

Computing partial embeddings: 100%|██████████| 8530/8530 [00:44<00:00, 189.77embeddings/s]


-=[Timing counters]=-
+-----------+------------+
| name      |      value |
|-----------+------------|
| total     | 47.9073    |
| predict   | 44.9494    |
| binarizer |  2.91629   |
| averaging |  0.0415931 |
+-----------+------------+


-=[Timing counters]=-
+----------------------------+--------------+
| name                       |        value |
|----------------------------+--------------|
| total                      | 48.5263      |
| batch_embed                | 47.9169      |
| batch_index                |  0.603534    |
| flatten_partial_embeddings |  0.00552797  |
| compute_global_idxs        |  0.000332117 |
+----------------------------+--------------+


1066


# > (num_dup - len(dups)) / 2

# indexing

## binarizer
- fused operation  0.31305
- baseline         0.325764  

# searching
## TF compile
-=[Timing counters]=-
+----------+---------+
| name     |   value |
|----------+---------|
| total    | 5.63792 |
| indexing | 3.94475 |
| search   | 1.69313 |
+----------+---------+
indexing 254 qps
search 295 qps  vs 236qps


# not useful 
- store data=False not useful on 1000 example


# baseline

| name     |   value |
|----------+---------|
| total    | 6.36012 |
| indexing | 4.37516 |
| search   | 1.9849  |

indexing 223 qps
search 236 qps
