In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [6]:
from time import time
from datasets import load_dataset
from perfcounters import PerfCounters

In [7]:
# Load the train and test splits of the rotten_tomatoes dataset.
train, test = (
    load_dataset("rotten_tomatoes", split=split_name)
    for split_name in ("train", "test")
)

# Keep only the raw text of every example; the other fields are not used here.
train_texts = [example['text'] for example in train]
test_texts = [example['text'] for example in test]

# Corpus sizes, used later to turn elapsed times into throughput figures.
num_train_texts, num_test_texts = len(train_texts), len(test_texts)
num_texts = num_train_texts + num_test_texts

In [8]:
import os

# --- Benchmark configuration -------------------------------------------------
BATCH_SIZE = 256
# Whether the index keeps a copy of the indexed texts. This flag used to be
# defined but never passed to TextSim, so the library default (storing the
# data) was always in effect. It is now wired through and set to True to keep
# that behaviour; the visualization cell relies on stored data to show the
# content column. Set to False for large corpora.
STORE_DATA = True
# True -> "approx" index type, False -> "exact" index type.
APPROX = True

# The environment is configured *before* unisim is imported, which is why the
# import sits here rather than in the import cell.
# NOTE(review): presumably unisim reads BACKEND at import time -- confirm.
os.environ['BACKEND'] = 'onnx'  # or tf or comment for auto
# "-1" hides CUDA devices from the process so the run stays on CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = "-1"
from unisim import TextSim

# A single constructor call; only the index type depends on APPROX.
tsim = TextSim(
    index_type="approx" if APPROX else "exact",
    batch_size=BATCH_SIZE,
    store_data=STORE_DATA,
)

# Print the embedder / indexer configuration actually in use.
tsim.info()

UniSim is storing a copy of the indexed data
If you are using large data corpus, consider disabling this behavior using store_data=False
Accelerator is not available, using cpu instead
[Embedder]
|-batch_size: 256
|-model_id: text/retsim/v1
|-embedding_size: 256
[Indexer]
|-index_type: approx
|-use_accelerator: False
|-store index data: True


In [9]:
# Warm-up: run one embed call plus a small index/search round trip before the
# timed benchmark. The 20 indexed texts are removed again by the
# reset_index() call at the top of the benchmark cell.
tsim.embed(['warm-up'])
tsim.reset_index()

warmup_texts = test_texts[:20]
tsim.index(warmup_texts)
dups = tsim.search(warmup_texts)
print(dups)

-[Results Statistics]-
Number of Results: 20
Total Global Matches: 20
Total Partial Matches: 20


In [10]:
# Start from an empty index so that texts indexed by earlier cells do not leak
# into the measurements.
tsim.reset_index()

# Splits are processed in this order: train first, then test.
benchmark_splits = (("train", train_texts), ("test", test_texts))

cnts = PerfCounters()
cnts.start('total')

# Phase 1: index both splits, timing each one under "indexing-<split>".
for split_name, split_texts in benchmark_splits:
    cnts.start(f'indexing-{split_name}')
    tsim.index(split_texts)
    cnts.stop(f'indexing-{split_name}')

# Phase 2: search every text of both splits against the combined index,
# timing each one under "search-<split>".
search_results = {}
for split_name, split_texts in benchmark_splits:
    cnts.start(f'search-{split_name}')
    search_results[split_name] = tsim.search(split_texts)
    cnts.stop(f'search-{split_name}')

# Expose the results under the names the following cells use.
train_dups = search_results["train"]
test_dups = search_results["test"]

cnts.stop('total')

In [11]:
# Print the table of timing counters recorded by the benchmark cell.
cnts.report()

-=[Timing counters]=-
+----------------+----------+
| name           |    value |
|----------------+----------|
| total          | 39.5803  |
| indexing-train | 18.0286  |
| search-train   | 17.3134  |
| search-test    |  2.12564 |
| indexing-test  |  2.11229 |
+----------------+----------+




In [7]:
# timings
# Collect the timing counters into a {name: elapsed seconds} mapping, echoing
# each raw [name, value] entry as it is read.
timing_rows = cnts.get_all()['Timing counters']
for timing_row in timing_rows:
    print(timing_row)
counters = {timing_row[0]: timing_row[1] for timing_row in timing_rows}

print("timing")
print(f"total: {round(counters['total'])} sec - {round(num_texts / counters['total'], 2)}  ex/s")
# TODO: embedding and indexing time are not measured separately yet.
print("embedding_time: fixme")
print("indexing:fixme")

print("Train")
# Global matches found for the train queries, per train text. The ratio was
# previously computed but never reported.
neardup_train_ratio = train_dups.total_global_matches / num_train_texts
print(f"near-duplicate ratio: {round(neardup_train_ratio, 4)}")

# Throughput figures. Note the different splits: indexing speed is measured on
# the train split, search speed on the test split.
iqps = num_train_texts / cnts.get('indexing-train')
sqps = num_test_texts / cnts.get('search-test')
print(f'indexing {round(iqps)} qps')
print(f'search {round(sqps)} qps')

['total', 40.264972448349]
['indexing-train', 18.601977348327637]
['search-train', 17.315526962280273]
['search-test', 2.1855432987213135]
['indexing-test', 2.1615777015686035]
timing
total: 40 sec - 238.32  ex/s
embedding_time: fixme
indexing:fixme
Train
indexing 459 qps
search 488 qps


In [8]:
# Spot check: similarity between one hand-picked train text and a fixed review
# sentence. t1 and t2 are reused by the direct-model comparison further down.
t2 = "lovingly photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness ."
t1 = train_texts[2062]

pair_similarity = tsim.similarity(t1, t2)
pair_similarity

0.5280528664588928

In [23]:
# import numpy as np

# [0.1, 0.3, 0.4,0.5]/np.linalg.norm([0.1, 0.3, 0.4,0.5],axis=0, keepdims=True)

In [24]:
# import tensorflow as tf
# tf.math.l2_normalize([0.1, 0.3, 0.4,0.5], axis=0)

In [9]:
import tensorflow as tf

# Cross-check: embed the same two texts directly with the saved Keras model
# and compare against the value returned by tsim.similarity().
# NOTE(review): hard-coded absolute path -- only works on a machine with this layout.
retsim_model_path = "/workspace/RetSim/models/retsim_infinity/retsim_infinite_model_v1"
model = tf.keras.models.load_model(retsim_model_path)

# One single-element batch per text, predicted in the order t1 then t2.
e1, e2 = (model.predict(tf.constant([text])) for text in (t1, t2))

# Dot product of the two "global_embedding" outputs (result has shape (1, 1)).
tf.matmul(e1["global_embedding"], e2["global_embedding"], transpose_b=True)





<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.52804935]], dtype=float32)>

In [10]:
# Visualize the first test query that has at least one global match.
# Data storing must be enabled (set to True) for the content to be shown.
first_match = next(
    (result for result in test_dups.results if result.num_global_matches),
    None,
)
if first_match is not None:
    tsim.visualize(first_match)

Query 0: "lovingly photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness ."
  idx  is_global    global_sim    is_partial    partial_sim    content
-----  -----------  ------------  ------------  -------------  --------------------------------
 8530  True         1.0           True          1.0            lovingly photographed in the man
 8979  False        0.57          False         0.57           waydowntown manages to nail the
 8960  False        0.56          False         0.56           manages to delight without much
 8209  False        0.54          False                        unfortunately , one hour photo l
 3510  False        0.54          False                        if divine secrets of the ya-ya s
  909  False                      False         0.56           culkin , who's in virtually ever
 3951  False                      False         0.55           baran is shockingly devoid of yo


In [11]:
# Back-of-the-envelope extrapolation: time to index 3,000,000 texts at the
# indexing throughput (iqps) measured above.
# NOTE(review): the 3M figure is presumably a Wikipedia-scale corpus size -- confirm.
corpus_size = 3_000_000
seconds_per_hour = 3600

wiki_time = corpus_size / iqps
wiki_hour = wiki_time / seconds_per_hour
print(f"{round(wiki_time)} sec, {round(wiki_hour, 1)}h")

6542 sec, 1.8h
