# 13. Scaling up (optimization, parallelization, and batch processing)

###  13.2.3 Advanced indxing with Annoy

In [1]:
import os
from gensim.models.keyedvectors import KeyedVectors
# from nlpia.data.loaders import BIGDATA_PATH

# not in book, reader required to compose this path
# wordvector_path = os.path.join(BIGDATA_PATH, 'GoogleNews-vectors-negative300.bin.gz')
## https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?resourcekey=0-wjGZdNAUop6WykTtMip30g
wordvector_path = os.path.join("../../bigdata", 'GoogleNews-vectors-negative300.bin.gz')

wv = KeyedVectors.load_word2vec_format(wordvector_path, binary=True, limit=200000)

In [8]:
len(wv.key_to_index), len(wv[next(iter(wv.key_to_index))])

(200000, 300)

In [2]:
wv.vectors.shape

(200000, 300)

In [4]:
!pip install --user annoy

Collecting annoy
  Downloading annoy-1.17.1.tar.gz (647 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m648.0/648.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25ldone
[?25h  Created wheel for annoy: filename=annoy-1.17.1-cp310-cp310-linux_x86_64.whl size=76888 sha256=e67eb26179f1ce8b09f0e5866f9c9a55b9ce0936d1e7515ece6fa0794ad2b57e
  Stored in directory: /home/jovyan/.cache/pip/wheels/1a/8a/8b/ca301ec85de2c145c45b09994765966c7148e54dbbf2b8bfff
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.1


In [2]:
from annoy import AnnoyIndex
num_words, num_dimensions = wv.vectors.shape  # <1>
index = AnnoyIndex(num_dimensions)
index.set_seed(1983)

# <1> The original GoogleNews word2vec model contains 3M word vectors, each with 300 dimensions

  index = AnnoyIndex(num_dimensions)


In [4]:
from tqdm import tqdm

for i, word in enumerate(tqdm(wv.index_to_key)):  # <1> & <2>
    index.add_item(i, wv[word])

# <1> `tqdm()` takes an iterable and returns an iterable (like `enumerate()`) and inserts code in your loop to display a progress bar
# <2> `.index2word` is an unsorted list of all 3M tokens in your vocabulary, equivalent to a map of the integer indexes (0-2999999) to 
#     tokens ('</s>' to 'snowcapped_Caucasus').

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200000/200000 [00:04<00:00, 42120.13it/s]


In [7]:
import numpy as np
num_trees = int(np.log(num_words).round(0))  # <1>
print(num_trees)

index.build(num_trees)  # <2>
index.save('Word2vec_euc_index.ann')  # <3>

# <1> This is just a rule of thumb -- you may want to optimize this hyperparameter if this index isn't performant for the things you care about (RAM, lookup, indexing) or accurate enough for your application.
# <2> round(ln(3000000)) => 15 indexing trees for our 3M vectors -- this takes a few minutes on a laptop
# <3> Saves the index to a local file and frees up RAM

12


True

In [17]:
wv.key_to_index['Harry_Potter']

9494

In [18]:
# wv.vocab['Harry_Potter'].index  # <1>
wv.key_to_index['Harry_Potter']  # <1>
# 9494
# <1> The gensim KeyedVectors.vocab dict contains Vocab objects rather than raw strings or index numbers.

9494

In [20]:
# wv.vocab['Harry_Potter'].count  # <2>
wv.get_vecattr("Harry_Potter", "count")  # <2>
# 2990506
# <2> The gensim Vocab object can tell you the number of times the "Harry_Potter" 2-gram was mentioned in the googleNews corpus... nearly 3M times.

190506

In [22]:
# w2id = dict(zip(
#     wv.vocab, range(len(wv.vocab))))  # <3>
w2id = wv.key_to_index
# <3> Create a map similar to mv,vocab, mapping the tokens to their index values (integer).

In [23]:
w2id['Harry_Potter']
# 9494

9494

In [24]:
ids = index.get_nns_by_item(
    w2id['Harry_Potter'], 11)  # <4>
ids

# <4> Annoy returns the target vector first, so we have to request 11 "neighbours" if we want t10 in addition to the target.

[9494, 39034, 114813, 172698, 59576, 15107, 145465, 15396, 58514, 22364, 22105]

In [29]:
# [wv.vocab[i] for i in _]

In [28]:
# [wv.index2word[i] for i in ids]
[wv.index_to_key[i] for i in ids]

['Harry_Potter',
 'Sherlock_Holmes',
 'Lemony_Snicket',
 'Spiderwick_Chronicles',
 'Superman_Returns',
 'comic_book',
 'Unfortunate_Events',
 'Batman',
 'Goblet',
 'Shrek',
 'Transformers']

In [30]:
[word for word, similarity in wv.most_similar('Harry_Potter', topn=10)]
# ['JK_Rowling_Harry_Potter',
#  'JK_Rowling',
#  'boy_wizard',
#  'Deathly_Hallows',
#  'Half_Blood_Prince',
#  'Rowling',
#  'Actor_Rupert_Grint',
#  'HARRY_Potter',
#  'wizard_Harry_Potter',
#  'HARRY_POTTER']

['JK_Rowling_Harry_Potter',
 'JK_Rowling',
 'boy_wizard',
 'Deathly_Hallows',
 'Half_Blood_Prince',
 'Rowling',
 'Twilight',
 'Twilight_saga',
 'author_JK_Rowling',
 'Narnia']

In [32]:
index_cos = AnnoyIndex(
    f=num_dimensions, metric="angular" # <1>
)
# for i, word in enumerate(wv.index2word):
for i, word in enumerate(wv.index_to_key):
    if not i % 100000:
        print('{}: {}'.format(i, word)) # <2>
    index_cos.add_item(i, wv[word])

# <1> metric='angular' ues the angular (cosine) distance metric to compute your clusters and hashes.
#     Your options are: 'angular', 'euclidean', 'manhattan', or 'hamming'.
# <2> Another way to keep informed of your progress if you don't like tqdm

0: </s>
100000: distinctiveness


In [33]:
index_cos.build(30)  # <1>
index_cos.save('Word2vec_cos_index.ann')

# <1> 30 equals int(np.log(num_vectors).round(0)), double what you had before.

True

In [34]:
idx_cos = index_cos.get_nns_by_item(w2id['Harry_Potter'], 10)
idx_cos

[9494, 193309, 37681, 71557, 40544, 41526, 30024, 78932, 32643, 84628]

In [37]:
# [wv.index2word[i] for i in ids_cos] # <1>
[wv.index_to_key[i] for i in idx_cos] # <1>

# <1> You'll not get the same results. Random projection for LSH is stochastic. Use AnnoyIndex.set_seed() if you need repeatability.

['Harry_Potter',
 'JK_Rowling_Harry_Potter',
 'JK_Rowling',
 'boy_wizard',
 'Deathly_Hallows',
 'Half_Blood_Prince',
 'Rowling',
 'author_JK_Rowling',
 'Narnia',
 'Stephenie_Meyer']

In [41]:
import pandas as pd

gensim_top10 = pd.Series([word for word, similarity in wv.most_similar('Harry_Potter', topn=10)])

annoy_top10_15 = pd.Series([wv.index_to_key[i] for i in ids])

annoy_top10_30 = pd.Series([wv.index_to_key[i] for i in idx_cos])

top10s = pd.concat([gensim_top10, annoy_top10_15, annoy_top10_30], axis=1).head(10)
top10s.columns = ["gensim", "annoy_15trees", "annoy_30trees"]
top10s.set_index("gensim")

Unnamed: 0_level_0,annoy_15trees,annoy_30trees
gensim,Unnamed: 1_level_1,Unnamed: 2_level_1
JK_Rowling_Harry_Potter,Harry_Potter,Harry_Potter
JK_Rowling,Sherlock_Holmes,JK_Rowling_Harry_Potter
boy_wizard,Lemony_Snicket,JK_Rowling
Deathly_Hallows,Spiderwick_Chronicles,boy_wizard
Half_Blood_Prince,Superman_Returns,Deathly_Hallows
Rowling,comic_book,Half_Blood_Prince
Twilight,Unfortunate_Events,Rowling
Twilight_saga,Batman,author_JK_Rowling
author_JK_Rowling,Goblet,Narnia
Narnia,Shrek,Stephenie_Meyer


### 13.2.5 An indexing workaround: discretizing

In [54]:
from sklearn.preprocessing import MinMaxScaler

real_values = [-1.2, 3.4, 5.6, -7.8, 9.0]
print(real_values)
real_values = np.array(real_values)
print(real_values)
real_values = real_values.reshape(-1, 1)
print(real_values)

scaler = MinMaxScaler()  # <1>
scaler.fit(real_values)
[int(x * 100.) for x in scaler.transform(real_values)]  # <2>

# <1> Confine our floats to be between 0.0 and 1.0.
# <2> Scaled, discretized ints, 0-100

[-1.2, 3.4, 5.6, -7.8, 9.0]
[-1.2  3.4  5.6 -7.8  9. ]
[[-1.2]
 [ 3.4]
 [ 5.6]
 [-7.8]
 [ 9. ]]


[39, 66, 79, 0, 100]

### 13.6.1 How to visualize word embeddings

In [55]:
!pip install tensorboard



In [56]:
!tensorboard --logdir=/tmp/

2023-01-03 14:10:35.796549: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-03 14:10:36.850661: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-01-03 14:10:36.850730: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-01-03 14:10:39.598231: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [107]:
import os
import tensorflow as tf
import numpy as np
from io import open
from tensorboard.plugins import projector

tf.compat.v1.disable_eager_execution()

def create_projection(projection_data,
                     projection_name="tensorboard_viz",
                     path="/tmp/"):  # <1>
    meta_file = "{}.tsv".format(projection_name)
    vector_dim = len(projection_data[0][1])
    samples = len(projection_data)
    projection_matrix = np.zeros((samples, vector_dim))

    with open(os.path.join(path, meta_file), "w") as file_metadata:
        for i, row in enumerate(projection_data):  # <2>
            label, vector = row[0], row[1]
            projection_matrix[i] = np.array(vector)
            file_metadata.write("{}\n".format(label))
            
    print(projection_matrix)

    # sess = tf.InteractiveSession()  # <3>
    sess = tf.compat.v1.InteractiveSession()

    embedding = tf.Variable(projection_matrix,
                            trainable=False,
                            name=projection_name)
    # tf.global_variables_initializer().run()
    # tf.compat.v1.global_variables_initializer().run()

    # saver = tf.train.Saver()
    saver = tf.compat.v1.train.Saver()
    # writer = tf.summary.FileWriter(path, sess.graph)  # <4>
    # writer = tf.compat.v1.summary.FileWriter(path, sess.graph)  # <4>
    # writer = tf.summary.create_file_writer(path, sess.graph)  # <4>
    with tf.compat.v1.Graph().as_default():
        writer = tf.compat.v1.summary.FileWriter(path, sess.graph)  # <4>

    config = projector.ProjectorConfig()
    embed = config.embeddings.add()

    embed.tensor_name = "{}".format(projection_name)
    embed.metadata_path = os.path.join(path, meta_file)

    projector.visualize_embeddings(writer, config)  # <5>
    # print(sess)
    saver.save(sess, os.path.join(path, "{}.ckpt".format(projection_name)))    
    sess.close()

    print("Run `tensorboard --logdir={0}` to run\
           visualize result on tensorboard".format(path))

# <1> The create_projection function takes three arguments: the embedding data, a name for the projection and a path, and where to
#     store the projection files.
# <2> The function loops over the embedding data and creates a numpy array, which will then be converted to a Tensorflow variable.
# <3> To create the TensorBoard projection, you need to create a Tensorflow session.
# <4> TensorFlow provides built-in methods to create projections.
# <5> visualize_embeddings writes the projection to your path and is then available for TensorBoard.

In [108]:
projection_name = "NLP_in_Action"
projection_data = [
    ('car', [0.34, -0.72]),
    ('toy', [0.46, 0.39])
]

create_projection(projection_data, projection_name)

[[ 0.34 -0.72]
 [ 0.46  0.39]]


FailedPreconditionError: Graph execution error:

Detected at node 'NLP_in_Action/Read/ReadVariableOp' defined at (most recent call last):
    File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "/opt/conda/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/opt/conda/lib/python3.10/site-packages/traitlets/config/application.py", line 982, in launch_instance
      app.start()
    File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 712, in start
      self.io_loop.start()
    File "/opt/conda/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 600, in run_forever
      self._run_once()
    File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 1896, in _run_once
      handle._run()
    File "/opt/conda/lib/python3.10/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
      await result
    File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "/opt/conda/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 383, in do_execute
      res = shell.run_cell(
    File "/opt/conda/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2940, in run_cell
      result = self._run_cell(
    File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2995, in _run_cell
      return runner(coro)
    File "/opt/conda/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3194, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3373, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3433, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_4370/3755568974.py", line 7, in <module>
      create_projection(projection_data, projection_name)
    File "/tmp/ipykernel_4370/1444917779.py", line 28, in create_projection
      embedding = tf.Variable(projection_matrix,
Node: 'NLP_in_Action/Read/ReadVariableOp'
Could not find variable NLP_in_Action. This could mean that the variable has been deleted. In TF1, it can also mean the variable is uninitialized. Debug info: container=localhost, status error message=Container localhost does not exist. (Could not find resource: localhost/NLP_in_Action)
	 [[{{node NLP_in_Action/Read/ReadVariableOp}}]]

Original stack trace for 'NLP_in_Action/Read/ReadVariableOp':
  File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/opt/conda/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/opt/conda/lib/python3.10/site-packages/traitlets/config/application.py", line 982, in launch_instance
    app.start()
  File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/opt/conda/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 199, in start
    self.asyncio_loop.run_forever()
  File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 600, in run_forever
    self._run_once()
  File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 1896, in _run_once
    handle._run()
  File "/opt/conda/lib/python3.10/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
    await self.process_one()
  File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 499, in process_one
    await dispatch(*args)
  File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
    await result
  File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 730, in execute_request
    reply_content = await reply_content
  File "/opt/conda/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 383, in do_execute
    res = shell.run_cell(
  File "/opt/conda/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 528, in run_cell
    return super().run_cell(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2940, in run_cell
    result = self._run_cell(
  File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2995, in _run_cell
    return runner(coro)
  File "/opt/conda/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
    coro.send(None)
  File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3194, in run_cell_async
    has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3373, in run_ast_nodes
    if await self.run_code(code, result, async_=asy):
  File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3433, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_4370/3755568974.py", line 7, in <module>
    create_projection(projection_data, projection_name)
  File "/tmp/ipykernel_4370/1444917779.py", line 28, in create_projection
    embedding = tf.Variable(projection_matrix,
  File "/opt/conda/lib/python3.10/site-packages/tensorflow/python/util/traceback_utils.py", line 150, in error_handler
    return fn(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/tensorflow/python/ops/variables.py", line 271, in __call__
    return cls._variable_v2_call(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/tensorflow/python/ops/variables.py", line 250, in _variable_v2_call
    return previous_getter(
  File "/opt/conda/lib/python3.10/site-packages/tensorflow/python/ops/variables.py", line 243, in <lambda>
    previous_getter = lambda **kws: default_variable_creator_v2(None, **kws)
  File "/opt/conda/lib/python3.10/site-packages/tensorflow/python/ops/variable_scope.py", line 2758, in default_variable_creator_v2
    return resource_variable_ops.ResourceVariable(
  File "/opt/conda/lib/python3.10/site-packages/tensorflow/python/util/traceback_utils.py", line 150, in error_handler
    return fn(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/tensorflow/python/ops/variables.py", line 273, in __call__
    return super(VariableMetaclass, cls).__call__(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/tensorflow/python/ops/resource_variable_ops.py", line 1721, in __init__
    self._init_from_args(
  File "/opt/conda/lib/python3.10/site-packages/tensorflow/python/ops/resource_variable_ops.py", line 1963, in _init_from_args
    value = gen_resource_variable_ops.read_variable_op(handle, dtype)
  File "/opt/conda/lib/python3.10/site-packages/tensorflow/python/ops/gen_resource_variable_ops.py", line 539, in read_variable_op
    _, _, _op, _outputs = _op_def_library._apply_op_helper(
  File "/opt/conda/lib/python3.10/site-packages/tensorflow/python/framework/op_def_library.py", line 795, in _apply_op_helper
    op = g._create_op_internal(op_type_name, inputs, dtypes=None,
  File "/opt/conda/lib/python3.10/site-packages/tensorflow/python/framework/ops.py", line 3798, in _create_op_internal
    ret = Operation(
