
Commit a5a4221

document indexing unit tests and modifications
ddangelov committed Dec 7, 2020
1 parent b1d6338 commit a5a4221
Showing 6 changed files with 165 additions and 34 deletions.
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -29,7 +29,7 @@
author = 'Dimo Angelov'

# The full version, including alpha/beta/rc tags
- release = '1.0.16'
+ release = '1.0.17'


# -- General configuration ---------------------------------------------------
3 changes: 2 additions & 1 deletion requirements.txt
@@ -9,4 +9,5 @@ tensorflow
tensorflow_hub
tensorflow_text
torch
- sentence_transformers
+ sentence_transformers
+ hnswlib
5 changes: 4 additions & 1 deletion setup.py
@@ -6,7 +6,7 @@
setuptools.setup(
name="top2vec",
packages=["top2vec"],
version="1.0.16",
version="1.0.17",
author="Dimo Angelov",
author_email="dimo.angelov@gmail.com",
description="Top2Vec learns jointly embedded topic, document and word vectors.",
@@ -44,6 +44,9 @@
'torch',
'sentence_transformers',
],
'indexing': [
'hnswlib',
],
},
python_requires='>=3.6',
)
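
Because hnswlib lives under extras_require rather than install_requires, index support is opt-in: a plain install keeps the existing brute-force search path, while pip install top2vec[indexing] additionally pulls in hnswlib for the index-backed path.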
60 changes: 47 additions & 13 deletions top2vec/Top2Vec.py
@@ -101,6 +101,7 @@ class Top2Vec:
default. However they can also be loaded from a file that is in the
location of embedding_model_path.
Warning: the model at embedding_model_path must match the
embedding_model parameter type.
@@ -183,6 +184,8 @@ def __init__(self,
self._tokenizer = default_tokenizer

# validate documents
if not (isinstance(documents, list) or isinstance(documents, np.ndarray)):
raise ValueError("Documents need to be a list of strings")
if not all((isinstance(doc, str) or isinstance(doc, np.str_)) for doc in documents):
raise ValueError("Documents need to be a list of strings")
if keep_documents:
@@ -192,6 +195,9 @@ def __init__(self,

# validate document ids
if document_ids is not None:
if not (isinstance(document_ids, list) or isinstance(document_ids, np.ndarray)):
raise ValueError("Documents ids need to be a list of str or int")

if len(documents) != len(document_ids):
raise ValueError("Document ids need to match number of documents")
elif len(document_ids) != len(set(document_ids)):
@@ -366,6 +372,8 @@ def __init__(self,
self.document_index = None
self.serialized_document_index = None
self.documents_indexed = False
self.index_id2doc_id = None
self.doc_id2index_id = None

def save(self, file):
"""
@@ -863,7 +871,7 @@ def _validate_vector(self, vector):
if not vector.shape[0] == vec_size:
raise ValueError(f"Vector needs to be of {vec_size} dimensions.")

- def index_documents_vectors(self, ef_construction=200, M=64):
+ def index_document_vectors(self, ef_construction=200, M=64):
"""
Creates an index of the document vectors using hnswlib. This will
lead to faster search times for models with a large number of
@@ -893,9 +901,14 @@ def index_documents_vectors(self, ef_construction=200, M=64):
vec_dim = self._get_document_vectors().shape[1]
num_vecs = self._get_document_vectors().shape[0]

index_ids = list(range(0, len(self.document_ids)))

self.index_id2doc_id = dict(zip(index_ids, self.document_ids))
self.doc_id2index_id = dict(zip(self.document_ids, index_ids))

self.document_index = hnswlib.Index(space='ip', dim=vec_dim)
self.document_index.init_index(max_elements=num_vecs, ef_construction=ef_construction, M=M)
- self.document_index.add_items(self._get_document_vectors(), self.document_ids)
+ self.document_index.add_items(self._get_document_vectors(), index_ids)
self.documents_indexed = True
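
A minimal usage sketch for the new index path, assuming a trained model (the docs variable and sample vector are illustrative; index_document_vectors and the use_index flag come from this commit):

from top2vec import Top2Vec

model = Top2Vec(documents=docs)  # docs: a list of strings (hypothetical)
model.index_document_vectors(ef_construction=200, M=64)  # defaults shown above

# subsequent searches can route through the hnswlib index instead of brute force
documents, document_scores, document_ids = model.search_documents_by_vector(
    vector=model._get_document_vectors()[0], num_docs=10, use_index=True)

Higher ef_construction and M buy recall at the cost of build time and memory; see the hnswlib ALGO_PARAMS guide linked in the docstrings below.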

def update_embedding_model_path(self, embedding_model_path):
@@ -1060,10 +1073,17 @@ def add_documents(self, documents, doc_ids=None):

# update index
if self.documents_indexed:
# update capacity of index
current_max = self.document_index.get_max_elements()
updated_max = current_max + len(documents)
self.document_index.resize_index(updated_max)
- self.document_index.add_items(document_vectors, doc_ids)
+ # update index_id and doc_ids
+ start_index_id = max(self.index_id2doc_id.keys()) + 1
+ new_index_ids = list(range(start_index_id, start_index_id + len(doc_ids)))
+ self.index_id2doc_id.update(dict(zip(new_index_ids, doc_ids)))
+ self.doc_id2index_id.update(dict(zip(doc_ids, new_index_ids)))
+ self.document_index.add_items(document_vectors, new_index_ids)

# update topics
self._assign_documents_to_topic(document_vectors, hierarchy=False)
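
The id bookkeeping above exists because hnswlib labels items with integers while Top2Vec document ids may be strings. A toy, self-contained illustration of the mapping contract (values hypothetical):

index_id2doc_id = {0: "doc-a", 1: "doc-b"}
doc_id2index_id = {"doc-a": 0, "doc-b": 1}

# new documents continue the dense integer label sequence
start_index_id = max(index_id2doc_id.keys()) + 1  # 2
new_doc_ids = ["doc-c", "doc-d"]
new_index_ids = list(range(start_index_id, start_index_id + len(new_doc_ids)))
index_id2doc_id.update(dict(zip(new_index_ids, new_doc_ids)))
doc_id2index_id.update(dict(zip(new_doc_ids, new_index_ids)))

assert index_id2doc_id[doc_id2index_id["doc-c"]] == "doc-c"  # ids round-trip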
@@ -1096,8 +1116,15 @@ def delete_documents(self, doc_ids):

# update index
if self.documents_indexed:
+ # delete doc_ids from index
+ index_ids = [self.doc_id2index_id[doc_id] for doc_id in doc_ids]
+ for index_id in index_ids:
+ self.document_index.mark_deleted(index_id)
+ # update index_id and doc_ids
for doc_id in doc_ids:
- self.document_index.mark_deleted(doc_id)
+ self.doc_id2index_id.pop(doc_id)
+ for index_id in index_ids:
+ self.index_id2doc_id.pop(index_id)

# get document indexes from ids
doc_indexes = self._get_document_indexes(doc_ids)
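
Note that hnswlib's mark_deleted only hides an element from future queries; the slot is not reclaimed. Pruning doc_id2index_id and index_id2doc_id in lockstep, as above, keeps the mappings consistent with what the index will actually return.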
@@ -1420,9 +1447,11 @@ def search_documents_by_vector(self, vector, num_docs, return_documents=True, use_index=False, ef=None):
ef: int (Optional default None)
Higher ef leads to more accurate but slower search. This value
- must be higher than num_docs. For more information see:
- https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
+ must be higher than num_docs.
+
+ For more information see:
+ https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
Returns
-------
@@ -1449,10 +1478,11 @@ def search_documents_by_vector(self, vector, num_docs, return_documents=True, use_index=False, ef=None):
if ef is not None:
self.document_index.set_ef(ef)

- doc_ids, doc_scores = self.document_index.knn_query(vector, k=num_docs)
- doc_ids = doc_ids[0]
+ index_ids, doc_scores = self.document_index.knn_query(vector, k=num_docs)
+ index_ids = index_ids[0]
+ doc_ids = np.array([self.index_id2doc_id[index_id] for index_id in index_ids])
doc_scores = doc_scores[0]
- doc_scores = np.array([1-score for score in doc_scores])
+ doc_scores = np.array([1 - score for score in doc_scores])
doc_indexes = self._get_document_indexes(doc_ids)
else:
doc_indexes, doc_scores = self._search_vectors_by_vector(self._get_document_vectors(),
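
For context on the 1 - score conversion above: with space='ip', hnswlib reports distances of one minus the inner product, so subtracting from one recovers the similarity. A self-contained sketch with synthetic data (not from the repo):

import hnswlib
import numpy as np

dim, n = 64, 1000
vectors = np.random.rand(n, dim).astype(np.float32)
vectors /= np.linalg.norm(vectors, axis=1, keepdims=True)  # unit norm: ip == cosine

index = hnswlib.Index(space='ip', dim=dim)
index.init_index(max_elements=n, ef_construction=200, M=64)
index.add_items(vectors, np.arange(n))

index.set_ef(50)  # must exceed k, mirroring the ef notes in the docstrings
labels, distances = index.knn_query(vectors[:1], k=10)
similarities = 1 - distances[0]  # undo hnswlib's 1 - inner_product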
@@ -1570,9 +1600,11 @@ def search_documents_by_keywords(self, keywords, num_docs, keywords_neg=None, return_documents=True, use_index=False, ef=None):
ef: int (Optional default None)
Higher ef leads to more accurate but slower search. This value
- must be higher than num_docs. For more information see:
- https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
+ must be higher than num_docs.
+
+ For more information see:
+ https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
Returns
-------
@@ -1806,9 +1838,11 @@ def search_documents_by_documents(self, doc_ids, num_docs, doc_ids_neg=None, return_documents=True, use_index=False, ef=None):
ef: int (Optional default None)
Higher ef leads to more accurate but slower search. This value
- must be higher than num_docs. For more information see:
- https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
+ must be higher than num_docs.
+
+ For more information see:
+ https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
Returns
-------
2 changes: 1 addition & 1 deletion top2vec/__init__.py
@@ -1,3 +1,3 @@
from top2vec.Top2Vec import Top2Vec

- __version__ = '1.0.16'
+ __version__ = '1.0.17'
127 changes: 110 additions & 17 deletions top2vec/tests/test_top2vec.py
@@ -50,7 +50,7 @@ def test_add_documents_original(top2vec_model):

topic_count_sum = sum(top2vec_model.get_topic_sizes()[0])

- if top2vec_model.document_ids is None:
+ if top2vec_model.document_ids_provided is False:
top2vec_model.add_documents(docs_to_add)
else:
doc_ids_new = [str(num) for num in range(2000, 2000 + len(docs_to_add))]
@@ -88,7 +88,7 @@ def test_add_documents_post_reduce(top2vec_model):
topic_count_sum = sum(top2vec_model.get_topic_sizes()[0])
topic_count_reduced_sum = sum(top2vec_model.get_topic_sizes(reduced=True)[0])

- if top2vec_model.document_ids is None:
+ if top2vec_model.document_ids_provided is False:
top2vec_model.add_documents(docs_to_add)
else:
doc_ids_new = [str(num) for num in range(2100, 2100 + len(docs_to_add))]
@@ -115,7 +115,7 @@ def test_delete_documents(top2vec_model):
topic_count_sum = sum(top2vec_model.get_topic_sizes()[0])
topic_count_reduced_sum = sum(top2vec_model.get_topic_sizes(reduced=True)[0])

- if top2vec_model.document_ids is None:
+ if top2vec_model.document_ids_provided is False:
top2vec_model.delete_documents(doc_ids=doc_ids_to_delete)
else:
doc_ids_to_delete = [str(doc_id) for doc_id in doc_ids_to_delete]
@@ -209,19 +209,16 @@ def test_search_documents_by_topic(top2vec_model, reduced):
assert all(document_scores[i] >= document_scores[i + 1] for i in range(len(document_scores) - 1))

# check that all documents returned are most similar to topic being searched
- if top2vec_model.document_ids is not None:
- document_indexes = [top2vec_model.doc_id2index[doc_id] for doc_id in document_ids]
- else:
- document_indexes = document_ids
+ document_indexes = [top2vec_model.doc_id2index[doc_id] for doc_id in document_ids]

if reduced:
doc_topics = set(np.argmax(
np.inner(top2vec_model._get_document_vectors()[document_indexes],
top2vec_model.topic_vectors_reduced), axis=1))
else:
doc_topics = set(np.argmax(
np.inner(top2vec_model._get_document_vectors()[document_indexes],
top2vec_model.topic_vectors), axis=1))
assert len(doc_topics) == 1 and topic in doc_topics


@@ -288,10 +285,7 @@ def test_search_topics(top2vec_model, reduced):

@pytest.mark.parametrize('top2vec_model', models)
def test_search_document_by_documents(top2vec_model):
- if top2vec_model.document_ids is not None:
- doc_id = top2vec_model.document_ids[0]
- else:
- doc_id = 0
+ doc_id = top2vec_model.document_ids[0]

num_docs = 10

@@ -314,10 +308,7 @@

@pytest.mark.parametrize('top2vec_model', models)
def test_get_documents_topics(top2vec_model):
- if top2vec_model.document_ids is not None:
- doc_ids_get = top2vec_model.document_ids[[0, 5]]
- else:
- doc_ids_get = [0, 5]
+ doc_ids_get = top2vec_model.document_ids[[0, 5]]

if top2vec_model.hierarchy is not None:
doc_topics, doc_dist, topic_words, topic_word_scores = top2vec_model.get_documents_topics(doc_ids=doc_ids_get,
@@ -326,3 +317,105 @@
doc_topics, doc_dist, topic_words, topic_word_scores = top2vec_model.get_documents_topics(doc_ids=doc_ids_get)

assert len(doc_topics) == len(doc_dist) == len(topic_words) == len(topic_word_scores) == len(doc_ids_get)


@pytest.mark.parametrize('top2vec_model', models)
def test_search_documents_by_vector(top2vec_model):
document_vectors = top2vec_model._get_document_vectors()
top2vec_model.search_documents_by_vector(vector=document_vectors[0], num_docs=10)

num_docs = 10

if top2vec_model.documents is not None:
documents, document_scores, document_ids = top2vec_model.search_documents_by_vector(vector=document_vectors[0],
num_docs=num_docs)
else:
document_scores, document_ids = top2vec_model.search_documents_by_vector(vector=document_vectors[0],
num_docs=num_docs)
if top2vec_model.documents is not None:
assert len(documents) == len(document_scores) == len(document_ids) == num_docs
else:
assert len(document_scores) == len(document_ids) == num_docs

# check that documents are returned in decreasing order
assert all(document_scores[i] >= document_scores[i + 1] for i in range(len(document_scores) - 1))


@pytest.mark.parametrize('top2vec_model', models)
def test_index_documents(top2vec_model):
top2vec_model.index_document_vectors()
assert top2vec_model._get_document_vectors().shape[0] <= top2vec_model.document_index.get_max_elements()


@pytest.mark.parametrize('top2vec_model', models)
def test_search_documents_by_vector_index(top2vec_model):
document_vectors = top2vec_model._get_document_vectors()
top2vec_model.search_documents_by_vector(vector=document_vectors[0], num_docs=10)

num_docs = 10

if top2vec_model.documents is not None:
documents, document_scores, document_ids = top2vec_model.search_documents_by_vector(vector=document_vectors[0],
num_docs=num_docs,
use_index=True)
else:
document_scores, document_ids = top2vec_model.search_documents_by_vector(vector=document_vectors[0],
num_docs=num_docs,
use_index=True)
if top2vec_model.documents is not None:
assert len(documents) == len(document_scores) == len(document_ids) == num_docs
else:
assert len(document_scores) == len(document_ids) == num_docs

# check that documents are returned in decreasing order
assert all(document_scores[i] >= document_scores[i + 1] for i in range(len(document_scores) - 1))


@pytest.mark.parametrize('top2vec_model', models)
def test_search_documents_by_keywords_index(top2vec_model):
keywords = get_model_vocab(top2vec_model)
keyword = keywords[-1]
num_docs = 10

if top2vec_model.documents is not None:
documents, document_scores, document_ids = top2vec_model.search_documents_by_keywords(keywords=[keyword],
num_docs=num_docs,
use_index=True)
else:
document_scores, document_ids = top2vec_model.search_documents_by_keywords(keywords=[keyword],
num_docs=num_docs,
use_index=True)

# check that for each document there is a score and number
if top2vec_model.documents is not None:
assert len(documents) == len(document_scores) == len(document_ids) == num_docs
else:
assert len(document_scores) == len(document_ids) == num_docs

# check that documents are returned in decreasing order
assert all(document_scores[i] >= document_scores[i + 1] for i in range(len(document_scores) - 1))


@pytest.mark.parametrize('top2vec_model', models)
def test_search_document_by_documents_index(top2vec_model):
doc_id = top2vec_model.document_ids[0]

num_docs = 10

if top2vec_model.documents is not None:
documents, document_scores, document_ids = top2vec_model.search_documents_by_documents(doc_ids=[doc_id],
num_docs=num_docs,
use_index=True)
else:
document_scores, document_ids = top2vec_model.search_documents_by_documents(doc_ids=[doc_id],
num_docs=num_docs,
use_index=True)

# check that for each document there is a score and number
if top2vec_model.documents is not None:
assert len(documents) == len(document_scores) == len(document_ids) == num_docs
else:
assert len(document_scores) == len(document_ids) == num_docs

# check that documents are returned in decreasing order
assert all(document_scores[i] >= document_scores[i + 1] for i in range(len(document_scores) - 1))
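
The index-backed tests mirror their brute-force counterparts so both code paths are held to the same expectations; running pytest top2vec/tests/test_top2vec.py should exercise both, provided the indexing extra is installed.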
