
Commit a5a4221

document indexing unit tests and modifications
ddangelov committed Dec 7, 2020
1 parent b1d6338 commit a5a4221
Showing 6 changed files with 165 additions and 34 deletions.
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -29,7 +29,7 @@
author = 'Dimo Angelov'

# The full version, including alpha/beta/rc tags
- release = '1.0.16'
+ release = '1.0.17'


# -- General configuration ---------------------------------------------------
3 changes: 2 additions & 1 deletion requirements.txt
@@ -9,4 +9,5 @@ tensorflow
tensorflow_hub
tensorflow_text
torch
- sentence_transformers
+ sentence_transformers
+ hnswlib
5 changes: 4 additions & 1 deletion setup.py
@@ -6,7 +6,7 @@
setuptools.setup(
name="top2vec",
packages=["top2vec"],
version="1.0.16",
version="1.0.17",
author="Dimo Angelov",
author_email="dimo.angelov@gmail.com",
description="Top2Vec learns jointly embedded topic, document and word vectors.",
@@ -44,6 +44,9 @@
'torch',
'sentence_transformers',
],
'indexing': [
'hnswlib',
],
},
python_requires='>=3.6',
)
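
Because hnswlib lives under extras_require rather than install_requires, index support is opt-in: a plain install keeps the existing brute-force search path, while pip install top2vec[indexing] additionally pulls in hnswlib for the index-backed path.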
60 changes: 47 additions & 13 deletions top2vec/Top2Vec.py
@@ -101,6 +101,7 @@ class Top2Vec:
default. However they can also be loaded from a file that is in the
location of embedding_model_path.
Warning: the model at embedding_model_path must match the
embedding_model parameter type.
@@ -183,6 +184,8 @@ def __init__(self,
self._tokenizer = default_tokenizer

# validate documents
if not (isinstance(documents, list) or isinstance(documents, np.ndarray)):
raise ValueError("Documents need to be a list of strings")
if not all((isinstance(doc, str) or isinstance(doc, np.str_)) for doc in documents):
raise ValueError("Documents need to be a list of strings")
if keep_documents:
@@ -192,6 +195,9 @@ def __init__(self,

# validate document ids
if document_ids is not None:
if not (isinstance(document_ids, list) or isinstance(document_ids, np.ndarray)):
raise ValueError("Documents ids need to be a list of str or int")

if len(documents) != len(document_ids):
raise ValueError("Document ids need to match number of documents")
elif len(document_ids) != len(set(document_ids)):
@@ -366,6 +372,8 @@ def __init__(self,
self.document_index = None
self.serialized_document_index = None
self.documents_indexed = False
self.index_id2doc_id = None
self.doc_id2index_id = None

def save(self, file):
"""
@@ -863,7 +871,7 @@ def _validate_vector(self, vector):
if not vector.shape[0] == vec_size:
raise ValueError(f"Vector needs to be of {vec_size} dimensions.")

- def index_documents_vectors(self, ef_construction=200, M=64):
+ def index_document_vectors(self, ef_construction=200, M=64):
"""
Creates an index of the document vectors using hnswlib. This will
lead to faster search times for models with a large number of
@@ -893,9 +901,14 @@ def index_documents_vectors(self, ef_construction=200, M=64):
vec_dim = self._get_document_vectors().shape[1]
num_vecs = self._get_document_vectors().shape[0]

index_ids = list(range(0, len(self.document_ids)))

self.index_id2doc_id = dict(zip(index_ids, self.document_ids))
self.doc_id2index_id = dict(zip(self.document_ids, index_ids))

self.document_index = hnswlib.Index(space='ip', dim=vec_dim)
self.document_index.init_index(max_elements=num_vecs, ef_construction=ef_construction, M=M)
- self.document_index.add_items(self._get_document_vectors(), self.document_ids)
+ self.document_index.add_items(self._get_document_vectors(), index_ids)
self.documents_indexed = True
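
A minimal usage sketch for the new index path, assuming a trained model (the docs variable and sample vector are illustrative; index_document_vectors and the use_index flag come from this commit):

from top2vec import Top2Vec

model = Top2Vec(documents=docs)  # docs: a list of strings (hypothetical)
model.index_document_vectors(ef_construction=200, M=64)  # defaults shown above

# subsequent searches can route through the hnswlib index instead of brute force
documents, document_scores, document_ids = model.search_documents_by_vector(
    vector=model._get_document_vectors()[0], num_docs=10, use_index=True)

Higher ef_construction and M buy recall at the cost of build time and memory; see the hnswlib ALGO_PARAMS guide linked in the docstrings below.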

def update_embedding_model_path(self, embedding_model_path):
@@ -1060,10 +1073,17 @@ def add_documents(self, documents, doc_ids=None):

# update index
if self.documents_indexed:
# update capacity of index
current_max = self.document_index.get_max_elements()
updated_max = current_max + len(documents)
self.document_index.resize_index(updated_max)
- self.document_index.add_items(document_vectors, doc_ids)
+ # update index_id and doc_ids
+ start_index_id = max(self.index_id2doc_id.keys()) + 1
+ new_index_ids = list(range(start_index_id, start_index_id + len(doc_ids)))
+ self.index_id2doc_id.update(dict(zip(new_index_ids, doc_ids)))
+ self.doc_id2index_id.update(dict(zip(doc_ids, new_index_ids)))
+ self.document_index.add_items(document_vectors, new_index_ids)

# update topics
self._assign_documents_to_topic(document_vectors, hierarchy=False)
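
The id bookkeeping above exists because hnswlib labels items with integers while Top2Vec document ids may be strings. A toy, self-contained illustration of the mapping contract (values hypothetical):

index_id2doc_id = {0: "doc-a", 1: "doc-b"}
doc_id2index_id = {"doc-a": 0, "doc-b": 1}

# new documents continue the dense integer label sequence
start_index_id = max(index_id2doc_id.keys()) + 1  # 2
new_doc_ids = ["doc-c", "doc-d"]
new_index_ids = list(range(start_index_id, start_index_id + len(new_doc_ids)))
index_id2doc_id.update(dict(zip(new_index_ids, new_doc_ids)))
doc_id2index_id.update(dict(zip(new_doc_ids, new_index_ids)))

assert index_id2doc_id[doc_id2index_id["doc-c"]] == "doc-c"  # ids round-trip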
@@ -1096,8 +1116,15 @@ def delete_documents(self, doc_ids):

# update index
if self.documents_indexed:
+ # delete doc_ids from index
+ index_ids = [self.doc_id2index_id[doc_id] for doc_id in doc_ids]
+ for index_id in index_ids:
+ self.document_index.mark_deleted(index_id)
+ # update index_id and doc_ids
for doc_id in doc_ids:
- self.document_index.mark_deleted(doc_id)
+ self.doc_id2index_id.pop(doc_id)
+ for index_id in index_ids:
+ self.index_id2doc_id.pop(index_id)

# get document indexes from ids
doc_indexes = self._get_document_indexes(doc_ids)
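
Note that hnswlib's mark_deleted only hides an element from future queries; the slot is not reclaimed. Pruning doc_id2index_id and index_id2doc_id in lockstep, as above, keeps the mappings consistent with what the index will actually return.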
@@ -1420,9 +1447,11 @@ def search_documents_by_vector(self, vector, num_docs, return_documents=True, use_index=False, ef=None):
ef: int (Optional default None)
Higher ef leads to more accurate but slower search. This value
- must be higher than num_docs. For more information see:
- https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
+ must be higher than num_docs.
+
+ For more information see:
+ https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
Returns
-------
@@ -1449,10 +1478,11 @@ def search_documents_by_vector(self, vector, num_docs, return_documents=True, use_index=False, ef=None):
if ef is not None:
self.document_index.set_ef(ef)

- doc_ids, doc_scores = self.document_index.knn_query(vector, k=num_docs)
- doc_ids = doc_ids[0]
+ index_ids, doc_scores = self.document_index.knn_query(vector, k=num_docs)
+ index_ids = index_ids[0]
+ doc_ids = np.array([self.index_id2doc_id[index_id] for index_id in index_ids])
doc_scores = doc_scores[0]
- doc_scores = np.array([1-score for score in doc_scores])
+ doc_scores = np.array([1 - score for score in doc_scores])
doc_indexes = self._get_document_indexes(doc_ids)
else:
doc_indexes, doc_scores = self._search_vectors_by_vector(self._get_document_vectors(),
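
For context on the 1 - score conversion above: with space='ip', hnswlib reports distances of one minus the inner product, so subtracting from one recovers the similarity. A self-contained sketch with synthetic data (not from the repo):

import hnswlib
import numpy as np

dim, n = 64, 1000
vectors = np.random.rand(n, dim).astype(np.float32)
vectors /= np.linalg.norm(vectors, axis=1, keepdims=True)  # unit norm: ip == cosine

index = hnswlib.Index(space='ip', dim=dim)
index.init_index(max_elements=n, ef_construction=200, M=64)
index.add_items(vectors, np.arange(n))

index.set_ef(50)  # must exceed k, mirroring the ef notes in the docstrings
labels, distances = index.knn_query(vectors[:1], k=10)
similarities = 1 - distances[0]  # undo hnswlib's 1 - inner_product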
@@ -1570,9 +1600,11 @@ def search_documents_by_keywords(self, keywords, num_docs, keywords_neg=None, return_documents=True, use_index=False, ef=None):
ef: int (Optional default None)
Higher ef leads to more accurate but slower search. This value
- must be higher than num_docs. For more information see:
- https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
+ must be higher than num_docs.
+
+ For more information see:
+ https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
Returns
-------
@@ -1806,9 +1838,11 @@ def search_documents_by_documents(self, doc_ids, num_docs, doc_ids_neg=None, return_documents=True, use_index=False, ef=None):
ef: int (Optional default None)
Higher ef leads to more accurate but slower search. This value
- must be higher than num_docs. For more information see:
- https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
+ must be higher than num_docs.
+
+ For more information see:
+ https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
Returns
-------
2 changes: 1 addition & 1 deletion top2vec/__init__.py
@@ -1,3 +1,3 @@
from top2vec.Top2Vec import Top2Vec

- __version__ = '1.0.16'
+ __version__ = '1.0.17'
127 changes: 110 additions & 17 deletions top2vec/tests/test_top2vec.py
@@ -50,7 +50,7 @@ def test_add_documents_original(top2vec_model):

topic_count_sum = sum(top2vec_model.get_topic_sizes()[0])

- if top2vec_model.document_ids is None:
+ if top2vec_model.document_ids_provided is False:
top2vec_model.add_documents(docs_to_add)
else:
doc_ids_new = [str(num) for num in range(2000, 2000 + len(docs_to_add))]
@@ -88,7 +88,7 @@ def test_add_documents_post_reduce(top2vec_model):
topic_count_sum = sum(top2vec_model.get_topic_sizes()[0])
topic_count_reduced_sum = sum(top2vec_model.get_topic_sizes(reduced=True)[0])

- if top2vec_model.document_ids is None:
+ if top2vec_model.document_ids_provided is False:
top2vec_model.add_documents(docs_to_add)
else:
doc_ids_new = [str(num) for num in range(2100, 2100 + len(docs_to_add))]
@@ -115,7 +115,7 @@ def test_delete_documents(top2vec_model):
topic_count_sum = sum(top2vec_model.get_topic_sizes()[0])
topic_count_reduced_sum = sum(top2vec_model.get_topic_sizes(reduced=True)[0])

- if top2vec_model.document_ids is None:
+ if top2vec_model.document_ids_provided is False:
top2vec_model.delete_documents(doc_ids=doc_ids_to_delete)
else:
doc_ids_to_delete = [str(doc_id) for doc_id in doc_ids_to_delete]
@@ -209,19 +209,16 @@ def test_search_documents_by_topic(top2vec_model, reduced):
assert all(document_scores[i] >= document_scores[i + 1] for i in range(len(document_scores) - 1))

# check that all documents returned are most similar to topic being searched
- if top2vec_model.document_ids is not None:
- document_indexes = [top2vec_model.doc_id2index[doc_id] for doc_id in document_ids]
- else:
- document_indexes = document_ids
+ document_indexes = [top2vec_model.doc_id2index[doc_id] for doc_id in document_ids]

if reduced:
doc_topics = set(np.argmax(
np.inner(top2vec_model._get_document_vectors()[document_indexes],
top2vec_model.topic_vectors_reduced), axis=1))
else:
doc_topics = set(np.argmax(
np.inner(top2vec_model._get_document_vectors()[document_indexes],
top2vec_model.topic_vectors), axis=1))
assert len(doc_topics) == 1 and topic in doc_topics


@@ -288,10 +285,7 @@ def test_search_topics(top2vec_model, reduced):

@pytest.mark.parametrize('top2vec_model', models)
def test_search_document_by_documents(top2vec_model):
- if top2vec_model.document_ids is not None:
- doc_id = top2vec_model.document_ids[0]
- else:
- doc_id = 0
+ doc_id = top2vec_model.document_ids[0]

num_docs = 10

@@ -314,10 +308,7 @@

@pytest.mark.parametrize('top2vec_model', models)
def test_get_documents_topics(top2vec_model):
- if top2vec_model.document_ids is not None:
- doc_ids_get = top2vec_model.document_ids[[0, 5]]
- else:
- doc_ids_get = [0, 5]
+ doc_ids_get = top2vec_model.document_ids[[0, 5]]

if top2vec_model.hierarchy is not None:
doc_topics, doc_dist, topic_words, topic_word_scores = top2vec_model.get_documents_topics(doc_ids=doc_ids_get,
@@ -326,3 +317,105 @@
doc_topics, doc_dist, topic_words, topic_word_scores = top2vec_model.get_documents_topics(doc_ids=doc_ids_get)

assert len(doc_topics) == len(doc_dist) == len(topic_words) == len(topic_word_scores) == len(doc_ids_get)


@pytest.mark.parametrize('top2vec_model', models)
def test_search_documents_by_vector(top2vec_model):
document_vectors = top2vec_model._get_document_vectors()
top2vec_model.search_documents_by_vector(vector=document_vectors[0], num_docs=10)

num_docs = 10

if top2vec_model.documents is not None:
documents, document_scores, document_ids = top2vec_model.search_documents_by_vector(vector=document_vectors[0],
num_docs=num_docs)
else:
document_scores, document_ids = top2vec_model.search_documents_by_vector(vector=document_vectors[0],
num_docs=num_docs)
if top2vec_model.documents is not None:
assert len(documents) == len(document_scores) == len(document_ids) == num_docs
else:
assert len(document_scores) == len(document_ids) == num_docs

# check that documents are returned in decreasing order
assert all(document_scores[i] >= document_scores[i + 1] for i in range(len(document_scores) - 1))


@pytest.mark.parametrize('top2vec_model', models)
def test_index_documents(top2vec_model):
top2vec_model.index_document_vectors()
assert top2vec_model._get_document_vectors().shape[0] <= top2vec_model.document_index.get_max_elements()


@pytest.mark.parametrize('top2vec_model', models)
def test_search_documents_by_vector_index(top2vec_model):
document_vectors = top2vec_model._get_document_vectors()
top2vec_model.search_documents_by_vector(vector=document_vectors[0], num_docs=10)

num_docs = 10

if top2vec_model.documents is not None:
documents, document_scores, document_ids = top2vec_model.search_documents_by_vector(vector=document_vectors[0],
num_docs=num_docs,
use_index=True)
else:
document_scores, document_ids = top2vec_model.search_documents_by_vector(vector=document_vectors[0],
num_docs=num_docs,
use_index=True)
if top2vec_model.documents is not None:
assert len(documents) == len(document_scores) == len(document_ids) == num_docs
else:
assert len(document_scores) == len(document_ids) == num_docs

# check that documents are returned in decreasing order
assert all(document_scores[i] >= document_scores[i + 1] for i in range(len(document_scores) - 1))


@pytest.mark.parametrize('top2vec_model', models)
def test_search_documents_by_keywords_index(top2vec_model):
keywords = get_model_vocab(top2vec_model)
keyword = keywords[-1]
num_docs = 10

if top2vec_model.documents is not None:
documents, document_scores, document_ids = top2vec_model.search_documents_by_keywords(keywords=[keyword],
num_docs=num_docs,
use_index=True)
else:
document_scores, document_ids = top2vec_model.search_documents_by_keywords(keywords=[keyword],
num_docs=num_docs,
use_index=True)

# check that for each document there is a score and number
if top2vec_model.documents is not None:
assert len(documents) == len(document_scores) == len(document_ids) == num_docs
else:
assert len(document_scores) == len(document_ids) == num_docs

# check that documents are returned in decreasing order
assert all(document_scores[i] >= document_scores[i + 1] for i in range(len(document_scores) - 1))


@pytest.mark.parametrize('top2vec_model', models)
def test_search_document_by_documents_index(top2vec_model):
doc_id = top2vec_model.document_ids[0]

num_docs = 10

if top2vec_model.documents is not None:
documents, document_scores, document_ids = top2vec_model.search_documents_by_documents(doc_ids=[doc_id],
num_docs=num_docs,
use_index=True)
else:
document_scores, document_ids = top2vec_model.search_documents_by_documents(doc_ids=[doc_id],
num_docs=num_docs,
use_index=True)

# check that for each document there is a score and number
if top2vec_model.documents is not None:
assert len(documents) == len(document_scores) == len(document_ids) == num_docs
else:
assert len(document_scores) == len(document_ids) == num_docs

# check that documents are returned in decreasing order
assert all(document_scores[i] >= document_scores[i + 1] for i in range(len(document_scores) - 1))
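
The index-backed tests mirror their brute-force counterparts so both code paths are held to the same expectations; running pytest top2vec/tests/test_top2vec.py should exercise both, provided the indexing extra is installed.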
