Skip to content

Commit

Permalink
get multiple topics per document
Browse files Browse the repository at this point in the history
  • Loading branch information
ddangelov committed Jun 22, 2021
1 parent a7b0d53 commit dae3991
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 28 deletions.
97 changes: 69 additions & 28 deletions top2vec/Top2Vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -634,7 +634,7 @@ def _reorder_topics(self, hierarchy=False):
self.topic_sizes.reset_index(drop=True, inplace=True)

@staticmethod
def _calculate_documents_topic(topic_vectors, document_vectors, dist=True):
def _calculate_documents_topic(topic_vectors, document_vectors, dist=True, num_topics=None):
batch_size = 10000
doc_top = []
if dist:
Expand All @@ -647,23 +647,47 @@ def _calculate_documents_topic(topic_vectors, document_vectors, dist=True):

for ind in range(0, batches):
res = np.inner(document_vectors[current:current + batch_size], topic_vectors)
doc_top.extend(np.argmax(res, axis=1))
if dist:
doc_dist.extend(np.max(res, axis=1))

if num_topics is None:
doc_top.extend(np.argmax(res, axis=1))
if dist:
doc_dist.extend(np.max(res, axis=1))
else:
doc_top.extend(np.flip(np.argsort(res), axis=1)[:, :num_topics])
if dist:
doc_dist.extend(np.flip(np.sort(res), axis=1)[:, :num_topics])

current += batch_size

if extra > 0:
res = np.inner(document_vectors[current:current + extra], topic_vectors)
doc_top.extend(np.argmax(res, axis=1))
if dist:
doc_dist.extend(np.max(res, axis=1))

if num_topics is None:
doc_top.extend(np.argmax(res, axis=1))
if dist:
doc_dist.extend(np.max(res, axis=1))
else:
doc_top.extend(np.flip(np.argsort(res), axis=1)[:, :num_topics])
if dist:
doc_dist.extend(np.flip(np.sort(res), axis=1)[:, :num_topics])
if dist:
doc_dist = np.array(doc_dist)
else:
res = np.inner(document_vectors, topic_vectors)
doc_top = np.argmax(res, axis=1)

if num_topics is None:
doc_top = np.argmax(res, axis=1)
if dist:
doc_dist = np.max(res, axis=1)
else:
doc_top.extend(np.flip(np.argsort(res), axis=1)[:, :num_topics])
if dist:
doc_dist.extend(np.flip(np.sort(res), axis=1)[:, :num_topics])

if num_topics is not None:
doc_top = np.array(doc_top)
if dist:
doc_dist = np.max(res, axis=1)
doc_dist = np.array(doc_dist)

if dist:
return doc_top, doc_dist
Expand Down Expand Up @@ -1062,7 +1086,7 @@ def change_to_download_embedding_model(self):
"""
self.embedding_model_path = None

def get_documents_topics(self, doc_ids, reduced=False):
def get_documents_topics(self, doc_ids, reduced=False, num_topics=1):
"""
Get document topics.
Expand All @@ -1074,24 +1098,27 @@ def get_documents_topics(self, doc_ids, reduced=False):
Parameters
----------
doc_ids: List of str, int
A unique value per document that is used for referring to documents
in search results. If ids were not given to the model, the index of
each document in the model is the id.
A unique value per document that is used for referring to
documents in search results. If ids were not given to the model,
the index of each document in the model is the id.
reduced: bool (Optional, default False)
Original topics are returned by default. If True the
reduced topics will be returned.
num_topics: int (Optional, default 1)
The number of topics to return per document.
Returns
-------
topic_nums: array of int, shape(doc_ids)
The topic number of the document corresponding to each doc_id.
topic_nums: array of int, shape(len(doc_ids), num_topics)
The topic number(s) of the document corresponding to each doc_id.
topic_score: array of float, shape(doc_ids)
Semantic similarity of document to topic. The cosine similarity of
the document and topic vector.
topic_score: array of float, shape(len(doc_ids), num_topics)
Semantic similarity of document to topic(s). The cosine similarity
of the document and topic vector.
topics_words: array of shape(num_topics, 50)
topics_words: array of shape(len(doc_ids), num_topics, 50)
For each topic the top 50 words are returned, in order
of semantic similarity to topic.
Expand Down Expand Up @@ -1119,16 +1146,30 @@ def get_documents_topics(self, doc_ids, reduced=False):
# get document indexes from ids
doc_indexes = self._get_document_indexes(doc_ids)

if reduced:
doc_topics = self.doc_top_reduced[doc_indexes]
doc_dist = self.doc_dist_reduced[doc_indexes]
topic_words = self.topic_words_reduced[doc_topics]
topic_word_scores = self.topic_word_scores_reduced[doc_topics]
if num_topics == 1:
if reduced:
doc_topics = self.doc_top_reduced[doc_indexes]
doc_dist = self.doc_dist_reduced[doc_indexes]
topic_words = self.topic_words_reduced[doc_topics]
topic_word_scores = self.topic_word_scores_reduced[doc_topics]
else:
doc_topics = self.doc_top[doc_indexes]
doc_dist = self.doc_dist[doc_indexes]
topic_words = self.topic_words[doc_topics]
topic_word_scores = self.topic_word_scores[doc_topics]

else:
doc_topics = self.doc_top[doc_indexes]
doc_dist = self.doc_dist[doc_indexes]
topic_words = self.topic_words[doc_topics]
topic_word_scores = self.topic_word_scores[doc_topics]
if reduced:
topic_vectors = self.topic_vectors_reduced
else:
topic_vectors = self.topic_vectors

doc_topics, doc_dist = self._calculate_documents_topic(topic_vectors,
self._get_document_vectors()[doc_indexes],
num_topics=num_topics)

topic_words = np.array([self.topic_words[topics] for topics in doc_topics])
topic_word_scores = np.array([self.topic_word_scores[topics] for topics in doc_topics])

return doc_topics, doc_dist, topic_words, topic_word_scores

Expand Down
27 changes: 27 additions & 0 deletions top2vec/tests/test_top2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,33 @@ def test_get_documents_topics(top2vec_model):
assert len(doc_topics) == len(doc_dist) == len(topic_words) == len(topic_word_scores) == len(doc_ids_get)


@pytest.mark.parametrize('top2vec_model', models)
def test_get_documents_topics_multiple(top2vec_model):
    """Ask for two topics per document and verify the returned shapes.

    Uses the reduced topic hierarchy when the model has one, otherwise the
    original topics. Each of the four returned arrays must have one row per
    requested document, and — when the model actually has at least
    ``num_topics`` topics — a second dimension of size ``num_topics``.
    """
    doc_ids_get = top2vec_model.document_ids[[0, 1, 5]]
    num_topics = 2

    # Query against the reduced hierarchy only when one was built.
    use_reduced = top2vec_model.hierarchy is not None
    doc_topics, doc_dist, topic_words, topic_word_scores = top2vec_model.get_documents_topics(
        doc_ids=doc_ids_get,
        reduced=use_reduced,
        num_topics=num_topics)
    actual_number_topics = top2vec_model.get_num_topics(reduced=use_reduced)

    # One row per requested document in every returned array.
    expected_rows = len(doc_ids_get)
    results = (doc_topics, doc_dist, topic_words, topic_word_scores)
    for result in results:
        assert len(result) == expected_rows

    # Shape check only makes sense when the model has enough topics.
    if num_topics <= actual_number_topics:
        for result in results:
            assert result.shape[1] == num_topics


@pytest.mark.parametrize('top2vec_model', models)
def test_search_documents_by_vector(top2vec_model):
document_vectors = top2vec_model._get_document_vectors()
Expand Down

0 comments on commit dae3991

Please sign in to comment.