Skip to content

Commit

Permalink
get multiple topics per document
Browse files Browse the repository at this point in the history
  • Loading branch information
ddangelov committed Jun 22, 2021
1 parent a7b0d53 commit dae3991
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 28 deletions.
97 changes: 69 additions & 28 deletions top2vec/Top2Vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -634,7 +634,7 @@ def _reorder_topics(self, hierarchy=False):
self.topic_sizes.reset_index(drop=True, inplace=True)

@staticmethod
def _calculate_documents_topic(topic_vectors, document_vectors, dist=True):
def _calculate_documents_topic(topic_vectors, document_vectors, dist=True, num_topics=None):
batch_size = 10000
doc_top = []
if dist:
Expand All @@ -647,23 +647,47 @@ def _calculate_documents_topic(topic_vectors, document_vectors, dist=True):

for ind in range(0, batches):
res = np.inner(document_vectors[current:current + batch_size], topic_vectors)
doc_top.extend(np.argmax(res, axis=1))
if dist:
doc_dist.extend(np.max(res, axis=1))

if num_topics is None:
doc_top.extend(np.argmax(res, axis=1))
if dist:
doc_dist.extend(np.max(res, axis=1))
else:
doc_top.extend(np.flip(np.argsort(res), axis=1)[:, :num_topics])
if dist:
doc_dist.extend(np.flip(np.sort(res), axis=1)[:, :num_topics])

current += batch_size

if extra > 0:
res = np.inner(document_vectors[current:current + extra], topic_vectors)
doc_top.extend(np.argmax(res, axis=1))
if dist:
doc_dist.extend(np.max(res, axis=1))

if num_topics is None:
doc_top.extend(np.argmax(res, axis=1))
if dist:
doc_dist.extend(np.max(res, axis=1))
else:
doc_top.extend(np.flip(np.argsort(res), axis=1)[:, :num_topics])
if dist:
doc_dist.extend(np.flip(np.sort(res), axis=1)[:, :num_topics])
if dist:
doc_dist = np.array(doc_dist)
else:
res = np.inner(document_vectors, topic_vectors)
doc_top = np.argmax(res, axis=1)

if num_topics is None:
doc_top = np.argmax(res, axis=1)
if dist:
doc_dist = np.max(res, axis=1)
else:
doc_top.extend(np.flip(np.argsort(res), axis=1)[:, :num_topics])
if dist:
doc_dist.extend(np.flip(np.sort(res), axis=1)[:, :num_topics])

if num_topics is not None:
doc_top = np.array(doc_top)
if dist:
doc_dist = np.max(res, axis=1)
doc_dist = np.array(doc_dist)

if dist:
return doc_top, doc_dist
Expand Down Expand Up @@ -1062,7 +1086,7 @@ def change_to_download_embedding_model(self):
"""
self.embedding_model_path = None

def get_documents_topics(self, doc_ids, reduced=False):
def get_documents_topics(self, doc_ids, reduced=False, num_topics=1):
"""
Get document topics.
Expand All @@ -1074,24 +1098,27 @@ def get_documents_topics(self, doc_ids, reduced=False):
Parameters
----------
doc_ids: List of str, int
A unique value per document that is used for referring to documents
in search results. If ids were not given to the model, the index of
each document in the model is the id.
A unique value per document that is used for referring to
documents in search results. If ids were not given to the model,
the index of each document in the model is the id.
reduced: bool (Optional, default False)
Original topics are returned by default. If True the
reduced topics will be returned.
num_topics: int (Optional, default 1)
The number of topics to return per document.
Returns
-------
topic_nums: array of int, shape(doc_ids)
The topic number of the document corresponding to each doc_id.
topic_nums: array of int, shape(len(doc_ids), num_topics)
The topic number(s) of the document corresponding to each doc_id.
topic_score: array of float, shape(doc_ids)
Semantic similarity of document to topic. The cosine similarity of
the document and topic vector.
topic_score: array of float, shape(len(doc_ids), num_topics)
Semantic similarity of document to topic(s). The cosine similarity
of the document and topic vector.
topics_words: array of shape(num_topics, 50)
topics_words: array of shape(len(doc_ids), num_topics, 50)
For each topic the top 50 words are returned, in order
of semantic similarity to topic.
Expand Down Expand Up @@ -1119,16 +1146,30 @@ def get_documents_topics(self, doc_ids, reduced=False):
# get document indexes from ids
doc_indexes = self._get_document_indexes(doc_ids)

if reduced:
doc_topics = self.doc_top_reduced[doc_indexes]
doc_dist = self.doc_dist_reduced[doc_indexes]
topic_words = self.topic_words_reduced[doc_topics]
topic_word_scores = self.topic_word_scores_reduced[doc_topics]
if num_topics == 1:
if reduced:
doc_topics = self.doc_top_reduced[doc_indexes]
doc_dist = self.doc_dist_reduced[doc_indexes]
topic_words = self.topic_words_reduced[doc_topics]
topic_word_scores = self.topic_word_scores_reduced[doc_topics]
else:
doc_topics = self.doc_top[doc_indexes]
doc_dist = self.doc_dist[doc_indexes]
topic_words = self.topic_words[doc_topics]
topic_word_scores = self.topic_word_scores[doc_topics]

else:
doc_topics = self.doc_top[doc_indexes]
doc_dist = self.doc_dist[doc_indexes]
topic_words = self.topic_words[doc_topics]
topic_word_scores = self.topic_word_scores[doc_topics]
if reduced:
topic_vectors = self.topic_vectors_reduced
else:
topic_vectors = self.topic_vectors

doc_topics, doc_dist = self._calculate_documents_topic(topic_vectors,
self._get_document_vectors()[doc_indexes],
num_topics=num_topics)

topic_words = np.array([self.topic_words[topics] for topics in doc_topics])
topic_word_scores = np.array([self.topic_word_scores[topics] for topics in doc_topics])

return doc_topics, doc_dist, topic_words, topic_word_scores

Expand Down
27 changes: 27 additions & 0 deletions top2vec/tests/test_top2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,33 @@ def test_get_documents_topics(top2vec_model):
assert len(doc_topics) == len(doc_dist) == len(topic_words) == len(topic_word_scores) == len(doc_ids_get)


@pytest.mark.parametrize('top2vec_model', models)
def test_get_documents_topics_multiple(top2vec_model):
    """Ask for two topics per document and verify the returned shapes.

    Uses the reduced topic hierarchy when the model has one, otherwise the
    original topics. Each of the four returned arrays must have one row per
    requested document, and — when the model actually has at least
    ``num_topics`` topics — a second dimension of size ``num_topics``.
    """
    doc_ids_get = top2vec_model.document_ids[[0, 1, 5]]
    num_topics = 2

    # Query against the reduced hierarchy only when one was built.
    use_reduced = top2vec_model.hierarchy is not None
    doc_topics, doc_dist, topic_words, topic_word_scores = top2vec_model.get_documents_topics(
        doc_ids=doc_ids_get,
        reduced=use_reduced,
        num_topics=num_topics)
    actual_number_topics = top2vec_model.get_num_topics(reduced=use_reduced)

    # One row per requested document in every returned array.
    expected_rows = len(doc_ids_get)
    results = (doc_topics, doc_dist, topic_words, topic_word_scores)
    for result in results:
        assert len(result) == expected_rows

    # Shape check only makes sense when the model has enough topics.
    if num_topics <= actual_number_topics:
        for result in results:
            assert result.shape[1] == num_topics


@pytest.mark.parametrize('top2vec_model', models)
def test_search_documents_by_vector(top2vec_model):
document_vectors = top2vec_model._get_document_vectors()
Expand Down

0 comments on commit dae3991

Please sign in to comment.