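Fix #147: build the vocabulary from the already-tokenized documents instead of re-tokenizing joined strings with CountVectorizer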

ddangelov · Mar 13, 2021 · 382ba5e
1 parent 95b1930
commit 382ba5e
Showing 1 changed file with 7 additions and 3 deletions.
diff --git a/top2vec/Top2Vec.py b/top2vec/Top2Vec.py
@@ -313,11 +313,14 @@ def __init__(self,
             logger.info('Pre-processing documents for training')
 
             # preprocess documents
-            train_corpus = [' '.join(tokenizer(doc)) for doc in documents]
+            tokenized_corpus = [tokenizer(doc) for doc in documents]
+
+            def return_doc(doc):
+                return doc
 
             # preprocess vocabulary
-            vectorizer = CountVectorizer()
-            doc_word_counts = vectorizer.fit_transform(train_corpus)
+            vectorizer = CountVectorizer(tokenizer=return_doc, preprocessor=return_doc)
+            doc_word_counts = vectorizer.fit_transform(tokenized_corpus)
             words = vectorizer.get_feature_names()
             word_counts = np.array(np.sum(doc_word_counts, axis=0).tolist()[0])
             vocab_inds = np.where(word_counts > min_count)[0]
@@ -339,6 +342,7 @@ def __init__(self,
             if use_embedding_model_tokenizer:
                 self.document_vectors = self._embed_documents(documents)
             else:
+                train_corpus = [' '.join(tokens) for tokens in tokenized_corpus]
                 self.document_vectors = self._embed_documents(train_corpus)
 
         else: