Skip to content

Commit

Permalink
fixing #147
Browse files (browse the repository at this point in the history)
  • Loading branch information
ddangelov committed Mar 13, 2021
1 parent 95b1930 commit 382ba5e
Showing 1 changed file with 7 additions and 3 deletions.
10 changes: 7 additions & 3 deletions top2vec/Top2Vec.py
Original file line number Diff line number Diff line change
@@ -313,11 +313,14 @@ def __init__(self,
logger.info('Pre-processing documents for training')

# preprocess documents
train_corpus = [' '.join(tokenizer(doc)) for doc in documents]
tokenized_corpus = [tokenizer(doc) for doc in documents]

# Identity helper: returns its argument unchanged. It is handed to
# CountVectorizer below as both `tokenizer` and `preprocessor`, and the
# vectorizer is then fit on the already-tokenized corpus.
def return_doc(doc):
return doc

# preprocess vocabulary
vectorizer = CountVectorizer()
doc_word_counts = vectorizer.fit_transform(train_corpus)
vectorizer = CountVectorizer(tokenizer=return_doc, preprocessor=return_doc)
doc_word_counts = vectorizer.fit_transform(tokenized_corpus)
words = vectorizer.get_feature_names()
word_counts = np.array(np.sum(doc_word_counts, axis=0).tolist()[0])
vocab_inds = np.where(word_counts > min_count)[0]
@@ -339,6 +342,7 @@ def __init__(self,
if use_embedding_model_tokenizer:
self.document_vectors = self._embed_documents(documents)
else:
train_corpus = [' '.join(tokens) for tokens in tokenized_corpus]
self.document_vectors = self._embed_documents(train_corpus)

else:
Expand Down

0 comments on commit 382ba5e

Please sign in to comment.