Skip to content

Commit

Permalink
Fixed add/delete document bug (#35)
Browse files Browse the repository at this point in the history
  • Loading branch information
ddangelov committed Oct 8, 2020
1 parent b82de5a commit 2ff6d96
Showing 1 changed file with 100 additions and 93 deletions.
193 changes: 100 additions & 93 deletions top2vec/Top2Vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,17 +231,25 @@ def __init__(self, documents, min_count=50, speed="learn", use_corpus_file=False

# calculate topic vectors from dense areas of documents
logger.info('Finding topics')

# create topic vectors
self._create_topic_vectors(cluster.labels_)

# deduplicate topics
self._deduplicate_topics()

# calculate topic sizes and index nearest topic for each document
self.topic_vectors, self.doc_top, self.doc_dist, self.topic_sizes = self._calculate_topic_sizes(
self.topic_vectors)

# find topic words and scores
self.topic_words, self.topic_word_scores = self._find_topic_words_scores(topic_vectors=self.topic_vectors)
self.topic_words, self.topic_word_scores = self._find_topic_words_and_scores(topic_vectors=self.topic_vectors)

# assign documents to topic
self.doc_top, self.doc_dist = self._calculate_documents_topic(self.topic_vectors,
self.model.docvecs.vectors_docs)

# calculate topic sizes
self.topic_sizes = self._calculate_topic_sizes(hierarchy=False)

# re-order topics
self._reorder_topics(hierarchy=False)

# initialize variables for hierarchical topic reduction
self.topic_vectors_reduced = None
Expand Down Expand Up @@ -284,28 +292,31 @@ def _deduplicate_topics(self):

self.topic_vectors = unique_topics

def _calculate_topic_sizes(self, topic_vectors, hierarchy=None):
# find nearest topic of each document
doc_top, doc_dist = self._calculate_documents_topic(topic_vectors=topic_vectors,
document_vectors=self.model.docvecs.vectors_docs)
topic_sizes = pd.Series(doc_top).value_counts()

return self._reorder_topics(topic_vectors, topic_sizes, doc_top, doc_dist, hierarchy)
def _calculate_topic_sizes(self, hierarchy=False):
if hierarchy:
topic_sizes = pd.Series(self.doc_top_reduced).value_counts()
else:
topic_sizes = pd.Series(self.doc_top).value_counts()

@staticmethod
def _reorder_topics(topic_vectors, topic_sizes, doc_top, doc_dist, hierarchy=None):
topic_vectors = topic_vectors[topic_sizes.index]
old2new = dict(zip(topic_sizes.index, range(topic_sizes.shape[0])))
doc_top = np.array([old2new[i] for i in doc_top])
return topic_sizes

if hierarchy is None:
topic_sizes.reset_index(drop=True, inplace=True)
return topic_vectors, doc_top, doc_dist, topic_sizes
def _reorder_topics(self, hierarchy=False):

if hierarchy:
self.topic_vectors_reduced = self.topic_vectors_reduced[self.topic_sizes_reduced.index]
self.topic_words_reduced = self.topic_words_reduced[self.topic_sizes_reduced.index]
self.topic_word_scores_reduced = self.topic_word_scores_reduced[self.topic_sizes_reduced.index]
old2new = dict(zip(self.topic_sizes_reduced.index, range(self.topic_sizes_reduced.index.shape[0])))
self.doc_top_reduced = np.array([old2new[i] for i in self.doc_top_reduced])
self.hierarchy = [self.hierarchy[i] for i in self.topic_sizes_reduced.index]
self.topic_sizes_reduced.reset_index(drop=True, inplace=True)
else:
hierarchy = [hierarchy[i] for i in topic_sizes.index]
topic_sizes.reset_index(drop=True, inplace=True)
return topic_vectors, doc_top, doc_dist, topic_sizes, hierarchy
self.topic_vectors = self.topic_vectors[self.topic_sizes.index]
self.topic_words = self.topic_words[self.topic_sizes.index]
self.topic_word_scores = self.topic_word_scores[self.topic_sizes.index]
old2new = dict(zip(self.topic_sizes.index, range(self.topic_sizes.index.shape[0])))
self.doc_top = np.array([old2new[i] for i in self.doc_top])
self.topic_sizes.reset_index(drop=True, inplace=True)

@staticmethod
def _calculate_documents_topic(topic_vectors, document_vectors, dist=True):
Expand Down Expand Up @@ -344,7 +355,7 @@ def _calculate_documents_topic(topic_vectors, document_vectors, dist=True):
else:
return doc_top

def _find_topic_words_scores(self, topic_vectors):
def _find_topic_words_and_scores(self, topic_vectors):
topic_words = []
topic_word_scores = []

Expand All @@ -358,6 +369,51 @@ def _find_topic_words_scores(self, topic_vectors):

return topic_words, topic_word_scores

def _assign_documents_to_topic(self, document_vectors, hierarchy=False):

if hierarchy:
doc_top_new, doc_dist_new = self._calculate_documents_topic(self.topic_vectors_reduced,
document_vectors,
dist=True)
self.doc_top_reduced = np.append(self.doc_top_reduced, doc_top_new)
self.doc_dist_reduced = np.append(self.doc_dist_reduced, doc_dist_new)

topic_sizes_new = pd.Series(doc_top_new).value_counts()
for top in topic_sizes_new.index.tolist():
self.topic_sizes_reduced[top] += topic_sizes_new[top]
self.topic_sizes_reduced.sort_values(ascending=False, inplace=True)
self._reorder_topics(hierarchy)
else:
doc_top_new, doc_dist_new = self._calculate_documents_topic(self.topic_vectors, document_vectors, dist=True)
self.doc_top = np.append(self.doc_top, doc_top_new)
self.doc_dist = np.append(self.doc_dist, doc_dist_new)

topic_sizes_new = pd.Series(doc_top_new).value_counts()
for top in topic_sizes_new.index.tolist():
self.topic_sizes[top] += topic_sizes_new[top]
self.topic_sizes.sort_values(ascending=False, inplace=True)
self._reorder_topics(hierarchy)

def _unassign_documents_from_topic(self, doc_indexes, hierarchy=False):
if hierarchy:
doc_top_remove = self.doc_top_reduced[doc_indexes]
self.doc_top_reduced = np.delete(self.doc_top_reduced, doc_indexes, 0)
self.doc_dist_reduced = np.delete(self.doc_dist_reduced, doc_indexes, 0)
topic_sizes_remove = pd.Series(doc_top_remove).value_counts()
for top in topic_sizes_remove.index.tolist():
self.topic_sizes_reduced[top] -= topic_sizes_remove[top]
self.topic_sizes_reduced.sort_values(ascending=False, inplace=True)
self._reorder_topics(hierarchy)
else:
doc_top_remove = self.doc_top[doc_indexes]
self.doc_top = np.delete(self.doc_top, doc_indexes, 0)
self.doc_dist = np.delete(self.doc_dist, doc_indexes, 0)
topic_sizes_remove = pd.Series(doc_top_remove).value_counts()
for top in topic_sizes_remove.index.tolist():
self.topic_sizes[top] -= topic_sizes_remove[top]
self.topic_sizes.sort_values(ascending=False, inplace=True)
self._reorder_topics(hierarchy)

def save(self, file):
"""
Saves the current model to the specified file.
Expand Down Expand Up @@ -505,41 +561,6 @@ def _validate_documents(documents):
if not all((isinstance(doc, str) or isinstance(doc, np.str_)) for doc in documents):
raise ValueError("Documents need to be a list of strings.")

def _assign_documents_to_topic(self, document_vectors, topic_vectors, topic_sizes, doc_top, doc_dist,
                               hierarchy=None):
    # NOTE(review): this is the pre-fix version shown as a deletion in this
    # diff; the commit replaces it with a stateful
    # _assign_documents_to_topic(document_vectors, hierarchy=False).

    # Nearest topic and distance for each new document vector.
    doc_top_new, doc_dist_new = self._calculate_documents_topic(topic_vectors, document_vectors, dist=True)
    doc_top = np.append(doc_top, doc_top_new)
    doc_dist = np.append(doc_dist, doc_dist_new)

    # Grow each topic that gained documents; keep sizes sorted largest-first.
    topic_sizes_new = pd.Series(doc_top_new).value_counts()
    for top in topic_sizes_new.index.tolist():
        topic_sizes[top] += topic_sizes_new[top]
    topic_sizes.sort_values(ascending=False, inplace=True)

    # Re-rank topics by the updated sizes; the hierarchy list, when given,
    # is passed through so it gets permuted alongside the topics.
    if hierarchy is None:
        return self._reorder_topics(topic_vectors, topic_sizes, doc_top, doc_dist)
    else:
        return self._reorder_topics(topic_vectors, topic_sizes, doc_top, doc_dist, hierarchy)

def _unassign_documents_from_topic(self, doc_indexes, doc_top, doc_dist, topic_sizes, topic_vectors,
                                   hierarchy=None):
    # NOTE(review): this is the pre-fix version shown as a deletion in this
    # diff; the commit replaces it with a stateful
    # _unassign_documents_from_topic(doc_indexes, hierarchy=False).

    # Topics of the documents being removed, then drop those rows from the
    # per-document topic and distance arrays.
    doc_top_remove = doc_top[doc_indexes]
    doc_top = np.delete(doc_top, doc_indexes, 0)
    doc_dist = np.delete(doc_dist, doc_indexes, 0)

    topic_sizes_remove = pd.Series(doc_top_remove).value_counts()

    # Shrink each topic that lost documents; keep sizes sorted largest-first.
    for top in topic_sizes_remove.index.tolist():
        topic_sizes[top] -= topic_sizes_remove[top]
    topic_sizes.sort_values(ascending=False, inplace=True)

    # Re-rank topics by the updated sizes; the hierarchy list, when given,
    # is passed through so it gets permuted alongside the topics.
    if hierarchy is None:
        return self._reorder_topics(topic_vectors, topic_sizes, doc_top, doc_dist)
    else:
        return self._reorder_topics(topic_vectors, topic_sizes, doc_top, doc_dist, hierarchy)

def add_documents(self, documents, doc_ids=None):
"""
Update the model with new documents.
Expand Down Expand Up @@ -590,22 +611,10 @@ def add_documents(self, documents, doc_ids=None):
self.model.docvecs.init_sims()

# update topics
self.topic_vectors, self.doc_top, self.doc_dist, self.topic_sizes = self._assign_documents_to_topic(
document_vectors,
self.topic_vectors,
self.topic_sizes,
self.doc_top,
self.doc_dist)
self._assign_documents_to_topic(document_vectors, hierarchy=False)

if self.hierarchy is not None:
self.topic_vectors_reduced, self.doc_top_reduced, self.doc_dist_reduced, self.topic_sizes_reduced, \
self.hierarchy = self._assign_documents_to_topic(
document_vectors,
self.topic_vectors_reduced,
self.topic_sizes_reduced,
self.doc_top_reduced,
self.doc_dist_reduced,
self.hierarchy)
self._assign_documents_to_topic(document_vectors, hierarchy=True)

def delete_documents(self, doc_ids):
"""
Expand Down Expand Up @@ -656,22 +665,10 @@ def delete_documents(self, doc_ids):
self.model.docvecs.init_sims()

# update topics
self.topic_vectors, self.doc_top, self.doc_dist, self.topic_sizes = self._unassign_documents_from_topic(
doc_indexes,
self.doc_top,
self.doc_dist,
self.topic_sizes,
self.topic_vectors)
self._unassign_documents_from_topic(doc_indexes, hierarchy=False)

if self.hierarchy is not None:
self.topic_vectors_reduced, self.doc_top_reduced, \
self.doc_dist_reduced, self.topic_sizes_reduced, self.hierarchy = self._unassign_documents_from_topic(
doc_indexes,
self.doc_top_reduced,
self.doc_dist_reduced,
self.topic_sizes_reduced,
self.topic_vectors_reduced,
self.hierarchy)
self._unassign_documents_from_topic(doc_indexes, hierarchy=True)

def get_num_topics(self, reduced=False):
"""
Expand Down Expand Up @@ -910,14 +907,24 @@ def hierarchical_topic_reduction(self, num_topics):
doc_top = self._calculate_documents_topic(topic_vectors=top_vecs,
document_vectors=self.model.docvecs.vectors_docs,
dist=False)
top_vecs = np.vstack([self.model.docvecs.vectors_docs[np.where(doc_top == label)[0]].mean(axis=0)
for label in set(doc_top)])
self.topic_vectors_reduced, self.doc_top_reduced, self.doc_dist_reduced, self.topic_sizes_reduced, \
self.hierarchy = self._calculate_topic_sizes(topic_vectors=top_vecs,
hierarchy=hierarchy)
self.topic_words_reduced, self.topic_word_scores_reduced = self._find_topic_words_scores(
self.topic_vectors_reduced = np.vstack([self.model.docvecs.vectors_docs[np.where(doc_top == label)[0]]
.mean(axis=0) for label in set(doc_top)])

self.hierarchy = hierarchy

# assign documents to topic
self.doc_top_reduced, self.doc_dist_reduced = self._calculate_documents_topic(self.topic_vectors_reduced,
self.model.docvecs.vectors_docs)
# find topic words and scores
self.topic_words_reduced, self.topic_word_scores_reduced = self._find_topic_words_and_scores(
topic_vectors=self.topic_vectors_reduced)

# calculate topic sizes
self.topic_sizes_reduced = self._calculate_topic_sizes(hierarchy=True)

# re-order topics
self._reorder_topics(hierarchy=True)

return self.hierarchy

def search_documents_by_topic(self, topic_num, num_docs, return_documents=True, reduced=False):
Expand Down

0 comments on commit 2ff6d96

Please sign in to comment.