# Generate topics by clustering documents

In [89]:
import os
import numpy as np
from gensim.models import doc2vec
from gensim.utils import simple_preprocess
from nltk.cluster import kmeans
from nltk.cluster import util
import collections

In [90]:
# generic settings
# Base directory; the corpus and model file paths are built relative to it.
HOMEDIR = './'

In [91]:
# Text corpus that is read line by line; each line becomes one token list.
CORPUS_FILE = os.path.join(HOMEDIR, "data/train_docs.txt")
# Previously saved Doc2Vec model; this notebook loads it rather than training one.
MODEL_FILE = os.path.join(HOMEDIR, "models/doc2vec_dbow_v20171229.bin")

# Number of k-means clusters requested; the cluster means are later used as topic vectors.
# Note that the plotting palette defined further down lists 20 colors.
NUM_CLUSTERS = 20

### Read corpus file and parse into token lists

In [92]:
# Read the raw corpus lines; they are kept because the plot shows them as hover text.
with open(CORPUS_FILE, 'r', encoding='utf-8') as f:
    lines = list(f)

# Tokenize every line. This does not need the file handle, so it runs after the file is closed.
docs = [
    simple_preprocess(raw_line, deacc=False, min_len=1)
    for raw_line in lines
]

### Load the existing model and infer document vectors for the corpus

In [93]:
# Load the pre-trained Doc2Vec model from MODEL_FILE.
model = doc2vec.Doc2Vec.load(MODEL_FILE)

# Discard temporary training data; the flags ask to keep the doctag vectors
# and whatever is needed for inference, which the next cell relies on.
model.delete_temporary_training_data(
    keep_doctags_vectors=True,
    keep_inference=True,
)

In [94]:
# Infer one vector per tokenized document using the loaded model.
docvecs = [
    model.infer_vector(tokens, alpha=0.01, steps=1000)
    for tokens in docs
]

## Now that we have document vectors, start clustering

In [95]:
# NLTK k-means clusterer: NUM_CLUSTERS clusters, cosine distance, repeats=3
# (presumably the number of randomized runs; see the NLTK API docs to confirm).
clusterer = kmeans.KMeansClusterer(
    NUM_CLUSTERS,
    distance=util.cosine_distance,
    repeats=3,
)

In [96]:
# Cluster the document vectors. The result is indexed by document position
# elsewhere in this notebook, i.e. it holds one cluster id per document.
cluster_assignments = clusterer.cluster(docvecs, assign_clusters=True)

In [97]:
# How many documents were assigned to each cluster? (Displayed as the cell output.)
collections.Counter(cluster_assignments)

Counter({0: 31,
         1: 68,
         2: 6,
         3: 53,
         4: 34,
         5: 106,
         6: 75,
         7: 80,
         8: 48,
         9: 20,
         10: 45,
         11: 47,
         12: 30,
         13: 75,
         14: 23,
         15: 28,
         16: 32,
         17: 111,
         18: 32,
         19: 56})

In [98]:
def get_documents_in_cluster(cluster_idx):
    """
    Return the tokenized documents whose assigned cluster equals ``cluster_idx``.

    Reads the notebook-level ``docs`` and ``cluster_assignments``; the two are
    matched up by position.
    """
    members = []
    for position, tokens in enumerate(docs):
        if cluster_assignments[position] == cluster_idx:
            members.append(tokens)
    return members

In [99]:
def get_document_topics(doc_vec, topic_vecs):
    """
    For a given document, give the topic distribution (softmax probabilities for all topics).

    The similarity between the document and a topic is the dot product of their
    vectors; the similarities are then normalized with a softmax.

    Args:
        doc_vec: Document vector.
        topic_vecs: Iterable of topic vectors compatible with ``doc_vec`` for ``np.dot``.

    Returns:
        Numpy array with one probability per topic (empty if there are no topics).
    """
    similarities = np.asarray([np.dot(doc_vec, topic_vec) for topic_vec in topic_vecs])
    if similarities.size == 0:
        # No topics: nothing to normalize (and np.max would raise on an empty array).
        return np.exp(similarities)
    # Subtracting the maximum leaves the softmax unchanged mathematically but keeps
    # np.exp from overflowing to inf (which would produce nan after the division).
    exp_shifted = np.exp(similarities - np.max(similarities))
    return exp_shifted / np.sum(exp_shifted)

In [100]:
# Use the cluster means reported by the clusterer as the topic vectors.
topic_vecs = clusterer.means()

# Visualize topics using t-SNE

In [101]:
from sklearn.manifold import TSNE
import bokeh.plotting as bp
from bokeh.models import HoverTool
from bokeh.io import push_notebook, output_notebook, show

In [102]:
# Project the document vectors to two dimensions with t-SNE so they can be plotted.
tsne_model = TSNE(n_components=2, perplexity=30, init='pca')
docs_tsne = tsne_model.fit_transform(docvecs)
docs_tsne.shape

(1000, 2)

In [103]:
# Topic distribution for every document: one row per document, one entry per topic.
doc_topic_matrix = [
    get_document_topics(vec, topic_vecs)
    for vec in docvecs
]
# Probability of the most likely topic for each document.
prob_max_topic = np.asarray(doc_topic_matrix).max(axis=1)

In [104]:
# Palette of 20 hex colors; it is indexed by cluster id when the plot data is built.
colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"
])

In [105]:
# Column data for the scatter plot, one entry per document.
sourcedata = {
    'x': docs_tsne[:, 0],
    'y': docs_tsne[:, 1],
    # Wrap cluster ids around the palette so that configuring more clusters than
    # there are colors reuses colors instead of raising IndexError.
    'color': colormap[np.asarray(cluster_assignments) % len(colormap)],
    # NOTE: despite its name, this column is passed as the marker size in the plot.
    'alpha': prob_max_topic * 50,
    'content': lines,
    'topic_key': cluster_assignments
}

### Make and show the plot

In [106]:
# Wrap the plot data so the glyph can refer to its columns by name.
plot_source = bp.ColumnDataSource(sourcedata)

tsne_plot = bp.figure(
    plot_width=1600,
    plot_height=900,
    title="Topics",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None,
    y_axis_type=None,
    min_border=1,
)

# One marker per document: position from t-SNE, color from the cluster,
# size from the 'alpha' column (scaled top-topic probability).
tsne_plot.scatter(
    x='x',
    y='y',
    color='color',
    size='alpha',
    source=plot_source,
)

# Show the document text and its cluster id when hovering over a marker.
hover = tsne_plot.select({'type': HoverTool})
hover.tooltips = {"content": "@content - topic: @topic_key"}

show(tsne_plot)