In [13]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer
import bokeh.plotting as bp
from bokeh.plotting import save, show, output_notebook
from bokeh.models import HoverTool
from sklearn.manifold import TSNE
import logging
import warnings

output_notebook()
logging.getLogger("lda").setLevel(logging.WARNING)
warnings.filterwarnings("ignore", category=DeprecationWarning)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(CountVectorizer, self).build_analyzer()
        def stem (doc):
            return [stemmer.stem(x) for x in analyzer(doc)]
        return stem

In [21]:
muestra = 10000
comentarios = pd.read_csv('./data/20-400-%s.csv'%muestra)['comentario']
cvectorizer = pickle.load(open("./data/cvz-20-400-%s.pickle"%muestra, "rb"))
cvz = pickle.load(open("./data/cvzm-20-400-%s.pickle"%muestra, "rb"))
lda_topics = pickle.load(open("./data/ldadata-20-400-%s.pickle"%muestra, "rb"))
lda_model = pickle.load(open("./data/ldamodel-20-400-%s.pickle"%muestra, "rb"))
lsa_topics = pickle.load(open("./data/ldadata-20-400-%s.pickle"%muestra, "rb"))
lsa_model = pickle.load(open("./data/ldamodel-20-400-%s.pickle"%muestra, "rb"))

In [22]:
cvz.shape

(10000, 3856)

In [23]:
# threshold = 0.4
# _idx = np.amax(lda_topics, axis=1) > threshold
# lda_topics = lda_topics[_idx]

tsne_model = TSNE(n_components=2, verbose=1, random_state=0, init='pca')
tsne_lda = tsne_model.fit_transform(lda_topics)

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 10000
[t-SNE] Computed conditional probabilities for sample 2000 / 10000
[t-SNE] Computed conditional probabilities for sample 3000 / 10000
[t-SNE] Computed conditional probabilities for sample 4000 / 10000
[t-SNE] Computed conditional probabilities for sample 5000 / 10000
[t-SNE] Computed conditional probabilities for sample 6000 / 10000
[t-SNE] Computed conditional probabilities for sample 7000 / 10000
[t-SNE] Computed conditional probabilities for sample 8000 / 10000
[t-SNE] Computed conditional probabilities for sample 9000 / 10000
[t-SNE] Computed conditional probabilities for sample 10000 / 10000
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 100 iterations with early exaggeration: 1.333365
[t-SNE] Error after 325 iterations: 1.333365


In [24]:
n_top_words = 3

colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"
])

_lda_keys = []
for i in xrange(lda_topics.shape[0]):
  _lda_keys +=  lda_topics[i].argmax(),

In [25]:
topic_summaries = []
topic_word = lda_model.topic_word_
vocab = cvectorizer.get_feature_names()
for i, topic_dist in enumerate(topic_word):
  topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
  topic_summaries.append(' '.join(topic_words))

IndexError: index 3908 is out of bounds for axis 1 with size 3856

## LDA

In [None]:
title = 'Comentarios a municipios del conurbano'
num_example = len(lda_topics)

plot_lda = bp.figure(plot_width=900, plot_height=900,
                     title=title,
                     tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

plot_lda.scatter(x=tsne_lda[:, 0], y=tsne_lda[:, 1],
                 color=colormap[_lda_keys][:num_example],
                 source=bp.ColumnDataSource({
                   "content": comentarios[:num_example],
                   "topic_key": _lda_keys[:num_example]
                   }))

In [None]:
# output to static HTML file
output_file("./output/%s.html", title="line plot example")

# create a new plot with a title and axis labels
p = figure(title="simple line example", x_axis_label='x', y_axis_label='y')

# add a line renderer with legend and line thickness
p.line(x, y, legend="Temp.", line_width=2)

# show the results
show(p)

In [None]:
topic_coord = np.empty((lda_topics.shape[1], 2)) * np.nan
for topic_num in _lda_keys:
  if not np.isnan(topic_coord).any():
    break
  topic_coord[topic_num] = tsne_lda[_lda_keys.index(topic_num)]

for i in xrange(lda_topics.shape[1]):
  plot_lda.text(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]])

hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content - topic: @topic_key"}

# save(plot_lda, '{}.html'.format(title))
show(plot_lda)

## LSA scatter clusters

In [9]:
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_tfidf = tsne_model.fit_transform(lsa_topics)

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 25000
[t-SNE] Computed conditional probabilities for sample 2000 / 25000
[t-SNE] Computed conditional probabilities for sample 3000 / 25000
[t-SNE] Computed conditional probabilities for sample 4000 / 25000
[t-SNE] Computed conditional probabilities for sample 5000 / 25000
[t-SNE] Computed conditional probabilities for sample 6000 / 25000
[t-SNE] Computed conditional probabilities for sample 7000 / 25000
[t-SNE] Computed conditional probabilities for sample 8000 / 25000
[t-SNE] Computed conditional probabilities for sample 9000 / 25000
[t-SNE] Computed conditional probabilities for sample 10000 / 25000
[t-SNE] Computed conditional probabilities for sample 11000 / 25000
[t-SNE] Computed conditional probabilities for sample 12000 / 25000
[t-SNE] Computed conditional probabilities for sample 13000 / 25000
[t-SNE] Computed conditional probabilities 

MemoryError: 

In [None]:
plot_tfidf = bp.figure(plot_width=900, plot_height=900, title="tf-idf clustering of the news",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

In [None]:
tfidf_df = pd.DataFrame(tsne_tfidf, columns=['x', 'y'])
tfidf_df['description'] = comentarios

In [None]:
plot_tfidf.scatter(x='x', y='y', source=tfidf_df,color=colormap)
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"comentario": "@description"}
show(plot_tfidf)