In [1]:
import pandas as pd
import numpy as np
import lda
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import bokeh.plotting as bp
from bokeh.plotting import save
from bokeh.models import HoverTool


In [2]:
df_cf = pd.read_csv('Data/CF_Fatma_label_confidence_judgments_finalized_clean_text_28_05_2018.csv')

In [3]:
X = df_cf.clean_text
y = df_cf.Fatma_violence

In [4]:
n_topics = 5 # number of topics
n_iter = 500 # number of iterations

# vectorizer: ignore English stopwords & words that occur less than 5 times
cvectorizer = CountVectorizer(min_df=5, stop_words='english')
cvz = cvectorizer.fit_transform(X)

# train an LDA model
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter)
X_topics = lda_model.fit_transform(cvz)

INFO:lda:n_documents: 1214
INFO:lda:vocab_size: 306
INFO:lda:n_words: 3569
INFO:lda:n_topics: 5
INFO:lda:n_iter: 500
  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:lda:<0> log likelihood: -32657
INFO:lda:<10> log likelihood: -22300
INFO:lda:<20> log likelihood: -21825
INFO:lda:<30> log likelihood: -21649
INFO:lda:<40> log likelihood: -21564
INFO:lda:<50> log likelihood: -21405
INFO:lda:<60> log likelihood: -21326
INFO:lda:<70> log likelihood: -21271
INFO:lda:<80> log likelihood: -21292
INFO:lda:<90> log likelihood: -21118
INFO:lda:<100> log likelihood: -21126
INFO:lda:<110> log likelihood: -21030
INFO:lda:<120> log likelihood: -21050
INFO:lda:<130> log likelihood: -21125
INFO:lda:<140> log likelihood: -21061
INFO:lda:<150> log likelihood: -21078
INFO:lda:<160> log likelihood: -21019
INFO:lda:<170> log likelihood: -21026
INFO:lda:<180> log likelihood: -21029
INFO:lda:<190> log likelihood: -20995
INFO:lda:<200> log likelihood: -20999
INFO:lda:<210> log likelihood: -20910
IN

In [9]:
# a t-SNE model
# angle value close to 1 means sacrificing accuracy for speed
# pca initializtion usually leads to better results 
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')

# 20-D -> 2-D
tsne_lda = tsne_model.fit_transform(X_topics)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1214 samples in 0.012s...
[t-SNE] Computed neighbors for 1214 samples in 0.185s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1214
[t-SNE] Computed conditional probabilities for sample 1214 / 1214
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 48.382034
[t-SNE] Error after 1000 iterations: -0.146787


In [10]:
n_top_words = 5 # number of keywords we show

# 20 colors
colormap = np.array(["#1f77b4", "#FF0000",'#6AA84F', '#E69138', '#A64D79'])#, 

In [11]:
_lda_keys = []
for i in range(X_topics.shape[0]):
  _lda_keys +=  X_topics[i].argmax(),

In [12]:
topic_summaries = []
topic_word = lda_model.topic_word_  # all topic words
vocab = cvectorizer.get_feature_names()
for i, topic_dist in enumerate(topic_word):
  topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1] # get!
  topic_summaries.append(' '.join(topic_words)) # append!

In [13]:
topic_word = lda_model.topic_word_ 

In [14]:
threshold = 0.5
_idx = np.amax(X_topics, axis=1) > threshold  # idx of doc that above the threshold
X_topics = X_topics[_idx]

In [15]:
topic_summaries

['police gas tear square ankara',
 'police live occupyturkey peaceful violence',
 'erdogan world says people news',
 'media erdogan going social make',
 'people police square right standing']

In [35]:
title = 'CF LDA viz'
num_example = len(X_topics)

plot_lda = bp.figure(plot_width=1400, plot_height=1100,
                     title=title,
                     tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

plot_lda.scatter(x=tsne_lda[:, 0], y=tsne_lda[:, 1],
                 color=colormap[_lda_keys][:num_example],
                 source=bp.ColumnDataSource({
                   "content": X[:num_example],
                   "topic_key": _lda_keys[:num_example]
                   }))

RuntimeError: 
Supplying a user-defined data source AND iterable values to glyph methods is
not possibe. Either:

Pass all data directly as literals:

    p.circe(x=a_list, y=an_array, ...)

Or, put all data in a ColumnDataSource and pass column names:

    source = ColumnDataSource(data=dict(x=a_list, y=an_array))
    p.circe(x='x', y='y', source=source, ...)



In [37]:
# randomly choose a news (within a topic) coordinate as the crucial words coordinate
topic_coord = np.empty((X_topics.shape[1], 2)) * np.nan
for topic_num in _lda_keys:
  if not np.isnan(topic_coord).any():
    break
  topic_coord[topic_num] = tsne_lda[_lda_keys.index(topic_num)]

# plot crucial words
for i in range(X_topics.shape[1]):
  plot_lda.text(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]])

# hover tools
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content - topic: @topic_key"}

# save the plot
save(plot_lda, '{}.html'.format(title))

  warn("save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN")
  warn("save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'")


'/Users/fatma/PhD/Git/semi_supervision/semi_supervision/CF LDA viz.html'

In [39]:
for i in range(X_topics.shape[1]):
  print(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]])

198.03646850585938 153.6648406982422 ['police erdogan gas tear media']
-178.315673828125 20.907649993896484 ['police people erdogan world square']


In [41]:
X_topics.shape

(917, 5)