In [1]:
import pandas as pd
import numpy as np
import lda
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import bokeh.plotting as bp
from bokeh.plotting import save
from bokeh.models import HoverTool


In [2]:
# Load the labelled CSV (path is relative to the notebook's working directory).
# The clean_text and Fatma_violence columns are the ones used further down.
df_cf = pd.read_csv('Data/CF_Fatma_label_confidence_judgments_finalized_clean_text_28_05_2018.csv')

In [15]:
# Preview the first rows to sanity-check the columns that were loaded.
df_cf.head()

Unnamed: 0.1,Unnamed: 0,_unit_id,_golden,_unit_state_x,_trusted_judgments,_last_judgment_at_x,violence_judgment,violence,violence_confidence,protest_judgment_x,protest,protest_confidence,created_at,id,proccd_text,Fatma_protest,Fatma_violence,clean_text
0,0,1657977939,False,finalized,3,3/30/2018 15:01:20,"""nan""",0.0,1.0,"""000""",0,1.0,Sat Jun 15 12:14:33 +0000 2013,3.46e+17,lawn goose clothes lot DD outfit pattern sewin...,0,0,lawn goose clothes lot outfit pattern sewing u...
1,1,1657977940,False,finalized,3,3/30/2018 15:11:00,"""010""",0.0,0.6606,"""111""",1,1.0,Tue Jun 04 16:04:35 +0000 2013,3.42e+17,rt usrId turkish police have blacked out id nu...,1,0,police blacked id numbers helmuts
2,2,1657977941,False,finalized,3,3/30/2018 15:05:25,"""000""",0.0,1.0,"""111""",1,1.0,Sat Jun 01 07:12:41 +0000 2013,3.41e+17,rt usrId #occupy #taksim protesters in #istanb...,1,0,occupy call akp govt
3,3,1657977942,False,finalized,3,04/06/18 11:49,"""1""",0.0,0.0,"""010""",0,0.6767,Sun Jun 09 14:53:50 +0000 2013,3.44e+17,rt usrId usrId erdogan says vandals entering m...,0,0,erdogan says vandals entering mosques beer bot...
4,4,1657977943,False,finalized,3,3/30/2018 15:33:35,"""nan""",0.0,1.0,"""000""",0,1.0,Sat Jun 01 22:08:17 +0000 2013,3.41e+17,photoset forest dreams turkey needs support as...,0,0,photoset forest dreams needs support always me...


In [29]:
# X: the cleaned text column fed to the vectorizer; y: the Fatma_violence label column
X, y = df_cf['clean_text'], df_cf['Fatma_violence']

In [30]:
n_topics = 5 # number of topics
n_iter = 500 # number of iterations

# vectorizer: drop English stop words and terms that appear in fewer than
# 5 documents (min_df is a document frequency, not a raw occurrence count)
cvectorizer = CountVectorizer(min_df=5, stop_words='english')
cvz = cvectorizer.fit_transform(X)

# train an LDA model on the document-term counts.
# The sampler is stochastic, so the seed is fixed (as the t-SNE step does with
# random_state=0) to make the topics reproducible across kernel restarts.
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter, random_state=0)
# X_topics: one row per document, one column per topic
X_topics = lda_model.fit_transform(cvz)

In [31]:
# t-SNE projection of the document-topic matrix.
# An angle value close to 1 sacrifices accuracy for speed;
# PCA initialization usually leads to better results.
tsne_model = TSNE(
    n_components=2,
    init='pca',
    angle=0.99,
    random_state=0,
    verbose=1,
)

# n_topics-D (5-D here) -> 2-D
tsne_lda = tsne_model.fit_transform(X_topics)

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 1214
[t-SNE] Computed conditional probabilities for sample 1214 / 1214
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 100 iterations with early exaggeration: 0.126401
[t-SNE] Error after 400 iterations: 0.126401


In [32]:
# how many keywords are shown per topic
n_top_words = 5

# one colour per topic (5 entries); indexed by topic number
colormap = np.array([
    "#1f77b4",
    "#FF0000",
    "#6AA84F",
    "#E69138",
    "#A64D79",
])

In [33]:
# Dominant topic (index of the largest topic weight) for every row of X_topics.
# Vectorized argmax replaces the per-row loop (which relied on the Python-2-only
# xrange); the result stays a plain list because later cells call .index() on it.
_lda_keys = X_topics.argmax(axis=1).tolist()

In [34]:
topic_summaries = []
topic_word = lda_model.topic_word_  # one row of word weights per topic
# CountVectorizer.get_feature_names() was removed in newer scikit-learn releases;
# use the replacement when it exists and fall back to the old name otherwise.
if hasattr(cvectorizer, 'get_feature_names_out'):
  vocab = list(cvectorizer.get_feature_names_out())
else:
  vocab = cvectorizer.get_feature_names()
_vocab_arr = np.array(vocab)  # built once instead of once per topic
for topic_dist in topic_word:
  # indices of the n_top_words largest weights, heaviest first
  top_idx = np.argsort(topic_dist)[::-1][:n_top_words]
  topic_summaries.append(' '.join(_vocab_arr[top_idx]))

In [35]:
# Rebinds topic_word to the model's topic-word matrix; this repeats the
# assignment already made in the topic-summary cell.
topic_word = lda_model.topic_word_ 

In [36]:
threshold = 0.5
# Start from the model's full document-topic matrix (what fit_transform returned)
# rather than from X_topics itself, so re-running this cell does not compute the
# mask from an already-filtered X_topics.
_doc_topic = lda_model.doc_topic_
# boolean mask over ALL documents: True where the dominant topic weight exceeds the threshold
_idx = np.amax(_doc_topic, axis=1) > threshold
# NOTE: tsne_lda and _lda_keys were computed before this filter and still cover
# every document; index them with _idx to keep them aligned with X_topics.
X_topics = _doc_topic[_idx]

In [37]:
# Display the top-keyword string built for each topic.
topic_summaries

[u'erdogan media minister world prime',
 u'erdogan people solidarity says stop',
 u'police square government ankara people',
 u'gas police tear people water',
 u'bbc occupyturkey world live news']

In [38]:
title = 'CF LDA viz'
# number of documents that passed the threshold filter (rows left in X_topics)
num_example = len(X_topics)

plot_lda = bp.figure(plot_width=1400, plot_height=1100,
                     title=title,
                     tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

# tsne_lda and _lda_keys hold one entry per ORIGINAL document, while X_topics was
# filtered afterwards. Select the same documents everywhere with the boolean
# mask _idx: slicing with [:num_example] would pair coordinates, colours and
# hover text from different documents and give columns of different lengths.
_plot_keys = np.asarray(_lda_keys)[_idx]

# Every column lives in one ColumnDataSource so all glyph fields share a length.
plot_lda.scatter(x="x", y="y", color="topic_color",
                 source=bp.ColumnDataSource({
                   "x": tsne_lda[_idx, 0],
                   "y": tsne_lda[_idx, 1],
                   "topic_color": colormap[_plot_keys],
                   "content": np.asarray(X)[_idx],
                   "topic_key": _plot_keys
                   }))

<bokeh.models.renderers.GlyphRenderer at 0x7fb98b215390>

In [39]:
# randomly choose a news (within a topic) coordinate as the crucial words coordinate
topic_coord = np.empty((X_topics.shape[1], 2)) * np.nan
for topic_num in _lda_keys:
  if not np.isnan(topic_coord).any():
    break
  topic_coord[topic_num] = tsne_lda[_lda_keys.index(topic_num)]

# plot crucial words
for i in xrange(X_topics.shape[1]):
  plot_lda.text(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]])

# hover tools
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content - topic: @topic_key"}

# save the plot
save(plot_lda, '{}.html'.format(title))

In [40]:
# Print each topic's label coordinates next to its keyword summary.
# range() replaces the Python-2-only xrange so the cell runs on both Python 2 and 3.
for i in range(X_topics.shape[1]):
  print(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]])

(2.9418838162883194, -34.415646587183147, [u'erdogan media minister world prime'])
(-0.918207332198132, -8.0732219821592217, [u'erdogan people solidarity says stop'])
(75.5093946707415, 26.509066386176386, [u'police square government ankara people'])
(-12.131781009558738, 73.075920254828816, [u'gas police tear people water'])
(-3.1848470633987791, 3.5888876298540859, [u'bbc occupyturkey world live news'])


In [41]:
# (rows kept by the threshold filter, number of topics)
X_topics.shape

(917, 5)