In [3]:
import pandas as pd
import numpy as np
import lda
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import bokeh.plotting as bp
from bokeh.plotting import save
from bokeh.models import HoverTool
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

In [4]:
df_cf = pd.read_csv('Data/CF_Fatma_label_confidence_judgments_finalized_clean_text_28_05_2018.csv')

In [5]:
X = df_cf.clean_text
y = df_cf.Fatma_violence

In [6]:
n_topics = 5 # number of topics
n_iter = 500 # number of iterations

# vectorizer: ignore English stopwords & words that occur less than 5 times
cvectorizer = CountVectorizer(min_df=5, stop_words='english')
cvz = cvectorizer.fit_transform(X)

In [8]:
# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=2)
lsi_Z = lsi_model.fit_transform(cvz)
print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(1214, 2)


In [9]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])

In [13]:
print("LSI Model:")
print_topics(lsi_model, cvectorizer)
print("=" * 20)

LSI Model:
Topic 0:
[('police', 0.8718559827979699), ('square', 0.21717426303253998), ('gas', 0.18809957259565835), ('people', 0.15608235601890091), ('tear', 0.15068500879453584), ('erdogan', 0.1155696952938807), ('riot', 0.09498139903205544), ('ankara', 0.08247365130980609), ('violence', 0.07844331440882524), ('attack', 0.07447798949860095)]
Topic 1:
[('erdogan', 0.8920813082451169), ('says', 0.17563794873778169), ('people', 0.1646154052280732), ('minister', 0.12248343662323155), ('prime', 0.10181764908384121), ('media', 0.09686328295434486), ('world', 0.0814756982842812), ('news', 0.06532936629689211), ('opposition', 0.058442805839804095), ('united', 0.0509373407063768)]


In [14]:
# a t-SNE model
# angle value close to 1 means sacrificing accuracy for speed
# pca initializtion usually leads to better results 
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')

# 20-D -> 2-D
tsne_lda = tsne_model.fit_transform(lsi_Z)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1214 samples in 0.005s...
[t-SNE] Computed neighbors for 1214 samples in 0.037s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1214
[t-SNE] Computed conditional probabilities for sample 1214 / 1214
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 51.679901
[t-SNE] Error after 1000 iterations: 0.309834


In [15]:
n_top_words = 5 # number of keywords we show

# 20 colors
colormap = np.array(["#1f77b4", "#FF0000"]) #, '#6AA84F', '#E69138', '#A64D79'

In [18]:
_lda_keys = []
for i in range(lsi_Z.shape[0]):
  _lda_keys +=  lsi_Z[i].argmax(),

In [22]:
topic_summaries = []
topic_word = lsi_model.components_# all topic words
vocab = cvectorizer.get_feature_names()
for i, topic_dist in enumerate(topic_word):
  topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1] # get!
  topic_summaries.append(' '.join(topic_words)) # append!

In [24]:
threshold = 0.5
_idx = np.amax(lsi_Z, axis=1) > threshold  # idx of doc that above the threshold
X_topics = lsi_Z[_idx]

In [25]:
topic_summaries

['police square gas people tear', 'erdogan says people minister prime']

In [26]:
title = 'CF LDA viz'
num_example = len(X_topics)

plot_lda = bp.figure(plot_width=1400, plot_height=1100,
                     title=title,
                     tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

plot_lda.scatter(x=tsne_lda[:, 0], y=tsne_lda[:, 1],
                 color=colormap[_lda_keys][:num_example],
                 source=bp.ColumnDataSource({
                   "content": X[:num_example],
                   "topic_key": _lda_keys[:num_example]
                   }))

RuntimeError: 
Supplying a user-defined data source AND iterable values to glyph methods is
not possibe. Either:

Pass all data directly as literals:

    p.circe(x=a_list, y=an_array, ...)

Or, put all data in a ColumnDataSource and pass column names:

    source = ColumnDataSource(data=dict(x=a_list, y=an_array))
    p.circe(x='x', y='y', source=source, ...)

