In [1]:
import pandas as pd
import numpy as np
import lda
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import bokeh.plotting as bp
from bokeh.plotting import save
from bokeh.models import HoverTool

In [2]:
# Load the pre-processed tweet CSV into a DataFrame (relative path, resolved against the working directory).
df_cf = pd.read_csv('Data/unlabeled_turkish_tweets_processd.csv')

In [3]:
# Remove exact duplicate rows first, then every row that still has a missing value.
df_cf = df_cf.drop_duplicates().dropna()

In [18]:
# Work on a 10% random subsample. random_state pins the draw so that
# "Restart & Run All" selects the same rows on every run.
# NOTE: df_cf is overwritten, so re-running only this cell samples the sample again.
df_cf = df_cf.sample(frac=0.1, random_state=0)

# text column that is vectorized for the topic model
X = df_cf.clean_text

# label column, kept row-aligned with X
y = df_cf.Fatma_violence


In [19]:
# number of documents left after subsampling
len(X)

2639

In [20]:
n_topics = 2 # number of topics
n_iter = 500 # number of iterations

# vectorizer: drop English stop words and any term found in fewer than 5 documents
# (min_df is a document-frequency cutoff, not a raw occurrence count)
cvectorizer = CountVectorizer(min_df=5, stop_words='english')
cvz = cvectorizer.fit_transform(X)

# train an LDA model; the seed is fixed so the fitted topics (and everything
# derived from them further down) are the same on every run
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter, random_state=0)
# X_topics: one row per document, one column per topic
X_topics = lda_model.fit_transform(cvz)

INFO:lda:n_documents: 2639
INFO:lda:vocab_size: 688
INFO:lda:n_words: 13511
INFO:lda:n_topics: 2
INFO:lda:n_iter: 500
  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:lda:<0> log likelihood: -94833
INFO:lda:<10> log likelihood: -81388
INFO:lda:<20> log likelihood: -80903
INFO:lda:<30> log likelihood: -80515
INFO:lda:<40> log likelihood: -80284
INFO:lda:<50> log likelihood: -80265
INFO:lda:<60> log likelihood: -80103
INFO:lda:<70> log likelihood: -79986
INFO:lda:<80> log likelihood: -79914
INFO:lda:<90> log likelihood: -79789
INFO:lda:<100> log likelihood: -79934
INFO:lda:<110> log likelihood: -79536
INFO:lda:<120> log likelihood: -79610
INFO:lda:<130> log likelihood: -79503
INFO:lda:<140> log likelihood: -79427
INFO:lda:<150> log likelihood: -79550
INFO:lda:<160> log likelihood: -79433
INFO:lda:<170> log likelihood: -79272
INFO:lda:<180> log likelihood: -79333
INFO:lda:<190> log likelihood: -79302
INFO:lda:<200> log likelihood: -79268
INFO:lda:<210> log likelihood: -79236
I

In [21]:
# t-SNE projection of the document-topic matrix.
# - angle near 1 trades accuracy for speed
# - PCA initialization usually leads to better results
# - random_state fixes the embedding between runs
tsne_model = TSNE(
  n_components=2,
  init='pca',
  angle=.99,
  random_state=0,
  verbose=1,
)

# (n_topics)-D -> 2-D: one 2-D coordinate per row of X_topics
tsne_lda = tsne_model.fit_transform(X_topics)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 2639 samples in 0.002s...
[t-SNE] Computed neighbors for 2639 samples in 0.099s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2639
[t-SNE] Computed conditional probabilities for sample 2000 / 2639
[t-SNE] Computed conditional probabilities for sample 2639 / 2639
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 18.427883
[t-SNE] Error after 1000 iterations: -5.556423


In [22]:
# how many of the highest-weighted words to list per topic
n_top_words = 5

# one hex colour per topic id (5 entries, so topic ids 0-4 can be coloured)
colormap = np.array([
  "#1f77b4",
  "#FF0000",
  '#6AA84F',
  '#E69138',
  '#A64D79',
])

In [23]:
# Dominant topic id of every document: row-wise argmax over the document-topic
# matrix, kept as a plain list because later cells call list methods on it.
_lda_keys = list(X_topics.argmax(axis=1))

In [24]:
topic_word = lda_model.topic_word_  # all topic words

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when the
# installed version provides it and fall back to the old name otherwise.
_feature_names = getattr(cvectorizer, 'get_feature_names_out', None) or cvectorizer.get_feature_names
vocab = list(_feature_names())
vocab_arr = np.array(vocab)

# For each topic, join its n_top_words highest-weighted words (heaviest first).
topic_summaries = []
for topic_dist in topic_word:
  top_idx = np.argsort(topic_dist)[::-1][:n_top_words]
  topic_summaries.append(' '.join(vocab_arr[top_idx]))

In [25]:
# NOTE(review): repeats the assignment already made in the topic-summary cell; looks safe to delete.
topic_word = lda_model.topic_word_ 

In [26]:
threshold = 0.5
# keep only documents whose strongest topic probability exceeds the threshold
_idx = np.amax(X_topics, axis=1) > threshold

# tsne_lda, _lda_keys, X and y are row-aligned with X_topics. Masking only X_topics
# left the others at full length, so the plot paired points with another document's
# colour/text. Apply the same mask to all of them.
assert len(_idx) == len(tsne_lda) == len(_lda_keys) == len(X) == len(y), \
  "per-document arrays are out of sync - re-run the cells above in order"
X_topics = X_topics[_idx]
tsne_lda = tsne_lda[_idx]
_lda_keys = [key for key, keep in zip(_lda_keys, _idx) if keep]
X = X[_idx]  # boolean ndarray mask: selects by position
y = y[_idx]

In [27]:
# display the top-keyword string built for each topic
topic_summaries

['occupygezi police turkey istanbul taksim',
 'turkey erdogan protests istanbul occupygezi']

In [22]:
title = 'unlabeled turkish LDA viz'
num_example = len(X_topics)

# NOTE(review): plot_width/plot_height and the 'previewsave' tool are spellings from
# older Bokeh releases - confirm against the installed version before upgrading.
plot_lda = bp.figure(plot_width=1400, plot_height=1100,
                     title=title,
                     tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

# All per-point columns go through one ColumnDataSource, each cut to the same
# num_example rows. Previously x/y used every row while colour, content and
# topic_key were truncated, so the column lengths could disagree.
topic_keys = np.asarray(_lda_keys[:num_example], dtype=int)
plot_lda.scatter(x="x", y="y", color="color",
                 source=bp.ColumnDataSource({
                   "x": tsne_lda[:num_example, 0],
                   "y": tsne_lda[:num_example, 1],
                   "color": colormap[topic_keys],  # topic id -> hex colour
                   "content": X[:num_example],
                   "topic_key": topic_keys
                   }))

<bokeh.models.renderers.GlyphRenderer at 0x7fdd893935d0>

In [23]:
# Anchor each topic's keyword label at the t-SNE position of the first document
# assigned to that topic (rows with no such document stay NaN).
topic_coord = np.full((X_topics.shape[1], 2), np.nan)
for doc_pos, topic_num in enumerate(_lda_keys):
  if np.isnan(topic_coord[topic_num]).any():
    topic_coord[topic_num] = tsne_lda[doc_pos]
  if not np.isnan(topic_coord).any():
    break  # every topic has a coordinate, no need to scan further

# plot crucial words
for i in range(X_topics.shape[1]):
  if np.isnan(topic_coord[i]).any():
    continue  # no document was assigned to this topic, so there is nowhere to put its label
  plot_lda.text(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]])

# hover tools: show the document text and its dominant topic id
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content - topic: @topic_key"}

# save the plot as a standalone HTML file named after the title
save(plot_lda, '{}.html'.format(title))

