In [3]:
import pandas as pd
import numpy as np
import lda
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import bokeh.plotting as bp
from bokeh.plotting import save
from bokeh.models import HoverTool
import os

In [39]:
data_list = []
directory = os.path.join("Data/tweets_by_date/")
for root,dirs,files in os.walk(directory):
    for file in files:
       if file.endswith(".csv"):
           dataframe = pd.read_csv("Data/tweets_by_date/" + file)
           data_list.append(dataframe.values.tolist())


In [58]:
len(data_list[1])

135

In [72]:
Data = []
for i in data_list:
                Data.append(''.join("".join(e) for e in i))

In [73]:
len(Data)

26

In [75]:
X = Data

In [76]:
n_topics =5 # number of topics
n_iter = 500 # number of iterations

# vectorizer: ignore English stopwords & words that occur less than 5 times
cvectorizer = CountVectorizer(min_df=3, stop_words='english')
cvz = cvectorizer.fit_transform(X)

# train an LDA model
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter)
X_topics = lda_model.fit_transform(cvz)

INFO:lda:n_documents: 26
INFO:lda:vocab_size: 441
INFO:lda:n_words: 3066
INFO:lda:n_topics: 5
INFO:lda:n_iter: 500
  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:lda:<0> log likelihood: -27664
INFO:lda:<10> log likelihood: -20469
INFO:lda:<20> log likelihood: -20082
INFO:lda:<30> log likelihood: -19999
INFO:lda:<40> log likelihood: -19977
INFO:lda:<50> log likelihood: -19916
INFO:lda:<60> log likelihood: -19951
INFO:lda:<70> log likelihood: -19946
INFO:lda:<80> log likelihood: -19805
INFO:lda:<90> log likelihood: -19814
INFO:lda:<100> log likelihood: -19855
INFO:lda:<110> log likelihood: -19882
INFO:lda:<120> log likelihood: -19958
INFO:lda:<130> log likelihood: -19954
INFO:lda:<140> log likelihood: -19893
INFO:lda:<150> log likelihood: -19900
INFO:lda:<160> log likelihood: -20023
INFO:lda:<170> log likelihood: -19922
INFO:lda:<180> log likelihood: -19761
INFO:lda:<190> log likelihood: -19868
INFO:lda:<200> log likelihood: -19833
INFO:lda:<210> log likelihood: -19836
INFO

In [77]:
# a t-SNE model
# angle value close to 1 means sacrificing accuracy for speed
# pca initializtion usually leads to better results 
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')

# 20-D -> 2-D
tsne_lda = tsne_model.fit_transform(X_topics)

[t-SNE] Computing 25 nearest neighbors...
[t-SNE] Indexed 26 samples in 0.001s...
[t-SNE] Computed neighbors for 26 samples in 0.211s...
[t-SNE] Computed conditional probabilities for sample 26 / 26
[t-SNE] Mean sigma: 1125899906842624.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 45.965565
[t-SNE] Error after 950 iterations: 0.579410


In [78]:
n_top_words = 5 # number of keywords we show

# 20 colors
colormap = np.array(["#1f77b4", "#FF0000", '#6AA84F', '#E69138', '#A64D79'])#, 

In [79]:
_lda_keys = []
for i in range(X_topics.shape[0]):
  _lda_keys +=  X_topics[i].argmax(),

In [80]:
topic_summaries = []
topic_word = lda_model.topic_word_  # all topic words
vocab = cvectorizer.get_feature_names()
for i, topic_dist in enumerate(topic_word):
  topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1] # get!
  topic_summaries.append(' '.join(topic_words)) # append!

In [81]:
topic_word = lda_model.topic_word_ 

In [82]:
threshold = 0.5
_idx = np.amax(X_topics, axis=1) > threshold  # idx of doc that above the threshold
X_topics = X_topics[_idx]

In [83]:
topic_summaries

['erdogan says minister man syria',
 'world live government ankara day',
 'like need new arrested understand',
 'media peaceful social video international',
 'police gas people tear square']

In [85]:
_lda_keys

[4, 4, 1, 4, 3, 3, 0, 1, 4, 1, 0, 4, 0, 0, 0, 4, 4, 4, 0, 0, 0, 1, 1, 1, 0, 2]

In [107]:
title = 'CF pooling LDA viz'
num_example = len(X_topics)

plot_lda = bp.figure(plot_width=1400, plot_height=1100,
                     title=title,
                     tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

#source = ColumnDataSource(data=dict(x=a_list, y=an_array))
data_dict = dict(x=tsne_lda[:, 0], y=tsne_lda[:, 1])
data = pd.DataFrame(data_dict, columns = ['content', 'topic'])

source = bp.ColumnDataSource(data=data)

plot_lda.scatter(x=tsne_lda[:, 0], y=tsne_lda[:, 1],
                 color=colormap[_lda_keys][:num_example])
#plot_lda.scatter(x='x', y='y',
#                 color=colormap[_lda_keys][:num_example], source=source)



In [108]:
# randomly choose a news (within a topic) coordinate as the crucial words coordinate
topic_coord = np.empty((X_topics.shape[1], 2)) * np.nan
for topic_num in _lda_keys:
  if not np.isnan(topic_coord).any():
    break
  topic_coord[topic_num] = tsne_lda[_lda_keys.index(topic_num)]

# plot crucial words
for i in range(X_topics.shape[1]):
  plot_lda.text(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]])

# hover tools
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content - topic: @topic_key"}

# save the plot
save(plot_lda, '{}.html'.format(title))

  warn("save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN")
  warn("save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'")


'/Users/fatma/PhD/Git/semi_supervision/semi_supervision/CF pooling LDA viz.html'