In [3]:
import pandas as pd
import numpy as np
import lda
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import bokeh.plotting as bp
from bokeh.plotting import save
from bokeh.models import HoverTool
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

In [28]:
df_cf_data = pd.read_csv('Data/CF_Fatma_label_confidence_judgments_finalized_clean_text_28_05_2018.csv')

In [29]:
df_unlabeled_tweets = pd.read_csv('Data/unlabeled_turkish_tweets_processd.csv')
df_unlabeled_tweets = df_unlabeled_tweets.dropna()
df_unlabeled_tweets = df_unlabeled_tweets.drop_duplicates()

In [30]:
df_cf_data = df_cf_data[['clean_text', 'Fatma_violence']]
df_unlabeled_tweets = df_unlabeled_tweets[['clean_text', 'Fatma_violence']]
df_tweets = pd.concat([df_cf_data, df_unlabeled_tweets])

### SVD Labeled Data

In [41]:
n_topics = 4 # number of topics
n_iter = 500 # number of iterations

# vectorizer: ignore English stopwords & words that occur less than 5 times
cvectorizer = CountVectorizer(min_df=5, stop_words='english')
cvz = cvectorizer.fit_transform(df_cf_data.clean_text)

In [42]:
# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=4)
lsi_Z = lsi_model.fit_transform(cvz)
print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(1214, 4)


In [43]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])

In [44]:
print("LSI Model:")
print_topics(lsi_model, cvectorizer)
print("=" * 20)

LSI Model:
Topic 0:
[('police', 0.8718562469979186), ('square', 0.2171728374096674), ('gas', 0.18810026703033117), ('people', 0.15608124809970003), ('tear', 0.15068607490458125), ('erdogan', 0.11556924004007604), ('riot', 0.09498163735892766), ('ankara', 0.08247301914043668), ('violence', 0.07844153044830099), ('attack', 0.0744780137704591)]
Topic 1:
[('erdogan', 0.8920633089673118), ('says', 0.175582091957604), ('people', 0.16463127373564582), ('minister', 0.12261595810007266), ('prime', 0.10193634253516726), ('media', 0.09693102467495528), ('world', 0.08141696379309585), ('news', 0.06526952441576764), ('opposition', 0.05843772471424674), ('united', 0.050931761430265676)]
Topic 2:
[('need', 0.5408581648891233), ('understand', 0.5199191929407079), ('things', 0.5196089446418974), ('violence', 0.1081985882773863), ('want', 0.10407347116717146), ('stop', 0.1018786148411038), ('good', 0.0986287209253838), ('burgers', 0.09084919767210617), ('nice', 0.08977010054938288), ('better', 0.0894878

In [46]:
# a t-SNE model
# angle value close to 1 means sacrificing accuracy for speed
# pca initializtion usually leads to better results 
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')

# 20-D -> 2-D
tsne_lda = tsne_model.fit_transform(lsi_Z)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1214 samples in 0.001s...
[t-SNE] Computed neighbors for 1214 samples in 0.040s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1214
[t-SNE] Computed conditional probabilities for sample 1214 / 1214
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 54.380810
[t-SNE] Error after 1000 iterations: 0.405724


In [47]:
n_top_words = 5 # number of keywords we show

# 20 colors
colormap = np.array(["#1f77b4", "#FF0000",'#6AA84F', '#E69138', '#A64D79']) #, 

In [48]:
_lda_keys = []
for i in range(lsi_Z.shape[0]):
  _lda_keys +=  lsi_Z[i].argmax(),

In [49]:
topic_summaries = []
topic_word = lsi_model.components_# all topic words
vocab = cvectorizer.get_feature_names()
for i, topic_dist in enumerate(topic_word):
  topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1] # get!
  topic_summaries.append(' '.join(topic_words)) # append!

In [50]:
threshold = 0.5
_idx = np.amax(lsi_Z, axis=1) > threshold  # idx of doc that above the threshold
X_topics = lsi_Z[_idx]

In [53]:
X_topics

array([[ 8.71856144e-01, -1.60293419e-01,  6.18608416e-04,
        -2.94453227e-01],
       [ 1.85711161e-01,  1.12470158e+00, -3.08527695e-02,
         1.52890654e-02],
       [ 1.48208875e-01,  1.09919601e+00, -2.70390193e-02,
         3.65498662e-02],
       ...,
       [ 8.99673272e-01, -1.13358522e-01,  5.76755714e-03,
        -3.09463556e-01],
       [ 1.68517332e-01,  1.17011222e+00, -2.32711775e-02,
         2.16374120e-02],
       [ 1.45609213e+00, -2.98604202e-01, -4.16249177e-01,
         1.02965609e+00]])

In [51]:
topic_summaries

['police square gas people tear',
 'erdogan says people minister prime',
 'need understand things violence want',
 'gas tear understand need things']

In [55]:
title = 'CF Labeled SVD viz'
num_example = len(X_topics)

plot_lda = bp.figure(plot_width=1400, plot_height=1100,
                     title=title,
                     tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

plot_lda.scatter(x=tsne_lda[:, 0], y=tsne_lda[:, 1],
                 color=colormap[_lda_keys][:num_example],
                 source=bp.ColumnDataSource({
                   "content": df_cf_data.clean_text[:num_example],
                   "topic_key": _lda_keys[:num_example]
                   }))

RuntimeError: 
Supplying a user-defined data source AND iterable values to glyph methods is
not possibe. Either:

Pass all data directly as literals:

    p.circe(x=a_list, y=an_array, ...)

Or, put all data in a ColumnDataSource and pass column names:

    source = ColumnDataSource(data=dict(x=a_list, y=an_array))
    p.circe(x='x', y='y', source=source, ...)



In [57]:
# randomly choose a news (within a topic) coordinate as the crucial words coordinate
topic_coord = np.empty((X_topics.shape[1], 2)) * np.nan
for topic_num in _lda_keys:
  if not np.isnan(topic_coord).any():
    break
  topic_coord[topic_num] = tsne_lda[_lda_keys.index(topic_num)]

# plot crucial words
for i in range(X_topics.shape[1]):
  plot_lda.text(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]])

# hover tools
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content - topic: @topic_key"}

# save the plot
save(plot_lda, '{}.html'.format(title))

  warn("save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN")
  warn("save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'")


'/Users/fatma/PhD/Git/semi_supervision/semi_supervision/CF Labeled SVD viz.html'