In [35]:
import pandas as pd
from bokeh.io import output_notebook, show
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

In [36]:
# Load the pre-processed tweet corpus; the path is relative to the notebook's
# working directory.
df_cf = pd.read_csv(
    'Data/unlabeled_turkish_tweets_processd.csv',
)

In [37]:
# Discard every row that contains at least one missing value
# (axis=0 / how='any' are the pandas defaults, spelled out for the reader).
df_cf = df_cf.dropna(axis=0, how='any')

In [38]:
# Collapse rows that are identical across all columns, keeping the first
# occurrence of each (the pandas defaults, spelled out for the reader).
df_cf = df_cf.drop_duplicates(subset=None, keep='first')

In [39]:
# Continue with a 10% random subsample of the de-duplicated rows
# (presumably to keep model fitting fast - confirm before a full run).
# The fixed seed makes the subsample, and every result derived from it,
# repeatable across kernel restarts.
df_cf = df_cf.sample(frac=0.1, random_state=42)

# Cleaned tweet text: the input to the bag-of-words vectoriser.
X = df_cf.clean_text

# Annotation column; it is not referenced again in the visible part of the notebook.
y = df_cf.Fatma_violence

In [40]:
# Bag-of-words over unigrams and bigrams with English stop words removed;
# terms that occur in fewer than 3 documents are dropped from the vocabulary.
BOW = CountVectorizer(ngram_range=(1, 2), stop_words='english', min_df=3)

# Fit on the text column itself rather than on `X`. The original
# `X = BOW.fit_transform(X)` rebinds `X` from raw text to a sparse matrix, so
# running the cell a second time would try to vectorise the matrix instead of
# the tweets. Reading from `df_cf.clean_text` makes the cell safe to re-run.
X = BOW.fit_transform(df_cf.clean_text)

In [43]:
# Number of topics / latent components requested from the LDA, NMF and LSI
# models fitted in the following cells.
NUM_TOPICS = 2

In [44]:
# Build a Latent Dirichlet Allocation Model
# `n_components` is the current name of the topic-count argument; the former
# `n_topics` spelling was deprecated in scikit-learn 0.19 and later removed,
# so it raises a TypeError on current releases.
lda_model = LatentDirichletAllocation(
    n_components=NUM_TOPICS,
    max_iter=10,
    learning_method='online',
    random_state=42,  # fixed seed so the fitted topics are repeatable between runs
)
lda_Z = lda_model.fit_transform(X)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)



(8795, 2)


In [45]:
# Build a Non-Negative Matrix Factorization Model
# A fixed seed keeps the factorisation repeatable between runs.
nmf_model = NMF(n_components=NUM_TOPICS, random_state=42)
nmf_Z = nmf_model.fit_transform(X)
print(nmf_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(8795, 2)


In [46]:
# Build a Latent Semantic Indexing Model
# TruncatedSVD uses a randomized solver by default, so a fixed seed is needed
# for repeatable components.
lsi_model = TruncatedSVD(n_components=NUM_TOPICS, random_state=42)
lsi_Z = lsi_model.fit_transform(X)
print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(8795, 2)


In [47]:
# Show how the first document is represented in topic space by each model,
# in the order LDA, NMF, LSI.
for doc_topic_matrix in (lda_Z, nmf_Z, lsi_Z):
    print(doc_topic_matrix[0])

[0.91828356 0.08171644]
[0.11513596 0.        ]
[ 0.87389046 -0.48788039]


In [48]:
def print_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    model
        Fitted estimator exposing ``components_``: an iterable of per-topic
        weight rows, each supporting ``argsort()`` and integer indexing.
    vectorizer
        Fitted vectorizer exposing ``get_feature_names_out()`` (current
        scikit-learn) or the legacy ``get_feature_names()``.
    top_n : int, default 10
        Number of terms to show per topic.

    For each topic this prints ``Topic <idx>:`` followed by a list of
    ``(term, weight)`` tuples ordered by descending weight. Returns ``None``.
    """
    # Resolve the vocabulary once. The original called get_feature_names() for
    # every printed term, rebuilding the whole vocabulary list each time, and
    # that method no longer exists in scikit-learn >= 1.2.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending; the reversed slice yields the top_n largest weights.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        # str() keeps the printed term a plain string even if the vocabulary
        # comes back as a NumPy array.
        print([(str(feature_names[i]), topic[i]) for i in top_indices])

In [49]:
# Print the top terms of every fitted model, each followed by a rule line.
for heading, fitted_model in (
    ("LDA Model:", lda_model),
    ("NMF Model:", nmf_model),
    ("LSI Model:", lsi_model),
):
    print(heading)
    print_topics(fitted_model, BOW)
    print("=" * 20)
 

LDA Model:
Topic 0:
[('turkey', 2185.599424681978), ('istanbul', 884.7976036874602), ('police', 872.9478983782183), ('protests', 749.7444758356397), ('erdogan', 623.1841091293492), ('turkish', 585.754086799162), ('protesters', 566.3864177165981), ('gezi', 524.2842176898833), ('park', 523.920581320399), ('gas', 446.0542458367682)]
Topic 1:
[('turkey', 1955.200566010786), ('occupygezi', 1395.5873891862182), ('taksim', 1058.860500839082), ('people', 648.8115561851221), ('istanbul', 638.1374357717018), ('square', 478.4695202285414), ('police', 455.56442509176463), ('direngeziparki', 407.56628027980776), ('turkish', 405.9015385680433), ('taksim square', 385.24209863242953)]
NMF Model:
Topic 0:
[('turkey', 8.505185018684147), ('protests', 0.9331632178764998), ('erdogan', 0.5163630460485543), ('government', 0.34731215509456753), ('turkey protests', 0.2953719972562483), ('people', 0.289730103427092), ('protesters', 0.27722189022484073), ('news', 0.25457844275179836), ('media', 0.24623592136518

##### from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import LabelSet, ColumnDataSource

In [53]:
# Project every document onto two SVD components for a 2-D scatter plot.
# The fixed seed keeps the layout repeatable between runs.
svd = TruncatedSVD(n_components=2, random_state=42)
documents_2d = svd.fit_transform(X)

# One row per document: its 2-D coordinates plus its row position, which is
# used as the point label. Built in one step instead of filling an empty frame.
df = pd.DataFrame({
    'x': documents_2d[:, 0],
    'y': documents_2d[:, 1],
    'document': range(documents_2d.shape[0]),
})

source = ColumnDataSource(df)
labels = LabelSet(x="x", y="y", text="document", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')

# `width`/`height` replace `plot_width`/`plot_height`, which Bokeh 3.0 removed.
plot = figure(width=600, height=600, title="Documents in 2-D SVD space")
plot.xaxis.axis_label = "SVD component 1"
plot.yaxis.axis_label = "SVD component 2"
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)

# Activate inline notebook output so the figure renders in the cell even if no
# earlier cell did so; calling it again is harmless.
output_notebook(hide_banner=True)
show(plot, notebook_handle=True)
 

In [54]:
# Project every vocabulary term onto two SVD components: the term-document
# matrix is transposed so that rows are terms.
# The fixed seed keeps the layout repeatable between runs.
svd = TruncatedSVD(n_components=2, random_state=42)
words_2d = svd.fit_transform(X.T)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the legacy method on older installs.
if hasattr(BOW, "get_feature_names_out"):
    vocabulary = list(BOW.get_feature_names_out())
else:
    vocabulary = BOW.get_feature_names()

# One row per term: its 2-D coordinates plus the term itself as the point label.
df = pd.DataFrame({
    'x': words_2d[:, 0],
    'y': words_2d[:, 1],
    'word': vocabulary,
})

source = ColumnDataSource(df)
labels = LabelSet(x="x", y="y", text="word", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')

# `width`/`height` replace `plot_width`/`plot_height`, which Bokeh 3.0 removed.
plot = figure(width=600, height=600, title="Vocabulary terms in 2-D SVD space")
plot.xaxis.axis_label = "SVD component 1"
plot.yaxis.axis_label = "SVD component 2"
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)

# Activate inline notebook output so the figure renders in the cell even if no
# earlier cell did so; calling it again is harmless.
output_notebook(hide_banner=True)
show(plot, notebook_handle=True)