In [1]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [10]:
# Read the pre-processed tweet corpus; the path is relative to the working directory.
tweets_csv_path = 'Data/unlabeled_turkish_tweets_processd.csv'
df_cf = pd.read_csv(tweets_csv_path)

In [11]:
# Discard every row that has a missing value in any column.
df_cf = df_cf.dropna(axis=0, how='any')

In [12]:
# Collapse fully identical rows, keeping the first occurrence of each.
df_cf = df_cf.drop_duplicates(subset=None, keep='first')

In [17]:
# Restrict the analysis to the first 3000 rows (positional slice).
# X holds the cleaned tweet text; y holds the matching Fatma_violence column.
X = df_cf['clean_text'].iloc[:3000]
y = df_cf['Fatma_violence'].iloc[:3000]

In [18]:
# Bag-of-words features: unigrams and bigrams, scikit-learn's built-in English
# stop-word list, and only terms that occur in at least 3 documents.
BOW = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=3,
)
# NOTE: X is rebound here from the raw text Series to the sparse
# document-term count matrix, so this cell cannot be re-run on its own
# without first re-running the cell that builds X from df_cf.
X = BOW.fit_transform(X)

In [19]:
# Number of topics/components requested from each of the LDA, NMF and LSI
# models fitted in this notebook.
NUM_TOPICS = 4

In [20]:
# Build a Latent Dirichlet Allocation Model
# `n_components` is the supported keyword for the number of topics; the old
# `n_topics` spelling was deprecated in scikit-learn 0.19 and later removed.
# A fixed random_state makes the stochastic online fit repeatable between runs.
lda_model = LatentDirichletAllocation(
    n_components=NUM_TOPICS,
    max_iter=10,
    learning_method='online',
    random_state=0,
)
# Fit on the document-term matrix and keep the per-document topic weights.
lda_Z = lda_model.fit_transform(X)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)



(3000, 4)


In [21]:
# Build a Non-Negative Matrix Factorization Model
# random_state is pinned so the factorisation is reproducible after
# Restart Kernel -> Run All.
nmf_model = NMF(n_components=NUM_TOPICS, random_state=0)
# Fit on the document-term matrix and keep the per-document component weights.
nmf_Z = nmf_model.fit_transform(X)
print(nmf_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(3000, 4)


In [22]:
# Build a Latent Semantic Indexing Model
# TruncatedSVD uses a randomized solver by default; pinning random_state keeps
# the projection identical between runs.
lsi_model = TruncatedSVD(n_components=NUM_TOPICS, random_state=0)
# Fit on the document-term matrix and keep the per-document projections.
lsi_Z = lsi_model.fit_transform(X)
print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(3000, 4)


In [23]:
# Show the weights each model (LDA, NMF, LSI in that order) assigns to the
# first document.
for doc_topic_matrix in (lda_Z, nmf_Z, lsi_Z):
    print(doc_topic_matrix[0])

[0.03724235 0.5157499  0.03577325 0.4112345 ]
[0.14262248 0.         0.         0.        ]
[ 0.86250592 -0.50511293  0.05410639 -0.06691029]


In [24]:
def print_topics(model, vectorizer, top_n=10):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``components_``; each row is treated as one
        topic's weights over the vocabulary.
    vectorizer : object
        Fitted vectorizer whose feature names label the columns of
        ``model.components_``. Either ``get_feature_names_out()`` (current
        scikit-learn) or ``get_feature_names()`` (older releases) is used.
    top_n : int, default 10
        Number of terms printed per topic.

    For each topic this prints ``Topic <idx>:`` followed by a list of
    ``(term, weight)`` tuples ordered from highest to lowest weight.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new
    # accessor and fall back so older installations keep working.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names

    # Look the vocabulary up once instead of rebuilding it for every term.
    feature_names = get_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending, so the reversed tail slice yields the top_n
        # indices in descending weight order.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])

In [25]:
# Print the top terms of every topic for each fitted model, with a separator
# line after each model's listing.
for heading, fitted_model in (
    ("LDA Model:", lda_model),
    ("NMF Model:", nmf_model),
    ("LSI Model:", lsi_model),
):
    print(heading)
    print_topics(fitted_model, BOW)
    print("=" * 20)
 

LDA Model:
Topic 0:
[('turkey', 478.13654137943286), ('erdogan', 111.18552464961319), ('media', 90.23706578887908), ('today', 60.170807036496626), ('occupygezi', 59.14582912668048), ('going', 58.10236428661862), ('direngezipark', 54.29594181984419), ('says', 53.190294448981575), ('need', 48.33904474158845), ('minister', 44.24496019372758)]
Topic 1:
[('istanbul', 223.69851957197), ('park', 190.50240794360616), ('gezi', 181.53561088773122), ('taksim', 160.63745125598868), ('police', 154.39721731209252), ('protesters', 149.83473748556716), ('gezi park', 121.99360429420709), ('turkish', 116.59438595442268), ('turkey', 96.96891281392374), ('erdogan', 81.75296143390702)]
Topic 2:
[('occupygezi', 367.0010007182274), ('police', 311.49579459195206), ('taksim', 236.6810174343525), ('turkish', 171.20439761799554), ('istanbul', 167.6728542310802), ('people', 156.1096687104907), ('direngeziparki', 129.47244144988417), ('square', 99.8179183684798), ('gas', 92.69830998999507), ('taksim square', 84.43

In [26]:
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import LabelSet, ColumnDataSource

# Route show() output into notebook cells. Without this call Bokeh falls back
# to file output, so the figures shown with notebook_handle=True would not
# render inline on a fresh kernel.
output_notebook()

In [32]:
# Project every document onto the leading SVD components of the
# document-term matrix and draw a labelled scatter plot of the result.
# random_state is pinned so the layout is the same on every run.
svd = TruncatedSVD(n_components=3, random_state=0)
documents_2d = svd.fit_transform(X)

# Only the first two of the three fitted components are plotted. The label is
# the document's row position; it is derived from the projection itself so the
# three columns always have the same length.
df = pd.DataFrame({
    'x': documents_2d[:, 0],
    'y': documents_2d[:, 1],
    'document': range(documents_2d.shape[0]),
})

# ColumnDataSource accepts a DataFrame directly.
source = ColumnDataSource(df)
labels = LabelSet(x="x", y="y", text="document", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')

# width/height replace plot_width/plot_height, which were removed in Bokeh 3.
plot = figure(width=600, height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)
 

In [33]:
# Project every vocabulary term onto the leading SVD components of the
# transposed document-term matrix and draw a labelled scatter plot.
# random_state is pinned so the layout is the same on every run.
svd = TruncatedSVD(n_components=3, random_state=0)
words_2d = svd.fit_transform(X.T)

# get_feature_names() was removed in scikit-learn 1.2; prefer the new accessor
# and fall back so older installations keep working.
if hasattr(BOW, 'get_feature_names_out'):
    vocabulary = list(BOW.get_feature_names_out())
else:
    vocabulary = list(BOW.get_feature_names())

# Only the first two of the three fitted components are plotted.
df = pd.DataFrame({
    'x': words_2d[:, 0],
    'y': words_2d[:, 1],
    'word': vocabulary,
})

# ColumnDataSource accepts a DataFrame directly.
source = ColumnDataSource(df)
labels = LabelSet(x="x", y="y", text="word", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')

# width/height replace plot_width/plot_height, which were removed in Bokeh 3.
plot = figure(width=600, height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)