In [1]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [2]:
# Load the dataset CSV into a DataFrame; the path is relative to the
# notebook's working directory.
df_cf = pd.read_csv(
    'Data/CF_Fatma_label_confidence_judgments_finalized_clean_text_28_05_2018.csv'
)

In [3]:
# X: the `clean_text` column (later vectorized into bag-of-words counts).
# y: the `Fatma_violence` column.
X, y = df_cf.clean_text, df_cf.Fatma_violence

In [4]:
# Bag-of-words counts over unigrams and bigrams, with English stop words
# removed and terms that appear in fewer than 3 documents dropped.
BOW = CountVectorizer(ngram_range=(1, 2), stop_words='english', min_df=3)

# Vectorize straight from the source column rather than rebinding X from
# itself: `X = BOW.fit_transform(X)` only works the first time the cell runs,
# because a re-run would feed the sparse count matrix back into the vectorizer.
X = BOW.fit_transform(df_cf.clean_text)

In [23]:
# Number of topics / components requested from each decomposition model below.
NUM_TOPICS: int = 2

In [24]:
# Build a Latent Dirichlet Allocation model on the bag-of-words matrix.
# `n_components` replaces the `n_topics` keyword, which was deprecated in
# scikit-learn 0.19 and removed in 0.21. The online variational fit is
# stochastic, so the seed is pinned to keep the topics stable across re-runs.
lda_model = LatentDirichletAllocation(
    n_components=NUM_TOPICS,
    max_iter=10,
    learning_method='online',
    random_state=0,
)
lda_Z = lda_model.fit_transform(X)  # per-document topic weights
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)



(1214, 2)


In [25]:
# Build a Non-Negative Matrix Factorization model on the bag-of-words matrix.
# The seed is pinned because the factorization's initialisation depends on
# `random_state`; without it the printed weights can drift between runs.
nmf_model = NMF(n_components=NUM_TOPICS, random_state=0)
nmf_Z = nmf_model.fit_transform(X)  # per-document topic weights
print(nmf_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(1214, 2)


In [26]:
# Build a Latent Semantic Indexing model (truncated SVD of the count matrix).
# TruncatedSVD uses a randomized solver by default, so the seed is pinned to
# make the projection reproducible across re-runs.
lsi_model = TruncatedSVD(n_components=NUM_TOPICS, random_state=0)
lsi_Z = lsi_model.fit_transform(X)  # per-document coordinates in LSI space
print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(1214, 2)


In [27]:
# Compare how the first document in the corpus is represented in each
# model's topic space (LDA, then NMF, then LSI).
for doc_topic_matrix in (lda_Z, nmf_Z, lsi_Z):
    print(doc_topic_matrix[0])

[0.2574817 0.7425183]
[0.00069259 0.00242828]
[0.0048269 0.008466 ]


In [28]:
def print_topics(model, vectorizer, top_n=10):
    """Print the highest-weighted vocabulary terms of every topic in a model.

    Args:
        model: fitted decomposition model exposing ``components_``, iterated
            as one weight vector per topic.
        vectorizer: fitted vectorizer that provides the vocabulary through
            ``get_feature_names_out()`` or the legacy ``get_feature_names()``.
        top_n: number of terms to print per topic, in descending weight order.

    For each topic, prints a ``Topic <idx>:`` header followed by a list of
    ``(term, weight)`` tuples. Returns ``None``.
    """
    # Resolve the vocabulary once. The original looked it up again for every
    # printed term, rebuilding the full feature list each time. Prefer
    # get_feature_names_out(): get_feature_names() was removed in
    # scikit-learn 1.2, while older releases only provide the legacy name.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = list(get_names())

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending; the reversed slice takes the last `top_n`
        # indices, i.e. the largest weights first.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])

In [29]:
# Print the top terms per topic for each fitted model, closing every
# model's section with a 20-character rule.
for heading, topic_model in (
    ("LDA Model:", lda_model),
    ("NMF Model:", nmf_model),
    ("LSI Model:", lsi_model),
):
    print(heading)
    print_topics(topic_model, BOW)
    print("=" * 20)
 

LDA Model:
Topic 0:
[('police', 195.24286318755514), ('gas', 71.09852617234272), ('square', 71.09222172115837), ('tear', 54.79461314042255), ('tear gas', 47.687367521312105), ('news', 41.88286102630114), ('people', 39.36014197373006), ('ankara', 33.9504193534747), ('photo', 30.575696042671158), ('erdogan', 28.67274728088719)]
Topic 1:
[('erdogan', 95.92098497935541), ('people', 45.01897321942163), ('media', 40.858231117373535), ('says', 32.96999631298711), ('world', 24.372520542634646), ('government', 23.078781360421086), ('occupyturkey', 22.849271860178977), ('support', 22.245252353088873), ('like', 20.906871142225548), ('day', 19.106577247612417)]
NMF Model:
Topic 0:
[('police', 3.502263185943617), ('gas', 0.9554178393156538), ('square', 0.8908812175448014), ('tear', 0.7916432311225495), ('tear gas', 0.7282279728183685), ('people', 0.47641777486627773), ('riot', 0.40619311680437675), ('riot police', 0.37353716597567027), ('ankara', 0.3221248431419325), ('violence', 0.3013758815592167

In [30]:
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import LabelSet, ColumnDataSource


In [34]:
# Project every document onto two SVD components and draw a Bokeh scatter
# plot, labelling each point with its row position in the corpus.

# Configure Bokeh for notebook output. The function was imported but never
# called, so a fresh kernel had nothing telling show() to render in the notebook.
output_notebook()

# Seeded so the randomized SVD yields the same layout on every run.
svd = TruncatedSVD(n_components=2, random_state=0)
documents_2d = svd.fit_transform(X)

# Build the frame in one step instead of filling an empty frame column by column.
df = pd.DataFrame({
    'x': documents_2d[:, 0],
    'y': documents_2d[:, 1],
    'document': range(len(df_cf.clean_text)),
})

source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x="x", y="y", text="document", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')

# `width`/`height` replace `plot_width`/`plot_height`, which were removed in Bokeh 3.0.
plot = figure(width=600, height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)
 

In [35]:
# Project every vocabulary term onto two SVD components (the transposed
# count matrix has one row per term) and draw a Bokeh scatter plot labelled
# with the terms.

# Configure Bokeh for notebook output. The function was imported but never
# called, so a fresh kernel had nothing telling show() to render in the notebook.
output_notebook()

# Seeded so the randomized SVD yields the same layout on every run.
svd = TruncatedSVD(n_components=2, random_state=0)
words_2d = svd.fit_transform(X.T)

# get_feature_names() was removed in scikit-learn 1.2; prefer the
# replacement and fall back to the legacy name on older releases.
get_names = getattr(BOW, 'get_feature_names_out', None) or BOW.get_feature_names

# Build the frame in one step instead of filling an empty frame column by column.
df = pd.DataFrame({
    'x': words_2d[:, 0],
    'y': words_2d[:, 1],
    'word': list(get_names()),
})

source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x="x", y="y", text="word", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')

# `width`/`height` replace `plot_width`/`plot_height`, which were removed in Bokeh 3.0.
plot = figure(width=600, height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)