In [1]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [2]:
# Read the CSV into a DataFrame; the path is relative to the notebook's working directory.
df_cf = pd.read_csv(
    'Data/CF_Fatma_label_confidence_judgments_finalized_clean_text_28_05_2018.csv'
)

In [3]:
# X: the `clean_text` column (input documents).
# y: the `Fatma_violence` column (presumably the target label; it is not used
# in the cells visible here).
X, y = df_cf['clean_text'], df_cf['Fatma_violence']

In [4]:
# Bag-of-words features: unigrams and bigrams, English stop words removed,
# and terms that occur in fewer than 3 documents dropped.
BOW = CountVectorizer(ngram_range=(1, 2), stop_words='english', min_df=3)
# Vectorize straight from the source column instead of from `X` itself, so that
# re-running this cell never feeds the already-transformed matrix back into the
# vectorizer. `X` is still rebound to the document-term matrix for later cells.
X = BOW.fit_transform(df_cf.clean_text)

In [5]:
# Number of topics/components shared by the LDA, NMF and LSI models in this notebook.
NUM_TOPICS = 4

In [6]:
# Build a Latent Dirichlet Allocation model.
# `n_components` replaces the `n_topics` argument, which scikit-learn deprecated
# in 0.19 and later removed. `random_state` pins the stochastic online fit so
# the learned topics are reproducible across kernel restarts.
lda_model = LatentDirichletAllocation(
    n_components=NUM_TOPICS,
    max_iter=10,
    learning_method='online',
    random_state=0,
)
lda_Z = lda_model.fit_transform(X)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(1214, 4)


In [7]:
# Build a Non-Negative Matrix Factorization model.
# `random_state` is fixed so that any randomized step of the factorization
# (e.g. its initialization) gives the same result on every run.
nmf_model = NMF(n_components=NUM_TOPICS, random_state=0)
nmf_Z = nmf_model.fit_transform(X)
print(nmf_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(1214, 4)


In [8]:
# Build a Latent Semantic Indexing model (truncated SVD of the document-term matrix).
# TruncatedSVD's default solver is randomized; `random_state` makes the
# resulting components reproducible across runs.
lsi_model = TruncatedSVD(n_components=NUM_TOPICS, random_state=0)
lsi_Z = lsi_model.fit_transform(X)
print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(1214, 4)


In [9]:
# Show what the first document of the corpus looks like in each topic space,
# printed in the order LDA, NMF, LSI.
for doc_topic_matrix in (lda_Z, nmf_Z, lsi_Z):
    print(doc_topic_matrix[0])

[ 0.12507403  0.12502471  0.62486863  0.12503264]
[ 0.00084719  0.00228189  0.          0.        ]
[ 0.00482919  0.00845152  0.001058   -0.00158344]


In [10]:
def print_topics(model, vectorizer, top_n=10):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Fitted decomposition model exposing ``components_``; each row is
        treated as one topic and must support ``argsort()`` and indexing.
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()`` or, on older
        scikit-learn releases, ``get_feature_names()``.
    top_n : int, default 10
        Number of terms to print per topic.

    Output
    ------
    For each topic, prints ``Topic <idx>:`` followed by a list of
    ``(term, weight)`` tuples ordered from highest to lowest weight.
    """
    # Resolve the vocabulary once. The original looked it up again for every
    # printed term, rebuilding the full feature list each time.
    if hasattr(vectorizer, "get_feature_names_out"):
        names = vectorizer.get_feature_names_out()
        # Convert a NumPy array of names to plain Python strings so the printed
        # tuples look the same as with the legacy list-returning method.
        feature_names = names.tolist() if hasattr(names, "tolist") else list(names)
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending; the reversed slice takes the top_n largest weights.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])

In [14]:
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import LabelSet, ColumnDataSource


ImportError: cannot import name LabelSet

In [11]:
# Print the top terms per topic for each fitted model, with a separator line
# after every model's listing.
for heading, fitted_model in (
    ("LDA Model:", lda_model),
    ("NMF Model:", nmf_model),
    ("LSI Model:", lsi_model),
):
    print(heading)
    print_topics(fitted_model, BOW)
    print("=" * 20)
 

LDA Model:
Topic 0:
[(u'gas', 70.937746356953355), (u'tear', 54.605082091944148), (u'police', 48.842229166530828), (u'tear gas', 47.461903698646758), (u'ankara', 33.971837429015459), (u'live', 25.203422088639673), (u'photo', 22.718217224625608), (u'standing', 21.722752248379567), (u'square', 21.281233988739075), (u'water', 20.001287608461706)]
Topic 1:
[(u'erdogan', 27.184700621678967), (u'minister', 23.590061159329405), (u'world', 22.160293457467347), (u'prime', 21.294687581340234), (u'syria', 18.695669572824301), (u'prime minister', 18.492535571558001), (u'support', 16.401100896728742), (u'stop', 16.28558788809082), (u'media', 15.783042290165328), (u'want', 15.736570058030509)]
Topic 2:
[(u'police', 143.04737544798499), (u'square', 49.86204527314446), (u'people', 45.402168796959536), (u'news', 38.613913481980525), (u'bbc', 26.346147021035947), (u'riot', 22.880114888690002), (u'government', 22.772597452447258), (u'today', 22.55287694229407), (u'right', 21.796177039521513), (u'attack',

In [31]:
# Project the document-term matrix onto two SVD components for plotting.
# `random_state` pins TruncatedSVD's randomized solver so the scatter layout is
# reproducible across runs.
svd = TruncatedSVD(n_components=2, random_state=0)
documents_2d = svd.fit_transform(X)

# Build the plotting frame in one step instead of filling an empty frame column
# by column. `document` holds each row's position and is used as its label.
df = pd.DataFrame(
    {
        'x': documents_2d[:, 0],
        'y': documents_2d[:, 1],
        'document': list(range(len(df_cf.clean_text))),
    },
    columns=['x', 'y', 'document'],
)

source = ColumnDataSource(ColumnDataSource.from_df(df))
# NOTE(review): `LabelSet` must be importable from `bokeh.models`. The import
# cell in this notebook recorded an ImportError for it, so presumably the
# installed Bokeh predates LabelSet. TODO: confirm and upgrade Bokeh.
labels = LabelSet(x="x", y="y", text="document", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')

# NOTE(review): recent Bokeh releases renamed `plot_width`/`plot_height` to
# `width`/`height`. Confirm against the installed version before changing.
plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
# NOTE(review): inline display presumably needs `output_notebook()` to have been
# called earlier in the session; no such call is visible here. Verify.
show(plot, notebook_handle=True)
 



NameError: name 'LabelSet' is not defined