### gensim

In [3]:
from nltk.corpus import brown
 
# One string per Brown corpus file: that file's tokens joined by single spaces.
data = [' '.join(brown.words(fileid)) for fileid in brown.fileids()]

NO_DOCUMENTS = len(data)
print(NO_DOCUMENTS)

# Each entry of `data` is an entire corpus file, so printing data[:5] verbatim
# floods the cell output. Show only the beginning of the first few documents.
PREVIEW_CHARS = 200
for document in data[:5]:
    print(document[:PREVIEW_CHARS])

500


In [45]:
import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords
 
# Number of topics passed as num_topics to the gensim models built below.
NUM_TOPICS = 10
# clean_text tests every token for membership in this collection; a frozenset
# makes that a hash lookup instead of a linear scan over a list.
STOPWORDS = frozenset(stopwords.words('english'))
 
# Raw string: '\-' inside an ordinary string literal is an invalid escape
# sequence (a warning today, scheduled to become an error). Compiled once here.
_TOKEN_PATTERN = re.compile(r'[a-zA-Z\-][a-zA-Z\-]{2,}')


def clean_text(text):
    """Lower-case and tokenize ``text``, keeping only word-like non-stopwords.

    A token is kept when it is not in ``STOPWORDS`` and *starts with* at least
    three characters drawn from ASCII letters and ``-``.

    Note: the pattern is anchored only at the start of the token, so tokens
    with trailing characters (e.g. ``"mrs."``) are kept as-is.

    Returns:
        list of str: the surviving tokens, in their original order.
    """
    tokens = word_tokenize(text.lower())
    return [t for t in tokens if t not in STOPWORDS and _TOKEN_PATTERN.match(t)]
 
# gensim consumes lists of tokens, so run every document through clean_text
# (tokenization plus stopword filtering).
tokenized_data = [clean_text(document_text) for document_text in data]


# Dictionary: associates each distinct token with a numeric id
dictionary = corpora.Dictionary(tokenized_data)

In [46]:
# Print the Dictionary's summary (unique-token count and a few sample tokens).
print(dictionary)

Dictionary(44933 unique tokens: ['accepted', 'according', 'achieve', 'act', 'action']...)


In [47]:
# Convert every token list into its bag-of-words representation
corpus = list(map(dictionary.doc2bow, tokenized_data))

# Inspect the document at index 20: [(word_id, count), ...]
print(corpus[20])

[(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2), (44, 2), (45, 2), (46, 2), (47, 2), (49, 1), (50, 1), (53, 1), (56, 1), (59, 1), (60, 1), (66, 1), (75, 1), (80, 1), (98, 1), (101, 1), (106, 1), (117, 1), (129, 1), (130, 2), (132, 2), (135, 2), (140, 1), (141, 2), (143, 4), (144, 2), (145, 2), (166, 1), (195, 1), (198, 3), (219, 1), (220, 4), (221, 3), (223, 1), (229, 4), (230, 4), (231, 2), (235, 1), (236, 1), (242, 2), (246, 2), (255, 1), (263, 1), (269, 1), (270, 5), (271, 2), (275, 5), (276, 1), (278, 4), (280, 2), (281, 1), (307, 2), (310, 1), (311, 3), (313, 1), (314, 5), (318, 4), (322, 1), (336, 1), (338, 3), (339, 1), (340, 1), (341, 1), (345, 1), (346, 1), (351, 1), (354, 1), (355, 1), (366, 3), (368, 13), (370, 1), (372, 1), (374, 3), (377, 3), (381, 3), (386, 1), (392, 6), (396, 1), (401, 1), (412, 2), (426, 2), (428, 2), (431, 2), (434, 2), (439, 2), (444, 1), (450, 1), (452, 1), (462, 1), (465, 1), (467, 1), (470, 1), (478, 1), (483, 1), (

In [50]:
# Build the LDA model. LDA training is randomized, so a fixed random_state is
# passed to make the topics (and everything derived from them) reproducible.
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary,
                            random_state=42)

# Build the LSI model
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

In [52]:
# For each model, print the 10 highest-weighted words of every topic,
# followed by a separator line. LDA first, then LSI.
for heading, topic_model in (("LDA Model:", lda_model), ("LSI Model:", lsi_model)):
    print(heading)
    for idx in range(NUM_TOPICS):
        print("Topic #%s:" % idx, topic_model.print_topic(idx, 10))
    print("=" * 20)

LDA Model:
Topic #0: 0.007*"one" + 0.005*"could" + 0.005*"would" + 0.004*"said" + 0.003*"time" + 0.003*"two" + 0.003*"new" + 0.003*"man" + 0.003*"even" + 0.002*"made"
Topic #1: 0.006*"would" + 0.005*"one" + 0.003*"could" + 0.003*"new" + 0.003*"said" + 0.003*"two" + 0.003*"first" + 0.003*"may" + 0.002*"like" + 0.002*"time"
Topic #2: 0.005*"one" + 0.004*"would" + 0.003*"time" + 0.003*"new" + 0.003*"two" + 0.002*"may" + 0.002*"could" + 0.002*"said" + 0.002*"first" + 0.002*"many"
Topic #3: 0.005*"one" + 0.003*"new" + 0.003*"said" + 0.003*"would" + 0.003*"first" + 0.003*"may" + 0.003*"two" + 0.002*"mrs." + 0.002*"time" + 0.002*"could"
Topic #4: 0.006*"one" + 0.004*"would" + 0.004*"said" + 0.003*"time" + 0.003*"may" + 0.003*"could" + 0.003*"two" + 0.003*"first" + 0.002*"new" + 0.002*"man"
Topic #5: 0.007*"would" + 0.006*"one" + 0.005*"said" + 0.004*"time" + 0.003*"could" + 0.003*"like" + 0.003*"first" + 0.002*"two" + 0.002*"may" + 0.002*"man"
Topic #6: 0.007*"one" + 0.005*"would" + 0.004*"ne

In [53]:
# Project a sentence that is not part of the corpus into both topic spaces.
text = "The economy is working better than ever"
query_tokens = clean_text(text)
bow = dictionary.doc2bow(query_tokens)

# LSI representation: (topic_id, value) pairs
print(lsi_model[bow])

# LDA representation: (topic_id, value) pairs
print(lda_model[bow])

[(0, 0.09161293999448276), (1, -0.008831614668939697), (2, -0.016821744887377064), (3, -0.04105771006170795), (4, 0.01572562950555604), (5, -0.011841459330382918), (6, 0.03140805073426949), (7, -0.014450956938682003), (8, 0.05456494744152885), (9, -0.020836665291843408)]
[(0, 0.020012788), (1, 0.020013249), (2, 0.02001385), (3, 0.020013236), (4, 0.020013282), (5, 0.020013161), (6, 0.020012785), (7, 0.819881), (8, 0.02001308), (9, 0.02001353)]


In [54]:
from gensim import similarities
 
# Index every corpus document by its LDA topic vector. num_features is given
# explicitly so the index width always equals the number of topics, rather
# than being inferred from the largest topic id that happens to occur.
lda_index = similarities.MatrixSimilarity(lda_model[corpus],
                                          num_features=lda_model.num_topics)

# Query the index with the LDA vector of `bow`. The scores get their own name:
# assigning to `similarities` would shadow the gensim module imported under
# that name.
query_scores = lda_index[lda_model[bow]]
# Rank documents from most to least similar
ranked_documents = sorted(enumerate(query_scores), key=lambda item: -item[1])

# Ten most similar documents as (document_index, score) pairs
print(ranked_documents[:10], '\n')

# Show the first 1000 characters of the most similar document
document_id, similarity = ranked_documents[0]
print(data[document_id][:1000])

[(288, 0.99762625), (447, 0.99761987), (72, 0.9975393), (128, 0.99753535), (125, 0.9973295), (278, 0.9973295), (294, 0.9973295), (362, 0.9973295), (77, 0.99731994), (135, 0.99717253)] 

The one- or two-season hunt , of which there have been too many recently , may do more harm than good ; ; for such programs raise hopes of assistance toward achieving excellence in scholarship and the arts which are dashed when the programs are discontinued ; ; and they are dashed , no less , by lack of skill in making selections of men and women for development toward the highest reaches of the mind and spirit . For the making of selections on the basis of excellence requires that any foundation making the selections shall have available the judgments of a corps of advisors whose judgments are known to be good : such judgments can be known to be good only by the records of those selected , by records made subsequent to their selection over considerable periods of time . The central group of the Foundat

### scikit-learn

In [30]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

NUM_TOPICS = 10

# token_pattern is a raw string: '\-' in an ordinary string literal is an
# invalid escape sequence. The regex itself is unchanged.
vectorizer = CountVectorizer(min_df=5, max_df=0.9,
                             stop_words='english', lowercase=True,
                             token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')

# Document-term count matrix for the whole corpus
data_vectorized = vectorizer.fit_transform(data)

# Dense view of the first document's row
data_vectorized[0].todense()

matrix([[0, 0, 0, ..., 0, 0, 0]])

In [32]:
# Build a Latent Dirichlet Allocation Model.
# random_state pins the randomized fit so the topics are reproducible.
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10,
                                      learning_method='online', random_state=42)
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # (number of documents, NUM_TOPICS)

(500, 10)


In [33]:
# Build a Non-Negative Matrix Factorization Model.
# random_state is fixed so repeated runs yield the same factorization.
nmf_model = NMF(n_components=NUM_TOPICS, random_state=42)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)  # (number of documents, NUM_TOPICS)

(500, 10)


In [34]:
# Build a Latent Semantic Indexing Model (truncated SVD of the count matrix).
# random_state is fixed so repeated runs yield the same components.
lsi_model = TruncatedSVD(n_components=NUM_TOPICS, random_state=42)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)  # (number of documents, NUM_TOPICS)

(500, 10)


In [43]:
# The first corpus document as represented in each topic space: LDA, NMF, LSI

for doc_topic_matrix in (lda_Z, nmf_Z, lsi_Z):
    print(doc_topic_matrix[0])

[1.65010163e-01 1.05625244e-04 1.05609344e-04 1.05602526e-04
 1.05616087e-04 1.05624324e-04 1.05620422e-04 1.05596726e-04
 1.26470246e-01 7.07780297e-01]
[0.         0.         2.11098333 0.07708942 0.         0.54373577
 1.06798434 0.         0.         0.24404752]
[ 23.30684134   1.59505053  21.83958053  -0.06707342   0.80479397
  11.84375721   4.17098172  -1.88098066   0.45715931 -13.62738873]


In [37]:
def print_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic in ``model``.

    For each row of ``model.components_`` a ``Topic <idx>:`` line is printed,
    followed by a list of ``(term, weight)`` tuples in descending weight order.

    Args:
        model: object exposing ``components_``, an iterable of 1-D arrays
            (one per topic) that support ``argsort()``.
        vectorizer: object supplying the vocabulary through
            ``get_feature_names_out()`` when it has that method, otherwise
            through ``get_feature_names()``.
        top_n: number of terms printed per topic.
    """
    # Resolve the vocabulary once up front; the original called
    # get_feature_names() again for every single printed term.
    names_getter = getattr(vectorizer, 'get_feature_names_out', None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names
    feature_names = names_getter()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort() is ascending; the reversed slice walks the last top_n
        # indices backwards, i.e. largest weight first.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])

# Print the top terms of each topic of the scikit-learn LDA model.
print("LDA Model:")
print_topics(lda_model, vectorizer)
print("=" * 20)
 
# NOTE(review): the equivalent NMF and LSI reports below are commented out;
# either re-enable them or delete them so the cell carries no dead code.
# print("NMF Model:")
# print_topics(nmf_model, vectorizer)
# print("=" * 20)
 
# print("LSI Model:")
# print_topics(lsi_model, vectorizer)
# print("=" * 20)

LDA Model:
Topic 0:
[('said', 898.9981093335066), ('mrs', 481.4117668850527), ('new', 382.2010788388641), ('time', 331.63044238993524), ('day', 304.4497954136027), ('men', 302.6069130328378), ('home', 287.52301249899426), ('house', 274.6409107095405), ('john', 257.25832324260705), ('did', 251.80838650620655)]
Topic 1:
[('year', 140.62808632234484), ('new', 105.36703008692089), ('feed', 99.55654584907741), ('tax', 90.72526673721285), ('cost', 89.35086454643323), ('use', 87.53357320632698), ('costs', 85.73049029956849), ('equipment', 82.31642346287575), ('used', 78.77798044267804), ('time', 69.24009680630768)]
Topic 2:
[('god', 145.07186594318932), ('christ', 77.80120556889656), ('jesus', 45.41228819728798), ('church', 39.11915222414219), ('man', 38.90045346001511), ('new', 35.78997064499043), ('born', 33.202598149508454), ('john', 32.358817110215874), ('parker', 32.249950463510515), ('sin', 31.663181295986178)]
Topic 3:
[('model', 13.697627056934186), ('new', 10.973403381977297), ('gun'

In [38]:
# Map a new sentence into the NMF topic space.
text = "The economy is working better than ever"
text_counts = vectorizer.transform([text])
# transform() yields one row per input document; keep the only row
x = nmf_model.transform(text_counts)[0]
print(x)

[0.00290064 0.         0.         0.         0.         0.00439464
 0.         0.         0.         0.00462612]


In [56]:
from sklearn.metrics.pairwise import euclidean_distances
 
def most_similar(x, Z, top_n=5):
    """Return the ``top_n`` rows of ``Z`` closest to ``x`` by Euclidean distance.

    ``x`` is reshaped to a single row before being compared against ``Z``.

    Returns:
        list of ``(row_index, distance)`` tuples, nearest first.
    """
    query_row = x.reshape(1, -1)
    distances = euclidean_distances(query_row, Z)[0]
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1])
    return ranked[:top_n]
 
# Nearest corpus documents to the query `x` in NMF topic space, as
# (document_index, distance) pairs.
# NOTE(review): `similarities` is also the name under which the gensim module
# was imported (`from gensim import similarities`); this assignment rebinds it.
similarities = most_similar(x, nmf_Z)
similarities

[(115, 0.5168192780665839),
 (160, 0.54644067790078),
 (181, 0.5622746178417255),
 (252, 0.5728317838841462),
 (499, 0.5787287670300532)]

In [57]:
# Take the closest match and print the first 1000 characters of that document.
document_id, similarity = similarities[0]
print(data[document_id][:1000])

Livery stable -- J. Vernon , prop. '' . Coaching had declined considerably by 1905 , but the sign was still there , near the old Wells Fargo building in San Francisco , creaking in the fog as it had for thirty years . John Vernon had had all the patronage he cared for -- he had prospered , but he could not retire from horsedom . Coaching was in his blood . He had two interests in life : the pleasures of the table and driving . Twice a week he drove his tallyho over the Santa Cruz road , upland and through the redwood forest , with orchards below him at one hand , and glimpses of the Pacific at the other . The journey back he made along the coast road , traveling hell-for-leather , every lantern of the tallyho ablaze . The southward route was the classic run in California , and the most fashionable . His patronage on this stretch was made up largely of San Franciscans -- regulars , most of them , and trenchermen like himself . They did not complain at the inhuman hour of starting ( seve

### Plotting words and documents in 2D with SVD

In [59]:
import pandas as pd
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
# Configure Bokeh to render its plots inline in the notebook.
output_notebook()

#### documents

In [68]:
# Display the sparse document-term matrix summary (shape, dtype, stored elements).
data_vectorized

<500x10625 sparse matrix of type '<class 'numpy.int64'>'
	with 241921 stored elements in Compressed Sparse Row format>

In [70]:
# NOTE(review): as far as this notebook shows, documents_2d is first assigned
# by the `svd.fit_transform(data_vectorized)` cell that appears *after* this
# one, so a fresh top-to-bottom run raises NameError here. Run (or move) that
# cell first.
documents_2d.shape

(500, 2)

In [62]:
# Project the documents onto two SVD components. random_state is fixed so the
# coordinates (and the plot built from them) are stable between runs.
svd = TruncatedSVD(n_components=2, random_state=42)
documents_2d = svd.fit_transform(data_vectorized)

# Build the frame in one step rather than filling an empty frame column by
# column: one row per document, with its 2-D coordinates and corpus index.
df = pd.DataFrame({
    'x': documents_2d[:, 0],
    'y': documents_2d[:, 1],
    'document': range(len(data)),
})
df

Unnamed: 0,x,y,document
0,23.306833,1.593118,0
1,20.625832,6.275179,1
2,23.361898,7.729805,2
3,17.627838,15.492641,3
4,26.006839,0.835767,4
...,...,...,...
495,21.602202,-10.891627,495
496,24.539445,-12.658952,496
497,25.328523,-15.375643,497
498,12.376327,0.230668,498


In [63]:
# Bokeh data source built from the 2-D document coordinates in `df`
source_data = ColumnDataSource.from_df(df)
source = ColumnDataSource(source_data)

# NOTE(review): plot_width/plot_height are the pre-3.0 Bokeh names for the
# figure size - confirm against the installed Bokeh version.
plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)

# Label each point with its document index, centred just above the marker
labels = LabelSet(
    x="x", y="y", text="document", y_offset=8,
    text_font_size="8pt", text_color="#555555",
    source=source, text_align='center',
)
plot.add_layout(labels)
show(plot, notebook_handle=True)

#### words

In [71]:
# Transposed view: one row per vocabulary term, one column per document.
data_vectorized.T

<10625x500 sparse matrix of type '<class 'numpy.int64'>'
	with 241921 stored elements in Compressed Sparse Column format>

In [64]:
# Project the vocabulary terms (rows of the transposed matrix) onto two SVD
# components. random_state is fixed so the coordinates are stable between runs.
svd = TruncatedSVD(n_components=2, random_state=42)
words_2d = svd.fit_transform(data_vectorized.T)

# Newer scikit-learn exposes the vocabulary only through
# get_feature_names_out(); fall back to get_feature_names() on older releases.
feature_names_getter = (getattr(vectorizer, 'get_feature_names_out', None)
                        or vectorizer.get_feature_names)

# Build the frame in one step: one row per term with its 2-D coordinates.
df = pd.DataFrame({
    'x': words_2d[:, 0],
    'y': words_2d[:, 1],
    'word': feature_names_getter(),
})

df

Unnamed: 0,x,y,word
0,0.355056,-0.201927,-day
1,0.242417,-0.086933,-degree
2,0.697428,-0.558982,-degrees
3,0.509780,-0.628094,-degrees-c
4,0.699847,-0.152887,-foot
...,...,...,...
10620,0.228999,0.032563,zest
10621,0.411322,-0.524525,zinc
10622,0.502003,-0.281443,zone
10623,0.279290,-0.196128,zoning


In [73]:
# Bokeh data source built from the 2-D word coordinates in `df`
source_data = ColumnDataSource.from_df(df)
source = ColumnDataSource(source_data)

# NOTE(review): plot_width/plot_height are the pre-3.0 Bokeh names for the
# figure size - confirm against the installed Bokeh version.
plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)

# Label each point with its word, centred just above the marker
labels = LabelSet(
    x="x", y="y", text="word", y_offset=8,
    text_font_size="8pt", text_color="#555555",
    source=source, text_align='center',
)
plot.add_layout(labels)
show(plot, notebook_handle=True)

### More: interactive LDA topic visualization with pyLDAvis

In [75]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

NUM_TOPICS = 10

# token_pattern is a raw string: '\-' in an ordinary string literal is an
# invalid escape sequence. The regex itself is unchanged.
vectorizer = CountVectorizer(min_df=5, max_df=0.9,
                             stop_words='english', lowercase=True,
                             token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data)

# Build a Latent Dirichlet Allocation Model.
# random_state pins the randomized fit so the topics are reproducible.
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10,
                                      learning_method='online', random_state=42)
lda_Z = lda_model.fit_transform(data_vectorized)

# Topic distribution of a new sentence; the printed sum shows it is normalized.
text = "The economy is working better than ever"
x = lda_model.transform(vectorizer.transform([text]))[0]
print(x, x.sum())

[0.02500003 0.02500248 0.02500643 0.02500462 0.02500046 0.02500002
 0.02500857 0.02500002 0.02500005 0.77497732] 1.0


In [77]:
import pyLDAvis.sklearn
 
# Enable inline rendering of pyLDAvis output in the notebook.
pyLDAvis.enable_notebook()
# Prepare the visualization from the fitted LDA model, the document-term
# matrix and the vectorizer, using t-SNE (mds='tsne') for the topic layout.
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
# Last expression of the cell: displays the panel.
panel