# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Latent-Dirichlet-Allocation" data-toc-modified-id="Latent-Dirichlet-Allocation-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Latent Dirichlet Allocation</a></div>

In [1]:
# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
# 4. magic to enable retina (high resolution) plots
# https://gist.github.com/minrk/3301035
%matplotlib inline
%load_ext watermark
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

%watermark -a 'Ethen' -d -t -v -p numpy,scipy,pandas,matplotlib,sklearn

Ethen 2017-11-28 21:13:39 

CPython 3.5.2
IPython 6.2.1

numpy 1.13.3
scipy 1.0.0
pandas 0.20.3
matplotlib 2.1.0
sklearn 0.19.1


# Latent Dirichlet Allocation

In [2]:
from sklearn.datasets import fetch_20newsgroups

categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space']

dataset = fetch_20newsgroups(
    subset = 'all', categories = categories,
    shuffle = True, random_state = 42)

X = dataset.data
y = dataset.target

In [3]:
from gensim.corpora import Dictionary

docs = [x.lower().split() for x in X]

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

Using TensorFlow backend.


In [4]:
from gensim.models import LdaModel

# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
# temp = dictionary[0]  # This is only to "load" the dictionary.
# id2word = dictionary.id2token

model = LdaModel(corpus=corpus, id2word=dictionary, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)

In [5]:
top_topics = model.top_topics(corpus, topn=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)
top_topics

Average topic coherence: -1.5282.


[([(0.01171794697448247, 'or'),
   (0.011116718234493452, 'can'),
   (0.011097833130223802, 'would'),
   (0.01097761960891468, 'as'),
   (0.010938531529557802, 'what'),
   (0.01089389961893316, 'they'),
   (0.0097290204553938751, 'we'),
   (0.0083837276268509901, 'do'),
   (0.0078476279482066966, 'your'),
   (0.0078375440167425953, 'there'),
   (0.0078114036142310885, 'an'),
   (0.0078099478829628055, "don't"),
   (0.0072464807973690228, 'at'),
   (0.0067497057260154742, 'so'),
   (0.00671241233577668, 'my'),
   (0.0066963885795746185, 'about'),
   (0.0066581780756464837, 'just'),
   (0.0064984385667549351, 'some'),
   (0.0061233407229750398, 'think'),
   (0.0060786441966135894, 'one')],
  -0.7673092978049878),
 ([(0.014688841725788486, 'as'),
   (0.014374182877034588, 'he'),
   (0.013675221318565798, 'was'),
   (0.010512103374662045, 'by'),
   (0.0095215057077302698, 'or'),
   (0.0093433809140619842, 'his'),
   (0.0091696597160859494, 'they'),
   (0.0089523995548556091, 'who'),
   (0.

In [6]:
from sklearn.datasets import fetch_20newsgroups

# we only want to keep the body of the documents!
remove = ('headers', 'footers', 'quotes')

# fetch train and test data
newsgroups_train = fetch_20newsgroups(subset='train', remove=remove)
newsgroups_test = fetch_20newsgroups(subset='test', remove=remove)

# a list of 18,846 cleaned news in string format
# only keep letters & make them all lower case
news = [raw.lower() for raw in
        newsgroups_train.data + newsgroups_test.data]
news[0]

'i was wondering if anyone out there could enlighten me on this car i saw\nthe other day. it was a 2-door sports car, looked to be from the late 60s/\nearly 70s. it was called a bricklin. the doors were really small. in addition,\nthe front bumper was separate from the rest of the body. this is \nall i know. if anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.'

In [7]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

n_topics = 20 # number of topics
n_iter = 5 # number of iterations

# vectorizer: ignore English stopwords & words that occur less than 5 times
cvectorizer = CountVectorizer(min_df=5, stop_words='english')
X_train = cvectorizer.fit_transform(news)

# train an LDA model
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=n_iter, verbose=1, n_jobs=-1)
X_topics = lda.fit_transform(X_train)



iteration: 1 of max_iter: 5
iteration: 2 of max_iter: 5
iteration: 3 of max_iter: 5
iteration: 4 of max_iter: 5
iteration: 5 of max_iter: 5


In [15]:
threshold = 0.8
_idx = np.amax(X_topics, axis = 1) > threshold  # idx of doc that above the threshold
X_topics_subset = X_topics[_idx]
topic = np.argmax(X_topics_subset, axis = 1)
X_topics_subset.shape

(429, 20)

In [22]:
import os

# export the item factors/embeddings and the corresponding metadata into .tsv format
# here we can more than 1 column as the metadata, thus we can include a header, but
# if we were to only have 1 column as the metadata, then no header is allowed in tensorboard
# tf_embedding_dir = 'tf_embedding'
# if not os.path.isdir(tf_embedding_dir):
#     os.mkdir(tf_embedding_dir)
    
# metadata_file = os.path.join(tf_embedding_dir, 'metadata.tsv')
# embedding_file = os.path.join(tf_embedding_dir, 'topics.tsv')
metadata_file = 'metadata.tsv'
embedding_file = 'topics.tsv'

np.savetxt(metadata_file, topic, delimiter = '\t')
# df_diag_desc[['friendly_desc', 'cluster']].to_csv(metadata_file, index = False, sep = '\t')
np.savetxt(embedding_file, X_topics_subset, delimiter = '\t')

In [24]:
from tf_utils import launch_tensorboard

log_dir = './logs/' # os.path.join(tf_embedding_dir, './logs/')
launch_tensorboard(
    embedding_file = os.path.join('..', embedding_file),
    log_dir = log_dir,
    metadata_file = os.path.join('..', metadata_file))

FileNotFoundError: [Errno 2] No such file or directory: '../topics.tsv'

In [35]:
from sklearn.manifold import TSNE

# a t-SNE model
# angle value close to 1 means sacrificing accuracy for speed
# pca initializtion usually leads to better results 
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')

# 20-D -> 2-D
tsne_lda = tsne_model.fit_transform(X_topics)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 18846 samples in 0.019s...
[t-SNE] Computed neighbors for 18846 samples in 9.358s...
[t-SNE] Computed conditional probabilities for sample 1000 / 18846
[t-SNE] Computed conditional probabilities for sample 2000 / 18846
[t-SNE] Computed conditional probabilities for sample 3000 / 18846
[t-SNE] Computed conditional probabilities for sample 4000 / 18846
[t-SNE] Computed conditional probabilities for sample 5000 / 18846
[t-SNE] Computed conditional probabilities for sample 6000 / 18846
[t-SNE] Computed conditional probabilities for sample 7000 / 18846
[t-SNE] Computed conditional probabilities for sample 8000 / 18846
[t-SNE] Computed conditional probabilities for sample 9000 / 18846
[t-SNE] Computed conditional probabilities for sample 10000 / 18846
[t-SNE] Computed conditional probabilities for sample 11000 / 18846
[t-SNE] Computed conditional probabilities for sample 12000 / 18846
[t-SNE] Computed conditional probabilities for sam

In [36]:
# export the item factors/embeddings and the corresponding metadata into .tsv format
# here we can more than 1 column as the metadata, thus we can include a header, but
# if we were to only have 1 column as the metadata, then no header is allowed in tensorboard
tf_embedding_dir = 'tf_embedding'
if not os.path.isdir(tf_embedding_dir):
    os.mkdir(tf_embedding_dir)
    
metadata_file = os.path.join(tf_embedding_dir, 'metadata.tsv')
embedding_file = os.path.join(tf_embedding_dir, 'item_factors.tsv')

df_diag_desc[['friendly_desc', 'cluster']].to_csv(metadata_file, index = False, sep = '\t')
np.savetxt(embedding_file, als.item_factors_, delimiter = '\t')

18846