# Table of Contents
 <p>

In [1]:
# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
# 4. magic to enable retina (high resolution) plots
# https://gist.github.com/minrk/3301035
%matplotlib inline
%load_ext watermark
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

%watermark -a 'Ethen' -d -t -v -p numpy,scipy,pandas,matplotlib,sklearn

Ethen 2017-11-28 09:28:07 

CPython 3.5.2
IPython 6.2.1

numpy 1.13.3
scipy 1.0.0
pandas 0.20.3
matplotlib 2.1.0
sklearn 0.19.1


In [2]:
from sklearn.datasets import fetch_20newsgroups

categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space']

dataset = fetch_20newsgroups(
    subset = 'all', categories = categories,
    shuffle = True, random_state = 42)

X = dataset.data
y = dataset.target

In [6]:
from gensim.corpora import Dictionary

docs = [x.lower().split() for x in X]

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [10]:
from gensim.models import LdaModel

# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
# temp = dictionary[0]  # This is only to "load" the dictionary.
# id2word = dictionary.id2token

model = LdaModel(corpus=corpus, id2word=dictionary, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)

In [24]:
?LdaModel

In [14]:
top_topics = model.top_topics(corpus, topn=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)
top_topics

Average topic coherence: -1.1262.


[([(0.03544063838627326, '>'),
   (0.011985762985512708, 'as'),
   (0.010055557590645286, 'or'),
   (0.009087870206220457, 'what'),
   (0.0089292508008324313, 'an'),
   (0.0083754499011250982, 'your'),
   (0.0082053105482205214, 'can'),
   (0.0078740331206845187, 'by'),
   (0.0074055846386851584, 'they'),
   (0.0073623357842517879, 'we'),
   (0.0072473975379090108, 'there'),
   (0.0069151693962863471, 'do'),
   (0.0065759373987215042, 'one'),
   (0.0065626611693073633, 'no'),
   (0.0061877606435094944, 'article'),
   (0.006105038840475749, '>>'),
   (0.0057245827985628068, 'from'),
   (0.0054949192073781685, 'would'),
   (0.0054771681315719722, 'about'),
   (0.0054662820369748098, 'all')],
  -0.81550797552226073),
 ([(0.012956485174748757, 'your'),
   (0.011990909261206162, 'my'),
   (0.01172099857280046, '>'),
   (0.011650923538018556, 'do'),
   (0.011577601626015038, 'what'),
   (0.010096643533714325, '|'),
   (0.0089648747856639621, 'or'),
   (0.0089305138064307427, '--'),
   (0.008