In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
import numpy as np
from collections import Counter
from nltk.stem.wordnet import WordNetLemmatizer
import string

In [2]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

In [15]:
import re

def clean(doc):
    """
    Convert a document into a list of words (also remove stop words and punctuation)
    """
    regex = r'\w+'
    words = re.findall(regex, doc)
    words = filter(lambda word: word not in stop and len(word) > 2 and len(re.findall(r'\d+', word)) == 0, words)
    words = map(lambda word: lemma.lemmatize(word.lower()), words)
    return words

In [4]:
def build_corpus_features(cleaned_docs, min_count=2, max_frequency=0.94):
    """
    Build a dictionary from words in dictonary to their indices.
    """
    docs = len(cleaned_docs)
    ans = Counter()
    cnt = Counter()
    freq = Counter()
    
    for doc in cleaned_docs:
        for word in set(doc):
            freq[word] += 1
        for word in doc:
            cnt[word] += 1
            
    for word, c in cnt.iteritems():
        if freq[word] > min_count and freq[word] / docs <= max_frequency:
            ans[word] = cnt[word]
    return dict(ans)

In [5]:
def featurize(cleaned_doc, index_to_word):
    features = np.zeros(len(index_to_word), dtype=np.int)
    cnt = Counter(cleaned_doc)
    ind = 0
    for w in index_to_word:
        features[ind] = cnt[w]
        ind += 1
    return features

In [6]:
def display_topics(model, index_to_word, word_to_display):
    words = np.array(index_to_word)
    top = words[np.argsort(model.components_, axis=1)[:, -word_to_display:]]
    print top.shape
    for topic in top:
        print '\n'
        for w in topic:
            print w

In [7]:
trainset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
train_data = trainset.data

In [16]:
%%time
cleaned_docs = [clean(doc) for doc in train_data]

CPU times: user 10.4 s, sys: 113 ms, total: 10.5 s
Wall time: 10.6 s


In [17]:
%%time
corpus_dict = build_corpus_features(cleaned_docs)

CPU times: user 1.19 s, sys: 50.1 ms, total: 1.24 s
Wall time: 1.22 s


In [18]:
%%time
index_to_word = zip(*sorted(corpus_dict.items(), key=lambda x: x[1]))[0][-5000:]

featurized_docs = np.array([featurize(doc, index_to_word) for doc in cleaned_docs])

CPU times: user 34.2 s, sys: 602 ms, total: 34.8 s
Wall time: 35.2 s


In [20]:
%%time
lda = LatentDirichletAllocation(n_topics=20, learning_method='online', max_iter=20)
lda.fit(featurized_docs)

CPU times: user 1min 41s, sys: 1.23 s, total: 1min 43s
Wall time: 1min 45s


In [21]:
display_topics(lda, index_to_word, 21)

(20, 21)


audio
great
all
red
smith
cable
ranger
asking
good
best
edu
for
excellent
box
condition
shipping
sell
offer
price
sale
new


year
surface
contest
lunar
flight
cost
data
rocket
station
moon
shuttle
orbit
system
mission
earth
launch
satellite
air
nasa
the
space


but
home
think
back
well
went
say
day
know
time
going
would
last
people
they
one
and
team
said
the
year


posting
contact
site
available
faq
request
pub
modem
fax
computer
email
anonymous
ftp
address
send
internet
list
mail
com
edu
max


division
played
playoff
pick
shot
baseball
san
chicago
point
run
year
fan
goal
hockey
league
season
the
win
play
player
game


human
but
christ
truth
church
faith
weapon
belief
law
word
life
bible
religion
believe
people
say
one
jesus
christian
the
god


support
cpu
char
tape
int
ibm
ide
rom
the
floppy
port
bus
controller
mac
hard
system
do
scsi
card
disk
drive


the
use
system
point
way
know
well
mean
this
could
may
many
like
question
what
think
make
key
people
one
would


number
gen