In [1]:
import numpy as np
import lda
import lda.datasets

In [2]:
X = lda.datasets.load_reuters()
print("type(X): {}".format(type(X)))
print("shape: {}\n".format(X.shape))

# the vocab
vocab = lda.datasets.load_reuters_vocab()
print("type(vocab): {}".format(type(vocab)))
print("len(vocab): {}\n".format(len(vocab)))

# titles for each story
titles = lda.datasets.load_reuters_titles()
print("type(titles): {}".format(type(titles)))
print("len(titles): {}\n".format(len(titles)))

id_to_word = dict(zip(range(len(vocab)), vocab))

type(X): <class 'numpy.ndarray'>
shape: (395, 4258)

type(vocab): <class 'tuple'>
len(vocab): 4258

type(titles): <class 'tuple'>
len(titles): 395



### Our data is a count-vectorized matrix similar to TF-IDF (but without the word weighting)

In [3]:
doc_id = 0
word_id = 3117

print("doc id: {} word id: {}".format(doc_id, word_id))
print("-- count: {}".format(X[doc_id, word_id]))
print("-- word : {}".format(vocab[word_id]))
print("-- title  : {}".format(titles[doc_id]))
print("-- words : {}".format([id_to_word[t] for t in X[doc_id].nonzero()[0]]))

doc id: 0 word id: 3117
-- count: 2
-- word : heir-to-the-throne
-- title  : 0 UK: Prince Charles spearheads British royal revolution. LONDON 1996-08-20
-- words : ['church', 'years', 'told', 'year', 'charles', 'catholic', 'since', 'family', 'british', 'made', 'tuesday', 'million', 'prince', 'heart', 'roman', 'monday', 'state', 'diana', 'royal', 'including', 'week', 'take', 'queen', 'next', 'long', 'head', 'ago', 'members', 'century', 'whose', 'four', 'among', 'britain', 'part', 'children', '1992', 'england', 'princess', 'end', 'throne', 'go', 'law', 'help', 'role', 'every', 'palace', 'divorce', 'personal', 'secret', 'popular', 'marry', 'daughter', 'reported', 'daily', 'child', 'brother', 'leading', 'open', 'duke', 'already', 'english', 'spokeswoman', 'include', 'main', 'press', 'wants', 'admitted', 'traditional', 'newspapers', 'something', 'monarchy', 'final', 'policy', 'committee', 'play', 'pay', 'involved', 'refused', 'biggest', 'move', 'marrying', 'fashion', 'details', 'monarch', '

### Fitting the model

In [4]:
N_TOPICS = 20
model = lda.LDA(n_topics=N_TOPICS, n_iter=500, random_state=1)
model.fit(X)

INFO:lda:n_documents: 395
INFO:lda:vocab_size: 4258
INFO:lda:n_words: 84010
INFO:lda:n_topics: 20
INFO:lda:n_iter: 500
INFO:lda:<0> log likelihood: -1051748
INFO:lda:<10> log likelihood: -719800
INFO:lda:<20> log likelihood: -699115
INFO:lda:<30> log likelihood: -689370
INFO:lda:<40> log likelihood: -684918
INFO:lda:<50> log likelihood: -681322
INFO:lda:<60> log likelihood: -678979
INFO:lda:<70> log likelihood: -676598
INFO:lda:<80> log likelihood: -675383
INFO:lda:<90> log likelihood: -673316
INFO:lda:<100> log likelihood: -672761
INFO:lda:<110> log likelihood: -671320
INFO:lda:<120> log likelihood: -669744
INFO:lda:<130> log likelihood: -669292
INFO:lda:<140> log likelihood: -667940
INFO:lda:<150> log likelihood: -668038
INFO:lda:<160> log likelihood: -667429
INFO:lda:<170> log likelihood: -666475
INFO:lda:<180> log likelihood: -665562
INFO:lda:<190> log likelihood: -664920
INFO:lda:<200> log likelihood: -664979
INFO:lda:<210> log likelihood: -664722
INFO:lda:<220> log likelihood: -6

<lda.lda.LDA at 0x10f625198>

### The topic model for each word outputs a probability of it belonging to one of the topics

In [5]:
topic_word_distribution = model.topic_word_
print("type(topic_word): {}".format(type(topic_word_distribution)))
print("shape: {}".format(topic_word_distribution.shape))

type(topic_word): <class 'numpy.ndarray'>
shape: (20, 4258)


### The matrix rows are a valid probability distribution (all probs sum to 1)

In [7]:
np.sum(topic_word_distribution, axis=1)

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.])

### Exercise 2.1
Write a function, that for a given topic number 't', outputs 'k' top words (by probability)

In [8]:
def get_top_k_words(topic_word_prob_distribution, topic_number, top_k):
    t1 = np.argsort(-topic_word_prob_distribution[topic_number])
    topic_words = np.array(vocab)[t1][:top_k]
    return topic_words

In [9]:
assert list(get_top_k_words(topic_word_distribution, 3, 5)) == ['yeltsin', 'russian', 
                                                                'russia', 'president', 'kremlin']
print('done')

done


In [10]:
for i in range(N_TOPICS):
    topic_words = get_top_k_words(topic_word_distribution, i, 10)
    print('*Topic {}\n- {}'.format(i+1, ' '.join(topic_words)))

*Topic 1
- government british minister west group letters party former million against
*Topic 2
- church first during people political country ceremony visit government died
*Topic 3
- elvis king wright fans presley life concert death first mark
*Topic 4
- yeltsin russian russia president kremlin michael romania operation orthodox moscow
*Topic 5
- pope vatican paul surgery pontiff john hospital rome trip mass
*Topic 6
- family police miami versace cunanan funeral home church kennedy city
*Topic 7
- south simpson born york white north african black united former
*Topic 8
- order church mother successor since election religious head nuns nirmala
*Topic 9
- charles prince diana royal queen king parker bowles camilla marriage
*Topic 10
- film france french against actor bardot paris magazine poster festival
*Topic 11
- germany german war nazi christian letter book jews scientology soviet
*Topic 12
- east prize peace timor belo quebec indonesia nobel award minister
*Topic 13
- n't told lif

### The topic model for each document outputs a probability of it belonging to one of the topics

In [11]:
doc_topic = model.doc_topic_
print("type(doc_topic): {}".format(type(doc_topic)))
print("shape: {}".format(doc_topic.shape))

type(doc_topic): <class 'numpy.ndarray'>
shape: (395, 20)


In [12]:
np.sum(doc_topic, axis=1)[:10]

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.])

In [13]:
for n in range(10):
    topic_most_pr = doc_topic[n].argmax()
    print("doc: {} topic: {}\n{}...".format(n,
                                            topic_most_pr,
                                            titles[n]))

doc: 0 topic: 8
0 UK: Prince Charles spearheads British royal revolution. LONDON 1996-08-20...
doc: 1 topic: 1
1 GERMANY: Historic Dresden church rising from WW2 ashes. DRESDEN, Germany 1996-08-21...
doc: 2 topic: 14
2 INDIA: Mother Teresa's condition said still unstable. CALCUTTA 1996-08-23...
doc: 3 topic: 8
3 UK: Palace warns British weekly over Charles pictures. LONDON 1996-08-25...
doc: 4 topic: 14
4 INDIA: Mother Teresa, slightly stronger, blesses nuns. CALCUTTA 1996-08-25...
doc: 5 topic: 14
5 INDIA: Mother Teresa's condition unchanged, thousands pray. CALCUTTA 1996-08-25...
doc: 6 topic: 14
6 INDIA: Mother Teresa shows signs of strength, blesses nuns. CALCUTTA 1996-08-26...
doc: 7 topic: 14
7 INDIA: Mother Teresa's condition improves, many pray. CALCUTTA, India 1996-08-25...
doc: 8 topic: 14
8 INDIA: Mother Teresa improves, nuns pray for "miracle". CALCUTTA 1996-08-26...
doc: 9 topic: 8
9 UK: Charles under fire over prospect of Queen Camilla. LONDON 1996-08-26...
