In [None]:
import numpy as np
import lda
import lda.datasets

In [None]:
# Document-term matrix: indexed as X[document, word] further down.
X = lda.datasets.load_reuters()
print("\n".join(["type(X): {}".format(type(X)),
                 "shape: {}\n".format(X.shape)]))

# Vocabulary: vocab[word_id] is the word counted in column word_id.
vocab = lda.datasets.load_reuters_vocab()
print("\n".join(["type(vocab): {}".format(type(vocab)),
                 "len(vocab): {}\n".format(len(vocab))]))

# One title per story, looked up by document index.
titles = lda.datasets.load_reuters_titles()
print("\n".join(["type(titles): {}".format(type(titles)),
                 "len(titles): {}\n".format(len(titles))]))

# Lookup tables in both directions between a word and its position in vocab.
id_to_word = dict(enumerate(vocab))
word_to_id = {word: idx for idx, word in enumerate(vocab)}

### Our data is a document-term count matrix, similar to TF-IDF but holding raw word counts instead of weights

In [None]:
# Pick one (document, word) pair to inspect.
doc_id, word_id = 0, 3117

print("doc id: {} word id: {}".format(doc_id, word_id))
print("-- count: {}".format(X[doc_id, word_id]))
print("-- word : {}".format(vocab[word_id]))
print("-- title  : {}".format(titles[doc_id]))

# Column indices with a non-zero count in this document, mapped back to words.
nonzero_word_ids = X[doc_id].nonzero()[0]
doc_words = [id_to_word[word_idx] for word_idx in nonzero_word_ids]
print("-- words : {}".format(doc_words))

### Fitting the model

In [None]:
# Number of topics requested from the model; reused when listing topics below.
N_TOPICS = 20

# random_state is pinned so repeated runs start from the same seed.
model = lda.LDA(
    n_topics=N_TOPICS,
    n_iter=500,
    random_state=1,
)
model.fit(X)

### For each topic, the model outputs a probability distribution over the words in the vocabulary

In [None]:
# Topic-word matrix of the fitted model; a row is selected per topic number.
topic_word_distribution = model.topic_word_
print("\n".join(["type(topic_word): {}".format(type(topic_word_distribution)),
                 "shape: {}".format(topic_word_distribution.shape)]))

### Each row of the matrix is a valid probability distribution (its probabilities sum to 1)

In [None]:
# Sum across the word axis: one total per row of the topic-word matrix.
topic_word_distribution.sum(axis=1)

### Exercise 3.1
Write a function that, for a given topic number `topic_number`, returns the `top_k` most probable words of that topic (ordered from most to least probable)

In [None]:
def get_top_k_words(topic_word_prob_distribution, topic_number, top_k, vocabulary=None):
    """Return the `top_k` most probable words of one topic.

    Parameters
    ----------
    topic_word_prob_distribution : array-like, indexed as [topic, word]
        Per-topic word probabilities; row `topic_number` is the one ranked.
    topic_number : int
        Zero-based row index of the topic to describe.
    top_k : int
        How many words to return. If it exceeds the number of words, all
        words are returned.
    vocabulary : sequence of str, optional
        Maps a word index to its word. Defaults to the notebook-level
        `vocab` loaded earlier, so existing three-argument calls keep working.

    Returns
    -------
    list of str
        Words ordered from most to least probable.

    Raises
    ------
    ValueError
        If `top_k` is negative.
    """
    if top_k < 0:
        raise ValueError("top_k must be non-negative, got {}".format(top_k))
    if vocabulary is None:
        # Fall back to the vocabulary defined at notebook level.
        vocabulary = vocab

    topic_probs = np.asarray(topic_word_prob_distribution)[topic_number]
    # Sort on the negated probabilities to get descending order; the stable
    # sort keeps equally probable words in their original vocabulary order.
    ranked_word_ids = np.argsort(-topic_probs, kind="stable")[:top_k]
    topic_words = [vocabulary[word_id] for word_id in ranked_word_ids]
    return topic_words

In [None]:
# Reference answer for the five most probable words of topic 3.
expected_top_words = ['yeltsin', 'russian', 'russia', 'president', 'kremlin']
assert list(get_top_k_words(topic_word_distribution, 3, 5)) == expected_top_words
print('done')

In [None]:
# List the ten most probable words of every topic.
# The printed topic label is 1-based (index + 1); the lookup itself is 0-based.
for topic_idx in range(N_TOPICS):
    top_words = get_top_k_words(topic_word_distribution, topic_idx, 10)
    print('*Topic {}\n- {}'.format(topic_idx + 1, ' '.join(top_words)))

### For each document, the model outputs a probability distribution over the topics

In [None]:
# Document-topic matrix of the fitted model; indexed by document below.
doc_topic = model.doc_topic_
print("\n".join(["type(doc_topic): {}".format(type(doc_topic)),
                 "shape: {}".format(doc_topic.shape)]))

In [None]:
# Row totals of the document-topic matrix, shown for the first ten documents.
doc_topic.sum(axis=1)[:10]

In [None]:
# Run the fitted model's transform on the first three documents and show
# the shape of what it returns.
first_three_docs = X[:3]
model.transform(first_three_docs).shape

In [None]:
# For the first ten documents, report the topic with the highest probability.
# The printed topic is the zero-based argmax index into the document's row.
for doc_idx in range(10):
    dominant_topic = doc_topic[doc_idx].argmax()
    report = "doc: {} topic: {}\n{}...".format(doc_idx, dominant_topic, titles[doc_idx])
    print(report)