In [1]:
# Show which Python interpreter this kernel is running on, so the
# environment that produced the outputs below is recorded.
import sys
print(sys.executable) 

C:\Users\BernerDaniel\Anaconda3\envs\Python3.6\python.exe


In [2]:
import numpy as np
import guidedlda

In [3]:
# Load the NYT sample corpus that ships with guidedlda: the count matrix and
# the vocabulary whose positions line up with the matrix columns.
X = guidedlda.datasets.load_data(guidedlda.datasets.NYT)
vocab = guidedlda.datasets.load_vocab(guidedlda.datasets.NYT)

# Reverse lookup: word -> position in the vocabulary (used to build seed topics).
word2id = {term: position for position, term in enumerate(vocab)}

In [7]:
# Dimensions of the count matrix: (number of documents, vocabulary size).
X.shape

(8447, 3012)

In [9]:
# Sum of every cell in the count matrix, i.e. the total number of word
# occurrences in the corpus (reported as n_words by the model log).
X.sum()

1221626

In [10]:
# Baseline: ordinary LDA with no seed words, for comparison with the
# guided run further down. refresh=20 logs the log likelihood every 20 iterations.
model = guidedlda.GuidedLDA(
    n_topics=5,
    n_iter=100,
    random_state=7,
    refresh=20,
)
model.fit(X)

INFO:guidedlda:n_documents: 8447
INFO:guidedlda:vocab_size: 3012
INFO:guidedlda:n_words: 1221626
INFO:guidedlda:n_topics: 5
INFO:guidedlda:n_iter: 100
INFO:guidedlda:<0> log likelihood: -11489265
INFO:guidedlda:<20> log likelihood: -9844667
INFO:guidedlda:<40> log likelihood: -9694223
INFO:guidedlda:<60> log likelihood: -9642506
INFO:guidedlda:<80> log likelihood: -9617962
INFO:guidedlda:<99> log likelihood: -9604031


<guidedlda.guidedlda.GuidedLDA at 0x26a6d3f1828>

In [15]:
# Print the highest-weighted words of every topic learned by the model.
topic_word = model.topic_word_
n_top_words = 10

# Convert the vocabulary to an array once; it does not change between topics,
# so rebuilding it inside the loop is wasted work.
vocab_arr = np.array(vocab)

for i, topic_dist in enumerate(topic_word):
    # argsort is ascending, so reverse it and keep the first n_top_words
    # indices to get the most probable words in descending order.
    top_ids = np.argsort(topic_dist)[::-1][:n_top_words]
    topic_words = vocab_arr[top_ids]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: game play team win season player second point start victory
Topic 1: company percent market price business sell pay plan executive buy
Topic 2: play life man music place turn book woman write thing
Topic 3: official government state political leader states issue member case country
Topic 4: school child city family problem student life program group state


In [18]:
# Guided LDA with seed topics: each inner list holds the words that should
# pull one topic toward a chosen theme.
seed_topic_list = [
    ['game', 'team', 'win', 'player', 'season', 'second', 'victory'],
    ['percent', 'company', 'market', 'price', 'sell', 'business', 'stock', 'share'],
    ['music', 'write', 'art', 'book', 'world', 'film'],
    ['political', 'government', 'leader', 'official', 'state', 'country'],
    ['american', 'case', 'law', 'police', 'charge', 'officer', 'kill', 'arrest', 'lawyer'],
]

# One topic per seed list, plus one extra topic that receives no seed words.
model = guidedlda.GuidedLDA(n_topics=len(seed_topic_list) + 1,
                            n_iter=100,
                            random_state=7,
                            refresh=20)

# Map vocabulary id -> index of the seed list the word came from.
# A word missing from the vocabulary raises KeyError here.
seed_topics = {
    word2id[seed_word]: topic_id
    for topic_id, seed_words in enumerate(seed_topic_list)
    for seed_word in seed_words
}

model.fit(X, seed_topics=seed_topics, seed_confidence=0.15)

INFO:guidedlda:n_documents: 8447
INFO:guidedlda:vocab_size: 3012
INFO:guidedlda:n_words: 1221626
INFO:guidedlda:n_topics: 6
INFO:guidedlda:n_iter: 100
INFO:guidedlda:<0> log likelihood: -11763504
INFO:guidedlda:<20> log likelihood: -9809707
INFO:guidedlda:<40> log likelihood: -9703426
INFO:guidedlda:<60> log likelihood: -9666706
INFO:guidedlda:<80> log likelihood: -9647040
INFO:guidedlda:<99> log likelihood: -9638138


<guidedlda.guidedlda.GuidedLDA at 0x26a6d41b6d8>

In [19]:
# Print the highest-weighted words of every topic learned by the model.
n_top_words = 10
topic_word = model.topic_word_

# The vocabulary is the same for every topic, so build the array once
# instead of on each loop iteration.
vocab_arr = np.array(vocab)

for i, topic_dist in enumerate(topic_word):
    # Indices of the n_top_words largest weights, most probable first.
    top_ids = np.argsort(topic_dist)[::-1][:n_top_words]
    topic_words = vocab_arr[top_ids]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: game play team win season player second start point lose
Topic 1: company percent market price business sell sale buy pay executive
Topic 2: play life man place music turn thing write book old
Topic 3: political government leader state official issue country states support vote
Topic 4: police official case charge man lawyer kill court officer report
Topic 5: school child program study student father family group problem public


In [20]:
# Document-topic breakdown: for the first few documents, show the topic with
# the highest weight next to the document's five most frequent words.
doc_topic = model.transform(X)

# Build the vocabulary array once rather than once per document.
vocab_arr = np.array(vocab)

for i in range(9):
    # argsort is ascending; reversing it puts the highest counts first.
    # Slicing the array avoids copying the whole index vector into a list.
    top_word_ids = X[i, :].argsort()[::-1][:5]
    print("top topic: {} Document: {}".format(doc_topic[i].argmax(),
                                              ', '.join(vocab_arr[top_word_ids])))



top topic: 5 Document: plant, increase, food, increasingly, animal
top topic: 3 Document: explain, life, country, citizen, nation
top topic: 2 Document: thing, solve, problem, machine, carry
top topic: 2 Document: company, authority, opera, artistic, director
top topic: 4 Document: wife, rape, husband, file, state
top topic: 4 Document: partner, lawyer, attorney, client, indict
top topic: 2 Document: roll, place, soon, treat, rating
top topic: 5 Document: city, drug, program, commission, report
top topic: 1 Document: company, comic, series, case, executive


In [21]:
# The next step lightens the model object so the pickled file is smaller.
# It deletes some matrices stored inside the model.
# You will still be able to call model.transform(X) the same way as before.
# You won't be able to call model.fit_transform(X_new) afterwards.
model.purge_extra_matrices()

In [22]:
# Save the model for production or for running later.
# The standard-library pickle module is sufficient on Python 3 (it already
# uses the C implementation), so no compatibility shim is required.
import pickle

# Single place to change where the serialized model lives.
MODEL_PATH = 'guidedlda_model.pickle'

with open(MODEL_PATH, 'wb') as file_handle:
    pickle.dump(model, file_handle)

# Load the model for prediction.
# NOTE: unpickling can execute arbitrary code; only load files that you
# created yourself or that come from a trusted source.
with open(MODEL_PATH, 'rb') as file_handle:
    model = pickle.load(file_handle)
doc_topic = model.transform(X)

