In [1]:
# Five short sample documents used throughout this notebook.
# The spelling "brocolli" is kept exactly as written: the recorded outputs of
# the later cells are based on these literal strings.
doc_a, doc_b, doc_c, doc_d, doc_e = (
    "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.",
    "My mother spends a lot of time driving my brother around to baseball practice.",
    "Some health experts suggest that driving may cause increased tension and blood pressure.",
    "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.",
    "Health professionals say that brocolli is good for your health.",
)

In [2]:
# Collect the sample documents into a single list, in a..e order.
doc_set = [
    doc_a,
    doc_b,
    doc_c,
    doc_d,
    doc_e,
]

In [3]:
# Show the raw documents before any preprocessing.
print(doc_set)

['Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.', 'My mother spends a lot of time driving my brother around to baseball practice.', 'Some health experts suggest that driving may cause increased tension and blood pressure.', 'I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.', 'Health professionals say that brocolli is good for your health.']


In [4]:
# Lower-case every document so that "Brocolli" and "brocolli" end up as the
# same token later on.
doc_lowers = []
for doc in doc_set:
    doc_lowers.append(doc.lower())
print(doc_lowers)

['brocolli is good to eat. my brother likes to eat good brocolli, but not my mother.', 'my mother spends a lot of time driving my brother around to baseball practice.', 'some health experts suggest that driving may cause increased tension and blood pressure.', 'i often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.', 'health professionals say that brocolli is good for your health.']


In [5]:
import string

# Remove ASCII punctuation from every lower-cased document.
# Characters are filtered one by one instead of calling
# str.translate(None, string.punctuation): that two-argument form exists only
# for Python 2 byte strings and raises TypeError for unicode text.
punctuation_chars = set(string.punctuation)
doc_no_punctuation = [
    "".join(ch for ch in doc if ch not in punctuation_chars)
    for doc in doc_lowers
]
print(doc_no_punctuation)

['brocolli is good to eat my brother likes to eat good brocolli but not my mother', 'my mother spends a lot of time driving my brother around to baseball practice', 'some health experts suggest that driving may cause increased tension and blood pressure', 'i often feel pressure to perform well at school but my mother never seems to drive my brother to do better', 'health professionals say that brocolli is good for your health']


In [6]:
import nltk

# Split each punctuation-free document into a list of word tokens.
doc_tokens = []
for doc in doc_no_punctuation:
    doc_tokens.append(nltk.word_tokenize(doc))
print(doc_tokens)

[['brocolli', 'is', 'good', 'to', 'eat', 'my', 'brother', 'likes', 'to', 'eat', 'good', 'brocolli', 'but', 'not', 'my', 'mother'], ['my', 'mother', 'spends', 'a', 'lot', 'of', 'time', 'driving', 'my', 'brother', 'around', 'to', 'baseball', 'practice'], ['some', 'health', 'experts', 'suggest', 'that', 'driving', 'may', 'cause', 'increased', 'tension', 'and', 'blood', 'pressure'], ['i', 'often', 'feel', 'pressure', 'to', 'perform', 'well', 'at', 'school', 'but', 'my', 'mother', 'never', 'seems', 'to', 'drive', 'my', 'brother', 'to', 'do', 'better'], ['health', 'professionals', 'say', 'that', 'brocolli', 'is', 'good', 'for', 'your', 'health']]


In [7]:
# Stop-word removal followed by Porter stemming
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
ps = PorterStemmer()

# Read the English stop-word list once and keep it as a set, so the membership
# test below is a constant-time lookup rather than a corpus read per token.
english_stopwords = set(stopwords.words("english"))

# doc_stemmed gets one list of stemmed, non-stop-word tokens per document.
doc_stemmed = []
for one_doc in doc_tokens:
    doc_filtered = [w for w in one_doc if w not in english_stopwords]
    doc_stem = [ps.stem(w) for w in doc_filtered]
    doc_stemmed.append(doc_stem)
print(doc_stemmed)

[['brocolli', 'good', 'eat', 'brother', u'like', 'eat', 'good', 'brocolli', 'mother'], ['mother', u'spend', 'lot', 'time', u'drive', 'brother', 'around', u'basebal', u'practic'], ['health', u'expert', 'suggest', u'drive', 'may', u'caus', u'increas', 'tension', 'blood', u'pressur'], ['often', 'feel', u'pressur', 'perform', 'well', 'school', 'mother', 'never', u'seem', 'drive', 'brother', 'better'], ['health', u'profession', 'say', 'brocolli', 'good', 'health']]


In [8]:
# NOTE(review): `models` is imported here, but the cells visible in this
# notebook reach the LDA class through `gensim.models` instead.
from gensim import corpora, models

# Build the vocabulary from the stemmed documents: every distinct token is
# assigned an integer id (see dictionary.token2id further down).
dictionary = corpora.Dictionary(doc_stemmed)

In [9]:
# Iterating the dictionary yields token ids; look each one up to show its token.
for token_id in dictionary:
    print(dictionary[token_id])

often
feel
profession
drive
say
pressur
basebal
seem
expert
perform
suggest
better
health
lot
tension
good
around
may
mother
school
blood
never
increas
eat
practic
brocolli
like
well
brother
caus
time
spend


In [10]:
# Spot-check two id -> token lookups, then the vocabulary size.
print(dictionary[31])
print(dictionary[0])
print(len(dictionary))

profession
brocolli
32


In [12]:
# Bare expression: the notebook echoes the Dictionary object's default repr,
# which shows only the class and memory address, not the vocabulary.
dictionary

<gensim.corpora.dictionary.Dictionary at 0x1a07bbc350>

In [13]:
# corpora.Dictionary() walks through the texts and gives every distinct token
# its own integer id, gathering word counts and related statistics on the way.
# The token -> id mapping is available as dictionary.token2id.
print(dictionary.token2id)

{u'often': 23, u'feel': 24, u'profession': 31, u'drive': 8, u'say': 30, u'pressur': 18, u'basebal': 7, u'seem': 29, u'expert': 14, u'perform': 28, u'suggest': 16, u'better': 27, u'health': 19, u'lot': 9, u'tension': 13, u'good': 1, u'around': 6, u'may': 15, u'mother': 4, u'school': 22, u'blood': 20, u'never': 25, u'increas': 21, u'eat': 5, u'practic': 12, u'brocolli': 0, u'like': 2, u'well': 26, u'brother': 3, u'caus': 17, u'time': 10, u'spend': 11}


In [14]:
# doc2bow() turns one tokenized document into a bag-of-words vector.
# corpus therefore holds one vector per document, and every vector is a list
# of (term id, term frequency) tuples.
corpus = list(map(dictionary.doc2bow, doc_stemmed))

In [15]:
# corpus[0] is the bag-of-words vector of the first document, doc_a.
# Each tuple is a (term id, term frequency) pair: since dictionary.token2id
# maps brocolli to id 0, a leading (0, 2) means brocolli occurs twice in doc_a.
# doc2bow() only lists terms that actually occur, so a term that is missing
# from a document is simply absent from that document's vector.
first_doc_bow = corpus[0]
print(first_doc_bow)

[(0, 2), (1, 2), (2, 1), (3, 1), (4, 1), (5, 2)]


In [16]:
# num_topics: how many topics the LDA model should generate.
#   The document set is small, so only three topics are requested.
# id2word: the dictionary built earlier, used to map term ids back to strings.
# passes: the number of laps the model takes through the corpus.
#   More passes generally give a better fit, but many passes can be slow on a
#   very large corpus.
# random_state: fixed seed. LDA training is randomised, so without it the
#   topics change from run to run and the printed results cannot be reproduced.

#test with passes = 20 or not
import gensim

ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    passes=20,
    random_state=42,
)

In [17]:
# print_topics() gives one (topic id, description) entry per topic.
# With num_words=3 each description lists the three most probable words of
# that topic together with their weights.
# Even though the document set is small, the resulting model is reasonable.
topic_summaries = ldamodel.print_topics(num_topics=3, num_words=3)
print(topic_summaries)

[(0, u'0.056*"pressur" + 0.056*"eat" + 0.056*"drive"'), (1, u'0.068*"brother" + 0.068*"mother" + 0.068*"drive"'), (2, u'0.141*"health" + 0.080*"good" + 0.080*"brocolli"')]


In [19]:
# Adjusting the number of topics and passes is important for getting a good
# result. Two topics seem like a better fit for these documents.
# random_state fixes the seed so the printed topics are reproducible.
# Note: this rebinds `ldamodel`, replacing the three-topic model trained above.

ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=2,
    id2word=dictionary,
    passes=20,
    random_state=42,
)
print(ldamodel.print_topics(num_topics=2, num_words=4))

[(0, u'0.067*"drive" + 0.066*"pressur" + 0.039*"well" + 0.039*"feel"'), (1, u'0.087*"good" + 0.087*"brocolli" + 0.063*"brother" + 0.063*"mother"')]


In [20]:
# Topic mixture of the first document under the current model, as
# (topic id, weight) pairs.
first_doc_topics = ldamodel[corpus[0]]
print(first_doc_topics)

[(0, 0.05276112178666139), (1, 0.94723887821333863)]


In [1]:
# NOTE(review): this cell relies on `corpus` and `ldamodel` created by the
# cells above. Run on a fresh kernel (execution count 1) it raises NameError
# unless those cells are executed first.
# The five sample documents are redefined here with the same text as in the
# first cell.
doc_a, doc_b, doc_c, doc_d, doc_e = (
    "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.",
    "My mother spends a lot of time driving my brother around to baseball practice.",
    "Some health experts suggest that driving may cause increased tension and blood pressure.",
    "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.",
    "Health professionals say that brocolli is good for your health.",
)

# Print the index and inferred topic mixture of every document.
for i, doc_bow in enumerate(corpus):
    print("%s %s" % (i, ldamodel[doc_bow]))

NameError: name 'corpus' is not defined