In [1]:
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health."

# compile sample documents into a list
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

In [2]:
doc_set

['Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.',
 'My mother spends a lot of time driving my brother around to baseball practice.',
 'Some health experts suggest that driving may cause increased tension and blood pressure.',
 'I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.',
 'Health professionals say that brocolli is good for your health.']

In [5]:
import nltk
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

In [6]:
tokenizer

RegexpTokenizer(pattern='\\w+', gaps=False, discard_empty=True, flags=56)

In [7]:
raw = doc_a.lower()
tokens = tokenizer.tokenize(raw)

print(tokens)

['brocolli', 'is', 'good', 'to', 'eat', 'my', 'brother', 'likes', 'to', 'eat', 'good', 'brocolli', 'but', 'not', 'my', 'mother']


In [9]:
from stop_words import get_stop_words

# create English stop words list
en_stop = get_stop_words('en')

In [10]:
# remove stop words from tokens
stopped_tokens = [i for i in tokens if not i in en_stop]

print(stopped_tokens)

['brocolli', 'good', 'eat', 'brother', 'likes', 'eat', 'good', 'brocolli', 'mother']


In [13]:
from nltk.stem.porter import PorterStemmer

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

In [15]:
# stem token
texts = [p_stemmer.stem(i) for i in stopped_tokens]

print(texts)

['brocolli', 'good', 'eat', 'brother', 'like', 'eat', 'good', 'brocolli', 'mother']


In [87]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()
    
# create sample documents
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health." 

# compile sample documents into a list
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

# list for tokenized documents in loop
texts = []

# loop through document list
for i in doc_set:
    
    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    
    # add tokens to list
    texts.append(stemmed_tokens)

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)
print(ldamodel.print_topics(num_topics=2, num_words=2))

[(0, '0.081*"brocolli" + 0.081*"good"'), (1, '0.072*"drive" + 0.043*"pressur"')]


In [47]:
ldamodel[corpus]

<gensim.interfaces.TransformedCorpus at 0x23a0b024d68>

In [77]:
ldamodel[dictionary.doc2bow(doc_set)]

[(0, 0.5), (1, 0.5)]

In [78]:
ldamodel.get_document_topics(corpus)

<gensim.interfaces.TransformedCorpus at 0x23a0af4fe80>

In [88]:
topics = ldamodel.show_topics()
for topic in topics:
    print (topic)

(0, '0.081*"brocolli" + 0.081*"good" + 0.059*"mother" + 0.059*"brother" + 0.058*"eat" + 0.058*"health" + 0.035*"like" + 0.035*"well" + 0.035*"feel" + 0.035*"never"')
(1, '0.072*"drive" + 0.043*"pressur" + 0.043*"health" + 0.043*"suggest" + 0.043*"increas" + 0.043*"caus" + 0.043*"expert" + 0.043*"may" + 0.043*"blood" + 0.043*"tension"')


In [80]:
# get_document_topics for a document with a single token 'user'
text = ["brocolli"]
bow = dictionary.doc2bow(text)
print ( ldamodel.get_document_topics(bow))
### get_document_topics [(0, 0.74568415806946331), (1, 0.25431584193053675)]

# get_term_topics for the token user
print ( ldamodel.get_term_topics("brocolli", minimum_probability=0.000001))
### get_term_topics:  [(0, 0.1124525558321441), (1, 0.006876306738765027)]

[(0, 0.74783873126217204), (1, 0.25216126873782796)]
[(0, 0.098311991610900665), (1, 0.0030628003386857896)]


In [89]:
#The document clearly states that it returns 
#topic distribution for the given document bow, as a list of (topic_id, topic_probability) 2-tuples.
prob = ldamodel.get_document_topics(corpus)
for p in prob:
    print(p)

[(0, 0.94671405083268068), (1, 0.053285949167319269)]
[(0, 0.06012114415014623), (1, 0.93987885584985376)]
[(0, 0.051489640858633966), (1, 0.94851035914136606)]
[(0, 0.95234283258192631), (1, 0.047657167418073701)]
[(0, 0.92130572388614085), (1, 0.078694276113859152)]


In [90]:
top_prob_tuple = 

In [91]:
top_prob_tuple

[((0,
   '0.081*"brocolli" + 0.081*"good" + 0.059*"mother" + 0.059*"brother" + 0.058*"eat" + 0.058*"health" + 0.035*"like" + 0.035*"well" + 0.035*"feel" + 0.035*"never"'),
  [(0, 0.94671431221401936), (1, 0.05328568778598073)]),
 ((0,
   '0.081*"brocolli" + 0.081*"good" + 0.059*"mother" + 0.059*"brother" + 0.058*"eat" + 0.058*"health" + 0.035*"like" + 0.035*"well" + 0.035*"feel" + 0.035*"never"'),
  [(0, 0.060128471572510375), (1, 0.93987152842748967)]),
 ((0,
   '0.081*"brocolli" + 0.081*"good" + 0.059*"mother" + 0.059*"brother" + 0.058*"eat" + 0.058*"health" + 0.035*"like" + 0.035*"well" + 0.035*"feel" + 0.035*"never"'),
  [(0, 0.051467187193641684), (1, 0.94853281280635826)]),
 ((0,
   '0.081*"brocolli" + 0.081*"good" + 0.059*"mother" + 0.059*"brother" + 0.058*"eat" + 0.058*"health" + 0.035*"like" + 0.035*"well" + 0.035*"feel" + 0.035*"never"'),
  [(0, 0.95234137640183314), (1, 0.047658623598166763)]),
 ((0,
   '0.081*"brocolli" + 0.081*"good" + 0.059*"mother" + 0.059*"brother" + 0.