In [1]:
# Five short sample sentences that form the toy corpus for this walkthrough.
# NOTE: the spelling "brocolli" is part of the data; the recorded outputs in
# this notebook contain the same spelling, so it is intentionally left as-is.
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health."

# compile sample documents into a list (order a..e is the document order used later)
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

In [2]:
from nltk.tokenize import RegexpTokenizer
# Tokenizer driven by the pattern \w+ (runs of word characters).
# The token list recorded further down contains no punctuation tokens.
tokenizer = RegexpTokenizer(r'\w+')

In [3]:
# Step-by-step demo on the first document only: lowercase it, then split it
# into tokens with the regexp tokenizer.
raw = doc_a.lower()
tokens = tokenizer.tokenize(raw)

In [4]:
from stop_words import get_stop_words

# create English stop words list ('en' selects the English list);
# it is used below to filter the tokens
en_stop = get_stop_words('en')

In [5]:
# Keep only the tokens that do not appear in the English stop-word list.
stopped_tokens = [token for token in tokens if token not in en_stop]

In [6]:
from nltk.stem.porter import PorterStemmer

# Create p_stemmer of class PorterStemmer; this single instance is reused
# to stem every token below.
p_stemmer = PorterStemmer()

In [7]:
# Stem each remaining token, i.e. strip word endings such as "s", "ies", ...
texts = [p_stemmer.stem(token) for token in stopped_tokens]
# Bare expression so the notebook displays the stemmed token list.
texts

['brocolli',
 'good',
 'eat',
 'brother',
 'like',
 'eat',
 'good',
 'brocolli',
 'mother']

In [8]:
### Build the dictionary (token <-> integer id mapping)
from gensim import corpora, models

# corpora.Dictionary takes a list of documents, each being a list of string
# tokens, so the flat token list of the single demo document is wrapped in an
# outer list. The guard keeps this cell idempotent: an unconditional
# `texts = [texts]` would add one more nesting level on every re-run of the
# cell and hand Dictionary lists instead of string tokens.
if not texts or isinstance(texts[0], str):
    texts = [texts]
dictionary = corpora.Dictionary(texts)



In [9]:
# Show the token -> integer id mapping held by the dictionary.
print(dictionary.token2id)

{'brocolli': 0, 'good': 1, 'eat': 2, 'brother': 3, 'like': 4, 'mother': 5}


In [10]:
# Convert every tokenized document into its bag-of-words representation.
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in texts]

In [11]:
### Build the bag-of-words
# Display the first document's entry: a list of (token id, count) pairs.
corpus[0]

[(0, 2), (1, 2), (2, 2), (3, 1), (4, 1), (5, 1)]

In [12]:
import gensim

# Train LDA with 3 topics on the bag-of-words corpus.
# `passes` is the number of passes made over the corpus during training.
# LDA training is stochastic, so the seed is fixed to make the topics
# reproducible across runs; output recorded from an unseeded run will differ.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    passes=20,
    random_state=42,
)

In [13]:
import pprint
# Pretty-print all 3 topics, showing 3 weighted words for each.
pprint.pprint(ldamodel.print_topics(num_topics=3, num_words=3))

[(0, '0.167*"brocolli" + 0.167*"eat" + 0.167*"good"'),
 (1, '0.167*"brother" + 0.167*"mother" + 0.167*"like"'),
 (2, '0.212*"eat" + 0.212*"good" + 0.212*"brocolli"')]


In [14]:
### Now extract 2 topics instead
# Same corpus and dictionary as before; only the topic count changes.
# The seed is fixed because LDA training is stochastic; output recorded from
# an unseeded run will differ.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=2,
    id2word=dictionary,
    passes=20,
    random_state=42,
)

In [15]:
# Pretty-print both topics, showing 3 weighted words for each.
pprint.pprint(ldamodel.print_topics(num_topics=2, num_words=3))

[(0, '0.209*"eat" + 0.209*"brocolli" + 0.209*"good"'),
 (1, '0.167*"good" + 0.167*"brocolli" + 0.167*"eat"')]


In [16]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

# Consolidated pipeline: the tokenizer, stop-word list and stemmer are created
# again here with the same calls as in the step-by-step cells earlier on.
tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()
    
# create sample documents (the same five sentences as at the top of the notebook)
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health."

# compile sample documents into a list
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

# One list of stemmed tokens per document, in the order of doc_set.
texts = []

# Hoisted out of the loop: membership tests against a set are constant-time,
# and the filtering result is the same as testing against the list.
stop_word_set = set(en_stop)

# loop through document list
for doc in doc_set:

    # clean and tokenize document string
    raw = doc.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [token for token in tokens if token not in stop_word_set]

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]

    # add tokens to list
    texts.append(stemmed_tokens)



In [17]:
# Display the preprocessed token lists, one inner list per document.
texts

[['brocolli',
  'good',
  'eat',
  'brother',
  'like',
  'eat',
  'good',
  'brocolli',
  'mother'],
 ['mother',
  'spend',
  'lot',
  'time',
  'drive',
  'brother',
  'around',
  'basebal',
  'practic'],
 ['health',
  'expert',
  'suggest',
  'drive',
  'may',
  'caus',
  'increas',
  'tension',
  'blood',
  'pressur'],
 ['often',
  'feel',
  'pressur',
  'perform',
  'well',
  'school',
  'mother',
  'never',
  'seem',
  'drive',
  'brother',
  'better'],
 ['health', 'profession', 'say', 'brocolli', 'good', 'health']]

In [18]:
# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

# convert tokenized documents into a document-term matrix
# (one list of (token id, count) pairs per document)
corpus = [dictionary.doc2bow(text) for text in texts]

# generate LDA model with 2 topics; `passes` is the number of passes over the
# corpus during training. The seed is fixed because LDA training is stochastic;
# output recorded from an unseeded run will differ.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=2,
    id2word=dictionary,
    passes=20,
    random_state=42,
)

In [19]:
# Pretty-print both topics, showing 4 weighted words for each.
pprint.pprint(ldamodel.print_topics(num_topics=2, num_words=4))

[(0, '0.112*"brocolli" + 0.112*"good" + 0.082*"health" + 0.080*"eat"'),
 (1, '0.075*"drive" + 0.053*"brother" + 0.053*"mother" + 0.053*"pressur"')]


In [22]:
# Display the bag-of-words corpus: per document, a list of (token id, count) pairs.
corpus

[[(0, 2), (1, 2), (2, 2), (3, 1), (4, 1), (5, 1)],
 [(3, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(9, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1)],
 [(3, 1),
  (5, 1),
  (9, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1)],
 [(0, 1), (1, 1), (13, 2), (30, 1), (31, 1)]]

In [26]:
# Display the dictionary as a plain dict of integer id -> token.
dict(dictionary)

{0: 'brocolli',
 1: 'good',
 2: 'eat',
 3: 'brother',
 4: 'like',
 5: 'mother',
 6: 'spend',
 7: 'lot',
 8: 'time',
 9: 'drive',
 10: 'around',
 11: 'basebal',
 12: 'practic',
 13: 'health',
 14: 'expert',
 15: 'suggest',
 16: 'may',
 17: 'caus',
 18: 'increas',
 19: 'tension',
 20: 'blood',
 21: 'pressur',
 22: 'often',
 23: 'feel',
 24: 'perform',
 25: 'well',
 26: 'school',
 27: 'never',
 28: 'seem',
 29: 'better',
 30: 'profession',
 31: 'say'}