In [1]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import nltk

# Seed NumPy's global random number generator so later cells that draw from it
# start from a known state when the notebook is run top to bottom.
np.random.seed(2018)
# Fetch the WordNet corpus, which WordNetLemmatizer (used in the preprocessing
# helpers below) needs at lemmatization time.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\geesi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [13]:
# Build both normalisers once; they are reused for every token of every document.
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

def lemmatize_stemming(text):
    """Reduce a single token to its stemmed verb lemma.

    The token is first lemmatized as a verb (``pos='v'``) with WordNet, and
    the resulting lemma is then stemmed with the English Snowball stemmer.

    Parameters
    ----------
    text : str
        One token, as produced by the tokenizer in ``preprocess``.

    Returns
    -------
    str
        The normalised token, e.g. ``'wanted'`` -> ``'want'`` and
        ``'cookie'`` -> ``'cooki'``.
    """
    # Reuse the shared lemmatizer instead of constructing a new
    # WordNetLemmatizer for each of the millions of tokens in the corpus.
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

def preprocess(text):
    """Turn one raw document into a list of normalised tokens.

    The text is tokenized with gensim's ``simple_preprocess`` (tokens shorter
    than 4 characters are discarded via ``min_len=4``), gensim stopwords are
    removed, and each remaining token is passed through
    ``lemmatize_stemming``.

    Parameters
    ----------
    text : str
        The raw document.

    Returns
    -------
    list of str
        Normalised tokens in their original order; duplicates are kept.
    """
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text, min_len=4)
        if word not in gensim.parsing.preprocessing.STOPWORDS
    ]

In [14]:
sample_file = 'rest_review_sample_100000.txt'
documents = []  # remains an empty list if the file cannot be opened

# Each line of the file is treated as one document; readlines() keeps the
# trailing newline on every entry.
with open(sample_file, 'r') as review_file:
    documents = review_file.readlines()

# Use the second document as a worked example of the preprocessing pipeline.
doc_sample = documents[1]
words = doc_sample.split(' ')  # naive whitespace split, for comparison only

print('original document: ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['I', 'wanted', 'ice', 'cream', 'so', 'my', 'husband', 'pulled', 'off', 'the', 'highway', 'at', 'this', "Culver's.", 'We', 'opted', 'to', 'get', 'the', 'flavor', 'of', 'the', 'day', '(cookie', 'dough', 'craving).', 'I', 'was', 'disappointed', 'to', 'find', 'that', 'it', 'was', 'a', 'chocolate', 'custard', 'with', 'TONS', 'of', 'cookie', 'dough', 'chunks', 'because', 'I', 'like', 'custard', 'and', 'not', 'just', 'the', 'mix-ins,', 'etc.', 'but', 'it', 'was', 'overall', 'a', 'decent', 'sundae.', "I'll", 'just', 'stick', 'to', 'vanilla', 'next', 'time!\n']


 tokenized and lemmatized document: 
['want', 'cream', 'husband', 'pull', 'highway', 'culver', 'opt', 'flavor', 'cooki', 'dough', 'crave', 'disappoint', 'chocol', 'custard', 'ton', 'cooki', 'dough', 'chunk', 'like', 'custard', 'overal', 'decent', 'sunda', 'stick', 'vanilla', 'time']


In [18]:
from tqdm import tqdm

# Run the preprocessing pipeline over every document; tqdm wraps the input
# list so a progress bar is shown while the documents are consumed.
processed_docs = list(map(preprocess, tqdm(documents)))

100%|█████████████████████████████████████████████████████████████████████████| 100181/100181 [02:42<00:00, 614.89it/s]


In [20]:
# Build the token <-> integer id vocabulary from the preprocessed documents.
dictionary = gensim.corpora.Dictionary(documents=processed_docs)

# Prune the vocabulary in place:
#   no_below=15   drop tokens that appear in fewer than 15 documents
#   no_above=0.5  drop tokens that appear in more than 50% of documents
#   keep_n=100000 after the two filters, keep at most the 100000 most frequent tokens
dictionary.filter_extremes(keep_n=100000, no_above=0.5, no_below=15)

In [28]:
from gensim import corpora, models

# Bag-of-words representation: one doc2bow() result per preprocessed document,
# using the pruned dictionary built above.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

# Fit a TF-IDF model on the bag-of-words corpus, then apply it to that same
# corpus to obtain the TF-IDF weighted view used by the second LDA model.
tfidf = models.TfidfModel(corpus=bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [31]:
# Train a 10-topic LDA model on the raw bag-of-words counts.
#
# random_state is pinned on the model itself (same value as the np.random.seed
# call at the top of the notebook). Without it the model draws from NumPy's
# global RNG, so the topics would depend on which cells had been run, and how
# many times, before this one. Worker-process scheduling in LdaMulticore can
# still cause small run-to-run differences.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    random_state=2018,
)

# print_topics(-1) returns every topic as (topic_id, formatted word weights).
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.020*"good" + 0.014*"like" + 0.012*"great" + 0.011*"place" + 0.009*"sauc" + 0.009*"order" + 0.009*"salad" + 0.009*"pizza" + 0.008*"love" + 0.008*"chees"
Topic: 1 
Words: 0.030*"pizza" + 0.019*"order" + 0.014*"good" + 0.011*"breakfast" + 0.011*"time" + 0.010*"thai" + 0.009*"like" + 0.009*"come" + 0.008*"place" + 0.006*"great"
Topic: 2 
Words: 0.015*"good" + 0.012*"go" + 0.010*"time" + 0.010*"restaur" + 0.010*"place" + 0.010*"order" + 0.010*"like" + 0.010*"come" + 0.009*"servic" + 0.008*"chicken"
Topic: 3 
Words: 0.020*"order" + 0.018*"place" + 0.014*"time" + 0.014*"wait" + 0.014*"come" + 0.013*"servic" + 0.012*"good" + 0.011*"tabl" + 0.010*"like" + 0.010*"drink"
Topic: 4 
Words: 0.024*"burger" + 0.019*"place" + 0.016*"like" + 0.016*"good" + 0.011*"fri" + 0.009*"come" + 0.009*"friend" + 0.009*"beer" + 0.008*"nice" + 0.008*"great"
Topic: 5 
Words: 0.013*"place" + 0.012*"sushi" + 0.011*"restaur" + 0.010*"roll" + 0.009*"good" + 0.009*"menu" + 0.008*"like" + 0.008*"dish" + 

In [32]:
# Train a second 10-topic LDA model, this time on the TF-IDF weighted corpus.
#
# random_state is pinned on the model itself (same value as the np.random.seed
# call at the top of the notebook). Without it the model draws from NumPy's
# global RNG, whose state at this point depends on every stochastic cell run
# earlier in the session. Worker-process scheduling in LdaMulticore can still
# cause small run-to-run differences.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    random_state=2018,
)

# print_topics(-1) returns every topic as (topic_id, formatted word weights).
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.008*"thai" + 0.006*"noodl" + 0.005*"place" + 0.005*"chicken" + 0.005*"good" + 0.005*"order" + 0.005*"ramen" + 0.004*"great" + 0.004*"servic" + 0.004*"rice"
Topic: 1 Word: 0.006*"burger" + 0.005*"order" + 0.004*"time" + 0.004*"wait" + 0.004*"come" + 0.004*"minut" + 0.004*"place" + 0.004*"drink" + 0.004*"servic" + 0.004*"tabl"
Topic: 2 Word: 0.006*"buffet" + 0.004*"good" + 0.004*"great" + 0.003*"drink" + 0.003*"time" + 0.003*"place" + 0.003*"breakfast" + 0.003*"come" + 0.003*"like" + 0.003*"vega"
Topic: 3 Word: 0.006*"sushi" + 0.005*"burger" + 0.004*"pancak" + 0.004*"place" + 0.004*"good" + 0.004*"like" + 0.004*"fri" + 0.004*"chicken" + 0.004*"order" + 0.004*"roll"
Topic: 4 Word: 0.024*"pizza" + 0.008*"great" + 0.007*"sandwich" + 0.006*"crust" + 0.006*"love" + 0.006*"best" + 0.006*"awesom" + 0.006*"burger" + 0.006*"place" + 0.006*"good"
Topic: 5 Word: 0.013*"taco" + 0.006*"salsa" + 0.006*"burrito" + 0.005*"mexican" + 0.005*"great" + 0.005*"good" + 0.005*"chicken" + 0.005