In [None]:
# import gensim
# from gensim.utils import simple_preprocess
# from gensim.parsing.preprocessing import STOPWORDS
# from nltk.stem import WordNetLemmatizer, SnowballStemmer
# from nltk.stem.porter import *
# import numpy as np
# import nltk

# np.random.seed(2018)
# nltk.download('wordnet')

In [None]:
# import logging
# from time import time
# np.random.seed(2018)

# #logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# #lda_model = gensim.models.LdaModel(bow_corpus, num_topics=10, id2word=dictionary,  eval_every=5, iterations = 1000, alpha='auto', gamma_threshold=0.01)
# t0 = time()
# #lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, iterations = 1000, passes = 2)
# lda_model = gensim.models.LdaModel(bow_corpus, num_topics=10, id2word=dictionary,  eval_every=5, iterations = 1000, alpha='auto', gamma_threshold=0.01)
# print("done in %fs" % (time() - t0))

# for idx, topic in lda_model.print_topics(-1):
#     print('Topic: {} \nWords: {}'.format(idx, topic))

In [1]:
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import numpy as np
from tqdm import tqdm
from time import time

# Seed NumPy's global random number generator so repeated runs start from the
# same state.
# NOTE(review): presumably gensim's LDA draws from this global generator when
# no explicit random_state is given — confirm against the gensim docs.
np.random.seed(2018)

# One shared English Snowball stemmer, reused by lemmatize_stemming() below.
stemmer = SnowballStemmer('english')

def lemmatize_stemming(text):
    """Normalize a single token by lemmatizing it and then stemming it.

    The token is lemmatized as a verb (``pos='v'``) using a newly created
    ``WordNetLemmatizer``; the resulting lemma is then passed through the
    module-level ``stemmer``.

    Args:
        text: the token to normalize.

    Returns:
        Whatever ``stemmer.stem`` returns for the lemmatized token.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Turn a raw document into a list of normalized tokens.

    The text is tokenized with ``simple_preprocess`` (called with
    ``min_len = 4``), tokens found in ``STOPWORDS`` are dropped, and every
    remaining token is passed through ``lemmatize_stemming``.

    Args:
        text: the raw document text.

    Returns:
        A list with one normalized token per surviving input token, in the
        order the tokenizer produced them.
    """
    return [
        lemmatize_stemming(word)
        for word in simple_preprocess(text, min_len = 4)
        if word not in STOPWORDS
    ]

def get_topic_model(file_name, num_topics = 10, use_tfidf = True, encoding = None, random_state = None):
    """Train an LDA topic model on a text file holding one document per line.

    Every line of ``file_name`` is run through ``preprocess``; a dictionary is
    built from the resulting token lists and pruned with ``filter_extremes``;
    the documents are converted to bag-of-words vectors (optionally re-weighted
    with TF-IDF) and used to fit ``models.LdaModel``. The training time is
    printed to stdout.

    Args:
        file_name: path of the text file to read.
        num_topics: number of topics passed to ``LdaModel``.
        use_tfidf: when true, the bag-of-words corpus is wrapped in a
            ``TfidfModel`` transformation before training.
        encoding: text encoding handed to ``open``. ``None`` (the default)
            keeps the platform default, which is what this function always
            used; pass ``'utf-8'`` to read UTF-8 data identically on every
            platform.
        random_state: forwarded to ``LdaModel``. ``None`` (the default) keeps
            the previous behaviour; pass an int to make a single model
            reproducible regardless of what was trained before it.

    Returns:
        The trained ``models.LdaModel``.

    Raises:
        ValueError: if no token survives dictionary pruning, so there is
            nothing to train on.
    """
    with open(file_name, 'r', encoding=encoding) as f:
        documents = f.readlines()

    processed_docs = [preprocess(text) for text in tqdm(documents)]

    dictionary = corpora.Dictionary(processed_docs)
    dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

    # Small or empty inputs can lose their entire vocabulary to the pruning
    # thresholds above; fail here with an actionable message rather than
    # later inside model training.
    if len(dictionary) == 0:
        raise ValueError(
            "no terms left in %r after filter_extremes(no_below=15, no_above=0.5); "
            "cannot train a topic model" % (file_name,)
        )

    corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    if use_tfidf:
        tfidf = models.TfidfModel(corpus)
        corpus = tfidf[corpus]

    t0 = time()
    #lda_model = models.LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary)
    lda_model = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        eval_every=5,
        iterations=1000,
        alpha='auto',
        gamma_threshold=0.01,
        random_state=random_state,
    )
    print("done in %fs" % (time() - t0))

    return lda_model



# Input files to model, each handled independently: the first path is at the
# top level and the other two sit under categories/.
files = ['rest_review_sample_100000.txt', 'categories/Chinese_pos.txt', 'categories/Chinese_neg.txt']
for f in files:
    # Train one model per file with get_topic_model's default settings.
    model = get_topic_model(f)
    # NOTE(review): -1 presumably asks gensim for every topic — confirm
    # against the print_topics documentation.
    for idx, topic in model.print_topics(-1):
        print('Topic: {} Word: {}'.format(idx, topic))

100%|█████████████████████████████████████████████████████████████████████████| 100181/100181 [03:02<00:00, 549.29it/s]


done in 97.105996s
Topic: 0 Word: 0.010*"dish" + 0.009*"rice" + 0.009*"pork" + 0.009*"roll" + 0.008*"thai" + 0.008*"spici" + 0.008*"noodl" + 0.008*"shrimp" + 0.007*"ramen" + 0.007*"chicken"
Topic: 1 Word: 0.056*"pizza" + 0.026*"crust" + 0.019*"giada" + 0.012*"pepperoni" + 0.012*"sub" + 0.012*"zupa" + 0.008*"flatbread" + 0.008*"slice" + 0.007*"summerlin" + 0.007*"knot"
Topic: 2 Word: 0.064*"sushi" + 0.020*"roll" + 0.016*"donut" + 0.015*"ayc" + 0.013*"rudi" + 0.009*"katsu" + 0.009*"nigiri" + 0.008*"nutella" + 0.007*"firehous" + 0.006*"margherita"
Topic: 3 Word: 0.015*"tapa" + 0.010*"manag" + 0.009*"min" + 0.007*"minut" + 0.006*"applebe" + 0.006*"host" + 0.006*"tabl" + 0.006*"ask" + 0.005*"wait" + 0.005*"hair"
Topic: 4 Word: 0.045*"crepe" + 0.019*"tonkatsu" + 0.014*"wifi" + 0.013*"monta" + 0.013*"grass" + 0.012*"papa" + 0.009*"smashburg" + 0.009*"chelsea" + 0.008*"escal" + 0.007*"peruvian"
Topic: 5 Word: 0.023*"jelli" + 0.021*"boba" + 0.015*"wont" + 0.014*"thit" + 0.013*"nuong" + 0.012*"c

100%|███████████████████████████████████████████████████████████████████████████| 27830/27830 [00:45<00:00, 613.62it/s]


done in 28.546391s
Topic: 0 Word: 0.051*"ramen" + 0.030*"boba" + 0.009*"belli" + 0.008*"udon" + 0.007*"cevich" + 0.007*"shabu" + 0.006*"noodl" + 0.006*"slider" + 0.006*"georg" + 0.006*"fan"
Topic: 1 Word: 0.006*"order" + 0.005*"chicken" + 0.005*"lunch" + 0.005*"time" + 0.005*"place" + 0.005*"come" + 0.004*"chines" + 0.004*"great" + 0.004*"rice" + 0.004*"restaur"
Topic: 2 Word: 0.012*"noodl" + 0.008*"beef" + 0.007*"soup" + 0.007*"pork" + 0.007*"sauc" + 0.007*"dumpl" + 0.007*"spici" + 0.006*"chicken" + 0.006*"lamb" + 0.006*"fri"
Topic: 3 Word: 0.016*"casino" + 0.010*"belli" + 0.009*"vega" + 0.008*"mango" + 0.008*"noodl" + 0.007*"great" + 0.006*"best" + 0.006*"place" + 0.006*"sandwich" + 0.006*"david"
Topic: 4 Word: 0.027*"great" + 0.019*"servic" + 0.017*"love" + 0.016*"price" + 0.015*"fast" + 0.014*"best" + 0.013*"chines" + 0.013*"deliveri" + 0.013*"place" + 0.012*"staff"
Topic: 5 Word: 0.008*"chines" + 0.007*"taiwanes" + 0.007*"restaur" + 0.006*"place" + 0.006*"authent" + 0.006*"shave" 

100%|███████████████████████████████████████████████████████████████████████████| 10912/10912 [00:20<00:00, 543.53it/s]


done in 12.257504s
Topic: 0 Word: 0.008*"servic" + 0.008*"wait" + 0.007*"coupon" + 0.006*"order" + 0.006*"server" + 0.006*"minut" + 0.005*"good" + 0.005*"beer" + 0.005*"waitress" + 0.005*"time"
Topic: 1 Word: 0.008*"chicken" + 0.006*"rice" + 0.006*"order" + 0.006*"noodl" + 0.006*"tast" + 0.006*"soup" + 0.006*"fri" + 0.005*"dish" + 0.005*"beef" + 0.005*"like"
Topic: 2 Word: 0.009*"rude" + 0.008*"servic" + 0.006*"order" + 0.006*"custom" + 0.006*"come" + 0.006*"terribl" + 0.006*"server" + 0.006*"wait" + 0.005*"time" + 0.005*"groupon"
Topic: 3 Word: 0.005*"time" + 0.005*"place" + 0.005*"restaur" + 0.004*"go" + 0.004*"servic" + 0.004*"buffet" + 0.004*"chines" + 0.004*"say" + 0.004*"come" + 0.004*"order"
Topic: 4 Word: 0.008*"buffet" + 0.008*"comp" + 0.007*"slow" + 0.005*"profession" + 0.005*"choy" + 0.005*"servic" + 0.005*"execut" + 0.005*"space" + 0.005*"clue" + 0.005*"select"
Topic: 5 Word: 0.011*"panda" + 0.010*"express" + 0.009*"smoke" + 0.008*"bake" + 0.008*"good" + 0.006*"better" + 0.