https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [1]:
# import gensim
# from gensim.utils import simple_preprocess
# from gensim.parsing.preprocessing import STOPWORDS
# from nltk.stem import WordNetLemmatizer, SnowballStemmer
# from nltk.stem.porter import *
# import numpy as np
# import nltk

# np.random.seed(2018)
# nltk.download('wordnet')

In [2]:
# import logging
# from time import time
# np.random.seed(2018)

# #logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# #lda_model = gensim.models.LdaModel(bow_corpus, num_topics=10, id2word=dictionary,  eval_every=5, iterations = 1000, alpha='auto', gamma_threshold=0.01)
# t0 = time()
# #lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, iterations = 1000, passes = 2)
# lda_model = gensim.models.LdaModel(bow_corpus, num_topics=10, id2word=dictionary,  eval_every=5, iterations = 1000, alpha='auto', gamma_threshold=0.01)
# print("done in %fs" % (time() - t0))

# for idx, topic in lda_model.print_topics(-1):
#     print('Topic: {} \nWords: {}'.format(idx, topic))

In [3]:
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import numpy as np
from tqdm import tqdm
from time import time
import json

np.random.seed(2018)

# Shared NLP helpers: built once and reused for every token rather than
# constructing a new WordNetLemmatizer on each call.
# NOTE: the lemmatizer needs NLTK's WordNet data; the nltk.download('wordnet')
# call in the first cell of this notebook is commented out.
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

def lemmatize_stemming(text):
    """Normalize a single token to a stemmed lemma.

    The token is first lemmatized as a verb (``pos='v'``) and the lemma is
    then passed through the English Snowball stemmer.

    Args:
        text: the token to normalize.

    Returns:
        The value returned by ``stemmer.stem`` for the lemmatized token.
    """
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

def preprocess(text):
    """Turn one raw document into a list of normalized tokens.

    The text is tokenized with ``simple_preprocess`` (tokens shorter than
    4 characters are dropped via ``min_len=4``), tokens found in ``STOPWORDS``
    are discarded, and every remaining token goes through
    ``lemmatize_stemming``.

    Args:
        text: the raw document text.

    Returns:
        A list with the normalized tokens, in document order.
    """
    return [
        lemmatize_stemming(word)
        for word in simple_preprocess(text, min_len=4)
        if word not in STOPWORDS
    ]

def get_corpus(file_name, use_tfidf = True, no_below = 15, no_above = 0.5, keep_n = 100000, encoding = None):
    """Build a gensim corpus and dictionary from a text file with one document per line.

    Every line of the file is preprocessed with ``preprocess``, a
    ``corpora.Dictionary`` is built from the resulting token lists and pruned
    with ``filter_extremes``, and each document is converted to bag-of-words.

    Args:
        file_name: path of the text file to read; each line is one document.
        use_tfidf: when true, the bag-of-words corpus is wrapped in a
            ``models.TfidfModel`` fitted on that same corpus.
        no_below: forwarded to ``Dictionary.filter_extremes`` (default 15).
        no_above: forwarded to ``Dictionary.filter_extremes`` (default 0.5).
        keep_n: forwarded to ``Dictionary.filter_extremes`` (default 100000).
        encoding: text encoding passed to ``open``; ``None`` keeps the
            platform default, which is what the notebook used originally.

    Returns:
        A ``(corpus, dictionary)`` tuple. With ``use_tfidf`` the corpus is the
        object returned by ``tfidf[corpus]``, otherwise it is a list of
        bag-of-words documents.
    """
    with open(file_name, 'r', encoding=encoding) as f:
        documents = f.readlines()

    # tqdm only adds a progress bar; preprocessing dominates the run time.
    processed_docs = [preprocess(text) for text in tqdm(documents)]

    dictionary = corpora.Dictionary(processed_docs)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    if use_tfidf:
        tfidf = models.TfidfModel(corpus)
        corpus = tfidf[corpus]

    return (corpus, dictionary)

def get_lda_topic_model(corpus, dictionary, num_topics = 10, use_tfidf = True):
    """Train an LDA model on ``corpus`` and print how long training took.

    Args:
        corpus: the corpus handed to ``models.LdaModel``.
        dictionary: passed to the model as ``id2word``.
        num_topics: number of topics to fit.
        use_tfidf: accepted for call compatibility; this function does not
            read it.

    Returns:
        The trained ``models.LdaModel``.
    """
    # Configurations tried before this one, per the notebook's earlier notes:
    # LdaMulticore with defaults (marked "Bad!"), this same call with
    # iterations=1000, alpha='auto' without gamma_threshold (marked
    # "Good enough"), and LdaModel with defaults only.
    lda_options = dict(
        num_topics=num_topics,
        id2word=dictionary,
        eval_every=5,
        alpha='auto',
        gamma_threshold=0.01,
    )

    started = time()
    lda_model = models.LdaModel(corpus, **lda_options)
    elapsed = time() - started
    print("done in %fs" % elapsed)

    return lda_model

def get_lsi_topic_model(corpus, dictionary, num_topics = 10):
    """Train an LSI model on ``corpus`` and print how long training took.

    Args:
        corpus: the corpus handed to ``models.LsiModel``.
        dictionary: passed to the model as ``id2word``.
        num_topics: number of topics to fit.

    Returns:
        The trained ``models.LsiModel``.
    """
    started = time()
    lsi_model = models.LsiModel(corpus, id2word=dictionary, num_topics=num_topics)
    elapsed = time() - started
    print("done in %fs" % elapsed)

    return lsi_model

# Train one LDA model per review file. Each get_corpus call rebinds the shared
# `corpus` / `dictionary` names, so anything that needs a model's own corpus
# must use it before the next get_corpus call.
files = ['rest_review_sample_100000', 'categories/Chinese_pos', 'categories/Chinese_neg','categories/Chinese', 'categories/Mexican']
(corpus, dictionary) = get_corpus(files[0] + ".txt")
model_1 = get_lda_topic_model(corpus, dictionary, num_topics=20)
#model_1_1 = get_lsi_topic_model(corpus, dictionary, num_topics=20)
# u_mass coherence helper for model_1, built while `corpus` is still the
# files[0] corpus. The original line referenced the undefined names
# CoherenceModel, model and common_corpus and raised NameError.
cm = models.coherencemodel.CoherenceModel(model=model_1, corpus=corpus, coherence='u_mass')

corpus, dictionary = get_corpus(files[1] + ".txt")
model_2 = get_lda_topic_model(corpus, dictionary)

corpus, dictionary = get_corpus(files[2] + ".txt")
model_3 = get_lda_topic_model(corpus, dictionary)
# model_4 = get_topic_model(files[3] + ".txt")
# model_5 = get_topic_model(files[4] + ".txt")

100%|█████████████████████████████████████████████████████████████████████████| 100000/100000 [02:38<00:00, 629.87it/s]


done in 79.711755s


100%|███████████████████████████████████████████████████████████████████████████| 27804/27804 [00:39<00:00, 710.52it/s]


done in 24.066141s


100%|███████████████████████████████████████████████████████████████████████████| 10912/10912 [00:16<00:00, 649.54it/s]


done in 10.577471s


In [4]:
# Colour names assigned to topics by position; get_topic_json wraps around
# this list when a model has more topics than there are entries.
Stock_Colors = ['black', 'maroon', 'red', 'purple', 'fuchsia', 'green', 'lime', 'olive', 'navy', 'blue', 'teal', 'aqua',
               'black', 'maroon', 'red', 'purple', 'fuchsia', 'green', 'lime', 'olive']

def get_topic_json(model, title, compare_words):
    """Build a nested name/children dictionary describing a model's topics.

    Args:
        model: object whose ``show_topics(-1, formatted=False)`` returns a
            sequence of ``(topic_id, [(word, weight), ...])`` entries.
        title: value stored under ``'name'`` at the root of the result.
        compare_words: container of words; a word that is NOT in it is
            flagged with ``'new_word': True``.

    Returns:
        ``{'name': title, 'children': [...]}`` with one child per topic named
        ``'Topic <position>'``. Each topic child lists its words with:
        ``'value'`` - the weight divided by the weight of the topic's first
        word, formatted with two decimals; ``'color'`` - the topic's entry in
        ``Stock_Colors``; ``'new_word'`` - see ``compare_words``.
    """
    topic_out = {'name': title, 'children': []}

    for i, (_, words) in enumerate(model.show_topics(-1, formatted=False)):
        # Wrap around the palette instead of raising IndexError when the model
        # has more topics than Stock_Colors has entries.
        color = Stock_Colors[i % len(Stock_Colors)]
        # Weights are scaled relative to the first listed word; a topic with
        # no words simply produces an empty children list.
        max_weight = words[0][1] if words else None
        children = [
            {'name': word,
             'value': "{0:.2f}".format(weight / max_weight),
             'color': color,
             'new_word': word not in compare_words}
            for word, weight in words
        ]
        topic_out['children'].append({'name': 'Topic ' + str(i), 'children': children})

    return topic_out


def get_topic_words(model):
    """Count how often each word occurs across the model's topic word lists.

    Args:
        model: object whose ``show_topics(-1, formatted=False)`` returns a
            sequence of entries with the topic's ``(word, weight)`` pairs at
            index 1.

    Returns:
        A dict mapping each word to the number of times it was listed.
    """
    occurrences = {}
    for entry in model.show_topics(-1, formatted=False):
        for pair in entry[1]:
            word = pair[0]
            occurrences[word] = occurrences.get(word, 0) + 1

    return occurrences
    
# with open(files[0] + '.json', 'w') as f:
#     f.write(json.dumps(get_topic_json(model_1, '100000_Samples', {})))

# pos_words = get_topic_words(model_2)
# neg_words = get_topic_words(model_3)

# topic2 = get_topic_json(model_2, 'Positive', neg_words)
# topic3 = get_topic_json(model_3, 'Negative', pos_words)
# topic = {'name':'Chinese Restaurant Reviews', 'children':[topic2, topic3]}
# with open('compare' + '.json', 'w') as f:
#     f.write(json.dumps(topic))
    
# topic4 = get_topic_json(model_4, 'Chinese')
# topic5 = get_topic_json(model_5, 'Mexican')
# topic = {'name':'Chinese vs. Mexican', 'children':[topic2, topic3]}
# with open('compare_cn_mx' + '.json', 'w') as f:
#     f.write(json.dumps(topic))

In [5]:
# Print every topic of each trained LDA model, followed by a separator line.
# The LSI model (model_1_1) is not printed: its training line is commented out.
for trained_model in (model_1, model_2, model_3):
    for idx, topic in trained_model.print_topics(-1):
        print('Topic: {} Words: {}'.format(idx, topic))
    print('-------------------------------------------------------------------')
# for idx, topic in model_4.print_topics(-1):
#     print('Topic: {} Words: {}'.format(idx, topic))
# print('-------------------------------------------------------------------')
# for idx, topic in model_5.print_topics(-1):
#     print('Topic: {} Words: {}'.format(idx, topic))

Topic: 0 Words: 0.011*"truffl" + 0.011*"crepe" + 0.010*"chocol" + 0.007*"dessert" + 0.007*"concept" + 0.006*"cake" + 0.006*"kale" + 0.006*"appl" + 0.006*"wine" + 0.006*"lamb"
Topic: 1 Words: 0.052*"pancak" + 0.042*"breakfast" + 0.033*"egg" + 0.032*"coffe" + 0.021*"omelet" + 0.019*"benedict" + 0.019*"tot" + 0.018*"hash" + 0.018*"bagel" + 0.016*"toast"
Topic: 2 Words: 0.019*"crawfish" + 0.016*"blast" + 0.015*"cupcak" + 0.014*"reward" + 0.014*"yard" + 0.013*"scrambl" + 0.012*"alot" + 0.010*"uncomfort" + 0.010*"loco" + 0.010*"accept"
Topic: 3 Words: 0.008*"great" + 0.007*"good" + 0.007*"place" + 0.007*"burger" + 0.006*"love" + 0.005*"friend" + 0.005*"order" + 0.005*"fri" + 0.005*"like" + 0.005*"servic"
Topic: 4 Words: 0.011*"slider" + 0.009*"ketchup" + 0.007*"spoon" + 0.007*"ignor" + 0.007*"tasteless" + 0.007*"milk" + 0.006*"negat" + 0.006*"train" + 0.005*"brat" + 0.005*"horribl"
Topic: 5 Words: 0.009*"minut" + 0.008*"wait" + 0.007*"tabl" + 0.007*"manag" + 0.007*"ask" + 0.006*"custom" + 0.

In [None]:
# Sweep the number of LDA topics on the files[0] corpus and report the u_mass
# coherence of each fitted model.
(corpus, dictionary) = get_corpus(files[0] + ".txt")

for k in range(5, 20):  # k runs from 5 to 19; 20 itself is not tried
    model = get_lda_topic_model(corpus, dictionary, num_topics=k)
    cm = models.coherencemodel.CoherenceModel(model=model, corpus=corpus, coherence='u_mass')
    # The template uses {} placeholders, so it must go through str.format;
    # applying the % operator to it raised TypeError instead of printing.
    print("k={} coherence={}".format(k, cm.get_coherence()))

  5%|███▋                                                                       | 4853/100000 [00:06<02:09, 732.53it/s]

In [25]:
# Fit an HDP model on the files[1] corpus, then display 20 topics of the LDA
# model that the HDP model suggests.
corpus, dictionary = get_corpus("{}.txt".format(files[1]))
hdp_2 = models.HdpModel(corpus=corpus, id2word=dictionary)
l = hdp_2.suggested_lda_model()
l.show_topics(num_topics=20)

100%|███████████████████████████████████████████████████████████████████████████| 27804/27804 [00:39<00:00, 704.78it/s]


[(112,
  '0.000*"oxtail" + 0.000*"inconveni" + 0.000*"huge" + 0.000*"dip" + 0.000*"snow" + 0.000*"prevent" + 0.000*"chill" + 0.000*"truli" + 0.000*"imperi" + 0.000*"act"'),
 (120,
  '0.000*"storm" + 0.000*"what" + 0.000*"supper" + 0.000*"brocolli" + 0.000*"disgust" + 0.000*"japan" + 0.000*"spici" + 0.000*"vinegar" + 0.000*"arent" + 0.000*"tasteless"'),
 (142,
  '0.000*"true" + 0.000*"tradit" + 0.000*"length" + 0.000*"pull" + 0.000*"element" + 0.000*"habit" + 0.000*"vicin" + 0.000*"flawless" + 0.000*"signag" + 0.000*"brule"'),
 (13,
  '0.000*"seldom" + 0.000*"world" + 0.000*"piec" + 0.000*"tie" + 0.000*"everytim" + 0.000*"king" + 0.000*"rout" + 0.000*"spin" + 0.000*"huge" + 0.000*"hottest"'),
 (148,
  '0.000*"everytim" + 0.000*"tsingtao" + 0.000*"rule" + 0.000*"stuffi" + 0.000*"turkey" + 0.000*"proper" + 0.000*"trap" + 0.000*"insan" + 0.000*"italian" + 0.000*"qualifi"'),
 (35,
  '0.000*"melon" + 0.000*"southwest" + 0.000*"arcadia" + 0.000*"fiance" + 0.000*"recomend" + 0.000*"star" + 0.0

In [26]:
# Fit an HDP model on the files[2] corpus, then display 20 topics of the LDA
# model that the HDP model suggests.
corpus, dictionary = get_corpus("{}.txt".format(files[2]))
hdp_3 = models.HdpModel(corpus=corpus, id2word=dictionary)
l = hdp_3.suggested_lda_model()
l.show_topics(num_topics=20)

100%|███████████████████████████████████████████████████████████████████████████| 10912/10912 [00:17<00:00, 634.63it/s]


[(120,
  '0.001*"trap" + 0.001*"atroci" + 0.001*"amaz" + 0.001*"sing" + 0.001*"odd" + 0.001*"ridicul" + 0.001*"substitut" + 0.001*"concern" + 0.001*"southern" + 0.001*"hoisin"'),
 (119,
  '0.001*"warm" + 0.001*"cashew" + 0.001*"anticip" + 0.001*"littl" + 0.001*"addit" + 0.001*"compens" + 0.001*"sum" + 0.001*"busi" + 0.001*"semi" + 0.001*"mini"'),
 (40,
  '0.001*"comparison" + 0.001*"eye" + 0.001*"udon" + 0.001*"wave" + 0.001*"courteous" + 0.001*"express" + 0.001*"refresh" + 0.001*"instant" + 0.001*"eastern" + 0.001*"line"'),
 (68,
  '0.001*"thought" + 0.001*"finish" + 0.001*"record" + 0.001*"standard" + 0.001*"turn" + 0.001*"friend" + 0.001*"ach" + 0.001*"shumai" + 0.001*"seattl" + 0.001*"custom"'),
 (100,
  '0.001*"combin" + 0.001*"menu" + 0.001*"assist" + 0.001*"unapp" + 0.001*"ball" + 0.001*"anymor" + 0.001*"wing" + 0.001*"coke" + 0.001*"yeah" + 0.001*"hour"'),
 (141,
  '0.001*"viet" + 0.001*"sweet" + 0.001*"dress" + 0.001*"member" + 0.001*"interior" + 0.001*"soupi" + 0.001*"mere" +

In [20]:
# NOTE(review): no variable named `hdp` is assigned anywhere in the visible
# notebook (only hdp_2 and hdp_3 are), so this cell fails on a fresh kernel.
# Presumably `hdp` is left over from an earlier session - confirm which model
# is meant before re-running.
hdp.optimal_ordering()

In [None]:
# NOTE(review): unfinished cell - the assignment to `p` below has no right-hand
# side, so this definition does not parse yet. Judging by the name and the link
# that follows, it is presumably meant to compute a UMass topic-coherence score
# over the word pairs of each topic; confirm the intent before completing it.
def UMassScore(model, dictionary):
    score = 0
    topic = model.show_topics(-1, formatted=False)
    for i in range(0, len(topic)):
        for j in range(0, len(topic[i][1]) - 1):
            p = 

http://qpleple.com/topic-coherence-to-evaluate-topic-models/

Selecting the number of topics for an LDA model: https://cran.r-project.org/web/packages/ldatuning/vignettes/topics.html