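Reference tutorial this notebook follows: [Topic Modeling and Latent Dirichlet Allocation in Python](https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24)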

In [1]:
# import gensim
# from gensim.utils import simple_preprocess
# from gensim.parsing.preprocessing import STOPWORDS
# from nltk.stem import WordNetLemmatizer, SnowballStemmer
# from nltk.stem.porter import *
# import numpy as np
# import nltk

# np.random.seed(2018)
# nltk.download('wordnet')

In [2]:
# import logging
# from time import time
# np.random.seed(2018)

# #logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# #lda_model = gensim.models.LdaModel(bow_corpus, num_topics=10, id2word=dictionary,  eval_every=5, iterations = 1000, alpha='auto', gamma_threshold=0.01)
# t0 = time()
# #lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, iterations = 1000, passes = 2)
# lda_model = gensim.models.LdaModel(bow_corpus, num_topics=10, id2word=dictionary,  eval_every=5, iterations = 1000, alpha='auto', gamma_threshold=0.01)
# print("done in %fs" % (time() - t0))

# for idx, topic in lda_model.print_topics(-1):
#     print('Topic: {} \nWords: {}'.format(idx, topic))

In [3]:
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import numpy as np
from tqdm import tqdm
from time import time
import json

# Fix the seed of NumPy's global random generator.
# NOTE(review): presumably intended to make the LDA training below repeatable;
# confirm that gensim draws from this global generator when no random_state is given.
np.random.seed(2018)

# Single English Snowball stemmer instance, created once and used by lemmatize_stemming().
stemmer = SnowballStemmer('english')

def lemmatize_stemming(text):
    """Normalise a single token: lemmatise it as a verb, then stem the lemma.

    Args:
        text: The token to normalise.

    Returns:
        The result of passing the WordNet verb lemma (``pos='v'``) of ``text``
        through the module-level Snowball ``stemmer``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Turn one raw document into a list of normalised tokens.

    The text is tokenised with gensim's ``simple_preprocess`` (``min_len = 4``),
    tokens found in gensim's ``STOPWORDS`` are dropped, and each remaining token
    is passed through ``lemmatize_stemming``.

    Args:
        text: The raw document text.

    Returns:
        A list of normalised tokens, in the order they appear in ``text``.
    """
    return [
        lemmatize_stemming(token)
        for token in simple_preprocess(text, min_len = 4)
        if token not in STOPWORDS
    ]

def get_corpus(file_name, use_tfidf = True, no_below = 15, no_above = 0.5,
               keep_n = 100000, encoding = 'utf-8'):
    """Load a one-document-per-line text file and build a gensim corpus from it.

    Every line of ``file_name`` is treated as one document. Each document is
    passed through ``preprocess``, a ``corpora.Dictionary`` is built from the
    resulting token lists and pruned with ``filter_extremes``, and every
    document is converted to bag-of-words with ``doc2bow``.

    Args:
        file_name: Path of the text file to read.
        use_tfidf: When true, a ``TfidfModel`` is fitted on the bag-of-words
            corpus and the TF-IDF-transformed corpus (``tfidf[corpus]``) is
            returned instead of the raw bag-of-words list.
        no_below: Forwarded to ``Dictionary.filter_extremes``.
        no_above: Forwarded to ``Dictionary.filter_extremes``.
        keep_n: Forwarded to ``Dictionary.filter_extremes``.
        encoding: Text encoding used to read ``file_name``. Given explicitly so
            the result does not depend on the platform's default encoding.

    Returns:
        A ``(corpus, dictionary)`` tuple.
    """
    with open(file_name, 'r', encoding=encoding) as f:
        documents = f.readlines()

    processed_docs = [preprocess(text) for text in tqdm(documents)]

    dictionary = corpora.Dictionary(processed_docs)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    # doc2bow runs after pruning, so tokens removed from the dictionary are
    # not part of any document's bag-of-words.
    corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    if use_tfidf:
        tfidf = models.TfidfModel(corpus)
        corpus = tfidf[corpus]

    return (corpus, dictionary)

def save_corpus(corpus, file_name, dictionary=None, encoding='utf-8'):
    """Write a corpus to ``file_name + '.txt'``, one document per line.

    Each document is written as the space-separated words that its token ids
    map to in ``dictionary``; the weights stored in the corpus are ignored.

    Args:
        corpus: Iterable of documents, each an iterable of
            ``(token_id, weight)`` pairs.
        file_name: Output path without extension; ``'.txt'`` is appended.
        dictionary: Mapping from token id to word. When omitted, the
            notebook-level ``dictionary`` variable is used, which is what the
            original two-argument form did implicitly. Passing it explicitly
            avoids writing a corpus with a stale dictionary after
            ``corpus``/``dictionary`` have been rebound.
        encoding: Text encoding of the output file. Given explicitly so that
            writing non-ASCII tokens does not depend on the platform default.

    Raises:
        NameError: If ``dictionary`` is omitted and no notebook-level
            ``dictionary`` variable exists.
    """
    if dictionary is None:
        # Backward compatibility with existing calls save_corpus(corpus, name):
        # fall back to the global the original implementation read directly.
        try:
            dictionary = globals()['dictionary']
        except KeyError:
            raise NameError("name 'dictionary' is not defined; pass dictionary= explicitly")

    with open(file_name + '.txt', 'w', encoding=encoding) as f:
        for doc in tqdm(corpus):
            f.write(' '.join(dictionary[token_id] for token_id, _ in doc) + '\n')
        
def get_lda_topic_model(corpus, dictionary, num_topics = 10):
    """Train a gensim ``LdaModel`` on ``corpus`` and print the training time.

    Args:
        corpus: The corpus handed to ``models.LdaModel``.
        dictionary: Passed to the model as ``id2word``.
        num_topics: Number of topics to fit.

    Returns:
        The trained ``LdaModel``.
    """
    # Notes kept from the original author's experiments:
    #   - LdaMulticore with default settings was marked "Bad!".
    #   - LdaModel with eval_every=5, alpha='auto' was marked "Good enough".
    #   - Variants with iterations=1000, and with all defaults, were also tried.
    lda_settings = dict(
        num_topics=num_topics,
        id2word=dictionary,
        eval_every=5,
        alpha='auto',
        gamma_threshold=0.01,
    )

    started = time()
    lda_model = models.LdaModel(corpus, **lda_settings)
    elapsed = time() - started

    print("done in %fs" % elapsed)
    return lda_model

def get_lsi_topic_model(corpus, dictionary, num_topics = 10):
    """Train a gensim ``LsiModel`` on ``corpus`` and print the training time.

    Args:
        corpus: The corpus handed to ``models.LsiModel``.
        dictionary: Passed to the model as ``id2word``.
        num_topics: Number of topics to fit.

    Returns:
        The trained ``LsiModel``.
    """
    started = time()
    lsi_model = models.LsiModel(corpus, num_topics = num_topics, id2word = dictionary)
    elapsed = time() - started

    print("done in %fs" % elapsed)
    return lsi_model

# Input file stems; '.txt' is appended when a file is loaded or saved.
files = ['rest_review_sample_100000', 'categories/Chinese_pos', 'categories/Chinese_neg','categories/Chinese', 'categories/Mexican']

# --- Dataset 1: files[0] ---
# corpus/dictionary are notebook-level names; save_corpus() looks up the
# notebook-level `dictionary` when called with two arguments, so it must be
# called after the assignment on the line above it.
(corpus, dictionary) = get_corpus(files[0] + ".txt")
save_corpus(corpus, files[0])
# Initial 20-topic model. It is replaced by the sweep below as soon as one
# sweep model's coherence exceeds the -1e20 starting value.
model_1 = get_lda_topic_model(corpus, dictionary, num_topics=20)
#model_1_1 = get_lsi_topic_model(corpus, dictionary, num_topics=20)

# Sweep odd topic counts k = 5, 7, ..., 99 and keep the model with the highest
# u_mass coherence in model_1 (its k in best_k, its score in largest_coherence).
largest_coherence = -1e20
best_k = 0
for k in range(5, 100, 2):
    model = get_lda_topic_model(corpus, dictionary, num_topics=k)    
    cm = models.coherencemodel.CoherenceModel(model=model, corpus=corpus, coherence='u_mass')
    coherence = cm.get_coherence()
    print("k=%d coherence=%f"%(k, coherence))
    if (coherence > largest_coherence):
        largest_coherence = coherence
        model_1 = model
        best_k = k
    
# --- Dataset 2: files[1] --- (rebinds corpus/dictionary; default num_topics)
corpus, dictionary = get_corpus(files[1] + ".txt")
save_corpus(corpus, files[1])
model_2 = get_lda_topic_model(corpus, dictionary)

# --- Dataset 3: files[2] --- (rebinds corpus/dictionary; default num_topics)
corpus, dictionary = get_corpus(files[2] + ".txt")
save_corpus(corpus, files[2])
model_3 = get_lda_topic_model(corpus, dictionary)
# NOTE(review): get_topic_model is not defined in the visible part of this
# notebook; these two lines would need get_corpus + get_lda_topic_model.
# model_4 = get_topic_model(files[3] + ".txt")
# model_5 = get_topic_model(files[4] + ".txt")

100%|██████████| 100000/100000 [02:19<00:00, 717.94it/s]


done in 54.840080s
done in 53.405018s
k=5 coherence=-8.711588
done in 55.879316s
k=7 coherence=-9.012033
done in 55.771752s
k=9 coherence=-8.386204
done in 53.025692s
k=11 coherence=-9.758425
done in 54.100503s
k=13 coherence=-7.879161
done in 55.230710s
k=15 coherence=-8.563793
done in 54.732833s
k=17 coherence=-9.184393
done in 53.434371s
k=19 coherence=-9.459875
done in 54.366743s
k=21 coherence=-8.239192
done in 55.524129s
k=23 coherence=-6.813123
done in 54.958368s
k=25 coherence=-8.404542
done in 55.031952s
k=27 coherence=-7.494700
done in 54.308684s
k=29 coherence=-8.648878
done in 55.207020s
k=31 coherence=-7.119518
done in 74.247778s
k=33 coherence=-7.831742
done in 1106.163820s
k=35 coherence=-7.199657
done in 58.562465s
k=37 coherence=-7.412367
done in 66.677724s
k=39 coherence=-7.469745
done in 65.118296s
k=41 coherence=-7.468898
done in 65.331245s
k=43 coherence=-6.461854
done in 72.429250s
k=45 coherence=-7.173412
done in 69.824019s
k=47 coherence=-7.043966
done in 70.762

  diff = np.log(self.expElogbeta)


done in 81.404716s
k=95 coherence=-6.098757
done in 72.649066s
k=97 coherence=-5.859996
done in 73.923305s


  0%|          | 119/27804 [00:00<00:23, 1179.87it/s]

k=99 coherence=-6.032273


100%|██████████| 27804/27804 [00:33<00:00, 822.53it/s] 
  1%|          | 71/10912 [00:00<00:15, 686.57it/s]

done in 16.170151s


100%|██████████| 10912/10912 [00:14<00:00, 762.07it/s]


done in 7.126101s


In [10]:
# Colour names assigned to topics by index. get_topic_json() cycles through
# this list, so models with more topics than entries reuse colours.
Stock_Colors = ['black', 'maroon', 'red', 'purple', 'fuchsia', 'green', 'lime', 'olive', 'navy', 'blue', 'teal', 'aqua',
               'black', 'maroon', 'red', 'purple', 'fuchsia', 'green', 'lime', 'olive']

def get_topic_json(model, title, compare_words):
    """Build a nested, JSON-serialisable tree describing a model's topics.

    The tree has three levels: the root (``title``), one node per topic
    (``'Topic <i>'``), and one leaf per word of that topic.

    Args:
        model: Object exposing ``show_topics(-1, formatted=False)``, returning
            a sequence of ``(topic_id, [(word, weight), ...])`` entries.
        title: Name of the root node.
        compare_words: Container of words to compare against; a leaf's
            ``'new_word'`` is ``True`` when its word is NOT in this container.

    Returns:
        A dict ``{'name': title, 'children': [...]}``. Each leaf holds
        ``'name'`` (the word), ``'value'`` (the weight divided by the weight of
        the topic's first word, formatted as a string with two decimals),
        ``'color'`` and ``'new_word'``.
    """
    children_name = 'children'
    name_name = 'name'
    value_name = 'value'
    color_name = 'color'

    topics = model.show_topics(-1, formatted=False)
    topic_out = {name_name: title, children_name: []}

    for i, entry in enumerate(topics):
        word_weights = entry[1]
        topic_node = {name_name: 'Topic ' + str(i), children_name: []}
        topic_out[children_name].append(topic_node)

        # A topic without words has no reference weight; keep the empty node
        # rather than failing on word_weights[0].
        if len(word_weights) == 0:
            continue

        # Weights are scaled relative to the topic's first listed word.
        max_weight = word_weights[0][1]
        # Wrap around the palette: the original Stock_Colors[i] raised
        # IndexError for any topic index beyond the palette length.
        color = Stock_Colors[i % len(Stock_Colors)]

        for pair in word_weights:
            word, weight = pair[0], pair[1]
            topic_node[children_name].append({name_name: word,
                                              value_name: "{0:.2f}".format(weight / max_weight),
                                              color_name: color,
                                              'new_word': not (word in compare_words)})

    return topic_out


def get_topic_words(model):
    """Count in how many topic word lists each word of a model appears.

    Args:
        model: Object exposing ``show_topics(-1, formatted=False)``, returning
            a sequence of ``(topic_id, [(word, weight), ...])`` entries.

    Returns:
        A dict mapping each word to the number of times it occurs across all
        topics' word lists, in order of first appearance.
    """
    occurrences = {}
    for topic_entry in model.show_topics(-1, formatted=False):
        for pair in topic_entry[1]:
            word = pair[0]
            occurrences[word] = occurrences.get(word, 0) + 1

    return occurrences
    
# with open(files[0] + '.json', 'w') as f:
#     f.write(json.dumps(get_topic_json(model_1, '100000_Samples', {})))

# pos_words = get_topic_words(model_2)
# neg_words = get_topic_words(model_3)

# topic2 = get_topic_json(model_2, 'Positive', neg_words)
# topic3 = get_topic_json(model_3, 'Negative', pos_words)
# topic = {'name':'Chinese Restaurant Reviews', 'children':[topic2, topic3]}
# with open('compare' + '.json', 'w') as f:
#     f.write(json.dumps(topic))
    
# topic4 = get_topic_json(model_4, 'Chinese')
# topic5 = get_topic_json(model_5, 'Mexican')
# topic = {'name':'Chinese vs. Mexican', 'children':[topic2, topic3]}
# with open('compare_cn_mx' + '.json', 'w') as f:
#     f.write(json.dumps(topic))

In [31]:
# Print every topic of each fitted LDA model, followed by a dashed rule per model.
# (The LSI model model_1_1 is not trained in this notebook; if it is enabled,
# insert it into the tuple right after model_1 to print it in the same way.)
for fitted_model in (model_1, model_2, model_3):
    for idx, topic in fitted_model.print_topics(-1):
        print('Topic: {} Words: {}'.format(idx, topic))
    print('-------------------------------------------------------------------')
# for idx, topic in model_4.print_topics(-1):
#     print('Topic: {} Words: {}'.format(idx, topic))
# print('-------------------------------------------------------------------')
# for idx, topic in model_5.print_topics(-1):
#     print('Topic: {} Words: {}'.format(idx, topic))

Topic: 0 Words: 0.008*"excus" + 0.008*"manag" + 0.008*"tabl" + 0.007*"apolog" + 0.007*"wait" + 0.007*"minut" + 0.006*"order" + 0.006*"time" + 0.006*"come" + 0.006*"approach"
Topic: 1 Words: 0.056*"rib" + 0.021*"unbeliev" + 0.019*"short" + 0.018*"tonight" + 0.018*"disgust" + 0.018*"profession" + 0.017*"stumbl" + 0.016*"gross" + 0.016*"marin" + 0.015*"caramel"
Topic: 2 Words: 0.040*"groupon" + 0.023*"benedict" + 0.018*"close" + 0.017*"wont" + 0.011*"tail" + 0.010*"chilli" + 0.010*"scene" + 0.009*"golden" + 0.008*"lean" + 0.008*"cave"
Topic: 3 Words: 0.037*"buffalo" + 0.036*"falafel" + 0.027*"higher" + 0.025*"condiment" + 0.019*"competit" + 0.018*"spanish" + 0.018*"hesit" + 0.016*"complex" + 0.015*"wast" + 0.014*"smell"
Topic: 4 Words: 0.041*"airport" + 0.036*"outstand" + 0.031*"chipotl" + 0.028*"excel" + 0.026*"bomb" + 0.020*"knowledg" + 0.020*"pricey" + 0.015*"winner" + 0.013*"blend" + 0.013*"great"
Topic: 5 Words: 0.065*"burger" + 0.031*"ring" + 0.023*"onion" + 0.019*"fri" + 0.015*"hub

In [None]:
# (corpus, dictionary) = get_corpus(files[0] + ".txt")

# for k in range(5, 20):
#     model = get_lda_topic_model(corpus, dictionary, num_topics=k)    
#     cm = models.coherencemodel.CoherenceModel(model=model, corpus=corpus, coherence='u_mass')
#     print("k=%d coherence=%f"%(k,cm.get_coherence()))

In [None]:
# (corpus, dictionary) = get_corpus(files[1] + ".txt")
# hdp_2 = models.HdpModel(corpus, dictionary)
# l=hdp_2.suggested_lda_model()
# l.show_topics(20)

# (corpus, dictionary) = get_corpus(files[2] + ".txt")
# hdp_3 = models.HdpModel(corpus, dictionary)
# l=hdp_3.suggested_lda_model()
# l.show_topics(20)

Background on using topic coherence to evaluate topic models: http://qpleple.com/topic-coherence-to-evaluate-topic-models/

Selecting the number of topics for an LDA model: https://cran.r-project.org/web/packages/ldatuning/vignettes/topics.html