Reference tutorial (topic modeling and Latent Dirichlet Allocation in Python): https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [1]:
# import gensim
# from gensim.utils import simple_preprocess
# from gensim.parsing.preprocessing import STOPWORDS
# from nltk.stem import WordNetLemmatizer, SnowballStemmer
# from nltk.stem.porter import *
# import numpy as np
# import nltk

# np.random.seed(2018)
# nltk.download('wordnet')

In [None]:
# import logging
# from time import time
# np.random.seed(2018)

# #logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# #lda_model = gensim.models.LdaModel(bow_corpus, num_topics=10, id2word=dictionary,  eval_every=5, iterations = 1000, alpha='auto', gamma_threshold=0.01)
# t0 = time()
# #lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, iterations = 1000, passes = 2)
# lda_model = gensim.models.LdaModel(bow_corpus, num_topics=10, id2word=dictionary,  eval_every=5, iterations = 1000, alpha='auto', gamma_threshold=0.01)
# print("done in %fs" % (time() - t0))

# for idx, topic in lda_model.print_topics(-1):
#     print('Topic: {} \nWords: {}'.format(idx, topic))

In [None]:
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import numpy as np
from tqdm import tqdm
from time import time
import json

np.random.seed(2018)

stemmer = SnowballStemmer('english')

def lemmatize_stemming(text):
    """Reduce a single token to its stem.

    The token is first lemmatized as a verb (``pos='v'``) with a WordNet
    lemmatizer, and the resulting lemma is then passed through the
    module-level Snowball ``stemmer``.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)

def preprocess(text):
    """Tokenize ``text`` and return the normalized form of every kept token.

    Tokens are produced by ``simple_preprocess`` with ``min_len=4``. Tokens
    that appear in ``STOPWORDS`` are dropped; the remaining ones are passed
    through ``lemmatize_stemming`` and returned as a list, in order.
    """
    return [
        lemmatize_stemming(token)
        for token in simple_preprocess(text, min_len = 4)
        if token not in STOPWORDS
    ]

def get_corpus(file_name, use_tfidf = True):
    """Build a corpus and its gensim dictionary from a text file.

    Every line of ``file_name`` is treated as one document and run through
    ``preprocess`` (with a tqdm progress bar). A ``corpora.Dictionary`` is
    built from the processed documents and pruned with ``filter_extremes``
    before each document is converted to bag-of-words form.

    Parameters:
        file_name: path of the text file to read, one document per line.
        use_tfidf: when true, the bag-of-words corpus is wrapped by a
            ``TfidfModel`` fitted on it; otherwise the raw bag-of-words
            list is returned.

    Returns:
        A ``(corpus, dictionary)`` tuple.
    """
    with open(file_name, 'r') as handle:
        raw_lines = handle.readlines()

    processed_docs = [preprocess(line) for line in tqdm(raw_lines)]

    dictionary = corpora.Dictionary(processed_docs)
    dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

    bow_corpus = [dictionary.doc2bow(tokens) for tokens in processed_docs]
    if not use_tfidf:
        return (bow_corpus, dictionary)

    # Fit TF-IDF on the bag-of-words corpus and return the re-weighted view of it.
    tfidf = models.TfidfModel(bow_corpus)
    return (tfidf[bow_corpus], dictionary)

def get_lda_topic_model(corpus, dictionary, num_topics = 10):
    """Train a gensim ``LdaModel`` on ``corpus`` and print the training time.

    Parameters:
        corpus: corpus passed straight to ``models.LdaModel``.
        dictionary: used as the model's ``id2word`` mapping.
        num_topics: number of topics to fit (default 10).

    Returns:
        The trained ``LdaModel``.
    """
    # Alternatives tried while experimenting (kept for reference):
    #   models.LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary)            -> "Bad!"
    #   models.LdaModel(..., eval_every=5, iterations=1000, alpha='auto', gamma_threshold=0.01)
    #   models.LdaModel(..., eval_every=5, alpha='auto')                                  -> "Good enough"
    #   models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary)
    lda_options = dict(
        num_topics=num_topics,
        id2word=dictionary,
        eval_every=5,
        alpha='auto',
        gamma_threshold=0.01,
    )

    started = time()
    trained = models.LdaModel(corpus, **lda_options)
    print("done in %fs" % (time() - started))

    return trained

def get_lsi_topic_model(corpus, dictionary, num_topics = 10):
    """Train a gensim ``LsiModel`` on ``corpus`` and print the training time.

    Parameters:
        corpus: corpus passed straight to ``models.LsiModel``.
        dictionary: used as the model's ``id2word`` mapping.
        num_topics: number of topics to fit (default 10).

    Returns:
        The trained ``LsiModel``.
    """
    started = time()
    trained = models.LsiModel(corpus, id2word = dictionary, num_topics = num_topics)
    elapsed = time() - started
    print("done in %fs" % elapsed)

    return trained

# Base names (without the ".txt" extension) of the review files used below.
files = ['rest_review_sample_100000', 'categories/Chinese_pos', 'categories/Chinese_neg','categories/Chinese', 'categories/Mexican']

# --- Model 1: the 100000-review sample --------------------------------------
(corpus, dictionary) = get_corpus(files[0] + ".txt")
# NOTE(review): this 20-topic fit is replaced by the search loop below as soon
# as any iteration reports a coherence above -1e20, so it looks redundant.
# It is left in place because removing it would change the NumPy random state
# seen by the later fits.
model_1 = get_lda_topic_model(corpus, dictionary, num_topics=20)
#model_1_1 = get_lsi_topic_model(corpus, dictionary, num_topics=20)

# Search over odd topic counts 5, 7, ..., 99 and keep the model with the
# largest u_mass coherence in model_1 (its topic count is kept in best_k).
largest_coherence = -1e20
best_k = 0
for k in range(5, 100, 2):
    model = get_lda_topic_model(corpus, dictionary, num_topics=k)    
    cm = models.coherencemodel.CoherenceModel(model=model, corpus=corpus, coherence='u_mass')
    coherence = cm.get_coherence()
    print("k=%d coherence=%f"%(k, coherence))
    if (coherence > largest_coherence):
        largest_coherence = coherence
        model_1 = model
        best_k = k
    
# --- Models 2 and 3: files[1] and files[2], default 10 topics each ----------
# NOTE: `corpus` and `dictionary` are rebound here, so after this cell they
# refer to files[2], not to the 100000-review sample.
corpus, dictionary = get_corpus(files[1] + ".txt")
model_2 = get_lda_topic_model(corpus, dictionary)

corpus, dictionary = get_corpus(files[2] + ".txt")
model_3 = get_lda_topic_model(corpus, dictionary)
# NOTE(review): `get_topic_model` is not defined in the visible cells; these
# two lines would need get_corpus + get_lda_topic_model before being re-enabled.
# model_4 = get_topic_model(files[3] + ".txt")
# model_5 = get_topic_model(files[4] + ".txt")

 64%|███████████████████████████████████████████████▍                          | 64184/100000 [01:43<00:54, 659.93it/s]

In [None]:
# Scratch cell: prints the odd values 5, 7, ..., 29 produced by range(5, 30, 2).
for i in range(5,30,2):
    print(i)

In [None]:
# Show the tokens of the first document in the current corpus.
# `Dictionary.id2token` is a lazily filled dict attribute, not a method, so
# calling it raises TypeError. Indexing the dictionary resolves a token id to
# its token instead. The list is the cell's last expression so that it is
# actually displayed.
[dictionary[token_id] for token_id, _weight in corpus[0]]

In [None]:
# Palette used to color topics; get_topic_json cycles through it when a model
# has more topics than there are entries here.
Stock_Colors = ['black', 'maroon', 'red', 'purple', 'fuchsia', 'green', 'lime', 'olive', 'navy', 'blue', 'teal', 'aqua',
               'black', 'maroon', 'red', 'purple', 'fuchsia', 'green', 'lime', 'olive']

def get_topic_json(model, title, compare_words=()):
    """Convert a topic model's topics into a nested, JSON-serializable dict.

    Parameters:
        model: object exposing ``show_topics(-1, formatted=False)`` that
            returns a sequence of ``(topic_id, [(word, weight), ...])``.
        title: value stored under the root node's ``'name'`` key.
        compare_words: container of words to compare against; a word that is
            NOT in it is flagged with ``'new_word': True``. Defaults to an
            empty tuple, so every word is flagged as new.

    Returns:
        ``{'name': title, 'children': [topic_node, ...]}`` where each topic
        node is ``{'name': 'Topic <i>', 'children': [word_node, ...]}`` and
        each word node has the keys ``'name'``, ``'value'`` (the weight divided
        by the topic's first weight, formatted with two decimals), ``'color'``
        and ``'new_word'``.
    """
    topics = model.show_topics(-1, formatted=False)
    topic_out = {'name': title, 'children': []}

    for i, entry in enumerate(topics):
        word_weights = entry[1]
        node = {'name': 'Topic ' + str(i), 'children': []}
        topic_out['children'].append(node)

        # A topic without words has no leading weight to normalize by;
        # keep its (empty) node instead of crashing on word_weights[0].
        if not word_weights:
            continue

        # Weights are scaled relative to the first listed word of the topic.
        max_weight = word_weights[0][1]
        # Wrap around the palette so models with more topics than colors
        # (e.g. the best-k model from a wide search) do not raise IndexError.
        color = Stock_Colors[i % len(Stock_Colors)]

        for pair in word_weights:
            word, weight = pair[0], pair[1]
            node['children'].append({'name': word,
                                     'value': "{0:.2f}".format(weight / max_weight),
                                     'color': color,
                                     'new_word': word not in compare_words})

    return topic_out


def get_topic_words(model):
    """Count in how many topic word lists each word of ``model`` appears.

    Reads ``model.show_topics(-1, formatted=False)`` and returns a dict
    mapping every listed word to the number of times it occurs across all
    topics' word lists.
    """
    occurrences = {}
    for entry in model.show_topics(-1, formatted=False):
        for pair in entry[1]:
            word = pair[0]
            occurrences[word] = occurrences.get(word, 0) + 1

    return occurrences
    
# with open(files[0] + '.json', 'w') as f:
#     f.write(json.dumps(get_topic_json(model_1, '100000_Samples', {})))

# pos_words = get_topic_words(model_2)
# neg_words = get_topic_words(model_3)

# topic2 = get_topic_json(model_2, 'Positive', neg_words)
# topic3 = get_topic_json(model_3, 'Negative', pos_words)
# topic = {'name':'Chinese Restaurant Reviews', 'children':[topic2, topic3]}
# with open('compare' + '.json', 'w') as f:
#     f.write(json.dumps(topic))
    
# topic4 = get_topic_json(model_4, 'Chinese')
# topic5 = get_topic_json(model_5, 'Mexican')
# topic = {'name':'Chinese vs. Mexican', 'children':[topic2, topic3]}
# with open('compare_cn_mx' + '.json', 'w') as f:
#     f.write(json.dumps(topic))

In [None]:
# Print every topic of each fitted model, one line per topic, with a separator
# line after each model's listing.
# model_1_1, model_4 and model_5 are not built by the training cell (their
# assignments are commented out there), so they are not listed here.
for fitted_model in (model_1, model_2, model_3):
    for idx, topic in fitted_model.print_topics(-1):
        print('Topic: {} Words: {}'.format(idx, topic))
    print('-------------------------------------------------------------------')

In [None]:
# (corpus, dictionary) = get_corpus(files[0] + ".txt")

# for k in range(5, 20):
#     model = get_lda_topic_model(corpus, dictionary, num_topics=k)    
#     cm = models.coherencemodel.CoherenceModel(model=model, corpus=corpus, coherence='u_mass')
#     print("k=%d coherence=%f"%(k,cm.get_coherence()))

In [None]:
# (corpus, dictionary) = get_corpus(files[1] + ".txt")
# hdp_2 = models.HdpModel(corpus, dictionary)
# l=hdp_2.suggested_lda_model()
# l.show_topics(20)

# (corpus, dictionary) = get_corpus(files[2] + ".txt")
# hdp_3 = models.HdpModel(corpus, dictionary)
# l=hdp_3.suggested_lda_model()
# l.show_topics(20)

Background on topic coherence as a way to evaluate topic models: http://qpleple.com/topic-coherence-to-evaluate-topic-models/

Select number of topics for LDA model: https://cran.r-project.org/web/packages/ldatuning/vignettes/topics.html