Reference tutorial (topic modeling and Latent Dirichlet Allocation in Python): https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [None]:
# import gensim
# from gensim.utils import simple_preprocess
# from gensim.parsing.preprocessing import STOPWORDS
# from nltk.stem import WordNetLemmatizer, SnowballStemmer
# from nltk.stem.porter import *
# import numpy as np
# import nltk

# np.random.seed(2018)
# nltk.download('wordnet')

In [None]:
# import logging
# from time import time
# np.random.seed(2018)

# #logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# #lda_model = gensim.models.LdaModel(bow_corpus, num_topics=10, id2word=dictionary,  eval_every=5, iterations = 1000, alpha='auto', gamma_threshold=0.01)
# t0 = time()
# #lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, iterations = 1000, passes = 2)
# lda_model = gensim.models.LdaModel(bow_corpus, num_topics=10, id2word=dictionary,  eval_every=5, iterations = 1000, alpha='auto', gamma_threshold=0.01)
# print("done in %fs" % (time() - t0))

# for idx, topic in lda_model.print_topics(-1):
#     print('Topic: {} \nWords: {}'.format(idx, topic))

In [1]:
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import numpy as np
from tqdm import tqdm
from time import time
import json

# Fix the seed of NumPy's global random number generator for this session.
np.random.seed(2018)

# One shared English Snowball stemmer; lemmatize_stemming() reads this global.
stemmer = SnowballStemmer('english')

def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then stem the lemma.

    Args:
        text: The token to normalise.

    Returns:
        The result of passing the WordNet verb lemma of ``text`` through the
        module-level ``stemmer``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Tokenize ``text`` and return its normalised, non-stopword tokens.

    Tokens come from ``simple_preprocess`` with ``min_len = 4``; any token
    found in ``STOPWORDS`` is dropped, and the rest are passed through
    ``lemmatize_stemming``.

    Args:
        text: Raw document text.

    Returns:
        A list of normalised tokens, in their original order.
    """
    return [
        lemmatize_stemming(token)
        for token in simple_preprocess(text, min_len = 4)
        if token not in STOPWORDS
    ]

def get_corpus(file_name, use_tfidf = True):
    """Read a text file (one document per line) and build a gensim corpus.

    Every line is run through ``preprocess``; a ``corpora.Dictionary`` is built
    from the resulting token lists and pruned with
    ``filter_extremes(no_below=15, no_above=0.5, keep_n=100000)``. The
    documents are then converted to bag-of-words vectors.

    Args:
        file_name: Path of the text file to read.
        use_tfidf: When true, the bag-of-words corpus is wrapped by a
            ``models.TfidfModel`` fitted on that same corpus.

    Returns:
        A ``(corpus, dictionary)`` tuple.
    """
    with open (file_name, 'r') as source:
        raw_lines = source.readlines()

    token_lists = []
    for line in tqdm(raw_lines):
        token_lists.append(preprocess(line))

    vocabulary = corpora.Dictionary(token_lists)
    vocabulary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

    bow_corpus = [vocabulary.doc2bow(tokens) for tokens in token_lists]
    if not use_tfidf:
        return (bow_corpus, vocabulary)

    # Re-weight the raw counts with a TF-IDF model fitted on this corpus.
    weighting = models.TfidfModel(bow_corpus)
    return (weighting[bow_corpus], vocabulary)

def save_corpus(corpus, file_name, dictionary=None):
    """Write each document of ``corpus`` as one line of space-separated words.

    The output goes to ``file_name + '_processed.txt'``. Each document is an
    iterable of ``(token_id, weight)`` pairs; only the ids are used, and they
    are mapped back to words through ``dictionary``.

    Args:
        corpus: Iterable of documents, each an iterable of
            ``(token_id, weight)`` pairs.
        file_name: Base path of the output file (the suffix is appended).
        dictionary: Mapping from token id to word. When omitted, the
            module-level ``dictionary`` is used, which keeps the original
            two-argument call working.

    Raises:
        NameError: If ``dictionary`` is omitted and no module-level
            ``dictionary`` exists.
    """
    if dictionary is None:
        # Backward compatibility: earlier versions always read the global.
        # Resolve it before opening the file so a missing global does not
        # leave an empty output file behind.
        try:
            dictionary = globals()['dictionary']
        except KeyError:
            raise NameError("name 'dictionary' is not defined") from None

    with open(file_name + '_processed.txt', 'w') as f:
        for doc in tqdm(corpus):
            f.write(' '.join(dictionary[token_id] for token_id, _ in doc) + '\n')
        
def get_lda_topic_model(corpus, dictionary, num_topics = 10):
    """Train a gensim ``LdaModel`` on ``corpus`` and print the elapsed time.

    Args:
        corpus: Corpus to train on, as produced by ``get_corpus``.
        dictionary: Passed to the model as ``id2word``.
        num_topics: Number of topics to fit.

    Returns:
        The trained ``models.LdaModel``.
    """
    # Settings tried earlier, per the author's notes:
    #   * LdaMulticore with default options            -> "Bad!"
    #   * LdaModel with eval_every=5, alpha='auto'     -> "Good enough"
    #   * the settings below plus iterations = 1000
    #   * LdaModel with default options
    lda_options = dict(
        num_topics=num_topics,
        id2word=dictionary,
        eval_every=5,
        alpha='auto',
        gamma_threshold=0.01,
    )

    started = time()
    trained_model = models.LdaModel(corpus, **lda_options)
    elapsed = time() - started
    print("done in %fs" % elapsed)

    return trained_model

def get_lsi_topic_model(corpus, dictionary, num_topics = 10):
    """Train a gensim ``LsiModel`` on ``corpus`` and print the elapsed time.

    Args:
        corpus: Corpus to train on, as produced by ``get_corpus``.
        dictionary: Passed to the model as ``id2word``.
        num_topics: Number of topics to fit.

    Returns:
        The trained ``models.LsiModel``.
    """
    started = time()
    trained_model = models.LsiModel(corpus, num_topics = num_topics, id2word = dictionary)
    elapsed = time() - started
    print("done in %fs" % elapsed)

    return trained_model

# Base names (without the ".txt" extension) of the input text files.
# Only the first three entries are used by the active code below.
files = ['rest_review_sample_100000', 'categories/Chinese_pos', 'categories/Chinese_neg','categories/Chinese', 'categories/Mexican']
# NOTE: save_corpus() looks words up through the module-level `dictionary`,
# so every get_corpus() call below must rebind `corpus`/`dictionary` before
# the matching save_corpus() call. Do not reorder these statements.
# model_1: files[0], LDA with 20 topics.
(corpus, dictionary) = get_corpus(files[0] + ".txt")
save_corpus(corpus, files[0])
model_1 = get_lda_topic_model(corpus, dictionary, num_topics=20)
# Disabled experiments: an LSI model on the same corpus, and a sweep over the
# number of topics keeping the model with the largest u_mass coherence.
# model_1_1 = get_lsi_topic_model(corpus, dictionary, num_topics=20)
# largest_coherence = -1e20
# best_k = 0
# for k in range(5, 100, 2):
#     model = get_lda_topic_model(corpus, dictionary, num_topics=k)    
#     cm = models.coherencemodel.CoherenceModel(model=model, corpus=corpus, coherence='u_mass')
#     coherence = cm.get_coherence()
#     print("k=%d coherence=%f"%(k, coherence))
#     if (coherence > largest_coherence):
#         largest_coherence = coherence
#         model_1 = model
#         best_k = k
    
# model_2: files[1], LDA with the default number of topics (10).
corpus, dictionary = get_corpus(files[1] + ".txt")
save_corpus(corpus, files[1])
model_2 = get_lda_topic_model(corpus, dictionary)

# model_3: files[2], LDA with the default number of topics (10).
corpus, dictionary = get_corpus(files[2] + ".txt")
save_corpus(corpus, files[2])
model_3 = get_lda_topic_model(corpus, dictionary)
# Disabled: these call get_topic_model(), which is not defined in this notebook.
# model_4 = get_topic_model(files[3] + ".txt")
# model_5 = get_topic_model(files[4] + ".txt")

  0%|          | 0/100000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [2]:
# Build and save the processed corpus for a single input file.
# This rebinds the module-level `corpus` and `dictionary`; save_corpus() reads
# the latter, so the get_corpus() call must come first.
name = 'categories/Chinese'
corpus, dictionary = get_corpus(name + ".txt")
save_corpus(corpus, name)


  0%|          | 0/38716 [00:00<?, ?it/s][A
  0%|          | 1/38716 [00:01<19:45:34,  1.84s/it][A
  0%|          | 106/38716 [00:01<13:47:50,  1.29s/it][A
  0%|          | 182/38716 [00:02<9:38:35,  1.11it/s] [A
  1%|          | 230/38716 [00:02<6:44:54,  1.58it/s][A
  1%|          | 292/38716 [00:02<4:43:17,  2.26it/s][A
  1%|          | 367/38716 [00:02<3:18:11,  3.23it/s][A
  1%|          | 425/38716 [00:02<2:18:53,  4.60it/s][A
  1%|          | 481/38716 [00:02<1:37:26,  6.54it/s][A
  1%|▏         | 558/38716 [00:02<1:08:19,  9.31it/s][A
  2%|▏         | 622/38716 [00:02<48:02, 13.21it/s]  [A
  2%|▏         | 704/38716 [00:02<33:47, 18.75it/s][A
  2%|▏         | 791/38716 [00:02<23:49, 26.53it/s][A
  2%|▏         | 865/38716 [00:03<16:55, 37.29it/s][A
  2%|▏         | 945/38716 [00:03<12:03, 52.23it/s][A
  3%|▎         | 1037/38716 [00:03<08:37, 72.83it/s][A
  3%|▎         | 1126/38716 [00:03<06:14, 100.49it/s][A
  3%|▎         | 1228/38716 [00:03<04:32, 137.57it

 31%|███       | 12097/38716 [00:17<00:40, 663.51it/s][A
 31%|███▏      | 12193/38716 [00:17<00:36, 730.95it/s][A
 32%|███▏      | 12269/38716 [00:17<00:38, 692.84it/s][A
 32%|███▏      | 12364/38716 [00:17<00:35, 752.27it/s][A
 32%|███▏      | 12443/38716 [00:17<00:35, 741.10it/s][A
 32%|███▏      | 12520/38716 [00:17<00:37, 701.66it/s][A
 33%|███▎      | 12609/38716 [00:17<00:34, 749.07it/s][A
 33%|███▎      | 12687/38716 [00:17<00:38, 672.81it/s][A
 33%|███▎      | 12758/38716 [00:18<00:39, 651.37it/s][A
 33%|███▎      | 12826/38716 [00:18<00:42, 609.31it/s][A
 33%|███▎      | 12890/38716 [00:18<00:43, 588.12it/s][A
 33%|███▎      | 12951/38716 [00:18<00:44, 584.18it/s][A
 34%|███▎      | 13025/38716 [00:18<00:41, 622.99it/s][A
 34%|███▍      | 13109/38716 [00:18<00:37, 674.85it/s][A
 34%|███▍      | 13204/38716 [00:18<00:34, 738.64it/s][A
 34%|███▍      | 13296/38716 [00:18<00:32, 784.52it/s][A
 35%|███▍      | 13378/38716 [00:18<00:32, 791.59it/s][A
 35%|███▍     

 61%|██████    | 23526/38716 [00:32<00:17, 879.40it/s][A
 61%|██████    | 23617/38716 [00:32<00:17, 863.49it/s][A
 61%|██████    | 23706/38716 [00:32<00:17, 850.05it/s][A
 61%|██████▏   | 23793/38716 [00:32<00:18, 806.45it/s][A
 62%|██████▏   | 23876/38716 [00:32<00:18, 781.85it/s][A
 62%|██████▏   | 23956/38716 [00:33<00:20, 723.01it/s][A
 62%|██████▏   | 24035/38716 [00:33<00:19, 741.81it/s][A
 62%|██████▏   | 24127/38716 [00:33<00:18, 786.71it/s][A
 63%|██████▎   | 24208/38716 [00:33<00:18, 784.88it/s][A
 63%|██████▎   | 24304/38716 [00:33<00:17, 827.55it/s][A
 63%|██████▎   | 24389/38716 [00:33<00:20, 708.18it/s][A
 63%|██████▎   | 24503/38716 [00:33<00:17, 797.94it/s][A
 64%|██████▎   | 24612/38716 [00:33<00:16, 864.20it/s][A
 64%|██████▍   | 24705/38716 [00:33<00:18, 760.53it/s][A
 64%|██████▍   | 24788/38716 [00:34<00:20, 669.59it/s][A
 64%|██████▍   | 24874/38716 [00:34<00:19, 716.84it/s][A
 64%|██████▍   | 24958/38716 [00:34<00:18, 746.43it/s][A
 65%|██████▍  

 89%|████████▉ | 34481/38716 [00:48<00:06, 618.94it/s][A
 89%|████████▉ | 34545/38716 [00:48<00:07, 593.81it/s][A
 89%|████████▉ | 34606/38716 [00:48<00:07, 585.78it/s][A
 90%|████████▉ | 34685/38716 [00:48<00:06, 633.17it/s][A
 90%|████████▉ | 34754/38716 [00:48<00:06, 648.08it/s][A
 90%|████████▉ | 34832/38716 [00:48<00:05, 682.61it/s][A
 90%|█████████ | 34910/38716 [00:48<00:05, 708.70it/s][A
 90%|█████████ | 35003/38716 [00:48<00:04, 762.25it/s][A
 91%|█████████ | 35082/38716 [00:48<00:04, 760.02it/s][A
 91%|█████████ | 35160/38716 [00:49<00:04, 751.03it/s][A
 91%|█████████ | 35248/38716 [00:49<00:04, 783.16it/s][A
 91%|█████████▏| 35338/38716 [00:49<00:04, 812.70it/s][A
 91%|█████████▏| 35421/38716 [00:49<00:04, 784.90it/s][A
 92%|█████████▏| 35501/38716 [00:49<00:04, 684.01it/s][A
 92%|█████████▏| 35596/38716 [00:49<00:04, 740.73it/s][A
 92%|█████████▏| 35674/38716 [00:49<00:04, 737.05it/s][A
 92%|█████████▏| 35751/38716 [00:49<00:03, 743.23it/s][A
 93%|█████████

In [None]:
# Colour names assigned to topics by position; get_topic_json() cycles through
# this list when a model has more topics than there are entries.
Stock_Colors = ['black', 'maroon', 'red', 'purple', 'fuchsia', 'green', 'lime', 'olive', 'navy', 'blue', 'teal', 'aqua',
               'black', 'maroon', 'red', 'purple', 'fuchsia', 'green', 'lime', 'olive']

def get_topic_json(model, title, compare_words):
    """Convert a topic model's topics into a nested, JSON-serialisable dict.

    The result has the shape::

        {'name': title,
         'children': [{'name': 'Topic 0',
                       'children': [{'name': word, 'value': '1.00',
                                     'color': ..., 'new_word': bool}, ...]},
                      ...]}

    Args:
        model: Object whose ``show_topics(-1, formatted=False)`` returns a
            list of ``(topic_id, [(word, weight), ...])`` entries.
        title: Name stored on the root node.
        compare_words: Container of words to compare against; a word gets
            ``'new_word': True`` when it is not in this container.

    Returns:
        The nested dict described above. Each word's ``'value'`` is its weight
        divided by the weight of the first word of the same topic, formatted
        with two decimals. Topics are labelled and coloured by their position
        in the list returned by ``show_topics``.
    """
    root = {'name': title, 'children': []}

    for position, (_, word_weights) in enumerate(model.show_topics(-1, formatted=False)):
        topic_node = {'name': 'Topic ' + str(position), 'children': []}
        root['children'].append(topic_node)

        # A topic with no words has no reference weight; emit it with no
        # children instead of raising IndexError.
        if not word_weights:
            continue

        # Wrap around the palette so models with more topics than colours
        # do not raise IndexError.
        color = Stock_Colors[position % len(Stock_Colors)]
        max_weight = word_weights[0][1]

        for word, weight in word_weights:
            topic_node['children'].append({'name': word,
                                           'value': "{0:.2f}".format(weight/max_weight),
                                           'color': color,
                                           'new_word': word not in compare_words})

    return root


def get_topic_words(model):
    """Count in how many topic word lists each word appears.

    Args:
        model: Object whose ``show_topics(-1, formatted=False)`` returns a
            list of ``(topic_id, [(word, weight), ...])`` entries.

    Returns:
        A dict mapping each word to the number of times it occurs across all
        topics' word lists, in order of first appearance.
    """
    occurrences = {}
    for _, word_weights in model.show_topics(-1, formatted=False):
        for word, _ in word_weights:
            occurrences[word] = occurrences.get(word, 0) + 1

    return occurrences
    
# with open(files[0] + '.json', 'w') as f:
#     f.write(json.dumps(get_topic_json(model_1, '100000_Samples', {})))

# pos_words = get_topic_words(model_2)
# neg_words = get_topic_words(model_3)

# topic2 = get_topic_json(model_2, 'Positive', neg_words)
# topic3 = get_topic_json(model_3, 'Negative', pos_words)
# topic = {'name':'Chinese Restaurant Reviews', 'children':[topic2, topic3]}
# with open('compare' + '.json', 'w') as f:
#     f.write(json.dumps(topic))
    
# topic4 = get_topic_json(model_4, 'Chinese')
# topic5 = get_topic_json(model_5, 'Mexican')
# topic = {'name':'Chinese vs. Mexican', 'children':[topic2, topic3]}
# with open('compare_cn_mx' + '.json', 'w') as f:
#     f.write(json.dumps(topic))

In [None]:
# Print every topic of each trained LDA model, with a dashed rule after each
# model. Requires model_1, model_2 and model_3 to exist in the kernel already.
# Topics of model_1.
for idx, topic in model_1.print_topics(-1):
    print('Topic: {} Words: {}'.format(idx, topic))
print('-------------------------------------------------------------------')

# Disabled: topics of the LSI model (model_1_1 is only built in commented-out code).
# for idx, topic in model_1_1.print_topics(-1):
#     print('Topic: {} Words: {}'.format(idx, topic))
# print('-------------------------------------------------------------------')

# Topics of model_2.
for idx, topic in model_2.print_topics(-1):
    print('Topic: {} Words: {}'.format(idx, topic))
print('-------------------------------------------------------------------')

# Topics of model_3.
for idx, topic in model_3.print_topics(-1):
    print('Topic: {} Words: {}'.format(idx, topic))
print('-------------------------------------------------------------------')
# Disabled: model_4 and model_5 are only built in commented-out code.
# for idx, topic in model_4.print_topics(-1):
#     print('Topic: {} Words: {}'.format(idx, topic))
# print('-------------------------------------------------------------------')
# for idx, topic in model_5.print_topics(-1):
#     print('Topic: {} Words: {}'.format(idx, topic))

In [None]:
# (corpus, dictionary) = get_corpus(files[0] + ".txt")

# for k in range(5, 20):
#     model = get_lda_topic_model(corpus, dictionary, num_topics=k)    
#     cm = models.coherencemodel.CoherenceModel(model=model, corpus=corpus, coherence='u_mass')
#     print("k=%d coherence=%f"%(k,cm.get_coherence()))

In [None]:
# (corpus, dictionary) = get_corpus(files[1] + ".txt")
# hdp_2 = models.HdpModel(corpus, dictionary)
# l=hdp_2.suggested_lda_model()
# l.show_topics(20)

# (corpus, dictionary) = get_corpus(files[2] + ".txt")
# hdp_3 = models.HdpModel(corpus, dictionary)
# l=hdp_3.suggested_lda_model()
# l.show_topics(20)

Topic coherence for evaluating topic models: http://qpleple.com/topic-coherence-to-evaluate-topic-models/

Select number of topics for LDA model: https://cran.r-project.org/web/packages/ldatuning/vignettes/topics.html