In [1]:
# import gensim
# from gensim.utils import simple_preprocess
# from gensim.parsing.preprocessing import STOPWORDS
# from nltk.stem import WordNetLemmatizer, SnowballStemmer
# from nltk.stem.porter import *
# import numpy as np
# import nltk

# np.random.seed(2018)
# nltk.download('wordnet')

In [2]:
# import logging
# from time import time
# np.random.seed(2018)

# #logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# #lda_model = gensim.models.LdaModel(bow_corpus, num_topics=10, id2word=dictionary,  eval_every=5, iterations = 1000, alpha='auto', gamma_threshold=0.01)
# t0 = time()
# #lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, iterations = 1000, passes = 2)
# lda_model = gensim.models.LdaModel(bow_corpus, num_topics=10, id2word=dictionary,  eval_every=5, iterations = 1000, alpha='auto', gamma_threshold=0.01)
# print("done in %fs" % (time() - t0))

# for idx, topic in lda_model.print_topics(-1):
#     print('Topic: {} \nWords: {}'.format(idx, topic))

In [3]:
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import numpy as np
from tqdm import tqdm
from time import time
import json

np.random.seed(2018)

# Shared English Snowball stemmer used by lemmatize_stemming().
stemmer = SnowballStemmer('english')

def lemmatize_stemming(text):
    """Return the Snowball stem of the WordNet verb lemma of ``text``.

    ``text`` is lemmatized with ``WordNetLemmatizer`` using ``pos='v'``
    (i.e. treated as a verb), and the resulting lemma is then stemmed
    with the module-level English ``stemmer``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Tokenize ``text`` and return its normalized, non-stopword tokens.

    Tokens are produced by ``simple_preprocess`` with ``min_len=4``.
    Every token that appears in ``STOPWORDS`` is discarded; each remaining
    token is passed through ``lemmatize_stemming``. The original token
    order is preserved in the returned list.
    """
    return [
        lemmatize_stemming(word)
        for word in simple_preprocess(text, min_len=4)
        if word not in STOPWORDS
    ]

def get_topic_model(file_name, num_topics = 10, use_tfidf = True,
                    encoding = 'utf-8', random_state = None):
    """Train an LDA topic model on a text file holding one document per line.

    Each line of ``file_name`` is run through ``preprocess``; a gensim
    dictionary is built from the resulting token lists and pruned with
    ``filter_extremes(no_below=15, no_above=0.5, keep_n=100000)``. The
    documents are converted to bag-of-words vectors, optionally re-weighted
    with TF-IDF, and fed to ``models.LdaModel``.

    Parameters
    ----------
    file_name : str
        Path of the text file to read (one document per line).
    num_topics : int, optional
        Number of LDA topics to fit. Defaults to 10.
    use_tfidf : bool, optional
        When true (the default) the bag-of-words corpus is wrapped in a
        ``TfidfModel`` transformation before training.
    encoding : str, optional
        Text encoding used to read ``file_name``. Defaults to ``'utf-8'``
        so the result no longer depends on the platform's locale encoding.
    random_state : optional
        Forwarded unchanged to ``LdaModel(random_state=...)``. The default
        ``None`` is also ``LdaModel``'s own default, so seeding behaves as
        it did before this parameter existed; pass a seed to make a single
        call reproducible on its own.

    Returns
    -------
    The trained ``models.LdaModel`` instance.

    Side effects: shows a tqdm progress bar while preprocessing and prints
    the wall-clock time spent in LDA training only.
    """
    # Read with an explicit encoding: relying on the locale default makes
    # the same file load differently (or fail) from one machine to another.
    with open(file_name, 'r', encoding=encoding) as f:
        documents = f.readlines()

    processed_docs = [preprocess(text) for text in tqdm(documents)]

    dictionary = corpora.Dictionary(processed_docs)
    dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

    corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    if use_tfidf:
        tfidf = models.TfidfModel(corpus)
        corpus = tfidf[corpus]

    # Only the model fit is timed; reading and preprocessing are excluded.
    t0 = time()
    lda_model = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        eval_every=5,
        iterations=1000,
        alpha='auto',
        gamma_threshold=0.01,
        random_state=random_state,
    )
    print("done in %fs" % (time() - t0))

    return lda_model

# Corpus base names without extension; each one is read from '<name>.txt'.
files = [
    'rest_review_sample_100000',
    'categories/Chinese_pos',
    'categories/Chinese_neg',
    'categories/Chinese',
    'categories/Mexican',
]

# The 100k sample is fitted with 20 topics; the two Chinese_* files use the
# function's default topic count.
model_1 = get_topic_model('{}.txt'.format(files[0]), num_topics=20)
model_2 = get_topic_model('{}.txt'.format(files[1]))
model_3 = get_topic_model('{}.txt'.format(files[2]))
# Disabled: models for the remaining two category files.
# model_4 = get_topic_model(files[3] + ".txt")
# model_5 = get_topic_model(files[4] + ".txt")

100%|█████████████████████████████████████████████████████████████████████████| 100000/100000 [02:41<00:00, 619.33it/s]


done in 79.154130s


100%|███████████████████████████████████████████████████████████████████████████| 27804/27804 [00:40<00:00, 690.49it/s]


done in 23.665388s


100%|███████████████████████████████████████████████████████████████████████████| 10912/10912 [00:17<00:00, 625.32it/s]


done in 11.250513s


In [4]:
# Palette used to color topics; get_topic_json() cycles through it by topic
# position, so topics beyond the palette length reuse colors from the start.
Stock_Colors = ['black', 'maroon', 'red', 'purple', 'fuchsia', 'green', 'lime', 'olive', 'navy', 'blue', 'teal', 'aqua',
               'black', 'maroon', 'red', 'purple', 'fuchsia', 'green', 'lime', 'olive']

def get_topic_json(model, title, compare_words):
    """Build a nested, JSON-serializable tree describing a model's topics.

    Parameters
    ----------
    model
        Object exposing ``show_topics(-1, formatted=False)`` that returns a
        sequence of ``(topic_id, [(word, weight), ...])`` pairs.
    title : str
        Value of the root node's ``'name'``.
    compare_words
        Container supporting ``in``; a word is flagged ``'new_word': True``
        when it is NOT contained in it.

    Returns
    -------
    dict
        ``{'name': title, 'children': [topic_node, ...]}``. Each topic node
        is ``{'name': 'Topic <i>', 'children': [word_node, ...]}`` where
        ``i`` is the topic's position in the ``show_topics`` result. Each
        word node has ``'name'`` (the word), ``'value'`` (the word's weight
        divided by the weight of the topic's first listed word, formatted
        as a string with two decimals), ``'color'`` and ``'new_word'``.
    """
    topics = model.show_topics(-1, formatted=False)

    topic_nodes = []
    for i, (_topic_id, word_weights) in enumerate(topics):
        # Wrap around the palette: indexing Stock_Colors[i] directly raised
        # IndexError for any model with more topics than palette entries.
        color = Stock_Colors[i % len(Stock_Colors)]

        # Weights are scaled relative to the first listed word. A topic with
        # no words, or a zero leading weight, must not crash the export.
        max_weight = word_weights[0][1] if word_weights else 0

        word_nodes = []
        for word, weight in word_weights:
            relative = weight / max_weight if max_weight else 0.0
            word_nodes.append({
                'name': word,
                'value': "{0:.2f}".format(relative),
                'color': color,
                'new_word': word not in compare_words,
            })

        topic_nodes.append({'name': 'Topic ' + str(i), 'children': word_nodes})

    return {'name': title, 'children': topic_nodes}


def get_topic_words(model):
    """Count how often each word occurs across all of a model's topics.

    ``model.show_topics(-1, formatted=False)`` is expected to yield
    ``(topic_id, [(word, weight), ...])`` entries. The returned dict maps
    every listed word to the number of times it appears over all of those
    word lists; weights are ignored.
    """
    word_counts = {}
    for entry in model.show_topics(-1, formatted=False):
        for pair in entry[1]:
            word = pair[0]
            word_counts[word] = word_counts.get(word, 0) + 1
    return word_counts
    
# Export the 100k-sample model by itself. The comparison set is empty, so
# every word in this file is flagged as 'new_word'.
with open('{}.json'.format(files[0]), 'w') as out_file:
    json.dump(get_topic_json(model_1, '100000_Samples', {}), out_file)

# Words listed by each of the two Chinese-review models, used to flag the
# words that appear in one model's topics but not in the other's.
pos_words = get_topic_words(model_2)
neg_words = get_topic_words(model_3)

topic2 = get_topic_json(model_2, 'Positive', neg_words)
topic3 = get_topic_json(model_3, 'Negative', pos_words)
topic = {'name':'Chinese Restaurant Reviews', 'children':[topic2, topic3]}
with open('compare.json', 'w') as out_file:
    json.dump(topic, out_file)

# Disabled Chinese vs. Mexican export.
# NOTE(review): before re-enabling, the get_topic_json calls below need a
# third argument (compare_words), and 'children' lists topic2/topic3 where
# topic4/topic5 look intended.
# topic4 = get_topic_json(model_4, 'Chinese')
# topic5 = get_topic_json(model_5, 'Mexican')
# topic = {'name':'Chinese vs. Mexican', 'children':[topic2, topic3]}
# with open('compare_cn_mx' + '.json', 'w') as f:
#     f.write(json.dumps(topic))

In [6]:
# Print every topic of each trained model, with a dashed rule after each model.
separator = '-------------------------------------------------------------------'
for lda in (model_1, model_2, model_3):
    for idx, topic in lda.print_topics(-1):
        print('Topic: {} Words: {}'.format(idx, topic))
    print(separator)
# Disabled: topic listings for the Chinese / Mexican category models.
# for idx, topic in model_4.print_topics(-1):
#     print('Topic: {} Words: {}'.format(idx, topic))
# print('-------------------------------------------------------------------')
# for idx, topic in model_5.print_topics(-1):
#     print('Topic: {} Words: {}'.format(idx, topic))

Topic: 0 Words: 0.011*"truffl" + 0.011*"crepe" + 0.010*"chocol" + 0.007*"dessert" + 0.007*"concept" + 0.006*"cake" + 0.006*"kale" + 0.006*"appl" + 0.006*"wine" + 0.006*"lamb"
Topic: 1 Words: 0.051*"pancak" + 0.042*"breakfast" + 0.033*"egg" + 0.030*"coffe" + 0.021*"omelet" + 0.019*"benedict" + 0.019*"tot" + 0.019*"hash" + 0.018*"bagel" + 0.016*"toast"
Topic: 2 Words: 0.019*"crawfish" + 0.016*"blast" + 0.015*"cupcak" + 0.014*"reward" + 0.014*"yard" + 0.013*"scrambl" + 0.012*"alot" + 0.010*"uncomfort" + 0.010*"loco" + 0.010*"accept"
Topic: 3 Words: 0.008*"great" + 0.007*"good" + 0.007*"place" + 0.007*"burger" + 0.006*"love" + 0.005*"friend" + 0.005*"fri" + 0.005*"order" + 0.005*"like" + 0.005*"servic"
Topic: 4 Words: 0.011*"slider" + 0.009*"ketchup" + 0.007*"spoon" + 0.007*"ignor" + 0.007*"tasteless" + 0.006*"milk" + 0.006*"negat" + 0.006*"train" + 0.005*"horribl" + 0.005*"brat"
Topic: 5 Words: 0.009*"minut" + 0.008*"wait" + 0.007*"tabl" + 0.007*"manag" + 0.006*"ask" + 0.006*"custom" + 0.