In [42]:
import os
import os.path
import pickle
import time
import shelve

import chainer
from chainer import cuda
from chainer import serializers
import chainer.optimizers as O
import numpy as np

from lda2vec import utils
from lda2vec import prepare_topics, print_top_words_per_topic, topic_coherence
from lda2vec import LDA2Vec

In [43]:
# gpu_id = int(os.getenv('CUDA_GPU', 0))
# cuda.get_device(gpu_id).use()
# print("Using GPU:" + str(gpu_id))

In [44]:
#data_dir = os.getenv('data_dir', '../data/')
# Paths of the serialized inputs consumed by this notebook.
fn_vocab = 'res/vocab.pkl'
fn_corpus = 'res/corpus.pkl'
fn_flatnd = 'res/flattened.npy'
fn_docids = 'res/doc_ids.npy'
fn_vectors = 'res/vectors.npy'


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon
    as the object has been read, instead of being left to the garbage
    collector.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only point this at artifacts produced by a trusted pipeline.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


vocab = _load_pickle(fn_vocab)
corpus = _load_pickle(fn_corpus)
# Token ids (its max + 1 is used below as the vocabulary size).
flattened = np.load(fn_flatnd)
# Document ids (its max + 1 is used below as the number of documents).
doc_ids = np.load(fn_docids)
# Pretrained word vectors, optionally copied into the model's sampler weights.
vectors = np.load(fn_vectors)

In [45]:
# --- Model parameters (several can be overridden via environment variables) ---
# Number of documents: largest document id plus one
n_docs = doc_ids.max() + 1
# Number of unique words in the vocabulary: largest token id plus one
n_vocab = flattened.max() + 1
# 'Strength' of the Dirichlet prior; 200.0 seems to work well
clambda = 200.0
# Number of topics to fit
n_topics = int(os.environ.get('n_topics', 20))
batchsize = 4096
# Power for negative sampling
power = float(os.environ.get('power', 0.75))
# Initialize with pretrained word vectors (env value is parsed as an integer flag)
pretrained = bool(int(os.environ.get('pretrained', True)))
# Sampling temperature
temperature = float(os.environ.get('temperature', 1.0))
# Number of dimensions in a single word vector
n_units = int(os.environ.get('n_units', 300))
# String representation for every compact key, truncated to the vocabulary size
words = corpus.word_list(vocab)[:n_vocab]
# Token count per document; ids that never occur keep a count of zero
doc_idx, lengths = np.unique(doc_ids, return_counts=True)
doc_lengths = np.zeros(n_docs, dtype=np.int32)
doc_lengths[doc_idx] = lengths
# Occurrence count per token id; ids that never occur keep a count of zero
tok_idx, freq = np.unique(flattened, return_counts=True)
term_frequency = np.zeros(n_vocab, dtype=np.int32)
term_frequency[tok_idx] = freq

In [46]:
# for key in sorted(locals().keys()):
#     val = locals()[key]
#     if len(str(val)) < 100 and '<' not in str(val):
#         print(key, val)

# Training the model

In [47]:
# Instantiate LDA2Vec from the parameters computed earlier in the notebook.
# The number of negative samples (n_samples) is fixed at 20 here.
model = LDA2Vec(
    n_documents=n_docs,
    n_document_topics=n_topics,
    n_units=n_units,
    n_vocab=n_vocab,
    counts=term_frequency,
    n_samples=20,
    power=power,
    temperature=temperature,
)

In [48]:
# Resuming from a previous checkpoint is currently disabled:
# if os.path.exists('lda2vec.hdf5'):
#     print("Reloading from saved")
#     serializers.load_hdf5("lda2vec.hdf5", model)

# When requested, overwrite the sampler's weight matrix in place with the
# first n_vocab rows of the pretrained vectors.
if pretrained:
    model.sampler.W.data[...] = vectors[:n_vocab]

In [49]:
# GPU transfer is disabled; the model is left where it was created.
# model.to_gpu()
# Adam optimizer with its default settings, attached to the model.
optimizer = O.Adam()
optimizer.setup(model)
# Register a gradient-clipping hook with a threshold of 5.0; the hook is
# added after setup() has bound the optimizer to the model.
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)

In [50]:
# Global mini-batch counter, incremented once per batch by the training loop.
j = 0
epoch = 0
# Share of the whole token stream covered by one batch; the training loop
# multiplies the prior by this factor before back-propagating it.
fraction = batchsize * 1.0 / flattened.shape[0]
# Persistent store for per-epoch top words and coherence scores.
# NOTE(review): the shelf is never closed in the visible code — confirm it is
# closed (or synced) elsewhere so that entries reach disk.
progress = shelve.open('res/progress.shelve')

In [51]:
# Training loop. For every epoch: export the current topics, optionally record
# topic coherence, then run one optimizer update per mini-batch and finally
# checkpoint the model to disk.
for epoch in range(1):
    # Copy the current weights to host memory and build the topic summary.
    data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
                          cuda.to_cpu(model.mixture.factors.W.data).copy(),
                          cuda.to_cpu(model.sampler.W.data).copy(),
                          words)
    top_words = print_top_words_per_topic(data)
    # Coherence is only evaluated when the batch counter sits on a multiple
    # of 100 (and is past the first 100 batches) at the start of an epoch.
    if j % 100 == 0 and j > 100:
        coherence = topic_coherence(top_words)
        # Use a dedicated loop variable here: iterating with `j` would
        # overwrite the global batch counter, corrupting both the "J:" field
        # of the log line and the `j % 100` check above on later epochs.
        for topic_id in range(n_topics):
            print(topic_id, coherence[(topic_id, 'cv')])
        kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
        progress[str(epoch)] = pickle.dumps(kw)
        # The shelf is never closed in this notebook, so flush explicitly to
        # make sure the entry reaches disk.
        progress.sync()
    # Attach the corpus statistics and save everything for pyLDAvis.
    data['doc_lengths'] = doc_lengths
    data['term_frequency'] = term_frequency
    np.savez('res/topics.pyldavis', **data)
    print(epoch)
    for d, f in utils.chunks(batchsize, doc_ids, flattened):
        t0 = time.time()
        model.cleargrads()
        # fit_partial returns the value that is reported as the batch loss.
        # NOTE(review): presumably it also back-propagates that loss
        # internally, since only the prior is back-propagated here — confirm.
        l = model.fit_partial(d.copy(), f.copy())
        print("after partial fitting:", l)
        # Scale the prior by the share of the data covered by this batch.
        prior = model.prior()
        loss = prior * fraction
        loss.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
               "P:{prior:1.3e} R:{rate:1.3e}")
        prior.to_cpu()
        loss.to_cpu()
        t1 = time.time()
        dt = t1 - t0
        # Throughput: the nominal batch size divided by the batch wall time.
        rate = batchsize / dt
        logs = dict(loss=float(l), epoch=epoch, j=j,
                    prior=float(prior.data), rate=rate)
        print(msg.format(**logs))
        j += 1
    # Checkpoint the model once the epoch is complete.
    serializers.save_hdf5("res/lda2vec.hdf5", model)

Top words in topic 0 atmosphere delicious great price out_of_vocabulary place value best good relaxed
Top words in topic 1 relaxed atmosphere delicious out_of_vocabulary place out_of_vocabulary price pizza great food
Top words in topic 2 relaxed good food delicious high value best pizza price service
Top words in topic 3 out_of_vocabulary atmosphere best service out_of_vocabulary place high good food great
Top words in topic 4 value atmosphere service high reasonably place out_of_vocabulary out_of_vocabulary <SKIP> good
Top words in topic 5 out_of_vocabulary service <SKIP> great place delicious good high atmosphere reasonably
Top words in topic 6 delicious value price pizza reasonably place great best service good
Top words in topic 7 pizza food service atmosphere delicious place great good value price
Top words in topic 8 out_of_vocabulary <SKIP> place food value price atmosphere best out_of_vocabulary pizza
Top words in topic 9 pizza atmosphere relaxed <SKIP> delicious service high r

In [52]:
# Flatten the per-topic word lists into a single list (duplicates kept, in
# topic order), then report the sorted distinct words and how many there are.
all_topics = [word for row in top_words for word in row]
unique_topic_words = sorted(dict.fromkeys(all_topics))
print(unique_topic_words)
print(len(unique_topic_words))

['<SKIP>', 'atmosphere', 'best', 'delicious', 'food', 'good', 'great', 'high', 'out_of_vocabulary', 'pizza', 'place', 'price', 'reasonably', 'relaxed', 'service', 'value']
16


In [53]:
import pandas as pd

# Load the annotated review sentences; `df` is read as a notebook-level
# global by aspect_topic() further down.
df = pd.read_csv('../res_mul_all.csv')
# Preview the first rows (rendered as the cell output).
df.head()

Unnamed: 0,id,reviewID,sentenceID,review,category,polarity,entity,preprocessed_sentence,type_sentence
0,0,RL#3,RL#3:1,I am not necessarily fanatical about this plac...,VALUE#PRICES,positive,VALUE,i am not necessarily fanatical about this plac...,compound_sentence
1,2,TR#2,TR#2:2,The high prices you're going to pay is for the...,VALUE#PRICES,negative,VALUE,the high prices you 're going to pay is for th...,complex_sentence
2,3,TR#2,TR#2:2,The high prices you're going to pay is for the...,VALUE#PRICES,negative,VALUE,the high prices you 're going to pay is for th...,complex_sentence
3,4,TR#2,TR#2:2,The high prices you're going to pay is for the...,VALUE#PRICES,negative,VALUE,the high prices you 're going to pay is for th...,complex_sentence
4,5,TR#2,TR#2:2,The high prices you're going to pay is for the...,VALUE#PRICES,negative,VALUE,the high prices you 're going to pay is for th...,complex_sentence


In [54]:
import pandas as pd

def aspect_topic(tipe, all_topics, reviews=None,
                 out_dir="../Results/Aspect Terms Extraction/"):
    """Tag every review sentence with the topic words it contains and export the result.

    For each row, the sentence is lower-cased and split on single spaces; a
    topic word is recorded once for every token that starts with it (so a
    topic can appear more than once in a row's ``term`` field). Categories
    of the form ``ENTITY#ATTRIBUTE`` are reduced to ``ENTITY``, except that an
    attribute of ``PRICES`` maps to ``VALUE``. Sentences without any matching
    topic word are printed, followed by their total count.

    Parameters
    ----------
    tipe : str
        Base name (without extension) of the output files.
    all_topics : sequence of str
        Topic words to look for; iterated once per sentence.
    reviews : pandas.DataFrame, optional
        Frame with ``id``, ``review`` and ``category`` columns. Defaults to
        the notebook-level ``df``.
    out_dir : str, optional
        Directory receiving ``<tipe>.csv`` and ``<tipe>.xlsx``; it is created
        if it does not exist.

    Returns
    -------
    None
    """
    if reviews is None:
        # Fall back to the frame loaded earlier in the notebook.
        reviews = df

    records = []
    count = 0
    # Iterate positionally over the three columns rather than indexing by a
    # manually maintained counter, which only works with a 0..n-1 index.
    for id_name, sentence, raw_categories in zip(
            reviews['id'], reviews['review'], reviews['category']):
        lowercased = sentence.lower()
        # Tokenize once per sentence instead of once per topic word.
        tokens = lowercased.split(' ')

        category = []
        for cat in raw_categories.split(','):
            splitted = cat.split('#')
            # Tolerate categories without an attribute part instead of
            # raising IndexError.
            attribute = splitted[1] if len(splitted) > 1 else ''
            category.append('VALUE' if attribute == 'PRICES' else splitted[0])

        # Topic-major order, one entry per matching token.
        term = [topic
                for topic in all_topics
                for token in tokens
                if token.startswith(topic)]
        if not term:
            print(lowercased)
            count += 1

        records.append({
            'id': id_name,
            'review': sentence.strip().lower().replace('  ', ' '),
            'category': '|'.join(category),
            'term': '|'.join(term),
        })

    # Build the frame in one go: DataFrame.append in a loop is quadratic and
    # no longer exists in pandas >= 2.0.
    sf = pd.DataFrame(records, columns=['id', 'review', 'category', 'term'])
    print(count)

    os.makedirs(out_dir, exist_ok=True)
    sf.to_csv(os.path.join(out_dir, tipe + ".csv"))
    sf.to_excel(os.path.join(out_dir, tipe + ".xlsx"))

In [55]:
aspect_topic("20",list(dict.fromkeys(all_topics)))

the duck confit is always amazing and the foie gras terrine with figs was out of this world.
chow fun was dry; pork shu mai was more than usually greasy and had to share a table with loud and rude family. 
i had the duck breast special on my last visit and it was not incredible.
the only thing i moderately enjoyed was their grilled chicken special with edamame puree.
i had never had edamame pureed before but i thought it was innovative and tasty (could've used a bit more salt).
i happen to have a policy that goes along with a little bit of self-respect, which includes not letting a waiter intimidate me, i.e. make me feel bad asking for trivialities like water, or the check.
i tend to judge a sushi restaurant by its sea urchin, which was heavenly at sushi rose.
the sushi seemed pretty fresh and was adequately proportioned.
the prix fixe menu is worth every penny and you get more than enough (both in quantity and quality).
i am not a vegetarian but, almost all the dishes were bad.
i like

we are very particular about sushi and were both please with every choice which included: ceviche mix (special), crab dumplings, assorted sashimi, sushi and rolls, two types of sake, and the banana tempura.
we were greeted promptly by the waiter who was very nice and cordial.
the crust is thin, the ingredients are fresh and the staff is friendly.
the menu has so many fish items and oysters.
the fish was really,really fresh.
the first time the sushi was outstanding, the second time it was a little bland.
mizu is home to creative and unique rolls not to found anywhere else.
i ordered the smoked salmon and roe appetizer and it was off flavor.
the entree was bland and small, dessert was not inspired.
i expected quite a bit more from such an expensive menu.
the cold appetizer dishes taste like the way i remember them to taste when i was growing up in taiwan.
kind, attentive wait staff.
i really like both the scallops and the mahi mahi (on saffron risotto-yum!).
their calzones are horrific, 