In [1]:
import os
import os.path
import pickle
import time
import shelve

import chainer
from chainer import cuda
from chainer import serializers
import chainer.optimizers as O
import numpy as np

from lda2vec import utils
from lda2vec import prepare_topics, print_top_words_per_topic, topic_coherence
from lda2vec import LDA2Vec

  from ._conv import register_converters as _register_converters


In [2]:
# gpu_id = int(os.getenv('CUDA_GPU', 0))
# cuda.get_device(gpu_id).use()
# print("Using GPU:" + str(gpu_id))

In [3]:
#data_dir = os.getenv('data_dir', '../data/')
# Locations of the preprocessed artifacts read by this notebook.
fn_vocab = 'res/vocab.pkl'
fn_corpus = 'res/corpus.pkl'
fn_flatnd = 'res/flattened.npy'
fn_docids = 'res/doc_ids.npy'
fn_vectors = 'res/vectors.npy'

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts that were produced by a trusted preprocessing run.
# The files are opened in `with` blocks so the handles are closed right after
# loading instead of being left to the garbage collector.
with open(fn_vocab, 'rb') as fh:
    vocab = pickle.load(fh)
with open(fn_corpus, 'rb') as fh:
    corpus = pickle.load(fh)

# Arrays saved by numpy; they are used below as token ids, document ids and
# pretrained word vectors respectively.
flattened = np.load(fn_flatnd)
doc_ids = np.load(fn_docids)
vectors = np.load(fn_vectors)

In [4]:
# ---- Model parameters ----
# Sizes derived from the loaded arrays: number of documents and number of
# unique words in the vocabulary (largest id + 1 in each case)
n_docs = doc_ids.max() + 1
n_vocab = flattened.max() + 1

# 'Strength' of the Dirichlet prior; 200.0 seems to work well
clambda = 200.0
batchsize = 4096

# Settings that can be overridden through environment variables
n_topics = int(os.getenv('n_topics', 20))               # number of topics to fit
power = float(os.getenv('power', 0.75))                 # power for negative sampling
pretrained = bool(int(os.getenv('pretrained', True)))   # initialize with pretrained word vectors
temperature = float(os.getenv('temperature', 1.0))      # sampling temperature
n_units = int(os.getenv('n_units', 300))                # dimensions of a single word vector

# String representation for every compact key, truncated to the vocabulary size
words = corpus.word_list(vocab)[:n_vocab]

# Token count per document (documents with no tokens stay at zero)
doc_idx, lengths = np.unique(doc_ids, return_counts=True)
doc_lengths = np.zeros(n_docs, dtype='int32')
doc_lengths[doc_idx] = lengths

# Occurrence count per token id (unseen ids stay at zero)
tok_idx, freq = np.unique(flattened, return_counts=True)
term_frequency = np.zeros(n_vocab, dtype='int32')
term_frequency[tok_idx] = freq

In [5]:
# for key in sorted(locals().keys()):
#     val = locals()[key]
#     if len(str(val)) < 100 and '<' not in str(val):
#         print(key, val)

# training the model

In [6]:
# Build the LDA2Vec model from the sizes and hyper-parameters defined above.
# n_samples is fixed at 20 here (presumably the number of negative samples
# drawn per token -- confirm against the LDA2Vec implementation).
model = LDA2Vec(
    n_documents=n_docs,
    n_document_topics=n_topics,
    n_units=n_units,
    n_vocab=n_vocab,
    counts=term_frequency,
    n_samples=20,
    power=power,
    temperature=temperature,
)

In [7]:
# if os.path.exists('lda2vec.hdf5'):
#     print("Reloading from saved")
#     serializers.load_hdf5("lda2vec.hdf5", model)

# if pretrained:
#     model.sampler.W.data[:, :] = vectors[:n_vocab, :]

In [8]:
# model.to_gpu()
# Adam optimizer with its default hyper-parameters, attached to the model.
optimizer = O.Adam()
optimizer.setup(model)
# Register a GradientClipping hook with threshold 5.0 on the optimizer;
# the hook is added after setup(), and that order is kept as-is.
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)

In [9]:
# State shared with the training loop: `j` counts processed minibatches and
# `epoch` holds the current epoch number.
epoch = 0
j = 0
# Share of the whole token stream covered by one minibatch; the training loop
# multiplies the prior term by this value.
fraction = float(batchsize) / flattened.shape[0]
# Persistent key/value store that receives the per-epoch diagnostics.
progress = shelve.open('res/progress.shelve')

In [10]:
for epoch in range(1):
    # Copy the current parameters to the CPU and derive the per-topic data
    # used for reporting and for the pyLDAvis export.
    data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
                          cuda.to_cpu(model.mixture.factors.W.data).copy(),
                          cuda.to_cpu(model.sampler.W.data).copy(),
                          words)
    top_words = print_top_words_per_topic(data)
    # Coherence is only evaluated when the minibatch counter sits on a
    # multiple of 100 (and past the first 100 updates).
    if j % 100 == 0 and j > 100:
        coherence = topic_coherence(top_words)
        # A dedicated loop variable is used here: iterating with `j` would
        # overwrite the global minibatch counter, corrupting both the logged
        # "J:" value and the `j % 100` check on later epochs.
        for topic_id in range(n_topics):
            print(topic_id, coherence[(topic_id, 'cv')])
        kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
        progress[str(epoch)] = pickle.dumps(kw)
        # Flush to disk right away; the shelf is never closed in this cell.
        progress.sync()
    data['doc_lengths'] = doc_lengths
    data['term_frequency'] = term_frequency
    np.savez('res/topics.pyldavis', **data)
    print(epoch)
    for d, f in utils.chunks(batchsize, doc_ids, flattened):
        t0 = time.time()
        model.cleargrads()
        #optimizer.use_cleargrads(use=False)
        # fit_partial receives copies of the document-id and token-id chunks;
        # its return value is reported as the loss in the log line below.
        l = model.fit_partial(d.copy(), f.copy())
        print("after partial fitting:", l)
        # The prior term is scaled by the fraction of the corpus in a batch
        # before being back-propagated.
        prior = model.prior()
        loss = prior * fraction
        loss.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
               "P:{prior:1.3e} R:{rate:1.3e}")
        prior.to_cpu()
        loss.to_cpu()
        t1 = time.time()
        dt = t1 - t0
        # Throughput in tokens per second, based on the nominal batch size.
        rate = batchsize / dt
        logs = dict(loss=float(l), epoch=epoch, j=j,
                    prior=float(prior.data), rate=rate)
        print(msg.format(**logs))
        j += 1
    # Checkpoint the model parameters at the end of every epoch.
    serializers.save_hdf5("res/lda2vec.hdf5", model)

Top words in topic 0 make out_of_vocabulary never overpriced service place ambience decor rude would
Top words in topic 1 pizza would best out_of_vocabulary delicious dish atmosphere sushi friendly good
Top words in topic 2 worth make feel dinner like best though <SKIP> service restaurant
Top words in topic 3 menu good reasonable though well food feel always make sushi
Top words in topic 4 menu fresh wine service back place thing dinner friendly sushi
Top words in topic 5 really worth dinner place out_of_vocabulary romantic back overpriced like out_of_vocabulary
Top words in topic 6 ambience make place always friendly excellent atmosphere menu overpriced nice
Top words in topic 7 great romantic always back place out_of_vocabulary fresh best nice make
Top words in topic 8 romantic always delicious dinner nice wait would out_of_vocabulary menu special
Top words in topic 9 excellent always thing fresh feel wine back restaurant food never
Top words in topic 10 price dinner sushi nice good 

In [11]:
# Flatten the per-topic word lists into one list (duplicates kept).
all_topics = [word for row in top_words for word in row]
# Distinct words in alphabetical order, followed by how many there are.
unique_topics = sorted(dict.fromkeys(all_topics))
print(unique_topics)
print(len(unique_topics))

['<SKIP>', 'always', 'ambience', 'atmosphere', 'back', 'best', 'decor', 'delicious', 'dinner', 'dish', 'excellent', 'feel', 'food', 'fresh', 'friendly', 'good', 'great', 'like', 'make', 'meal', 'menu', 'never', 'nice', 'out_of_vocabulary', 'overpriced', 'pizza', 'place', 'price', 'really', 'reasonable', 'restaurant', 'romantic', 'rude', 'service', 'special', 'staff', 'sushi', 'thing', 'though', 'time', 'wait', 'well', 'wine', 'worth', 'would']
45


In [12]:
import pandas as pd

# Read the annotated review sentences; the columns used further down are
# sentenceID, review, target, category and polarity.
df = pd.read_csv('../dataset/res16_baru.csv')
# Show the first rows as a quick sanity check of the loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,reviewID,sentenceID,review,target,category,polarity
0,0,1004293,1004293:1,"We, there were four of us, arrived at noon - t...",staff,SERVICE,negative
1,1,1004293,1004293:3,The food was lousy - too sweet or too salty an...,"food,portions","FOOD,FOOD","negative,negative"
2,2,1014458,1014458:0,"I have eaten at Saul, many times, the food is ...",food,FOOD,positive
3,3,1014458,1014458:2,The duck confit is always amazing and the foie...,"foie gras terrine with figs,duck confit","FOOD,FOOD","positive,positive"
4,4,1014458,1014458:3,The wine list is interesting and has many good...,"wine list,wine list","FOOD,PRICES","positive,positive"


In [13]:
import pandas as pd

def _match_topics(sentence, topics):
    """Return the topic words found as substrings of the space-separated tokens of ``sentence``.

    The result is ordered topic-first and contains a topic once per token it
    matches, so a topic can appear several times.
    """
    tokens = sentence.split(' ')
    return [topic for topic in topics for token in tokens if topic in token]


def aspect_topic(tipe, all_topics, data=None):
    """Tag every review sentence with the topic words it contains and export the result.

    Parameters
    ----------
    tipe : str
        Base name of the output files; ``<tipe>.csv`` and ``<tipe>.xlsx`` are
        written to ``../Results/Aspect Terms Extraction/``.
    all_topics : iterable of str
        Candidate topic words. A word matches a sentence when it is a
        substring of one of the sentence's lower-cased, space-separated tokens.
    data : pandas.DataFrame, optional
        Frame with the columns ``review``, ``sentenceID``, ``target``,
        ``category`` and ``polarity``. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        The function prints each lower-cased sentence without any match,
        then the number of such sentences, and writes the two output files.
    """
    if data is None:
        data = df  # fall back to the review frame loaded earlier in the notebook
    # Materialize once so that one-shot iterables can be reused per sentence.
    topics = list(all_topics)
    columns = ['id', 'review', 'target', 'category', 'term', 'polarity']

    records = []
    count = 0
    # Walk the columns positionally; this does not rely on the frame having a
    # default 0..n-1 index, unlike label lookups with a manual counter.
    rows = zip(data['review'], data['sentenceID'], data['target'],
               data['category'], data['polarity'])
    for sentence, id_name, target, category, polarity in rows:
        lowercased = sentence.lower()
        term = _match_topics(lowercased, topics)
        if len(term) == 0:
            print(lowercased)
            count += 1
        records.append({'id': id_name,
                        'review': sentence.strip().lower().replace('  ', ' '),
                        'target': target,
                        'category': category,
                        'term': '|'.join(term),
                        'polarity': polarity})
    print(count)

    # Build the frame in one go: appending row by row is quadratic and
    # DataFrame.append no longer exists in pandas 2.x.
    sf = pd.DataFrame(records, columns=columns)
    sf.to_csv("../Results/Aspect Terms Extraction/"+ tipe +".csv")
    sf.to_excel("../Results/Aspect Terms Extraction/"+ tipe +".xlsx")

In [14]:
# Run the extraction with the topic words de-duplicated in first-seen order;
# the output files are named after the "lda2vec" label.
aspect_topic("lda2vec",list(dict.fromkeys(all_topics)))

their sake list was extensive, but we were looking for purple haze, which wasn't listed but made for us upon request!
ambiance- relaxed and stylish.
if you've ever been along the river in weehawken you have an idea of the top of view the chart house has to offer.
the lava cake dessert was incredible and i recommend it.
once you step into cosette, you're miraculously in a small, off-the-beaten path parisian bistro.
my wife had the fried shrimp which are huge and loved it.
a large is $20, and toppings are about $3 each.
located at the end of a magnificent block.
get the tuna of gari.
try the crunchy tuna, it is to die for.
first went here to enjoy their garden terrace.
the bagel was huge.
the workers there also absolutely load the bagel with cream cheese (gets a little messy).
hats off to the chef.
salads were fantastic.
ingredients are organic which is a real plus for me.
i started out with a bombay beer which was big enough for two.
fish was overdone.
someone else recommended the desse