In [79]:
import os
import os.path
import pickle
import time
import shelve

import chainer
from chainer import cuda
from chainer import serializers
import chainer.optimizers as O
import numpy as np

from lda2vec import utils
from lda2vec import prepare_topics, print_top_words_per_topic, topic_coherence
from lda2vec import LDA2Vec

In [80]:
# gpu_id = int(os.getenv('CUDA_GPU', 0))
# cuda.get_device(gpu_id).use()
# print("Using GPU:" + str(gpu_id))

In [81]:
#data_dir = os.getenv('data_dir', '../data/')
# Input artifacts, resolved relative to the current working directory.
fn_vocab = 'vocab.pkl'
fn_corpus = 'corpus.pkl'
fn_flatnd = 'flattened.npy'
fn_docids = 'doc_ids.npy'
fn_vectors = 'vectors.npy'

# Use context managers so the pickle file handles are closed deterministically
# instead of being left open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by a trusted preprocessing run.
with open(fn_vocab, 'rb') as fh:
    vocab = pickle.load(fh)
with open(fn_corpus, 'rb') as fh:
    corpus = pickle.load(fh)

# Arrays saved with numpy; presumably flattened token ids, the document id of
# each token, and pretrained word vectors -- confirm against the preprocessing.
flattened = np.load(fn_flatnd)
doc_ids = np.load(fn_docids)
vectors = np.load(fn_vectors)

In [82]:
# Model parameters (several can be overridden through environment variables)

# Number of documents: document ids are treated as 0-based indices
n_docs = doc_ids.max() + 1
# Number of unique words in the vocabulary: token ids are 0-based indices
n_vocab = flattened.max() + 1
# 'Strength' of the Dirichlet prior; 200.0 seems to work well
clambda = 200.0
# Number of topics to fit
n_topics = int(os.environ.get('n_topics', 30))
batchsize = 4096
# Power for negative sampling
power = float(os.environ.get('power', 0.75))
# Initialize with pretrained word vectors (env value is parsed as an integer flag)
pretrained = int(os.environ.get('pretrained', True)) != 0
# Sampling temperature
temperature = float(os.environ.get('temperature', 1.0))
# Number of dimensions in a single word vector
n_units = int(os.environ.get('n_units', 300))

# String representation for every compact key, truncated to the vocabulary size
words = corpus.word_list(vocab)[:n_vocab]

# Token count per document; documents that never occur keep a count of zero
doc_idx, lengths = np.unique(doc_ids, return_counts=True)
doc_lengths = np.zeros(n_docs, dtype=np.int32)
doc_lengths[doc_idx] = lengths

# Occurrence count per token id; ids that never occur keep a count of zero
tok_idx, freq = np.unique(flattened, return_counts=True)
term_frequency = np.zeros(n_vocab, dtype=np.int32)
term_frequency[tok_idx] = freq

In [83]:
# for key in sorted(locals().keys()):
#     val = locals()[key]
#     if len(str(val)) < 100 and '<' not in str(val):
#         print(key, val)

# training the model

In [84]:
# Instantiate the LDA2Vec model from the parameters configured above.
model = LDA2Vec(
    n_documents=n_docs,
    n_document_topics=n_topics,
    n_units=n_units,
    n_vocab=n_vocab,
    counts=term_frequency,
    n_samples=20,
    power=power,
    temperature=temperature,
)

(49, 300)
[[0.37454012 0.95071431 0.73199394 ... 0.21582103 0.62289048 0.08534746]
 [0.05168172 0.53135463 0.54063512 ... 0.17231987 0.19228902 0.04086862]
 [0.16893506 0.27859034 0.17701048 ... 0.89633582 0.01300192 0.08550853]
 ...
 [0.89466762 0.0116266  0.96590299 ... 0.97460921 0.68886648 0.89503275]
 [0.69937887 0.73511582 0.29041092 ... 0.03161501 0.77339222 0.0520524 ]
 [0.03815361 0.4298177  0.06307517 ... 0.38296701 0.20364864 0.64399795]]


In [85]:
# if os.path.exists('lda2vec.hdf5'):
#     print("Reloading from saved")
#     serializers.load_hdf5("lda2vec.hdf5", model)
    
# Optionally overwrite the sampler's weight matrix in place with the first
# n_vocab rows of the pretrained vectors.
if pretrained:
    model.sampler.W.data[:, :] = vectors[:n_vocab]

In [86]:
# model.to_gpu()
# Gradient-clipping hook with threshold 5.0, attached to an Adam optimizer
# that is bound to the model.
clip = chainer.optimizer.GradientClipping(5.0)
optimizer = O.Adam()
optimizer.setup(model)
optimizer.add_hook(clip)

In [87]:
# Global minibatch counter and epoch counter used by the training loop.
j, epoch = 0, 0
# Share of the whole token stream covered by one minibatch; used to scale the prior.
fraction = float(batchsize) / flattened.shape[0]
# Persistent store for per-epoch progress snapshots.
progress = shelve.open('progress.shelve')

In [88]:
# Training loop. Each epoch:
#   1. snapshots the current topics and prints their top words,
#   2. optionally scores topic coherence and stores it in the progress shelf,
#   3. exports the topic data for pyLDAvis,
#   4. runs one pass of minibatch updates and checkpoints the model.
for epoch in range(1):
    # Copy the parameters to host memory before deriving the topic summary.
    data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
                          cuda.to_cpu(model.mixture.factors.W.data).copy(),
                          cuda.to_cpu(model.sampler.W.data).copy(),
                          words)
    top_words = print_top_words_per_topic(data)
    if j % 100 == 0 and j > 100:
        coherence = topic_coherence(top_words)
        # Use a dedicated loop variable here: reusing `j` would overwrite the
        # global minibatch counter and corrupt both the check above and the
        # step number logged below.
        for topic_id in range(n_topics):
            print(topic_id, coherence[(topic_id, 'cv')])
        kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
        progress[str(epoch)] = pickle.dumps(kw)
        # Flush to disk so the snapshot survives even if the shelf is never
        # explicitly closed (e.g. the kernel is interrupted).
        progress.sync()
    data['doc_lengths'] = doc_lengths
    data['term_frequency'] = term_frequency
    # np.savez appends '.npz' to the name, so this writes 'topics.pyldavis.npz'.
    np.savez('topics.pyldavis', **data)
    print(epoch)
    for d, f in utils.chunks(batchsize, doc_ids, flattened):
        t0 = time.time()
        model.cleargrads()
        #optimizer.use_cleargrads(use=False)
        # NOTE(review): presumably fit_partial back-propagates its own loss
        # internally, since only the prior term is back-propagated here -- confirm.
        l = model.fit_partial(d.copy(), f.copy())
        print("after partial fitting:", l)
        prior = model.prior()
        # Scale the prior by the share of the data seen in this minibatch.
        loss = prior * fraction
        loss.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
               "P:{prior:1.3e} R:{rate:1.3e}")
        prior.to_cpu()
        loss.to_cpu()
        t1 = time.time()
        dt = t1 - t0
        # Throughput, computed from the nominal batch size per elapsed second.
        rate = batchsize / dt
        logs = dict(loss=float(l), epoch=epoch, j=j,
                    prior=float(prior.data), rate=rate)
        print(msg.format(**logs))
        j += 1
    # Checkpoint the model parameters at the end of every epoch.
    serializers.save_hdf5("lda2vec.hdf5", model)

Top words in topic 0 wine <SKIP> recommend appropriate value casual price reasonably fish ambience
Top words in topic 1 dish moderate friendly delicious price good menu flavorful fresh recommend
Top words in topic 2 out_of_vocabulary place atmosphere nice fish never would special ever like
Top words in topic 3 out_of_vocabulary atmosphere staff relaxed friendly high excellent moderate view reasonably
Top words in topic 4 recommend place appropriate <SKIP> view menu excellent relaxed best ambience
Top words in topic 5 value out_of_vocabulary price amazing sushi fish nice moderate food pizza
Top words in topic 6 recommend menu pizza flavorful special friendly appropriate moderate sushi wine
Top words in topic 7 sushi delicious amazing flavorful fresh menu never really absolutely excellent
Top words in topic 8 friendly relaxed plentiful atmosphere place reasonably casual view ambience moderate
Top words in topic 9 moderate friendly restaurant pizza place meal ever best out_of_vocabulary g

In [89]:
# Flatten the per-topic top-word lists into one list (duplicates retained).
all_topics = [word for row in top_words for word in row]

# Show the distinct top words in alphabetical order, followed by their count.
unique_sorted = sorted(dict.fromkeys(all_topics))
print(unique_sorted)
print(len(unique_sorted))

['<SKIP>', 'absolutely', 'always', 'amazing', 'ambience', 'appropriate', 'atmosphere', 'best', 'casual', 'decor', 'delicious', 'dish', 'ever', 'excellent', 'fish', 'flavorful', 'food', 'fresh', 'friendly', 'going', 'good', 'great', 'high', 'like', 'meal', 'menu', 'moderate', 'never', 'nice', 'out_of_vocabulary', 'pizza', 'place', 'plentiful', 'price', 'really', 'reasonably', 'recommend', 'relaxed', 'restaurant', 'service', 'special', 'staff', 'sushi', 'time', 'value', 'view', 'wine', 'would']
48


In [90]:
import pandas as pd

# Load the labelled review sentences and preview the first rows.
reviews_path = '../res_mul_all.csv'
df = pd.read_csv(reviews_path)
df.head()

Unnamed: 0,id,reviewID,sentenceID,review,category,polarity,entity,preprocessed_sentence,type_sentence
0,0,RL#3,RL#3:1,I am not necessarily fanatical about this plac...,VALUE#PRICES,positive,VALUE,i am not necessarily fanatical about this plac...,compound_sentence
1,2,TR#2,TR#2:2,The high prices you're going to pay is for the...,VALUE#PRICES,negative,VALUE,the high prices you 're going to pay is for th...,complex_sentence
2,3,TR#2,TR#2:2,The high prices you're going to pay is for the...,VALUE#PRICES,negative,VALUE,the high prices you 're going to pay is for th...,complex_sentence
3,4,TR#2,TR#2:2,The high prices you're going to pay is for the...,VALUE#PRICES,negative,VALUE,the high prices you 're going to pay is for th...,complex_sentence
4,5,TR#2,TR#2:2,The high prices you're going to pay is for the...,VALUE#PRICES,negative,VALUE,the high prices you 're going to pay is for th...,complex_sentence


In [91]:
import pandas as pd

def aspect_topic(tipe, all_topics):
    """Tag every review sentence with its aspect categories and matched topic words.

    Reads the module-level ``df`` DataFrame (columns ``id``, ``review`` and
    ``category``) and writes one row per sentence to ``lda2vec<tipe>.csv`` and
    ``lda2vec<tipe>.xlsx``. Each sentence that matches no topic word is
    printed, followed by the number of such sentences.

    Parameters
    ----------
    tipe : str
        Suffix inserted into the output file names.
    all_topics : sequence of str
        Topic words. A word is recorded once for every space-separated token
        of the lowercased sentence that starts with it.

    Returns
    -------
    None
    """
    records = []
    unmatched = 0
    # Walk the three columns in lockstep by position; this avoids the manual
    # counter and the label-based lookup that only works on a default index.
    for id_name, sentence, raw_category in zip(df['id'], df['review'], df['category']):
        lowercased = sentence.lower()
        # Tokenise once per sentence instead of once per topic word.
        tokens = lowercased.split(' ')

        category = []
        for cat in raw_category.split(','):
            parts = cat.split('#')
            # 'X#PRICES' is reported as VALUE; everything else keeps its
            # entity part. The length guard tolerates labels without '#'.
            if len(parts) > 1 and parts[1] == 'PRICES':
                category.append('VALUE')
            else:
                category.append(parts[0])

        # Same order as nested loops: topics outermost, tokens innermost.
        term = [topic
                for topic in all_topics
                for token in tokens
                if token.startswith(topic)]

        if not term:
            print(lowercased)
            unmatched += 1

        records.append({
            'id': id_name,
            'review': sentence.strip().lower().replace('  ', ' '),
            'category': '|'.join(category),
            'term': '|'.join(term),
        })
    print(unmatched)

    # Build the frame once: DataFrame.append in a loop is quadratic and was
    # removed in pandas 2.0.
    sf = pd.DataFrame(records, columns=['id', 'review', 'category', 'term'])
    sf.to_csv("lda2vec"+ tipe +".csv")
    sf.to_excel("lda2vec"+ tipe +".xlsx")

In [92]:
# De-duplicate the topic words (first-seen order) before tagging the reviews.
unique_topics = list(dict.fromkeys(all_topics))
aspect_topic("-nice", unique_topics)

chow fun was dry; pork shu mai was more than usually greasy and had to share a table with loud and rude family. 
the lava cake dessert was terrible.
once you step into cosette, you're miraculously in a small, off-the-beaten path parisian bistro.
my wife had the fried shrimp which are huge and loved it.
the hostess is rude to the point of being offensive.
there was a small wait, but shorter than i expected.
first went here to enjoy their garden terrace.
took my mom for mother's day, and the maitre d' was pretty rude.
tiny dessert was $8.00...just plain overpriced for what it is.
the tuna and wasabe potatoes are bad.
the bagel was small.
salads were bad.
ingredients are organic which is a real plus for me.
we even had a visit from the manager who wanted to make sure we were enjoying ourselves.
their tuna tartar appetizer is to die for.
the dining room is quietly elegant with no music to shout over -- how refreshing!
delivery is fast too.
thius is a must for anyone who loves shabu-shabu.
