In [1]:
import numpy as np
import nltk
import pandas as pd
from ast import literal_eval
from collections import Counter
from scipy.optimize import fmin_l_bfgs_b
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
import multiprocessing
import random
from tqdm import tqdm
import optimizeTopicVectors as ot

In [2]:
def sampleFromDirichlet(alpha):
    """Draw one vector from a Dirichlet distribution.

    :param alpha: concentration parameters, handed unchanged to
        ``np.random.dirichlet``.
    :return: the sample produced by ``np.random.dirichlet``.
    """
    draw = np.random.dirichlet(alpha)
    return draw


def sampleFromCategorical(theta):
    """Draw a single category index with the probabilities given in ``theta``.

    One multinomial trial is run with ``theta`` as the outcome probabilities
    and the position of the outcome that occurred is returned.  ``theta`` is
    used exactly as passed in; it is not re-normalised here.

    :param theta: 1-D sequence of outcome probabilities.
    :return: index of the sampled outcome.
    """
    one_hot = np.random.multinomial(1, theta)
    return np.argmax(one_hot)


def word_indices(doc_sent_word_dict, sent_index):
    """Lazily iterate over the entries of one sentence.

    :param doc_sent_word_dict: indexable container of sentences; each
        sentence is an iterable of word indices.
    :param sent_index: key/position of the sentence to walk through.
    :return: generator yielding the sentence's items in order.
    """
    yield from doc_sent_word_dict[sent_index]

In [3]:
import numpy as np
import nltk
import pandas as pd
from ast import literal_eval
from collections import Counter
from scipy.optimize import fmin_l_bfgs_b
import optimizeTopicVectors as ot
from preprocess import *

def sampleFromDirichlet(alpha):
    """Draw one vector from a Dirichlet distribution.

    :param alpha: concentration parameters, handed unchanged to
        ``np.random.dirichlet``.
    :return: the sample produced by ``np.random.dirichlet``.
    """
    draw = np.random.dirichlet(alpha)
    return draw


def sampleFromCategorical(theta):
    """Draw a single category index with the probabilities given in ``theta``.

    One multinomial trial is run with ``theta`` as the outcome probabilities
    and the position of the outcome that occurred is returned.  ``theta`` is
    used exactly as passed in; it is not re-normalised here.

    :param theta: 1-D sequence of outcome probabilities.
    :return: index of the sampled outcome.
    """
    one_hot = np.random.multinomial(1, theta)
    return np.argmax(one_hot)


def word_indices(doc_sent_word_dict, sent_index):
    """Lazily iterate over the entries of one sentence.

    :param doc_sent_word_dict: indexable container of sentences; each
        sentence is an iterable of word indices.
    :param sent_index: key/position of the sentence to walk through.
    :return: generator yielding the sentence's items in order.
    """
    yield from doc_sent_word_dict[sent_index]


class ASUM_Gibbs_Sampler:
    """Gibbs sampler that gives every sentence one (topic, sentiment) pair.

    Usage as far as this class shows: construct it, call ``_initialize_``
    with the tokenised reviews to build the vocabulary and the count tables,
    then call ``runASUM`` to iterate.  The ``calculate*`` / ``getTop*``
    methods read the count tables afterwards.
    """

    def __init__(self, wordVectors, sentimentVector, numTopics, alpha, beta, gamma, binary=0.5, max_sentence=50, numSentiments=2):
        """Store embeddings and hyper-parameters; no counting happens here.

        :param wordVectors: 2-D array of word embeddings (V x H).
        :param sentimentVector: 2-D array of sentiment embeddings (L x H).
        :param numTopics: number of topics K.
        :param alpha: smoothing constant (see ``sampling`` / ``calculateTheta``).
        :param beta: smoothing constant for the word counts.
        :param gamma: smoothing constant (see ``sampling`` / ``calculatePi``).
        :param binary: stored as ``self.binary``; not read by any method of this class.
        :param max_sentence: stored as ``self.maxSentence``; not read by any method of this class.
        :param numSentiments: number of sentiment labels L.
        """
        self.wordVectors = wordVectors # (V x H)
        self.numTopics = numTopics
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.numSentiments = numSentiments
        #self.MAX_VOCAB_SIZE = max_vocab_size
        self.maxSentence = max_sentence
        self.dimension = self.wordVectors.shape[1]  # H
        self.binary = binary
        self.sentimentVector = sentimentVector # (L x H)

    def build_dataset(self, reviews, sentiment_list):
        """Index the corpus and collect per-document sentence structures.

        :param reviews: review data, nested as
            [[[doc1 sentence1 words], [doc1 sentence2 words]], [[doc2 sentence1 words], ...], ...]
        :param sentiment_list: per-document sentiment values, indexed by document position.
        :return: tuple ``(word2idx, idx2word, doc_sent_word_dict,
            wordCountSentence, numSentence, docSentiment)``.
        """
        corpus = [word for review in reviews for sentence in review for word in sentence]
        text = nltk.Text(corpus)
        freq = nltk.FreqDist(text)
        #keywords = [tup[0] for tup in freq.most_common(self.MAX_VOCAB_SIZE)]  # pick the most frequent words
        # Keep at most as many words as there are rows in wordVectors.
        # NOTE(review): indices here follow corpus frequency rank, while rows of
        # wordVectors follow whatever order the caller built them in -- confirm
        # that the two orderings agree.
        keywords = [tup[0] for tup in freq.most_common(self.wordVectors.shape[0])]
        word2idx = {}  # key: word, value: index
        for index, key in enumerate(keywords):
            word2idx[key] = index

        idx2word = dict(zip(word2idx.values(), word2idx.keys()))  # key: index, value: word
        doc_sent_word_dict = {}  # key: document index, value: [[word indices of sent1], [word indices of sent2], ...]
        numSentence = {}  # key: document index, value: number of sentences in that document
        wordCountSentence = {}  # key: document index, value: one Counter of word indices per sentence
        docSentiment = {}  # key: document index, value: sentiment_list entry for that document
        for index, review in enumerate(reviews):
            doc_sent_lst = []
            doc_sent_count = []
            for sent in review:
                # Words outside the vocabulary are silently dropped.
                sent_word_ids = [word2idx[word] for word in sent if word in word2idx]
                doc_sent_lst.append(sent_word_ids)
                doc_sent_count.append(Counter(sent_word_ids))
            numSentence[index] = len(doc_sent_lst)
            doc_sent_word_dict[index] = doc_sent_lst
            wordCountSentence[index] = doc_sent_count
            docSentiment[index] = sentiment_list[index]

        return word2idx, idx2word, doc_sent_word_dict, wordCountSentence, numSentence, docSentiment

    def _initialize_(self, reviews, pos_neg_sentence_indices, pos_neg_sentiment_label, sentiment_list):
        """Build the dataset, allocate the count tables and make initial assignments.

        Each sentence gets a random topic (drawn from a per-document Dirichlet
        sample) and the sentiment whose vector has the largest summed dot
        product with the sentence's word vectors.

        :param reviews: nested review data, see ``build_dataset``.
        :param pos_neg_sentence_indices: document indices evaluated by ``classify_senti``.
        :param pos_neg_sentiment_label: reference labels compared against in ``classify_senti``.
        :param sentiment_list: per-document sentiment values, see ``build_dataset``.
        """
        self.word2idx, self.idx2word, self.doc_sent_word_dict, self.wordCountSentence, \
        self.numSentence, self.docSentiment = self.build_dataset(reviews, sentiment_list)
        self.numDocs = len(self.doc_sent_word_dict.keys())
        self.vocabSize = len(self.word2idx.keys())
        self.pos_neg_sentence_indices = pos_neg_sentence_indices
        self.pos_neg_sentiment_label = pos_neg_sentiment_label
        self.topicVectors = ot.orthogonal_matrix((self.numTopics, self.dimension))

        # Pseudocounts
        self.n_wkl = np.zeros((self.vocabSize, self.numTopics, self.numSentiments))  # times word w is assigned to topic k, sentiment l
        self.n_kl = np.zeros((self.numTopics, self.numSentiments))  # number of words assigned to topic k, sentiment l
        self.ns_d = np.zeros((self.numDocs))  # number of sentences in document d
        self.ns_dkl = np.zeros((self.numDocs, self.numTopics, self.numSentiments))  # sentences of document d assigned to topic k, sentiment l
        self.ns_dl = np.zeros((self.numDocs, self.numSentiments))  # sentences of document d assigned to sentiment l
        self.topics = {}
        self.sentiments = {}

        alphaVec = self.alpha * np.ones(self.numTopics)
        gammaVec = self.gamma * np.ones(self.numSentiments)

        for d in range(self.numDocs):
            topicDistribution = sampleFromDirichlet(alphaVec)
            # sentimentDistribution is only used by the commented-out line below;
            # the draws are kept so the random number stream is unchanged.
            sentimentDistribution = np.zeros((self.numTopics, self.numSentiments))

            for t in range(self.numTopics):
                sentimentDistribution[t, :] = sampleFromDirichlet(gammaVec)

            for m in range(self.numSentence[d]):
                t = sampleFromCategorical(topicDistribution)
                # s = sampleFromCategorical(sentimentDistribution[t, :])
                # s = self.docSentiment[d]
                # Score each sentiment by the sum of dot products with the sentence's word vectors.
                pos_score = np.dot(self.sentimentVector,
                                   self.wordVectors[self.doc_sent_word_dict[d][m]].T).sum(axis=1)
                s = np.argmax(pos_score)
                self.topics[(d, m)] = t  # topic of sentence m in document d
                self.sentiments[(d, m)] = s  # sentiment of sentence m in document d
                self.ns_d[d] += 1
                self.ns_dkl[d, t, s] += 1
                self.ns_dl[d, s] += 1
                for w in word_indices(self.doc_sent_word_dict[d], m):  # every word of sentence m in document d
                    self.n_wkl[w, t, s] += 1
                    self.n_kl[t, s] += 1

    def updateTopicVectors(self, lamda = 0.01):
        """Re-optimise each topic vector with L-BFGS-B, starting from its current value.

        :param lamda: passed as the last extra argument to ``ot.loss`` / ``ot.grad``
            (presumably a regularisation weight -- confirm in optimizeTopicVectors).
        """
        t = self.topicVectors # (K, H)
        for i in range(self.numTopics):
            x0 = t[i, :]
            # Use the embeddings stored on the instance; the original read a
            # module-level ``wordVectors`` that only exists as notebook state.
            x, f, d = fmin_l_bfgs_b(ot.loss, x0, fprime=ot.grad,
                                    args=(self.n_wkl, self.wordVectors, lamda), maxiter=15000)
            t[i, :] = x
        self.topicVectors = t


    def sampling(self, d, m):
        """Resample the (topic, sentiment) assignment of sentence ``m`` in document ``d``.

        The sentence's current assignment is removed from every counter, an
        unnormalised (numTopics x numSentiments) table is built from three
        factors, a new pair is drawn from it, and the sentence is added back
        under the new assignment.

        NOTE(review): ``secondFactor`` smooths the per-sentiment counts with
        ``alpha``/``numTopics`` and ``thirdFactor`` smooths the per-(topic,
        sentiment) counts with ``gamma``/``numSentiments``, whereas
        ``calculatePi`` / ``calculateTheta`` use the opposite pairing; likewise
        ``beta0`` adds ``beta`` once while ``calculatePhi`` adds
        ``vocabSize * beta``.  Behaviour is left as-is; confirm which is intended.
        """
        t = self.topics[(d, m)]
        s = self.sentiments[(d, m)]
        # Take the sentence out of all counts.
        self.ns_d[d] -= 1
        self.ns_dkl[d, t, s] -= 1
        self.ns_dl[d, s] -= 1
        for w in word_indices(self.doc_sent_word_dict[d], m):
            self.n_wkl[w, t, s] -= 1
            self.n_kl[t, s] -= 1

        firstFactor = np.ones((self.numTopics, self.numSentiments))

        word_count = self.wordCountSentence[d][m]
        for k in range(self.numTopics):
            for l in range(self.numSentiments):
                beta0 = self.n_kl[k][l] + self.beta
                m0 = 0  # words of this sentence already accounted for
                for word in word_count.keys():
                    betaw = self.n_wkl[word, k, l] + self.beta
                    cnt = word_count[word]
                    for i in range(cnt):
                        # i counts repeats of the same word, m0 counts all words so far.
                        firstFactor[k][l] *= (betaw + i) / (beta0 + m0)
                        m0 += 1

        # topic_similarity = ot.softmax(np.dot(self.topicVectors,
        #                                      self.wordVectors[
        #                                          self.doc_sent_word_dict[d][m]].T))  # ( K x num words in sentence)
        # senti_similarity = ot.softmax(np.dot(self.sentimentVector,
        #                                      self.wordVectors[
        #                                          self.doc_sent_word_dict[d][m]].T))  # ( L x num words in sentence)
        # vector_similarity = ot.softmax(np.dot(topic_similarity, senti_similarity.T))
        #
        # firstFactor = firstFactor *  vector_similarity # dim(K x L)

        secondFactor = (self.ns_dl[d, :] + self.alpha) / \
                       (self.ns_d[d] + self.numTopics * self.alpha)  # one value per sentiment

        thirdFactor = (self.ns_dkl[d, :, :] + self.gamma) / \
                      (self.ns_dl[d] + self.numSentiments * self.gamma)[np.newaxis, :] #(K, L)

        prob = np.ones((self.numTopics, self.numSentiments))
        prob *= firstFactor * thirdFactor
        prob *= secondFactor[np.newaxis,:]
        prob /= np.sum(prob)

        # Draw from the flattened table, then recover the (topic, sentiment) pair.
        ind = sampleFromCategorical(prob.flatten())
        t, s = np.unravel_index(ind, prob.shape)

        self.topics[(d, m)] = t
        self.sentiments[(d, m)] = s
        self.ns_d[d] += 1
        self.ns_dkl[d, t, s] += 1
        self.ns_dl[d, s] += 1
        for w in word_indices(self.doc_sent_word_dict[d], m):
            self.n_wkl[w, t, s] += 1
            self.n_kl[t, s] += 1

    def calculatePhi(self):
        """Return the smoothed word table of shape (V, K, L).

        Entry ``[w, k, l]`` is ``(n_wkl + beta) / (n_kl + V * beta)``.
        """
        firstFactor = (self.n_wkl + self.beta) / \
                      np.expand_dims(self.n_kl + self.n_wkl.shape[0] * self.beta, axis=0)

        # topic_similarity = ot.softmax(np.dot(self.topicVectors,
        #                                      self.wordVectors.T))  # ( K x V)
        # senti_similarity = ot.softmax(np.dot(self.sentimentVector,
        #                                      self.wordVectors.T))  # ( L x V)
        # vector_similarity = ot.softmax(np.dot(topic_similarity, senti_similarity.T)) # K x L
        #
        # firstFactor = firstFactor * np.expand_dims(vector_similarity, axis=0)
        # firstFactor /= firstFactor.sum()
        return firstFactor

    def calculateTheta(self):
        """Return the smoothed per-document (topic, sentiment) table of shape (D, K, L).

        NOTE(review): the result is divided by the sum over the whole array,
        so the slice for a single document does not sum to 1.
        """
        secondFactor = (self.ns_dkl + self.alpha) / \
                       np.expand_dims(self.ns_dl + self.numTopics * self.alpha, axis=1)
        secondFactor /= secondFactor.sum()
        return secondFactor

    def calculatePi(self):
        """Return the smoothed per-document sentiment table of shape (D, L).

        NOTE(review): the result is divided by the sum over the whole array;
        this rescales every row by the same constant, so per-row argmax is
        unaffected but rows do not sum to 1.
        """
        thirdFactor = (self.ns_dl + self.gamma) / \
                      np.expand_dims(self.ns_d + self.numSentiments * self.gamma, axis=1)
        thirdFactor /= thirdFactor.sum()
        return thirdFactor


    def getTopKWordsByLikelihood(self, K):
        """
        Print the top K discriminative words for each topic t and sentiment s,
        i.e. words v for which p(t, s | v) is maximum.
        """
        pseudocounts = np.copy(self.n_wkl)
        # Normalise each word's counts over all (topic, sentiment) pairs.
        normalizer = np.sum(pseudocounts, (1, 2))
        pseudocounts /= normalizer[:, np.newaxis, np.newaxis]
        for t in range(self.numTopics):
            for s in range(self.numSentiments):
                topWordIndices = pseudocounts[:, t, s].argsort()[-1:-(K + 1):-1]
                # vocab = self.vectorizer.get_feature_names()
                print(t, s, [self.idx2word[i] for i in topWordIndices])

    def getTopKWordsByTS(self, K):
        """Return a DataFrame with the top ``K`` words per (topic, sentiment) pair.

        Columns are named ``topic_XX_p`` for sentiment index 0 and
        ``topic_XX_n`` for any other sentiment index.

        :param K: number of words to list per column.
        """
        topic_sentiment_arr = self.calculatePhi()
        dic = {}
        for t in range(self.numTopics):
            for s in range(self.numSentiments):
                # Honour K; the original always took the first 10 regardless of K.
                index_list = np.argsort(-topic_sentiment_arr[:, t, s])[:K]
                name = "p" if s == 0 else "n"
                dic['topic_' + '{:02d}'.format(t + 1) + '_' + name] = [self.idx2word[index] for index in index_list]
        return pd.DataFrame(dic)

    def getTopKWordsByTopic(self, K):
        """Return a DataFrame with the top ``K`` words per topic (phi summed over sentiments)."""
        dic = {}
        phi = self.calculatePhi()
        topic_arr = np.sum(phi, (2))
        for t in range(self.numTopics):
            index_list = np.argsort(-topic_arr[:, t])[:K]
            dic["Topic"+str(t+1)] = [self.idx2word[index] for index in index_list]
        return pd.DataFrame(dic)

    def getTopicSentimentDist(self, d):
        """Return the (K, L) slice of ``calculateTheta()`` for document ``d``."""
        theta = self.calculateTheta()[d]
        return theta

    def getDocSentimentDist(self, d):
        """Return the length-L row of ``calculatePi()`` for document ``d``."""
        pi = self.calculatePi()[d]
        return pi

    def getTopWordsBySenti(self, K):
        """Return a DataFrame with the top ``K`` words per sentiment (phi summed over topics)."""
        dic = {}
        phi = self.calculatePhi()
        senti_arr = np.sum(phi, (1))
        for s in range(self.numSentiments):
            index_list = np.argsort(-senti_arr[:, s])[:K]
            name = "p" if s == 0 else "n"
            dic["Sentiment_"+ name] = [self.idx2word[index] for index in index_list]
        return pd.DataFrame(dic)

    def classify_senti(self):
        """Return the fraction of evaluated documents whose predicted sentiment matches the label.

        A document is evaluated when its index is in
        ``self.pos_neg_sentence_indices``; its prediction is the argmax of its
        row of ``calculatePi()``.  Predictions are compared element-wise with
        ``self.pos_neg_sentiment_label``.
        """
        # Compute the table once instead of once per document; counts do not
        # change inside this method, so every row is identical to what
        # getDocSentimentDist(i) would return.
        pi = self.calculatePi()
        doc_sent_inference = []
        for i in range(self.numDocs):
            if i in self.pos_neg_sentence_indices:
                doc_sent_inference.append(np.argmax(pi[i]))
        infer_arr = np.array(doc_sent_inference)
        answer = np.array(self.pos_neg_sentiment_label)
        return np.mean(infer_arr == answer)

    def runASUM(self, reviews, maxIters=10):
        """Run ``maxIters`` sweeps: update topic vectors, then resample every sentence.

        :param reviews: not used by this method; kept for interface compatibility.
        :param maxIters: number of sweeps over the corpus.
        """
        for iteration in range(maxIters):
            self.updateTopicVectors()
            # Report classification accuracy on every second iteration.
            if (iteration + 1) % 2 == 0:
                print("Starting iteration %d of %d" % (iteration + 1, maxIters))
                print(self.classify_senti())

            for d in range(self.numDocs):
                for m in range(self.numSentence[d]):
                    self.sampling(d, m)


In [4]:
# load data
# NOTE(review): work_path is an absolute path on one machine; the notebook
# will not run elsewhere without editing it.
work_path = "/media/hs-ubuntu/data/dataset/MasterThesis/STMD_data/"
data = pd.read_csv(work_path + "preprocess_complete_Electronics.csv")

# Keep only the rows whose `brand` column is one of the listed brands.
brand = ['Apple', 'Samsung','Canon']
brand_df = data[data.brand.isin(brand)]

In [6]:
# Half positive, half negative: 3500 reviews rated >= 4 and 3500 rated <= 2.
pos_reviews = brand_df.loc[brand_df["overall"] >= 4]
neg_reviews = brand_df.loc[brand_df["overall"] <= 2]
pos_sample = pos_reviews.sample(n=3500, random_state=23)
neg_sample = neg_reviews.sample(n=3500, random_state=42)
df = pd.concat([pos_sample, neg_sample], axis=0)
# The `preprocessed` column holds Python literals stored as text; parse them back.
df['preprocessed'] = df['preprocessed'].apply(literal_eval)
df = df.reset_index(drop=True)

In [217]:
# brand_df = data[data['brand'] == brand]
# brand_df.reset_index(drop=True, inplace=True)
# brand_df['preprocessed'] = brand_df.preprocessed.apply(lambda row: literal_eval(row))
# tagged_text_list = list(brand_df['reviewSentence_tagged'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [7]:
# embedding
# NOTE(review): the star import hides which names come from the project-local
# `preprocess` module; this cell uses `prepare` and `bigram_and_sentence`.
from preprocess import *

# prepare(df) returns six values; their exact contents are defined in
# preprocess.py (not shown here).
sentence_list, sentiment_label, sentence_senti_label, \
pos_neg_sentence_indices, pos_neg_sentiment_label, numSentence = prepare(df)

# `documents` is what Doc2Vec is trained on below; `sentence_list_again` is
# the nested review/sentence/word structure used for the frequency count.
documents, sentence_list_again, bigram\
= bigram_and_sentence(sentence_senti_label, sentence_list, numSentence, max_vocab=5000, threshold = 5, min_count = 20)

In [14]:
# Word frequency distribution over every word of every sentence of every review.
corpus = [word for review in sentence_list_again for sent in review for word in sent]
text = nltk.Text(corpus)
freq = nltk.FreqDist(text)
# keywords = [tup[0] for tup in freq.most_common(3000)] 

In [15]:
# Number of distinct tokens in the frequency distribution.
len(freq)

5000

In [16]:
# Train one Doc2Vec model per (window, size) combination; with the single
# values below exactly one model is trained and left in `model`.
window = [2]
size = [100]
passes = 5
for w in window:
    for s in size:
        # alpha == min_alpha, so the learning rate is constant within one
        # train() call; it is lowered by hand after each pass below.
        model = Doc2Vec(dm=1, 
                        dm_mean=1, 
                        min_count=0, sample=1e-5,
                        window=w, size=s, 
                        workers=multiprocessing.cpu_count(), 
                        alpha=0.025, min_alpha=0.025)
        model.build_vocab(documents)

        for epoch in tqdm(range(passes)):
            # Shuffles `documents` in place with the unseeded global RNG, so
            # the order (and the result) differs from run to run.
            random.shuffle(documents)
            model.train(documents)
            model.alpha -= 0.002  # decrease the learning rate
            model.min_alpha = model.alpha  # fix the learning rate, no decay
#             if (epoch + 1) % 5 ==0:
#                 model.save(work_path + 'model_' + str(w) + '_' + str(s) + '_' + str(epoch+1))

100%|██████████| 5/5 [02:41<00:00, 32.26s/it]


In [17]:
# Number of words in the trained model's vocabulary.
len(model.index2word)

5000

In [18]:
# model = Doc2Vec.load(work_path + 'model_3_100_10')

In [19]:
# The 15 words whose vectors are most similar to the document vector tagged 'negative'.
model.most_similar([model.docvecs['negative']], topn=15)

[('amazon', 0.7968705296516418),
 ('agent', 0.7953909635543823),
 ('return', 0.7908092737197876),
 ('two_week', 0.7863162755966187),
 ('custom_servic', 0.7842612266540527),
 ('ago', 0.777485728263855),
 ('exchang', 0.7764937281608582),
 ('factori', 0.7723338007926941),
 ('within_day', 0.7692259550094604),
 ('month', 0.7646186947822571),
 ('telephon', 0.7642788887023926),
 ('send', 0.7582880258560181),
 ('told', 0.7575987577438354),
 ('dealer', 0.7546489238739014),
 ('receipt', 0.7521483898162842)]

In [20]:
# The 15 words whose vectors are most similar to the document vector tagged 'positive'.
model.most_similar([model.docvecs['positive']], topn=15)

[('contest', 0.7410257458686829),
 ('shooter', 0.7270699739456177),
 ('creativ', 0.7251889705657959),
 ('defeat', 0.7168301343917847),
 ('bang_buck', 0.7161418199539185),
 ('con', 0.7043457627296448),
 ('easi_use', 0.6940544247627258),
 ('goal', 0.6927509307861328),
 ('size_weight', 0.6918964982032776),
 ('demand', 0.688834547996521),
 ('ultra', 0.6885380148887634),
 ('capabl', 0.6874716877937317),
 ('vivid', 0.6841638088226318),
 ('carri_around', 0.6841549873352051),
 ('hd_video', 0.683285117149353)]

In [21]:
# Copy the model's word embeddings into a (vocabulary size x vector size)
# matrix; row i holds the vector of model.index2word[i].
wordVectors = np.zeros((len(model.index2word), model.vector_size))
for index, word in enumerate(model.index2word):
    wordVectors[index,:] = model[word]

In [22]:
# Two sentiment embeddings taken from the document vectors tagged
# 'positive' (row 0) and 'negative' (row 1).
sentimentVector = np.zeros((2, model.vector_size))
sentimentVector[0,:] = model.docvecs['positive']
sentimentVector[1,:] = model.docvecs['negative']

In [23]:
import numpy as np
import nltk
import pandas as pd
from ast import literal_eval
from collections import Counter
from scipy.optimize import fmin_l_bfgs_b
import optimizeTopicVectors as ot
from preprocess import *

def sampleFromDirichlet(alpha):
    """Draw one vector from a Dirichlet distribution.

    :param alpha: concentration parameters, handed unchanged to
        ``np.random.dirichlet``.
    :return: the sample produced by ``np.random.dirichlet``.
    """
    draw = np.random.dirichlet(alpha)
    return draw


def sampleFromCategorical(theta):
    """Draw a single category index with the probabilities given in ``theta``.

    One multinomial trial is run with ``theta`` as the outcome probabilities
    and the position of the outcome that occurred is returned.  ``theta`` is
    used exactly as passed in; it is not re-normalised here.

    :param theta: 1-D sequence of outcome probabilities.
    :return: index of the sampled outcome.
    """
    one_hot = np.random.multinomial(1, theta)
    return np.argmax(one_hot)


def word_indices(doc_sent_word_dict, sent_index):
    """Lazily iterate over the entries of one sentence.

    :param doc_sent_word_dict: indexable container of sentences; each
        sentence is an iterable of word indices.
    :param sent_index: key/position of the sentence to walk through.
    :return: generator yielding the sentence's items in order.
    """
    yield from doc_sent_word_dict[sent_index]


class ASUM_Gibbs_Sampler:
    """Gibbs sampler that gives every sentence one (topic, sentiment) pair.

    Usage as far as this class shows: construct it, call ``_initialize_``
    with the tokenised reviews to build the vocabulary and the count tables,
    then call ``runASUM`` to iterate.  The ``calculate*`` / ``getTop*``
    methods read the count tables afterwards.
    """

    def __init__(self, wordVectors, sentimentVector, numTopics, alpha, beta, gamma, binary=0.5, max_sentence=50, numSentiments=2):
        """Store embeddings and hyper-parameters; no counting happens here.

        :param wordVectors: 2-D array of word embeddings (V x H).
        :param sentimentVector: 2-D array of sentiment embeddings (L x H).
        :param numTopics: number of topics K.
        :param alpha: smoothing constant (see ``sampling`` / ``calculateTheta``).
        :param beta: smoothing constant for the word counts.
        :param gamma: smoothing constant (see ``sampling`` / ``calculatePi``).
        :param binary: stored as ``self.binary``; not read by any method of this class.
        :param max_sentence: stored as ``self.maxSentence``; not read by any method of this class.
        :param numSentiments: number of sentiment labels L.
        """
        self.wordVectors = wordVectors # (V x H)
        self.numTopics = numTopics
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.numSentiments = numSentiments
        #self.MAX_VOCAB_SIZE = max_vocab_size
        self.maxSentence = max_sentence
        self.dimension = self.wordVectors.shape[1]  # H
        self.binary = binary
        self.sentimentVector = sentimentVector # (L x H)

    def build_dataset(self, reviews, sentiment_list):
        """Index the corpus and collect per-document sentence structures.

        :param reviews: review data, nested as
            [[[doc1 sentence1 words], [doc1 sentence2 words]], [[doc2 sentence1 words], ...], ...]
        :param sentiment_list: per-document sentiment values, indexed by document position.
        :return: tuple ``(word2idx, idx2word, doc_sent_word_dict,
            wordCountSentence, numSentence, docSentiment)``.
        """
        corpus = [word for review in reviews for sentence in review for word in sentence]
        text = nltk.Text(corpus)
        freq = nltk.FreqDist(text)
        #keywords = [tup[0] for tup in freq.most_common(self.MAX_VOCAB_SIZE)]  # pick the most frequent words
        # Keep at most as many words as there are rows in wordVectors.
        # NOTE(review): indices here follow corpus frequency rank, while rows of
        # wordVectors follow whatever order the caller built them in -- confirm
        # that the two orderings agree.
        keywords = [tup[0] for tup in freq.most_common(self.wordVectors.shape[0])]
        word2idx = {}  # key: word, value: index
        for index, key in enumerate(keywords):
            word2idx[key] = index

        idx2word = dict(zip(word2idx.values(), word2idx.keys()))  # key: index, value: word
        doc_sent_word_dict = {}  # key: document index, value: [[word indices of sent1], [word indices of sent2], ...]
        numSentence = {}  # key: document index, value: number of sentences in that document
        wordCountSentence = {}  # key: document index, value: one Counter of word indices per sentence
        docSentiment = {}  # key: document index, value: sentiment_list entry for that document
        for index, review in enumerate(reviews):
            doc_sent_lst = []
            doc_sent_count = []
            for sent in review:
                # Words outside the vocabulary are silently dropped.
                sent_word_ids = [word2idx[word] for word in sent if word in word2idx]
                doc_sent_lst.append(sent_word_ids)
                doc_sent_count.append(Counter(sent_word_ids))
            numSentence[index] = len(doc_sent_lst)
            doc_sent_word_dict[index] = doc_sent_lst
            wordCountSentence[index] = doc_sent_count
            docSentiment[index] = sentiment_list[index]

        return word2idx, idx2word, doc_sent_word_dict, wordCountSentence, numSentence, docSentiment

    def _initialize_(self, reviews, pos_neg_sentence_indices, pos_neg_sentiment_label, sentiment_list):
        """Build the dataset, allocate the count tables and make initial assignments.

        Each sentence gets a random topic (drawn from a per-document Dirichlet
        sample) and the sentiment whose vector has the largest summed dot
        product with the sentence's word vectors.

        :param reviews: nested review data, see ``build_dataset``.
        :param pos_neg_sentence_indices: document indices evaluated by ``classify_senti``.
        :param pos_neg_sentiment_label: reference labels compared against in ``classify_senti``.
        :param sentiment_list: per-document sentiment values, see ``build_dataset``.
        """
        self.word2idx, self.idx2word, self.doc_sent_word_dict, self.wordCountSentence, \
        self.numSentence, self.docSentiment = self.build_dataset(reviews, sentiment_list)
        self.numDocs = len(self.doc_sent_word_dict.keys())
        self.vocabSize = len(self.word2idx.keys())
        self.pos_neg_sentence_indices = pos_neg_sentence_indices
        self.pos_neg_sentiment_label = pos_neg_sentiment_label
        self.topicVectors = ot.orthogonal_matrix((self.numTopics, self.dimension))

        # Pseudocounts
        self.n_wkl = np.zeros((self.vocabSize, self.numTopics, self.numSentiments))  # times word w is assigned to topic k, sentiment l
        self.n_kl = np.zeros((self.numTopics, self.numSentiments))  # number of words assigned to topic k, sentiment l
        self.ns_d = np.zeros((self.numDocs))  # number of sentences in document d
        self.ns_dkl = np.zeros((self.numDocs, self.numTopics, self.numSentiments))  # sentences of document d assigned to topic k, sentiment l
        self.ns_dl = np.zeros((self.numDocs, self.numSentiments))  # sentences of document d assigned to sentiment l
        self.topics = {}
        self.sentiments = {}

        alphaVec = self.alpha * np.ones(self.numTopics)
        gammaVec = self.gamma * np.ones(self.numSentiments)

        for d in range(self.numDocs):
            topicDistribution = sampleFromDirichlet(alphaVec)
            # sentimentDistribution is only used by the commented-out line below;
            # the draws are kept so the random number stream is unchanged.
            sentimentDistribution = np.zeros((self.numTopics, self.numSentiments))

            for t in range(self.numTopics):
                sentimentDistribution[t, :] = sampleFromDirichlet(gammaVec)

            for m in range(self.numSentence[d]):
                t = sampleFromCategorical(topicDistribution)
                # s = sampleFromCategorical(sentimentDistribution[t, :])
                # s = self.docSentiment[d]
                # Score each sentiment by the sum of dot products with the sentence's word vectors.
                pos_score = np.dot(self.sentimentVector,
                                   self.wordVectors[self.doc_sent_word_dict[d][m]].T).sum(axis=1)
                s = np.argmax(pos_score)
                self.topics[(d, m)] = t  # topic of sentence m in document d
                self.sentiments[(d, m)] = s  # sentiment of sentence m in document d
                self.ns_d[d] += 1
                self.ns_dkl[d, t, s] += 1
                self.ns_dl[d, s] += 1
                for w in word_indices(self.doc_sent_word_dict[d], m):  # every word of sentence m in document d
                    self.n_wkl[w, t, s] += 1
                    self.n_kl[t, s] += 1

    def updateTopicVectors(self, lamda = 0.01):
        """Re-optimise each topic vector with L-BFGS-B, starting from its current value.

        :param lamda: passed as the last extra argument to ``ot.loss`` / ``ot.grad``
            (presumably a regularisation weight -- confirm in optimizeTopicVectors).
        """
        t = self.topicVectors # (K, H)
        for i in range(self.numTopics):
            x0 = t[i, :]
            # Use the embeddings stored on the instance; the original read a
            # module-level ``wordVectors`` that only exists as notebook state.
            x, f, d = fmin_l_bfgs_b(ot.loss, x0, fprime=ot.grad,
                                    args=(self.n_wkl, self.wordVectors, lamda), maxiter=15000)
            t[i, :] = x
        self.topicVectors = t


    def sampling(self, d, m):
        """Resample the (topic, sentiment) assignment of sentence ``m`` in document ``d``.

        The sentence's current assignment is removed from every counter, an
        unnormalised (numTopics x numSentiments) table is built from three
        factors, a new pair is drawn from it, and the sentence is added back
        under the new assignment.

        NOTE(review): ``secondFactor`` smooths the per-sentiment counts with
        ``alpha``/``numTopics`` and ``thirdFactor`` smooths the per-(topic,
        sentiment) counts with ``gamma``/``numSentiments``, whereas
        ``calculatePi`` / ``calculateTheta`` use the opposite pairing; likewise
        ``beta0`` adds ``beta`` once while ``calculatePhi`` adds
        ``vocabSize * beta``.  Behaviour is left as-is; confirm which is intended.
        """
        t = self.topics[(d, m)]
        s = self.sentiments[(d, m)]
        # Take the sentence out of all counts.
        self.ns_d[d] -= 1
        self.ns_dkl[d, t, s] -= 1
        self.ns_dl[d, s] -= 1
        for w in word_indices(self.doc_sent_word_dict[d], m):
            self.n_wkl[w, t, s] -= 1
            self.n_kl[t, s] -= 1

        firstFactor = np.ones((self.numTopics, self.numSentiments))

        word_count = self.wordCountSentence[d][m]
        for k in range(self.numTopics):
            for l in range(self.numSentiments):
                beta0 = self.n_kl[k][l] + self.beta
                m0 = 0  # words of this sentence already accounted for
                for word in word_count.keys():
                    betaw = self.n_wkl[word, k, l] + self.beta
                    cnt = word_count[word]
                    for i in range(cnt):
                        # i counts repeats of the same word, m0 counts all words so far.
                        firstFactor[k][l] *= (betaw + i) / (beta0 + m0)
                        m0 += 1

        # topic_similarity = ot.softmax(np.dot(self.topicVectors,
        #                                      self.wordVectors[
        #                                          self.doc_sent_word_dict[d][m]].T))  # ( K x num words in sentence)
        # senti_similarity = ot.softmax(np.dot(self.sentimentVector,
        #                                      self.wordVectors[
        #                                          self.doc_sent_word_dict[d][m]].T))  # ( L x num words in sentence)
        # vector_similarity = ot.softmax(np.dot(topic_similarity, senti_similarity.T))
        #
        # firstFactor = firstFactor *  vector_similarity # dim(K x L)

        secondFactor = (self.ns_dl[d, :] + self.alpha) / \
                       (self.ns_d[d] + self.numTopics * self.alpha)  # one value per sentiment

        thirdFactor = (self.ns_dkl[d, :, :] + self.gamma) / \
                      (self.ns_dl[d] + self.numSentiments * self.gamma)[np.newaxis, :] #(K, L)

        prob = np.ones((self.numTopics, self.numSentiments))
        prob *= firstFactor * thirdFactor
        prob *= secondFactor[np.newaxis,:]
        prob /= np.sum(prob)

        # Draw from the flattened table, then recover the (topic, sentiment) pair.
        ind = sampleFromCategorical(prob.flatten())
        t, s = np.unravel_index(ind, prob.shape)

        self.topics[(d, m)] = t
        self.sentiments[(d, m)] = s
        self.ns_d[d] += 1
        self.ns_dkl[d, t, s] += 1
        self.ns_dl[d, s] += 1
        for w in word_indices(self.doc_sent_word_dict[d], m):
            self.n_wkl[w, t, s] += 1
            self.n_kl[t, s] += 1

    def calculatePhi(self):
        """Return the smoothed word table of shape (V, K, L).

        Entry ``[w, k, l]`` is ``(n_wkl + beta) / (n_kl + V * beta)``.
        """
        firstFactor = (self.n_wkl + self.beta) / \
                      np.expand_dims(self.n_kl + self.n_wkl.shape[0] * self.beta, axis=0)

        # topic_similarity = ot.softmax(np.dot(self.topicVectors,
        #                                      self.wordVectors.T))  # ( K x V)
        # senti_similarity = ot.softmax(np.dot(self.sentimentVector,
        #                                      self.wordVectors.T))  # ( L x V)
        # vector_similarity = ot.softmax(np.dot(topic_similarity, senti_similarity.T)) # K x L
        #
        # firstFactor = firstFactor * np.expand_dims(vector_similarity, axis=0)
        # firstFactor /= firstFactor.sum()
        return firstFactor

    def calculateTheta(self):
        """Return the smoothed per-document (topic, sentiment) table of shape (D, K, L).

        NOTE(review): the result is divided by the sum over the whole array,
        so the slice for a single document does not sum to 1.
        """
        secondFactor = (self.ns_dkl + self.alpha) / \
                       np.expand_dims(self.ns_dl + self.numTopics * self.alpha, axis=1)
        secondFactor /= secondFactor.sum()
        return secondFactor

    def calculatePi(self):
        """Return the smoothed per-document sentiment table of shape (D, L).

        NOTE(review): the result is divided by the sum over the whole array;
        this rescales every row by the same constant, so per-row argmax is
        unaffected but rows do not sum to 1.
        """
        thirdFactor = (self.ns_dl + self.gamma) / \
                      np.expand_dims(self.ns_d + self.numSentiments * self.gamma, axis=1)
        thirdFactor /= thirdFactor.sum()
        return thirdFactor


    def getTopKWordsByLikelihood(self, K):
        """
        Print the top K discriminative words for each topic t and sentiment s,
        i.e. words v for which p(t, s | v) is maximum.
        """
        pseudocounts = np.copy(self.n_wkl)
        # Normalise each word's counts over all (topic, sentiment) pairs.
        normalizer = np.sum(pseudocounts, (1, 2))
        pseudocounts /= normalizer[:, np.newaxis, np.newaxis]
        for t in range(self.numTopics):
            for s in range(self.numSentiments):
                topWordIndices = pseudocounts[:, t, s].argsort()[-1:-(K + 1):-1]
                # vocab = self.vectorizer.get_feature_names()
                print(t, s, [self.idx2word[i] for i in topWordIndices])

    def getTopKWordsByTS(self, K):
        """Return a DataFrame with the top ``K`` words per (topic, sentiment) pair.

        Columns are named ``topic_XX_p`` for sentiment index 0 and
        ``topic_XX_n`` for any other sentiment index.

        :param K: number of words to list per column.
        """
        topic_sentiment_arr = self.calculatePhi()
        dic = {}
        for t in range(self.numTopics):
            for s in range(self.numSentiments):
                # Honour K; the original always took the first 10 regardless of K.
                index_list = np.argsort(-topic_sentiment_arr[:, t, s])[:K]
                name = "p" if s == 0 else "n"
                dic['topic_' + '{:02d}'.format(t + 1) + '_' + name] = [self.idx2word[index] for index in index_list]
        return pd.DataFrame(dic)

    def getTopKWordsByTopic(self, K):
        """Return a DataFrame with the top ``K`` words per topic (phi summed over sentiments)."""
        dic = {}
        phi = self.calculatePhi()
        topic_arr = np.sum(phi, (2))
        for t in range(self.numTopics):
            index_list = np.argsort(-topic_arr[:, t])[:K]
            dic["Topic"+str(t+1)] = [self.idx2word[index] for index in index_list]
        return pd.DataFrame(dic)

    def getTopicSentimentDist(self, d):
        """Return the (K, L) slice of ``calculateTheta()`` for document ``d``."""
        theta = self.calculateTheta()[d]
        return theta

    def getDocSentimentDist(self, d):
        """Return the length-L row of ``calculatePi()`` for document ``d``."""
        pi = self.calculatePi()[d]
        return pi

    def getTopWordsBySenti(self, K):
        """Return a DataFrame with the top ``K`` words per sentiment (phi summed over topics)."""
        dic = {}
        phi = self.calculatePhi()
        senti_arr = np.sum(phi, (1))
        for s in range(self.numSentiments):
            index_list = np.argsort(-senti_arr[:, s])[:K]
            name = "p" if s == 0 else "n"
            dic["Sentiment_"+ name] = [self.idx2word[index] for index in index_list]
        return pd.DataFrame(dic)

    def classify_senti(self):
        """Return the fraction of evaluated documents whose predicted sentiment matches the label.

        A document is evaluated when its index is in
        ``self.pos_neg_sentence_indices``; its prediction is the argmax of its
        row of ``calculatePi()``.  Predictions are compared element-wise with
        ``self.pos_neg_sentiment_label``.
        """
        # Compute the table once instead of once per document; counts do not
        # change inside this method, so every row is identical to what
        # getDocSentimentDist(i) would return.
        pi = self.calculatePi()
        doc_sent_inference = []
        for i in range(self.numDocs):
            if i in self.pos_neg_sentence_indices:
                doc_sent_inference.append(np.argmax(pi[i]))
        infer_arr = np.array(doc_sent_inference)
        answer = np.array(self.pos_neg_sentiment_label)
        return np.mean(infer_arr == answer)

    def runASUM(self, reviews, maxIters=10):
        """Run ``maxIters`` sweeps: update topic vectors, then resample every sentence.

        :param reviews: not used by this method; kept for interface compatibility.
        :param maxIters: number of sweeps over the corpus.
        """
        for iteration in range(maxIters):
            self.updateTopicVectors()
            # Report classification accuracy on every second iteration.
            if (iteration + 1) % 2 == 0:
                print("Starting iteration %d of %d" % (iteration + 1, maxIters))
                print(self.classify_senti())

            for d in range(self.numDocs):
                for m in range(self.numSentence[d]):
                    self.sampling(d, m)

In [24]:
class STMD_Gibbs_Sampler:
    """Sentence-level Gibbs sampler over joint (topic, sentiment) assignments.

    Every sentence of every document carries one topic and one sentiment.
    The sampling weight of a (topic, sentiment) pair combines count-based
    factors with a similarity term built from word vectors, topic vectors and
    sentiment vectors (see ``sampling``).

    Typical use: construct, call ``_initialize_`` with the corpus, then call
    ``runSTMD``.
    """

    def __init__(self, wordVectors, sentimentVector, numTopics, alpha, beta, gamma, binary=0.5, max_sentence=50, numSentiments=2):
        """Store hyper-parameters and the embedding matrices.

        :param wordVectors: word embedding matrix, (V x H)
        :param sentimentVector: sentiment embedding matrix, (L x H)
        :param numTopics: number of topics K
        :param alpha: smoothing constant used in the count factors
        :param beta: smoothing constant for the word counts
        :param gamma: smoothing constant used in the count factors
        :param binary: stored as ``self.binary``; not read inside this class
        :param max_sentence: stored as ``self.maxSentence``; not read inside this class
        :param numSentiments: number of sentiments L
        """
        self.wordVectors = wordVectors # (V x H)
        self.numTopics = numTopics
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.numSentiments = numSentiments
        #self.MAX_VOCAB_SIZE = max_vocab_size
        self.maxSentence = max_sentence
        self.dimension = self.wordVectors.shape[1]  # H
        self.binary = binary
        self.sentimentVector = sentimentVector # (L x H)

    def build_dataset(self, reviews, sentiment_list):
        """Index the corpus and collect per-document sentence structures.

        The vocabulary is the ``wordVectors.shape[0]`` most frequent words of
        the corpus; words outside it are dropped from the sentences.

        :param reviews: review data, [[[doc1 sent1 words], [doc1 sent2 words]],
                        [[doc2 sent1 words], [doc2 sent2 words]], ...]
        :param sentiment_list: per-document sentiment values, indexed like ``reviews``
        :return: tuple (word2idx, idx2word, doc_sent_word_dict,
                 wordCountSentence, numSentence, docSentiment)
        """
        corpus = [word for review in reviews for sentence in review for word in sentence]
        text = nltk.Text(corpus)
        freq = nltk.FreqDist(text)
        #keywords = [tup[0] for tup in freq.most_common(self.MAX_VOCAB_SIZE)]  # pick the most frequent words
        keywords = [tup[0] for tup in freq.most_common(self.wordVectors.shape[0])]  # pick the most frequent words
        word2idx = {}  # key: word, value: index
        for index, key in enumerate(keywords):
            word2idx[key] = index

        idx2word = dict(zip(word2idx.values(), word2idx.keys()))  # key: index, value: word
        doc_sent_word_dict = {}  # key: doc index, value: [[word indices of sent1], [word indices of sent2], ...]
        numSentence = {}  # key: doc index, value: number of sentences in that doc
        wordCountSentence = {}  # key: doc index, value: per-sentence word Counter for that doc
        docSentiment = {}  # key: doc index, value: sentiment_list entry for that doc
        for index, review in enumerate(reviews):
            doc_sent_lst = []
            doc_sent_count = []
            for sent in review:
                # Named differently from the module-level word_indices()
                # generator so the local list does not shadow it.
                sent_word_ids = [word2idx[word] for word in sent if word in word2idx]
                doc_sent_lst.append(sent_word_ids)
                doc_sent_count.append(Counter(sent_word_ids))
            numSentence[index] = len(doc_sent_lst)
            doc_sent_word_dict[index] = doc_sent_lst
            wordCountSentence[index] = doc_sent_count
            docSentiment[index] = sentiment_list[index]

        return word2idx, idx2word, doc_sent_word_dict, wordCountSentence, numSentence, docSentiment

    def _initialize_(self, reviews, pos_neg_sentence_indices, pos_neg_sentiment_label, sentiment_list):
        """Build the dataset, allocate the count arrays and draw initial assignments.

        Each sentence gets a topic sampled from a per-document Dirichlet draw
        and a sentiment chosen as the argmax of the summed dot products between
        the sentiment vectors and the sentence's word vectors.

        :param reviews: corpus, forwarded to ``build_dataset``
        :param pos_neg_sentence_indices: document indices that have a gold
               label (used by ``classify_senti``)
        :param pos_neg_sentiment_label: gold labels compared against the
               predictions in ``classify_senti``
        :param sentiment_list: per-document sentiment values, forwarded to
               ``build_dataset``
        """
        self.word2idx, self.idx2word, self.doc_sent_word_dict, self.wordCountSentence, \
        self.numSentence, self.docSentiment = self.build_dataset(reviews, sentiment_list)
        self.numDocs = len(self.doc_sent_word_dict.keys())
        self.vocabSize = len(self.word2idx.keys())
        self.pos_neg_sentence_indices = pos_neg_sentence_indices
        self.pos_neg_sentiment_label = pos_neg_sentiment_label
        self.topicVectors = ot.orthogonal_matrix((self.numTopics, self.dimension))

        # Pseudocounts
        self.n_wkl = np.zeros((self.vocabSize, self.numTopics, self.numSentiments))  # times word w is assigned topic k, sentiment l
        self.n_kl = np.zeros((self.numTopics, self.numSentiments))  # words assigned topic k, sentiment l
        self.ns_d = np.zeros((self.numDocs))  # sentences in document d
        self.ns_dkl = np.zeros((self.numDocs, self.numTopics, self.numSentiments))  # sentences of document d assigned topic k, sentiment l
        self.ns_dl = np.zeros((self.numDocs, self.numSentiments))  # sentences of document d assigned sentiment l
        self.topics = {}
        self.sentiments = {}

        alphaVec = self.alpha * np.ones(self.numTopics)
        gammaVec = self.gamma * np.ones(self.numSentiments)

        for d in range(self.numDocs):
            topicDistribution = sampleFromDirichlet(alphaVec)
            sentimentDistribution = np.zeros((self.numTopics, self.numSentiments))

            # These draws are not used for the initial sentiment below (that
            # path is commented out); they are kept so the sequence of random
            # numbers consumed during initialization stays unchanged.
            for t in range(self.numTopics):
                sentimentDistribution[t, :] = sampleFromDirichlet(gammaVec)

            for m in range(self.numSentence[d]):
                t = sampleFromCategorical(topicDistribution)
                # s = sampleFromCategorical(sentimentDistribution[t, :])
                # s = self.docSentiment[d]
                pos_score = np.dot(self.sentimentVector,
                                   self.wordVectors[self.doc_sent_word_dict[d][m]].T).sum(axis=1)
                s = np.argmax(pos_score)
                self.topics[(d, m)] = t  # topic of sentence m in document d
                self.sentiments[(d, m)] = s  # sentiment of sentence m in document d
                self.ns_d[d] += 1
                self.ns_dkl[d, t, s] += 1
                self.ns_dl[d, s] += 1
                for w in word_indices(self.doc_sent_word_dict[d], m):  # walk the words of sentence m in document d
                    self.n_wkl[w, t, s] += 1  # word w assigned topic t, sentiment s
                    self.n_kl[t, s] += 1  # words assigned topic t, sentiment s

    def updateTopicVectors(self, lamda = 0.01):
        """Re-fit every topic vector with L-BFGS-B on ``ot.loss`` / ``ot.grad``.

        Each row of ``self.topicVectors`` is used as the starting point of its
        own optimization and is overwritten with the optimizer's result. The
        extra arguments handed to the objective are the same for every topic:
        ``(self.n_wkl, self.wordVectors, lamda)``.

        The word vectors are taken from ``self.wordVectors``. Reading a
        notebook-level ``wordVectors`` variable instead would fail, or silently
        use a different matrix, whenever that global is missing or differs from
        the matrix given to the constructor.

        :param lamda: passed through to ``ot.loss`` / ``ot.grad``
                      (presumably a regularization weight — TODO confirm)
        """
        t = self.topicVectors # (K, H)
        for i in range(self.numTopics):
            x0 = t[i, :]
            x, f, d = fmin_l_bfgs_b(ot.loss, x0, fprime=ot.grad,
                                    args=(self.n_wkl, self.wordVectors, lamda), maxiter=15000)
            t[i, :] = x
        self.topicVectors = t


    def sampling(self, d, m):
        """Resample the (topic, sentiment) assignment of sentence ``m`` in document ``d``.

        The sentence's current assignment is removed from all counts, a weight
        is computed for every (topic, sentiment) pair, one pair is drawn from
        the normalized weights, and the counts are updated with the new pair.

        :param d: document index
        :param m: sentence index inside document ``d``
        """
        t = self.topics[(d, m)]
        s = self.sentiments[(d, m)]
        # Take the sentence out of every count before scoring it.
        self.ns_d[d] -= 1
        self.ns_dkl[d, t, s] -= 1
        self.ns_dl[d, s] -= 1
        for w in word_indices(self.doc_sent_word_dict[d], m):
            self.n_wkl[w, t, s] -= 1  # word w assigned topic t, sentiment s
            self.n_kl[t, s] -= 1  # words assigned topic t, sentiment s

        firstFactor = np.ones((self.numTopics, self.numSentiments))

        # Word-count factor: for each word occurrence in the sentence multiply
        # by (n_wkl + beta + i) / (n_kl + beta + m0), where i counts earlier
        # occurrences of the same word and m0 counts all earlier occurrences.
        word_count = self.wordCountSentence[d][m]
        for t in range(self.numTopics):
            for s in range(self.numSentiments):
                beta0 = self.n_kl[t][s] + self.beta
                m0 = 0
                for word in word_count.keys():
                    betaw = self.n_wkl[word, t, s] + self.beta
                    cnt = word_count[word]
                    for i in range(cnt):
                        firstFactor[t][s] *= (betaw + i) / (beta0 + m0)
                        m0 += 1

        # Embedding-based similarity between topics and sentiments, computed
        # over the words of this sentence only.
        topic_similarity = ot.softmax(np.dot(self.topicVectors,
                                             self.wordVectors[
                                                 self.doc_sent_word_dict[d][m]].T))  # ( K x num words in sentence)
        senti_similarity = ot.softmax(np.dot(self.sentimentVector,
                                             self.wordVectors[
                                                 self.doc_sent_word_dict[d][m]].T))  # ( L x num words in sentence)
        vector_similarity = ot.softmax(np.dot(topic_similarity, senti_similarity.T))

        firstFactor = firstFactor *  vector_similarity # dim(K x L)

        secondFactor = (self.ns_dl[d, :] + self.alpha) / \
                       (self.ns_d[d] + self.numTopics * self.alpha)  # one value per sentiment, shape (L,)

        thirdFactor = (self.ns_dkl[d, :, :] + self.gamma) / \
                      (self.ns_dl[d] + self.numSentiments * self.gamma)[np.newaxis, :] #(K, L)

        prob = np.ones((self.numTopics, self.numSentiments))
        prob *= firstFactor * thirdFactor
        prob *= secondFactor[np.newaxis,:]
        prob /= np.sum(prob)

        # Draw one cell of the flattened (K x L) table and map it back to (t, s).
        ind = sampleFromCategorical(prob.flatten())
        t, s = np.unravel_index(ind, prob.shape)

        self.topics[(d, m)] = t
        self.sentiments[(d, m)] = s
        self.ns_d[d] += 1
        self.ns_dkl[d, t, s] += 1
        self.ns_dl[d, s] += 1
        for w in word_indices(self.doc_sent_word_dict[d], m):
            self.n_wkl[w, t, s] += 1  # word w assigned topic t, sentiment s
            self.n_kl[t, s] += 1  # words assigned topic t, sentiment s

    def calculatePhi(self):
        """Return a (V x K x L) array of word weights per (topic, sentiment).

        Smoothed word counts are divided by the smoothed per-(topic, sentiment)
        totals, scaled by a (K x L) embedding similarity, and finally divided
        by the sum over the whole array, so all entries together sum to 1.
        """
        firstFactor = (self.n_wkl + self.beta) / \
                      np.expand_dims(self.n_kl + self.n_wkl.shape[0] * self.beta, axis=0)

        topic_similarity = ot.softmax(np.dot(self.topicVectors,
                                             self.wordVectors.T))  # ( K x V)
        senti_similarity = ot.softmax(np.dot(self.sentimentVector,
                                             self.wordVectors.T))  # ( L x V)
        vector_similarity = ot.softmax(np.dot(topic_similarity, senti_similarity.T)) # K x L

        firstFactor = firstFactor * np.expand_dims(vector_similarity, axis=0)
        firstFactor /= firstFactor.sum()
        return firstFactor

    def calculateTheta(self):
        """Return a (D x K x L) array of smoothed sentence-count ratios.

        NOTE(review): the final division uses the sum over the whole array, so
        a single document's slice does not sum to 1 on its own.
        """
        secondFactor = (self.ns_dkl + self.alpha) / \
                       np.expand_dims(self.ns_dl + self.numTopics * self.alpha, axis=1)  # denominator broadcast as (D x 1 x L)
        secondFactor /= secondFactor.sum()
        return secondFactor

    def calculatePi(self):
        """Return a (D x L) array of smoothed per-document sentiment ratios.

        NOTE(review): the final division uses the sum over the whole array, so
        a single document's row does not sum to 1 on its own; the argmax of a
        row is unaffected by that scaling.
        """
        thirdFactor = (self.ns_dl + self.gamma) / \
                      np.expand_dims(self.ns_d + self.numSentiments * self.gamma, axis=1)
        thirdFactor /= thirdFactor.sum()
        return thirdFactor


    def getTopKWordsByLikelihood(self, K):
        """
        Prints the top K discriminative words for each topic t and sentiment s,
        i.e. the words v for which p(t, s | v) is largest.

        :param K: number of words to print per (topic, sentiment) pair
        """
        pseudocounts = np.copy(self.n_wkl)
        normalizer = np.sum(pseudocounts, (1, 2))
        pseudocounts /= normalizer[:, np.newaxis, np.newaxis]
        for t in range(self.numTopics):
            for s in range(self.numSentiments):
                topWordIndices = pseudocounts[:, t, s].argsort()[-1:-(K + 1):-1]
                # vocab = self.vectorizer.get_feature_names()
                print(t, s, [self.idx2word[i] for i in topWordIndices])

    def getTopKWordsByTS(self, K):
        """
        Top K words for every (topic, sentiment) pair.

        Column names are ``topic_NN_p`` for sentiment 0 and ``topic_NN_n`` for
        every other sentiment index.

        :param K: number of words per column (a fixed cut-off of 10 here would
                  ignore the caller's request)
        :return: DataFrame with one column per (topic, sentiment) pair
        """
        topic_sentiment_arr = self.calculatePhi()
        dic = {}
        for t in range(self.numTopics):
            for s in range(self.numSentiments):
                index_list = np.argsort(-topic_sentiment_arr[:, t, s])[:K]
                if s == 0:
                    name = "p"
                else:
                    name = "n"
                dic['topic_' + '{:02d}'.format(t + 1) + '_' + name] = [self.idx2word[index] for index in index_list]
        return pd.DataFrame(dic)

    def getTopKWordsByTopic(self, K):
        """Top K words per topic, ranking by ``calculatePhi()`` summed over sentiments.

        :param K: number of words per topic
        :return: DataFrame with columns "Topic1", "Topic2", ...
        """
        dic = {}
        phi = self.calculatePhi()
        topic_arr = np.sum(phi, (2))
        for t in range(self.numTopics):
            index_list = np.argsort(-topic_arr[:, t])[:K]
            dic["Topic"+str(t+1)] = [self.idx2word[index] for index in index_list]
        return pd.DataFrame(dic)

    def getTopicSentimentDist(self, d):
        """Return ``calculateTheta()[d]``, the (K x L) slice for document ``d``."""
        theta = self.calculateTheta()[d]
        return theta

    def getDocSentimentDist(self, d):
        """Return ``calculatePi()[d]``, the length-L row for document ``d``."""
        pi = self.calculatePi()[d]
        return pi

    def getTopWordsBySenti(self, K):
        """Top K words per sentiment, ranking by ``calculatePhi()`` summed over topics.

        Sentiment 0 goes to column "Sentiment_p"; every other sentiment index
        goes to "Sentiment_n".

        :param K: number of words per sentiment
        :return: DataFrame with the sentiment columns
        """
        dic = {}
        phi = self.calculatePhi()
        senti_arr = np.sum(phi, (1))
        for s in range(self.numSentiments):
            index_list = np.argsort(-senti_arr[:, s])[:K]
            if s == 0:
                name = "p"
            else:
                name = "n"
            dic["Sentiment_"+ name] = [self.idx2word[index] for index in index_list]
        return pd.DataFrame(dic)

    def classify_senti(self):
        """Compute document-level sentiment accuracy on the labelled documents.

        For every document index contained in ``self.pos_neg_sentence_indices``
        the prediction is the argmax of that document's row of
        ``calculatePi()``; predictions are compared, in ascending document
        order, with ``self.pos_neg_sentiment_label``.

        The pi matrix is computed once. Fetching it per document through
        ``getDocSentimentDist`` would rebuild the full (D x L) array each time.

        :return: fraction of labelled documents predicted correctly
        """
        pi = self.calculatePi()
        doc_sent_inference = [
            np.argmax(pi[i])
            for i in range(self.numDocs)
            if i in self.pos_neg_sentence_indices
        ]
        infer_arr = np.array(doc_sent_inference)
        answer = np.array(self.pos_neg_sentiment_label)
        return np.mean(infer_arr == answer)

    def runSTMD(self, reviews, maxIters=10):
        """Run ``maxIters`` rounds of topic-vector updates and Gibbs sweeps.

        Each round calls ``updateTopicVectors``, prints progress and the
        current ``classify_senti`` accuracy on every second round, then
        resamples every sentence of every document.

        :param reviews: not read by this method; the data set up by
                        ``_initialize_`` is used instead
        :param maxIters: number of rounds
        """
        for iteration in range(maxIters):
            self.updateTopicVectors()
            if (iteration + 1) % 2 == 0:
                print("Starting iteration %d of %d" % (iteration + 1, maxIters))
                print(self.classify_senti())

            for d in range(self.numDocs):
                for m in range(self.numSentence[d]):
                    self.sampling(d, m)


In [25]:
# Build the ASUM sampler with 10 topics and 2 sentiments.
# `wordVectors` and `sentimentVector` must already exist in the notebook namespace.
# The commented-out line is the alternative STMD configuration with 50 topics.
# sampler = STMD_Gibbs_Sampler(wordVectors, sentimentVector, numTopics=50, alpha=0.01, beta=0.001, gamma=1, numSentiments=2)
sampler = ASUM_Gibbs_Sampler(wordVectors, sentimentVector, numTopics=10, alpha=0.01, beta=0.001, gamma=1, numSentiments=2)

In [26]:
# Index the corpus and draw the initial assignments for the ASUM sampler.
# All four arguments come from earlier notebook cells.
sampler._initialize_(sentence_list_again, pos_neg_sentence_indices, pos_neg_sentiment_label, sentiment_label)

In [27]:
%%time
# Train the ASUM sampler for 10 rounds; progress and accuracy are printed every second round.
sampler.runASUM(sentence_list_again, maxIters= 10)

Starting iteration 2 of 10
0.749571428571
Starting iteration 4 of 10
0.749857142857
Starting iteration 6 of 10
0.742
Starting iteration 8 of 10
0.743571428571
Starting iteration 10 of 10
0.739857142857
CPU times: user 4min 26s, sys: 268 ms, total: 4min 26s
Wall time: 3min 54s


In [28]:
# Build and initialize the STMD sampler with the same hyper-parameters and data
# as the ASUM run above, so the printed accuracies can be compared.
samp = STMD_Gibbs_Sampler(wordVectors, sentimentVector, numTopics=10, alpha=0.01, beta=0.001, gamma=1, numSentiments=2)
samp._initialize_(sentence_list_again, pos_neg_sentence_indices, pos_neg_sentiment_label, sentiment_label)

In [29]:
# Train the STMD sampler for 10 rounds; progress and accuracy are printed every second round.
samp.runSTMD(sentence_list_again, maxIters= 10)

Starting iteration 2 of 10
0.756571428571
Starting iteration 4 of 10
0.755285714286
Starting iteration 6 of 10
0.756571428571
Starting iteration 8 of 10
0.750714285714
Starting iteration 10 of 10
0.750428571429


In [31]:
samp.getTopKWordsByTopic(10)

Unnamed: 0,Topic1,Topic10,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9
0,camera,use,use,use,camera,tv,tv,camera,use,camera
1,love,one,tv,tablet,use,samsung,set,len,work,use
2,one,get,work,one,pictur,one,samsung,canon,devic,canon
3,product,tv,app,product,take,look,screen,samsung,appl,one
4,tablet,camera,would,amazon,would,pictur,use,use,app,would
5,ipad,bought,get,would,work,use,one,would,get,get
6,great,samsung,samsung,get,one,get,camera,get,connect,purchas
7,use,back,one,purchas,great,would,would,one,one,samsung
8,purchas,screen,appl,appl,get,screen,time,amazon,samsung,len
9,appl,amazon,devic,camera,photo,great,get,return,problem,time


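### 연습장 (Scratchpad)

The cells below are exploratory scratch work. They were executed out of order and some reference attributes or methods that the sampler classes above do not define (e.g. `ns_dk`, `getTopKWords`, `conditionalDistribution`), so they are not expected to run on a fresh kernel.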

In [115]:
(sampler.n_wkl + sampler.beta).shape

(5000, 10, 2)

In [116]:
firstFactor = (sampler.n_wkl + sampler.beta) / np.expand_dims(sampler.n_kl + sampler.n_wkl.shape[0] * sampler.beta, axis=0)

In [118]:
firstFactor = (sampler.n_wkl + sampler.beta) / \
              np.expand_dims(sampler.n_kl + sampler.n_wkl.shape[0] * sampler.beta, axis=0)

topic_similarity = ot.softmax(np.dot(sampler.topicVectors,
                                     sampler.wordVectors.T))  # ( K x V)
senti_similarity = ot.softmax(np.dot(sampler.sentimentVector,
                                     sampler.wordVectors.T))  # ( L x V)

In [122]:
firstFactor /= firstFactor.sum()

In [114]:
 (sampler.n_kl + sampler.n_wkl.shape[0] * sampler.beta).shape

(10, 2)

In [112]:
np.expand_dims(sampler.ns_d + sampler.numTopics * sampler.alpha, axis=1)

array([[ 64.1],
       [ 78.1],
       [ 13.1],
       ..., 
       [  3.1],
       [  4.1],
       [  9.1]])

In [100]:
?np.expand_dims

In [102]:
np.expand_dims(sampler.ns_dk + sampler.numSentiments*sampler.gamma, axis=2)

(5000, 10, 1)

In [105]:
k.shape

(5000, 10, 2)

In [39]:
sampler.doc_sent_word_dict[0][1]

[526, 27, 2691, 1258, 125, 1063, 1492]

In [38]:
sampler.getTopKWords(10)

Unnamed: 0,topic_1_neg,topic_1_pos,topic_2_neg,topic_2_pos,topic_3_neg,topic_3_pos,topic_4_neg,topic_4_pos,topic_5_neg,topic_5_pos,topic_6_neg,topic_6_pos,topic_7_neg,topic_7_pos,topic_8_neg,topic_8_pos,topic_9_neg,topic_9_pos,topic_10_neg,topic_10_pos
0,tv,supportgp,supportgp,supportgp,supportgp,supportgp,supportgp,supportgp,supportgp,supportgp,supportgp,supportgp,supportgp,supportgp,supportgp,supportgp,supportgp,supportgp,handl,supportgp
1,use,hunt,hunt,hunt,hunt,hunt,hunt,hunt,hunt,hunt,hunt,hunt,hunt,hunt,hunt,hunt,hunt,hunt,mp4*,hunt
2,tablet,winner,winner,winner,winner,winner,winner,winner,winner,winner,winner,winner,winner,winner,winner,winner,winner,winner,wmv3,winner
3,samsung,nand,nand,nand,nand,nand,nand,nand,nand,nand,nand,nand,nand,nand,nand,nand,nand,nand,window,nand
4,app,justifi,justifi,justifi,justifi,justifi,justifi,justifi,justifi,justifi,justifi,justifi,justifi,justifi,justifi,justifi,justifi,justifi,current,justifi
5,one,univers_remot,univers_remot,univers_remot,univers_remot,univers_remot,univers_remot,univers_remot,univers_remot,univers_remot,univers_remot,univers_remot,univers_remot,univers_remot,univers_remot,univers_remot,univers_remot,univers_remot,thing,univers_remot
6,veri,ir_blaster,ir_blaster,ir_blaster,ir_blaster,ir_blaster,ir_blaster,ir_blaster,ir_blaster,ir_blaster,ir_blaster,ir_blaster,ir_blaster,ir_blaster,ir_blaster,ir_blaster,ir_blaster,ir_blaster,not_found,ir_blaster
7,like,companion,companion,companion,companion,companion,companion,companion,companion,companion,companion,companion,companion,companion,companion,companion,companion,companion,avi*,companion
8,set,z,z,z,z,z,z,z,z,z,z,z,z,z,z,z,z,z,asf*,z
9,get,student,student,student,student,student,student,student,student,student,student,student,student,student,student,student,student,student,year_ago,student


In [18]:
# Scan every sentence and print the (document, sentence) index of each one
# for which the similarity computation used in sampling raises an error.
# After the first failing sentence of a document the rest of that document is
# skipped (the `break` leaves only the inner loop).
for d in range(sampler.numDocs):
    for m in range(sampler.numSentence[d]):
        try:
            sentence_vectors = sampler.wordVectors[sampler.doc_sent_word_dict[d][m]]
            topic_similarity = ot.softmax(np.dot(sampler.topicVectors,
                                                 sentence_vectors.T))  # ( K x num words in sentence)
            senti_similarity = ot.softmax(np.dot(sampler.sentimentVector,
                                                 sentence_vectors.T))  # ( L x num words in sentence)
            vector_similarity = ot.softmax(np.dot(topic_similarity, senti_similarity.T))
        except Exception:
            # Catch ordinary errors only: a bare `except:` would also swallow
            # KeyboardInterrupt, making this long scan impossible to interrupt.
            print(d,m)
            break

1795 47
1825 18
1837 6
1840 3
1841 2
1843 0
2076 31
2092 15
2101 6
2102 5
2104 3
2105 2
2107 0


In [20]:
sampler.doc_sent_word_dict[2107]

[[],
 [0, 903, 113],
 [2545, 1529, 98, 529, 165, 3346],
 [153,
  2424,
  91,
  2424,
  4813,
  79,
  251,
  449,
  400,
  135,
  0,
  264,
  825,
  316,
  2274,
  3083,
  3017,
  3460,
  170,
  14,
  1,
  1118,
  92,
  1268,
  507],
 [929, 1, 27, 2672, 46, 143, 63, 48, 709, 304],
 [4962, 65, 314, 285, 314, 2421],
 [14, 2672, 279, 225, 1253, 4848, 41, 92],
 [1903, 1124, 1253],
 [104, 604, 9, 317, 249, 2672, 45, 1118],
 [1253],
 [759],
 [3108],
 [1502, 249, 489, 27, 249, 84, 8, 223, 111],
 [2466, 0, 136, 4],
 [3],
 [27],
 [1542, 136, 4, 13, 567, 98, 414, 903, 313, 13, 23, 45, 8, 307]]

In [241]:
sampler.topicVectors.shape

(10, 100)

In [243]:
sampler.wordVectors.shape

(2894, 100)

In [244]:
sampler.doc_sent_word_dict[0][2]

[2, 1063]

In [187]:
probabilities_ts = sampler.conditionalDistribution(0,2)

In [225]:
# 해당 문서와 topic vector와 유사도
topic_similarity = ot.softmax(np.dot(sampler.topicVectors, sampler.wordVectors[sampler.doc_sent_word_dict[0][1]].T))
#topic_similarity /= topic_similarity.sum()


In [221]:
topic_similarity.shape

(10, 7)

In [227]:
senti_similarity = ot.softmax(np.dot(sampler.sentimentVector, sampler.wordVectors[sampler.doc_sent_word_dict[0][1]].T))

In [228]:
senti_similarity.shape

(2, 7)

In [224]:
senti_similarity

array([[  2.02158501e-03,   1.93985192e-03,   5.31407746e-04,
          1.05096038e-03,   5.05358453e-03,   1.61914991e-03,
          3.83634056e-03],
       [  3.43673315e-04,   2.43161809e-02,   9.56667229e-01,
          1.71550308e-04,   9.82957866e-04,   3.72137117e-04,
          1.09339132e-03]])

In [229]:
vector_similarity = ot.softmax(np.dot(topic_similarity, senti_similarity.T))

In [231]:
result = probabilities_ts * vector_similarity
result/= result.sum()

In [233]:
result.flatten().sum()

1.0

In [234]:
ind = sampleFromCategorical(result.flatten())

In [235]:
np.unravel_index(ind, result.shape)

(2, 1)

In [76]:
sampler.getTopKWords(10)

Unnamed: 0,topic_1_neg,topic_1_pos,topic_2_neg,topic_2_pos,topic_3_neg,topic_3_pos,topic_4_neg,topic_4_pos,topic_5_neg,topic_5_pos,topic_6_neg,topic_6_pos,topic_7_neg,topic_7_pos,topic_8_neg,topic_8_pos,topic_9_neg,topic_9_pos,topic_10_neg,topic_10_pos
0,tablet,tablet,use,tv,tv,use,tv,tv,tv,tv,tv,tv,tv,tablet,tv,tv,tv,use,tv,use
1,use,tv,tv,use,use,tablet,use,use,use,use,use,samsung,tablet,tv,use,use,tablet,tv,samsung,tv
2,tv,use,samsung,tablet,tablet,tv,samsung,samsung,samsung,samsung,samsung,use,use,use,samsung,samsung,use,tablet,use,samsung
3,samsung,samsung,tablet,samsung,samsung,samsung,tablet,tablet,tablet,tablet,tablet,tablet,samsung,samsung,tablet,tablet,samsung,samsung,tablet,tablet
4,app,app,app,app,one,app,one,app,app,app,veri,veri,app,app,app,app,app,app,one,one
5,veri,veri,one,veri,veri,veri,veri,one,one,veri,one,app,one,one,veri,one,one,one,veri,app
6,one,one,veri,like,app,one,app,veri,veri,one,set,one,veri,veri,one,set,veri,like,app,veri
7,set,tab,like,one,like,camera,camera,set,like,set,like,set,like,get,set,veri,like,tab,like,like
8,look,screen,get,set,set,like,like,like,set,get,app,camera,video,like,tab,like,set,veri,get,also
9,get,like,look,screen,video,set,get,look,get,like,look,like,screen,set,get,look,get,get,also,look


In [62]:
pos_score = np.dot(sampler.sentimentVector, sampler.wordVectors[sampler.doc_sent_word_dict[0][1]].T).sum(axis=1)
pos_score = ot.softmax(pos_score)

In [65]:
pos_score

array([  1.34681396e-04,   9.99865319e-01])

In [66]:
np.argmax(pos_score)

1

In [28]:
sampler.getTopKWords(10)

Unnamed: 0,topic_1_neg,topic_1_pos,topic_2_neg,topic_2_pos,topic_3_neg,topic_3_pos,topic_4_neg,topic_4_pos,topic_5_neg,topic_5_pos,topic_6_neg,topic_6_pos,topic_7_neg,topic_7_pos,topic_8_neg,topic_8_pos,topic_9_neg,topic_9_pos,topic_10_neg,topic_10_pos
0,exceed_expect,exceed_expect,exceed_expect,exceed_expect,audio,tv,exceed_expect,exceed_expect,exceed_expect,hdcp,exceed_expect,exceed_expect,exceed_expect,exceed_expect,exceed_expect,lpcm,exceed_expect,exceed_expect,samsung,exceed_expect
1,packaging.galaxi,packaging.galaxi,packaging.galaxi,packaging.galaxi,support,use,packaging.galaxi,packaging.galaxi,packaging.galaxi,5.2-channel,packaging.galaxi,packaging.galaxi,packaging.galaxi,packaging.galaxi,packaging.galaxi,digit,packaging.galaxi,packaging.galaxi,wqxga_ppi,packaging.galaxi
2,satur,satur,satur,satur,rington,tablet,satur,satur,satur,tx-nr636,satur,satur,satur,satur,satur,bd-f5900,satur,satur,exyno_octa,satur
3,ultrahd,ultrahd,ultrahd,ultrahd,tone,samsung,ultrahd,ultrahd,ultrahd,featur,ultrahd,ultrahd,ultrahd,ultrahd,ultrahd,aac,ultrahd,ultrahd,quadcor_gb,ultrahd
4,not_keep,not_keep,not_keep,not_keep,mp3,app,not_keep,not_keep,not_keep,onkyo,not_keep,not_keep,not_keep,not_keep,not_keep,he-aac,not_keep,not_keep,x_microusb,not_keep
5,matt,matt,matt,matt,polyphon,veri,matt,matt,matt,hdmi,matt,matt,matt,matt,matt,dolbi_digit,matt,matt,quadcor_ghz,matt
6,microusb_port,microusb_port,microusb_port,microusb_port,amr,one,microusb_port,microusb_port,microusb_port,tx-nr535,microusb_port,microusb_port,microusb_port,microusb_port,microusb_port,videos.so,microusb_port,microusb_port,ram_gb,microusb_port
7,vast,vast,vast,vast,aac,like,vast,vast,vast,exceed_expect,vast,vast,vast,vast,vast,mp3/mpeg,vast,vast,kitkat_x,vast
8,not_but,not_but,not_but,not_but,wma/asf,get,not_but,not_but,not_but,microusb_port,not_but,not_but,not_but,not_but,not_but,wma,not_but,not_but,b_w,not_but
9,damag,damag,damag,damag,ogg,set,damag,damag,damag,not_but,damag,damag,damag,damag,damag,butt,damag,damag,galaxi_tabpro,damag


In [76]:
sampler.conditionalDistribution(0,0,20)

array([[  1.83506814e-10,   2.23290875e-04],
       [  1.60974086e-10,   5.36969161e-05],
       [  1.68805741e-10,   4.89018857e-05],
       [  2.66052459e-08,   9.99230810e-01],
       [  1.57084661e-10,   5.55868529e-05],
       [  2.31037422e-10,   1.60966821e-04],
       [  1.43182080e-10,   8.30383198e-05],
       [  1.88503172e-10,   7.10861515e-05],
       [  5.51199076e-11,   3.13752639e-05],
       [  2.24208768e-10,   4.12188409e-05]])

In [43]:
w = 0
t = 1
s = 0
k = (sampler.n_wkl[w,t,s] + sampler.beta) / (sampler.n_kl + sampler.n_wkl.shape[0] * sampler.beta)

In [44]:
k.shape

(10, 2)

In [139]:
# Scratch attempt at a per-sentence word factor for document 10, sentence 2.
# `prob` starts as the scalar 1 and becomes a (topics x sentiments) array after
# the first multiplication; it is renormalized over all cells at every step.
# NOTE(review): the outer loop walks the sentence's word list (repeats included)
# and the inner loop runs once per occurrence count, so a word that appears
# c times contributes c * c factors.
prob = 1
d = 10
m = 2
for word_idx in sampler.doc_sent_word_dict[d][m]:
    for i in range(sampler.wordCountSentence[d][m][word_idx]):
        prob *= sampler.n_wkl[word_idx,:,:] + sampler.beta + sampler.wordCountSentence[d][m][word_idx] -1 -i
        prob /= prob.sum()

In [140]:
prob

array([[  1.28128710e-11,   3.32387162e-02],
       [  3.56568843e-28,   7.88681872e-05],
       [  4.99174088e-23,   1.62427575e-10],
       [  1.12064780e-23,   2.05496915e-04],
       [  4.62029047e-24,   2.66786082e-06],
       [  2.00164567e-14,   9.65114191e-01],
       [  1.34288983e-24,   1.35861253e-03],
       [  4.41120245e-16,   1.36397954e-06],
       [  3.06202612e-36,   1.09052640e-16],
       [  3.60526903e-34,   8.29690294e-08]])

In [111]:
sampler.n_wkl[word_idx,:,:] + sampler.beta + sampler.wordCountSentence[d][m][word_idx] -1 -i

array([[ 270.01,  995.01],
       [  32.01,  390.01],
       [  60.01,  359.01],
       [  67.01,  449.01],
       [  82.01,  480.01],
       [ 167.01,  679.01],
       [  74.01,  525.01],
       [ 148.01,  535.01],
       [  10.01,  148.01],
       [  29.01,  286.01]])

In [98]:
sampler.wordCountSentence[d][m][word_idx]

1

In [84]:
topic_score = np.dot(sampler.topicVectors, sampler.wordVectors[sampler.doc_sent_word_dict[0][1]].T).sum(axis=1)

In [100]:
sampleFromCategorical(ot.softmax(topic_score))

4

In [45]:
sampler.doc_sent_word_dict[0][1]

[526, 27, 2691, 1258, 125, 1063, 1492]

In [46]:
sampler.wordCountSentence[0][1]

Counter({27: 1, 125: 1, 526: 1, 1063: 1, 1258: 1, 1492: 1, 2691: 1})

In [47]:
# Same scratch word-factor computation as above, for document 0, sentence 1.
d = 0
m = 1

# `prob` starts as the scalar 1 and becomes a (topics x sentiments) array after
# the first multiplication; it is renormalized over all cells at every step.
prob = 1
for word_idx in sampler.doc_sent_word_dict[d][m]:
    for i in range(sampler.wordCountSentence[d][m][word_idx]):
        prob *= sampler.n_wkl[word_idx, :, :] + sampler.beta + sampler.wordCountSentence[d][m][word_idx] - 1 - i
        prob /= prob.sum()

In [48]:
sampler.doc_sent_word_dict[d][m]

[526, 27, 2691, 1258, 125, 1063, 1492]

In [49]:
sampler.wordCountSentence[d][m]

Counter({27: 1, 125: 1, 526: 1, 1063: 1, 1258: 1, 1492: 1, 2691: 1})

In [53]:
word_count = sampler.wordCountSentence[d][m]

In [54]:
word_count

Counter({27: 1, 125: 1, 526: 1, 1063: 1, 1258: 1, 1492: 1, 2691: 1})

In [55]:
secondFactor = (sampler.ns_dk[d, :] + sampler.alpha) / \
                       (sampler.ns_d[d] + sampler.numTopics * sampler.alpha)
thirdFactor = (sampler.ns_dkl[d, :, :] + sampler.gamma) / \
                      (sampler.ns_dk[d] + sampler.numSentiments * sampler.gamma)[:, np.newaxis] 

In [59]:
secondFactor[:,np.newaxis] * thirdFactor

array([[  9.83465702e-01,   1.51302416e-02],
       [  7.80031201e-05,   7.80031201e-05],
       [  7.80031201e-05,   7.80031201e-05],
       [  7.80031201e-05,   7.80031201e-05],
       [  7.80031201e-05,   7.80031201e-05],
       [  7.80031201e-05,   7.80031201e-05],
       [  7.80031201e-05,   7.80031201e-05],
       [  7.80031201e-05,   7.80031201e-05],
       [  7.80031201e-05,   7.80031201e-05],
       [  7.80031201e-05,   7.80031201e-05]])

In [57]:
thirdFactor.shape

(10, 2)

In [60]:
beta0 = sampler.n_kl + sampler.beta

In [63]:
betaw = sampler.n_wkl[0,t,s] + sampler.beta

In [None]:
def sampling()

In [73]:
# Scratch version of the word-count factor that the samplers' `sampling` method
# computes, evaluated for the sentence selected by the current notebook values
# of `d` and `m`.
prob = np.ones((sampler.numTopics, sampler.numSentiments))

word_count = sampler.wordCountSentence[d][m]
for t in range(sampler.numTopics):
    for s in range(sampler.numSentiments):
        beta0 = sampler.n_kl[t][s] + sampler.beta
        m0 = 0  # number of word occurrences already folded into prob[t][s]
        for word in word_count.keys():
            betaw = sampler.n_wkl[word,t,s] + sampler.beta
            cnt = word_count[word]
            for i in range(cnt):
                # i counts earlier occurrences of this word, m0 all earlier occurrences.
                prob[t][s] *= (betaw+i) / (beta0 + m0)
                m0 += 1

In [74]:
prob

array([[  2.87578284e-25,   1.35537510e-15],
       [  1.35537510e-15,   1.35537510e-15],
       [  1.35537510e-15,   1.35537510e-15],
       [  1.35537510e-15,   1.35537510e-15],
       [  1.35537510e-15,   1.35537510e-15],
       [  1.35537510e-15,   1.35537510e-15],
       [  1.35537510e-15,   1.35537510e-15],
       [  1.35537510e-15,   1.35537510e-15],
       [  1.35537510e-15,   1.35537510e-15],
       [  1.60711003e-33,   1.35537510e-15]])

In [72]:
beta0

array([[  5.50457010e+05,   1.00000000e-02],
       [  1.00000000e-02,   1.00000000e-02],
       [  1.00000000e-02,   1.00000000e-02],
       [  1.00000000e-02,   1.00000000e-02],
       [  1.00000000e-02,   1.00000000e-02],
       [  1.00000000e-02,   1.00000000e-02],
       [  1.00000000e-02,   1.00000000e-02],
       [  1.00000000e-02,   1.00000000e-02],
       [  1.00000000e-02,   1.00000000e-02],
       [  4.81010000e+02,   1.00000000e-02]])

In [None]:
probTable[ti][si] = (matrixSDT[si].getValue(docNo, ti) + this.alpha) / (sumDST[docNo][si] + this.sumAlpha)
//						* (matrixDS.getValue(docNo, si) + this.gammas[si]) / (sumDS[docNo] + this.sumGamma)

In [None]:

for s in range(sampler.numSentiments):
    for topic in range(sampler.numTopics):
        expectTSW = 1:
            for word in 