In [1]:
# -*- coding: utf-8 -*-
import numpy as np
import cPickle
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x)).

    The argument is clamped to [-500, 500] before exponentiation so that
    np.exp stays inside the float64 range.
    """
    clamped = np.clip(x, -500, 500)
    denominator = 1 + np.exp(-clamped)
    return 1. / denominator

class RBM(object):
    """Restricted Boltzmann Machine over word-count vectors, trained with CD-k.

    Hidden units are binary (sigmoid activation, Bernoulli sampling). A visible
    vector is resampled as a multinomial draw of ``D`` words from a softmax
    over the vocabulary, where ``D`` is the row sum of the input document.
    On top of the contrastive-divergence gradient, ``train`` subtracts four
    weighted prior terms from the weight update (see ``_prior_gradients``).
    """

    def __init__(self, n_visible, n_hidden, mbtsz, epochs, eta, mrate, np_rng, weightinit=0.001,
                 lambda1=1, lambda2=1, lambda3=1, lambda4=1,
                 p_A_vk=None, p_s_vk=None, p_Aj_vk=None, p_sj_vk=None):
        """
        CD-k training of RBM with SGD + Momentum.
        @param n_visible:   num of lexicon
        @param n_hidden:    num of latent topics
        @param epochs:      training epochs
        @param eta:         learning rate
        @param mrate:       momentum rate
        @param mbtsz:       mini-batch size
        @param np_rng:      instances of RandomState
        @param weightinit:  scaling of random weight initialization
        @param lambda1:     weight of the prior term built from p_Aj_vk
        @param lambda2:     weight of the prior term built from p_A_vk
        @param lambda3:     weight of the prior term built from p_s_vk
        @param lambda4:     weight of the prior term built from p_sj_vk
        @param p_A_vk, p_s_vk, p_Aj_vk, p_sj_vk:
                            prior tables indexed [visible][hidden]; a table
                            left as None contributes nothing to the update
        """
        self.n_visible = n_visible
        self.n_hidden = n_hidden
        self.mbtsz = mbtsz
        self.epochs = epochs
        self.eta = eta
        self.mrate = mrate
        self.np_rng = np_rng
        self.W = weightinit * np_rng.randn(n_visible, n_hidden)
        self.vbias = weightinit * np_rng.randn(n_visible)
        self.hbias = np.zeros((n_hidden))
        # for momentum
        self.mW = np.zeros((n_visible, n_hidden))
        self.mvbias = np.zeros((n_visible))
        self.mhbias = np.zeros((n_hidden))

        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.lambda3 = lambda3
        self.lambda4 = lambda4
        self.p_A_vk = p_A_vk
        self.p_s_vk = p_s_vk
        self.p_Aj_vk = p_Aj_vk
        self.p_sj_vk = p_sj_vk

    def train(self, data):
        """Run ``self.epochs`` passes of mini-batch CD-1 with momentum over ``data``.

        @param data: 2-D array of word counts, one row per document and one
                     column per visible unit. The final mini-batch may hold
                     fewer than ``mbtsz`` rows.
        Prints the epoch index at the start of every epoch and updates
        ``W``, ``vbias`` and ``hbias`` in place.
        """
        for epoch in range(self.epochs):
            print(epoch)
            # Shuffling was left disabled in the original notebook:
            #self.np_rng.shuffle(data)
            for start in range(0, data.shape[0], self.mbtsz):
                mData = data[start:start + self.mbtsz]
                ph_mean, nv_samples, nh_means = self.cd_k(mData)

                # Momentum accumulators: the weight term is summed over the
                # batch while both bias terms are batch means.
                self.mW = self.mW * self.mrate + (np.dot(mData.T, ph_mean) - np.dot(nv_samples.T, nh_means))
                self.mvbias = self.mvbias * self.mrate + np.mean(mData - nv_samples, axis=0)
                self.mhbias = self.mhbias * self.mrate + np.mean(ph_mean - nh_means, axis=0)

                prior1, prior2, prior3, prior4 = self._prior_gradients(mData)

                self.W += (self.eta * self.mW
                           - self.lambda1 * prior1 - self.lambda2 * prior2
                           - self.lambda3 * prior3 - self.lambda4 * prior4)
                self.vbias += self.eta * self.mvbias
                self.hbias += self.eta * self.mhbias

    def _prior_gradients(self, mData):
        """Accumulate the four prior terms for one mini-batch.

        For every document ``vk`` and every word ``i`` with a non-zero count,
        with ``G = logistic(W[i] * vk[i] + hbias)`` the contribution to entry
        ``[i][j]`` of a prior table ``p`` is
        ``2*G*vk[i] / ((1+G)**2 * (1/(1+G) - p[i][j]))``.
        Entries with ``p[i][j] == 0`` are not touched, and an entry ``[i][j]``
        is skipped for all four terms as soon as ``1/(1+G)`` equals any of the
        supplied tables there (this avoids a zero denominator).

        @return: list ``[prior1, prior2, prior3, prior4]`` matching
                 ``p_Aj_vk, p_A_vk, p_s_vk, p_sj_vk``; tables that are None
                 yield all-zero arrays.
        """
        tables = [self.p_Aj_vk, self.p_A_vk, self.p_s_vk, self.p_sj_vk]
        grads = [np.zeros((self.n_visible, self.n_hidden)) for _ in tables]
        active = [(grad, np.asarray(table))
                  for grad, table in zip(grads, tables) if table is not None]
        if not active:
            return grads

        for vk in mData:
            for w in np.flatnonzero(vk):
                count = vk[w]
                # One row of hidden units at a time instead of a Python loop over j.
                G = self._logistic(self.W[w] * count + self.hbias)
                inv = 1.0 / (1 + G)

                skip = np.zeros(self.n_hidden, dtype=bool)
                for _, table in active:
                    skip |= (inv == table[w])

                for grad, table in active:
                    use = ~skip & (table[w] != 0)
                    if use.any():
                        grad[w, use] += (2 * G[use] * count
                                         / ((1 + G[use]) ** 2 * (inv[use] - table[w][use])))
        return grads

    def cd_k(self, data, k=1):
        """Run k steps of Gibbs sampling starting from ``data``.

        @return: (ph_mean, nv_samples, nh_means) -- hidden probabilities given
                 the data, the visible sample of the last step and the hidden
                 probabilities given that sample.
        @raise ValueError: if k < 1 (no negative-phase statistics would exist).
        """
        if k < 1:
            raise ValueError("k must be >= 1")
        D = data.sum(axis=1)
        ph_mean, h_samples = self.sample_h(data, D)

        for _ in range(k):
            nv_means, nv_samples, nh_means, h_samples = self.gibbs_hvh(h_samples, D)
        return ph_mean, nv_samples, nh_means

    def sample_h(self, v0_sample, D):
        """Hidden probabilities and Bernoulli samples given visible counts.

        The hidden bias is multiplied by each document's total count ``D``.
        """
        h1_mean = sigmoid(np.dot(v0_sample, self.W) + np.outer(D, self.hbias))
        h1_sample = self.np_rng.binomial(size=h1_mean.shape, n=1, p=h1_mean)
        return [h1_mean, h1_sample]

    def sample_v(self, h0_sample, D):
        """Softmax word distribution and multinomial count samples given hidden states.

        The row count is taken from ``h0_sample`` rather than ``self.mbtsz`` so
        a short final mini-batch is handled correctly.
        """
        n_docs = h0_sample.shape[0]
        x = np.dot(h0_sample, self.W.T)
        x = np.clip(x, -500, 500)
        pre_soft = np.exp(x + self.vbias)
        v1_mean = pre_soft / pre_soft.sum(axis=1, keepdims=True)
        v1_sample = np.zeros((n_docs, v1_mean.shape[1]))
        for row in range(n_docs):
            # multinomial needs an integer number of trials; D may be a float sum.
            v1_sample[row] = self.np_rng.multinomial(size=1, n=int(D[row]), pvals=v1_mean[row])
        return [v1_mean, v1_sample]

    def gibbs_hvh(self, h0_sample, D):
        """One Gibbs step hidden -> visible -> hidden."""
        v1_mean, v1_sample = self.sample_v(h0_sample, D)
        h1_mean, h1_sample = self.sample_h(v1_sample, D)
        return [v1_mean, v1_sample, h1_mean, h1_sample]

    def wordPredict(self, topic, voc, top_n=30):
        """Print the ``top_n`` most probable words for each hidden unit.

        @param topic: number of hidden units to report (one one-hot vector each)
        @param voc:   mapping from column index 0..len(voc)-1 to word
        @param top_n: how many words to print per topic (default 30)
        """
        for i, vecTopic in enumerate(np.eye(topic)):
            pre_soft = np.exp(np.dot(vecTopic, self.W.T) + self.vbias)
            word_distribution = pre_soft / pre_soft.sum()
            tmpDict = {}
            for j in range(len(voc)):
                tmpDict[voc[j]] = word_distribution[j]
            # Single pre-formatted argument prints the same text on Python 2 and 3.
            print('topic %s : %s' % (i, vecTopic))
            ranked = sorted(tmpDict.items(), key=lambda x: x[1], reverse=True)
            for word, prob in ranked[:top_n]:
                print('%s %s' % (word, prob))
            print('-')

    def run_visible(self, data, t):
        """Binarise hidden probabilities with threshold ``t``.

        @return: float array (num_examples, n_hidden) holding 1.0 where the
                 hidden probability is strictly greater than ``t``, else 0.0.
        NOTE(review): the hidden bias is added unscaled here, whereas
        ``sample_h`` multiplies it by the document length -- confirm intended.
        """
        hidden_activations = np.dot(data, self.W) + self.hbias
        hidden_probs = self._logistic(hidden_activations)
        return (hidden_probs > t).astype(float)

    def _logistic(self, x):
        """Logistic function with the input clipped to [-500, 500]."""
        x = np.clip(x, -500, 500)
        return 1.0 / (1 + np.exp(-x))

    def saveParams(self, filePath):
        """Pickle ``W``, ``vbias`` and ``hbias`` as a dict to ``filePath``."""
        params = {'W': self.W,
                  'vbias': self.vbias,
                  'hbias': self.hbias}
        # Binary mode plus a context manager: pickle output is bytes and the
        # handle must be closed so the data is flushed.
        with open(filePath, 'wb') as fh:
            cPickle.dump(params, fh)

def inputData(filePath):
    """Read a whitespace-tokenised corpus (one document per line) into a count matrix.

    @param filePath: path of a text file; every line is one document.
    @return: (v, id2word) where ``v`` is a (num_docs, vocab_size) float array
             of word counts and ``id2word`` maps column index -> word.
             Column ids are handed out in order of first appearance.
    """
    word2id = {}
    docs = []
    # `with` closes the handle even if reading or tokenising raises.
    with open(filePath, "r") as corpus:
        for line in corpus:
            tokens = line.rstrip().split()
            for word in tokens:
                # First sighting of a word assigns it the next free column id.
                word2id.setdefault(word, len(word2id))
            docs.append(Counter(tokens))

    v = np.zeros((len(docs), len(word2id)))
    for row, counts in enumerate(docs):
        for word, freq in counts.items():
            v[row][word2id[word]] = freq
    return v, {wID: word for word, wID in word2id.items()}

In [9]:
print("Loading dataset...")
# doc_len[d] holds the number of consecutive sentences seen for document id d.
# NOTE(review): 52624 is a hard-coded upper bound on document ids -- confirm
# it matches test_rr.txt.
doc_len = [0 for x in range(0,52624)]
doc_id = 0
with open('test_rr.txt') as f:
    content = f.readlines()
dataset = []
for line in content:
    # Each line is "<doc id>\t<sentence>"; maxsplit=1 keeps tabs that occur
    # inside the sentence from breaking the two-way unpacking.
    i, words = line.split('\t', 1)
    i = int(i)
    if doc_id == i:
        doc_len[i] = doc_len[i]+1
    else:
        # A new document id starts: restart its sentence counter.
        doc_id = i
        doc_len[i]=1
    dataset.append(words.strip())
# Pre-formatted single-argument prints emit the same text on Python 2 and 3.
print('Total number of sentences: %d' % len(dataset))

# Bag-of-words counts for the first 15000 sentences only.
tf_vectorizer = CountVectorizer(max_df = 0.85,stop_words = 'english',max_features = 10000)
tf = tf_vectorizer.fit_transform(dataset[0:15000])
tf = tf.toarray()
print('Number of training objects:  %d' % tf.shape[0])
print('Number of vocabulary dictionary:  %d' % tf.shape[1])

#vocab = tf_vectorizer.get_feature_names()
vocab = tf_vectorizer.vocabulary_
# Invert word -> column index into column index -> word. A plain dict raises
# KeyError on a missing index instead of silently inventing an entry.
voc = {idx: word for word, idx in vocab.items()}
docs = tf

# Prior tables consumed by RBM.train; each is indexed [word][hidden unit] there.
p_Aj_vk = np.loadtxt('p_Aj_vk')
p_A_vk = np.loadtxt('p_A_vk' )
p_s_vk = np.loadtxt('p_s_vk')
p_sj_vk = np.loadtxt('p_sj_vk')


Loading dataset...
Total number of sentences: 179139
Number of training objects:  15000
Number of vocabulary dictionary:  8959


In [3]:
# Train the prior-regularised RBM with 10 hidden units and print each unit's top words.
topic = 10

prior_tables = dict(p_A_vk=p_A_vk, p_s_vk=p_s_vk, p_Aj_vk=p_Aj_vk, p_sj_vk=p_sj_vk)
prior_weights = dict(lambda1=0.04, lambda2=0.01, lambda3=0.01, lambda4=0.01)

rbm = RBM(
    n_visible=len(docs[0]),
    n_hidden=topic,
    mbtsz=50,
    epochs=10,
    eta=0.1,
    mrate=0.8,
    np_rng=np.random.RandomState(1234),
    **dict(prior_weights, **prior_tables)
)

training_docs = docs[0:15000]
rbm.train(training_docs)
rbm.wordPredict(topic, voc)



0
1
2
3
4
5
6
7
8
9
topic 0 : [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
good 0.71097929205
wine 0.104821661652
list 0.048427253918
highly 0.04566918602
recommended 0.0192548076766
glass 0.0131384457555
nice 0.0118113090717
selection 0.00943310731536
fantastic 0.00766146222216
definitely 0.00560897239007
think 0.0049289787984
place 0.00465012417797
restaurant 0.00397008985121
come 0.00373744530931
value 0.00346859292651
interesting 0.00119184829197
work 0.000404358402058
decent 0.000149507075673
extensive 0.000139197069685
neighborhood 0.00013475337194
service 7.6719369327e-05
overall 5.93752168514e-05
wonderful 4.22499330743e-05
check 3.06694487708e-05
bottle 2.64506710228e-05
really 1.63641218341e-05
return 1.55491663094e-05
favorite 1.49703924821e-05
best 1.4180010854e-05
authentic 1.08945832641e-05
-
topic 1 : [ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.]
excellent 0.4090902793
amaze 0.327143959431
restaurant 0.0776024913038
place 0.0673710924421
service 0.0518903079642
terrible 0.02038

In [12]:
from sklearn.metrics import precision_score,recall_score,f1_score

label = np.loadtxt('labels')[0:15000]
# Hidden units count as "on" when their probability exceeds 0.99.
result = rbm.run_visible(docs[0:15000], 0.99)

# Each pair is (label column, hidden-unit column) scored against each other;
# precision, recall and F1 are printed in that order for every pair.
for label_col, unit_col in [(0, 2), (1, 6), (2, 8)]:
    for metric in (precision_score, recall_score, f1_score):
        print(metric(label[:, label_col], result[:, unit_col]))

0.730827067669
0.376379477251
0.496869009585
0.533302497687
0.623580313683
0.574918972825
0.360275689223
0.494836488812
0.416968817984


In [29]:
# Write the hidden-unit assignment matrix to rbm.txt as integers.
np.savetxt(fname='rbm.txt', X=result, fmt='%d')


In [16]:
# Try different prior weights for better accuracy (`topic` is reused from the
# earlier training cell).
prior_tables = dict(p_A_vk=p_A_vk, p_s_vk=p_s_vk, p_Aj_vk=p_Aj_vk, p_sj_vk=p_sj_vk)
prior_weights = dict(lambda1=0.01, lambda2=0.05, lambda3=0.01, lambda4=0.01)

rbm = RBM(
    n_visible=len(docs[0]),
    n_hidden=topic,
    mbtsz=50,
    epochs=10,
    eta=0.1,
    mrate=0.8,
    np_rng=np.random.RandomState(1234),
    **dict(prior_weights, **prior_tables)
)

training_docs = docs[0:15000]
rbm.train(training_docs)
rbm.wordPredict(topic, voc)

0
1
2
3
4
5
6
7
8
9
topic 0 : [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
food 0.773813706156
delicious 0.047049838745
highly 0.0430316140648
fantastic 0.0300558027493
best 0.0188862401376
fun 0.0148665177754
recommended 0.0142244455954
wonderful 0.013892720645
absolutely 0.00815230214643
think 0.00722398996762
ok 0.00537374174872
italian 0.00417631403085
restaurant 0.00348971054679
authentic 0.00348147159341
amazing 0.00214432427038
average 0.0020347908663
definitely 0.00135380623463
terrible 0.00120227697363
prepared 0.00106061093574
fresh 0.000915317268552
tasty 0.000850340334477
expect 0.000821537833215
awesome 0.000726476283768
excellent 0.00054442939722
amaze 0.000272486767054
importantly 0.000162108850914
mediocre 4.47541312157e-05
indian 3.99113017044e-05
outstanding 3.44555571429e-05
horrible 1.27668265281e-05
-
topic 1 : [ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.]
excellent 0.54577853977
food 0.265375651167
restaurant 0.180973856772
service 0.0078663499178
delicious 3.79513530347

In [44]:
from sklearn.metrics import precision_score,recall_score,f1_score

label = np.loadtxt('labels')[0:15000]
# The second argument 0 is the threshold value handed to run_visible.
result = rbm.run_visible(docs[0:15000], 0)

# Each pair is (label column, hidden-unit column) scored against each other;
# precision, recall and F1 are printed in that order for every pair.
for label_col, unit_col in [(0, 6), (1, 7), (2, 8)]:
    for metric in (precision_score, recall_score, f1_score):
        print(metric(label[:, label_col], result[:, unit_col]))

0.749122396443
0.619748305905
0.678321678322
0.319858712716
0.881557598702
0.469402447804
0.158092553441
0.57917383821
0.248385310943


In [45]:
#Normal RBM (without priors)


# -*- coding: utf-8 -*-
import numpy as np
import cPickle
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


def sigmoid(x):
    """Logistic function applied element-wise.

    Values outside [-500, 500] are clipped first, which keeps np.exp from
    overflowing float64.
    """
    bounded = np.clip(x, -500, 500)
    exp_neg = np.exp(-bounded)
    return 1. / (1 + exp_neg)

class RBM(object):
    """Plain RBM over word-count vectors (no prior terms), trained with CD-k.

    Hidden units are binary (sigmoid activation, Bernoulli sampling). A visible
    vector is resampled as a multinomial draw of ``D`` words from a softmax
    over the vocabulary, where ``D`` is the row sum of the input document.
    The ``lambda*`` and ``p_*`` constructor arguments are stored for interface
    compatibility but are not used by this variant's ``train``.
    """

    def __init__(self, n_visible, n_hidden, mbtsz, epochs, eta, mrate, np_rng, weightinit=0.001,
                 lambda1=1, lambda2=1, lambda3=1, lambda4=1,
                 p_A_vk=None, p_s_vk=None, p_Aj_vk=None, p_sj_vk=None):
        """
        CD-k training of RBM with SGD + Momentum.
        @param n_visible:   num of lexicon
        @param n_hidden:    num of latent topics
        @param epochs:      training epochs
        @param eta:         learning rate
        @param mrate:       momentum rate
        @param mbtsz:       mini-batch size
        @param np_rng:      instances of RandomState
        @param weightinit:  scaling of random weight initialization
        @param lambda1..lambda4, p_A_vk, p_s_vk, p_Aj_vk, p_sj_vk:
                            stored on the instance only; unused in training here
        """
        self.n_visible = n_visible
        self.n_hidden = n_hidden
        self.mbtsz = mbtsz
        self.epochs = epochs
        self.eta = eta
        self.mrate = mrate
        self.np_rng = np_rng
        self.W = weightinit * np_rng.randn(n_visible, n_hidden)
        self.vbias = weightinit * np_rng.randn(n_visible)
        self.hbias = np.zeros((n_hidden))
        # for momentum
        self.mW = np.zeros((n_visible, n_hidden))
        self.mvbias = np.zeros((n_visible))
        self.mhbias = np.zeros((n_hidden))

        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.lambda3 = lambda3
        self.lambda4 = lambda4
        self.p_A_vk = p_A_vk
        self.p_s_vk = p_s_vk
        self.p_Aj_vk = p_Aj_vk
        self.p_sj_vk = p_sj_vk

    def train(self, data):
        """Run ``self.epochs`` passes of mini-batch CD-1 with momentum over ``data``.

        @param data: 2-D array of word counts, one row per document and one
                     column per visible unit. The final mini-batch may hold
                     fewer than ``mbtsz`` rows.
        Prints the epoch index at the start of every epoch and updates
        ``W``, ``vbias`` and ``hbias`` in place.
        """
        for epoch in range(self.epochs):
            print(epoch)
            # Shuffling was left disabled in the original notebook:
            #self.np_rng.shuffle(data)
            for start in range(0, data.shape[0], self.mbtsz):
                mData = data[start:start + self.mbtsz]
                ph_mean, nv_samples, nh_means = self.cd_k(mData)

                # Momentum accumulators: the weight term is summed over the
                # batch while both bias terms are batch means.
                self.mW = self.mW * self.mrate + (np.dot(mData.T, ph_mean) - np.dot(nv_samples.T, nh_means))
                self.mvbias = self.mvbias * self.mrate + np.mean(mData - nv_samples, axis=0)
                self.mhbias = self.mhbias * self.mrate + np.mean(ph_mean - nh_means, axis=0)

                # No prior terms in this variant: pure CD update.
                self.W += self.eta * self.mW
                self.vbias += self.eta * self.mvbias
                self.hbias += self.eta * self.mhbias

    def cd_k(self, data, k=1):
        """Run k steps of Gibbs sampling starting from ``data``.

        @return: (ph_mean, nv_samples, nh_means) -- hidden probabilities given
                 the data, the visible sample of the last step and the hidden
                 probabilities given that sample.
        @raise ValueError: if k < 1 (no negative-phase statistics would exist).
        """
        if k < 1:
            raise ValueError("k must be >= 1")
        D = data.sum(axis=1)
        ph_mean, h_samples = self.sample_h(data, D)

        for _ in range(k):
            nv_means, nv_samples, nh_means, h_samples = self.gibbs_hvh(h_samples, D)
        return ph_mean, nv_samples, nh_means

    def sample_h(self, v0_sample, D):
        """Hidden probabilities and Bernoulli samples given visible counts.

        The hidden bias is multiplied by each document's total count ``D``.
        """
        h1_mean = sigmoid(np.dot(v0_sample, self.W) + np.outer(D, self.hbias))
        h1_sample = self.np_rng.binomial(size=h1_mean.shape, n=1, p=h1_mean)
        return [h1_mean, h1_sample]

    def sample_v(self, h0_sample, D):
        """Softmax word distribution and multinomial count samples given hidden states.

        The row count is taken from ``h0_sample`` rather than ``self.mbtsz`` so
        a short final mini-batch is handled correctly.
        """
        n_docs = h0_sample.shape[0]
        x = np.dot(h0_sample, self.W.T)
        x = np.clip(x, -500, 500)
        pre_soft = np.exp(x + self.vbias)
        v1_mean = pre_soft / pre_soft.sum(axis=1, keepdims=True)
        v1_sample = np.zeros((n_docs, v1_mean.shape[1]))
        for row in range(n_docs):
            # multinomial needs an integer number of trials; D may be a float sum.
            v1_sample[row] = self.np_rng.multinomial(size=1, n=int(D[row]), pvals=v1_mean[row])
        return [v1_mean, v1_sample]

    def gibbs_hvh(self, h0_sample, D):
        """One Gibbs step hidden -> visible -> hidden."""
        v1_mean, v1_sample = self.sample_v(h0_sample, D)
        h1_mean, h1_sample = self.sample_h(v1_sample, D)
        return [v1_mean, v1_sample, h1_mean, h1_sample]

    def wordPredict(self, topic, voc, top_n=30):
        """Print the ``top_n`` most probable words for each hidden unit.

        @param topic: number of hidden units to report (one one-hot vector each)
        @param voc:   mapping from column index 0..len(voc)-1 to word
        @param top_n: how many words to print per topic (default 30)
        """
        for i, vecTopic in enumerate(np.eye(topic)):
            pre_soft = np.exp(np.dot(vecTopic, self.W.T) + self.vbias)
            word_distribution = pre_soft / pre_soft.sum()
            tmpDict = {}
            for j in range(len(voc)):
                tmpDict[voc[j]] = word_distribution[j]
            # Single pre-formatted argument prints the same text on Python 2 and 3.
            print('topic %s : %s' % (i, vecTopic))
            ranked = sorted(tmpDict.items(), key=lambda x: x[1], reverse=True)
            for word, prob in ranked[:top_n]:
                print('%s %s' % (word, prob))
            print('-')

    def run_visible(self, data, t):
        """Assign each example to its most probable hidden unit(s).

        @param data: 2-D array, one row per example
        @param t:    unused in this variant (kept so callers need not change)
        @return: float array (num_examples, n_hidden) with 1.0 at every
                 position whose probability equals the row maximum (ties all
                 get 1.0) and 0.0 elsewhere.
        NOTE(review): the hidden bias is added unscaled here, whereas
        ``sample_h`` multiplies it by the document length -- confirm intended.
        """
        hidden_activations = np.dot(data, self.W) + self.hbias
        hidden_probs = self._logistic(hidden_activations)
        row_max = hidden_probs.max(axis=1, keepdims=True)
        return (hidden_probs == row_max).astype(float)

    def _logistic(self, x):
        """Logistic function with the input clipped to [-500, 500]."""
        x = np.clip(x, -500, 500)
        return 1.0 / (1 + np.exp(-x))

    def saveParams(self, filePath):
        """Pickle ``W``, ``vbias`` and ``hbias`` as a dict to ``filePath``."""
        params = {'W': self.W,
                  'vbias': self.vbias,
                  'hbias': self.hbias}
        # Binary mode plus a context manager: pickle output is bytes and the
        # handle must be closed so the data is flushed.
        with open(filePath, 'wb') as fh:
            cPickle.dump(params, fh)

def inputData(filePath):
    """Build a document-term count matrix from a one-document-per-line text file.

    @param filePath: path of the corpus; tokens are separated by whitespace.
    @return: (v, id2word) where ``v`` is a (num_docs, vocab_size) float array
             of word counts and ``id2word`` maps column index -> word.
             Column ids follow the order in which words are first seen.
    """
    docs = []
    # Looking up an unseen word assigns it the next free id.
    voc = defaultdict(lambda: len(voc))
    # The context manager closes the file even when a line fails to parse.
    with open(filePath, "r") as handle:
        for line in handle:
            doc = line.rstrip().split()
            for word in doc:
                voc[word]
            docs.append(Counter(doc))

    docSize, vocSize = len(docs), len(voc)
    v = np.zeros((docSize, vocSize))
    for row in range(docSize):
        for word, freq in docs[row].items():
            v[row][voc[word]] = freq
    return v, {wID: word for word, wID in voc.items()}

In [56]:
# Baseline run: 6 hidden units, all prior weights set to zero.
topic = 6

prior_tables = dict(p_A_vk=p_A_vk, p_s_vk=p_s_vk, p_Aj_vk=p_Aj_vk, p_sj_vk=p_sj_vk)
prior_weights = dict(lambda1=0, lambda2=0, lambda3=0, lambda4=0)

rbm = RBM(
    n_visible=len(docs[0]),
    n_hidden=topic,
    mbtsz=50,
    epochs=10,
    eta=0.11,
    mrate=0.8,
    np_rng=np.random.RandomState(1234),
    **dict(prior_weights, **prior_tables)
)

training_docs = docs[0:15000]
rbm.train(training_docs)
rbm.wordPredict(topic, voc)


0
1
2
3
4
5
6
7
8
9
topic 0 : [ 1.  0.  0.  0.  0.  0.]
good 0.139966849224
experience 0.0779364736623
great 0.0555212040315
price 0.0373843252411
dessert 0.035528114858
meal 0.026523199009
overall 0.0263259125905
bad 0.0217774662859
excellent 0.0199434301266
reasonable 0.0174974458685
really 0.0174325033738
beat 0.0146590172928
dine 0.0126927158176
better 0.0126311046985
nyc 0.010261263577
best 0.00977728540716
value 0.00917016001272
taste 0.00895001945997
everythe 0.00876463563418
stay 0.00662260047327
miss 0.00606548004447
say 0.00573404857636
definitely 0.00573302544249
love 0.0053613758348
away 0.00532709636927
pizza 0.00500847198244
coffee 0.00496461295363
fresh 0.00435417528327
appetizer 0.00425361075954
deal 0.00415288968704
-
topic 1 : [ 0.  1.  0.  0.  0.  0.]
shrimp 0.0217155005559
delicious 0.0193180794588
appetizer 0.0159969589477
steak 0.0129906417659
mouth 0.0122168552184
melt 0.0106350006175
potatoe 0.010469052448
cake 0.00998491117504
mushroom 0.00976518161897
bas 0.00

In [57]:
from sklearn.metrics import precision_score,recall_score,f1_score

label = np.loadtxt('labels')[0:15000]
# The second argument 0 is the threshold value handed to run_visible.
result = rbm.run_visible(docs[0:15000], 0)

# Each pair is (label column, hidden-unit column) scored against each other;
# precision, recall and F1 are printed in that order for every pair.
for label_col, unit_col in [(0, 1), (1, 3), (2, 2)]:
    for metric in (precision_score, recall_score, f1_score):
        print(metric(label[:, label_col], result[:, unit_col]))

0.684947721444
0.672216844143
0.678522571819
0.654501216545
0.29096809086
0.402845376264
0.191881918819
0.223752151463
0.20659515296
