Skip to content

Commit

Permalink
add 2nd order moments
Browse files Browse the repository at this point in the history
  • Loading branch information
chyikwei committed Dec 28, 2017
1 parent 1f7e4a4 commit 9e952f9
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 2 deletions.
50 changes: 49 additions & 1 deletion tensor_lda/moments.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import numpy as np
from numpy import linalg as LA
import scipy.sparse as sp

from sklearn.externals.six.moves import xrange
Expand All @@ -8,6 +9,7 @@
from sklearn.utils.validation import check_non_negative



def first_order_moments(X, min_words, whom):
"""First-Order Moments
Expand Down Expand Up @@ -94,7 +96,7 @@ def cooccurrence_expectation(X, min_words, whom, batch_size=1000):
Returns
-------
e2 : sparse array, shape=(n_features, n_features)
e2 : sparse matrix, shape=(n_features, n_features)
Expectation of word pairs
ignored: integer
Expand Down Expand Up @@ -187,3 +189,49 @@ def cooccurrence_expectation(X, min_words, whom, batch_size=1000):
e2 /= (n_samples - ignored_docs)
e2 += sp.triu(e2, k=1).T
return (e2, ignored_docs)


def second_order_moments(n_components, e2, m1, alpha0):
"""Second-Order Moments
To prevent creating 2nd order moments explicitly, we construct its
decomposition with `n_components`. check reference [?] section 5.2
for details.
Parameters
----------
n_components: int
Number of components
e2: sparse matrix, shape=(n_features, n_features)
Expectation of word pairs. e2[i, j] is the expectation of word `i`
and `j` in the same document.
m1: array, shape=(n_features,)
Expectation of each words.
alpha0: double
Sum of topic topic concentration parameter
Returns
-------
m2_vals : array, shape=(n_components,)
eigen values of sencond-order moments
m2_vecs : array, shape=(n_features, n_components)
eigen values of sencond-order moments
"""

# eigen values and vectors of E2
e2_vals, e2_vecs = sp.linalg.eigsh(e2, k=n_components)
m1_p = np.dot(e2_vecs.T, m1)
m2_p = ((-1. * alpha0) / (alpha0 + 1.)) * (m1_p * m1_p[:, np.newaxis])
m2_p[np.diag_indices_from(m2_p)] += e2_vals

# eigen values and vectors of M2 prime
m2p_vals, m2p_vecs = LA.eigh(m2_p)

m2_vals = m2p_vals
m2_vecs = np.dot(e2_vecs, m2p_vecs)

return (m2_vals, m2_vecs)
42 changes: 41 additions & 1 deletion tensor_lda/tests/test_moments.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
from sklearn.externals.six.moves import xrange

from tensor_lda.moments import (first_order_moments,
cooccurrence_expectation)
cooccurrence_expectation,
second_order_moments)


def test_first_order_moments():
Expand Down Expand Up @@ -115,3 +116,42 @@ def test_cooccurrence_expectation():
assert_greater(ignored_cnt, 0)
assert_equal(mask.sum(), n_samples - ignored_cnt)
assert_array_almost_equal(result, e2.toarray())
# cooccurrence should be symmertic
assert_array_almost_equal(result, e2.toarray().T)


def test_second_order_moments():
# compare create M2 directly vs create eigen value
# and vectors with optimized method
rng = np.random.RandomState(0)

n_features = 500
n_components = 50
min_count = 3
alpha0 = 20.
n_samples = rng.randint(100, 150)
doc_word_mtx = rng.randint(0, 3, size=(n_samples, n_features)).astype('float')
doc_word_mtx = sp.csr_matrix(doc_word_mtx)

m1, _ = first_order_moments(doc_word_mtx, min_words=min_count,
whom='test_second_order_moments')
e2, _ = cooccurrence_expectation(doc_word_mtx, min_words=min_count,
whom='test_second_order_moments')

# create M2 directly
m2 = e2.toarray()
m2 -= (alpha0 / (alpha0 + 1.)) * (m1 * m1[:, np.newaxis])
m2_vals_true, m2_vecs_true = sp.linalg.eigsh(m2, k=n_components)

# create M2 eigen values & vectors with optimized method
m2_vals, m2_vecs = second_order_moments(n_components, e2, m1, alpha0)
assert_equal(m2_vals.shape[0], n_components)
assert_equal(m2_vecs.shape[0], n_features)
assert_equal(m2_vecs.shape[1], n_components)

m2_reconstruct_true = np.dot(np.dot(m2_vecs_true, np.diag(m2_vals_true)), m2_vecs_true.T)
m2_reconstruct = np.dot(np.dot(m2_vecs, np.diag(m2_vals)), m2_vecs.T)
# compare reconstructed version
assert_array_almost_equal(m2_reconstruct_true, m2_reconstruct)
# compare original M2 with reconstructed version
assert_array_almost_equal(m2, m2_reconstruct, decimal=5)

0 comments on commit 9e952f9

Please sign in to comment.