add 2nd order moments

chyikwei · Dec 28, 2017 · 9e952f9 · 9e952f9
1 parent 1f7e4a4
commit 9e952f9
Show file tree

Hide file tree

Showing 2 changed files with 90 additions and 2 deletions.
diff --git a/tensor_lda/moments.py b/tensor_lda/moments.py
@@ -1,4 +1,5 @@
 import numpy as np
+from numpy import linalg as LA
 import scipy.sparse as sp
 
 from sklearn.externals.six.moves import xrange
@@ -8,6 +9,7 @@
 from sklearn.utils.validation import check_non_negative
 
 
+
 def first_order_moments(X, min_words, whom):
     """First-Order Moments
     
@@ -94,7 +96,7 @@ def cooccurrence_expectation(X, min_words, whom, batch_size=1000):
 
     Returns
     -------
-    e2 : sparse array, shape=(n_features, n_features)
+    e2 : sparse matrix, shape=(n_features, n_features)
         Expectation of word pairs
     
     ignored: integer
@@ -187,3 +189,49 @@ def cooccurrence_expectation(X, min_words, whom, batch_size=1000):
     e2 /= (n_samples - ignored_docs)
     e2 += sp.triu(e2, k=1).T
     return (e2, ignored_docs)
+
+
+def second_order_moments(n_components, e2, m1, alpha0):
+    """Second-Order Moments
+
+    To prevent creating 2nd order moments explicitly, we construct its
+    decomposition with `n_components`. check reference [?] section 5.2
+    for details.
+
+    Parameters
+    ----------
+    n_components: int
+        Number of components
+
+    e2: sparse matrix, shape=(n_features, n_features)
+        Expectation of word pairs. e2[i, j] is the expectation of word `i`
+        and `j` in the same document.
+    
+    m1: array, shape=(n_features,)
+        Expectation of each words.
+
+    alpha0: double
+        Sum of topic topic concentration parameter
+    
+    Returns
+    -------
+    m2_vals : array, shape=(n_components,)
+        eigen values of sencond-order moments
+
+    m2_vecs : array, shape=(n_features, n_components)
+        eigen values of sencond-order moments        
+    """
+
+    # eigen values and vectors of E2
+    e2_vals, e2_vecs = sp.linalg.eigsh(e2, k=n_components)
+    m1_p = np.dot(e2_vecs.T, m1)
+    m2_p = ((-1. * alpha0) / (alpha0 + 1.)) * (m1_p * m1_p[:, np.newaxis])
+    m2_p[np.diag_indices_from(m2_p)] += e2_vals
+
+    # eigen values and vectors of M2 prime 
+    m2p_vals, m2p_vecs = LA.eigh(m2_p)
+
+    m2_vals = m2p_vals
+    m2_vecs = np.dot(e2_vecs, m2p_vecs)
+
+    return (m2_vals, m2_vecs)
diff --git a/tensor_lda/tests/test_moments.py b/tensor_lda/tests/test_moments.py
@@ -11,7 +11,8 @@
 from sklearn.externals.six.moves import xrange
 
 from tensor_lda.moments import (first_order_moments,
-                                cooccurrence_expectation)
+                                cooccurrence_expectation,
+                                second_order_moments)
 
 
 def test_first_order_moments():
@@ -115,3 +116,42 @@ def test_cooccurrence_expectation():
     assert_greater(ignored_cnt, 0)
     assert_equal(mask.sum(), n_samples - ignored_cnt)
     assert_array_almost_equal(result, e2.toarray())
+    # cooccurrence should be symmertic
+    assert_array_almost_equal(result, e2.toarray().T)
+
+
+def test_second_order_moments():
+    # compare create M2 directly vs create eigen value
+    # and vectors with optimized method
+    rng = np.random.RandomState(0)
+
+    n_features = 500
+    n_components = 50
+    min_count = 3
+    alpha0 = 20.
+    n_samples = rng.randint(100, 150)
+    doc_word_mtx = rng.randint(0, 3, size=(n_samples, n_features)).astype('float')
+    doc_word_mtx = sp.csr_matrix(doc_word_mtx)
+
+    m1, _ = first_order_moments(doc_word_mtx, min_words=min_count,
+                                whom='test_second_order_moments')
+    e2, _ = cooccurrence_expectation(doc_word_mtx, min_words=min_count,
+                                     whom='test_second_order_moments')
+
+    # create M2 directly
+    m2 = e2.toarray()
+    m2 -= (alpha0 / (alpha0 + 1.)) * (m1 * m1[:, np.newaxis])
+    m2_vals_true, m2_vecs_true = sp.linalg.eigsh(m2, k=n_components)
+
+    # create M2 eigen values & vectors with optimized method
+    m2_vals, m2_vecs = second_order_moments(n_components, e2, m1, alpha0)
+    assert_equal(m2_vals.shape[0], n_components)
+    assert_equal(m2_vecs.shape[0], n_features)
+    assert_equal(m2_vecs.shape[1], n_components)
+
+    m2_reconstruct_true = np.dot(np.dot(m2_vecs_true, np.diag(m2_vals_true)), m2_vecs_true.T)
+    m2_reconstruct = np.dot(np.dot(m2_vecs, np.diag(m2_vals)), m2_vecs.T)
+    # compare reconstructed version
+    assert_array_almost_equal(m2_reconstruct_true, m2_reconstruct)
+    # compare original M2 with reconstructed version
+    assert_array_almost_equal(m2, m2_reconstruct, decimal=5)