Basic Serafin et al. '03 implementation

dimazest · Nov 6, 2013 · 2b6f0d9 · 2b6f0d9
1 parent 9302730
commit 2b6f0d9
Show file tree

Hide file tree

Showing 8 changed files with 184 additions and 3 deletions.
diff --git a/fowler/corpora/io.py b/fowler/corpora/io.py
@@ -1,18 +1,18 @@
 """IO functions."""
 
 import numpy as np
-from scipy.sparse import csr_matrix
+from scipy.sparse import csc_matrix
 
 
-def load_cooccurrence_matrix(store):
+def load_cooccurrence_matrix(store, matrix_type=csc_matrix):
     """Load a co-occurrence matrix from a store."""
 
     ij = np.vstack((
         store['row_ids'].values,
         store['col_ids'].values,
     ))
 
-    matrix = csr_matrix((
+    matrix = matrix_type((
         store['data'].values,
         ij,
     ))

diff --git a/fowler/corpora/main.py → fowler/corpora/main/__init__.py b/fowler/corpora/main.py → fowler/corpora/main/__init__.py
diff --git a/fowler/corpora/options.py → fowler/corpora/main/options.py b/fowler/corpora/options.py → fowler/corpora/main/options.py
diff --git a/fowler/corpora/serafim03/__init__.py b/fowler/corpora/serafim03/__init__.py
@@ -0,0 +1,40 @@
+"""Implementation of Latent Semantic Analysis for dialogue act classification.
+
+Useful links
+------------
+
+    * http://blog.josephwilk.net/projects/latent-semantic-analysis-in-python.html
+      suggests using::
+
+        U . SIGMA' . VT = MATRIX'
+
+    for the closest document lookup.
+
+References
+----------
+
+Serafin, Riccardo, Barbara Di Eugenio, and Michael Glass. "Latent Semantic
+Analysis for dialogue act classification." Proceedings of the 2003 Conference
+of the North American Chapter of the Association for Computational Linguistics
+on Human Language Technology: companion volume of the Proceedings of HLT-NAACL
+2003--short papers-Volume 2. Association for Computational Linguistics, 2003.
+
+"""
+import numpy as np
+
+from sklearn.base import BaseEstimator, ClassifierMixin
+from sparsesvd import sparsesvd
+
+
+class PlainLSA(BaseEstimator, ClassifierMixin):
+    def __init__(self, k=100):
+        self.k = k
+
+    def fit(self, X, y):
+        self.y = y
+
+        ut, s, vt = sparsesvd(X, self.k)
+        self.M = np.dot(ut.T, np.dot(np.diag(s), vt))
+
+    def predict(self, X):
+        raise NotImplementedError()
diff --git a/fowler/corpora/serafim03/classifier.py b/fowler/corpora/serafim03/classifier.py
@@ -0,0 +1,46 @@
+"""Implementation of Latent Semantic Analysis for dialogue act classification.
+
+Useful links
+------------
+
+    * http://blog.josephwilk.net/projects/latent-semantic-analysis-in-python.html
+      suggests using::
+
+        U . SIGMA' . VT = MATRIX'
+
+    for the closest document lookup.
+
+References
+----------
+
+Serafin, Riccardo, Barbara Di Eugenio, and Michael Glass. "Latent Semantic
+Analysis for dialogue act classification." Proceedings of the 2003 Conference
+of the North American Chapter of the Association for Computational Linguistics
+on Human Language Technology: companion volume of the Proceedings of HLT-NAACL
+2003--short papers-Volume 2. Association for Computational Linguistics, 2003.
+
+"""
+import numpy as np
+from scipy.sparse import csc_matrix
+from scipy.spatial.distance import cosine
+from sklearn.base import BaseEstimator, ClassifierMixin
+from sparsesvd import sparsesvd
+
+
+class PlainLSA(BaseEstimator, ClassifierMixin):
+    def __init__(self, k=100):
+        self.k = k
+
+    def fit(self, X, y):
+        X = csc_matrix(X)
+        self.y = y
+
+        ut, s, vt = sparsesvd(X, self.k)
+        self.M = np.dot(ut.T, np.dot(np.diag(s), vt))
+
+    def predict(self, X):
+        _, l = min(
+            (cosine(x, X), l)
+            for l, x in zip(self.y, self.M)
+        )
+        return l
diff --git a/fowler/corpora/serafim03/main.py b/fowler/corpora/serafim03/main.py
@@ -0,0 +1,26 @@
+"""Implementation of Latent Semantic Analysis for dialogue act classification."""
+import numpy as np
+
+from fowler.corpora.main.options import Dispatcher
+
+from .classifier import PlainLSA
+
+dispatcher = Dispatcher()
+command = dispatcher.command
+
+
+@command()
+def plain_lsa(
+    cooccurrence_matrix,
+    k=('k', 100, 'The number of dimensions after SVD applicaion.'),
+):
+    """Perform the Plain LSA method."""
+    X = cooccurrence_matrix
+    y = np.zeroes(len(X))
+
+    c = PlainLSA(k)
+    c.fit(X, y)
+
+    import ipdb; ipdb.set_trace()
+
+
diff --git a/setup.py b/setup.py
@@ -46,6 +46,7 @@ def run_tests(self):
         'scikit-learn',
         'scipy',
         'setuptools',
+        'sparsesvd-cffi',
         'tables',
     ],
     entry_points={

diff --git a/test/test_serafin03.py b/test/test_serafin03.py
@@ -0,0 +1,68 @@
+import numpy as np
+
+from fowler.corpora.serafim03.classifier import PlainLSA
+
+import pytest
+
+
+@pytest.fixture
+def word_document_matrix():
+    """A small word-document matrix that represents a toy dialog.
+
+    The utterances are::
+
+        - How are you?
+        - I am fine, thank you.
+
+        - Are you OK?
+        - Yes, I am.
+
+        - Am I OK?
+        - No, you are not.
+
+    Punctuation is ignored, the utterance tags are Q and A, for the questions
+    and the answers respectively.
+
+    Rows in the matrix correspond to the words, columns to the documents.
+    """
+    return np.matrix((
+        (1, 0, 0, 0, 0, 0),  # how
+        (1, 0, 1, 0, 0, 1),  # are
+        (1, 1, 1, 0, 0, 1),  # you
+        (0, 1, 0, 1, 1, 0),  # i
+        (0, 1, 0, 1, 1, 0),  # am
+        (0, 1, 0, 0, 0, 0),  # fine
+        (0, 1, 0, 0, 0, 0),  # thank
+        (0, 0, 1, 0, 1, 0),  # ok
+        (0, 0, 0, 1, 0, 0),  # yes
+        (0, 0, 0, 0, 0, 1),  # no
+        (0, 0, 0, 0, 0, 1),  # not
+    ))
+
+
+@pytest.fixture
+def y():
+    """The tags for the toy dialog."""
+    return np.array(list('012345'))
+
+
+@pytest.mark.parametrize(
+    ('vector', 'expected_label'),
+    (
+        # How are you?
+        (np.array([1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]), '0'),
+        # I am fine, thank you.
+        (np.array([0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0]), '1'),
+        # I am *OK*, thank you.
+        (np.array([0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0]), '1'),
+    ),
+)
+def test_plainlsa(word_document_matrix, y, vector, expected_label):
+    X = word_document_matrix.T
+
+    cl = PlainLSA(2)
+
+    cl.fit(X, y)
+
+    label = cl.predict(vector)
+    assert label == expected_label