Skip to content

Commit

Permalink
Basic Serafin et al. '03 implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
dimazest committed Nov 6, 2013
1 parent 9302730 commit 2b6f0d9
Show file tree
Hide file tree
Showing 8 changed files with 184 additions and 3 deletions.
6 changes: 3 additions & 3 deletions fowler/corpora/io.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
"""IO functions."""

import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import csc_matrix


def load_cooccurrence_matrix(store):
def load_cooccurrence_matrix(store, matrix_type=csc_matrix):
"""Load a co-occurrence matrix from a store."""

ij = np.vstack((
store['row_ids'].values,
store['col_ids'].values,
))

matrix = csr_matrix((
matrix = matrix_type((
store['data'].values,
ij,
))
Expand Down
File renamed without changes.
File renamed without changes.
40 changes: 40 additions & 0 deletions fowler/corpora/serafim03/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""Implementation of Latent Semantic Analysis for dialogue act classification.
Useful links
-------------
* http://blog.josephwilk.net/projects/latent-semantic-analysis-in-python.html
suggests to use::
U . SIGMA' . VT = MATRIX'
for the closest document look up.
References
----------
Serafin, Riccardo, Barbara Di Eugenio, and Michael Glass. "Latent Semantic
Analysis for dialogue act classification." Proceedings of the 2003 Conference
of the North American Chapter of the Association for Computational Linguistics
on Human Language Technology: companion volume of the Proceedings of HLT-NAACL
2003--short papers-Volume 2. Association for Computational Linguistics, 2003.
"""
import numpy as np

from sklearn.base import BaseEstimator, ClassifierMixin
from sparsesvd import sparsesvd


class PlainLSA(BaseEstimator, ClassifierMixin):
    """Plain Latent Semantic Analysis model (after Serafin et al., 2003).

    The training matrix is replaced by a truncated-SVD reconstruction using
    at most ``k`` singular triplets. Prediction is not implemented in this
    module.

    Parameters
    ----------
    k : int
        The number of singular triplets requested from ``sparsesvd``.

    """

    def __init__(self, k=100):
        self.k = k

    def fit(self, X, y):
        """Compute the truncated-SVD reconstruction of ``X``.

        Parameters
        ----------
        X : matrix-like
            The training matrix; anything ``scipy.sparse.csc_matrix`` accepts.
        y : sequence
            The labels; stored unchanged as ``self.y``.

        Returns
        -------
        self : PlainLSA
            The fitted estimator, so that calls can be chained.

        """
        # Imported locally because this module does not import scipy.sparse
        # at the top level. sparsesvd is documented to take a CSC matrix.
        from scipy.sparse import csc_matrix

        X = csc_matrix(X)
        self.y = y

        ut, s, vt = sparsesvd(X, self.k)
        # ``ut.T * s`` scales column j of ``ut.T`` by ``s[j]``, which equals
        # ``ut.T . diag(s)`` without materialising the dense diagonal matrix.
        self.M = np.dot(ut.T * s, vt)

        return self

    def predict(self, X):
        """Not implemented for this estimator."""
        raise NotImplementedError()
46 changes: 46 additions & 0 deletions fowler/corpora/serafim03/classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""Implementation of Latent Semantic Analysis for dialogue act classification.
Useful links
-------------
* http://blog.josephwilk.net/projects/latent-semantic-analysis-in-python.html
suggests to use::
U . SIGMA' . VT = MATRIX'
for the closest document look up.
References
----------
Serafin, Riccardo, Barbara Di Eugenio, and Michael Glass. "Latent Semantic
Analysis for dialogue act classification." Proceedings of the 2003 Conference
of the North American Chapter of the Association for Computational Linguistics
on Human Language Technology: companion volume of the Proceedings of HLT-NAACL
2003--short papers-Volume 2. Association for Computational Linguistics, 2003.
"""
import numpy as np
from scipy.sparse import csc_matrix
from scipy.spatial.distance import cosine
from sklearn.base import BaseEstimator, ClassifierMixin
from sparsesvd import sparsesvd


class PlainLSA(BaseEstimator, ClassifierMixin):
    """Nearest-neighbour classifier over a truncated-SVD reconstruction.

    ``fit`` replaces the training matrix by its reconstruction from at most
    ``k`` singular triplets. ``predict`` returns the label of the
    reconstructed training row with the smallest cosine distance to the
    query vector.

    Parameters
    ----------
    k : int
        The number of singular triplets requested from ``sparsesvd``.

    """

    def __init__(self, k=100):
        self.k = k

    def fit(self, X, y):
        """Fit the model.

        Parameters
        ----------
        X : matrix-like
            The training matrix, one row per training item; anything
            ``scipy.sparse.csc_matrix`` accepts.
        y : sequence
            One label per row of ``X``.

        Returns
        -------
        self : PlainLSA
            The fitted estimator, so that calls can be chained.

        Raises
        ------
        ValueError
            If the number of labels differs from the number of rows in ``X``.

        """
        X = csc_matrix(X)
        if len(y) != X.shape[0]:
            # ``predict`` pairs labels with rows using ``zip``, which would
            # otherwise silently drop the unmatched rows or labels.
            raise ValueError(
                'Expected one label per row of X: got {0} labels for {1} '
                'rows.'.format(len(y), X.shape[0])
            )
        self.y = y

        ut, s, vt = sparsesvd(X, self.k)
        # ``ut.T * s`` scales column j of ``ut.T`` by ``s[j]``, which equals
        # ``ut.T . diag(s)`` without materialising the dense diagonal matrix.
        self.M = np.dot(ut.T * s, vt)

        return self

    def predict(self, X):
        """Predict the label(s) of the closest training row(s).

        Parameters
        ----------
        X : array-like
            Either a single 1-D feature vector or a 2-D array holding one
            feature vector per row.

        Returns
        -------
        label or numpy.ndarray
            A single label for 1-D input, an array of labels (one per row)
            for 2-D input.

        """
        X = np.asarray(X)
        if X.ndim == 1:
            return self._closest_label(X)

        return np.array([self._closest_label(row) for row in X])

    def _closest_label(self, vector):
        """Return the label of the row of ``self.M`` closest to ``vector``."""
        # Compare on the distance only: comparing whole ``(distance, label)``
        # tuples falls back to comparing labels when distances tie, which
        # fails for labels that cannot be ordered. On ties the first training
        # row wins.
        _, label = min(
            (
                (cosine(row, vector), label)
                for label, row in zip(self.y, self.M)
            ),
            key=lambda pair: pair[0],
        )
        return label
26 changes: 26 additions & 0 deletions fowler/corpora/serafim03/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Implementation of Latent Semantic Analysis for dialogue act classification."""
import numpy as np

from fowler.corpora.main.options import Dispatcher

from .classifier import PlainLSA

# Module-level command dispatcher. ``command`` is its decorator factory,
# applied below as ``@command()`` (presumably to register sub-commands;
# see ``fowler.corpora.main.options`` to confirm).
dispatcher = Dispatcher()
command = dispatcher.command


@command()
def plain_lsa(
    cooccurrence_matrix,
    k=('k', 100, 'The number of dimensions after SVD applicaion.'),
):
    """Perform the Plain LSA method.

    Fits a ``PlainLSA`` model with ``k`` dimensions on the co-occurrence
    matrix, using an all-zero placeholder label for every row.
    """
    X = cooccurrence_matrix
    # ``X.shape[0]`` rather than ``len(X)``: scipy sparse matrices raise
    # TypeError on ``len()``. ``np.zeros`` is the correct NumPy name.
    y = np.zeros(X.shape[0])

    c = PlainLSA(k)
    c.fit(X, y)

    # NOTE(review): development hook kept from the original; it drops into an
    # interactive debugger so the fitted model can be inspected. Remove it
    # (and the ipdb dependency) once the command produces real output.
    import ipdb
    ipdb.set_trace()


1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def run_tests(self):
'scikit-learn',
'scipy',
'setuptools',
'sparsesvd-cffi',
'tables',
],
entry_points={
Expand Down
68 changes: 68 additions & 0 deletions test/test_serafin03.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import numpy as np

from fowler.corpora.serafim03.classifier import PlainLSA

import pytest


@pytest.fixture
def word_document_matrix():
    """Return the word-document count matrix of a six-utterance toy dialog.

    The utterances are::

        - How are you?
        - I am fine, thank you.
        - Are you OK?
        - Yes, I am.
        - Am I OK?
        - No, you are not.

    Punctuation is ignored. Each row corresponds to a word and each column
    to an utterance (document); a cell is 1 when the word occurs in the
    utterance and 0 otherwise.

    """
    counts_by_word = (
        ('how', (1, 0, 0, 0, 0, 0)),
        ('are', (1, 0, 1, 0, 0, 1)),
        ('you', (1, 1, 1, 0, 0, 1)),
        ('i', (0, 1, 0, 1, 1, 0)),
        ('am', (0, 1, 0, 1, 1, 0)),
        ('fine', (0, 1, 0, 0, 0, 0)),
        ('thank', (0, 1, 0, 0, 0, 0)),
        ('ok', (0, 0, 1, 0, 1, 0)),
        ('yes', (0, 0, 0, 1, 0, 0)),
        ('no', (0, 0, 0, 0, 0, 1)),
        ('not', (0, 0, 0, 0, 0, 1)),
    )

    # Only the counts go into the matrix; the words document the row order.
    return np.matrix([counts for _, counts in counts_by_word])


@pytest.fixture
def y():
    """Return one single-character label, '0' to '5', per toy utterance."""
    labels = ['0', '1', '2', '3', '4', '5']
    return np.array(labels)


@pytest.mark.parametrize(
    ('vector', 'expected_label'),
    (
        # How are you?
        (np.array([1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]), '0'),
        # I am fine, thank you.
        (np.array([0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0]), '1'),
        # I am *OK*, thank you.
        (np.array([0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0]), '1'),
    ),
)
def test_plainlsa(word_document_matrix, y, vector, expected_label):
    """A query utterance gets the label of the closest training utterance."""
    # The classifier is fitted with one document per row, so transpose the
    # word-by-document fixture.
    documents = word_document_matrix.T

    classifier = PlainLSA(2)
    classifier.fit(documents, y)

    predicted = classifier.predict(vector)
    assert predicted == expected_label

0 comments on commit 2b6f0d9

Please sign in to comment.