-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Basic Serafin et al. '03 implementation
- Loading branch information
Showing
8 changed files
with
184 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
"""Implementation of Latent Semantic Analysis for dialogue act classification. | ||
Useful links | ||
------------- | ||
* http://blog.josephwilk.net/projects/latent-semantic-analysis-in-python.html | ||
suggests to use:: | ||
U . SIGMA' . VT = MATRIX' | ||
for the closest document look up. | ||
References | ||
---------- | ||
Serafin, Riccardo, Barbara Di Eugenio, and Michael Glass. "Latent Semantic | ||
Analysis for dialogue act classification." Proceedings of the 2003 Conference | ||
of the North American Chapter of the Association for Computational Linguistics | ||
on Human Language Technology: companion volume of the Proceedings of HLT-NAACL | ||
2003--short papers-Volume 2. Association for Computational Linguistics, 2003. | ||
""" | ||
import numpy as np | ||
|
||
from sklearn.base import BaseEstimator, ClassifierMixin | ||
from sparsesvd import sparsesvd | ||
|
||
|
||
class PlainLSA(BaseEstimator, ClassifierMixin):
    """Plain Latent Semantic Analysis classifier (work in progress).

    ``fit`` stores the training labels and the product of the truncated SVD
    factors of the training matrix.  ``predict`` is not implemented yet.

    Parameters
    ----------
    k : int
        Number of singular values requested from ``sparsesvd``.

    Attributes
    ----------
    y
        The labels passed to ``fit``, stored unchanged.
    M : numpy.ndarray
        ``ut.T . diag(s) . vt`` computed from the factors returned by
        ``sparsesvd``.
    """

    def __init__(self, k=100):
        self.k = k

    def fit(self, X, y):
        """Decompose ``X`` and remember the labels ``y``.

        Parameters
        ----------
        X
            Training matrix; anything ``scipy.sparse.csc_matrix`` accepts.
        y
            Labels associated with the training data.

        Returns
        -------
        PlainLSA
            The estimator itself, so that calls can be chained.
        """
        # Imported locally so that this block does not depend on a new
        # module-level import.
        from scipy.sparse import csc_matrix

        self.y = y

        # sparsesvd operates on CSC sparse matrices, so convert dense input
        # (or other sparse formats) before decomposing.
        ut, s, vt = sparsesvd(csc_matrix(X), self.k)
        self.M = np.dot(ut.T, np.dot(np.diag(s), vt))

        return self

    def predict(self, X):
        """Not implemented yet; always raises ``NotImplementedError``."""
        raise NotImplementedError()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
"""Implementation of Latent Semantic Analysis for dialogue act classification. | ||
Useful links | ||
------------- | ||
* http://blog.josephwilk.net/projects/latent-semantic-analysis-in-python.html | ||
suggests to use:: | ||
U . SIGMA' . VT = MATRIX' | ||
for the closest document look up. | ||
References | ||
---------- | ||
Serafin, Riccardo, Barbara Di Eugenio, and Michael Glass. "Latent Semantic | ||
Analysis for dialogue act classification." Proceedings of the 2003 Conference | ||
of the North American Chapter of the Association for Computational Linguistics | ||
on Human Language Technology: companion volume of the Proceedings of HLT-NAACL | ||
2003--short papers-Volume 2. Association for Computational Linguistics, 2003. | ||
""" | ||
import numpy as np | ||
from scipy.sparse import csc_matrix | ||
from scipy.spatial.distance import cosine | ||
from sklearn.base import BaseEstimator, ClassifierMixin | ||
from sparsesvd import sparsesvd | ||
|
||
|
||
class PlainLSA(BaseEstimator, ClassifierMixin):
    """Plain Latent Semantic Analysis nearest-neighbour classifier.

    ``fit`` stores the training labels together with the product of the
    truncated SVD factors of the training matrix.  ``predict`` returns the
    label of the stored row that is closest, by cosine distance, to the
    query vector.

    Parameters
    ----------
    k : int
        Number of singular values requested from ``sparsesvd``.

    Attributes
    ----------
    y
        The labels passed to ``fit``, stored unchanged.
    M : numpy.ndarray
        ``ut.T . diag(s) . vt`` computed from the factors returned by
        ``sparsesvd``.  Its rows are paired positionally with ``y``.
    """

    def __init__(self, k=100):
        self.k = k

    def fit(self, X, y):
        """Decompose ``X`` and remember the labels ``y``.

        Parameters
        ----------
        X
            Training matrix; converted with ``scipy.sparse.csc_matrix``.
        y
            Iterable of labels, one per row of the matrix stored in ``M``.

        Returns
        -------
        PlainLSA
            The estimator itself, so that calls can be chained.
        """
        X = csc_matrix(X)
        self.y = y

        ut, s, vt = sparsesvd(X, self.k)
        self.M = np.dot(ut.T, np.dot(np.diag(s), vt))

        return self

    def predict(self, X):
        """Return the label of the stored row nearest to the vector ``X``.

        Parameters
        ----------
        X
            A single query vector, comparable by
            ``scipy.spatial.distance.cosine`` with the rows of ``M``.

        Returns
        -------
        The element of ``y`` paired with the row of ``M`` that has the
        smallest cosine distance to ``X``.
        """
        # Compare on the distance only.  Comparing whole (distance, label)
        # tuples would fall back to ordering the labels on a distance tie,
        # which raises TypeError for labels that cannot be ordered.
        _, label = min(
            ((cosine(row, X), label) for label, row in zip(self.y, self.M)),
            key=lambda scored: scored[0],
        )
        return label
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
"""Implementation of Latent Semantic Analysis for dialogue act classification.""" | ||
import numpy as np | ||
|
||
from fowler.corpora.main.options import Dispatcher | ||
|
||
from .classifier import PlainLSA | ||
|
||
dispatcher = Dispatcher() | ||
command = dispatcher.command | ||
|
||
|
||
@command()
def plain_lsa(
    cooccurrence_matrix,
    k=('k', 100, 'The number of dimensions after SVD applicaion.'),
):
    """Perform the Plain LSA method."""
    X = cooccurrence_matrix
    # Placeholder labels: one zero per element of X.  The NumPy function is
    # ``zeros``; ``np.zeroes`` does not exist and raised AttributeError.
    y = np.zeros(len(X))

    c = PlainLSA(k)
    c.fit(X, y)

    # NOTE(review): leftover interactive breakpoint.  It is kept because the
    # command currently produces no other output, but it should be removed
    # once the command reports or stores its result.
    import ipdb
    ipdb.set_trace()
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
import numpy as np | ||
|
||
from fowler.corpora.serafim03.classifier import PlainLSA | ||
|
||
import pytest | ||
|
||
|
||
@pytest.fixture
def word_document_matrix():
    """Return a small word-document matrix for a toy dialog.

    The utterances are::

        - How are you?
        - I am fine, thank you.
        - Are you OK?
        - Yes, I am.
        - Am I OK?
        - No, you are not.

    Punctuation is ignored, the utterance tags are Q and A, for the questions
    and the answers respectively.

    Rows in the matrix correspond to the words, columns to the documents.
    """
    word_rows = [
        [1, 0, 0, 0, 0, 0],  # how
        [1, 0, 1, 0, 0, 1],  # are
        [1, 1, 1, 0, 0, 1],  # you
        [0, 1, 0, 1, 1, 0],  # i
        [0, 1, 0, 1, 1, 0],  # am
        [0, 1, 0, 0, 0, 0],  # fine
        [0, 1, 0, 0, 0, 0],  # thank
        [0, 0, 1, 0, 1, 0],  # ok
        [0, 0, 0, 1, 0, 0],  # yes
        [0, 0, 0, 0, 0, 1],  # no
        [0, 0, 0, 0, 0, 1],  # not
    ]
    return np.matrix(word_rows)
|
||
|
||
@pytest.fixture
def y():
    """Return the labels for the toy dialog: the strings '0' through '5'."""
    labels = [str(index) for index in range(6)]
    return np.array(labels)
|
||
|
||
@pytest.mark.parametrize(
    ('vector', 'expected_label'),
    [
        # How are you?
        (np.array([1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]), '0'),
        # I am fine, thank you.
        (np.array([0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0]), '1'),
        # I am *OK*, thank you.
        (np.array([0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0]), '1'),
    ],
)
def test_plainlsa(word_document_matrix, y, vector, expected_label):
    """Check that a query vector is assigned the expected document label."""
    # The classifier is fitted on the transposed word-document matrix.
    documents = word_document_matrix.T

    classifier = PlainLSA(2)
    classifier.fit(documents, y)

    assert classifier.predict(vector) == expected_label