forked from fotisj/pydelta
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
93 additions
and
109 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,3 +18,6 @@ nosetests.xml | |
.python-version | ||
|
||
NOTES | ||
.cache | ||
.pytest_cache | ||
pytest.xml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
import os | ||
from pathlib import Path | ||
|
||
import pytest | ||
|
||
|
||
@pytest.fixture(scope='session') | ||
def testdir() -> str: | ||
return os.fspath(Path(__file__).parent / 'corpus3') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,78 +1,75 @@ | ||
from pytest import approx | ||
|
||
import delta as d | ||
import os | ||
from nose.tools import eq_ | ||
import pytest | ||
|
||
testdir = None | ||
|
||
@pytest.fixture | ||
def feature_generator() -> d.FeatureGenerator: | ||
return d.FeatureGenerator() | ||
|
||
def setup_module(): | ||
global testdir | ||
testdir = os.path.join( | ||
os.path.dirname( | ||
os.path.abspath(__file__)), | ||
'corpus3') | ||
|
||
def test_tokenize(): | ||
assert list(d.FeatureGenerator().tokenize(["This is a", "simple test"])) \ | ||
== ["This", "is", "a", "simple", "test"] | ||
|
||
class FeatureGenerator_Test: | ||
|
||
def setup(self): | ||
self.gen = d.FeatureGenerator() | ||
def test_tokenize_letters(): | ||
fg1 = d.FeatureGenerator(token_pattern=d.LETTERS_PATTERN) | ||
assert list(fg1.tokenize(["I don't like mondays."])) \ | ||
== ["I", "don", "t", "like", "mondays"] | ||
|
||
def test_tokenize(self): | ||
assert list(self.gen.tokenize(["This is a", "simple test"])) \ | ||
== ["This", "is", "a", "simple", "test"] | ||
|
||
def test_tokenize_letters(self): | ||
fg1 = d.FeatureGenerator(token_pattern=d.LETTERS_PATTERN) | ||
assert list(fg1.tokenize(["I don't like mondays."])) \ | ||
== ["I", "don", "t", "like", "mondays"] | ||
def test_tokenize_words(): | ||
fg1 = d.FeatureGenerator(token_pattern=d.WORD_PATTERN) | ||
assert list(fg1.tokenize(["I don't like mondays."])) \ | ||
== ["I", "don't", "like", "mondays"] | ||
|
||
def test_tokenize_words(self): | ||
fg1 = d.FeatureGenerator(token_pattern=d.WORD_PATTERN) | ||
assert list(fg1.tokenize(["I don't like mondays."])) \ | ||
== ["I", "don't", "like", "mondays"] | ||
|
||
def test_count_tokens(self): | ||
result = self.gen.count_tokens( | ||
def test_count_tokens(feature_generator): | ||
result = feature_generator.count_tokens( | ||
["this is a test", "testing this generator"]) | ||
assert result["this"] == 2 | ||
assert result["generator"] == 1 | ||
assert result.sum() == 7 | ||
assert result["this"] == 2 | ||
assert result["generator"] == 1 | ||
assert result.sum() == 7 | ||
|
||
|
||
def test_get_name(feature_generator): | ||
assert feature_generator.get_name('foo/bar.baz.txt') == 'bar.baz' | ||
|
||
def test_get_name(self): | ||
assert self.gen.get_name('foo/bar.baz.txt') == 'bar.baz' | ||
|
||
def test_call(self): | ||
df = self.gen(testdir) | ||
eq_(df.und.sum(), 25738.0) | ||
def test_call_fg(feature_generator, testdir): | ||
df = feature_generator(os.fspath(testdir)) | ||
assert df.loc[:, 'und'].sum() == approx(25738.0) | ||
|
||
class Corpus_Test: | ||
|
||
def parse_test(self): | ||
corpus = d.Corpus(testdir) | ||
eq_(corpus.und.sum(), 25738.0) | ||
## Corpus | ||
|
||
def mfw_test(self): | ||
corpus = d.Corpus(testdir) | ||
rel_corpus = corpus.get_mfw_table(0) | ||
eq_(rel_corpus.sum(axis=1).sum(), 9) | ||
@pytest.fixture(scope='module') | ||
def corpus(testdir): | ||
return d.Corpus(testdir) | ||
|
||
def test_corpus_parse(corpus): | ||
assert corpus.und.sum() == approx(25738.0) | ||
|
||
def test_corpus_mfw(corpus): | ||
rel_corpus = corpus.get_mfw_table(0) | ||
assert rel_corpus.sum(axis=1).sum() == approx(9) | ||
|
||
class Cluster_Test: | ||
|
||
def init_test(self): | ||
# FIXME | ||
corpus = d.Corpus(testdir).get_mfw_table(1000) | ||
deltas = d.functions.cosine_delta(corpus) | ||
hclust = d.Clustering(deltas) | ||
fclust = hclust.fclustering() | ||
print(fclust.describe()) | ||
print(fclust.evaluate()) | ||
assert fclust.data is not None | ||
def test_integration_cluster(corpus): | ||
# FIXME | ||
top1000 = corpus.get_mfw_table(1000) | ||
deltas = d.functions.cosine_delta(top1000) | ||
hclust = d.Clustering(deltas) | ||
fclust = hclust.fclustering() | ||
print(fclust.describe()) | ||
print(fclust.evaluate()) | ||
assert fclust.adjusted_rand_index() == 1 | ||
|
||
class Table_Describer_Test: | ||
|
||
def md_test(self): | ||
corpus = d.Corpus(testdir, document_describer=d.util.TableDocumentDescriber(testdir + '.csv', 'Author', 'Title')) | ||
assert corpus.document_describer.group_name(corpus.index[-1]) == 'Raabe' | ||
def test_table_describer(testdir): | ||
corpus = d.Corpus(testdir, | ||
document_describer=d.util.TableDocumentDescriber(testdir + '.csv', 'Author', 'Title')) | ||
assert corpus.document_describer.group_name(corpus.index[-1]) in {'Raabe', 'Marlitt', 'Fontane'} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,63 +1,36 @@ | ||
from pytest import approx | ||
|
||
import delta as d | ||
import os | ||
# from nose.tools import eq_ | ||
from math import log10, pow | ||
import pytest | ||
|
||
testdir = None | ||
c1000 = None | ||
|
||
|
||
def setup_module(): | ||
global testdir | ||
global c1000 | ||
testdir = os.path.join( | ||
os.path.dirname( | ||
os.path.abspath(__file__)), | ||
'corpus3') | ||
c1000 = d.Corpus(testdir).get_mfw_table(1000) | ||
|
||
|
||
def feq_(result, expected, msg=None, threshold=None): | ||
if threshold is None: | ||
threshold = pow(10, log10(expected)-2) | ||
if msg is None: | ||
msg = "{} != {}".format(result, expected) | ||
assert abs(expected - result) < threshold, msg | ||
|
||
|
||
class Delta_Test: | ||
|
||
def check_function(self, function, expected_distance, expected_score=None): | ||
distances = function(c1000) | ||
sample = distances.at['Fontane,-Theodor_Der-Stechlin', | ||
'Fontane,-Theodor_Effi-Briest'] | ||
feq_(sample, expected_distance, | ||
"{} Stechlin/Effi distance is {} instead of {}!".format( | ||
function.name, sample, expected_distance)) | ||
|
||
if expected_score is not None: | ||
feq_(expected_score, distances.simple_score(), | ||
"{} simple score is {} instead of expected {}!".format( | ||
function.name, distances.simple_score(), expected_score)) | ||
|
||
def burrows_test(self): | ||
self.check_function(d.functions.burrows, 0.7538867972199293) | ||
@pytest.fixture(scope='module') | ||
def c1000(testdir): | ||
return d.Corpus(testdir).get_mfw_table(1000) | ||
|
||
def linear_test(self): | ||
self.check_function(d.functions.linear, 1149.434663563308) | ||
|
||
def quadratic_test(self): | ||
self.check_function(d.functions.quadratic, 1102.845003724634) | ||
def fn_id(fn): | ||
return fn.name if isinstance(fn, d.DeltaFunction) else None | ||
|
||
def eder_test(self): | ||
self.check_function(d.functions.eder, 0.3703309813454142) | ||
@pytest.mark.parametrize("function,expected_distance", [(d.functions.burrows, 0.7538867972199293), | ||
(d.functions.linear, 1149.434663563308), | ||
(d.functions.quadratic, 1102.845003724634), | ||
(d.functions.eder, 0.3703309813454142), | ||
(d.functions.cosine_delta, 0.6156353166442046)], | ||
ids=fn_id) | ||
def test_distance(function, expected_distance, c1000): | ||
distances = function(c1000) | ||
sample = distances.at['Fontane,-Theodor_Der-Stechlin', | ||
'Fontane,-Theodor_Effi-Briest'] | ||
assert sample == approx(expected_distance, rel=1e-2) | ||
|
||
def cosine_delta_test(self): | ||
self.check_function(d.functions.cosine_delta, 0.6156353166442046) | ||
|
||
def composite_metric_test(self): | ||
mcosine = d.MetricDeltaFunction('cosine', 'mcosine') | ||
assert mcosine.fix_symmetry == True, "fix_symmetry is False!?" | ||
mcd = d.CompositeDeltaFunction('mcosine-z_score', 'metric_cosine_delta') | ||
assert mcd.basis.fix_symmetry == True, "basis.fix_symmetry is False!?" | ||
self.check_function(d.functions.metric_cosine_delta, 0.6156353166442046) | ||
def test_composite_metric(c1000): | ||
mcosine = d.MetricDeltaFunction('cosine', 'mcosine') | ||
assert mcosine.fix_symmetry == True, "fix_symmetry is False!?" | ||
mcd = d.CompositeDeltaFunction('mcosine-z_score', 'metric_cosine_delta') | ||
assert mcd.basis.fix_symmetry == True, "basis.fix_symmetry is False!?" | ||
test_distance(d.functions.metric_cosine_delta, 0.6156353166442046, c1000) |