Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
11fdf4a
commit 5c8c5a3
Showing 3 changed files with 47 additions and 47 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,37 +1,37 @@ | ||
import collections | ||
import pytest | ||
import cophi | ||
from cophi.text import utils | ||
|
||
|
||
# Shared fixtures for the test module: one paragraph list, the first
# "document" inside it, and that document split into six one-letter tokens.
PARAGRAPHS = [["A B C D E F", "G H I J K L"]]
DOCUMENT = PARAGRAPHS[0][0]
TOKENS = DOCUMENT.split(" ")
|
||
def test_construct_ngrams():
    """Bigrams over TOKENS join each adjacent token pair with a space."""
    ngrams = utils.construct_ngrams(TOKENS)
    assert list(ngrams) == ["A B", "B C", "C D", "D E", "E F"]
|
||
def test_find_tokens():
    """find_tokens yields regex matches from DOCUMENT; an optional third
    argument caps how many tokens are produced."""
    tokens = utils.find_tokens(DOCUMENT, r"\w")
    assert list(tokens) == ["A", "B", "C", "D", "E", "F"]
    # Stop tokenizing after the first token:
    tokens = utils.find_tokens(DOCUMENT, r"\w", 1)
    assert list(tokens) == ["A"]
|
||
def test_lowercase_tokens():
    """lowercase_tokens returns a list with every token lowercased."""
    tokens = utils.lowercase_tokens(TOKENS)
    assert tokens == ["a", "b", "c", "d", "e", "f"]
|
||
def test_segment_fuzzy():
    """Segmenting PARAGRAPHS with size 1 yields one segment per paragraph
    element."""
    segments = utils.segment_fuzzy(PARAGRAPHS, 1)
    assert list(segments) == [[["A B C D E F"]], [["G H I J K L"]]]
|
||
def test_parameter():
    """_parameter returns a measure-specific collection of arguments;
    only its length is pinned here (3 for honore_h, 2 otherwise)."""
    parameter = utils._parameter(TOKENS, "sichel_s")
    assert len(parameter) == 2
    parameter = utils._parameter(TOKENS, "honore_h")
    assert len(parameter) == 3
    parameter = utils._parameter(TOKENS, "entropy")
    assert len(parameter) == 2
    parameter = utils._parameter(TOKENS, "ttr")
    assert len(parameter) == 2