-
Notifications
You must be signed in to change notification settings - Fork 326
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Split tests into general NLP and language-specific subdirectories (#818)
* integrated OE tagger into core CLTK * minor docs fix * fixed RST docs for OE * fixed RST docs for OE * split tests into general nlp subdirectory and language-specific subdirectory * gathered language-specific tests for OE and ME * renamed test directories to be discoverable by nose * fixed paths for test files
- Loading branch information
1 parent
3ea5376
commit b7908bf
Showing
22 changed files
with
138 additions
and
8 deletions.
There are no files selected for viewing
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
"""Test for Middle English, based on Clément Besnier's test for Old Norse.""" | ||
|
||
import os | ||
import unittest | ||
|
||
from cltk.corpus.middle_english.alphabet import normalize_middle_english | ||
from cltk.phonology.middle_english.transcription import Word as word_me | ||
from cltk.stem.middle_english.stem import affix_stemmer as MiddleEnglishAffixStemmer | ||
from cltk.tokenize.word import WordTokenizer | ||
|
||
|
||
__author__ = ["John Stewart <johnstewart@aya.yale.edu>", ] | ||
|
||
|
||
class TestMiddleEnglish(unittest.TestCase):
    """Tests for the Middle English normalizer, syllabifier, stemmer and tokenizer."""

    def test_normalize_middle_english(self):
        """Tests Middle English normalizer"""
        in_test = "'Madame,' quod he, 'reule me As ȝ,e ly:k?eþ best.'"
        target = "'madame' quod he 'reule me as ye lyketh best'"
        test = normalize_middle_english(in_test)
        self.assertEqual(target, test)

    def test_middle_english_syllabify(self):
        """Test syllabification of Middle English"""
        words = ['marchall', 'content', 'thyne', 'greef', 'commaundyd']

        # List-of-syllables form.
        syllabified = [word_me(w).syllabify() for w in words]
        target_syllabified = [['mar', 'chall'], ['con', 'tent'], ['thyne'], ['greef'], ['com', 'mau', 'ndyd']]
        # assertEqual (not a bare assert) so the check survives `python -O`
        # and a failure reports the differing elements.
        self.assertEqual(syllabified, target_syllabified)

        # Dot-separated string form.
        syllabified_str = [word_me(w).syllabified_str() for w in words]
        target_syllabified_str = ['mar.chall', 'con.tent', 'thyne', 'greef', 'com.mau.ndyd']
        self.assertEqual(syllabified_str, target_syllabified_str)

    def test_middle_english_stemmer(self):
        """Test stemming of Middle English"""
        sentence = ['the', 'speke', 'the', 'henmest', 'kyng', 'in', 'the', 'hillis', 'he', 'beholdis', 'he', 'lokis', 'vnder',
                    'his', 'hondis', 'and', 'his', 'hed', 'heldis']
        stemmed = MiddleEnglishAffixStemmer(sentence)
        target = 'the spek the henm kyng in the hill he behold he lok vnd his hond and his hed held'
        self.assertEqual(stemmed, target)

    def test_middle_english_tokenizer(self):
        """Test word tokenization of Middle English"""
        text = " Fers am I ferd of oure fare;\n Fle we ful fast þer-fore. \n Can Y no cownsel bot care.\n\n"
        target = ['Fers', 'am', 'I', 'ferd', 'of', 'oure', 'fare', ';', 'Fle', 'we', 'ful', 'fast', 'þer', '-', 'fore', '.',
                  'Can', 'Y', 'no', 'cownsel', 'bot', 'care', '.']
        tokenizer = WordTokenizer('middle_english')
        tokenized = tokenizer.tokenize(text)
        # assertEqual shows the token-level diff on failure; assertTrue(a == b) would not.
        self.assertEqual(tokenized, target)
|
||
|
||
if __name__ == '__main__': | ||
unittest.main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
"""Test for Old English, based on Clément Besnier's test for Old Norse.""" | ||
|
||
import os | ||
import unittest | ||
|
||
from cltk.corpus.utils.importer import CorpusImporter | ||
from cltk.corpus.swadesh import Swadesh | ||
from cltk.tag.pos import POSTag | ||
|
||
__author__ = ["John Stewart <johnstewart@aya.yale.edu>", ] | ||
|
||
|
||
class TestOldEnglish(unittest.TestCase):
    """Tests for the Old English Swadesh list and POS taggers."""

    # Sample sentence fed to every POS-tagger test in this class.
    SENTENCE = 'Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, hu ða æþelingas ellen fremedon.'
    # README shipped with the models corpus; its presence is checked in setUp.
    MODEL_README = '~/cltk_data/old_english/model/old_english_models_cltk/README.md'

    @classmethod
    def setUpClass(cls):
        """Import the Old English models corpus once for the whole class.

        Done at class level so the import is not repeated before each test.
        """
        corpus_importer = CorpusImporter("old_english")
        corpus_importer.import_corpus("old_english_models_cltk")

    def setUp(self):
        """Assert that the models corpus README exists before each test."""
        readme_path = os.path.expanduser(self.MODEL_README)
        self.assertTrue(os.path.isfile(readme_path))

    # Swadesh list
    def test_swadesh_old_english(self):
        """Test the first entry of the Old English Swadesh list."""
        swadesh = Swadesh('eng_old')
        first_word = 'ic, iċċ, ih'
        match = swadesh.words()[0]
        self.assertEqual(first_word, match)

    def test_pos_unigram_old_english(self):
        """Test tagging Old English POS with unigram tagger."""
        tagger = POSTag('old_english')
        tagged = tagger.tag_unigram(self.SENTENCE)
        self.assertTrue(tagged)

    def test_pos_bigram_old_english(self):
        """Test tagging Old English POS with bigram tagger."""
        tagger = POSTag('old_english')
        tagged = tagger.tag_bigram(self.SENTENCE)
        self.assertTrue(tagged)

    def test_pos_trigram_old_english(self):
        """Test tagging Old English POS with trigram tagger."""
        tagger = POSTag('old_english')
        tagged = tagger.tag_trigram(self.SENTENCE)
        self.assertTrue(tagged)

    def test_pos_ngram123_tagger_old_english(self):
        """Test tagging Old English POS with a 1-, 2-, and 3-gram backoff tagger."""
        tagger = POSTag('old_english')
        tagged = tagger.tag_ngram_123_backoff(self.SENTENCE)
        self.assertTrue(tagged)

    def test_pos_crf_tagger_old_english(self):
        """Test tagging Old English POS with CRF tagger."""
        tagger = POSTag('old_english')
        tagged = tagger.tag_crf(self.SENTENCE)
        self.assertTrue(tagged)

    def test_pos_perceptron_tagger_old_english(self):
        """Test tagging Old English POS with Perceptron tagger."""
        tagger = POSTag('old_english')
        tagged = tagger.tag_perceptron(self.SENTENCE)
        self.assertTrue(tagged)
|
||
|
||
if __name__ == '__main__': | ||
unittest.main() |
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters