Skip to content

Commit

Permalink
Split tests into general NLP and language-specific subdirectories (#818)
Browse files Browse the repository at this point in the history
* integrated OE tagger into core CLTK

* minor docs fix

* fixed RST docs for OE

* fixed RST docs for OE

* split tests into general nlp subdirectory and language-specific subdirectory

* gathered language-specific tests for OE and ME

* renamed test directories to be discoverable by nose

* fixed paths for test files
  • Loading branch information
free-variation authored and kylepjohnson committed Aug 8, 2018
1 parent 3ea5376 commit b7908bf
Show file tree
Hide file tree
Showing 22 changed files with 138 additions and 8 deletions.
File renamed without changes.
File renamed without changes.
56 changes: 56 additions & 0 deletions cltk/tests/test_languages/test_middle_english.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""Test for Middle English, based on Clément Besnier's test for Old Norse."""

import os
import unittest

from cltk.corpus.middle_english.alphabet import normalize_middle_english
from cltk.phonology.middle_english.transcription import Word as word_me
from cltk.stem.middle_english.stem import affix_stemmer as MiddleEnglishAffixStemmer
from cltk.tokenize.word import WordTokenizer


__author__ = ["John Stewart <johnstewart@aya.yale.edu>", ]


class TestMiddleEnglish(unittest.TestCase):
    """Unit tests for the Middle English normalizer, syllabifier, stemmer and tokenizer."""

    def test_normalize_middle_english(self):
        """Test the Middle English normalizer.

        The input mixes upper case, punctuation inside words and the
        characters yogh and thorn; the expected output is the lower-cased
        string shown in ``target``.
        """
        in_test = "'Madame,' quod he, 'reule me As ȝ,e ly:k?eþ best.'"
        target = "'madame' quod he 'reule me as ye lyketh best'"
        test = normalize_middle_english(in_test)
        self.assertEqual(target, test)

    def test_middle_english_syllabify(self):
        """Test syllabification of Middle English.

        Checks both the list form (``syllabify``) and the dot-joined string
        form (``syllabified_str``) for the same set of words.
        """
        words = ['marchall', 'content', 'thyne', 'greef', 'commaundyd']

        syllabified = [word_me(w).syllabify() for w in words]
        target_syllabified = [['mar', 'chall'], ['con', 'tent'], ['thyne'],
                              ['greef'], ['com', 'mau', 'ndyd']]

        # assertEqual rather than a bare assert: it is not stripped under
        # ``python -O`` and it reports a readable diff on failure.
        self.assertEqual(syllabified, target_syllabified)

        syllabified_str = [word_me(w).syllabified_str() for w in words]
        target_syllabified_str = ['mar.chall', 'con.tent', 'thyne', 'greef',
                                  'com.mau.ndyd']

        self.assertEqual(syllabified_str, target_syllabified_str)

    def test_middle_english_stemmer(self):
        """Test stemming of Middle English.

        The stemmer is given a list of tokens; the expected result is a
        single space-separated string of stems.
        """
        sentence = ['the', 'speke', 'the', 'henmest', 'kyng', 'in', 'the',
                    'hillis', 'he', 'beholdis', 'he', 'lokis', 'vnder',
                    'his', 'hondis', 'and', 'his', 'hed', 'heldis']
        stemmed = MiddleEnglishAffixStemmer(sentence)
        target = 'the spek the henm kyng in the hill he behold he lok vnd his hond and his hed held'
        self.assertEqual(stemmed, target)

    def test_middle_english_tokenizer(self):
        """Test word tokenization of Middle English.

        The expected tokens split off punctuation (``;``, ``.``) and the
        hyphen in ``þer-fore``, and contain no whitespace-only tokens.
        """
        text = " Fers am I ferd of oure fare;\n Fle we ful fast þer-fore. \n Can Y no cownsel bot care.\n\n"
        target = ['Fers', 'am', 'I', 'ferd', 'of', 'oure', 'fare', ';',
                  'Fle', 'we', 'ful', 'fast', 'þer', '-', 'fore', '.',
                  'Can', 'Y', 'no', 'cownsel', 'bot', 'care', '.']
        tokenizer = WordTokenizer('middle_english')
        tokenized = tokenizer.tokenize(text)
        # assertEqual shows which tokens differ; assertTrue(a == b) would only
        # report "False is not true".
        self.assertEqual(tokenized, target)


# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()
68 changes: 68 additions & 0 deletions cltk/tests/test_languages/test_old_english.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
"""Test for Old English, based on Clément Besnier's test for Old Norse."""

import os
import unittest

from cltk.corpus.utils.importer import CorpusImporter
from cltk.corpus.swadesh import Swadesh
from cltk.tag.pos import POSTag

__author__ = ["John Stewart <johnstewart@aya.yale.edu>", ]


class TestOldEnglish(unittest.TestCase):
    """Unit tests for Old English: Swadesh list and the POS taggers."""

    # Sample sentence shared by every POS-tagger test below.
    SENTENCE = ('Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, '
                'hu ða æþelingas ellen fremedon.')

    def setUp(self):
        """Import the Old English models corpus and verify it is on disk.

        Runs before every test; fails the test if the corpus README is not
        found under ``~/cltk_data`` after the import.
        """
        corpus_importer = CorpusImporter("old_english")
        corpus_importer.import_corpus("old_english_models_cltk")
        readme_path = os.path.expanduser(
            '~/cltk_data/old_english/model/old_english_models_cltk/README.md')
        self.assertTrue(os.path.isfile(readme_path))

    def test_swadesh_old_english(self):
        """Test that the Old English Swadesh list starts with the expected entry."""
        swadesh = Swadesh('eng_old')
        first_word = 'ic, iċċ, ih'
        match = swadesh.words()[0]
        self.assertEqual(first_word, match)

    def test_pos_unigram_old_english(self):
        """Test tagging Old English POS with unigram tagger."""
        tagger = POSTag('old_english')
        tagged = tagger.tag_unigram(self.SENTENCE)
        # Only checks that the tagger returned something truthy (non-empty).
        self.assertTrue(tagged)

    def test_pos_bigram_old_english(self):
        """Test tagging Old English POS with bigram tagger."""
        tagger = POSTag('old_english')
        tagged = tagger.tag_bigram(self.SENTENCE)
        self.assertTrue(tagged)

    def test_pos_trigram_old_english(self):
        """Test tagging Old English POS with trigram tagger."""
        tagger = POSTag('old_english')
        tagged = tagger.tag_trigram(self.SENTENCE)
        self.assertTrue(tagged)

    def test_pos_ngram123_tagger_old_english(self):
        """Test tagging Old English POS with a 1-, 2-, and 3-gram backoff tagger."""
        tagger = POSTag('old_english')
        tagged = tagger.tag_ngram_123_backoff(self.SENTENCE)
        self.assertTrue(tagged)

    def test_pos_crf_tagger_old_english(self):
        """Test tagging Old English POS with CRF tagger."""
        tagger = POSTag('old_english')
        tagged = tagger.tag_crf(self.SENTENCE)
        self.assertTrue(tagged)

    def test_pos_perceptron_tagger_old_english(self):
        """Test tagging Old English POS with Perceptron tagger."""
        tagger = POSTag('old_english')
        tagged = tagger.tag_perceptron(self.SENTENCE)
        self.assertTrue(tagged)


# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def test_tlgu_convert(self):
converts it.
Note: assertEquals fails on some accented characters ('ή', 'ί').
"""
in_test = os.path.abspath('cltk/tests/tlgu_test_text_beta_code.txt')
in_test = os.path.abspath('cltk/tests/test_nlp/tlgu_test_text_beta_code.txt')
out_test = os.path.expanduser('~/cltk_data/tlgu_test_text_unicode.txt')
tlgu = TLGU(testing=True)
tlgu.convert(in_test, out_test)
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def test_open_pickle_fail_missing(self):

def test_open_pickle_fail_corrupt(self):
"""Test failure to open corrupted pickle."""
bad_file = 'cltk/tests/bad_pickle.pickle'
bad_file = 'cltk/tests/test_nlp/bad_pickle.pickle'
with self.assertRaises(UnpicklingError):
open_pickle(bad_file)

Expand Down Expand Up @@ -95,7 +95,7 @@ def test_concordance_from_string(self):
def test_concordance_from_file(self):
"""Test ``write_concordance_from_file()`` for file writing completion
of concordance builder. Doesn't test quality of output."""
text_file = 'cltk/tests/text-file.txt'
text_file = 'cltk/tests/test_nlp/text-file.txt'
philology.write_concordance_from_file(text_file, 'test_file')
file_conc = os.path.expanduser('~/cltk_data/user_data/concordance_test_file.txt')
is_file = os.path.isfile(file_conc)
Expand Down
File renamed without changes.
File renamed without changes.
16 changes: 11 additions & 5 deletions docs/old_english.rst
Original file line number Diff line number Diff line change
Expand Up @@ -134,14 +134,16 @@ There are a number of different pre-trained models available for POS tagging of

(Bigram and trigram models are also available, but unsuitable due to low accuracy.)

The taggers were trained from annotated data from the `The ISWOC Treebank <http://iswoc.github.io/>`_ (version 0.9, license: Creative Commons Attribution-NonCommercial-ShareAlike 3.0 License).
The taggers were trained on annotated data from `The ISWOC Treebank <http://iswoc.github.io/>`_ (license: Creative Commons Attribution-NonCommercial-ShareAlike 3.0 License).

The POS tag scheme is explained here: https://proiel.github.io/handbook/developer/

```Bech, Kristin and Kristine Eide. 2014. The ISWOC corpus. Department of Literature, Area Studies and European Languages, University of Oslo. http://iswoc.github.com.```
``Bech, Kristin and Kristine Eide. 2014. The ISWOC corpus.
Department of Literature, Area Studies and European Languages,
University of Oslo. http://iswoc.github.com.``

### Example: Tagging with the CRF tagger
``````````
Example: Tagging with the CRF tagger
------------------------------------

The following sentence is from the beginning of Beowulf:

Expand All @@ -155,7 +157,11 @@ The following sentence is from the beginning of Beowulf:
In [4]: tagger.tag_crf(sent)
Out[4]:[('Hwæt', 'I-'), ('!', 'C-'), ('We', 'NE'), ('Gardena', 'NE'), ('in', 'R-'), ('geardagum', 'NB'), (',', 'C-'), ('þeodcyninga', 'NB'), (',', 'C-'), ('þrym', 'PY'), ('gefrunon', 'NB'), (',', 'C-'), ('hu', 'DU'), ('ða', 'PD'), ('æþelingas', 'NB'), ('ellen', 'V-'), ('fremedon', 'V-'), ('.', 'C-')]
Out[4]:[('Hwæt', 'I-'), ('!', 'C-'),
('We', 'NE'), ('Gardena', 'NE'), ('in', 'R-'), ('geardagum', 'NB'), (',', 'C-'),
('þeodcyninga', 'NB'), (',', 'C-'), ('þrym', 'PY'), ('gefrunon', 'NB'),
(',', 'C-'), ('hu', 'DU'), ('ða', 'PD'), ('æþelingas', 'NB'), ('ellen', 'V-'),
('fremedon', 'V-'), ('.', 'C-')]
Swadesh
=======
Expand Down

0 comments on commit b7908bf

Please sign in to comment.