Skip to content

Commit

Permalink
Refactor tests (#899)
Browse files Browse the repository at this point in the history
* Refactor single test to run faster

* Move reader to setUpClass

* Add additional readers to handle consistency throughout tests

* Move _fileids change to setUpClass

* Remove unnecessary list calls

* Check for only one filtered word

* Change test to use uniq_word as indicator of success

* Use list len as check for import tests

* Added missing comma

* Add setUpClass for transcribers; as mentioned in #897
  • Loading branch information
diyclassics authored and kylepjohnson committed Apr 4, 2019
1 parent fb61d10 commit ed1a241
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 100 deletions.
8 changes: 4 additions & 4 deletions cltk/corpus/odia/alphabet.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
'0B15':'କ',
'0B16':'ଖ',
'0B17':'ଗ',
'0B18':'ଘ',
'0B18':'ଘ',
'0B19':'ଙ',
'0B1A':'ଚ',
'0B1B':'ଛ',
Expand Down Expand Up @@ -57,7 +57,7 @@
'0B38':'ସ',
'0B39':'ହ'
}

# The structured consonants are classified into five groups according to where the tongue touches the palate of the mouth.
# These consonants are shown here with their IAST transcriptions.

Expand Down Expand Up @@ -87,8 +87,8 @@
# Fraction signs and, in EXTRA_NUMERICAL_SYMBOLS_DESC, the value each one
# stands for. Both lists have six entries; presumably they are meant to be
# read index-by-index (TODO confirm against callers).
EXTRA_NUMERICAL_SYMBOLS = ['୵', '୶', '୷','୲', '୳','୴']
EXTRA_NUMERICAL_SYMBOLS_DESC = ['1/16', '1/8', '3/16', '1/4','1/2', '3/4']

# Anusvara is used for final velar nasal sound,
# Anusvara is used for final velar nasal sound,
# Visarga adds voiceless breath after vowel
# Candrabindu is used to nasalize vowels
# Candrabindu is used to nasalize vowels

# Combining signs, each shown on a dotted-circle placeholder (U+25CC), in
# this order: halanta/virama (U+0B4D), candrabindu (U+0B01),
# anusvara (U+0B02), visarga (U+0B03).
# These must be the Odia code points, not the look-alike Bengali ones
# (U+09CD, U+0981-U+0983). Escapes are used so the block cannot be confused
# with a visually similar script.
MODIFIERS = ['\u25cc\u0b4d', '\u25cc\u0b01', '\u25cc\u0b02', '\u25cc\u0b03']
167 changes: 82 additions & 85 deletions cltk/tests/test_corpus/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,91 +315,6 @@ def test_import_latin_library_corpus_reader(self):
ALL_FILE_IDS = list(reader.fileids())
self.assertTrue(len(ALL_FILE_IDS) > 2100)

def test_import_latin_library_corpus_filter_by_file(self):
    """Filter the Latin Library reader by file listing.

    Requests the 'old' type using only the per-file type mapping and
    expects at least one file id to remain.
    """
    latin_library = get_corpus_reader(language='latin',
                                      corpus_name='latin_text_latin_library')
    old_latin = assemble_corpus(latin_library,
                                types_requested=['old'],
                                type_files=corpus_texts_by_type)
    matched_ids = list(old_latin.fileids())
    self.assertTrue(len(matched_ids) > 0)

def test_import_latin_library_corpus_filter_by_dir(self):
    """Filter the Latin Library reader by directory listing.

    Requests the 'old' type using only the per-directory type mapping and
    expects at least one file id to remain.
    """
    latin_library = get_corpus_reader(language='latin',
                                      corpus_name='latin_text_latin_library')
    old_latin = assemble_corpus(latin_library,
                                types_requested=['old'],
                                type_dirs=corpus_directories_by_type)
    matched_ids = list(old_latin.fileids())
    self.assertTrue(len(matched_ids) > 0)

def test_import_latin_library_corpus_filter_by_file_and_dir(self):
    """Filter the Latin Library reader by files and directories together.

    Requests the 'old' type with both the per-directory and the per-file
    type mappings supplied, and expects at least one file id to remain.
    """
    latin_library = get_corpus_reader(language='latin',
                                      corpus_name='latin_text_latin_library')
    old_latin = assemble_corpus(latin_library,
                                types_requested=['old'],
                                type_dirs=corpus_directories_by_type,
                                type_files=corpus_texts_by_type)
    matched_ids = list(old_latin.fileids())
    self.assertTrue(len(matched_ids) > 0)

def test_filtered_corpus_reader_sents(self):
    """Check ``sents()`` on a reader restricted to ``catullus.txt``.

    Fails if either 'Latin' or 'Library' is among the distinct words, and
    requires at least one sentence to be returned.
    """
    latin_library = get_corpus_reader(language='latin',
                                      corpus_name='latin_text_latin_library')
    latin_library._fileids = ['catullus.txt']
    sentences = list(latin_library.sents())
    vocabulary = distinct_words(sentences)
    for banned in ('Latin', 'Library'):
        if banned in vocabulary:
            self.fail('Filtered word present!')
    self.assertTrue(len(sentences) > 0)

def test_filtered_corpus_reader_paras(self):
    """Check ``paras()`` on a reader restricted to ``catullus.txt``.

    Flattens the paragraphs into one list of sentences, fails if either
    'Latin' or 'Library' is among the distinct words, and requires at
    least one paragraph to be returned.
    """
    latin_library = get_corpus_reader(language='latin',
                                      corpus_name='latin_text_latin_library')
    latin_library._fileids = ['catullus.txt']
    paragraphs = list(latin_library.paras())
    # Flatten paragraph -> sentence nesting before collecting the vocabulary.
    sentences = []
    for paragraph in paragraphs:
        sentences.extend(paragraph)
    vocabulary = distinct_words(sentences)
    for banned in ('Latin', 'Library'):
        if banned in vocabulary:
            self.fail('Filtered word present!')
    self.assertTrue(len(paragraphs) > 0)

def test_filtered_corpus_reader_words(self):
    """Check ``words()`` on a reader restricted to ``catullus.txt``.

    Fails if either 'Latin' or 'Library' is among the distinct words, and
    requires at least one word to be returned.
    """
    latin_library = get_corpus_reader(language='latin',
                                      corpus_name='latin_text_latin_library')
    latin_library._fileids = ['catullus.txt']
    tokens = list(latin_library.words())
    vocabulary = distinct_words(tokens)
    for banned in ('Latin', 'Library'):
        if banned in vocabulary:
            self.fail('Filtered word present!')
    self.assertTrue(len(tokens) > 0)

def test_filtered_corpus_reader_docs(self):
    """Check ``docs()`` on the Latin Library reader.

    First restricts the reader to ``catullus.txt`` and verifies that
    neither 'Latin' nor 'Library' is among the distinct words and that at
    least one document is returned.  Then requests each file in a list of
    known problem files explicitly and verifies that a document longer
    than 100 items comes back for every one of them.
    """
    reader = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
    reader._fileids = ['catullus.txt']
    docs = list(reader.docs())
    words = distinct_words(docs)
    for banned in ('Latin', 'Library'):
        if banned in words:
            self.fail('Filtered word present!')
    self.assertTrue(len(docs) > 0)
    problem_files = ['caesar/bc3.txt', 'hymni.txt', 'varro.frag.txt', 'varro.ll10.txt',
                     'varro.ll5.txt', 'varro.ll6.txt', 'varro.ll7.txt', 'varro.ll8.txt',
                     'varro.ll9.txt']
    for filename in problem_files:
        doc = list(reader.docs([filename]))
        # unittest assertions instead of bare ``assert``: they still run
        # under ``python -O`` and report which file failed.
        self.assertTrue(doc, 'No document returned for ' + filename)
        self.assertGreater(len(doc[0]), 100,
                           'Document too short for ' + filename)

def test_filtered_corpus_reader_sizes(self):
    """Check that ``sizes()`` yields at least one entry for ``catullus.txt``."""
    latin_library = get_corpus_reader(language='latin',
                                      corpus_name='latin_text_latin_library')
    latin_library._fileids = ['catullus.txt']
    size_entries = list(latin_library.sizes())
    self.assertTrue(len(size_entries) > 0)

def test_json_corpus_reader(self):
"""Test filtered corpus sents method."""
Expand Down Expand Up @@ -789,6 +704,88 @@ def test_normalize_middle_english(self):
test = normalize_middle_english(in_test)
self.assertEqual(target, test)

class TestFilteredCorpus(unittest.TestCase):
    """Test filtering and reading of the Latin Library corpus.

    The corpus is imported and the readers are created once for the whole
    class in ``setUpClass`` instead of once per test.
    """

    @classmethod
    def setUpClass(cls):
        """Import the Latin Library corpus and build the shared readers.

        Raises:
            Exception: if importing the corpus fails; the original error is
                chained as the cause.
        """
        try:
            corpus_importer = CorpusImporter('latin')
            corpus_importer.import_corpus('latin_text_latin_library')
        except Exception as err:
            # Catch ``Exception`` rather than everything so Ctrl-C and
            # SystemExit still propagate, and keep the root cause attached.
            raise Exception('Failure to download test corpus') from err
        cls.reader = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
        cls.reader._fileids = ['pervig.txt']
        # Additional instances are needed because the filter tests below
        # change reader internals.  TODO: fix so one reader can be shared.
        cls.reader_2 = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
        cls.reader_3 = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
        cls.reader_4 = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')

    def test_import_latin_library_corpus_filter_by_file(self):
        """Test the Latin Library corpus reader filter by files."""
        filtered_reader = assemble_corpus(self.reader_2, types_requested=['old'],
                                          type_files=corpus_texts_by_type)
        self.assertTrue(len(list(filtered_reader.fileids())) > 0)

    def test_import_latin_library_corpus_filter_by_dir(self):
        """Test the Latin Library corpus reader filter by directories."""
        filtered_reader = assemble_corpus(self.reader_3, types_requested=['old'],
                                          type_dirs=corpus_directories_by_type)
        self.assertTrue(len(list(filtered_reader.fileids())) > 0)

    def test_import_latin_library_corpus_filter_by_file_and_dir(self):
        """Test the Latin Library corpus reader filter by files and directories."""
        filtered_reader = assemble_corpus(self.reader_4, types_requested=['old'],
                                          type_dirs=corpus_directories_by_type,
                                          type_files=corpus_texts_by_type)
        self.assertTrue(len(list(filtered_reader.fileids())) > 0)

    def test_filtered_corpus_reader_sents(self):
        """Test filtered corpus sents method."""
        sents = self.reader.sents()
        uniq_words = distinct_words(sents)
        # Only 'Library' is checked as the marker of unfiltered text.
        if 'Library' in uniq_words:
            self.fail('Filtered word present!')
        # A non-empty set of distinct words implies that sents had content.
        self.assertTrue(uniq_words)

    def test_filtered_corpus_reader_paras(self):
        """Test filtered corpus paras method."""
        paras = self.reader.paras()
        sents = [sent
                 for para in paras
                 for sent in para]
        uniq_words = distinct_words(sents)
        if 'Library' in uniq_words:
            self.fail('Filtered word present!')
        self.assertTrue(uniq_words)

    def test_filtered_corpus_reader_words(self):
        """Test filtered corpus words method."""
        words = self.reader.words()
        uniq_words = distinct_words(words)
        if 'Library' in uniq_words:
            self.fail('Filtered word present!')
        self.assertTrue(uniq_words)

    def test_filtered_corpus_reader_docs(self):
        """Test filtered corpus docs method.

        Also requests each file from a list of known problem files
        explicitly and checks that a document longer than 100 items is
        returned for every one of them.
        """
        docs = list(self.reader.docs())
        uniq_words = distinct_words(docs)
        if 'Library' in uniq_words:
            self.fail('Filtered word present!')
        self.assertTrue(len(docs) > 0)
        problem_files = ['caesar/bc3.txt', 'hymni.txt', 'varro.frag.txt', 'varro.ll10.txt',
                         'varro.ll5.txt', 'varro.ll6.txt', 'varro.ll7.txt', 'varro.ll8.txt',
                         'varro.ll9.txt']
        for filename in problem_files:
            doc = list(self.reader.docs([filename]))
            # unittest assertions instead of bare ``assert``: they still
            # run under ``python -O`` and report which file failed.
            self.assertTrue(doc, 'No document returned for ' + filename)
            self.assertGreater(len(doc[0]), 100,
                               'Document too short for ' + filename)

    def test_filtered_corpus_reader_sizes(self):
        """Test filtered corpus sizes method."""
        self.assertTrue(len(list(self.reader.sizes())) > 0)

class TestUnicode(unittest.TestCase):
"Test py23char"
Expand Down
24 changes: 13 additions & 11 deletions cltk/tests/test_nlp/test_phonology.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@
class TestSequenceFunctions(unittest.TestCase):
"""Class for unittest"""

"""Shared transcriber fixtures"""
@classmethod
def setUpClass(cls):
    """Create one Greek and one Latin transcriber for the whole class.

    The Greek transcriber is built with ("Attic", "Probert") and the Latin
    one with ("Classical", "Allen"); tests reach them through
    ``self.greek_transcriber`` and ``self.latin_transcriber``.
    """
    attic_probert = grc.Transcriber("Attic", "Probert")
    cls.greek_transcriber = attic_probert
    classical_allen = lat.Transcriber("Classical", "Allen")
    cls.latin_transcriber = classical_allen

"""greek.transcription"""
def test_greek_refresh(self):
"""Test the Word class's `_refresh` method in Greek."""
Expand Down Expand Up @@ -120,8 +126,7 @@ def test_greek_print_ipa(self):
def test_greek_parse_diacritics(self):
"""Test the Transcriber class's `_parse_diacritics` in Greek."""
inputs = ["ἄ", "Φ", "ῷ", "ὑ", "ϊ", "ῑ"]
transcriber = grc.Transcriber("Attic", "Probert")
outputs = [transcriber._parse_diacritics(char) for char in inputs]
outputs = [self.greek_transcriber._parse_diacritics(char) for char in inputs]
target = [unicodedata.normalize('NFC', c) for c in
["α/" + grc.chars.ACUTE + "//",
"φ///", "ω/" + grc.chars.CIRCUMFLEX + "/"
Expand All @@ -132,8 +137,7 @@ def test_greek_parse_diacritics(self):
def test_greek_prep_text(self):
"""Test the Transcriber class's `_prep_text` in Greek."""
inputs = ["λείπειν", "ὕπνῳ"]
transcriber = grc.Transcriber("Attic", "Probert")
outputs = [transcriber._prep_text(w) for w in inputs]
outputs = [self.greek_transcriber._prep_text(w) for w in inputs]
target = [[('λ', '', ''), ('ει', '́', ''), ('π', '', ''),
('ει', '', ''), ('ν', '', '')],
[('h', '', ''), ('υ', '́', ''), ('π', '', ''),
Expand All @@ -142,7 +146,7 @@ def test_greek_prep_text(self):

def test_transcriber_probert(self):
"""Test Attic Greek IPA transcription via Probert reconstruction."""
transcriber = grc.Transcriber("Attic", "Probert").transcribe
transcriber = self.greek_transcriber.transcribe
transcription = [transcriber(x) for x in
[unicodedata.normalize('NFC', y) for y in
["ῥάξ", "εἰργασμένον", "φόρμιγξ", "γιγνώσκω"]]]
Expand Down Expand Up @@ -359,8 +363,7 @@ def test_latin_print_ipa(self):
def test_latin_parse_diacritics(self):
"""Test the Transcriber class's `_parse_diacritics` in Latin."""
inputs = ["a", "ū", "ï"]
transcriber = lat.Transcriber("Classical", "Allen")
outputs = [transcriber._parse_diacritics(char) for char in inputs]
outputs = [self.latin_transcriber._parse_diacritics(char) for char in inputs]
target = [unicodedata.normalize('NFC', c) for c in
["a///", "u/" + lat.chars.LONG + "//",
"i//" + lat.chars.DIAERESIS + "/"]]
Expand All @@ -369,8 +372,7 @@ def test_latin_parse_diacritics(self):
def test_latin_prep_text(self):
"""Test the Transcriber class's `_prep_text` in Latin."""
inputs = ["ūnam", "qui", "Belgae"]
transcriber = lat.Transcriber("Classical", "Allen")
outputs = [transcriber._prep_text(w) for w in inputs]
outputs = [self.latin_transcriber._prep_text(w) for w in inputs]
target = [[('u', '̄', ''), ('n', '', ''), ('a', '', ''),
('m', '', '')],
[('qu', '', ''), ('i', '', '')],
Expand All @@ -381,7 +383,7 @@ def test_latin_prep_text(self):
def test_transcriber_allen_without_macronizer(self):
"""Test Classical Latin IPA transcription via Allen reconstruction,\
input pre-macronized."""
transcriber = lat.Transcriber("Classical", "Allen").transcribe
transcriber = self.latin_transcriber.transcribe
transcription = [transcriber(x, macronize=False) for x in
[unicodedata.normalize('NFC', y) for y in
["Trōiae", "Gallia", "dīuīsa", "ūnam", "incolunt", "Belgae"]]]
Expand All @@ -393,7 +395,7 @@ def test_transcriber_allen_without_macronizer(self):
def test_transcriber_allen_with_macronizer(self):
"""Test Classical Latin IPA transcription via Allen reconstruction,\
with automatic macronization."""
transcriber = lat.Transcriber("Classical", "Allen").transcribe
transcriber = self.latin_transcriber.transcribe
transcription = transcriber(
"Quo usque tandem, O Catilina, abutere nostra patientia?",
macronize=True)
Expand Down

0 comments on commit ed1a241

Please sign in to comment.