Refactor tests (#899)

* Refactor single test to run faster
* Move reader to setUpClass
* Add additional readers to handle consistency throughout tests
* Move _fileids change to setUpClass
* Remove unnecessary list calls
* Check for only one filtered word
* Change test to use uniq_word as indicator of success
* Use list len as check for import tests
* Added missing comma
* Add setUpClass for transcribers, as mentioned in #897
cltk · Apr 4, 2019 · ed1a241 · ed1a241
1 parent fb61d10
commit ed1a241
Show file tree

Hide file tree

Showing 3 changed files with 99 additions and 100 deletions.
diff --git a/cltk/corpus/odia/alphabet.py b/cltk/corpus/odia/alphabet.py
@@ -25,7 +25,7 @@
     '0B15':'କ',
     '0B16':'ଖ',
     '0B17':'ଗ',
-    '0B18':'ଘ', 
+    '0B18':'ଘ',
     '0B19':'ଙ',
     '0B1A':'ଚ',
     '0B1B':'ଛ',
@@ -57,7 +57,7 @@
     '0B38':'ସ',
     '0B39':'ହ'
     }
-    
+
 # The structured consonants are classified into five groups according to where the tongue touches the palate of the mouth.
 # These consonants are shown here with their IAST transcriptions.
 
@@ -87,8 +87,8 @@
 EXTRA_NUMERICAL_SYMBOLS = ['୵',	'୶', '୷','୲', '୳','୴']
 EXTRA_NUMERICAL_SYMBOLS_DESC = ['1/16', '1/8', '3/16', '1/4','1/2',	'3/4']
 
-# Anusvara is used for final velar nasal sound, 
+# Anusvara is used for final velar nasal sound,
 # Visarga adds voiceless breath after vowel
-# Candrabindu is used to nasalize vowels 
+# Candrabindu is used to nasalize vowels
 
 MODIFIERS = ['◌্','◌ঁ','◌ং','◌ঃ']
diff --git a/cltk/tests/test_corpus/test_corpus.py b/cltk/tests/test_corpus/test_corpus.py
@@ -315,91 +315,6 @@ def test_import_latin_library_corpus_reader(self):
         ALL_FILE_IDS = list(reader.fileids())
         self.assertTrue(len(ALL_FILE_IDS) > 2100)
 
-    def test_import_latin_library_corpus_filter_by_file(self):
-        """Test the Latin Library corpus reader filter by files."""
-        reader = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
-        filtered_reader = assemble_corpus(reader, types_requested=['old'],
-                                          type_files=corpus_texts_by_type)
-        self.assertTrue(len(list(filtered_reader.fileids())) > 0)
-
-    def test_import_latin_library_corpus_filter_by_dir(self):
-        """Test the Latin Library corpus reader filter by directories."""
-        reader = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
-        filtered_reader = assemble_corpus(reader, types_requested=['old'],
-                                          type_dirs=corpus_directories_by_type)
-        self.assertTrue(len(list(filtered_reader.fileids())) > 0)
-
-    def test_import_latin_library_corpus_filter_by_file_and_dir(self):
-        """Test the Latin Library corpus reader filter by directories."""
-        reader = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
-        filtered_reader = assemble_corpus(reader, types_requested=['old'],
-                                          type_dirs=corpus_directories_by_type,
-                                          type_files=corpus_texts_by_type)
-        self.assertTrue(len(list(filtered_reader.fileids())) > 0)
-
-    def test_filtered_corpus_reader_sents(self):
-        """Test filtered corpus sents method."""
-        reader = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
-        reader._fileids = ['catullus.txt']
-        sents = list(reader.sents())
-        uniq_words = distinct_words(sents)
-        if 'Latin' in uniq_words:
-            self.fail('Filtered word present!')
-        if 'Library' in uniq_words:
-            self.fail('Filtered word present!')
-        self.assertTrue(len(sents) > 0)
-
-    def test_filtered_corpus_reader_paras(self):
-        """Test filtered corpus paras method."""
-        reader = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
-        reader._fileids = ['catullus.txt']
-        paras = list(reader.paras())
-        sents = [sent
-                 for para in paras
-                 for sent in para]
-        uniq_words = distinct_words(sents)
-        if 'Latin' in uniq_words:
-            self.fail('Filtered word present!')
-        if 'Library' in uniq_words:
-            self.fail('Filtered word present!')
-        self.assertTrue(len(paras) > 0)
-
-    def test_filtered_corpus_reader_words(self):
-        """Test filtered corpus words method."""
-        reader = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
-        reader._fileids = ['catullus.txt']
-        words = list(reader.words())
-        uniq_words = distinct_words(words)
-        if 'Latin' in uniq_words:
-            self.fail('Filtered word present!')
-        if 'Library' in uniq_words:
-            self.fail('Filtered word present!')
-        self.assertTrue(len(words) > 0)
-
-    def test_filtered_corpus_reader_docs(self):
-        """Test filtered corpus docs method."""
-        reader = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
-        reader._fileids = ['catullus.txt']
-        docs = list(reader.docs())
-        words = distinct_words(docs)
-        if 'Latin' in words:
-            self.fail('Filtered word present!')
-        if 'Library' in words:
-            self.fail('Filtered word present!')
-        self.assertTrue(len(docs) > 0)
-        problem_files = ['caesar/bc3.txt', 'hymni.txt', 'varro.frag.txt', 'varro.ll10.txt',
-                     'varro.ll5.txt', 'varro.ll6.txt', 'varro.ll7.txt', 'varro.ll8.txt',
-                     'varro.ll9.txt']
-        for filename in problem_files:
-            doc = list(reader.docs([filename]))
-            assert(doc)
-            assert(len(doc[0]) > 100)
-
-    def test_filtered_corpus_reader_sizes(self):
-        """Test filtered corpus sizes method."""
-        reader = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
-        reader._fileids = ['catullus.txt']
-        self.assertTrue(len(list(reader.sizes())) > 0)
 
     def test_json_corpus_reader(self):
         """Test filtered corpus sents method."""
@@ -789,6 +704,88 @@ def test_normalize_middle_english(self):
         test = normalize_middle_english(in_test)
         self.assertEqual(target, test)
 
+class TestFilteredCorpus(unittest.TestCase):
+        """Test filtering and reading of the Latin Library corpus.
+
+        The corpus is imported and the readers are built once in
+        ``setUpClass`` and then shared by every test in this class.
+        """
+        @classmethod
+        def setUpClass(cls):
+            """Import the Latin Library corpus and build the shared readers.
+
+            Raises:
+                Exception: if importing the corpus fails for any reason.
+            """
+            # NOTE(review): the bare ``except`` also catches KeyboardInterrupt and
+            # SystemExit and re-raises everything as a generic Exception.
+            try:
+                corpus_importer = CorpusImporter('latin')
+                corpus_importer.import_corpus('latin_text_latin_library')
+            except:
+                raise Exception('Failure to download test corpus')
+            cls.reader = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
+            # Restrict the main reader to a single file.
+            cls.reader._fileids = ['pervig.txt']
+            # Additional reader instances are needed because the filter tests below
+            # change reader internals. TODO: fix.
+            cls.reader_2 = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
+            cls.reader_3 = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
+            cls.reader_4 = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
+
+        def test_import_latin_library_corpus_filter_by_file(self):
+            """Test the Latin Library corpus reader filter by files."""
+            filtered_reader = assemble_corpus(self.reader_2, types_requested=['old'],
+                                              type_files=corpus_texts_by_type)
+            self.assertTrue(len(list(filtered_reader.fileids())) > 0)
+
+        def test_import_latin_library_corpus_filter_by_dir(self):
+            """Test the Latin Library corpus reader filter by directories."""
+            filtered_reader = assemble_corpus(self.reader_3, types_requested=['old'],
+                                              type_dirs=corpus_directories_by_type)
+            self.assertTrue(len(list(filtered_reader.fileids())) > 0)
+
+        def test_import_latin_library_corpus_filter_by_file_and_dir(self):
+            """Test the Latin Library corpus reader filter by files and directories."""
+            filtered_reader = assemble_corpus(self.reader_4, types_requested=['old'],
+                                              type_dirs=corpus_directories_by_type,
+                                              type_files=corpus_texts_by_type)
+            self.assertTrue(len(list(filtered_reader.fileids())) > 0)
+
+        def test_filtered_corpus_reader_sents(self):
+            """Test filtered corpus sents method."""
+            sents = self.reader.sents()
+            uniq_words = distinct_words(sents)
+            # Only 'Library' is checked here; it is unclear why the original test
+            # checked for two different words.
+            if 'Library' in uniq_words:
+                self.fail('Filtered word present!')
+            # Checking uniq_words is enough: a non-empty result implies that sents had content.
+            self.assertTrue(uniq_words)
+
+        def test_filtered_corpus_reader_paras(self):
+            """Test filtered corpus paras method."""
+            paras = self.reader.paras()
+            # Flatten the paragraphs into a single list of sentences.
+            sents = [sent
+                     for para in paras
+                     for sent in para]
+            uniq_words = distinct_words(sents)
+            if 'Library' in uniq_words:
+                self.fail('Filtered word present!')
+            self.assertTrue(uniq_words)
+
+        def test_filtered_corpus_reader_words(self):
+            """Test filtered corpus words method."""
+            words = self.reader.words()
+            uniq_words = distinct_words(words)
+            if 'Library' in uniq_words:
+                self.fail('Filtered word present!')
+            self.assertTrue(uniq_words)
+
+        def test_filtered_corpus_reader_docs(self):
+            """Test filtered corpus docs method, including a list of problem files."""
+            docs = list(self.reader.docs())
+            uniq_words = distinct_words(docs)
+            if 'Library' in uniq_words:
+                self.fail('Filtered word present!')
+            self.assertTrue(len(docs) > 0)
+            problem_files = ['caesar/bc3.txt', 'hymni.txt', 'varro.frag.txt', 'varro.ll10.txt',
+                         'varro.ll5.txt', 'varro.ll6.txt', 'varro.ll7.txt', 'varro.ll8.txt',
+                         'varro.ll9.txt']
+            # Each problem file is requested explicitly by file id and must yield a
+            # non-empty result whose first document is longer than 100.
+            # NOTE(review): bare ``assert`` statements are stripped under ``python -O``.
+            for filename in problem_files:
+                doc = list(self.reader.docs([filename]))
+                assert(doc)
+                assert(len(doc[0]) > 100)
+
+        def test_filtered_corpus_reader_sizes(self):
+            """Test filtered corpus sizes method."""
+            self.assertTrue(len(list(self.reader.sizes())) > 0)
 
 class TestUnicode(unittest.TestCase):
     "Test py23char"

diff --git a/cltk/tests/test_nlp/test_phonology.py b/cltk/tests/test_nlp/test_phonology.py
@@ -23,6 +23,12 @@
 class TestSequenceFunctions(unittest.TestCase):
     """Class for unittest"""
 
+    """Shared Greek and Latin transcriber fixtures"""
+    @classmethod
+    def setUpClass(cls):
+        """Build the Greek and Latin transcribers once and share them across the tests in this class."""
+        cls.greek_transcriber = grc.Transcriber("Attic", "Probert")
+        cls.latin_transcriber = lat.Transcriber("Classical", "Allen")
+
     """greek.transcription"""
     def test_greek_refresh(self):
         """Test the Word class's `_refresh` method in Greek."""
@@ -120,8 +126,7 @@ def test_greek_print_ipa(self):
     def test_greek_parse_diacritics(self):
         """Test the Transcriber class's `_parse_diacritics` in Greek."""
         inputs = ["ἄ", "Φ", "ῷ", "ὑ", "ϊ", "ῑ"]
-        transcriber = grc.Transcriber("Attic", "Probert")
-        outputs = [transcriber._parse_diacritics(char) for char in inputs]
+        outputs = [self.greek_transcriber._parse_diacritics(char) for char in inputs]
         target = [unicodedata.normalize('NFC', c) for c in
                   ["α/" + grc.chars.ACUTE + "//",
                    "φ///", "ω/" + grc.chars.CIRCUMFLEX + "/"
@@ -132,8 +137,7 @@ def test_greek_parse_diacritics(self):
     def test_greek_prep_text(self):
         """Test the Transcriber class's `_prep_text` in Greek."""
         inputs = ["λείπειν", "ὕπνῳ"]
-        transcriber = grc.Transcriber("Attic", "Probert")
-        outputs = [transcriber._prep_text(w) for w in inputs]
+        outputs = [self.greek_transcriber._prep_text(w) for w in inputs]
         target = [[('λ', '', ''), ('ει', '́', ''), ('π', '', ''),
                    ('ει', '', ''), ('ν', '', '')],
                   [('h', '', ''), ('υ', '́', ''), ('π', '', ''),
@@ -142,7 +146,7 @@ def test_greek_prep_text(self):
 
     def test_transcriber_probert(self):
         """Test Attic Greek IPA transcription via Probert reconstruction."""
-        transcriber = grc.Transcriber("Attic", "Probert").transcribe
+        transcriber = self.greek_transcriber.transcribe
         transcription = [transcriber(x) for x in
                          [unicodedata.normalize('NFC', y) for y in
                           ["ῥάξ", "εἰργασμένον", "φόρμιγξ", "γιγνώσκω"]]]
@@ -359,8 +363,7 @@ def test_latin_print_ipa(self):
     def test_latin_parse_diacritics(self):
         """Test the Transcriber class's `_parse_diacritics` in Latin."""
         inputs = ["a", "ū", "ï"]
-        transcriber = lat.Transcriber("Classical", "Allen")
-        outputs = [transcriber._parse_diacritics(char) for char in inputs]
+        outputs = [self.latin_transcriber._parse_diacritics(char) for char in inputs]
         target = [unicodedata.normalize('NFC', c) for c in
                   ["a///", "u/" + lat.chars.LONG + "//",
                    "i//" + lat.chars.DIAERESIS + "/"]]
@@ -369,8 +372,7 @@ def test_latin_parse_diacritics(self):
     def test_latin_prep_text(self):
         """Test the Transcriber class's `_prep_text` in Latin."""
         inputs = ["ūnam", "qui", "Belgae"]
-        transcriber = lat.Transcriber("Classical", "Allen")
-        outputs = [transcriber._prep_text(w) for w in inputs]
+        outputs = [self.latin_transcriber._prep_text(w) for w in inputs]
         target = [[('u', '̄', ''), ('n', '', ''), ('a', '', ''),
                    ('m', '', '')],
                   [('qu', '', ''), ('i', '', '')],
@@ -381,7 +383,7 @@ def test_latin_prep_text(self):
     def test_transcriber_allen_without_macronizer(self):
         """Test Classical Latin IPA transcription via Allen reconstruction,\
          input pre-macronized."""
-        transcriber = lat.Transcriber("Classical", "Allen").transcribe
+        transcriber = self.latin_transcriber.transcribe
         transcription = [transcriber(x, macronize=False) for x in
                          [unicodedata.normalize('NFC', y) for y in
                           ["Trōiae", "Gallia", "dīuīsa", "ūnam", "incolunt", "Belgae"]]]
@@ -393,7 +395,7 @@ def test_transcriber_allen_without_macronizer(self):
     def test_transcriber_allen_with_macronizer(self):
         """Test Classical Latin IPA transcription via Allen reconstruction,\
          with automatic macronization."""
-        transcriber = lat.Transcriber("Classical", "Allen").transcribe
+        transcriber = self.latin_transcriber.transcribe
         transcription = transcriber(
             "Quo usque tandem, O Catilina, abutere nostra patientia?",
             macronize=True)